@c4t4/heyamigo 0.10.7 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/config.example.json +6 -0
- package/dist/audio/transcription.js +58 -0
- package/dist/config.js +15 -0
- package/dist/gateway/ingest.js +40 -1
- package/dist/store/media.js +9 -4
- package/package.json +1 -1
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import { dirname } from 'path';
|
|
2
|
+
import { getProvider } from '../ai/providers.js';
|
|
3
|
+
import { config } from '../config.js';
|
|
4
|
+
import { logger } from '../logger.js';
|
|
5
|
+
const UNTRANSCRIBABLE = '[UNTRANSCRIBABLE]';
|
|
6
|
+
/**
 * Asks the configured AI provider to transcribe an audio file on disk.
 *
 * Only `params.path`, `params.address` and `params.externalMsgId` are read
 * here; the latter two are used purely as log context.
 *
 * @param {object} params
 * @param {string} params.path - Path of the audio file, embedded verbatim in the prompt.
 * @param {*} [params.address] - Logged alongside the outcome.
 * @param {*} [params.externalMsgId] - Logged alongside the outcome.
 * @returns {Promise<string|null>} The cleaned transcript, or null when
 *   transcription is disabled, the cleaned reply is empty, or anything throws.
 */
export async function transcribeAudioFile(params) {
    const { enabled } = config.audio.transcription;
    if (!enabled) {
        return null;
    }
    try {
        const provider = getProvider();
        const promptLines = [
            'Transcribe the audio file at this exact path.',
            '',
            params.path,
            '',
            'Return only the spoken transcript text.',
            `If the file is not readable or cannot be transcribed, return exactly ${UNTRANSCRIBABLE}.`,
            'Do not answer the speaker. Do not summarize. Do not add labels, markdown, or commentary.',
        ];
        const task = {
            input: promptLines.join('\n'),
            caller: 'audio-transcription',
            mode: 'read-only',
            lane: 'background',
            includeSystemPrompt: false,
            // Both the file's own directory and the configured media directory are passed as addDirs.
            addDirs: [dirname(params.path), config.storage.mediaDir],
        };
        const outcome = await provider.runTask(task);
        const transcript = cleanupTranscript(outcome.reply);
        if (!transcript) {
            return null;
        }
        const successContext = {
            provider: provider.name,
            address: params.address,
            externalMsgId: params.externalMsgId,
            chars: transcript.length,
        };
        logger.info(successContext, 'audio transcribed');
        return transcript;
    }
    catch (failure) {
        // Best effort: a failed transcription is logged and reported as "no transcript".
        const failureContext = {
            err: failure,
            provider: config.ai.provider,
            address: params.address,
            externalMsgId: params.externalMsgId,
        };
        logger.warn(failureContext, 'audio transcription failed');
        return null;
    }
}
|
|
49
|
+
/**
 * Normalizes a raw provider reply into plain transcript text.
 *
 * Strips a surrounding ``` / ```text code fence and a leading "transcript:"
 * label, then rejects replies that are empty or that consist only of the
 * UNTRANSCRIBABLE sentinel.
 *
 * @param {*} reply - Raw reply from the provider; non-strings are treated as "no transcript".
 * @returns {string|null} The cleaned transcript, or null when there is nothing usable.
 */
function cleanupTranscript(reply) {
    // A missing or non-string reply carries no transcript; avoid throwing on .trim().
    if (typeof reply !== 'string') {
        return null;
    }
    let text = reply.trim();
    if (!text) {
        return null;
    }
    text = text.replace(/^```(?:text)?\s*/i, '').replace(/\s*```$/i, '').trim();
    text = text.replace(/^transcript:\s*/i, '').trim();
    if (!text) {
        return null;
    }
    // Compare the sentinel leniently: ignore wrapping quotes/backticks, trailing
    // punctuation and letter case, so variants such as "[UNTRANSCRIBABLE]." are
    // not mistaken for spoken content. Text that merely contains the marker is kept.
    const bare = text.replace(/^[\s"'`]+/, '').replace(/[\s"'`.!]+$/, '');
    if (bare.toUpperCase() === UNTRANSCRIBABLE.toUpperCase()) {
        return null;
    }
    return text;
}
|
package/dist/config.js
CHANGED
|
@@ -38,6 +38,21 @@ const ConfigSchema = z.object({
|
|
|
38
38
|
provider: z.enum(['claude', 'codex', 'grok']).default('claude'),
|
|
39
39
|
})
|
|
40
40
|
.default({ provider: 'claude' }),
|
|
41
|
+
audio: z
|
|
42
|
+
.object({
|
|
43
|
+
transcription: z
|
|
44
|
+
.object({
|
|
45
|
+
enabled: z.boolean().default(true),
|
|
46
|
+
})
|
|
47
|
+
.default({
|
|
48
|
+
enabled: true,
|
|
49
|
+
}),
|
|
50
|
+
})
|
|
51
|
+
.default({
|
|
52
|
+
transcription: {
|
|
53
|
+
enabled: true,
|
|
54
|
+
},
|
|
55
|
+
}),
|
|
41
56
|
claude: z.object({
|
|
42
57
|
model: z.string(),
|
|
43
58
|
personalityFile: z.string(),
|
package/dist/gateway/ingest.js
CHANGED
|
@@ -2,6 +2,7 @@ import { unlink } from 'fs/promises';
|
|
|
2
2
|
import { resolve } from 'path';
|
|
3
3
|
import { getProvider } from '../ai/providers.js';
|
|
4
4
|
import { getSession } from '../ai/sessions.js';
|
|
5
|
+
import { transcribeAudioFile } from '../audio/transcription.js';
|
|
5
6
|
import { config } from '../config.js';
|
|
6
7
|
import { personIdForAddress } from '../db/identity-sync.js';
|
|
7
8
|
import { estimate as estimateJob } from '../estimates/index.js';
|
|
@@ -50,6 +51,24 @@ function buildImageGenRoutingContract() {
|
|
|
50
51
|
`Reply briefly and emit [ASYNC: Generate the requested image using current chat context. Save final files under ${outboxPath}/. Follow-up reply must include one [IMAGE: /absolute/path] tag per final image, or say: Image job failed before producing a file.]`,
|
|
51
52
|
].join('\n');
|
|
52
53
|
}
|
|
54
|
+
/**
 * Decides whether an incoming message's media should be transcribed.
 *
 * Requires audio media and a positive respond decision; beyond that, self-chat
 * messages always qualify and other messages qualify unless the trigger mode
 * is 'off'.
 *
 * @param {object} params
 * @param {{mediaType?: string}|null|undefined} params.media
 * @param {*} params.respond
 * @param {*} params.triggerMode
 * @param {*} params.selfChat
 * @returns {boolean}
 */
function shouldTranscribeAudio(params) {
    const isAudio = params.media?.mediaType === 'audio';
    if (!isAudio || !params.respond) {
        return false;
    }
    return Boolean(params.selfChat) || params.triggerMode !== 'off';
}
|
|
63
|
+
/**
 * Combines a message's existing text with an audio transcript.
 *
 * - Blank transcript: the original `text` is returned untouched.
 * - Blank text: the trimmed transcript alone is returned.
 * - Otherwise: trimmed text, a blank line, an "[Audio transcript]" header line,
 *   then the trimmed transcript.
 *
 * Null or undefined arguments are treated as empty strings instead of throwing.
 *
 * @param {string|null|undefined} text - Existing message text (may be empty).
 * @param {string|null|undefined} transcript - Transcript of the audio.
 * @returns {string|null|undefined} The merged text; `text` itself when the transcript is blank.
 */
function mergeAudioTranscript(text, transcript) {
    const cleanedTranscript = (transcript ?? '').trim();
    if (!cleanedTranscript) {
        return text;
    }
    const cleanedText = (text ?? '').trim();
    if (!cleanedText) {
        return cleanedTranscript;
    }
    return `${cleanedText}\n\n[Audio transcript]\n${cleanedTranscript}`;
}
|
|
53
72
|
export async function processIncomingMessage(incoming, opts = {}) {
|
|
54
73
|
const stored = toStored(incoming);
|
|
55
74
|
const ageMs = Date.now() - stored.timestamp * 1000;
|
|
@@ -118,6 +137,26 @@ export async function processIncomingMessage(incoming, opts = {}) {
|
|
|
118
137
|
stored.mediaPath = media.mediaPath;
|
|
119
138
|
stored.mediaMime = media.mediaMime;
|
|
120
139
|
}
|
|
140
|
+
const originalMediaText = stored.text;
|
|
141
|
+
let audioTranscript = null;
|
|
142
|
+
const transcribeThisAudio = shouldTranscribeAudio({
|
|
143
|
+
media,
|
|
144
|
+
respond: decision.respond,
|
|
145
|
+
triggerMode: decision.triggerMode,
|
|
146
|
+
selfChat: incoming.selfChat,
|
|
147
|
+
}) && media;
|
|
148
|
+
if (transcribeThisAudio) {
|
|
149
|
+
audioTranscript = await transcribeAudioFile({
|
|
150
|
+
path: transcribeThisAudio.mediaPath,
|
|
151
|
+
mime: transcribeThisAudio.mediaMime,
|
|
152
|
+
address: incoming.address,
|
|
153
|
+
externalMsgId: incoming.externalMsgId,
|
|
154
|
+
});
|
|
155
|
+
if (audioTranscript) {
|
|
156
|
+
stored.text = mergeAudioTranscript(stored.text, audioTranscript);
|
|
157
|
+
logCtx.text = stored.text.slice(0, 80);
|
|
158
|
+
}
|
|
159
|
+
}
|
|
121
160
|
await append(stored);
|
|
122
161
|
if (!decision.respond) {
|
|
123
162
|
logger.info(logCtx, 'message captured, silent');
|
|
@@ -168,7 +207,7 @@ export async function processIncomingMessage(incoming, opts = {}) {
|
|
|
168
207
|
const existingSession = getSession(stored.jid, getProvider().name);
|
|
169
208
|
let userContent = stored.text;
|
|
170
209
|
if (media) {
|
|
171
|
-
userContent = mediaPromptTag(media,
|
|
210
|
+
userContent = mediaPromptTag(media, originalMediaText, audioTranscript);
|
|
172
211
|
}
|
|
173
212
|
const memoryPreamble = buildMemoryPreamble({
|
|
174
213
|
jid: stored.jid,
|
package/dist/store/media.js
CHANGED
|
@@ -106,7 +106,7 @@ export async function downloadAndSave(msg, jid) {
|
|
|
106
106
|
return null;
|
|
107
107
|
}
|
|
108
108
|
}
|
|
109
|
-
export function mediaPromptTag(info, caption) {
|
|
109
|
+
export function mediaPromptTag(info, caption, transcript) {
|
|
110
110
|
const label = info.mediaType === 'image'
|
|
111
111
|
? 'an image'
|
|
112
112
|
: info.mediaType === 'video'
|
|
@@ -116,12 +116,17 @@ export function mediaPromptTag(info, caption) {
|
|
|
116
116
|
: info.mediaType === 'document'
|
|
117
117
|
? 'a document'
|
|
118
118
|
: 'a sticker';
|
|
119
|
+
const hasTranscript = info.mediaType === 'audio' && !!transcript?.trim();
|
|
119
120
|
const lines = [
|
|
120
121
|
`[User sent ${label}: ${info.mediaPath}]`,
|
|
121
|
-
|
|
122
|
+
hasTranscript
|
|
123
|
+
? 'Transcript provided below; use it as the spoken content.'
|
|
124
|
+
: 'Read this file to see what the user sent.',
|
|
122
125
|
];
|
|
123
|
-
if (caption)
|
|
124
|
-
lines.push(`Caption: "${caption}"`);
|
|
126
|
+
if (caption.trim())
|
|
127
|
+
lines.push(`Caption: "${caption.trim()}"`);
|
|
128
|
+
if (hasTranscript)
|
|
129
|
+
lines.push(`Transcript: "${transcript.trim()}"`);
|
|
125
130
|
return lines.join('\n');
|
|
126
131
|
}
|
|
127
132
|
/**
|
package/package.json
CHANGED