@c4t4/heyamigo 0.10.7 → 0.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/config.example.json +6 -0
- package/dist/audio/transcription.js +58 -0
- package/dist/config.js +15 -0
- package/dist/gateway/ingest.js +41 -1
- package/dist/gateway/triggers.js +117 -0
- package/dist/store/media.js +9 -4
- package/package.json +1 -1
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import { dirname } from 'path';
|
|
2
|
+
import { getProvider } from '../ai/providers.js';
|
|
3
|
+
import { config } from '../config.js';
|
|
4
|
+
import { logger } from '../logger.js';
|
|
5
|
+
const UNTRANSCRIBABLE = '[UNTRANSCRIBABLE]';
|
|
6
|
+
/**
 * Ask the configured AI provider to transcribe a single audio file.
 *
 * Only `params.path`, `params.address` and `params.externalMsgId` are read
 * here; the latter two are used purely as log context.
 *
 * @param {object} params
 * @param {string} params.path - Path of the audio file to transcribe.
 * @param {*} [params.address] - Logged alongside the outcome.
 * @param {*} [params.externalMsgId] - Logged alongside the outcome.
 * @returns {Promise<string|null>} The cleaned transcript, or `null` when
 *   transcription is disabled in config, the cleaned reply is empty, or
 *   anything in the provider round-trip throws (the error is logged as a
 *   warning and not re-thrown).
 */
export async function transcribeAudioFile(params) {
    if (!config.audio.transcription.enabled) {
        return null;
    }
    try {
        const provider = getProvider();
        const promptLines = [
            'Transcribe the audio file at this exact path.',
            '',
            params.path,
            '',
            'Return only the spoken transcript text.',
            `If the file is not readable or cannot be transcribed, return exactly ${UNTRANSCRIBABLE}.`,
            'Do not answer the speaker. Do not summarize. Do not add labels, markdown, or commentary.',
        ];
        const task = {
            input: promptLines.join('\n'),
            caller: 'audio-transcription',
            mode: 'read-only',
            lane: 'background',
            includeSystemPrompt: false,
            // Both the file's own directory and the media store are passed as addDirs.
            addDirs: [dirname(params.path), config.storage.mediaDir],
        };
        const outcome = await provider.runTask(task);
        const transcript = cleanupTranscript(outcome.reply);
        if (transcript) {
            const logFields = {
                provider: provider.name,
                address: params.address,
                externalMsgId: params.externalMsgId,
                chars: transcript.length,
            };
            logger.info(logFields, 'audio transcribed');
            return transcript;
        }
        return null;
    }
    catch (err) {
        // Best effort: a failed transcription is reported and treated as "no transcript".
        const logFields = {
            err,
            provider: config.ai.provider,
            address: params.address,
            externalMsgId: params.externalMsgId,
        };
        logger.warn(logFields, 'audio transcription failed');
        return null;
    }
}
|
|
49
|
+
/**
 * Reduce a raw provider reply to the bare transcript text.
 *
 * Strips a surrounding ``` / ```text code fence and a leading
 * "Transcript:" label, then trims.
 *
 * @param {unknown} reply - The provider's reply; expected to be a string.
 * @returns {string|null} The transcript, or `null` when the reply is not a
 *   string, is empty after cleanup, or contains the UNTRANSCRIBABLE sentinel.
 */
function cleanupTranscript(reply) {
    // A reply that is missing or not a string carries no transcript.
    if (typeof reply !== 'string')
        return null;
    let text = reply.trim();
    if (!text)
        return null;
    text = text.replace(/^```(?:text)?\s*/i, '').replace(/\s*```$/i, '').trim();
    text = text.replace(/^transcript:\s*/i, '').trim();
    // The sentinel is matched anywhere in the reply rather than by equality,
    // so a decorated reply ("[UNTRANSCRIBABLE].", quoted, or followed by an
    // explanation) is not stored as if it were spoken content.
    if (!text || text.includes(UNTRANSCRIBABLE))
        return null;
    return text;
}
|
package/dist/config.js
CHANGED
|
@@ -38,6 +38,21 @@ const ConfigSchema = z.object({
|
|
|
38
38
|
provider: z.enum(['claude', 'codex', 'grok']).default('claude'),
|
|
39
39
|
})
|
|
40
40
|
.default({ provider: 'claude' }),
|
|
41
|
+
audio: z
|
|
42
|
+
.object({
|
|
43
|
+
transcription: z
|
|
44
|
+
.object({
|
|
45
|
+
enabled: z.boolean().default(true),
|
|
46
|
+
})
|
|
47
|
+
.default({
|
|
48
|
+
enabled: true,
|
|
49
|
+
}),
|
|
50
|
+
})
|
|
51
|
+
.default({
|
|
52
|
+
transcription: {
|
|
53
|
+
enabled: true,
|
|
54
|
+
},
|
|
55
|
+
}),
|
|
41
56
|
claude: z.object({
|
|
42
57
|
model: z.string(),
|
|
43
58
|
personalityFile: z.string(),
|
package/dist/gateway/ingest.js
CHANGED
|
@@ -2,6 +2,7 @@ import { unlink } from 'fs/promises';
|
|
|
2
2
|
import { resolve } from 'path';
|
|
3
3
|
import { getProvider } from '../ai/providers.js';
|
|
4
4
|
import { getSession } from '../ai/sessions.js';
|
|
5
|
+
import { transcribeAudioFile } from '../audio/transcription.js';
|
|
5
6
|
import { config } from '../config.js';
|
|
6
7
|
import { personIdForAddress } from '../db/identity-sync.js';
|
|
7
8
|
import { estimate as estimateJob } from '../estimates/index.js';
|
|
@@ -50,6 +51,24 @@ function buildImageGenRoutingContract() {
|
|
|
50
51
|
`Reply briefly and emit [ASYNC: Generate the requested image using current chat context. Save final files under ${outboxPath}/. Follow-up reply must include one [IMAGE: /absolute/path] tag per final image, or say: Image job failed before producing a file.]`,
|
|
51
52
|
].join('\n');
|
|
52
53
|
}
|
|
54
|
+
/**
 * Decide whether an incoming message's media should be transcribed.
 *
 * @param {object} params
 * @param {{mediaType?: string}|null|undefined} params.media - Saved media info, if any.
 * @param {boolean} params.respond - Whether a response is going to be produced.
 * @param {string} params.triggerMode - Trigger mode for this chat.
 * @param {boolean} [params.selfChat] - Whether the message is in the self chat.
 * @returns {boolean} `true` only for audio media on a message that will be
 *   responded to, and then only in a self chat or when the trigger mode is
 *   anything other than 'off'.
 */
function shouldTranscribeAudio(params) {
    const isAudio = params.media?.mediaType === 'audio';
    if (!isAudio || !params.respond) {
        return false;
    }
    return params.selfChat ? true : params.triggerMode !== 'off';
}
|
|
63
|
+
/**
 * Combine a message's existing text with an audio transcript.
 *
 * @param {string|null|undefined} text - Text already on the message (e.g. a caption).
 * @param {string|null|undefined} transcript - Transcript of the audio.
 * @returns {string|null|undefined} `text` unchanged when the transcript is
 *   blank; the trimmed transcript when there is no other text; otherwise the
 *   trimmed text followed by an "[Audio transcript]" section.
 */
function mergeAudioTranscript(text, transcript) {
    // Nullish inputs are treated as empty so a message without any text
    // does not make `.trim()` throw.
    const cleanedTranscript = (transcript ?? '').trim();
    const cleanedText = (text ?? '').trim();
    if (!cleanedTranscript)
        return text;
    if (!cleanedText)
        return cleanedTranscript;
    return `${cleanedText}\n\n[Audio transcript]\n${cleanedTranscript}`;
}
|
|
53
72
|
export async function processIncomingMessage(incoming, opts = {}) {
|
|
54
73
|
const stored = toStored(incoming);
|
|
55
74
|
const ageMs = Date.now() - stored.timestamp * 1000;
|
|
@@ -118,6 +137,26 @@ export async function processIncomingMessage(incoming, opts = {}) {
|
|
|
118
137
|
stored.mediaPath = media.mediaPath;
|
|
119
138
|
stored.mediaMime = media.mediaMime;
|
|
120
139
|
}
|
|
140
|
+
const originalMediaText = stored.text;
|
|
141
|
+
let audioTranscript = null;
|
|
142
|
+
const transcribeThisAudio = shouldTranscribeAudio({
|
|
143
|
+
media,
|
|
144
|
+
respond: decision.respond,
|
|
145
|
+
triggerMode: decision.triggerMode,
|
|
146
|
+
selfChat: incoming.selfChat,
|
|
147
|
+
}) && media;
|
|
148
|
+
if (transcribeThisAudio) {
|
|
149
|
+
audioTranscript = await transcribeAudioFile({
|
|
150
|
+
path: transcribeThisAudio.mediaPath,
|
|
151
|
+
mime: transcribeThisAudio.mediaMime,
|
|
152
|
+
address: incoming.address,
|
|
153
|
+
externalMsgId: incoming.externalMsgId,
|
|
154
|
+
});
|
|
155
|
+
if (audioTranscript) {
|
|
156
|
+
stored.text = mergeAudioTranscript(stored.text, audioTranscript);
|
|
157
|
+
logCtx.text = stored.text.slice(0, 80);
|
|
158
|
+
}
|
|
159
|
+
}
|
|
121
160
|
await append(stored);
|
|
122
161
|
if (!decision.respond) {
|
|
123
162
|
logger.info(logCtx, 'message captured, silent');
|
|
@@ -143,6 +182,7 @@ export async function processIncomingMessage(incoming, opts = {}) {
|
|
|
143
182
|
const trigger = checkTrigger({
|
|
144
183
|
mode: decision.triggerMode,
|
|
145
184
|
text: stored.text,
|
|
185
|
+
audioTranscript: audioTranscript ?? undefined,
|
|
146
186
|
mentionedBot: incoming.triggerHints?.mentionedBot,
|
|
147
187
|
replyToBot: incoming.triggerHints?.replyToBot,
|
|
148
188
|
});
|
|
@@ -168,7 +208,7 @@ export async function processIncomingMessage(incoming, opts = {}) {
|
|
|
168
208
|
const existingSession = getSession(stored.jid, getProvider().name);
|
|
169
209
|
let userContent = stored.text;
|
|
170
210
|
if (media) {
|
|
171
|
-
userContent = mediaPromptTag(media,
|
|
211
|
+
userContent = mediaPromptTag(media, originalMediaText, audioTranscript);
|
|
172
212
|
}
|
|
173
213
|
const memoryPreamble = buildMemoryPreamble({
|
|
174
214
|
jid: stored.jid,
|
package/dist/gateway/triggers.js
CHANGED
|
@@ -1,4 +1,68 @@
|
|
|
1
1
|
import { config } from '../config.js';
|
|
2
|
+
/**
 * Alternative spellings for trigger aliases, keyed by alias name. Judging by
 * the entries these are speech-to-text mis-hearings of each alias.
 *
 * The table has a null prototype so that looking up an arbitrary key (for
 * example "constructor") yields `undefined` instead of an inherited
 * Object.prototype member, and it is frozen because it is shared lookup data.
 */
const AUDIO_ALIAS_VARIANTS = Object.freeze({
    __proto__: null,
    heyamigo: [
        'hey amigo',
        'hey amigos',
        'hey amego',
        'hey amico',
        'hey a migo',
        'hay amigo',
        'hi amigo',
    ],
    amigo: [
        'a migo',
        'amego',
        'amico',
        'amigos',
        'amiga',
        'migo',
    ],
    claude: [
        'cloud',
        'clawd',
        'clawed',
        'clod',
        'clode',
        'cload',
        'clout',
        'claut',
        'clause',
        'claus',
    ],
    clawd: [
        'claude',
        'cloud',
        'clawed',
        'clod',
        'clode',
        'cload',
        'clout',
        'claut',
    ],
    grok: [
        'grock',
        'grog',
        'gronk',
        'grawk',
        'groc',
    ],
    codex: [
        'code x',
        'codec',
        'codecs',
        'codecks',
        'codicks',
        'kodeks',
        'codacs',
    ],
    xai: [
        'x ai',
        'x a i',
        'ex ai',
        'ex a i',
        'x.ai',
    ],
});
|
|
2
66
|
/**
 * Backslash-escape every regular-expression metacharacter in `s` so the
 * result can be embedded literally in a `RegExp` source string.
 *
 * @param {string} s - Text to escape.
 * @returns {string} `s` with each of . * + ? ^ $ { } ( ) | [ ] \ prefixed by a backslash.
 */
function escapeRegex(s) {
    const metaChars = /[.*+?^${}()|[\]\\]/g;
    return s.replace(metaChars, (ch) => `\\${ch}`);
}
|
|
@@ -10,6 +74,50 @@ function aliasMatches(text, aliases) {
|
|
|
10
74
|
}
|
|
11
75
|
return null;
|
|
12
76
|
}
|
|
77
|
+
/**
 * Normalise text for tolerant phrase comparison: compatibility-decompose,
 * lower-case, drop combining diacritics, and turn every run of characters
 * that are not letters, marks or digits into a single space.
 *
 * Letters and digits of any script are kept, so aliases written in
 * non-Latin scripts survive normalisation. For plain ASCII input the result
 * contains only `a-z`, `0-9` and single spaces.
 *
 * @param {string} text - Raw transcript or alias.
 * @returns {string} The normalised text, with no leading or trailing space;
 *   may be empty.
 */
function normalizeAudioText(text) {
    return text
        .normalize('NFKD')
        .toLowerCase()
        // Remove the combining accents split off by NFKD ("é" -> "e").
        .replace(/[\u0300-\u036f]/g, '')
        // Each separator run collapses to one space, so no further
        // whitespace squeezing is needed afterwards.
        .replace(/[^\p{L}\p{M}\p{N}]+/gu, ' ')
        .trim();
}
|
|
86
|
+
/**
 * Test whether `phrase` occurs in `normalizedText` as a whole,
 * space-delimited run of words (case-insensitively).
 *
 * @param {string} normalizedText - Text to search.
 * @param {string} phrase - Phrase to look for; it is normalised before matching.
 * @returns {boolean} `false` when the phrase normalises to an empty string,
 *   otherwise whether the phrase is found bounded by spaces or the text edges.
 */
function phraseMatches(normalizedText, phrase) {
    const needle = normalizeAudioText(phrase);
    if (needle === '') {
        return false;
    }
    const source = `(^| )${escapeRegex(needle)}($| )`;
    return new RegExp(source, 'i').test(normalizedText);
}
|
|
93
|
+
/**
 * Test whether `phrase` is spoken in `normalizedText` as a wake phrase, i.e.
 * directly preceded by a wake word (hey, hi, hello, yo, ok, okay, oye, hola).
 *
 * A phrase that already begins with one of those wake words (for example
 * "hey amigo") carries its own wake word and is matched as-is; otherwise it
 * would only match when the wake word was said twice ("hey hey amigo").
 * Phrases starting with a near-miss such as "hay" are not treated this way
 * and still need a preceding wake word.
 *
 * @param {string} normalizedText - Text to search.
 * @param {string} phrase - Phrase to look for; it is normalised before matching.
 * @returns {boolean} `false` when the phrase normalises to an empty string,
 *   otherwise whether the wake phrase is found bounded by spaces or the text edges.
 */
function wakePhraseMatches(normalizedText, phrase) {
    const normalizedPhrase = normalizeAudioText(phrase);
    if (!normalizedPhrase)
        return false;
    const wake = '(?:hey|hi|hello|yo|ok|okay|oye|hola)';
    const startsWithWakeWord = new RegExp(`^${wake} `, 'i').test(normalizedPhrase);
    // Only demand an extra wake word when the phrase does not supply one itself.
    const prefix = startsWithWakeWord ? '' : `${wake} `;
    const re = new RegExp(`(?:^| )${prefix}${escapeRegex(normalizedPhrase)}(?:$| )`, 'i');
    return re.test(normalizedText);
}
|
|
101
|
+
/**
 * Find the first configured alias that is spoken in an audio transcript.
 *
 * For each alias, in order: the alias itself matches when it appears as a
 * whole phrase; failing that, each entry listed for it in
 * AUDIO_ALIAS_VARIANTS matches when it appears as a wake phrase.
 *
 * @param {string} transcript - Raw transcript text.
 * @param {Iterable<string>} aliases - Configured trigger aliases.
 * @returns {{alias: string, variant: string}|null} The original alias and the
 *   normalised phrase that matched, or `null` when the transcript normalises
 *   to nothing or no alias matches.
 */
function audioAliasMatches(transcript, aliases) {
    const normalizedTranscript = normalizeAudioText(transcript);
    if (!normalizedTranscript)
        return null;
    for (const alias of aliases) {
        const normalizedAlias = normalizeAudioText(alias);
        if (phraseMatches(normalizedTranscript, normalizedAlias)) {
            return { alias, variant: normalizedAlias };
        }
        // Own-property check: an alias such as "constructor" must not pick up
        // an inherited Object.prototype member, which is not iterable.
        const variants = Object.prototype.hasOwnProperty.call(AUDIO_ALIAS_VARIANTS, normalizedAlias)
            ? AUDIO_ALIAS_VARIANTS[normalizedAlias]
            : [];
        const heard = variants.find((variant) => wakePhraseMatches(normalizedTranscript, variant));
        if (heard !== undefined) {
            return { alias, variant: normalizeAudioText(heard) };
        }
    }
    return null;
}
|
|
13
121
|
export function checkTrigger(params) {
|
|
14
122
|
const { mode, text } = params;
|
|
15
123
|
if (mode === 'off')
|
|
@@ -27,6 +135,15 @@ export function checkTrigger(params) {
|
|
|
27
135
|
const alias = aliasMatches(text, config.triggers.aliases);
|
|
28
136
|
if (alias)
|
|
29
137
|
return { triggered: true, reason: `alias:${alias}` };
|
|
138
|
+
const audioAlias = params.audioTranscript
|
|
139
|
+
? audioAliasMatches(params.audioTranscript, config.triggers.aliases)
|
|
140
|
+
: null;
|
|
141
|
+
if (audioAlias) {
|
|
142
|
+
return {
|
|
143
|
+
triggered: true,
|
|
144
|
+
reason: `audio-alias:${audioAlias.alias}~${audioAlias.variant}`,
|
|
145
|
+
};
|
|
146
|
+
}
|
|
30
147
|
// 2. Channel-provided mention signal, e.g. WhatsApp @mention or
|
|
31
148
|
// Telegram bot username mention.
|
|
32
149
|
if (params.mentionedBot)
|
package/dist/store/media.js
CHANGED
|
@@ -106,7 +106,7 @@ export async function downloadAndSave(msg, jid) {
|
|
|
106
106
|
return null;
|
|
107
107
|
}
|
|
108
108
|
}
|
|
109
|
-
export function mediaPromptTag(info, caption) {
|
|
109
|
+
export function mediaPromptTag(info, caption, transcript) {
|
|
110
110
|
const label = info.mediaType === 'image'
|
|
111
111
|
? 'an image'
|
|
112
112
|
: info.mediaType === 'video'
|
|
@@ -116,12 +116,17 @@ export function mediaPromptTag(info, caption) {
|
|
|
116
116
|
: info.mediaType === 'document'
|
|
117
117
|
? 'a document'
|
|
118
118
|
: 'a sticker';
|
|
119
|
+
const hasTranscript = info.mediaType === 'audio' && !!transcript?.trim();
|
|
119
120
|
const lines = [
|
|
120
121
|
`[User sent ${label}: ${info.mediaPath}]`,
|
|
121
|
-
|
|
122
|
+
hasTranscript
|
|
123
|
+
? 'Transcript provided below; use it as the spoken content.'
|
|
124
|
+
: 'Read this file to see what the user sent.',
|
|
122
125
|
];
|
|
123
|
-
if (caption)
|
|
124
|
-
lines.push(`Caption: "${caption}"`);
|
|
126
|
+
if (caption.trim())
|
|
127
|
+
lines.push(`Caption: "${caption.trim()}"`);
|
|
128
|
+
if (hasTranscript)
|
|
129
|
+
lines.push(`Transcript: "${transcript.trim()}"`);
|
|
125
130
|
return lines.join('\n');
|
|
126
131
|
}
|
|
127
132
|
/**
|
package/package.json
CHANGED