@c4t4/heyamigo 0.10.7 → 0.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -33,6 +33,12 @@
33
33
  "provider": "claude"
34
34
  },
35
35
 
36
+ "audio": {
37
+ "transcription": {
38
+ "enabled": true
39
+ }
40
+ },
41
+
36
42
  "claude": {
37
43
  "model": "claude-opus-4-7",
38
44
  "personalityFile": "./config/personalities/sharp.md",
@@ -0,0 +1,58 @@
1
+ import { dirname } from 'path';
2
+ import { getProvider } from '../ai/providers.js';
3
+ import { config } from '../config.js';
4
+ import { logger } from '../logger.js';
5
/**
 * Sentinel reply the provider is told to return verbatim when the audio file
 * is unreadable or cannot be transcribed. A reply equal to this value is
 * treated as "no transcript".
 */
const UNTRANSCRIBABLE = '[UNTRANSCRIBABLE]';
6
/**
 * Ask the configured AI provider to transcribe an audio file on disk.
 *
 * Best-effort: provider and cleanup failures are caught, logged as a warning
 * and reported to the caller as `null`.
 *
 * @param {object} params
 * @param {string} params.path - Path of the audio file to transcribe.
 * @param {string} [params.mime] - Accepted for callers that pass it; not read here.
 * @param {string} [params.address] - Included in log records only.
 * @param {string} [params.externalMsgId] - Included in log records only.
 * @returns {Promise<string|null>} The cleaned transcript, or `null` when
 *   transcription is disabled, no usable path was given, the provider
 *   produced no usable transcript, or the provider call failed.
 */
export async function transcribeAudioFile(params) {
    const cfg = config.audio.transcription;
    if (!cfg.enabled) {
        return null;
    }
    const { path: audioPath, address, externalMsgId } = params;
    // Without a real path, dirname() yields '.' and the provider would be
    // granted the working directory for a prompt that names no file.
    if (typeof audioPath !== 'string' || audioPath.trim() === '') {
        logger.warn({
            provider: config.ai.provider,
            address,
            externalMsgId,
        }, 'audio transcription skipped: no file path');
        return null;
    }
    try {
        const provider = getProvider();
        // The file usually lives inside mediaDir already; avoid listing the
        // same directory twice.
        const readableDirs = [...new Set([dirname(audioPath), config.storage.mediaDir])];
        const result = await provider.runTask({
            input: [
                'Transcribe the audio file at this exact path.',
                '',
                audioPath,
                '',
                'Return only the spoken transcript text.',
                `If the file is not readable or cannot be transcribed, return exactly ${UNTRANSCRIBABLE}.`,
                'Do not answer the speaker. Do not summarize. Do not add labels, markdown, or commentary.',
            ].join('\n'),
            caller: 'audio-transcription',
            mode: 'read-only',
            lane: 'background',
            includeSystemPrompt: false,
            addDirs: readableDirs,
        });
        const text = cleanupTranscript(result.reply);
        if (!text) {
            return null;
        }
        logger.info({
            provider: provider.name,
            address,
            externalMsgId,
            chars: text.length,
        }, 'audio transcribed');
        return text;
    }
    catch (err) {
        // The provider object may not exist if getProvider() itself failed,
        // so the configured provider name is logged instead.
        logger.warn({
            err,
            provider: config.ai.provider,
            address,
            externalMsgId,
        }, 'audio transcription failed');
        return null;
    }
}
49
/**
 * Reduce a raw provider reply to bare transcript text.
 *
 * Strips surrounding whitespace, one wrapping code fence (optionally tagged
 * `text`) and a leading "Transcript:" label.
 *
 * @param {string} reply - Raw reply text; must be a string (a non-string
 *   throws on `.trim()`).
 * @returns {string|null} The cleaned transcript, or `null` when nothing is
 *   left or the reply is the "cannot transcribe" sentinel.
 */
function cleanupTranscript(reply) {
    let text = reply.trim();
    if (!text) {
        return null;
    }
    text = text.replace(/^```(?:text)?\s*/i, '').replace(/\s*```$/i, '').trim();
    text = text.replace(/^transcript:\s*/i, '').trim();
    if (!text) {
        return null;
    }
    // Models often decorate the sentinel (quotes, backticks, a trailing
    // period, different casing). Compare a bare, case-folded form so a
    // decorated sentinel is not stored as if it were spoken content.
    const bare = text.replace(/^["'`]+|["'`.!]+$/g, '').trim();
    if (bare.toUpperCase() === UNTRANSCRIBABLE.toUpperCase()) {
        return null;
    }
    return text;
}
package/dist/config.js CHANGED
@@ -38,6 +38,21 @@ const ConfigSchema = z.object({
38
38
  provider: z.enum(['claude', 'codex', 'grok']).default('claude'),
39
39
  })
40
40
  .default({ provider: 'claude' }),
41
+ audio: z
42
+ .object({
43
+ transcription: z
44
+ .object({
45
+ enabled: z.boolean().default(true),
46
+ })
47
+ .default({
48
+ enabled: true,
49
+ }),
50
+ })
51
+ .default({
52
+ transcription: {
53
+ enabled: true,
54
+ },
55
+ }),
41
56
  claude: z.object({
42
57
  model: z.string(),
43
58
  personalityFile: z.string(),
@@ -2,6 +2,7 @@ import { unlink } from 'fs/promises';
2
2
  import { resolve } from 'path';
3
3
  import { getProvider } from '../ai/providers.js';
4
4
  import { getSession } from '../ai/sessions.js';
5
+ import { transcribeAudioFile } from '../audio/transcription.js';
5
6
  import { config } from '../config.js';
6
7
  import { personIdForAddress } from '../db/identity-sync.js';
7
8
  import { estimate as estimateJob } from '../estimates/index.js';
@@ -50,6 +51,24 @@ function buildImageGenRoutingContract() {
50
51
  `Reply briefly and emit [ASYNC: Generate the requested image using current chat context. Save final files under ${outboxPath}/. Follow-up reply must include one [IMAGE: /absolute/path] tag per final image, or say: Image job failed before producing a file.]`,
51
52
  ].join('\n');
52
53
  }
54
/**
 * Decide whether an incoming message's audio attachment should be transcribed.
 *
 * @param {object} params
 * @param {{ mediaType?: string }|null|undefined} params.media - Attachment info, if any.
 * @param {boolean} params.respond - Whether a response is planned for this message.
 * @param {string} params.triggerMode - Trigger mode; `'off'` disables transcription
 *   outside of self-chat.
 * @param {boolean} [params.selfChat] - Whether the message comes from a self-chat.
 * @returns {boolean} `true` only for audio attachments on messages that will be
 *   responded to, in self-chat or with a trigger mode other than `'off'`.
 */
function shouldTranscribeAudio(params) {
    const isAudio = params.media?.mediaType === 'audio';
    if (!isAudio || !params.respond) {
        return false;
    }
    return params.selfChat ? true : params.triggerMode !== 'off';
}
63
/**
 * Combine a message's existing text with the transcript of its audio.
 *
 * @param {string} text - Text already attached to the message (may be empty).
 * @param {string} transcript - Transcript of the audio attachment.
 * @returns {string} `text` unchanged when the transcript is blank; the trimmed
 *   transcript alone when `text` is blank; otherwise the trimmed text followed
 *   by an "[Audio transcript]" section.
 */
function mergeAudioTranscript(text, transcript) {
    const spoken = transcript.trim();
    const typed = text.trim();
    if (spoken.length === 0) {
        // Nothing to merge: hand back the caller's text untouched (not trimmed).
        return text;
    }
    return typed.length === 0
        ? spoken
        : `${typed}\n\n[Audio transcript]\n${spoken}`;
}
53
72
  export async function processIncomingMessage(incoming, opts = {}) {
54
73
  const stored = toStored(incoming);
55
74
  const ageMs = Date.now() - stored.timestamp * 1000;
@@ -118,6 +137,26 @@ export async function processIncomingMessage(incoming, opts = {}) {
118
137
  stored.mediaPath = media.mediaPath;
119
138
  stored.mediaMime = media.mediaMime;
120
139
  }
140
+ const originalMediaText = stored.text;
141
+ let audioTranscript = null;
142
+ const transcribeThisAudio = shouldTranscribeAudio({
143
+ media,
144
+ respond: decision.respond,
145
+ triggerMode: decision.triggerMode,
146
+ selfChat: incoming.selfChat,
147
+ }) && media;
148
+ if (transcribeThisAudio) {
149
+ audioTranscript = await transcribeAudioFile({
150
+ path: transcribeThisAudio.mediaPath,
151
+ mime: transcribeThisAudio.mediaMime,
152
+ address: incoming.address,
153
+ externalMsgId: incoming.externalMsgId,
154
+ });
155
+ if (audioTranscript) {
156
+ stored.text = mergeAudioTranscript(stored.text, audioTranscript);
157
+ logCtx.text = stored.text.slice(0, 80);
158
+ }
159
+ }
121
160
  await append(stored);
122
161
  if (!decision.respond) {
123
162
  logger.info(logCtx, 'message captured, silent');
@@ -143,6 +182,7 @@ export async function processIncomingMessage(incoming, opts = {}) {
143
182
  const trigger = checkTrigger({
144
183
  mode: decision.triggerMode,
145
184
  text: stored.text,
185
+ audioTranscript: audioTranscript ?? undefined,
146
186
  mentionedBot: incoming.triggerHints?.mentionedBot,
147
187
  replyToBot: incoming.triggerHints?.replyToBot,
148
188
  });
@@ -168,7 +208,7 @@ export async function processIncomingMessage(incoming, opts = {}) {
168
208
  const existingSession = getSession(stored.jid, getProvider().name);
169
209
  let userContent = stored.text;
170
210
  if (media) {
171
- userContent = mediaPromptTag(media, stored.text);
211
+ userContent = mediaPromptTag(media, originalMediaText, audioTranscript);
172
212
  }
173
213
  const memoryPreamble = buildMemoryPreamble({
174
214
  jid: stored.jid,
@@ -1,4 +1,68 @@
1
1
  import { config } from '../config.js';
2
/**
 * Common speech-to-text spellings, keyed by normalized trigger alias.
 *
 * The table is indexed with aliases taken from configuration, so it is built
 * on a null prototype: a key such as "constructor" then yields `undefined`
 * instead of an inherited Object.prototype member.
 */
const AUDIO_ALIAS_VARIANTS = Object.assign(Object.create(null), {
    heyamigo: ['hey amigo', 'hey amigos', 'hey amego', 'hey amico', 'hey a migo', 'hay amigo', 'hi amigo'],
    amigo: ['a migo', 'amego', 'amico', 'amigos', 'amiga', 'migo'],
    claude: ['cloud', 'clawd', 'clawed', 'clod', 'clode', 'cload', 'clout', 'claut', 'clause', 'claus'],
    clawd: ['claude', 'cloud', 'clawed', 'clod', 'clode', 'cload', 'clout', 'claut'],
    grok: ['grock', 'grog', 'gronk', 'grawk', 'groc'],
    codex: ['code x', 'codec', 'codecs', 'codecks', 'codicks', 'kodeks', 'codacs'],
    xai: ['x ai', 'x a i', 'ex ai', 'ex a i', 'x.ai'],
});
2
66
  function escapeRegex(s) {
3
67
  return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
4
68
  }
@@ -10,6 +74,50 @@ function aliasMatches(text, aliases) {
10
74
  }
11
75
  return null;
12
76
  }
77
/**
 * Fold text into a lowercase, accent-free, single-spaced `[a-z0-9 ]` form so
 * transcripts and aliases can be compared word by word.
 *
 * @param {string} text - Text to normalize.
 * @returns {string} The normalized text; empty when nothing alphanumeric remains.
 */
function normalizeAudioText(text) {
    // Lowercase first, then decompose so accents become separate combining marks.
    const decomposed = text.toLowerCase().normalize('NFKD');
    const withoutMarks = decomposed.replace(/[\u0300-\u036f]/g, '');
    // Every run of non-alphanumerics (punctuation, whitespace, symbols) becomes one space.
    const wordsOnly = withoutMarks.replace(/[^a-z0-9]+/g, ' ');
    return wordsOnly.trim().replace(/\s+/g, ' ');
}
86
/**
 * Test whether `phrase` occurs in `normalizedText` as a run of whole words.
 *
 * @param {string} normalizedText - Text to search; expected to be already normalized.
 * @param {string} phrase - Phrase to look for; normalized here before matching.
 * @returns {boolean} `true` when the normalized phrase is non-empty and appears
 *   bounded by spaces or the ends of the text.
 */
function phraseMatches(normalizedText, phrase) {
    const needle = normalizeAudioText(phrase);
    if (!needle) {
        return false;
    }
    // "(^| )" and "($| )" act as word boundaries in space-separated text.
    const source = `(^| )${escapeRegex(needle)}($| )`;
    return new RegExp(source, 'i').test(normalizedText);
}
93
/**
 * Test whether `phrase` occurs in `normalizedText` as whole words introduced
 * by a wake word (hey, hi, hello, yo, ok, okay, oye, hola).
 *
 * A phrase that itself opens with a wake word (for example "hey amigo") is
 * matched as-is; demanding a second wake word in front of it would only ever
 * accept "hey hey amigo".
 *
 * @param {string} normalizedText - Text to search; expected to be already normalized.
 * @param {string} phrase - Phrase to look for; normalized here before matching.
 * @returns {boolean} `true` when the normalized phrase is non-empty and appears,
 *   wake word included, bounded by spaces or the ends of the text.
 */
function wakePhraseMatches(normalizedText, phrase) {
    const normalizedPhrase = normalizeAudioText(phrase);
    if (!normalizedPhrase) {
        return false;
    }
    const wake = '(hey|hi|hello|yo|ok|okay|oye|hola)';
    // Only require the wake-word prefix when the phrase does not already carry one.
    const opensWithWake = new RegExp(`^${wake} `, 'i').test(normalizedPhrase);
    const prefix = opensWithWake ? '' : `${wake} `;
    const re = new RegExp(`(^| )${prefix}${escapeRegex(normalizedPhrase)}($| )`, 'i');
    return re.test(normalizedText);
}
101
/**
 * Find the first configured alias that a spoken transcript addresses.
 *
 * For each alias, in order: the alias itself may appear anywhere as whole
 * words; failing that, each known mis-transcription of the alias from
 * AUDIO_ALIAS_VARIANTS is tried via wakePhraseMatches.
 *
 * @param {string} transcript - Raw transcript text.
 * @param {Iterable<string>} aliases - Configured trigger aliases.
 * @returns {{ alias: string, variant: string }|null} The matching alias as
 *   configured plus the normalized spelling that matched, or `null`.
 */
function audioAliasMatches(transcript, aliases) {
    const normalizedTranscript = normalizeAudioText(transcript);
    if (!normalizedTranscript) {
        return null;
    }
    for (const alias of aliases) {
        const normalizedAlias = normalizeAudioText(alias);
        if (phraseMatches(normalizedTranscript, normalizedAlias)) {
            return { alias, variant: normalizedAlias };
        }
        // Own-property lookup: an alias such as "constructor" must not pick up
        // an inherited Object.prototype member from the variants table.
        const hasVariants = Object.prototype.hasOwnProperty.call(AUDIO_ALIAS_VARIANTS, normalizedAlias);
        const variants = hasVariants ? AUDIO_ALIAS_VARIANTS[normalizedAlias] : [];
        const matched = variants.find((variant) => wakePhraseMatches(normalizedTranscript, variant));
        if (matched !== undefined) {
            return { alias, variant: normalizeAudioText(matched) };
        }
    }
    return null;
}
13
121
  export function checkTrigger(params) {
14
122
  const { mode, text } = params;
15
123
  if (mode === 'off')
@@ -27,6 +135,15 @@ export function checkTrigger(params) {
27
135
  const alias = aliasMatches(text, config.triggers.aliases);
28
136
  if (alias)
29
137
  return { triggered: true, reason: `alias:${alias}` };
138
+ const audioAlias = params.audioTranscript
139
+ ? audioAliasMatches(params.audioTranscript, config.triggers.aliases)
140
+ : null;
141
+ if (audioAlias) {
142
+ return {
143
+ triggered: true,
144
+ reason: `audio-alias:${audioAlias.alias}~${audioAlias.variant}`,
145
+ };
146
+ }
30
147
  // 2. Channel-provided mention signal, e.g. WhatsApp @mention or
31
148
  // Telegram bot username mention.
32
149
  if (params.mentionedBot)
@@ -106,7 +106,7 @@ export async function downloadAndSave(msg, jid) {
106
106
  return null;
107
107
  }
108
108
  }
109
- export function mediaPromptTag(info, caption) {
109
+ export function mediaPromptTag(info, caption, transcript) {
110
110
  const label = info.mediaType === 'image'
111
111
  ? 'an image'
112
112
  : info.mediaType === 'video'
@@ -116,12 +116,17 @@ export function mediaPromptTag(info, caption) {
116
116
  : info.mediaType === 'document'
117
117
  ? 'a document'
118
118
  : 'a sticker';
119
+ const hasTranscript = info.mediaType === 'audio' && !!transcript?.trim();
119
120
  const lines = [
120
121
  `[User sent ${label}: ${info.mediaPath}]`,
121
- `Read this file to see what the user sent.`,
122
+ hasTranscript
123
+ ? 'Transcript provided below; use it as the spoken content.'
124
+ : 'Read this file to see what the user sent.',
122
125
  ];
123
- if (caption)
124
- lines.push(`Caption: "${caption}"`);
126
+ if (caption.trim())
127
+ lines.push(`Caption: "${caption.trim()}"`);
128
+ if (hasTranscript)
129
+ lines.push(`Transcript: "${transcript.trim()}"`);
125
130
  return lines.join('\n');
126
131
  }
127
132
  /**
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@c4t4/heyamigo",
3
- "version": "0.10.7",
3
+ "version": "0.11.1",
4
4
  "description": "WhatsApp and Telegram AI bot powered by Claude, Codex, or Grok with long-term memory, browser control, and role-based access",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",