@c4t4/heyamigo 0.10.6 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -33,6 +33,12 @@
33
33
  "provider": "claude"
34
34
  },
35
35
 
36
+ "audio": {
37
+ "transcription": {
38
+ "enabled": true
39
+ }
40
+ },
41
+
36
42
  "claude": {
37
43
  "model": "claude-opus-4-7",
38
44
  "personalityFile": "./config/personalities/sharp.md",
@@ -0,0 +1,58 @@
1
+ import { dirname } from 'path';
2
+ import { getProvider } from '../ai/providers.js';
3
+ import { config } from '../config.js';
4
+ import { logger } from '../logger.js';
5
+ const UNTRANSCRIBABLE = '[UNTRANSCRIBABLE]';
6
/**
 * Ask the configured AI provider to transcribe an audio file on disk.
 *
 * The provider is given a read-only background task whose prompt contains the
 * file path, and is granted access to the file's directory and the configured
 * media directory.
 *
 * @param {object} params
 * @param {string} params.path - Path of the audio file to transcribe.
 * @param {string} [params.address] - Logged for correlation only.
 * @param {string} [params.externalMsgId] - Logged for correlation only.
 * @returns {Promise<string|null>} The cleaned transcript, or null when
 *   transcription is disabled, produced no usable text, or failed.
 */
export async function transcribeAudioFile(params) {
    if (!config.audio.transcription.enabled) {
        return null;
    }
    try {
        const provider = getProvider();
        const prompt = [
            'Transcribe the audio file at this exact path.',
            '',
            params.path,
            '',
            'Return only the spoken transcript text.',
            `If the file is not readable or cannot be transcribed, return exactly ${UNTRANSCRIBABLE}.`,
            'Do not answer the speaker. Do not summarize. Do not add labels, markdown, or commentary.',
        ].join('\n');
        const { reply } = await provider.runTask({
            input: prompt,
            caller: 'audio-transcription',
            mode: 'read-only',
            lane: 'background',
            includeSystemPrompt: false,
            addDirs: [dirname(params.path), config.storage.mediaDir],
        });
        const transcript = cleanupTranscript(reply);
        if (!transcript) {
            return null;
        }
        const successContext = {
            provider: provider.name,
            address: params.address,
            externalMsgId: params.externalMsgId,
            chars: transcript.length,
        };
        logger.info(successContext, 'audio transcribed');
        return transcript;
    }
    catch (err) {
        // Best effort: any failure is logged and reported to the caller as null.
        const failureContext = {
            err,
            provider: config.ai.provider,
            address: params.address,
            externalMsgId: params.externalMsgId,
        };
        logger.warn(failureContext, 'audio transcription failed');
        return null;
    }
}
49
/**
 * Normalize a provider reply into plain transcript text.
 *
 * Strips a surrounding code fence (bare or tagged `text`) and a leading
 * "Transcript:" label, then rejects the UNTRANSCRIBABLE sentinel.
 *
 * @param {unknown} reply - Raw reply from the provider.
 * @returns {string|null} The cleaned transcript, or null when the reply is
 *   not a string, is empty after cleanup, or is the sentinel.
 */
function cleanupTranscript(reply) {
    // A missing or non-string reply means "no transcript", not a crash.
    if (typeof reply !== 'string')
        return null;
    let text = reply.trim();
    if (!text)
        return null;
    text = text.replace(/^```(?:text)?\s*/i, '').replace(/\s*```$/i, '').trim();
    text = text.replace(/^transcript:\s*/i, '').trim();
    if (!text)
        return null;
    // Models do not always echo the sentinel verbatim: tolerate wrapping
    // quotes/backticks, trailing punctuation and case differences so a
    // near-miss sentinel is never forwarded as if it were spoken content.
    const bare = text.replace(/^["'`]+|["'`.!]+$/g, '').trim();
    if (bare.toUpperCase() === UNTRANSCRIBABLE.toUpperCase())
        return null;
    return text;
}
package/dist/config.js CHANGED
@@ -38,6 +38,21 @@ const ConfigSchema = z.object({
38
38
  provider: z.enum(['claude', 'codex', 'grok']).default('claude'),
39
39
  })
40
40
  .default({ provider: 'claude' }),
41
+ audio: z
42
+ .object({
43
+ transcription: z
44
+ .object({
45
+ enabled: z.boolean().default(true),
46
+ })
47
+ .default({
48
+ enabled: true,
49
+ }),
50
+ })
51
+ .default({
52
+ transcription: {
53
+ enabled: true,
54
+ },
55
+ }),
41
56
  claude: z.object({
42
57
  model: z.string(),
43
58
  personalityFile: z.string(),
@@ -2,6 +2,7 @@ import { unlink } from 'fs/promises';
2
2
  import { resolve } from 'path';
3
3
  import { getProvider } from '../ai/providers.js';
4
4
  import { getSession } from '../ai/sessions.js';
5
+ import { transcribeAudioFile } from '../audio/transcription.js';
5
6
  import { config } from '../config.js';
6
7
  import { personIdForAddress } from '../db/identity-sync.js';
7
8
  import { estimate as estimateJob } from '../estimates/index.js';
@@ -50,6 +51,24 @@ function buildImageGenRoutingContract() {
50
51
  `Reply briefly and emit [ASYNC: Generate the requested image using current chat context. Save final files under ${outboxPath}/. Follow-up reply must include one [IMAGE: /absolute/path] tag per final image, or say: Image job failed before producing a file.]`,
51
52
  ].join('\n');
52
53
  }
54
/**
 * Decide whether an incoming message's media should be transcribed.
 *
 * Only audio media on a message that will be responded to qualifies. Self-chat
 * messages then always qualify; other chats qualify unless the trigger mode
 * is 'off'.
 *
 * @param {object} params
 * @param {{mediaType?: string}|null|undefined} params.media - Media info, if any.
 * @param {boolean} params.respond - Whether a response will be produced.
 * @param {string} params.triggerMode - Trigger mode of the chat.
 * @param {boolean} params.selfChat - Whether the message is a self-chat message.
 * @returns {boolean} True when the audio should be transcribed.
 */
function shouldTranscribeAudio(params) {
    const isAudio = params.media?.mediaType === 'audio';
    if (!isAudio || !params.respond) {
        return false;
    }
    return params.selfChat ? true : params.triggerMode !== 'off';
}
63
/**
 * Combine a message's existing text with an audio transcript.
 *
 * @param {string|null|undefined} text - Existing message text (e.g. a caption).
 * @param {string|null|undefined} transcript - Transcript of the audio.
 * @returns {string|null|undefined} `text` unchanged when the transcript is
 *   blank; the trimmed transcript alone when the text is blank; otherwise the
 *   trimmed text followed by an "[Audio transcript]" section.
 */
function mergeAudioTranscript(text, transcript) {
    // Voice notes frequently arrive without any text; treat null/undefined
    // as empty rather than throwing on `.trim()`.
    const cleanedTranscript = (transcript ?? '').trim();
    const cleanedText = (text ?? '').trim();
    if (!cleanedTranscript)
        return text;
    if (!cleanedText)
        return cleanedTranscript;
    return `${cleanedText}\n\n[Audio transcript]\n${cleanedTranscript}`;
}
53
72
  export async function processIncomingMessage(incoming, opts = {}) {
54
73
  const stored = toStored(incoming);
55
74
  const ageMs = Date.now() - stored.timestamp * 1000;
@@ -118,6 +137,26 @@ export async function processIncomingMessage(incoming, opts = {}) {
118
137
  stored.mediaPath = media.mediaPath;
119
138
  stored.mediaMime = media.mediaMime;
120
139
  }
140
+ const originalMediaText = stored.text;
141
+ let audioTranscript = null;
142
+ const transcribeThisAudio = shouldTranscribeAudio({
143
+ media,
144
+ respond: decision.respond,
145
+ triggerMode: decision.triggerMode,
146
+ selfChat: incoming.selfChat,
147
+ }) && media;
148
+ if (transcribeThisAudio) {
149
+ audioTranscript = await transcribeAudioFile({
150
+ path: transcribeThisAudio.mediaPath,
151
+ mime: transcribeThisAudio.mediaMime,
152
+ address: incoming.address,
153
+ externalMsgId: incoming.externalMsgId,
154
+ });
155
+ if (audioTranscript) {
156
+ stored.text = mergeAudioTranscript(stored.text, audioTranscript);
157
+ logCtx.text = stored.text.slice(0, 80);
158
+ }
159
+ }
121
160
  await append(stored);
122
161
  if (!decision.respond) {
123
162
  logger.info(logCtx, 'message captured, silent');
@@ -168,7 +207,7 @@ export async function processIncomingMessage(incoming, opts = {}) {
168
207
  const existingSession = getSession(stored.jid, getProvider().name);
169
208
  let userContent = stored.text;
170
209
  if (media) {
171
- userContent = mediaPromptTag(media, stored.text);
210
+ userContent = mediaPromptTag(media, originalMediaText, audioTranscript);
172
211
  }
173
212
  const memoryPreamble = buildMemoryPreamble({
174
213
  jid: stored.jid,
@@ -28,6 +28,7 @@ function buildCoreQueueContract(outboxPath) {
28
28
  `Media: [IMAGE|VIDEO|AUDIO|DOCUMENT: /absolute/path] from ${outboxPath}/`,
29
29
  'Memory: [DIGEST: reason], [JOURNAL:slug - note], [JOURNAL-NEW:slug - purpose]',
30
30
  'Time: [REMIND: YYYY-MM-DD HH:MM - text], [CRON: expr SAY|PROMPT|ASYNC|BROWSER - body]',
31
+ 'Jobs: check jobs/<name>/job.json first; run/create self-contained jobs/<name>/job.sh installers when useful.',
31
32
  'Threads: THREAD-* for active open loops shown in [Live threads]. Full grammar in tag docs.',
32
33
  ].join('\n');
33
34
  }
@@ -106,7 +106,7 @@ export async function downloadAndSave(msg, jid) {
106
106
  return null;
107
107
  }
108
108
  }
109
- export function mediaPromptTag(info, caption) {
109
+ export function mediaPromptTag(info, caption, transcript) {
110
110
  const label = info.mediaType === 'image'
111
111
  ? 'an image'
112
112
  : info.mediaType === 'video'
@@ -116,12 +116,17 @@ export function mediaPromptTag(info, caption) {
116
116
  : info.mediaType === 'document'
117
117
  ? 'a document'
118
118
  : 'a sticker';
119
+ const hasTranscript = info.mediaType === 'audio' && !!transcript?.trim();
119
120
  const lines = [
120
121
  `[User sent ${label}: ${info.mediaPath}]`,
121
- `Read this file to see what the user sent.`,
122
+ hasTranscript
123
+ ? 'Transcript provided below; use it as the spoken content.'
124
+ : 'Read this file to see what the user sent.',
122
125
  ];
123
- if (caption)
124
- lines.push(`Caption: "${caption}"`);
126
+ if (caption.trim())
127
+ lines.push(`Caption: "${caption.trim()}"`);
128
+ if (hasTranscript)
129
+ lines.push(`Transcript: "${transcript.trim()}"`);
125
130
  return lines.join('\n');
126
131
  }
127
132
  /**
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@c4t4/heyamigo",
3
- "version": "0.10.6",
3
+ "version": "0.11.0",
4
4
  "description": "WhatsApp and Telegram AI bot powered by Claude, Codex, or Grok with long-term memory, browser control, and role-based access",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",