npm - @c4t4/heyamigo - Versions diffs - 0.10.6 → 0.11.0 - Mend

@c4t4/heyamigo 0.10.6 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/config/config.example.json +6 -0
package/dist/audio/transcription.js +58 -0
package/dist/config.js +15 -0
package/dist/gateway/ingest.js +40 -1
package/dist/memory/preamble.js +1 -0
package/dist/store/media.js +9 -4
package/package.json +1 -1

package/config/config.example.json CHANGED Viewed

@@ -33,6 +33,12 @@
     "provider": "claude"
   },
+  "audio": {
+    "transcription": {
+      "enabled": true
+    }
+  },
   "claude": {
     "model": "claude-opus-4-7",
     "personalityFile": "./config/personalities/sharp.md",

package/dist/audio/transcription.js ADDED Viewed

@@ -0,0 +1,58 @@
+import { dirname } from 'path';
+import { getProvider } from '../ai/providers.js';
+import { config } from '../config.js';
+import { logger } from '../logger.js';
+const UNTRANSCRIBABLE = '[UNTRANSCRIBABLE]';
+/**
+ * Transcribe an audio file on disk by delegating to the configured AI provider.
+ *
+ * The provider receives a plain-text prompt that embeds the file path and is
+ * told to answer with the spoken text only, or with the UNTRANSCRIBABLE
+ * sentinel when the file cannot be read or transcribed.
+ *
+ * @param {object} params
+ * @param {string} params.path - Path of the audio file; placed verbatim in the prompt
+ *   and its directory is passed to the provider via `addDirs`.
+ * @param {*} [params.address] - Used only as log context.
+ * @param {*} [params.externalMsgId] - Used only as log context.
+ * @returns {Promise<string|null>} The cleaned transcript, or null when transcription
+ *   is disabled, the reply cleans up to nothing, or anything inside the try block throws.
+ */
+export async function transcribeAudioFile(params) {
+    const { enabled } = config.audio.transcription;
+    if (!enabled) {
+        return null;
+    }
+    try {
+        const provider = getProvider();
+        const prompt = [
+            'Transcribe the audio file at this exact path.',
+            '',
+            params.path,
+            '',
+            'Return only the spoken transcript text.',
+            `If the file is not readable or cannot be transcribed, return exactly ${UNTRANSCRIBABLE}.`,
+            'Do not answer the speaker. Do not summarize. Do not add labels, markdown, or commentary.',
+        ].join('\n');
+        const { reply } = await provider.runTask({
+            input: prompt,
+            caller: 'audio-transcription',
+            mode: 'read-only',
+            lane: 'background',
+            includeSystemPrompt: false,
+            addDirs: [dirname(params.path), config.storage.mediaDir],
+        });
+        const transcript = cleanupTranscript(reply);
+        if (!transcript) {
+            return null;
+        }
+        logger.info({
+            provider: provider.name,
+            address: params.address,
+            externalMsgId: params.externalMsgId,
+            chars: transcript.length,
+        }, 'audio transcribed');
+        return transcript;
+    }
+    catch (error) {
+        // Best effort: a failure is logged at warn level and reported as "no transcript".
+        logger.warn({
+            err: error,
+            provider: config.ai.provider,
+            address: params.address,
+            externalMsgId: params.externalMsgId,
+        }, 'audio transcription failed');
+        return null;
+    }
+}
+/**
+ * Normalise a provider reply into bare transcript text.
+ *
+ * Strips surrounding whitespace, a leading ``` or ```text fence, a trailing
+ * ``` fence, and a leading "transcript:" label (case-insensitive).
+ *
+ * @param {string} reply - Raw reply text from the provider.
+ * @returns {string|null} The remaining text, or null when nothing is left or
+ *   the text is exactly the UNTRANSCRIBABLE sentinel.
+ */
+function cleanupTranscript(reply) {
+    const trimmed = reply.trim();
+    if (!trimmed) {
+        return null;
+    }
+    const unfenced = trimmed
+        .replace(/^```(?:text)?\s*/i, '')
+        .replace(/\s*```$/i, '')
+        .trim();
+    const body = unfenced.replace(/^transcript:\s*/i, '').trim();
+    // An empty remainder short-circuits before the sentinel comparison.
+    return body && body !== UNTRANSCRIBABLE ? body : null;
+}

package/dist/config.js CHANGED Viewed

@@ -38,6 +38,21 @@ const ConfigSchema = z.object({
         provider: z.enum(['claude', 'codex', 'grok']).default('claude'),
     })
         .default({ provider: 'claude' }),
+    audio: z
+        .object({
+        transcription: z
+            .object({
+            enabled: z.boolean().default(true),
+        })
+            .default({
+            enabled: true,
+        }),
+    })
+        .default({
+        transcription: {
+            enabled: true,
+        },
+    }),
     claude: z.object({
         model: z.string(),
         personalityFile: z.string(),

package/dist/gateway/ingest.js CHANGED Viewed

@@ -2,6 +2,7 @@ import { unlink } from 'fs/promises';
 import { resolve } from 'path';
 import { getProvider } from '../ai/providers.js';
 import { getSession } from '../ai/sessions.js';
+import { transcribeAudioFile } from '../audio/transcription.js';
 import { config } from '../config.js';
 import { personIdForAddress } from '../db/identity-sync.js';
 import { estimate as estimateJob } from '../estimates/index.js';
@@ -50,6 +51,24 @@ function buildImageGenRoutingContract() {
         `Reply briefly and emit [ASYNC: Generate the requested image using current chat context. Save final files under ${outboxPath}/. Follow-up reply must include one [IMAGE: /absolute/path] tag per final image, or say: Image job failed before producing a file.]`,
     ].join('\n');
 }
+/**
+ * Decide whether an incoming message's audio should be transcribed.
+ *
+ * @param {object} params
+ * @param {{mediaType?: string}|null|undefined} params.media - Media info, if any.
+ * @param {*} params.respond - Truthy when a response will be produced.
+ * @param {*} params.triggerMode - Any value other than 'off' allows transcription.
+ * @param {*} params.selfChat - Truthy values bypass the triggerMode check.
+ * @returns {boolean} True only for audio media on a message being responded to,
+ *   and then only when selfChat is truthy or triggerMode is not 'off'.
+ */
+function shouldTranscribeAudio(params) {
+    const isAudio = params.media?.mediaType === 'audio';
+    if (!isAudio || !params.respond) {
+        return false;
+    }
+    return params.selfChat ? true : params.triggerMode !== 'off';
+}
+/**
+ * Combine the text already on a message with the transcript of its audio.
+ *
+ * Non-string arguments (for example null or undefined) are treated as empty
+ * rather than throwing on `.trim()`, so a message without text still gets
+ * its transcript.
+ *
+ * @param {string|null|undefined} text - Existing message text.
+ * @param {string|null|undefined} transcript - Transcript of the audio.
+ * @returns {string|null|undefined} `text` unchanged when the transcript is blank;
+ *   the trimmed transcript alone when `text` is blank; otherwise the trimmed text,
+ *   a blank line, an "[Audio transcript]" marker line, and the trimmed transcript.
+ */
+function mergeAudioTranscript(text, transcript) {
+    const cleanedTranscript = typeof transcript === 'string' ? transcript.trim() : '';
+    const cleanedText = typeof text === 'string' ? text.trim() : '';
+    if (!cleanedTranscript) {
+        // Nothing to add: hand back the caller's value untouched (not trimmed).
+        return text;
+    }
+    if (!cleanedText) {
+        return cleanedTranscript;
+    }
+    return `${cleanedText}\n\n[Audio transcript]\n${cleanedTranscript}`;
+}
 export async function processIncomingMessage(incoming, opts = {}) {
     const stored = toStored(incoming);
     const ageMs = Date.now() - stored.timestamp * 1000;
@@ -118,6 +137,26 @@ export async function processIncomingMessage(incoming, opts = {}) {
         stored.mediaPath = media.mediaPath;
         stored.mediaMime = media.mediaMime;
     }
+    const originalMediaText = stored.text;
+    let audioTranscript = null;
+    const transcribeThisAudio = shouldTranscribeAudio({
+        media,
+        respond: decision.respond,
+        triggerMode: decision.triggerMode,
+        selfChat: incoming.selfChat,
+    }) && media;
+    if (transcribeThisAudio) {
+        audioTranscript = await transcribeAudioFile({
+            path: transcribeThisAudio.mediaPath,
+            mime: transcribeThisAudio.mediaMime,
+            address: incoming.address,
+            externalMsgId: incoming.externalMsgId,
+        });
+        if (audioTranscript) {
+            stored.text = mergeAudioTranscript(stored.text, audioTranscript);
+            logCtx.text = stored.text.slice(0, 80);
+        }
+    }
     await append(stored);
     if (!decision.respond) {
         logger.info(logCtx, 'message captured, silent');
@@ -168,7 +207,7 @@ export async function processIncomingMessage(incoming, opts = {}) {
     const existingSession = getSession(stored.jid, getProvider().name);
     let userContent = stored.text;
     if (media) {
-        userContent = mediaPromptTag(media, stored.text);
+        userContent = mediaPromptTag(media, originalMediaText, audioTranscript);
     }
     const memoryPreamble = buildMemoryPreamble({
         jid: stored.jid,

package/dist/memory/preamble.js CHANGED Viewed

@@ -28,6 +28,7 @@ function buildCoreQueueContract(outboxPath) {
         `Media: [IMAGE|VIDEO|AUDIO|DOCUMENT: /absolute/path] from ${outboxPath}/`,
         'Memory: [DIGEST: reason], [JOURNAL:slug - note], [JOURNAL-NEW:slug - purpose]',
         'Time: [REMIND: YYYY-MM-DD HH:MM - text], [CRON: expr SAY|PROMPT|ASYNC|BROWSER - body]',
+        'Jobs: check jobs/<name>/job.json first; run/create self-contained jobs/<name>/job.sh installers when useful.',
         'Threads: THREAD-* for active open loops shown in [Live threads]. Full grammar in tag docs.',
     ].join('\n');
 }

package/dist/store/media.js CHANGED Viewed

@@ -106,7 +106,7 @@ export async function downloadAndSave(msg, jid) {
         return null;
     }
 }
-export function mediaPromptTag(info, caption) {
+export function mediaPromptTag(info, caption, transcript) {
     const label = info.mediaType === 'image'
         ? 'an image'
         : info.mediaType === 'video'
@@ -116,12 +116,17 @@ export function mediaPromptTag(info, caption) {
                 : info.mediaType === 'document'
                     ? 'a document'
                     : 'a sticker';
+    const hasTranscript = info.mediaType === 'audio' && !!transcript?.trim();
     const lines = [
         `[User sent ${label}: ${info.mediaPath}]`,
-        `Read this file to see what the user sent.`,
+        hasTranscript
+            ? 'Transcript provided below; use it as the spoken content.'
+            : 'Read this file to see what the user sent.',
     ];
-    if (caption)
-        lines.push(`Caption: "${caption}"`);
+    if (caption.trim())
+        lines.push(`Caption: "${caption.trim()}"`);
+    if (hasTranscript)
+        lines.push(`Transcript: "${transcript.trim()}"`);
     return lines.join('\n');
 }
 /**

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@c4t4/heyamigo",
-  "version": "0.10.6",
+  "version": "0.11.0",
   "description": "WhatsApp and Telegram AI bot powered by Claude, Codex, or Grok with long-term memory, browser control, and role-based access",
   "type": "module",
   "main": "dist/index.js",