npm - @c4t4/heyamigo - Versions diffs - 0.10.7 → 0.11.1 - Mend

@c4t4/heyamigo 0.10.7 → 0.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/config/config.example.json +6 -0
package/dist/audio/transcription.js +58 -0
package/dist/config.js +15 -0
package/dist/gateway/ingest.js +41 -1
package/dist/gateway/triggers.js +117 -0
package/dist/store/media.js +9 -4
package/package.json +1 -1

package/config/config.example.json CHANGED

@@ -33,6 +33,12 @@
     "provider": "claude"
   },
+  "audio": {
+    "transcription": {
+      "enabled": true
+    }
+  },
   "claude": {
     "model": "claude-opus-4-7",
     "personalityFile": "./config/personalities/sharp.md",

package/dist/audio/transcription.js ADDED

@@ -0,0 +1,58 @@
+import { dirname } from 'path';
+import { getProvider } from '../ai/providers.js';
+import { config } from '../config.js';
+import { logger } from '../logger.js';
+const UNTRANSCRIBABLE = '[UNTRANSCRIBABLE]';
+export async function transcribeAudioFile(params) {
+    const cfg = config.audio.transcription;
+    if (!cfg.enabled)
+        return null;
+    try {
+        const provider = getProvider();
+        const result = await provider.runTask({
+            input: [
+                'Transcribe the audio file at this exact path.',
+                '',
+                params.path,
+                '',
+                'Return only the spoken transcript text.',
+                `If the file is not readable or cannot be transcribed, return exactly ${UNTRANSCRIBABLE}.`,
+                'Do not answer the speaker. Do not summarize. Do not add labels, markdown, or commentary.',
+            ].join('\n'),
+            caller: 'audio-transcription',
+            mode: 'read-only',
+            lane: 'background',
+            includeSystemPrompt: false,
+            addDirs: [dirname(params.path), config.storage.mediaDir],
+        });
+        const text = cleanupTranscript(result.reply);
+        if (!text)
+            return null;
+        logger.info({
+            provider: provider.name,
+            address: params.address,
+            externalMsgId: params.externalMsgId,
+            chars: text.length,
+        }, 'audio transcribed');
+        return text;
+    }
+    catch (err) {
+        logger.warn({
+            err,
+            provider: config.ai.provider,
+            address: params.address,
+            externalMsgId: params.externalMsgId,
+        }, 'audio transcription failed');
+        return null;
+    }
+}
+function cleanupTranscript(reply) {
+    let text = reply.trim();
+    if (!text)
+        return null;
+    text = text.replace(/^```(?:text)?\s*/i, '').replace(/\s*```$/i, '').trim();
+    text = text.replace(/^transcript:\s*/i, '').trim();
+    if (!text || text === UNTRANSCRIBABLE)
+        return null;
+    return text;
+}

package/dist/config.js CHANGED

@@ -38,6 +38,21 @@ const ConfigSchema = z.object({
         provider: z.enum(['claude', 'codex', 'grok']).default('claude'),
     })
         .default({ provider: 'claude' }),
+    audio: z
+        .object({
+        transcription: z
+            .object({
+            enabled: z.boolean().default(true),
+        })
+            .default({
+            enabled: true,
+        }),
+    })
+        .default({
+        transcription: {
+            enabled: true,
+        },
+    }),
     claude: z.object({
         model: z.string(),
         personalityFile: z.string(),

package/dist/gateway/ingest.js CHANGED

@@ -2,6 +2,7 @@ import { unlink } from 'fs/promises';
 import { resolve } from 'path';
 import { getProvider } from '../ai/providers.js';
 import { getSession } from '../ai/sessions.js';
+import { transcribeAudioFile } from '../audio/transcription.js';
 import { config } from '../config.js';
 import { personIdForAddress } from '../db/identity-sync.js';
 import { estimate as estimateJob } from '../estimates/index.js';
@@ -50,6 +51,24 @@ function buildImageGenRoutingContract() {
         `Reply briefly and emit [ASYNC: Generate the requested image using current chat context. Save final files under ${outboxPath}/. Follow-up reply must include one [IMAGE: /absolute/path] tag per final image, or say: Image job failed before producing a file.]`,
     ].join('\n');
 }
+function shouldTranscribeAudio(params) {
+    if (params.media?.mediaType !== 'audio')
+        return false;
+    if (!params.respond)
+        return false;
+    if (params.selfChat)
+        return true;
+    return params.triggerMode !== 'off';
+}
+function mergeAudioTranscript(text, transcript) {
+    const cleanedTranscript = transcript.trim();
+    const cleanedText = text.trim();
+    if (!cleanedTranscript)
+        return text;
+    if (!cleanedText)
+        return cleanedTranscript;
+    return `${cleanedText}\n\n[Audio transcript]\n${cleanedTranscript}`;
+}
 export async function processIncomingMessage(incoming, opts = {}) {
     const stored = toStored(incoming);
     const ageMs = Date.now() - stored.timestamp * 1000;
@@ -118,6 +137,26 @@ export async function processIncomingMessage(incoming, opts = {}) {
         stored.mediaPath = media.mediaPath;
         stored.mediaMime = media.mediaMime;
     }
+    const originalMediaText = stored.text;
+    let audioTranscript = null;
+    const transcribeThisAudio = shouldTranscribeAudio({
+        media,
+        respond: decision.respond,
+        triggerMode: decision.triggerMode,
+        selfChat: incoming.selfChat,
+    }) && media;
+    if (transcribeThisAudio) {
+        audioTranscript = await transcribeAudioFile({
+            path: transcribeThisAudio.mediaPath,
+            mime: transcribeThisAudio.mediaMime,
+            address: incoming.address,
+            externalMsgId: incoming.externalMsgId,
+        });
+        if (audioTranscript) {
+            stored.text = mergeAudioTranscript(stored.text, audioTranscript);
+            logCtx.text = stored.text.slice(0, 80);
+        }
+    }
     await append(stored);
     if (!decision.respond) {
         logger.info(logCtx, 'message captured, silent');
@@ -143,6 +182,7 @@ export async function processIncomingMessage(incoming, opts = {}) {
         const trigger = checkTrigger({
             mode: decision.triggerMode,
             text: stored.text,
+            audioTranscript: audioTranscript ?? undefined,
             mentionedBot: incoming.triggerHints?.mentionedBot,
             replyToBot: incoming.triggerHints?.replyToBot,
         });
@@ -168,7 +208,7 @@ export async function processIncomingMessage(incoming, opts = {}) {
     const existingSession = getSession(stored.jid, getProvider().name);
     let userContent = stored.text;
     if (media) {
-        userContent = mediaPromptTag(media, stored.text);
+        userContent = mediaPromptTag(media, originalMediaText, audioTranscript);
     }
     const memoryPreamble = buildMemoryPreamble({
         jid: stored.jid,

package/dist/gateway/triggers.js CHANGED

@@ -1,4 +1,68 @@
 import { config } from '../config.js';
+const AUDIO_ALIAS_VARIANTS = {
+    heyamigo: [
+        'hey amigo',
+        'hey amigos',
+        'hey amego',
+        'hey amico',
+        'hey a migo',
+        'hay amigo',
+        'hi amigo',
+    ],
+    amigo: [
+        'a migo',
+        'amego',
+        'amico',
+        'amigos',
+        'amiga',
+        'migo',
+    ],
+    claude: [
+        'cloud',
+        'clawd',
+        'clawed',
+        'clod',
+        'clode',
+        'cload',
+        'clout',
+        'claut',
+        'clause',
+        'claus',
+    ],
+    clawd: [
+        'claude',
+        'cloud',
+        'clawed',
+        'clod',
+        'clode',
+        'cload',
+        'clout',
+        'claut',
+    ],
+    grok: [
+        'grock',
+        'grog',
+        'gronk',
+        'grawk',
+        'groc',
+    ],
+    codex: [
+        'code x',
+        'codec',
+        'codecs',
+        'codecks',
+        'codicks',
+        'kodeks',
+        'codacs',
+    ],
+    xai: [
+        'x ai',
+        'x a i',
+        'ex ai',
+        'ex a i',
+        'x.ai',
+    ],
+};
 function escapeRegex(s) {
     return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
 }
@@ -10,6 +74,50 @@ function aliasMatches(text, aliases) {
     }
     return null;
 }
+function normalizeAudioText(text) {
+    return text
+        .toLowerCase()
+        .normalize('NFKD')
+        .replace(/[\u0300-\u036f]/g, '')
+        .replace(/[^a-z0-9]+/g, ' ')
+        .trim()
+        .replace(/\s+/g, ' ');
+}
+function phraseMatches(normalizedText, phrase) {
+    const normalizedPhrase = normalizeAudioText(phrase);
+    if (!normalizedPhrase)
+        return false;
+    const re = new RegExp(`(^| )${escapeRegex(normalizedPhrase)}($| )`, 'i');
+    return re.test(normalizedText);
+}
+function wakePhraseMatches(normalizedText, phrase) {
+    const normalizedPhrase = normalizeAudioText(phrase);
+    if (!normalizedPhrase)
+        return false;
+    const wake = '(hey|hi|hello|yo|ok|okay|oye|hola)';
+    const re = new RegExp(`(^| )${wake} ${escapeRegex(normalizedPhrase)}($| )`, 'i');
+    return re.test(normalizedText);
+}
+function audioAliasMatches(transcript, aliases) {
+    const normalizedTranscript = normalizeAudioText(transcript);
+    if (!normalizedTranscript)
+        return null;
+    for (const alias of aliases) {
+        const normalizedAlias = normalizeAudioText(alias);
+        if (phraseMatches(normalizedTranscript, normalizedAlias)) {
+            return { alias, variant: normalizedAlias };
+        }
+        const variants = new Set([
+            ...(AUDIO_ALIAS_VARIANTS[normalizedAlias] ?? []),
+        ]);
+        for (const variant of variants) {
+            if (wakePhraseMatches(normalizedTranscript, variant)) {
+                return { alias, variant: normalizeAudioText(variant) };
+            }
+        }
+    }
+    return null;
+}
 export function checkTrigger(params) {
     const { mode, text } = params;
     if (mode === 'off')
@@ -27,6 +135,15 @@ export function checkTrigger(params) {
     const alias = aliasMatches(text, config.triggers.aliases);
     if (alias)
         return { triggered: true, reason: `alias:${alias}` };
+    const audioAlias = params.audioTranscript
+        ? audioAliasMatches(params.audioTranscript, config.triggers.aliases)
+        : null;
+    if (audioAlias) {
+        return {
+            triggered: true,
+            reason: `audio-alias:${audioAlias.alias}~${audioAlias.variant}`,
+        };
+    }
     // 2. Channel-provided mention signal, e.g. WhatsApp @mention or
     // Telegram bot username mention.
     if (params.mentionedBot)

package/dist/store/media.js CHANGED

@@ -106,7 +106,7 @@ export async function downloadAndSave(msg, jid) {
         return null;
     }
 }
-export function mediaPromptTag(info, caption) {
+export function mediaPromptTag(info, caption, transcript) {
     const label = info.mediaType === 'image'
         ? 'an image'
         : info.mediaType === 'video'
@@ -116,12 +116,17 @@ export function mediaPromptTag(info, caption) {
                 : info.mediaType === 'document'
                     ? 'a document'
                     : 'a sticker';
+    const hasTranscript = info.mediaType === 'audio' && !!transcript?.trim();
     const lines = [
         `[User sent ${label}: ${info.mediaPath}]`,
-        `Read this file to see what the user sent.`,
+        hasTranscript
+            ? 'Transcript provided below; use it as the spoken content.'
+            : 'Read this file to see what the user sent.',
     ];
-    if (caption)
-        lines.push(`Caption: "${caption}"`);
+    if (caption.trim())
+        lines.push(`Caption: "${caption.trim()}"`);
+    if (hasTranscript)
+        lines.push(`Transcript: "${transcript.trim()}"`);
     return lines.join('\n');
 }
 /**

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@c4t4/heyamigo",
-  "version": "0.10.7",
+  "version": "0.11.1",
   "description": "WhatsApp and Telegram AI bot powered by Claude, Codex, or Grok with long-term memory, browser control, and role-based access",
   "type": "module",
   "main": "dist/index.js",