npm - @c4t4/heyamigo - Versions diffs - 0.11.1 → 0.12.0 - Mend

@c4t4/heyamigo 0.11.1 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/config/config.example.json +11 -0
package/dist/channels/baileys.js +10 -1
package/dist/config.js +21 -0
package/dist/gateway/ingest.js +13 -0
package/dist/gateway/outgoing.js +30 -0
package/dist/gateway/triggers.js +1 -9
package/dist/voice/elevenlabs.js +90 -0
package/dist/voice/request.js +14 -0
package/package.json +1 -1

package/config/config.example.json CHANGED Viewed

@@ -39,6 +39,17 @@
     }
   },
+  "voice": {
+    "enabled": false,
+    "provider": "elevenlabs",
+    "apiKeyEnv": "ELEVENLABS_API_KEY",
+    "voiceId": "",
+    "modelId": "eleven_multilingual_v2",
+    "outputFormat": "mp3_44100_128",
+    "maxChars": 1200,
+    "timeoutMs": 30000
+  },
   "claude": {
     "model": "claude-opus-4-7",
     "personalityFile": "./config/personalities/sharp.md",

package/dist/channels/baileys.js CHANGED Viewed

@@ -61,6 +61,11 @@ function requireFile(path) {
         throw new PermanentChannelError(`media file unreadable: ${path} (${err.message})`, err);
     }
 }
+function isGeneratedVoiceReply(msg) {
+    return (msg.kind === 'audio' &&
+        !!msg.mediaPath &&
+        basename(msg.mediaPath).startsWith('voice-'));
+}
 // Map a Baileys send error onto our transient/permanent classification.
 // Network/connection issues → transient. Anything else (invalid jid,
 // payload, etc.) → permanent.
@@ -120,7 +125,11 @@ async function sendOne(sock, jid, msg) {
                 throw new PermanentChannelError('audio outbound missing mediaPath');
             }
             const { buf } = requireFile(msg.mediaPath);
-            return sock.sendMessage(jid, { audio: buf, mimetype: mimeFor(msg.mediaPath, msg.mediaMime) }, quoteOpts);
+            return sock.sendMessage(jid, {
+                audio: buf,
+                mimetype: mimeFor(msg.mediaPath, msg.mediaMime),
+                ptt: isGeneratedVoiceReply(msg),
+            }, quoteOpts);
         }
         case 'document': {
             if (!msg.mediaPath) {

package/dist/config.js CHANGED Viewed

@@ -53,6 +53,27 @@ const ConfigSchema = z.object({
             enabled: true,
         },
     }),
+    voice: z
+        .object({
+        enabled: z.boolean().default(false),
+        provider: z.enum(['elevenlabs']).default('elevenlabs'),
+        apiKeyEnv: z.string().default('ELEVENLABS_API_KEY'),
+        voiceId: z.string().default(''),
+        modelId: z.string().default('eleven_multilingual_v2'),
+        outputFormat: z.string().default('mp3_44100_128'),
+        maxChars: z.number().int().positive().default(1200),
+        timeoutMs: z.number().int().positive().default(30000),
+    })
+        .default({
+        enabled: false,
+        provider: 'elevenlabs',
+        apiKeyEnv: 'ELEVENLABS_API_KEY',
+        voiceId: '',
+        modelId: 'eleven_multilingual_v2',
+        outputFormat: 'mp3_44100_128',
+        maxChars: 1200,
+        timeoutMs: 30000,
+    }),
     claude: z.object({
         model: z.string(),
         personalityFile: z.string(),

package/dist/gateway/ingest.js CHANGED Viewed

@@ -13,6 +13,7 @@ import { enqueueOutbound } from '../queue/outbound.js';
 import { mediaPromptTag } from '../store/media.js';
 import { append } from '../store/messages.js';
 import { getDailyTokens } from '../store/usage.js';
+import { wantsVoiceReply } from '../voice/request.js';
 import { checkAccess, discoverAddressGroupIfNew, getLimitsForUser, getRoleForContext, } from '../wa/whitelist.js';
 import { buildInitPayload, buildRecentContext } from './bootstrap.js';
 import { tryCommand } from './commands.js';
@@ -69,6 +70,13 @@ function mergeAudioTranscript(text, transcript) {
         return cleanedTranscript;
     return `${cleanedText}\n\n[Audio transcript]\n${cleanedTranscript}`;
 }
+function buildVoiceReplyContract() {
+    return [
+        '[Voice reply requested]',
+        'The user asked for a spoken/voice reply.',
+        'Write the reply as concise natural speech. Do not mention text-to-speech or audio generation.',
+    ].join('\n');
+}
 export async function processIncomingMessage(incoming, opts = {}) {
     const stored = toStored(incoming);
     const ageMs = Date.now() - stored.timestamp * 1000;
@@ -241,10 +249,14 @@ export async function processIncomingMessage(incoming, opts = {}) {
         senderPersonId: actorPersonId ?? undefined,
     });
     const jobKind = est?.kind ?? null;
+    const replyWithVoice = wantsVoiceReply(stored.text);
     let input = `${memoryPreamble}\n\n---\n\n${core}`;
     if (est?.kind === 'image-gen') {
         input = `${input}\n\n---\n\n${buildImageGenRoutingContract()}`;
     }
+    if (replyWithVoice) {
+        input = `${input}\n\n---\n\n${buildVoiceReplyContract()}`;
+    }
     logger.info({ ...logCtx, resume: !!existingSession, trigger: triggerReason }, 'message captured, enqueuing');
     const job = {
         jid: stored.jid,
@@ -257,6 +269,7 @@ export async function processIncomingMessage(incoming, opts = {}) {
         fromMe: stored.fromMe,
         allowedTools: role.tools,
         allowedTags: role.tags,
+        replyWithVoice,
     };
     if (est) {
         enqueueOutbound({

package/dist/gateway/outgoing.js CHANGED Viewed

@@ -5,6 +5,7 @@ import { formatAddress, jidToAddress } from '../db/address.js';
 import { logger } from '../logger.js';
 import { addressForJob } from '../queue/job-address.js';
 import { enqueueOutbound } from '../queue/outbound.js';
+import { synthesizeVoiceReply } from '../voice/elevenlabs.js';
 import { detectMediaType } from '../wa/sender.js';
 // Matches [FILE: path], [IMAGE: path], [VIDEO: path], [AUDIO: path], [DOCUMENT: path]
 const FILE_TAG_RE = /\[(?:FILE|IMAGE|VIDEO|AUDIO|DOCUMENT):\s*([^\]]+)\]/gi;
@@ -145,6 +146,35 @@ export async function handleReply(job, result, _originalMsg) {
     const enqueuePiece = (input) => {
         enqueueOutbound({ ...input, idempotencyKey: `${baseKey}-${pieceIdx++}` });
     };
+    if (job.replyWithVoice && text && files.length === 0) {
+        const voice = await synthesizeVoiceReply(text);
+        if (voice) {
+            enqueuePiece({
+                address,
+                kind: 'audio',
+                mediaPath: voice.path,
+                mediaMime: voice.mime,
+                mediaBytes: voice.bytes,
+            });
+            for (const card of result.jobCards ?? []) {
+                enqueueOutbound({
+                    address,
+                    kind: 'text',
+                    text: card.text,
+                    idempotencyKey: card.idempotencyKey,
+                });
+            }
+            logger.info({
+                jid: job.jid,
+                files: 1,
+                chars: text.length,
+                pieces: pieceIdx,
+                cards: result.jobCards?.length ?? 0,
+                voice: true,
+            }, 'reply enqueued for outbound');
+            return;
+        }
+    }
     // Files first. Caption goes on the single-file-with-short-text case,
     // matching pre-refactor behavior.
     for (let i = 0; i < files.length; i++) {

package/dist/gateway/triggers.js CHANGED Viewed

@@ -90,14 +90,6 @@ function phraseMatches(normalizedText, phrase) {
     const re = new RegExp(`(^| )${escapeRegex(normalizedPhrase)}($| )`, 'i');
     return re.test(normalizedText);
 }
-function wakePhraseMatches(normalizedText, phrase) {
-    const normalizedPhrase = normalizeAudioText(phrase);
-    if (!normalizedPhrase)
-        return false;
-    const wake = '(hey|hi|hello|yo|ok|okay|oye|hola)';
-    const re = new RegExp(`(^| )${wake} ${escapeRegex(normalizedPhrase)}($| )`, 'i');
-    return re.test(normalizedText);
-}
 function audioAliasMatches(transcript, aliases) {
     const normalizedTranscript = normalizeAudioText(transcript);
     if (!normalizedTranscript)
@@ -111,7 +103,7 @@ function audioAliasMatches(transcript, aliases) {
             ...(AUDIO_ALIAS_VARIANTS[normalizedAlias] ?? []),
         ]);
         for (const variant of variants) {
-            if (wakePhraseMatches(normalizedTranscript, variant)) {
+            if (phraseMatches(normalizedTranscript, variant)) {
                 return { alias, variant: normalizeAudioText(variant) };
             }
         }

package/dist/voice/elevenlabs.js ADDED Viewed

@@ -0,0 +1,90 @@
+import { randomUUID } from 'crypto';
+import { mkdir, stat, writeFile } from 'fs/promises';
+import { resolve } from 'path';
+import { config } from '../config.js';
+import { logger } from '../logger.js';
+const ELEVENLABS_TTS_BASE_URL = 'https://api.elevenlabs.io/v1/text-to-speech';
+function outputMeta(outputFormat) {
+    if (outputFormat.startsWith('mp3'))
+        return { ext: 'mp3', mime: 'audio/mpeg' };
+    if (outputFormat.startsWith('opus'))
+        return { ext: 'opus', mime: 'audio/opus' };
+    if (outputFormat.startsWith('wav'))
+        return { ext: 'wav', mime: 'audio/wav' };
+    if (outputFormat.startsWith('pcm'))
+        return { ext: 'pcm', mime: 'audio/L16' };
+    if (outputFormat.startsWith('ulaw'))
+        return { ext: 'ulaw', mime: 'audio/basic' };
+    if (outputFormat.startsWith('alaw'))
+        return { ext: 'alaw', mime: 'audio/basic' };
+    return { ext: 'bin', mime: 'application/octet-stream' };
+}
+async function outboxVoicePath(outputFormat) {
+    const meta = outputMeta(outputFormat);
+    const dir = resolve(process.cwd(), 'storage/outbox');
+    await mkdir(dir, { recursive: true });
+    return {
+        path: resolve(dir, `voice-${Date.now()}-${randomUUID()}.${meta.ext}`),
+        mime: meta.mime,
+    };
+}
+export async function synthesizeVoiceReply(text) {
+    const voice = config.voice;
+    if (!voice.enabled)
+        return null;
+    if (voice.provider !== 'elevenlabs')
+        return null;
+    const apiKey = process.env[voice.apiKeyEnv];
+    if (!apiKey) {
+        logger.warn({ apiKeyEnv: voice.apiKeyEnv }, 'voice reply skipped; API key env var is not set');
+        return null;
+    }
+    if (!voice.voiceId.trim()) {
+        logger.warn('voice reply skipped; voice.voiceId is not configured');
+        return null;
+    }
+    const cleaned = text.trim();
+    if (!cleaned)
+        return null;
+    if (cleaned.length > voice.maxChars) {
+        logger.warn({ chars: cleaned.length, maxChars: voice.maxChars }, 'voice reply skipped; text is too long');
+        return null;
+    }
+    const controller = new AbortController();
+    const timeout = setTimeout(() => controller.abort(), voice.timeoutMs);
+    timeout.unref();
+    try {
+        const url = new URL(`${ELEVENLABS_TTS_BASE_URL}/${encodeURIComponent(voice.voiceId)}`);
+        url.searchParams.set('output_format', voice.outputFormat);
+        const res = await fetch(url, {
+            method: 'POST',
+            headers: {
+                'content-type': 'application/json',
+                'xi-api-key': apiKey,
+            },
+            body: JSON.stringify({
+                text: cleaned,
+                model_id: voice.modelId,
+            }),
+            signal: controller.signal,
+        });
+        if (!res.ok) {
+            const body = await res.text().catch(() => '');
+            logger.warn({ status: res.status, body: body.slice(0, 500) }, 'voice reply synthesis failed');
+            return null;
+        }
+        const buffer = Buffer.from(await res.arrayBuffer());
+        const file = await outboxVoicePath(voice.outputFormat);
+        await writeFile(file.path, buffer);
+        const s = await stat(file.path);
+        logger.info({ path: file.path, chars: cleaned.length, bytes: s.size }, 'voice reply synthesized');
+        return { path: file.path, mime: file.mime, bytes: s.size };
+    }
+    catch (err) {
+        logger.warn({ err, timeout: err.name === 'AbortError' ? voice.timeoutMs : undefined }, 'voice reply synthesis failed');
+        return null;
+    }
+    finally {
+        clearTimeout(timeout);
+    }
+}

package/dist/voice/request.js ADDED Viewed

@@ -0,0 +1,14 @@
+const VOICE_REQUEST_PATTERNS = [
+    /\b(?:reply|respond|answer|send|say|speak|talk)\b.{0,40}\b(?:voice|audio|spoken|out loud|aloud)\b/i,
+    /\b(?:voice|audio|spoken)\b.{0,40}\b(?:reply|response|answer|message|note)\b/i,
+    /\b(?:send|reply with|respond with)\b.{0,20}\b(?:a )?(?:voice note|voice message|audio message)\b/i,
+    /\b(?:can you|could you|please)?\s*(?:speak|say it out loud|talk to me)\b/i,
+    /\b(?:responde|contestame|contesta|habla|dilo)\b.{0,40}\b(?:voz|audio|hablado)\b/i,
+    /\b(?:mensaje|nota|respuesta)\b.{0,40}\b(?:de voz|en audio)\b/i,
+];
+export function wantsVoiceReply(text) {
+    const cleaned = text.trim();
+    if (!cleaned)
+        return false;
+    return VOICE_REQUEST_PATTERNS.some((re) => re.test(cleaned));
+}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@c4t4/heyamigo",
-  "version": "0.11.1",
+  "version": "0.12.0",
   "description": "WhatsApp and Telegram AI bot powered by Claude, Codex, or Grok with long-term memory, browser control, and role-based access",
   "type": "module",
   "main": "dist/index.js",