@c4t4/heyamigo 0.11.1 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -39,6 +39,17 @@
39
39
  }
40
40
  },
41
41
 
42
+ "voice": {
43
+ "enabled": false,
44
+ "provider": "elevenlabs",
45
+ "apiKeyEnv": "ELEVENLABS_API_KEY",
46
+ "voiceId": "",
47
+ "modelId": "eleven_multilingual_v2",
48
+ "outputFormat": "mp3_44100_128",
49
+ "maxChars": 1200,
50
+ "timeoutMs": 30000
51
+ },
52
+
42
53
  "claude": {
43
54
  "model": "claude-opus-4-7",
44
55
  "personalityFile": "./config/personalities/sharp.md",
@@ -61,6 +61,11 @@ function requireFile(path) {
61
61
  throw new PermanentChannelError(`media file unreadable: ${path} (${err.message})`, err);
62
62
  }
63
63
  }
64
/**
 * Tells whether an outbound message should be treated as a generated voice
 * reply: it must be an audio message that carries a media path whose file
 * name begins with `voice-`.
 *
 * NOTE(review): the `voice-` prefix presumably identifies files produced by
 * the voice synthesizer — confirm against the code that writes them.
 *
 * @param {{kind: string, mediaPath?: string}} msg - Outbound message.
 * @returns {boolean} `true` only when all three conditions hold.
 */
function isGeneratedVoiceReply(msg) {
    if (msg.kind !== 'audio') {
        return false;
    }
    if (!msg.mediaPath) {
        return false;
    }
    // Only the final path segment matters; directories are ignored.
    const fileName = basename(msg.mediaPath);
    return fileName.startsWith('voice-');
}
64
69
  // Map a Baileys send error onto our transient/permanent classification.
65
70
  // Network/connection issues → transient. Anything else (invalid jid,
66
71
  // payload, etc.) → permanent.
@@ -120,7 +125,11 @@ async function sendOne(sock, jid, msg) {
120
125
  throw new PermanentChannelError('audio outbound missing mediaPath');
121
126
  }
122
127
  const { buf } = requireFile(msg.mediaPath);
123
- return sock.sendMessage(jid, { audio: buf, mimetype: mimeFor(msg.mediaPath, msg.mediaMime) }, quoteOpts);
128
+ return sock.sendMessage(jid, {
129
+ audio: buf,
130
+ mimetype: mimeFor(msg.mediaPath, msg.mediaMime),
131
+ ptt: isGeneratedVoiceReply(msg),
132
+ }, quoteOpts);
124
133
  }
125
134
  case 'document': {
126
135
  if (!msg.mediaPath) {
package/dist/config.js CHANGED
@@ -53,6 +53,27 @@ const ConfigSchema = z.object({
53
53
  enabled: true,
54
54
  },
55
55
  }),
56
+ voice: z
57
+ .object({
58
+ enabled: z.boolean().default(false),
59
+ provider: z.enum(['elevenlabs']).default('elevenlabs'),
60
+ apiKeyEnv: z.string().default('ELEVENLABS_API_KEY'),
61
+ voiceId: z.string().default(''),
62
+ modelId: z.string().default('eleven_multilingual_v2'),
63
+ outputFormat: z.string().default('mp3_44100_128'),
64
+ maxChars: z.number().int().positive().default(1200),
65
+ timeoutMs: z.number().int().positive().default(30000),
66
+ })
67
+ .default({
68
+ enabled: false,
69
+ provider: 'elevenlabs',
70
+ apiKeyEnv: 'ELEVENLABS_API_KEY',
71
+ voiceId: '',
72
+ modelId: 'eleven_multilingual_v2',
73
+ outputFormat: 'mp3_44100_128',
74
+ maxChars: 1200,
75
+ timeoutMs: 30000,
76
+ }),
56
77
  claude: z.object({
57
78
  model: z.string(),
58
79
  personalityFile: z.string(),
@@ -13,6 +13,7 @@ import { enqueueOutbound } from '../queue/outbound.js';
13
13
  import { mediaPromptTag } from '../store/media.js';
14
14
  import { append } from '../store/messages.js';
15
15
  import { getDailyTokens } from '../store/usage.js';
16
+ import { wantsVoiceReply } from '../voice/request.js';
16
17
  import { checkAccess, discoverAddressGroupIfNew, getLimitsForUser, getRoleForContext, } from '../wa/whitelist.js';
17
18
  import { buildInitPayload, buildRecentContext } from './bootstrap.js';
18
19
  import { tryCommand } from './commands.js';
@@ -69,6 +70,13 @@ function mergeAudioTranscript(text, transcript) {
69
70
  return cleanedTranscript;
70
71
  return `${cleanedText}\n\n[Audio transcript]\n${cleanedTranscript}`;
71
72
  }
73
/**
 * Builds the instruction block used when a voice reply was requested.
 * The block is three newline-separated lines: a marker, a statement that
 * the user asked for a spoken reply, and writing guidance.
 *
 * @returns {string} The contract text.
 */
function buildVoiceReplyContract() {
    const marker = '[Voice reply requested]';
    const intent = 'The user asked for a spoken/voice reply.';
    const guidance = 'Write the reply as concise natural speech. Do not mention text-to-speech or audio generation.';
    return `${marker}\n${intent}\n${guidance}`;
}
72
80
  export async function processIncomingMessage(incoming, opts = {}) {
73
81
  const stored = toStored(incoming);
74
82
  const ageMs = Date.now() - stored.timestamp * 1000;
@@ -241,10 +249,14 @@ export async function processIncomingMessage(incoming, opts = {}) {
241
249
  senderPersonId: actorPersonId ?? undefined,
242
250
  });
243
251
  const jobKind = est?.kind ?? null;
252
+ const replyWithVoice = wantsVoiceReply(stored.text);
244
253
  let input = `${memoryPreamble}\n\n---\n\n${core}`;
245
254
  if (est?.kind === 'image-gen') {
246
255
  input = `${input}\n\n---\n\n${buildImageGenRoutingContract()}`;
247
256
  }
257
+ if (replyWithVoice) {
258
+ input = `${input}\n\n---\n\n${buildVoiceReplyContract()}`;
259
+ }
248
260
  logger.info({ ...logCtx, resume: !!existingSession, trigger: triggerReason }, 'message captured, enqueuing');
249
261
  const job = {
250
262
  jid: stored.jid,
@@ -257,6 +269,7 @@ export async function processIncomingMessage(incoming, opts = {}) {
257
269
  fromMe: stored.fromMe,
258
270
  allowedTools: role.tools,
259
271
  allowedTags: role.tags,
272
+ replyWithVoice,
260
273
  };
261
274
  if (est) {
262
275
  enqueueOutbound({
@@ -5,6 +5,7 @@ import { formatAddress, jidToAddress } from '../db/address.js';
5
5
  import { logger } from '../logger.js';
6
6
  import { addressForJob } from '../queue/job-address.js';
7
7
  import { enqueueOutbound } from '../queue/outbound.js';
8
+ import { synthesizeVoiceReply } from '../voice/elevenlabs.js';
8
9
  import { detectMediaType } from '../wa/sender.js';
9
10
  // Matches [FILE: path], [IMAGE: path], [VIDEO: path], [AUDIO: path], [DOCUMENT: path]
10
11
  const FILE_TAG_RE = /\[(?:FILE|IMAGE|VIDEO|AUDIO|DOCUMENT):\s*([^\]]+)\]/gi;
@@ -145,6 +146,35 @@ export async function handleReply(job, result, _originalMsg) {
145
146
  const enqueuePiece = (input) => {
146
147
  enqueueOutbound({ ...input, idempotencyKey: `${baseKey}-${pieceIdx++}` });
147
148
  };
149
+ if (job.replyWithVoice && text && files.length === 0) {
150
+ const voice = await synthesizeVoiceReply(text);
151
+ if (voice) {
152
+ enqueuePiece({
153
+ address,
154
+ kind: 'audio',
155
+ mediaPath: voice.path,
156
+ mediaMime: voice.mime,
157
+ mediaBytes: voice.bytes,
158
+ });
159
+ for (const card of result.jobCards ?? []) {
160
+ enqueueOutbound({
161
+ address,
162
+ kind: 'text',
163
+ text: card.text,
164
+ idempotencyKey: card.idempotencyKey,
165
+ });
166
+ }
167
+ logger.info({
168
+ jid: job.jid,
169
+ files: 1,
170
+ chars: text.length,
171
+ pieces: pieceIdx,
172
+ cards: result.jobCards?.length ?? 0,
173
+ voice: true,
174
+ }, 'reply enqueued for outbound');
175
+ return;
176
+ }
177
+ }
148
178
  // Files first. Caption goes on the single-file-with-short-text case,
149
179
  // matching pre-refactor behavior.
150
180
  for (let i = 0; i < files.length; i++) {
@@ -90,14 +90,6 @@ function phraseMatches(normalizedText, phrase) {
90
90
  const re = new RegExp(`(^| )${escapeRegex(normalizedPhrase)}($| )`, 'i');
91
91
  return re.test(normalizedText);
92
92
  }
93
- function wakePhraseMatches(normalizedText, phrase) {
94
- const normalizedPhrase = normalizeAudioText(phrase);
95
- if (!normalizedPhrase)
96
- return false;
97
- const wake = '(hey|hi|hello|yo|ok|okay|oye|hola)';
98
- const re = new RegExp(`(^| )${wake} ${escapeRegex(normalizedPhrase)}($| )`, 'i');
99
- return re.test(normalizedText);
100
- }
101
93
  function audioAliasMatches(transcript, aliases) {
102
94
  const normalizedTranscript = normalizeAudioText(transcript);
103
95
  if (!normalizedTranscript)
@@ -111,7 +103,7 @@ function audioAliasMatches(transcript, aliases) {
111
103
  ...(AUDIO_ALIAS_VARIANTS[normalizedAlias] ?? []),
112
104
  ]);
113
105
  for (const variant of variants) {
114
- if (wakePhraseMatches(normalizedTranscript, variant)) {
106
+ if (phraseMatches(normalizedTranscript, variant)) {
115
107
  return { alias, variant: normalizeAudioText(variant) };
116
108
  }
117
109
  }
@@ -0,0 +1,90 @@
1
+ import { randomUUID } from 'crypto';
2
+ import { mkdir, stat, writeFile } from 'fs/promises';
3
+ import { resolve } from 'path';
4
+ import { config } from '../config.js';
5
+ import { logger } from '../logger.js';
6
+ const ELEVENLABS_TTS_BASE_URL = 'https://api.elevenlabs.io/v1/text-to-speech';
7
/**
 * Resolves the file extension and MIME type for a provider output format.
 * Matching is by prefix of the format string (e.g. `mp3_44100_128` -> mp3).
 * Unrecognized formats fall back to a generic binary type.
 *
 * @param {string} outputFormat - Output format identifier.
 * @returns {{ext: string, mime: string}} Extension (no dot) and MIME type.
 */
function outputMeta(outputFormat) {
    // Checked in order; the first matching prefix wins.
    const knownFormats = [
        ['mp3', 'mp3', 'audio/mpeg'],
        ['opus', 'opus', 'audio/opus'],
        ['wav', 'wav', 'audio/wav'],
        ['pcm', 'pcm', 'audio/L16'],
        ['ulaw', 'ulaw', 'audio/basic'],
        ['alaw', 'alaw', 'audio/basic'],
    ];
    for (const [prefix, ext, mime] of knownFormats) {
        if (outputFormat.startsWith(prefix)) {
            return { ext, mime };
        }
    }
    return { ext: 'bin', mime: 'application/octet-stream' };
}
22
/**
 * Picks a fresh file path for a synthesized voice clip inside
 * `storage/outbox` under the current working directory, creating that
 * directory (and any missing parents) first.
 *
 * The file name has the form `voice-<epoch ms>-<uuid>.<ext>`; the extension
 * and MIME type come from `outputMeta(outputFormat)`. The file itself is not
 * created here.
 *
 * @param {string} outputFormat - Output format identifier.
 * @returns {Promise<{path: string, mime: string}>} Absolute path and MIME type.
 */
async function outboxVoicePath(outputFormat) {
    const { ext, mime } = outputMeta(outputFormat);
    const outboxDir = resolve(process.cwd(), 'storage/outbox');
    await mkdir(outboxDir, { recursive: true });
    const fileName = `voice-${Date.now()}-${randomUUID()}.${ext}`;
    return { path: resolve(outboxDir, fileName), mime };
}
31
/**
 * Converts reply text to speech through the ElevenLabs text-to-speech HTTP
 * endpoint and writes the resulting audio to the outbox.
 *
 * This is best-effort: every skip or failure is logged and reported as
 * `null` so the caller can fall back to another reply form. It returns
 * `null` when:
 *   - voice is disabled or the provider is not `elevenlabs`;
 *   - the API key env var (`voice.apiKeyEnv`) or `voice.voiceId` is unset;
 *   - the trimmed text is empty or longer than `voice.maxChars`;
 *   - the HTTP request fails, times out (`voice.timeoutMs`), returns a
 *     non-2xx status, or returns an empty body;
 *   - the audio file cannot be written.
 *
 * @param {string} text - Reply text to speak.
 * @returns {Promise<{path: string, mime: string, bytes: number} | null>}
 *   Location, MIME type and size of the written audio file, or `null`.
 */
export async function synthesizeVoiceReply(text) {
    const voice = config.voice;
    if (!voice.enabled)
        return null;
    if (voice.provider !== 'elevenlabs')
        return null;
    const apiKey = process.env[voice.apiKeyEnv];
    if (!apiKey) {
        logger.warn({ apiKeyEnv: voice.apiKeyEnv }, 'voice reply skipped; API key env var is not set');
        return null;
    }
    if (!voice.voiceId.trim()) {
        logger.warn('voice reply skipped; voice.voiceId is not configured');
        return null;
    }
    const cleaned = text.trim();
    if (!cleaned)
        return null;
    if (cleaned.length > voice.maxChars) {
        logger.warn({ chars: cleaned.length, maxChars: voice.maxChars }, 'voice reply skipped; text is too long');
        return null;
    }
    // The abort signal covers both the request and reading the response body,
    // since the timer is only cleared in `finally`.
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), voice.timeoutMs);
    // A pending timeout must not by itself keep the process running.
    timeout.unref();
    try {
        const url = new URL(`${ELEVENLABS_TTS_BASE_URL}/${encodeURIComponent(voice.voiceId)}`);
        url.searchParams.set('output_format', voice.outputFormat);
        const res = await fetch(url, {
            method: 'POST',
            headers: {
                'content-type': 'application/json',
                'xi-api-key': apiKey,
            },
            body: JSON.stringify({
                text: cleaned,
                model_id: voice.modelId,
            }),
            signal: controller.signal,
        });
        if (!res.ok) {
            // Keep only a prefix of the error body so logs stay bounded.
            const body = await res.text().catch(() => '');
            logger.warn({ status: res.status, body: body.slice(0, 500) }, 'voice reply synthesis failed');
            return null;
        }
        const buffer = Buffer.from(await res.arrayBuffer());
        if (buffer.length === 0) {
            // A 2xx response with no audio would otherwise be stored and sent
            // as a zero-byte clip.
            logger.warn({ status: res.status }, 'voice reply synthesis failed; provider returned empty audio');
            return null;
        }
        const file = await outboxVoicePath(voice.outputFormat);
        await writeFile(file.path, buffer);
        // The size is already known from the buffer; no extra stat() needed.
        const bytes = buffer.length;
        logger.info({ path: file.path, chars: cleaned.length, bytes }, 'voice reply synthesized');
        return { path: file.path, mime: file.mime, bytes };
    }
    catch (err) {
        // `err` may be any thrown value, so read `name` defensively.
        logger.warn({ err, timeout: err?.name === 'AbortError' ? voice.timeoutMs : undefined }, 'voice reply synthesis failed');
        return null;
    }
    finally {
        clearTimeout(timeout);
    }
}
@@ -0,0 +1,14 @@
1
// Phrases (English and Spanish) that signal the user wants a spoken reply.
// None of these use the `g`/`y` flags, so `.test()` keeps no state between
// calls and the shared literals are safe to reuse.
// NOTE(review): the fourth pattern's leading group is optional, so a bare
// "speak" anywhere in the message matches (e.g. "do you speak Spanish?") —
// confirm that this breadth is intended.
const VOICE_REQUEST_PATTERNS = [
    /\b(?:reply|respond|answer|send|say|speak|talk)\b.{0,40}\b(?:voice|audio|spoken|out loud|aloud)\b/i,
    /\b(?:voice|audio|spoken)\b.{0,40}\b(?:reply|response|answer|message|note)\b/i,
    /\b(?:send|reply with|respond with)\b.{0,20}\b(?:a )?(?:voice note|voice message|audio message)\b/i,
    /\b(?:can you|could you|please)?\s*(?:speak|say it out loud|talk to me)\b/i,
    /\b(?:responde|contestame|contesta|habla|dilo)\b.{0,40}\b(?:voz|audio|hablado)\b/i,
    /\b(?:mensaje|nota|respuesta)\b.{0,40}\b(?:de voz|en audio)\b/i,
];
/**
 * Decides whether a message asks for a voice/audio reply by testing it
 * against `VOICE_REQUEST_PATTERNS`.
 *
 * @param {string | null | undefined} text - Message text. Anything that is
 *   not a string (for example a message with no text at all) is treated as
 *   "no voice request" instead of throwing.
 * @returns {boolean} `true` when at least one pattern matches.
 */
export function wantsVoiceReply(text) {
    if (typeof text !== 'string')
        return false;
    const cleaned = text.trim();
    if (!cleaned)
        return false;
    return VOICE_REQUEST_PATTERNS.some((re) => re.test(cleaned));
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@c4t4/heyamigo",
3
- "version": "0.11.1",
3
+ "version": "0.12.0",
4
4
  "description": "WhatsApp and Telegram AI bot powered by Claude, Codex, or Grok with long-term memory, browser control, and role-based access",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",