@c4t4/heyamigo 0.11.2 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/config.example.json +11 -0
- package/dist/channels/baileys.js +10 -1
- package/dist/config.js +21 -0
- package/dist/gateway/ingest.js +13 -0
- package/dist/gateway/outgoing.js +30 -0
- package/dist/voice/elevenlabs.js +90 -0
- package/dist/voice/request.js +14 -0
- package/package.json +1 -1
|
@@ -39,6 +39,17 @@
|
|
|
39
39
|
}
|
|
40
40
|
},
|
|
41
41
|
|
|
42
|
+
"voice": {
|
|
43
|
+
"enabled": false,
|
|
44
|
+
"provider": "elevenlabs",
|
|
45
|
+
"apiKeyEnv": "ELEVENLABS_API_KEY",
|
|
46
|
+
"voiceId": "",
|
|
47
|
+
"modelId": "eleven_multilingual_v2",
|
|
48
|
+
"outputFormat": "mp3_44100_128",
|
|
49
|
+
"maxChars": 1200,
|
|
50
|
+
"timeoutMs": 30000
|
|
51
|
+
},
|
|
52
|
+
|
|
42
53
|
"claude": {
|
|
43
54
|
"model": "claude-opus-4-7",
|
|
44
55
|
"personalityFile": "./config/personalities/sharp.md",
|
package/dist/channels/baileys.js
CHANGED
|
@@ -61,6 +61,11 @@ function requireFile(path) {
|
|
|
61
61
|
throw new PermanentChannelError(`media file unreadable: ${path} (${err.message})`, err);
|
|
62
62
|
}
|
|
63
63
|
}
|
|
64
|
+
/**
 * Reports whether an outbound message is an audio message whose media file
 * name starts with `voice-`.
 *
 * @param {{ kind: string, mediaPath?: string }} msg - Outbound message to inspect.
 * @returns {boolean} `true` only for `kind === 'audio'` with a non-empty
 *   `mediaPath` whose base name begins with `voice-`.
 */
function isGeneratedVoiceReply(msg) {
    if (msg.kind !== 'audio') {
        return false;
    }
    if (!msg.mediaPath) {
        return false;
    }
    // Only the file name is inspected, so a `voice-` directory does not count.
    const fileName = basename(msg.mediaPath);
    return fileName.startsWith('voice-');
}
|
|
64
69
|
// Map a Baileys send error onto our transient/permanent classification.
|
|
65
70
|
// Network/connection issues → transient. Anything else (invalid jid,
|
|
66
71
|
// payload, etc.) → permanent.
|
|
@@ -120,7 +125,11 @@ async function sendOne(sock, jid, msg) {
|
|
|
120
125
|
throw new PermanentChannelError('audio outbound missing mediaPath');
|
|
121
126
|
}
|
|
122
127
|
const { buf } = requireFile(msg.mediaPath);
|
|
123
|
-
return sock.sendMessage(jid, {
|
|
128
|
+
return sock.sendMessage(jid, {
|
|
129
|
+
audio: buf,
|
|
130
|
+
mimetype: mimeFor(msg.mediaPath, msg.mediaMime),
|
|
131
|
+
ptt: isGeneratedVoiceReply(msg),
|
|
132
|
+
}, quoteOpts);
|
|
124
133
|
}
|
|
125
134
|
case 'document': {
|
|
126
135
|
if (!msg.mediaPath) {
|
package/dist/config.js
CHANGED
|
@@ -53,6 +53,27 @@ const ConfigSchema = z.object({
|
|
|
53
53
|
enabled: true,
|
|
54
54
|
},
|
|
55
55
|
}),
|
|
56
|
+
voice: z
|
|
57
|
+
.object({
|
|
58
|
+
enabled: z.boolean().default(false),
|
|
59
|
+
provider: z.enum(['elevenlabs']).default('elevenlabs'),
|
|
60
|
+
apiKeyEnv: z.string().default('ELEVENLABS_API_KEY'),
|
|
61
|
+
voiceId: z.string().default(''),
|
|
62
|
+
modelId: z.string().default('eleven_multilingual_v2'),
|
|
63
|
+
outputFormat: z.string().default('mp3_44100_128'),
|
|
64
|
+
maxChars: z.number().int().positive().default(1200),
|
|
65
|
+
timeoutMs: z.number().int().positive().default(30000),
|
|
66
|
+
})
|
|
67
|
+
.default({
|
|
68
|
+
enabled: false,
|
|
69
|
+
provider: 'elevenlabs',
|
|
70
|
+
apiKeyEnv: 'ELEVENLABS_API_KEY',
|
|
71
|
+
voiceId: '',
|
|
72
|
+
modelId: 'eleven_multilingual_v2',
|
|
73
|
+
outputFormat: 'mp3_44100_128',
|
|
74
|
+
maxChars: 1200,
|
|
75
|
+
timeoutMs: 30000,
|
|
76
|
+
}),
|
|
56
77
|
claude: z.object({
|
|
57
78
|
model: z.string(),
|
|
58
79
|
personalityFile: z.string(),
|
package/dist/gateway/ingest.js
CHANGED
|
@@ -13,6 +13,7 @@ import { enqueueOutbound } from '../queue/outbound.js';
|
|
|
13
13
|
import { mediaPromptTag } from '../store/media.js';
|
|
14
14
|
import { append } from '../store/messages.js';
|
|
15
15
|
import { getDailyTokens } from '../store/usage.js';
|
|
16
|
+
import { wantsVoiceReply } from '../voice/request.js';
|
|
16
17
|
import { checkAccess, discoverAddressGroupIfNew, getLimitsForUser, getRoleForContext, } from '../wa/whitelist.js';
|
|
17
18
|
import { buildInitPayload, buildRecentContext } from './bootstrap.js';
|
|
18
19
|
import { tryCommand } from './commands.js';
|
|
@@ -69,6 +70,13 @@ function mergeAudioTranscript(text, transcript) {
|
|
|
69
70
|
return cleanedTranscript;
|
|
70
71
|
return `${cleanedText}\n\n[Audio transcript]\n${cleanedTranscript}`;
|
|
71
72
|
}
|
|
73
|
+
/**
 * Builds the instruction block that is appended to the model input when a
 * voice reply was requested.
 *
 * @returns {string} Three newline-separated lines: a marker header, the
 *   reason, and the speaking-style guidance.
 */
function buildVoiceReplyContract() {
    const header = '[Voice reply requested]';
    const reason = 'The user asked for a spoken/voice reply.';
    const guidance = 'Write the reply as concise natural speech. Do not mention text-to-speech or audio generation.';
    return `${header}\n${reason}\n${guidance}`;
}
|
|
72
80
|
export async function processIncomingMessage(incoming, opts = {}) {
|
|
73
81
|
const stored = toStored(incoming);
|
|
74
82
|
const ageMs = Date.now() - stored.timestamp * 1000;
|
|
@@ -241,10 +249,14 @@ export async function processIncomingMessage(incoming, opts = {}) {
|
|
|
241
249
|
senderPersonId: actorPersonId ?? undefined,
|
|
242
250
|
});
|
|
243
251
|
const jobKind = est?.kind ?? null;
|
|
252
|
+
const replyWithVoice = wantsVoiceReply(stored.text);
|
|
244
253
|
let input = `${memoryPreamble}\n\n---\n\n${core}`;
|
|
245
254
|
if (est?.kind === 'image-gen') {
|
|
246
255
|
input = `${input}\n\n---\n\n${buildImageGenRoutingContract()}`;
|
|
247
256
|
}
|
|
257
|
+
if (replyWithVoice) {
|
|
258
|
+
input = `${input}\n\n---\n\n${buildVoiceReplyContract()}`;
|
|
259
|
+
}
|
|
248
260
|
logger.info({ ...logCtx, resume: !!existingSession, trigger: triggerReason }, 'message captured, enqueuing');
|
|
249
261
|
const job = {
|
|
250
262
|
jid: stored.jid,
|
|
@@ -257,6 +269,7 @@ export async function processIncomingMessage(incoming, opts = {}) {
|
|
|
257
269
|
fromMe: stored.fromMe,
|
|
258
270
|
allowedTools: role.tools,
|
|
259
271
|
allowedTags: role.tags,
|
|
272
|
+
replyWithVoice,
|
|
260
273
|
};
|
|
261
274
|
if (est) {
|
|
262
275
|
enqueueOutbound({
|
package/dist/gateway/outgoing.js
CHANGED
|
@@ -5,6 +5,7 @@ import { formatAddress, jidToAddress } from '../db/address.js';
|
|
|
5
5
|
import { logger } from '../logger.js';
|
|
6
6
|
import { addressForJob } from '../queue/job-address.js';
|
|
7
7
|
import { enqueueOutbound } from '../queue/outbound.js';
|
|
8
|
+
import { synthesizeVoiceReply } from '../voice/elevenlabs.js';
|
|
8
9
|
import { detectMediaType } from '../wa/sender.js';
|
|
9
10
|
// Matches [FILE: path], [IMAGE: path], [VIDEO: path], [AUDIO: path], [DOCUMENT: path]
|
|
10
11
|
const FILE_TAG_RE = /\[(?:FILE|IMAGE|VIDEO|AUDIO|DOCUMENT):\s*([^\]]+)\]/gi;
|
|
@@ -145,6 +146,35 @@ export async function handleReply(job, result, _originalMsg) {
|
|
|
145
146
|
const enqueuePiece = (input) => {
|
|
146
147
|
enqueueOutbound({ ...input, idempotencyKey: `${baseKey}-${pieceIdx++}` });
|
|
147
148
|
};
|
|
149
|
+
if (job.replyWithVoice && text && files.length === 0) {
|
|
150
|
+
const voice = await synthesizeVoiceReply(text);
|
|
151
|
+
if (voice) {
|
|
152
|
+
enqueuePiece({
|
|
153
|
+
address,
|
|
154
|
+
kind: 'audio',
|
|
155
|
+
mediaPath: voice.path,
|
|
156
|
+
mediaMime: voice.mime,
|
|
157
|
+
mediaBytes: voice.bytes,
|
|
158
|
+
});
|
|
159
|
+
for (const card of result.jobCards ?? []) {
|
|
160
|
+
enqueueOutbound({
|
|
161
|
+
address,
|
|
162
|
+
kind: 'text',
|
|
163
|
+
text: card.text,
|
|
164
|
+
idempotencyKey: card.idempotencyKey,
|
|
165
|
+
});
|
|
166
|
+
}
|
|
167
|
+
logger.info({
|
|
168
|
+
jid: job.jid,
|
|
169
|
+
files: 1,
|
|
170
|
+
chars: text.length,
|
|
171
|
+
pieces: pieceIdx,
|
|
172
|
+
cards: result.jobCards?.length ?? 0,
|
|
173
|
+
voice: true,
|
|
174
|
+
}, 'reply enqueued for outbound');
|
|
175
|
+
return;
|
|
176
|
+
}
|
|
177
|
+
}
|
|
148
178
|
// Files first. Caption goes on the single-file-with-short-text case,
|
|
149
179
|
// matching pre-refactor behavior.
|
|
150
180
|
for (let i = 0; i < files.length; i++) {
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
import { randomUUID } from 'crypto';
|
|
2
|
+
import { mkdir, stat, writeFile } from 'fs/promises';
|
|
3
|
+
import { resolve } from 'path';
|
|
4
|
+
import { config } from '../config.js';
|
|
5
|
+
import { logger } from '../logger.js';
|
|
6
|
+
const ELEVENLABS_TTS_BASE_URL = 'https://api.elevenlabs.io/v1/text-to-speech';
|
|
7
|
+
/**
 * Maps an output-format identifier (for example `mp3_44100_128`) to the file
 * extension and MIME type used for the synthesized file.
 *
 * Matching is by case-sensitive prefix; the first matching entry wins.
 *
 * @param {string} outputFormat - Output format identifier.
 * @returns {{ ext: string, mime: string }} Extension (without dot) and MIME
 *   type; `bin` / `application/octet-stream` when no prefix matches.
 */
function outputMeta(outputFormat) {
    // [format prefix, file extension, MIME type]
    const knownFormats = [
        ['mp3', 'mp3', 'audio/mpeg'],
        ['opus', 'opus', 'audio/opus'],
        ['wav', 'wav', 'audio/wav'],
        ['pcm', 'pcm', 'audio/L16'],
        ['ulaw', 'ulaw', 'audio/basic'],
        ['alaw', 'alaw', 'audio/basic'],
    ];
    const match = knownFormats.find(([prefix]) => outputFormat.startsWith(prefix));
    if (match === undefined) {
        return { ext: 'bin', mime: 'application/octet-stream' };
    }
    const [, ext, mime] = match;
    return { ext, mime };
}
|
|
22
|
+
/**
 * Picks a fresh file path for a synthesized voice file, creating the
 * `storage/outbox` directory under the current working directory if needed.
 *
 * @param {string} outputFormat - Output format identifier, used to choose the
 *   file extension and MIME type.
 * @returns {Promise<{ path: string, mime: string }>} Absolute path of the
 *   (not yet written) file and its MIME type.
 */
async function outboxVoicePath(outputFormat) {
    const { ext, mime } = outputMeta(outputFormat);
    const outboxDir = resolve(process.cwd(), 'storage/outbox');
    await mkdir(outboxDir, { recursive: true });
    // Timestamp plus UUID keeps names unique.
    const fileName = `voice-${Date.now()}-${randomUUID()}.${ext}`;
    return { path: resolve(outboxDir, fileName), mime };
}
|
|
31
|
+
/**
 * Synthesizes `text` to speech through the ElevenLabs text-to-speech endpoint
 * and stores the audio in the outbox directory.
 *
 * The function is best-effort: every failure mode is logged and reported as
 * `null` instead of throwing.
 *
 * Returns `null` when:
 * - voice replies are disabled or the provider is not `elevenlabs`;
 * - the API key environment variable or the voice id is missing;
 * - the text is empty after trimming or longer than `voice.maxChars`;
 * - the request fails, times out after `voice.timeoutMs`, returns a non-2xx
 *   status, or returns an empty audio payload.
 *
 * @param {string} text - Reply text to speak.
 * @returns {Promise<{ path: string, mime: string, bytes: number } | null>}
 *   Location, MIME type and size of the written audio file, or `null`.
 */
export async function synthesizeVoiceReply(text) {
    const voice = config.voice;
    if (!voice.enabled) {
        return null;
    }
    if (voice.provider !== 'elevenlabs') {
        return null;
    }
    const apiKey = process.env[voice.apiKeyEnv];
    if (!apiKey) {
        logger.warn({ apiKeyEnv: voice.apiKeyEnv }, 'voice reply skipped; API key env var is not set');
        return null;
    }
    // Validate and use the same trimmed value, so stray whitespace in the
    // configured id never ends up percent-encoded in the request URL.
    const voiceId = voice.voiceId.trim();
    if (!voiceId) {
        logger.warn('voice reply skipped; voice.voiceId is not configured');
        return null;
    }
    // Treat a non-string input like empty text instead of throwing.
    const cleaned = typeof text === 'string' ? text.trim() : '';
    if (!cleaned) {
        return null;
    }
    if (cleaned.length > voice.maxChars) {
        logger.warn({ chars: cleaned.length, maxChars: voice.maxChars }, 'voice reply skipped; text is too long');
        return null;
    }
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), voice.timeoutMs);
    // Do not keep the process alive just for this timer.
    timeout.unref();
    try {
        const url = new URL(`${ELEVENLABS_TTS_BASE_URL}/${encodeURIComponent(voiceId)}`);
        url.searchParams.set('output_format', voice.outputFormat);
        const res = await fetch(url, {
            method: 'POST',
            headers: {
                'content-type': 'application/json',
                'xi-api-key': apiKey,
            },
            body: JSON.stringify({
                text: cleaned,
                model_id: voice.modelId,
            }),
            signal: controller.signal,
        });
        if (!res.ok) {
            const body = await res.text().catch(() => '');
            // Cap the logged error body so a large response cannot flood the log.
            logger.warn({ status: res.status, body: body.slice(0, 500) }, 'voice reply synthesis failed');
            return null;
        }
        const buffer = Buffer.from(await res.arrayBuffer());
        if (buffer.length === 0) {
            // A 2xx with no audio would otherwise be stored and returned as a
            // zero-byte voice file.
            logger.warn({ status: res.status }, 'voice reply synthesis failed; empty audio payload');
            return null;
        }
        const file = await outboxVoicePath(voice.outputFormat);
        await writeFile(file.path, buffer);
        // The written size equals the buffer size, so no extra stat() is needed.
        const bytes = buffer.length;
        logger.info({ path: file.path, chars: cleaned.length, bytes }, 'voice reply synthesized');
        return { path: file.path, mime: file.mime, bytes };
    }
    catch (err) {
        logger.warn({ err, timeout: err?.name === 'AbortError' ? voice.timeoutMs : undefined }, 'voice reply synthesis failed');
        return null;
    }
    finally {
        clearTimeout(timeout);
    }
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
// Case-insensitive patterns that mark a message as asking for a spoken reply.
// The first four target English phrasing, the last two Spanish phrasing.
const VOICE_REQUEST_PATTERNS = [
    // Verb followed within 40 characters by a voice/audio word ("reply ... in voice").
    /\b(?:reply|respond|answer|send|say|speak|talk)\b.{0,40}\b(?:voice|audio|spoken|out loud|aloud)\b/i,
    // Voice/audio word followed within 40 characters by a reply noun ("voice ... message").
    /\b(?:voice|audio|spoken)\b.{0,40}\b(?:reply|response|answer|message|note)\b/i,
    /\b(?:send|reply with|respond with)\b.{0,20}\b(?:a )?(?:voice note|voice message|audio message)\b/i,
    // NOTE(review): the polite prefix is optional, so a bare "speak" also
    // matches (e.g. "do you speak Spanish?") — confirm this breadth is intended.
    /\b(?:can you|could you|please)?\s*(?:speak|say it out loud|talk to me)\b/i,
    /\b(?:responde|contestame|contesta|habla|dilo)\b.{0,40}\b(?:voz|audio|hablado)\b/i,
    /\b(?:mensaje|nota|respuesta)\b.{0,40}\b(?:de voz|en audio)\b/i,
];
/**
 * Decides whether a message asks for a spoken/voice reply.
 *
 * @param {string | null | undefined} text - Raw message text. Non-string
 *   values (for example a message without any text) are treated as "no".
 * @returns {boolean} `true` when any voice-request pattern matches the
 *   trimmed text, otherwise `false`.
 */
export function wantsVoiceReply(text) {
    // Guard first: calling .trim() on undefined/null would throw.
    if (typeof text !== 'string') {
        return false;
    }
    const cleaned = text.trim();
    if (!cleaned) {
        return false;
    }
    // None of the patterns use the g/y flags, so .test() is stateless here.
    return VOICE_REQUEST_PATTERNS.some((re) => re.test(cleaned));
}
|
package/package.json
CHANGED