@agfpd/voice-connect 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +72 -0
- package/bin/peer-voice-http.mjs +20 -0
- package/bin/peer-voice-mcp.mjs +18 -0
- package/package.json +61 -0
- package/src/apikey.mjs +61 -0
- package/src/audio.mjs +112 -0
- package/src/config.mjs +75 -0
- package/src/configfile.mjs +38 -0
- package/src/engines/f5.mjs +111 -0
- package/src/engines/gemini.mjs +199 -0
- package/src/engines/gptaudio.mjs +230 -0
- package/src/engines/mlxwhisper.mjs +70 -0
- package/src/engines/speaches.mjs +69 -0
- package/src/engines/supertonic.mjs +177 -0
- package/src/home.mjs +15 -0
- package/src/http.mjs +252 -0
- package/src/jobs.mjs +95 -0
- package/src/langsplit.mjs +129 -0
- package/src/profile.mjs +165 -0
- package/src/providers.mjs +210 -0
- package/src/ref.mjs +157 -0
- package/src/router.mjs +91 -0
- package/src/ruaccent.mjs +114 -0
- package/src/ruaccent_stress.py +66 -0
- package/src/server.mjs +278 -0
- package/src/stress.mjs +25 -0
- package/src/stt.mjs +48 -0
- package/src/synthlog.mjs +46 -0
- package/src/voice.mjs +201 -0
- package/src/worker.mjs +120 -0
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Middle engine — F5-TTS, served on the RTX 4090 box (Windows PC, 192.168.0.141).
|
|
3
|
+
*
|
|
4
|
+
* Sits between Gemini (primary, cloud) and Supertonic (local, offline floor):
|
|
5
|
+
* a quality fallback with LIVE prosody — better than Supertonic's flat 66M
|
|
6
|
+
* model — for when Gemini's free tier is exhausted (10/day). The service is a
|
|
7
|
+
* single POST that returns a wav; we normalize it to canonical PCM via the same
|
|
8
|
+
* ffmpeg path as Supertonic (audio.mjs).
|
|
9
|
+
*
|
|
10
|
+
* Contract (verified live 2026-06-01):
|
|
11
|
+
* POST http://192.168.0.141:8183/synthesize
|
|
12
|
+
* {text, nfe_step:32, speed:1.0, ref_audio_b64?, ref_text?}
|
|
13
|
+
* → HTTP 200, audio/wav (RIFF PCM s16le, mono, 24 kHz — our canonical rate)
|
|
14
|
+
*
|
|
15
|
+
* Per-peer voice (6b): when an optional `ref` is passed (a base64 wav sample of
|
|
16
|
+
* the peer's voice + its transcript), F5 clones THAT voice, so the peer sounds
|
|
17
|
+
* like itself on an overflow. Without a ref, F5 uses its built-in default voice.
|
|
18
|
+
* This client only SENDS the ref — minting/caching it is ref.mjs's job, on the
|
|
19
|
+
* Gemini path (a ref can't be made here, where Gemini is by definition down).
|
|
20
|
+
*
|
|
21
|
+
* If F5 is unavailable — network error, non-200 (incl. the service refusing for
|
|
22
|
+
* lack of free VRAM), or timeout — we throw F5UnavailableError so the ladder
|
|
23
|
+
* falls through to Supertonic. Other failures (e.g. a malformed-but-200 body)
|
|
24
|
+
* propagate as plain errors, mirroring Gemini (only the "unavailable" class
|
|
25
|
+
* triggers the fallback; real bugs surface instead of being masked).
|
|
26
|
+
*/
|
|
27
|
+
import { writeFile } from 'node:fs/promises';
|
|
28
|
+
import { accentPlus } from '../ruaccent.mjs';
|
|
29
|
+
|
|
30
|
+
/**
 * Read a numeric tunable from the environment.
 *
 * Falls back to `fallback` when the variable is unset, blank, not a finite
 * number, or not positive. Without this guard a typo such as
 * `PEER_VOICE_F5_TIMEOUT_MS=2m` becomes NaN: JSON.stringify turns NaN into
 * `null` in the request body, and setTimeout treats a NaN delay as ~1 ms, so
 * every request would abort immediately.
 *
 * @param {string} name environment variable name
 * @param {number} fallback value used when the variable is unusable
 * @returns {number}
 */
function positiveEnvNumber(name, fallback) {
  const raw = process.env[name];
  if (raw === undefined || raw.trim() === '') return fallback;
  const parsed = Number(raw);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback;
}

/** Full model name — the key under interfaces.voice for the F5 voice (6b). */
export const F5_MODEL = 'f5-tts';
/** The service's single fixed reference voice. Per-peer refs land in 6b. */
export const DEFAULT_F5_VOICE = process.env.PEER_VOICE_F5_VOICE || 'default';

const F5_URL = process.env.PEER_VOICE_F5_URL || 'http://192.168.0.141:8183/synthesize';
// nfe_step 32 / speed 1.0 — the contract defaults. Override via env.
const NFE_STEP = positiveEnvNumber('PEER_VOICE_F5_NFE_STEP', 32);
const SPEED = positiveEnvNumber('PEER_VOICE_F5_SPEED', 1.0);
// Short text synthesizes in ~1–2 s; a cold model + long text can be slower.
const TIMEOUT_MS = positiveEnvNumber('PEER_VOICE_F5_TIMEOUT_MS', 120000);
|
|
41
|
+
|
|
42
|
+
/**
 * Thrown when F5 is unavailable (network / non-200 / VRAM refusal / timeout) —
 * the trigger for falling through to Supertonic.
 */
export class F5UnavailableError extends Error {
  /**
   * @param {string} message human-readable failure description
   * @param {ErrorOptions} [options] standard Error options; pass `{ cause }` to
   *   keep the underlying error attached to this wrapper
   */
  constructor(message, options) {
    super(message, options);
    this.name = 'F5UnavailableError';
  }
}
|
|
50
|
+
|
|
51
|
+
/**
 * Synthesize one text to a wav file via the F5 service.
 *
 * The abort timer covers the WHOLE exchange — request, error-body read and
 * audio-body read — so a response that stalls after its headers arrive still
 * times out instead of hanging the ladder.
 *
 * @param {string} text
 * @param {'ru'|'en'|string} lang used only to decide Russian stress marking
 * @param {string} wavPath where to write the returned wav
 * @param {{audioB64: string, text: string}|null} [ref] per-peer voice reference
 *   (6b): a base64 wav sample + its transcript. When present, F5 clones that
 *   voice; when absent/null, F5 uses its built-in default voice.
 * @returns {Promise<string>} wavPath
 * @throws {F5UnavailableError} on network failure, timeout, non-2xx status, a
 *   body that could not be read, or an empty body — the fall-through trigger.
 */
export async function f5TTS(text, lang, wavPath, ref = null) {
  // Test hook: simulate F5 being down to exercise the F5→Supertonic fall
  // without taking the real service offline. Documented; off unless set.
  if (process.env.PEER_VOICE_SIMULATE_F5_DOWN === '1') {
    throw new F5UnavailableError('simulated F5 down (PEER_VOICE_SIMULATE_F5_DOWN=1)');
  }

  // F5 reads the '+' stress marker natively — send the raw ruaccent output
  // (NO U+0301 mapping, unlike Supertonic). Only Russian text is accented.
  const speakText = lang === 'ru' ? await accentPlus(text) : text;

  const body = { text: speakText, nfe_step: NFE_STEP, speed: SPEED };
  // Per-peer voice (6b): clone the peer's voice from its cached reference. The
  // ref_text is the transcript of the sample, NOT the text being synthesized.
  if (ref && ref.audioB64 && ref.text) {
    body.ref_audio_b64 = ref.audioB64;
    body.ref_text = ref.text;
  }

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), TIMEOUT_MS);
  // Label a transport failure: our own abort is a timeout, anything else is `fallback`.
  const failureKind = (err, fallback) =>
    err && err.name === 'AbortError' ? `timeout ${TIMEOUT_MS}ms` : fallback;

  let buf;
  try {
    let res;
    try {
      res = await fetch(F5_URL, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(body),
        signal: controller.signal,
      });
    } catch (err) {
      throw new F5UnavailableError(
        `F5 request failed (${failureKind(err, 'network')}): ${err?.message}`,
      );
    }

    if (!res.ok) {
      const detail = (await res.text().catch(() => '')).slice(0, 400);
      // Any non-200 means F5 will not serve this request — including a refusal for
      // insufficient free VRAM. Treat the whole class as "unavailable" → fall.
      throw new F5UnavailableError(`F5 HTTP ${res.status}: ${detail}`);
    }

    try {
      buf = Buffer.from(await res.arrayBuffer());
    } catch (err) {
      // A dropped connection or timeout mid-body is a transport failure, not a
      // malformed payload — it belongs to the "unavailable" class.
      throw new F5UnavailableError(
        `F5 response read failed (${failureKind(err, 'network')}): ${err?.message}`,
      );
    }
  } finally {
    clearTimeout(timer);
  }

  if (buf.length === 0) {
    throw new F5UnavailableError('F5 returned an empty body');
  }
  await writeFile(wavPath, buf);
  return wavPath;
}
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Primary engine — Gemini 3.1 Flash TTS.
|
|
3
|
+
*
|
|
4
|
+
* Recipe verified live 2026-05-31 (see MergeMind "Голосовые сообщения от
|
|
5
|
+
* агентов — дизайн"): one POST, AUDIO modality, prebuilt voice. The response
|
|
6
|
+
* carries base64 PCM s16le 24 kHz mono inline. Gemini detects language itself,
|
|
7
|
+
* so ru+en go through in a SINGLE pass — no per-language splitting needed here.
|
|
8
|
+
*/
|
|
9
|
+
import { readGeminiKey } from '../apikey.mjs';
|
|
10
|
+
|
|
11
|
+
// Model id used in the request URL; PEER_VOICE_GEMINI_MODEL replaces the
// built-in default when it is set to a non-empty value.
const MODEL = process.env.PEER_VOICE_GEMINI_MODEL
  ? process.env.PEER_VOICE_GEMINI_MODEL
  : 'gemini-3.1-flash-tts-preview';

/** Prebuilt voice used when the caller names none (chosen to fit natalya's personality). */
export const DEFAULT_GEMINI_VOICE = 'Aoede';

/** Full model name — the key under interfaces.voice for the Gemini voice. */
export const GEMINI_MODEL = MODEL;
|
|
15
|
+
|
|
16
|
+
/** Thrown when Gemini signals a rate/quota limit — the trigger for fallback. */
export class GeminiQuotaError extends Error {
  /**
   * @param {string} message human-readable failure description
   * @param {ErrorOptions} [options] standard Error options; pass `{ cause }` to
   *   keep the underlying error attached to this wrapper
   */
  constructor(message, options) {
    super(message, options);
    this.name = 'GeminiQuotaError';
  }
}
|
|
23
|
+
|
|
24
|
+
/** Thrown when no Gemini key is available at all (config gap, not a limit). */
export class GeminiNoKeyError extends Error {
  /**
   * @param {string} message human-readable failure description
   * @param {ErrorOptions} [options] standard Error options; pass `{ cause }` to
   *   keep the underlying error attached to this wrapper
   */
  constructor(message, options) {
    super(message, options);
    this.name = 'GeminiNoKeyError';
  }
}
|
|
31
|
+
|
|
32
|
+
/**
 * Thrown when Gemini finished abnormally (finishReason ≠ STOP — e.g. MAX_TOKENS,
 * SAFETY, RECITATION). The audio in such a response is partial/empty, so it must
 * NOT be returned silently as if it were complete. Treated as an engine-failure
 * class: in `auto` it triggers fallback to F5/Supertonic; under a forced
 * `engine:'gemini'` it surfaces as a hard error instead of a truncated buffer.
 */
export class GeminiTruncatedError extends Error {
  /**
   * @param {string} message human-readable failure description
   * @param {ErrorOptions} [options] standard Error options; pass `{ cause }` to
   *   keep the underlying error attached to this wrapper
   */
  constructor(message, options) {
    super(message, options);
    this.name = 'GeminiTruncatedError';
  }
}
|
|
46
|
+
|
|
47
|
+
/**
 * Thrown when Gemini cannot serve at all — region block (HTTP 400
 * FAILED_PRECONDITION "User location is not supported", seen live 2026-06-12
 * with the host VPN off), auth (401/403), 5xx, or a network failure. Mirrors
 * the GptAudioUnavailableError posture: the whole "can't serve this" class
 * triggers the fall to the next rung instead of killing the call — Supertonic
 * is the OFFLINE floor, so a dead cloud rung must never surface as a hard
 * error in `auto`.
 */
export class GeminiUnavailableError extends Error {
  /**
   * @param {string} message human-readable failure description
   * @param {ErrorOptions} [options] standard Error options; pass `{ cause }` to
   *   keep the underlying error attached to this wrapper
   */
  constructor(message, options) {
    super(message, options);
    this.name = 'GeminiUnavailableError';
  }
}
|
|
62
|
+
|
|
63
|
+
/**
 * finishReason value that marks a complete generation; any other value that is
 * present is treated as an abnormal stop.
 */
const FINISH_OK = 'STOP';
|
|
65
|
+
|
|
66
|
+
/**
 * Compose the text sent to Gemini for one synthesis call.
 *
 * Gemini TTS takes its delivery guidance (tone / emotion / tempo / accent) as a
 * natural-language directive placed in front of the text, e.g.
 * "Say cheerfully: Have a wonderful day!". When a non-blank `style` is given it
 * is trimmed and prefixed as `<style>: <text>`; otherwise the text is returned
 * untouched. The unstyled `text` stays the canonical spoken content — only the
 * synthesis prompt carries the directive.
 *
 * @param {string} text words to be spoken
 * @param {string} [style] delivery directive
 * @returns {string} the prompt to send
 */
export function geminiPrompt(text, style) {
  if (!style) {
    return text;
  }
  const directive = String(style).trim();
  if (!directive) {
    return text;
  }
  return `${directive}: ${text}`;
}
|
|
85
|
+
|
|
86
|
+
/**
 * Pull the audio out of a parsed generateContent response, enforcing a clean
 * finish. Shared by the live path and the PEER_VOICE_SIMULATE_GEMINI_FINISH hook
 * so the truncation check is exercised by the exact same code in both.
 *
 * A finishReason that is PRESENT and ≠ STOP is an abnormal stop (truncation /
 * safety / recitation) → GeminiTruncatedError, even if partial inlineData is
 * attached. An absent finishReason is tolerated (treated as OK) so a well-formed
 * audio response that simply omits the field is not rejected.
 *
 * The audio is taken from the first part that carries `inlineData.data`, so a
 * response whose first part is something else (e.g. a text part) still yields
 * its audio.
 *
 * @param {object} data parsed JSON from generateContent
 * @returns {{pcm: Buffer, finishReason: string|null}}
 * @throws {GeminiTruncatedError} when finishReason is present and not STOP
 * @throws {Error} when no part of the first candidate carries audio
 */
export function parseGeminiResponse(data) {
  const cand = data?.candidates?.[0];
  const finishReason = cand?.finishReason ?? null;
  if (finishReason && finishReason !== FINISH_OK) {
    throw new GeminiTruncatedError(
      `Gemini finished abnormally (finishReason=${finishReason}) — audio would be ` +
        `truncated/empty; refusing to return a partial buffer.`,
    );
  }

  const parts = cand?.content?.parts;
  const audioPart = Array.isArray(parts)
    ? parts.find((part) => part?.inlineData?.data)
    : undefined;
  const b64 = audioPart?.inlineData?.data;
  if (!b64) {
    // JSON.stringify(undefined) is undefined, not a string — coerce before slicing
    // so a missing payload reports this error rather than a TypeError.
    throw new Error(
      `Gemini returned no audio inlineData: ${String(JSON.stringify(data)).slice(0, 300)}`,
    );
  }
  return { pcm: Buffer.from(b64, 'base64'), finishReason };
}
|
|
116
|
+
|
|
117
|
+
/**
 * Synthesize one text to canonical PCM (s16le, 24 kHz, mono).
 *
 * The request is bounded by a timeout (PEER_VOICE_GEMINI_TIMEOUT_MS, default
 * 600000 ms) that covers the response body as well, and the API key is sent in
 * the `x-goog-api-key` header rather than the URL so it does not end up in
 * request URLs recorded by proxies or logs.
 *
 * @param {string} text
 * @param {string} voice prebuilt voice name (default Aoede)
 * @param {string} [style] delivery directive (tone/emotion/tempo) — prefixed to
 *   the prompt as Gemini's natural-language style control.
 * @returns {Promise<{pcm: Buffer, finishReason: string|null}>} raw PCM + how the
 *   generation stopped (STOP on a clean pass; abnormal reasons throw).
 * @throws {GeminiQuotaError} on HTTP 429 or a RESOURCE_EXHAUSTED/quota body
 * @throws {GeminiNoKeyError} when no API key is configured
 * @throws {GeminiUnavailableError} on network failure, timeout, or any other non-2xx
 * @throws {GeminiTruncatedError} when the generation stopped abnormally
 */
export async function geminiTTS(text, voice = DEFAULT_GEMINI_VOICE, style = undefined) {
  // Test hook: simulate a quota limit without spending a real call, to exercise
  // the auto-fallback routing live. Documented; off unless explicitly set.
  if (process.env.PEER_VOICE_SIMULATE_GEMINI_429 === '1') {
    throw new GeminiQuotaError('simulated Gemini 429 (PEER_VOICE_SIMULATE_GEMINI_429=1)');
  }

  // Test hook: simulate the can't-serve class (region block / 5xx / network)
  // to exercise the unavailable→fall routing offline, same posture as the 429 hook.
  if (process.env.PEER_VOICE_SIMULATE_GEMINI_UNAVAILABLE === '1') {
    throw new GeminiUnavailableError(
      'simulated Gemini unavailable (PEER_VOICE_SIMULATE_GEMINI_UNAVAILABLE=1)',
    );
  }

  // Test hook: simulate a given finishReason WITHOUT a real call, so the
  // truncation guard (and that STOP still passes) is verifiable offline. The
  // synthetic response runs through the SAME parseGeminiResponse as the live
  // path. e.g. PEER_VOICE_SIMULATE_GEMINI_FINISH=MAX_TOKENS → GeminiTruncatedError.
  const simFinish = process.env.PEER_VOICE_SIMULATE_GEMINI_FINISH;
  if (simFinish) {
    return parseGeminiResponse({
      candidates: [{
        finishReason: simFinish,
        content: { parts: [{ inlineData: { data: Buffer.from('simulated-pcm').toString('base64') } }] },
      }],
    });
  }

  const key = readGeminiKey();
  if (!key) {
    throw new GeminiNoKeyError(
      'GEMINI_API_KEY not found in env or shell rc files (~/.zshrc ...).',
    );
  }

  const body = {
    contents: [{ parts: [{ text: geminiPrompt(text, style) }] }],
    generationConfig: {
      responseModalities: ['AUDIO'],
      speechConfig: { voiceConfig: { prebuiltVoiceConfig: { voiceName: voice } } },
    },
  };

  // One overall cap; an unset or unusable env value falls back to the default.
  const rawTimeout = Number(process.env.PEER_VOICE_GEMINI_TIMEOUT_MS);
  const timeoutMs = Number.isFinite(rawTimeout) && rawTimeout > 0 ? rawTimeout : 600000;
  const isTimeout = (err) => err?.name === 'TimeoutError' || err?.name === 'AbortError';

  let res;
  try {
    res = await fetch(
      `https://generativelanguage.googleapis.com/v1beta/models/${MODEL}:generateContent`,
      {
        method: 'POST',
        headers: { 'Content-Type': 'application/json', 'x-goog-api-key': key },
        body: JSON.stringify(body),
        signal: AbortSignal.timeout(timeoutMs),
      },
    );
  } catch (err) {
    const why = isTimeout(err) ? `timeout ${timeoutMs}ms` : 'network';
    throw new GeminiUnavailableError(`Gemini request failed (${why}): ${err?.message}`);
  }

  if (!res.ok) {
    const detail = (await res.text().catch(() => '')).slice(0, 400);
    // 429 / RESOURCE_EXHAUSTED → quota; this is the fallback trigger.
    if (res.status === 429 || /RESOURCE_EXHAUSTED|quota/i.test(detail)) {
      throw new GeminiQuotaError(`Gemini quota/limit (HTTP ${res.status}): ${detail}`);
    }
    // Region block / auth / 5xx — the whole can't-serve class → fall to the
    // next rung (same posture as gptaudio.mjs).
    throw new GeminiUnavailableError(`Gemini HTTP ${res.status}: ${detail}`);
  }

  let data;
  try {
    data = await res.json();
  } catch (err) {
    // A timeout while the body is still arriving is a can't-serve failure; a
    // malformed body (SyntaxError) is a real bug and propagates unchanged.
    if (isTimeout(err)) {
      throw new GeminiUnavailableError(`Gemini response read failed (timeout ${timeoutMs}ms): ${err?.message}`);
    }
    throw err;
  }
  // Enforce a clean finishReason before trusting the buffer — a MAX_TOKENS /
  // SAFETY stop carries partial-or-empty audio that must NOT pass as complete.
  return parseGeminiResponse(data);
}
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Second-rung engine — OpenAI gpt-audio, served over OpenRouter.
|
|
3
|
+
*
|
|
4
|
+
* Sits between Gemini (primary, direct Google) and F5 (RTX 4090 box): a cloud
|
|
5
|
+
* quality fallback for when the direct Gemini key is exhausted (free tier 10/day)
|
|
6
|
+
* BEFORE degrading to the local rungs. Multilingual in one pass (ru+en+fr — Artur
|
|
7
|
+
* picked it after a live demo, 2026-06-01), so it is tried for ALL languages,
|
|
8
|
+
* exactly like Gemini, ahead of the language-specific F5/Supertonic split.
|
|
9
|
+
*
|
|
10
|
+
* Why NOT Gemini-via-OpenRouter: verified live 2026-06-01 that OpenRouter routes
|
|
11
|
+
* `google/gemini-3.1-flash-tts-preview` to Google but no provider serves its
|
|
12
|
+
* audio output (chat/completions → 404 "no endpoints support audio"; /audio/speech
|
|
13
|
+
* → persistent 503 "no provider"). That rung would be dead. gpt-audio is a real,
|
|
14
|
+
* served audio model — but a DIFFERENT engine/voice, not "Gemini through another
|
|
15
|
+
* provider".
|
|
16
|
+
*
|
|
17
|
+
* Contract (verified live 2026-06-01, recipe from boris/Artur + confirmed here):
|
|
18
|
+
* POST https://openrouter.ai/api/v1/chat/completions
|
|
19
|
+
* { model:"openai/gpt-audio", modalities:["text","audio"],
|
|
20
|
+
* audio:{ voice, format:"pcm16" }, stream:true,
|
|
21
|
+
* messages:[{ role:"user", content:<verbatim instruction + text> }] }
|
|
22
|
+
* → text/event-stream; audio arrives as base64 fragments in
|
|
23
|
+
* choices[].delta.audio.data. Concatenating the fragments yields the full
|
|
24
|
+
* base64 of canonical PCM (s16le, 24 kHz, mono) — decoded once, it feeds
|
|
25
|
+
* straight into encodePcmToOgg (same canonical format as Gemini/Supertonic).
|
|
26
|
+
*
|
|
27
|
+
* It is a chat model, so it must be INSTRUCTED to voice the text verbatim (not
|
|
28
|
+
* answer it). Style (HOW to speak) is folded into that same instruction.
|
|
29
|
+
*
|
|
30
|
+
* Failures throw a typed error so the ladder falls through to F5:
|
|
31
|
+
* - GptAudioNoKeyError — no OPENROUTER_API_KEY (config gap)
|
|
32
|
+
* - GptAudioQuotaError — 429 / rate-limit / quota
|
|
33
|
+
* - GptAudioUnavailableError — auth (401/403), 5xx / 503 no-provider, network,
|
|
34
|
+
* timeout, or a 200 that carried no audio. The whole "can't serve this"
|
|
35
|
+
* class → fall. Other (unexpected) errors propagate so real bugs surface.
|
|
36
|
+
*/
|
|
37
|
+
import { readOpenRouterKey } from '../apikey.mjs';
|
|
38
|
+
|
|
39
|
+
/**
 * Read a numeric tunable from the environment.
 *
 * Falls back to `fallback` when the variable is unset, blank, not a finite
 * number, or not positive. A NaN or zero delay would make setTimeout fire
 * almost immediately and abort every request.
 *
 * @param {string} name environment variable name
 * @param {number} fallback value used when the variable is unusable
 * @returns {number}
 */
function positiveEnvNumber(name, fallback) {
  const raw = process.env[name];
  if (raw === undefined || raw.trim() === '') return fallback;
  const parsed = Number(raw);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback;
}

const MODEL = process.env.PEER_VOICE_GPTAUDIO_MODEL || 'openai/gpt-audio';
/** Full model name — the key under interfaces.voice for the gpt-audio voice. */
export const GPTAUDIO_MODEL = MODEL;
/** Built-in default voice (one of OpenRouter's 13: alloy, echo, fable, onyx,
 * nova, shimmer, coral, verse, ballad, ash, sage, marin, cedar). Per-peer voice
 * comes from interfaces.voice["openai/gpt-audio"] when set. */
export const DEFAULT_GPTAUDIO_VOICE = process.env.PEER_VOICE_GPTAUDIO_VOICE || 'alloy';

const OPENROUTER_URL =
  process.env.PEER_VOICE_OPENROUTER_URL || 'https://openrouter.ai/api/v1/chat/completions';
// Streaming completion can run minutes for long text (the async worker path).
// One overall cap; env-tunable for ops/tests.
const TIMEOUT_MS = positiveEnvNumber('PEER_VOICE_GPTAUDIO_TIMEOUT_MS', 600000);
|
|
52
|
+
|
|
53
|
+
/** No OPENROUTER_API_KEY configured (config gap, not a limit). */
export class GptAudioNoKeyError extends Error {
  /**
   * @param {string} message human-readable failure description
   * @param {ErrorOptions} [options] standard Error options; pass `{ cause }` to
   *   keep the underlying error attached to this wrapper
   */
  constructor(message, options) {
    super(message, options);
    this.name = 'GptAudioNoKeyError';
  }
}
|
|
60
|
+
|
|
61
|
+
/** OpenRouter signalled a rate/quota limit — like Gemini's quota, a fall trigger. */
export class GptAudioQuotaError extends Error {
  /**
   * @param {string} message human-readable failure description
   * @param {ErrorOptions} [options] standard Error options; pass `{ cause }` to
   *   keep the underlying error attached to this wrapper
   */
  constructor(message, options) {
    super(message, options);
    this.name = 'GptAudioQuotaError';
  }
}
|
|
68
|
+
|
|
69
|
+
/**
 * gpt-audio could not serve the request (auth / 5xx / 503 no-provider / network /
 * timeout / empty audio) — the trigger for falling through to F5.
 */
export class GptAudioUnavailableError extends Error {
  /**
   * @param {string} message human-readable failure description
   * @param {ErrorOptions} [options] standard Error options; pass `{ cause }` to
   *   keep the underlying error attached to this wrapper
   */
  constructor(message, options) {
    super(message, options);
    this.name = 'GptAudioUnavailableError';
  }
}
|
|
77
|
+
|
|
78
|
+
/**
 * Compose the single user message sent to gpt-audio.
 *
 * gpt-audio is a chat model, so the message opens with an instruction to VOICE
 * the text verbatim rather than reply to it. A non-blank `style` (HOW to speak)
 * is trimmed and appended to that instruction; the text itself follows after a
 * blank line under a `Text:` label.
 *
 * @param {string} text the words to speak
 * @param {string} [style] delivery directive (tone/emotion/tempo/accent)
 * @returns {string}
 */
export function buildGptAudioPrompt(text, style) {
  const directive = style ? String(style).trim() : '';
  const instruction = [
    'Read the following text aloud, exactly as written, word for word. Do not ',
    'translate it, do not answer it, do not add or remove anything — only voice it.',
  ];
  if (directive) {
    instruction.push(` Speak in this style: ${directive}.`);
  }
  return `${instruction.join('')}\n\nText:\n${text}`;
}
|
|
93
|
+
|
|
94
|
+
/**
 * Extract the base64 audio fragment carried by one SSE `data:` line.
 *
 * Returns null for anything that is not an audio-bearing data line: a
 * non-string, a line without the `data:` prefix, an empty payload, the `[DONE]`
 * sentinel, unparseable JSON, or a chunk whose first choice has no non-empty
 * string at `delta.audio.data`. Pure — the unit-testable core of the stream
 * reassembly.
 *
 * @param {string} line one line of the event stream
 * @returns {string|null}
 */
export function audioB64FromDataLine(line) {
  if (typeof line !== 'string') return null;
  if (!line.startsWith('data:')) return null;

  const payload = line.slice('data:'.length).trim();
  if (payload === '' || payload === '[DONE]') return null;

  let chunk;
  try {
    chunk = JSON.parse(payload);
  } catch {
    return null;
  }

  const fragment = chunk?.choices?.[0]?.delta?.audio?.data;
  if (typeof fragment !== 'string' || fragment.length === 0) return null;
  return fragment;
}
|
|
114
|
+
|
|
115
|
+
/**
 * Join every base64 audio fragment found in a COMPLETE SSE text blob. Pure;
 * used by tests and as the reassembly contract.
 *
 * The fragments are concatenated and meant to be decoded ONCE, rather than
 * decoded one by one: per the original author's live verification, fragments
 * carry no mid-stream padding, so the concatenation is itself valid base64 of
 * the whole PCM even if a fragment boundary falls mid-unit.
 *
 * @param {string} sseText
 * @returns {string} concatenated base64 (decode once with Buffer.from(_, 'base64'))
 */
export function collectAudioB64(sseText) {
  return String(sseText)
    .split(/\r?\n/)
    .map((line) => audioB64FromDataLine(line))
    .filter(Boolean)
    .join('');
}
|
|
132
|
+
|
|
133
|
+
/**
 * Describe a provider error carried by one SSE `data:` line, or null when the
 * line reports none. OpenRouter's streaming docs describe mid-stream failures
 * as a data event with a top-level `error` field; that is the shape looked for.
 * @param {string} line one line of the event stream
 * @returns {string|null}
 */
function streamErrorFromDataLine(line) {
  // Cheap pre-filter: base64 audio cannot contain a quote character, so audio
  // chunks skip the second JSON.parse.
  if (typeof line !== 'string' || !line.startsWith('data:') || !line.includes('"error"')) {
    return null;
  }
  let json;
  try {
    json = JSON.parse(line.slice(5).trim());
  } catch {
    return null;
  }
  const failure = json?.error;
  if (!failure) return null;
  const message = typeof failure === 'string' ? failure : failure.message ?? JSON.stringify(failure);
  return `provider error mid-stream: ${String(message).slice(0, 400)}`;
}

/**
 * Stream the SSE body via a reader, accumulating audio base64 line by line.
 *
 * Throws when the stream carries a provider error event: the audio collected up
 * to that point is incomplete and must not be returned as a full utterance.
 * On any failure the reader is cancelled so the stream is not left locked.
 *
 * @param {ReadableStream<Uint8Array>} body response body of the streaming call
 * @returns {Promise<string>} concatenated base64 of all audio fragments
 * @throws {Error} on a read failure or a mid-stream provider error event
 */
async function streamAudioB64(body) {
  const reader = body.getReader();
  const decoder = new TextDecoder();
  const fragments = [];
  let pending = '';

  const consume = (line) => {
    const failure = streamErrorFromDataLine(line);
    if (failure) throw new Error(failure);
    const frag = audioB64FromDataLine(line);
    if (frag) fragments.push(frag);
  };

  try {
    for (;;) {
      const { value, done } = await reader.read();
      if (done) break;
      pending += decoder.decode(value, { stream: true });
      let nl;
      while ((nl = pending.indexOf('\n')) >= 0) {
        consume(pending.slice(0, nl));
        pending = pending.slice(nl + 1);
      }
    }
    pending += decoder.decode(); // flush any multibyte tail
    for (const line of pending.split(/\r?\n/)) {
      consume(line);
    }
  } catch (err) {
    // Best-effort: stop the transfer; a failure to cancel must not mask `err`.
    await reader.cancel().catch(() => {});
    throw err;
  }
  return fragments.join('');
}
|
|
157
|
+
|
|
158
|
+
/**
 * Synthesize one text to canonical PCM (s16le, 24 kHz, mono) via gpt-audio.
 *
 * One abort timer bounds the whole exchange — request, error-body read and the
 * audio stream — and is cleared in a single `finally`, so a stalled error
 * response cannot hang the call.
 *
 * @param {string} text
 * @param {string} [voice] one of OpenRouter's 13 gpt-audio voices
 * @param {string} [style] delivery directive (folded into the prompt)
 * @returns {Promise<{pcm: Buffer}>}
 * @throws {GptAudioNoKeyError} when no OpenRouter key is configured
 * @throws {GptAudioQuotaError} on HTTP 429 or a rate-limit/quota body
 * @throws {GptAudioUnavailableError} on network failure, timeout, any other
 *   non-2xx, a failed stream, or a stream that carried no audio
 */
export async function gptAudioTTS(text, voice = DEFAULT_GPTAUDIO_VOICE, style = undefined) {
  // Test hook: simulate gpt-audio being unavailable to exercise the
  // gpt-audio→F5 fall offline. Documented; off unless explicitly set.
  if (process.env.PEER_VOICE_SIMULATE_GPTAUDIO_DOWN === '1') {
    throw new GptAudioUnavailableError('simulated gpt-audio down (PEER_VOICE_SIMULATE_GPTAUDIO_DOWN=1)');
  }

  const key = readOpenRouterKey();
  if (!key) {
    throw new GptAudioNoKeyError(
      'OPENROUTER_API_KEY not found in env or shell rc files (~/.zshrc ...).',
    );
  }

  const body = {
    model: MODEL,
    modalities: ['text', 'audio'],
    audio: { voice, format: 'pcm16' },
    stream: true,
    messages: [{ role: 'user', content: buildGptAudioPrompt(text, style) }],
  };

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), TIMEOUT_MS);
  // Label a transport failure: our own abort is a timeout, anything else is `fallback`.
  const failureKind = (err, fallback) =>
    err && err.name === 'AbortError' ? `timeout ${TIMEOUT_MS}ms` : fallback;

  try {
    let res;
    try {
      res = await fetch(OPENROUTER_URL, {
        method: 'POST',
        headers: {
          Authorization: `Bearer ${key}`,
          'Content-Type': 'application/json',
        },
        body: JSON.stringify(body),
        signal: controller.signal,
      });
    } catch (err) {
      throw new GptAudioUnavailableError(
        `gpt-audio request failed (${failureKind(err, 'network')}): ${err?.message}`,
      );
    }

    if (!res.ok) {
      // The timer is still armed here, so a stalled error body aborts and
      // yields an empty detail instead of blocking.
      const detail = (await res.text().catch(() => '')).slice(0, 400);
      if (res.status === 429 || /RESOURCE_EXHAUSTED|rate.?limit|quota/i.test(detail)) {
        throw new GptAudioQuotaError(`gpt-audio quota/limit (HTTP ${res.status}): ${detail}`);
      }
      // 401/403 (bad key), 5xx, 503 "no provider" — the whole can't-serve class → fall.
      throw new GptAudioUnavailableError(`gpt-audio HTTP ${res.status}: ${detail}`);
    }

    let b64;
    try {
      b64 = await streamAudioB64(res.body);
    } catch (err) {
      throw new GptAudioUnavailableError(
        `gpt-audio stream failed (${failureKind(err, 'stream')}): ${err?.message}`,
      );
    }

    if (!b64) {
      throw new GptAudioUnavailableError('gpt-audio returned no audio data in the stream');
    }
    return { pcm: Buffer.from(b64, 'base64') };
  } finally {
    clearTimeout(timer);
  }
}
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* mlx-whisper STT engine — the local, offline transcription floor (Apple-silicon
|
|
3
|
+
* Whisper via the `mlx_whisper` CLI). Same invocation telegram-runtime uses for
|
|
4
|
+
* its fallback tier: write a .txt to a temp --output-dir, then read it back.
|
|
5
|
+
*
|
|
6
|
+
* Command override: PEER_VOICE_STT_FALLBACK_CMD (default `mlx_whisper`, empty to
|
|
7
|
+
* disable); PEER_VOICE_STT_FALLBACK_MODEL for the model flag.
|
|
8
|
+
*/
|
|
9
|
+
import { execFile, spawnSync } from 'node:child_process';
import { readFileSync, mkdirSync, existsSync, rmSync } from 'node:fs';
import { join, basename } from 'node:path';
import { tmpdir } from 'node:os';
import { randomBytes } from 'node:crypto';
|
|
14
|
+
|
|
15
|
+
/** Thrown when the local fallback is disabled or fails. */
export class MlxWhisperError extends Error {
  /**
   * @param {string} message human-readable failure description
   * @param {ErrorOptions} [options] standard Error options; pass `{ cause }` to
   *   keep the underlying error attached to this wrapper
   */
  constructor(message, options) {
    super(message, options);
    this.name = 'MlxWhisperError';
  }
}
|
|
22
|
+
|
|
23
|
+
/**
 * Resolve the local STT command from PEER_VOICE_STT_FALLBACK_CMD.
 * Unset → `mlx_whisper`; set → the trimmed value, where an empty result means
 * the fallback is disabled.
 * @returns {string}
 */
export function fallbackCmd() {
  const { PEER_VOICE_STT_FALLBACK_CMD: configured } = process.env;
  if (configured === undefined) {
    return 'mlx_whisper';
  }
  return configured.trim();
}
|
|
28
|
+
|
|
29
|
+
/**
 * Pure: build the transcript path expected inside `outDir` for `audioPath` —
 * the audio file's base name with its final extension replaced by `.txt`.
 * @param {string} outDir directory the transcript is written to
 * @param {string} audioPath path of the audio file being transcribed
 * @returns {string}
 */
export function txtOutPath(outDir, audioPath) {
  const stem = basename(audioPath).replace(/\.[^.]+$/, '');
  return join(outDir, `${stem}.txt`);
}
|
|
33
|
+
|
|
34
|
+
/**
 * Transcribe an audio file with the local mlx_whisper CLI.
 *
 * The CLI is asked to write a .txt transcript into a private temp directory,
 * which is read back and then removed. The child process runs asynchronously
 * (execFile) so a long transcription does not block the event loop of the
 * process hosting this module.
 *
 * @param {string} audioPath
 * @param {{model?:string, lang?:string, prompt?:string, timeoutMs?:number}} [o]
 *   `timeoutMs` defaults to 30000 and the child is given four times that budget.
 * @returns {Promise<{text: string}>}
 * @throws {MlxWhisperError} when the fallback is disabled, the command cannot be
 *   started, times out, exits non-zero, or yields no/empty transcript
 */
export async function mlxWhisperSTT(audioPath, { model, lang, prompt, timeoutMs } = {}) {
  const cmd = fallbackCmd();
  if (!cmd) throw new MlxWhisperError('local STT fallback disabled (PEER_VOICE_STT_FALLBACK_CMD empty)');

  const outDir = join(tmpdir(), `vc-stt-${randomBytes(6).toString('hex')}`);
  try {
    mkdirSync(outDir, { recursive: true });
    const args = [audioPath, '--output-format', 'txt', '--output-dir', outDir];
    const m = model ?? (process.env.PEER_VOICE_STT_FALLBACK_MODEL || '').trim();
    if (m) args.push('--model', m);
    if (lang) args.push('--language', lang);
    if (prompt) args.push('--initial-prompt', prompt);

    const budgetMs = (timeoutMs || 30000) * 4;
    // Always resolve: the outcome is classified below rather than via rejection.
    const { error, stderr } = await new Promise((resolve) => {
      execFile(cmd, args, { encoding: 'utf8', timeout: budgetMs }, (err, _stdout, errOut) => {
        resolve({ error: err, stderr: errOut });
      });
    });

    if (error) {
      const tail = String(stderr || '').slice(0, 200);
      if (typeof error.code === 'number') {
        // Numeric code: the process ran and exited with that status.
        throw new MlxWhisperError(`${cmd} exit ${error.code}: ${tail}`);
      }
      if (typeof error.code === 'string') {
        // String code (e.g. ENOENT): the command could not be run as requested.
        throw new MlxWhisperError(`${cmd} spawn error: ${error.message}`);
      }
      if (error.killed) {
        throw new MlxWhisperError(`${cmd} spawn error: timed out after ${budgetMs}ms`);
      }
      throw new MlxWhisperError(`${cmd} exit signal: ${tail}`);
    }

    const txtPath = txtOutPath(outDir, audioPath);
    if (!existsSync(txtPath)) throw new MlxWhisperError('fallback produced no transcript file');
    const text = readFileSync(txtPath, 'utf8').trim();
    if (!text) throw new MlxWhisperError('fallback produced empty transcript');
    return { text };
  } finally {
    try {
      rmSync(outDir, { recursive: true, force: true });
    } catch {
      /* best-effort cleanup */
    }
  }
}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* speaches STT engine — an OpenAI-compatible POST /v1/audio/transcriptions
|
|
3
|
+
* service (speaches / faster-whisper-server). This is the same contract
|
|
4
|
+
* telegram-runtime already drives (multipart file + model/language/prompt +
|
|
5
|
+
* response_format=text), lifted here so STT lives in voice-connect's core.
|
|
6
|
+
*
|
|
7
|
+
* The endpoint is operator-configured (no universal default): PEER_VOICE_STT_ENDPOINT
|
|
8
|
+
* is the full /v1/audio/transcriptions URL. When unset, the provider is simply
|
|
9
|
+
* not applicable and the cascade falls to the local mlx-whisper floor.
|
|
10
|
+
*/
|
|
11
|
+
import { readFileSync } from 'node:fs';
|
|
12
|
+
import { basename } from 'node:path';
|
|
13
|
+
|
|
14
|
+
/** Thrown when speaches is unreachable / not configured / errors — advances STT cascade. */
export class SpeachesUnavailableError extends Error {
  /**
   * @param {string} message human-readable failure description
   * @param {ErrorOptions} [options] standard Error options; pass `{ cause }` to
   *   keep the underlying error attached to this wrapper
   */
  constructor(message, options) {
    super(message, options);
    this.name = 'SpeachesUnavailableError';
  }
}
|
|
21
|
+
|
|
22
|
+
/**
 * Read the STT endpoint URL from PEER_VOICE_STT_ENDPOINT, trimmed.
 * Returns '' when the variable is unset or empty (→ provider not applicable).
 * @returns {string}
 */
export function sttEndpoint() {
  const configured = process.env.PEER_VOICE_STT_ENDPOINT;
  return configured ? configured.trim() : '';
}
|
|
26
|
+
|
|
27
|
+
/**
 * Read the default model name from PEER_VOICE_STT_MODEL, trimmed.
 * Returns '' when unset or empty, in which case no model field is sent and the
 * endpoint applies its own default.
 * @returns {string}
 */
export function sttModel() {
  const configured = process.env.PEER_VOICE_STT_MODEL;
  return configured ? configured.trim() : '';
}
|
|
31
|
+
|
|
32
|
+
/**
 * Pure: normalize a transcription response body. With response_format=text the
 * body is plain text, so a string is trimmed; any non-string yields ''.
 * @param {unknown} body response body as read from the endpoint
 * @returns {string}
 */
export function parseTranscription(body) {
  if (typeof body !== 'string') {
    return '';
  }
  return body.trim();
}
|
|
36
|
+
|
|
37
|
+
/**
 * Transcribe an audio file via the OpenAI-compatible endpoint.
 *
 * Sends the file as multipart form data with `response_format=text` plus the
 * optional model / language / prompt fields. Every way the endpoint can fail to
 * deliver — not configured, request error, timeout, non-2xx, unreadable body,
 * empty text — raises SpeachesUnavailableError so the STT cascade advances.
 *
 * @param {string} audioPath
 * @param {{model?:string, lang?:string, prompt?:string, timeoutMs?:number, endpoint?:string}} [o]
 *   `endpoint` overrides PEER_VOICE_STT_ENDPOINT; `timeoutMs` defaults to 30000.
 * @returns {Promise<{text: string}>}
 * @throws {SpeachesUnavailableError} when the endpoint cannot serve the request
 */
export async function speachesSTT(audioPath, { model, lang, prompt, timeoutMs, endpoint } = {}) {
  const url = endpoint || sttEndpoint();
  if (!url) throw new SpeachesUnavailableError('no STT endpoint (set PEER_VOICE_STT_ENDPOINT)');

  const form = new FormData();
  form.append('file', new File([readFileSync(audioPath)], basename(audioPath)));
  const m = model ?? sttModel();
  if (m) form.append('model', m);
  if (lang) form.append('language', lang);
  if (prompt) form.append('prompt', prompt);
  form.append('response_format', 'text');

  const describe = (err) => (err && err.message ? err.message : err);

  let res;
  try {
    res = await fetch(url, {
      method: 'POST',
      body: form,
      signal: AbortSignal.timeout(timeoutMs || 30000),
    });
  } catch (err) {
    throw new SpeachesUnavailableError(`STT endpoint request failed: ${describe(err)}`);
  }

  if (!res.ok) {
    // Keep a short excerpt of the error body so the failure is diagnosable.
    const detail = (await res.text().catch(() => '')).slice(0, 400);
    throw new SpeachesUnavailableError(`STT endpoint HTTP ${res.status}${detail ? `: ${detail}` : ''}`);
  }

  let raw;
  try {
    // The timeout signal still applies while the body streams in; a failure
    // here is a transport problem and must advance the cascade too.
    raw = await res.text();
  } catch (err) {
    throw new SpeachesUnavailableError(`STT endpoint response read failed: ${describe(err)}`);
  }

  const text = parseTranscription(raw);
  if (!text) throw new SpeachesUnavailableError('STT endpoint returned empty text');
  return { text };
}
|