@agfpd/voice-connect 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,165 @@
1
+ /**
2
+ * Caller voice resolution from the IAPeer profiles.
3
+ *
4
+ * Voice is an agent INTERFACE (like its telegram bot), so it lives in the
5
+ * caller's peer profile under `interfaces.voice`, keyed by the FULL model name
6
+ * as used by the API — self-documenting and robust to model swaps:
7
+ *
8
+ * interfaces.voice = {
9
+ * "gemini-3.1-flash-tts-preview": "<voice>",
10
+ * "supertonic-3": "<voice>"
11
+ * }
12
+ *
13
+ * SOURCE OF TRUTH is the per-cwd profile `<cwd>/.iapeer/peer-profile.json` — the
14
+ * IAP code is explicit that the host index ~/.iapeer/peers-profiles.json is a
15
+ * DERIVED artifact regenerated from per-cwd profiles on connect/upsert, so it
16
+ * can lag. Voice is therefore read from the per-cwd profile; the host index is
17
+ * used only to look up the caller's stable `cwd` (the peer env carries
18
+ * PEER_PERSONALITY but not PEER_CWD).
19
+ *
20
+ * Resolution of the voice map:
21
+ * 1. PEER_PERSONALITY → host index entry → its `cwd`.
22
+ * 2. Read `<cwd>/.iapeer/peer-profile.json` (source of truth). If it is
23
+ * readable+parseable, its `interfaces.voice` (or {} when the field is
24
+ * absent — "no voice set") is authoritative; we do NOT fall back to the
25
+ * index, which could carry a stale value.
26
+ * 3. Only if the per-cwd profile is UNAVAILABLE (missing / empty / unparseable)
27
+ * do we defensively fall back to the host index entry's `interfaces.voice`.
28
+ * 4. Otherwise {} — the caller uses the built-in defaults.
29
+ *
30
+ * Reading the field is decoupled from populating it (the operator fills profiles
31
+ * separately).
32
+ */
33
+ import { readFileSync } from 'node:fs';
34
+ import { homedir } from 'node:os';
35
+ import { join } from 'node:path';
36
+
37
/**
 * Location of the host-wide peers index.
 *
 * A non-empty PEER_VOICE_PEERS_PROFILES environment variable wins; otherwise
 * the path is `<home>/.iapeer/peers-profiles.json`.
 * @returns {string}
 */
function registryPath() {
  const fromEnv = process.env.PEER_VOICE_PEERS_PROFILES;
  if (fromEnv) return fromEnv;
  return join(homedir(), '.iapeer', 'peers-profiles.json');
}
41
+
42
/**
 * The host index entry for a personality, or null.
 *
 * Real index shape: { version, peers: [ { personality, cwd, ..., interfaces } ] }.
 * A map keyed by personality (under `peers`, or as the whole document) is also
 * accepted defensively.
 *
 * Any read or parse failure yields null. Map lookups are restricted to OWN
 * properties, so a personality such as "constructor" or "toString" can never
 * resolve to something inherited from Object.prototype, and a non-object value
 * stored under the key (e.g. `version: 1`) is treated as "no entry".
 * @param {string} personality
 * @returns {object|null}
 */
function indexEntry(personality) {
  let registry;
  try {
    registry = JSON.parse(readFileSync(registryPath(), 'utf8'));
  } catch {
    // Missing, unreadable or unparseable index: there is simply no entry.
    return null;
  }
  if (!registry || typeof registry !== 'object') return null;

  const peers = registry.peers != null ? registry.peers : registry;
  if (Array.isArray(peers)) {
    const match = peers.find(
      (p) => p && typeof p === 'object' && p.personality === personality,
    );
    return match || null;
  }
  if (!peers || typeof peers !== 'object') return null;

  // Own-property check: never hand back prototype members as a peer entry.
  if (!Object.prototype.hasOwnProperty.call(peers, personality)) return null;
  const entry = peers[personality];
  return entry && typeof entry === 'object' ? entry : null;
}
60
+
61
/**
 * Pull `interfaces.voice` out of a profile or index entry.
 * @param {object|null|undefined} entry
 * @returns {object|null} the voice value when it is a truthy object, else null
 */
function voiceOf(entry) {
  const voice = entry?.interfaces?.voice;
  if (!voice || typeof voice !== 'object') return null;
  return voice;
}
65
+
66
/**
 * The peer profile found at the PROCESS's own working directory, or null.
 *
 * Reads `<process.cwd()>/.iapeer/peer-profile.json`. Per the original notes,
 * both the claude and the codex MCP launchers start this server with cwd set
 * to the calling peer's directory, which is why this file is treated as the
 * caller's source-of-truth profile.
 *
 * Any failure (missing file, unreadable, invalid JSON) and any parsed value
 * that is not a truthy object yield null.
 * @returns {object|null}
 */
function cwdProfile() {
  try {
    const profilePath = join(process.cwd(), '.iapeer', 'peer-profile.json');
    const parsed = JSON.parse(readFileSync(profilePath, 'utf8'));
    if (parsed && typeof parsed === 'object') return parsed;
    return null;
  } catch {
    return null;
  }
}
85
+
86
/**
 * The calling peer's personality, or null.
 *
 * Ladder: a non-blank PEER_PERSONALITY env value (trimmed) first, then the
 * trimmed `personality` string of the cwd profile. Per the original notes the
 * env var identifies the caller in claude sessions, while the codex MCP
 * launcher does not propagate parent env to MCP children, so there the
 * identity is recovered from the cwd profile instead.
 * @returns {string|null}
 */
export function callerPersonality() {
  const fromEnv = (process.env.PEER_PERSONALITY || '').trim();
  if (fromEnv) return fromEnv;

  const profile = cwdProfile();
  if (!profile || typeof profile.personality !== 'string') return null;
  return profile.personality.trim() || null;
}
104
+
105
/**
 * Read `interfaces.voice` from the per-cwd profile under `cwd`.
 *
 * available=false means the profile could not be used (no cwd given, file
 * missing / empty / unparseable, or not an object) and the caller should fall
 * back to the index. available=true means the source of truth was read; the
 * voice may then be {} ("no voice set"), which is authoritative.
 * @param {string} [cwd] directory holding `.iapeer/peer-profile.json`
 * @returns {{available: boolean, voice: Record<string,string>}}
 */
function perCwdVoice(cwd) {
  const unavailable = () => ({ available: false, voice: {} });
  if (!cwd) return unavailable();

  let parsed;
  try {
    const raw = readFileSync(join(cwd, '.iapeer', 'peer-profile.json'), 'utf8');
    parsed = JSON.parse(raw);
  } catch {
    return unavailable();
  }
  if (!parsed || typeof parsed !== 'object') return unavailable();

  const voice = voiceOf(parsed) || {};
  return { available: true, voice };
}
123
+
124
/**
 * The calling peer's `interfaces.voice` map (model-name → voice), or {}.
 *
 * The identity is normalised the same way callerPersonality() does it: a
 * string is trimmed, and a blank / whitespace-only value counts as "no
 * identity". Without that, a padded PEER_PERSONALITY would miss its index
 * entry and a whitespace-only one would skip the cwd-profile path entirely.
 *
 * Resolution:
 *   - No identity (e.g. codex MCP children, where parent env is not
 *     propagated): read the voice straight from the cwd profile — no index
 *     hop. cwd not a peer dir → {} (built-in defaults).
 *   - Otherwise: look the identity up in the host index, read the per-cwd
 *     profile at the entry's `cwd` (source of truth); only when that profile is
 *     unavailable fall back to the index entry's own voice, else {}.
 * @param {string} [personality] override the env-derived caller identity (tests)
 * @returns {Record<string,string>}
 */
export function callerVoiceMap(personality = process.env.PEER_PERSONALITY) {
  const identity = typeof personality === 'string' ? personality.trim() : personality;

  if (!identity) {
    const own = cwdProfile();
    return own ? (voiceOf(own) || {}) : {};
  }

  const entry = indexEntry(identity);

  // 1. Source of truth: the per-cwd profile (cwd looked up from the index).
  const perCwd = perCwdVoice(entry && entry.cwd);
  if (perCwd.available) return perCwd.voice;

  // 2. Defensive: per-cwd unavailable → host index entry's voice.
  return voiceOf(entry) || {};
}
147
+
148
/**
 * Resolve the voice for a model, applying the override/profile/default ladder.
 *
 * The profile rung only accepts an OWN property of `voiceMap` whose value is a
 * non-blank string (returned trimmed). Profiles are hand-filled, so a number,
 * an object, a padded string or a model name that happens to match an
 * Object.prototype member must fall through to the default rather than be
 * passed on as a voice.
 * @param {object} a
 * @param {string} a.modelName full model name (the interfaces.voice key)
 * @param {string} a.def built-in default voice for this model
 * @param {boolean} a.applyOverride whether the explicit override applies here
 *   (true only for the call's PRIMARY engine — an override must not leak
 *   across a fallback)
 * @param {string} [a.override] explicit voice argument
 * @param {Record<string,string>} a.voiceMap caller's interfaces.voice
 * @returns {{voice: string, source: 'override'|'profile'|'default'}}
 */
export function resolveVoice({ modelName, def, applyOverride, override, voiceMap }) {
  if (applyOverride && override) return { voice: override, source: 'override' };

  const hasOwnEntry = Boolean(voiceMap)
    && typeof voiceMap === 'object'
    && Object.prototype.hasOwnProperty.call(voiceMap, modelName);
  const candidate = hasOwnEntry ? voiceMap[modelName] : undefined;
  const fromProfile = typeof candidate === 'string' ? candidate.trim() : '';
  if (fromProfile) return { voice: fromProfile, source: 'profile' };

  return { voice: def, source: 'default' };
}
@@ -0,0 +1,210 @@
1
+ /**
2
+ * TTS providers — the four engines behind one uniform contract, plus the
3
+ * declarative cascade table that the router runs.
4
+ *
5
+ * A provider is data + one method:
6
+ * {
7
+ * name, model, defaultVoice,
8
+ * capabilities: { style, langs, streaming }, // first-class, declared up front
9
+ * isFallbackError?(err): boolean, // known can't-serve → advance cascade
10
+ * async synthesize(ctx): { pcm, voice, lang?, finishReason? },
11
+ * }
12
+ * ctx carries { text, style, personality, tmpDir, voice?, lang? }. Each provider
13
+ * encapsulates its own quirk: Gemini mints the per-peer F5 ref on success, F5
14
+ * loads that ref and decodes wav→pcm, Supertonic decodes wav→pcm. Adding an
15
+ * engine = adding one object here; the router and voice.mjs stay untouched.
16
+ *
17
+ * Idea lineage (voice-gateway research): base class with one synth method
18
+ * (pipecat/livekit) + Capabilities as a first-class object (livekit).
19
+ */
20
+ import { join } from 'node:path';
21
+ import {
22
+ geminiTTS,
23
+ GeminiQuotaError,
24
+ GeminiNoKeyError,
25
+ GeminiTruncatedError,
26
+ GeminiUnavailableError,
27
+ DEFAULT_GEMINI_VOICE,
28
+ GEMINI_MODEL,
29
+ } from './engines/gemini.mjs';
30
+ import {
31
+ gptAudioTTS,
32
+ GptAudioQuotaError,
33
+ GptAudioNoKeyError,
34
+ GptAudioUnavailableError,
35
+ DEFAULT_GPTAUDIO_VOICE,
36
+ GPTAUDIO_MODEL,
37
+ } from './engines/gptaudio.mjs';
38
+ import { f5TTS, F5UnavailableError, DEFAULT_F5_VOICE, F5_MODEL } from './engines/f5.mjs';
39
+ import {
40
+ supertonicTTS,
41
+ DEFAULT_SUPERTONIC_VOICE,
42
+ SUPERTONIC_MODEL,
43
+ } from './engines/supertonic.mjs';
44
+ import { decodeToPcmBuffer } from './audio.mjs';
45
+ import { saveRef, loadRef } from './ref.mjs';
46
+ import { speachesSTT, SpeachesUnavailableError, sttEndpoint } from './engines/speaches.mjs';
47
+ import { mlxWhisperSTT } from './engines/mlxwhisper.mjs';
48
+
49
/**
 * Gemini 3.1 Flash TTS — cloud primary, ru+en in one pass, supports style.
 *
 * synthesize(ctx, deps?) runs a single TTS pass and then opportunistically
 * hands the produced PCM to the ref store so F5 can later clone this peer's
 * voice. `deps` is a test seam ({ tts, saveRef }); the defaults are the real
 * modules, so the router's single-argument call is unchanged.
 */
export const gemini = {
  name: 'gemini',
  model: GEMINI_MODEL,
  defaultVoice: DEFAULT_GEMINI_VOICE,
  capabilities: { style: true, langs: 'any', streaming: false },
  // Known can't-serve conditions: these advance the cascade to the next rung.
  isFallbackError: (e) =>
    e instanceof GeminiQuotaError ||
    e instanceof GeminiNoKeyError ||
    e instanceof GeminiTruncatedError ||
    e instanceof GeminiUnavailableError,
  /**
   * @param {{text: string, voice: string, style?: string, personality?: string}} ctx
   * @param {{tts?: Function, saveRef?: Function}} [deps] injectable engine / ref store
   * @returns {Promise<{pcm: *, voice: string, finishReason: *}>}
   */
  async synthesize({ text, voice, style, personality }, deps = {}) {
    const tts = deps.tts ?? geminiTTS;
    const save = deps.saveRef ?? saveRef;
    const { pcm, finishReason } = await tts(text, voice, style); // single pass

    // Gemini just succeeded — the one moment this peer's voice can be captured
    // for F5 (a ref can't be minted on the F5 path, where Gemini is down).
    // Strictly best-effort: try/await/catch (rather than `.catch` on the return
    // value) also absorbs a synchronous throw or a non-promise result from the
    // ref store, so ref capture can never fail an otherwise successful synthesis.
    try {
      await save(personality, pcm, text, voice);
    } catch {
      // Intentionally ignored: a later successful synthesis retries the capture.
    }
    return { pcm, voice, finishReason };
  },
};
74
+
75
/**
 * gpt-audio over OpenRouter — cloud second rung, multilingual one pass, style.
 *
 * synthesize(ctx, deps?) forwards text/voice/style to the engine and returns
 * its PCM together with the voice that was requested. `deps.tts` replaces the
 * real engine in tests.
 */
export const gptAudio = {
  name: 'gpt-audio',
  model: GPTAUDIO_MODEL,
  defaultVoice: DEFAULT_GPTAUDIO_VOICE,
  capabilities: { style: true, langs: 'any', streaming: false },
  // Known can't-serve conditions: these advance the cascade to the next rung.
  isFallbackError(err) {
    if (err instanceof GptAudioQuotaError) return true;
    if (err instanceof GptAudioNoKeyError) return true;
    return err instanceof GptAudioUnavailableError;
  },
  async synthesize({ text, voice, style }, deps = {}) {
    const engine = deps.tts ?? gptAudioTTS;
    const produced = await engine(text, voice, style);
    return { pcm: produced.pcm, voice };
  },
};
91
+
92
/**
 * F5-TTS — live-prosody Russian rung with per-peer voice cloning from a ref.
 *
 * synthesize(ctx, deps?) loads the peer's cached ref (never generates one),
 * asks the engine to write `<tmpDir>/f5.wav` in Russian, decodes that file to
 * PCM and reports the voice as the personality when a ref was used, otherwise
 * as the F5 default voice. `deps` ({ tts, loadRef, decode }) is a test seam.
 */
export const f5 = {
  name: 'f5',
  model: F5_MODEL,
  defaultVoice: DEFAULT_F5_VOICE,
  capabilities: { style: false, langs: ['ru'], streaming: false },
  isFallbackError(err) {
    return err instanceof F5UnavailableError;
  },
  async synthesize({ text, personality, tmpDir }, deps = {}) {
    const engine = deps.tts ?? f5TTS;
    const fetchRef = deps.loadRef ?? loadRef;
    const toPcm = deps.decode ?? decodeToPcmBuffer;

    // null when the peer has no ref yet → the engine falls back to its default voice.
    const ref = await fetchRef(personality);
    const wavPath = join(tmpDir, 'f5.wav');
    await engine(text, 'ru', wavPath, ref); // F5 reads ruaccent '+' natively
    const pcm = await toPcm(wavPath);

    const spokenAs = ref ? personality : DEFAULT_F5_VOICE;
    return { pcm, voice: spokenAs, lang: 'ru' };
  },
};
112
+
113
/**
 * Supertonic 3 — local, offline floor. One pass in the given language.
 *
 * There is deliberately no isFallbackError: this is the last rung, so any
 * error raised here propagates as a real failure. synthesize(ctx, deps?) has
 * the engine write `<tmpDir>/st.wav` and returns the decoded PCM; `deps`
 * ({ tts, decode }) is a test seam.
 */
export const supertonic = {
  name: 'supertonic',
  model: SUPERTONIC_MODEL,
  defaultVoice: DEFAULT_SUPERTONIC_VOICE,
  capabilities: { style: false, langs: ['ru', 'en', 'na'], streaming: false },
  async synthesize({ text, voice, lang, tmpDir }, deps = {}) {
    const engine = deps.tts ?? supertonicTTS;
    const toPcm = deps.decode ?? decodeToPcmBuffer;
    const wavPath = join(tmpDir, 'st.wav');
    await engine(text, lang, wavPath, voice);
    return { pcm: await toPcm(wavPath), voice, lang };
  },
};
129
+
130
/** All TTS providers, keyed by their export name. */
export const ttsProviders = { gemini, gptAudio, f5, supertonic };

/**
 * The provider for a forced `engine` value, or null for 'auto'.
 *
 * Only 'gemini', 'gpt-audio' and 'supertonic' are recognised here; every other
 * value yields null.
 * @param {string} engine
 * @returns {object|null}
 */
export function ttsProviderByEngine(engine) {
  switch (engine) {
    case 'gemini':
      return gemini;
    case 'gpt-audio':
      return gptAudio;
    case 'supertonic':
      return supertonic;
    default:
      return null;
  }
}
139
+
140
/**
 * The declarative `auto` cascade, in priority order: gemini → gpt-audio → f5 →
 * supertonic. Voices are baked into each entry. F5 is applicable ONLY on the
 * ru route (its English is weak, per the original notes) and always runs with
 * lang 'ru'; Supertonic is the floor and runs in the routed language.
 *
 * Each entry's run() overrides the context's `voice` and `lang` with the
 * entry's own values, including `undefined` where the entry sets none.
 * Returns entries shaped for runCascade().
 * @param {{route:'ru'|'en'|'na', gemVoice:string, gaVoice:string, stVoice:string}} a
 * @returns {Array<{name: string, applicable?: Function, isFallbackError?: Function, run: Function}>}
 */
export function buildTtsCascade({ route, gemVoice, gaVoice, stVoice }) {
  const toEntry = (provider, { voice, lang, applicable }) => {
    const run = (ctx) => provider.synthesize({ ...ctx, voice, lang });
    return {
      name: provider.name,
      applicable,
      isFallbackError: provider.isFallbackError,
      run,
    };
  };
  const russianOnly = () => route === 'ru';

  const cascade = [];
  cascade.push(toEntry(gemini, { voice: gemVoice }));
  cascade.push(toEntry(gptAudio, { voice: gaVoice }));
  cascade.push(toEntry(f5, { lang: 'ru', applicable: russianOnly }));
  cascade.push(toEntry(supertonic, { voice: stVoice, lang: route }));
  return cascade;
}
160
+
161
+ // ── STT providers ───────────────────────────────────────────────────────────
162
+ // Mirror of the TTS side: uniform contract, declarative cascade. speaches (the
163
+ // OpenAI-compatible HTTP service telegram-runtime already uses) is the primary
164
+ // rung but applies ONLY when an endpoint is configured; mlx-whisper is the local
165
+ // floor. Adding spokenly / Gemini-Flash STT later = one more object here.
166
+
167
/**
 * speaches (faster-whisper, OpenAI /v1/audio/transcriptions) — primary STT.
 *
 * Applicable only while sttEndpoint() returns something truthy, so the rung is
 * skipped entirely when no endpoint is set. transcribe(ctx, deps?) returns just
 * the engine's `text`; `deps.stt` replaces the real engine in tests.
 */
export const speaches = {
  name: 'speaches',
  capabilities: { streaming: false, languages: 'any' },
  isFallbackError(err) {
    return err instanceof SpeachesUnavailableError;
  },
  applicable: () => !!sttEndpoint(),
  async transcribe({ audioPath, lang, prompt }, deps = {}) {
    const engine = deps.stt ?? speachesSTT;
    const outcome = await engine(audioPath, { lang, prompt });
    return { text: outcome.text };
  },
};
179
+
180
/**
 * mlx_whisper CLI — local, offline STT floor.
 *
 * No isFallbackError: this is the last rung, so any error raised here
 * propagates as a real failure. transcribe(ctx, deps?) returns just the
 * engine's `text`; `deps.stt` replaces the real engine in tests.
 */
export const mlxWhisper = {
  name: 'mlx-whisper',
  capabilities: { streaming: false, languages: 'any' },
  async transcribe({ audioPath, lang, prompt }, deps = {}) {
    const engine = deps.stt ?? mlxWhisperSTT;
    const outcome = await engine(audioPath, { lang, prompt });
    return { text: outcome.text };
  },
};
191
+
192
/** All STT providers, keyed by their export name. */
export const sttProviders = { speaches, mlxWhisper };

/**
 * The provider for a forced STT `engine`, or null for 'auto'.
 *
 * Recognised values are 'speaches' and 'mlx-whisper'; anything else yields null.
 * @param {string} engine
 * @returns {object|null}
 */
export function sttProviderByEngine(engine) {
  switch (engine) {
    case 'speaches':
      return speaches;
    case 'mlx-whisper':
      return mlxWhisper;
    default:
      return null;
  }
}
200
+
201
/**
 * The declarative STT cascade: speaches (if configured) → mlx-whisper floor.
 *
 * Each entry carries the provider's own `applicable` and `isFallbackError`
 * (either may be undefined) and a run() that forwards the context to the
 * provider's transcribe(). Returns entries shaped for runCascade().
 * @returns {Array<{name: string, applicable?: Function, isFallbackError?: Function, run: Function}>}
 */
export function buildSttCascade() {
  return [speaches, mlxWhisper].map((provider) => {
    const run = (ctx) => provider.transcribe(ctx);
    return {
      name: provider.name,
      applicable: provider.applicable,
      isFallbackError: provider.isFallbackError,
      run,
    };
  });
}
package/src/ref.mjs ADDED
@@ -0,0 +1,157 @@
1
+ /**
2
+ * Per-peer F5 voice reference — voice-as-identity on the F5 ladder rung (6b).
3
+ *
4
+ * F5-TTS clones the voice of a reference sample: POST .../synthesize accepts
5
+ * {ref_audio_b64, ref_text} and speaks the request in THAT voice. So peer X
6
+ * sounds like itself on an F5 overflow if we hand F5 a sample of X's voice.
7
+ *
8
+ * The catch: a ref of X's voice can only be MINTED while Gemini works (Gemini is
9
+ * X's primary voice), but F5 runs precisely when Gemini is exhausted. So the ref
10
+ * is NOT generated on the F5 path. Instead it is REGISTERED opportunistically on
11
+ * a successful Gemini synthesis (saveRef), reusing that real Gemini output — the
12
+ * canonical PCM, before ogg encoding — as the sample, with the spoken text as
13
+ * its transcript. Zero extra Gemini calls. The F5 path only ever loads + sends
14
+ * (loadRef); if no ref exists yet (peer never spoke via Gemini), F5 uses its
15
+ * default voice — a self-healing edge case that resolves after X's first synth.
16
+ *
17
+ * Mutability: the ref is keyed to the peer's Gemini voice. saveRef stamps the
18
+ * voice into the meta; when the peer's profile voice later differs, the ref is
19
+ * STALE and the next well-sized Gemini output refreshes it automatically — so
20
+ * changing a peer's voice propagates to its F5 voice with no manual step. A
21
+ * manual reset is just deleting the ref files.
22
+ *
23
+ * Cache (per peer X), under $PEER_VOICE_HOME/refs/ (default ~/.iapeer/cache/peer-voice):
24
+ * X.wav — the voice sample (RIFF wav, from the Gemini PCM)
25
+ * X.json — { gemini_voice, ref_text, registered_at }
26
+ */
27
+ import { mkdir, readFile, writeFile, access } from 'node:fs/promises';
28
+ import { constants as FS } from 'node:fs';
29
+ import { join } from 'node:path';
30
+ import { peerVoiceHome } from './home.mjs';
31
+ import {
32
+ encodePcmToWav,
33
+ SAMPLE_RATE,
34
+ CHANNELS,
35
+ BYTES_PER_SAMPLE,
36
+ } from './audio.mjs';
37
+
38
const BYTES_PER_SEC = SAMPLE_RATE * CHANNELS * BYTES_PER_SAMPLE; // 48000 B/s

/**
 * Read a duration in seconds from the environment.
 *
 * Unset, blank, non-numeric, non-finite or negative values fall back to the
 * default. Without this guard a typo such as PEER_VOICE_REF_MIN_SEC=abc becomes
 * NaN, and every `<` / `>` comparison against NaN is false — which would
 * silently switch the duration window off instead of keeping the default.
 * @param {string} name environment variable name
 * @param {number} fallback value used when the variable is absent or invalid
 * @returns {number}
 */
function envSeconds(name, fallback) {
  const raw = process.env[name];
  if (raw == null || !String(raw).trim()) return fallback;
  const parsed = Number(raw);
  return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallback;
}

// A good voice clone wants a few seconds of clean speech: too short underdefines
// the timbre, too long bloats the request and risks artefacts. We register the
// first suitable Gemini output and skip the rest. Env-tunable.
const MIN_SEC = envSeconds('PEER_VOICE_REF_MIN_SEC', 4);
const MAX_SEC = envSeconds('PEER_VOICE_REF_MAX_SEC', 15);

// IAP personalities are lowercase slugs; keep the cache filename to that shape so
// a stray/odd identity can never escape the refs dir.
const PERSONALITY_RE = /^[a-z][a-z0-9-]{0,63}$/;
49
+
50
/** Directory holding every cached ref: `<peerVoiceHome()>/refs`. */
function refsDir() {
  const home = peerVoiceHome();
  return join(home, 'refs');
}
53
/**
 * Paths of a peer's cached sample and its metadata inside the refs directory.
 * @param {string} personality
 * @returns {{wav: string, meta: string}}
 */
function refPaths(personality) {
  const stem = join(refsDir(), personality);
  const wav = `${stem}.wav`;
  const meta = `${stem}.json`;
  return { wav, meta };
}
57
/**
 * Whether `personality` is a string matching PERSONALITY_RE.
 * @param {*} personality
 * @returns {boolean}
 */
function validPersonality(personality) {
  if (typeof personality !== 'string') return false;
  return PERSONALITY_RE.test(personality);
}
60
/**
 * Whether `p` is accessible on disk (F_OK check); any access error counts as
 * "does not exist".
 * @param {string} p
 * @returns {Promise<boolean>}
 */
async function exists(p) {
  let present = true;
  try {
    await access(p, FS.F_OK);
  } catch {
    present = false;
  }
  return present;
}
63
/** Write one diagnostic line, prefixed with the module tag, to stderr. */
function log(msg) {
  const line = `[peer-voice/ref] ${msg}\n`;
  process.stderr.write(line);
}
64
+
65
/**
 * Duration in seconds of a canonical-PCM buffer: its byte length divided by
 * BYTES_PER_SEC. A missing or empty buffer lasts 0 seconds.
 * @param {{length: number}|null|undefined} pcmBuffer
 * @returns {number}
 */
export function pcmDurationSec(pcmBuffer) {
  const byteCount = pcmBuffer ? pcmBuffer.length : 0;
  return byteCount ? byteCount / BYTES_PER_SEC : 0;
}
70
+
71
/**
 * Parsed meta for peer X ({ gemini_voice, ref_text, registered_at }), or null.
 *
 * null covers: an invalid personality, a missing / unreadable / unparseable
 * meta file, and parsed JSON that is not a truthy object.
 * @param {string} personality
 * @returns {Promise<object|null>}
 */
export async function refMeta(personality) {
  if (!validPersonality(personality)) return null;

  const metaPath = refPaths(personality).meta;
  let parsed;
  try {
    const raw = await readFile(metaPath, 'utf8');
    parsed = JSON.parse(raw);
  } catch {
    return null;
  }
  if (parsed && typeof parsed === 'object') return parsed;
  return null;
}
82
+
83
/**
 * Whether peer X has a usable cached ref — both the sample and its meta file
 * must exist. The meta file is only probed when the sample is present.
 * @param {string} personality
 * @returns {Promise<boolean>}
 */
export async function hasRef(personality) {
  if (!validPersonality(personality)) return false;

  const { wav, meta } = refPaths(personality);
  const sampleThere = await exists(wav);
  if (!sampleThere) return false;
  return exists(meta);
}
89
+
90
/**
 * Opportunistically register/refresh peer X's F5 ref from a successful Gemini
 * synthesis. MUST be called only on the Gemini-success path (never on F5).
 *
 * Nothing is written — and the encoder is never invoked — when: the
 * personality is odd ('bad-personality'), the text is blank ('empty-text'), no
 * Gemini voice is given ('no-voice'), a ref for the SAME voice already exists
 * ('exists-current'), or the audio is outside the [MIN_SEC, MAX_SEC] window
 * ('duration-out-of-window' — wait for a better-sized one). A stored ref that
 * was minted for a DIFFERENT Gemini voice is stale and gets refreshed.
 * Best-effort: write failures are logged and reported as 'write-error'.
 * @param {string} personality
 * @param {Buffer} pcmBuffer canonical PCM (s16le 24 kHz mono), BEFORE ogg encode
 * @param {string} text the text Gemini spoke — becomes ref_text (transcript)
 * @param {string} geminiVoice the peer's Gemini voice this sample was spoken in
 * @returns {Promise<{saved: boolean, refreshed?: boolean, reason?: string, seconds?: number}>}
 */
export async function saveRef(personality, pcmBuffer, text, geminiVoice) {
  const skipped = (reason, extra = {}) => ({ saved: false, reason, ...extra });
  const isBlank = (value) => !value || !String(value).trim();

  if (!validPersonality(personality)) return skipped('bad-personality');
  if (isBlank(text)) return skipped('empty-text');
  if (isBlank(geminiVoice)) return skipped('no-voice');

  // Registered once per (peer, voice): same voice → current, different → stale.
  const existing = await refMeta(personality);
  const stale = existing && existing.gemini_voice !== geminiVoice;
  if (existing && !stale) return skipped('exists-current');

  const seconds = pcmDurationSec(pcmBuffer);
  const outsideWindow = seconds < MIN_SEC || seconds > MAX_SEC;
  if (outsideWindow) return skipped('duration-out-of-window', { seconds });

  const paths = refPaths(personality);
  try {
    await mkdir(refsDir(), { recursive: true });
    await encodePcmToWav(pcmBuffer, paths.wav);
    const record = {
      gemini_voice: geminiVoice,
      ref_text: String(text),
      registered_at: new Date().toISOString(),
    };
    await writeFile(paths.meta, JSON.stringify(record, null, 2), 'utf8');
    const verb = stale ? 'refreshed' : 'registered';
    log(`${verb} ref for "${personality}" (voice ${geminiVoice}, ${seconds.toFixed(1)}s).`);
    return { saved: true, refreshed: !!stale, seconds };
  } catch (e) {
    log(`saveRef failed for "${personality}" (${e.message}); will retry on a later synthesis.`);
    return skipped('write-error');
  }
}
137
+
138
/**
 * Load peer X's cached F5 ref, or null when there is none / it is unreadable.
 * Never generates — generation is saveRef's job, on the Gemini path.
 *
 * A ref whose transcript is blank, or whose sample encodes to an empty string,
 * is treated as absent. Read failures are logged and reported as null.
 * @param {string} personality
 * @returns {Promise<{personality: string, audioB64: string, text: string}|null>}
 */
export async function loadRef(personality) {
  const cached = await hasRef(personality);
  if (!cached) return null;

  const samplePath = refPaths(personality).wav;
  try {
    const meta = await refMeta(personality);
    const text = meta?.ref_text ? String(meta.ref_text).trim() : '';
    const sample = await readFile(samplePath);
    const audioB64 = sample.toString('base64');
    if (audioB64 && text) return { personality, audioB64, text };
    return null;
  } catch (e) {
    log(`loadRef failed for "${personality}" (${e.message}); using default voice.`);
    return null;
  }
}
package/src/router.mjs ADDED
@@ -0,0 +1,91 @@
1
+ /**
2
+ * Declarative provider router — the formalized engine cascade.
3
+ *
4
+ * Before this, voice.mjs hard-coded the fallback ladder as nested try/catch.
5
+ * The router turns that into data: an ordered list of entries, each with an
6
+ * `applicable` guard and an `isFallbackError` predicate. The cascade runs them
7
+ * in order, advancing to the next entry ONLY on a known "can't-serve" error;
8
+ * any other error propagates so real bugs surface instead of being masked.
9
+ *
10
+ * Idea lineage (see the voice-gateway research): a declarative table with
11
+ * priority + fallback + cooldown (litellm Router). Cooldown is implemented but
12
+ * INERT by default (cooldownSec = 0) — Phase 1 is a behavior-preserving refactor;
13
+ * flipping cooldown on is a later, deliberate config change.
14
+ *
15
+ * Modality-agnostic: it runs TTS and STT cascades alike. An entry is:
16
+ * {
17
+ * name: string, // 'gemini' | 'speaches' | ...
18
+ * applicable?: (ctx) => boolean, // skip this entry when false (e.g. F5 only for ru)
19
+ * isFallbackError?: (err) => boolean, // true → advance the cascade; false/absent → propagate
20
+ * run: (ctx) => Promise<object>, // do the work (synthesize / transcribe / …)
21
+ * }
22
+ *
23
+ * Returns the successful entry's result, augmented with:
24
+ * - name: the winning entry's name
25
+ * - fallbackFrom: 'gemini:GeminiQuotaError→gpt-audio:GptAudioNoKeyError' when the
26
+ * cascade fell through failures first, or undefined if the first
27
+ * applicable entry succeeded.
28
+ */
29
+
30
/**
 * Run an ordered cascade of entries and return the first success.
 *
 * For each entry, in order: skip it when its `applicable(ctx)` guard returns
 * false, skip it when cooldown is active and it is still cooling down, then
 * await `run(ctx)`. A failure advances to the next entry ONLY when the entry's
 * `isFallbackError(err)` says so; any other error is rethrown immediately so
 * real bugs are not masked by a fallback.
 *
 * When no entry succeeds: the last fallback error is rethrown if there was
 * one; otherwise a generic Error reports that everything was in cooldown or
 * that nothing was applicable.
 * @param {Array} entries ordered cascade entries (highest priority first)
 * @param {object} ctx context passed to each entry's run/applicable
 * @param {object} [opts]
 * @param {number} [opts.cooldownSec=0] seconds a failed entry is skipped (0 = inert)
 * @param {Map} [opts.cooldownStore] name → epoch-ms-until-usable (persist across calls to make cooldown stick)
 * @param {() => number} [opts.now] clock (injectable for tests)
 * @param {(name: string, err: Error) => void} [opts.onAdvance] notified each time the cascade falls through an entry
 * @returns {Promise<object>} the winning result + { name, fallbackFrom }
 */
export async function runCascade(entries, ctx, opts = {}) {
  const {
    cooldownSec = 0,
    cooldownStore = new Map(),
    now = () => Date.now(),
    onAdvance,
  } = opts;
  const cooldownActive = cooldownSec > 0;

  const failures = []; // "name:ErrorName" for every entry the cascade fell through
  let lastFallbackError;
  let sawCooldownSkip = false;

  for (const entry of entries) {
    if (entry.applicable && !entry.applicable(ctx)) continue;

    if (cooldownActive) {
      const usableAt = cooldownStore.get(entry.name);
      if (usableAt && usableAt > now()) {
        sawCooldownSkip = true;
        continue;
      }
    }

    try {
      const result = await entry.run(ctx);
      const fallbackFrom = failures.length ? failures.join('→') : undefined;
      return { ...result, name: entry.name, fallbackFrom };
    } catch (err) {
      // Only KNOWN can't-serve errors advance the cascade; everything else surfaces.
      const canAdvance = entry.isFallbackError && entry.isFallbackError(err);
      if (!canAdvance) throw err;

      failures.push(`${entry.name}:${err.name}`);
      if (cooldownActive) cooldownStore.set(entry.name, now() + cooldownSec * 1000);
      if (onAdvance) onAdvance(entry.name, err);
      lastFallbackError = err;
    }
  }

  // No applicable entry produced a result.
  if (lastFallbackError) throw lastFallbackError;
  const reason = sawCooldownSkip
    ? 'voice router: every applicable provider is in cooldown'
    : 'voice router: no applicable provider for this request';
  throw new Error(reason);
}