@agfpd/voice-connect 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +72 -0
- package/bin/peer-voice-http.mjs +20 -0
- package/bin/peer-voice-mcp.mjs +18 -0
- package/package.json +61 -0
- package/src/apikey.mjs +61 -0
- package/src/audio.mjs +112 -0
- package/src/config.mjs +75 -0
- package/src/configfile.mjs +38 -0
- package/src/engines/f5.mjs +111 -0
- package/src/engines/gemini.mjs +199 -0
- package/src/engines/gptaudio.mjs +230 -0
- package/src/engines/mlxwhisper.mjs +70 -0
- package/src/engines/speaches.mjs +69 -0
- package/src/engines/supertonic.mjs +177 -0
- package/src/home.mjs +15 -0
- package/src/http.mjs +252 -0
- package/src/jobs.mjs +95 -0
- package/src/langsplit.mjs +129 -0
- package/src/profile.mjs +165 -0
- package/src/providers.mjs +210 -0
- package/src/ref.mjs +157 -0
- package/src/router.mjs +91 -0
- package/src/ruaccent.mjs +114 -0
- package/src/ruaccent_stress.py +66 -0
- package/src/server.mjs +278 -0
- package/src/stress.mjs +25 -0
- package/src/stt.mjs +48 -0
- package/src/synthlog.mjs +46 -0
- package/src/voice.mjs +201 -0
- package/src/worker.mjs +120 -0
package/src/profile.mjs
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Caller voice resolution from the IAPeer profiles.
|
|
3
|
+
*
|
|
4
|
+
* Voice is an agent INTERFACE (like its telegram bot), so it lives in the
|
|
5
|
+
* caller's peer profile under `interfaces.voice`, keyed by the FULL model name
|
|
6
|
+
* as used by the API — self-documenting and robust to model swaps:
|
|
7
|
+
*
|
|
8
|
+
* interfaces.voice = {
|
|
9
|
+
* "gemini-3.1-flash-tts-preview": "<voice>",
|
|
10
|
+
* "supertonic-3": "<voice>"
|
|
11
|
+
* }
|
|
12
|
+
*
|
|
13
|
+
* SOURCE OF TRUTH is the per-cwd profile `<cwd>/.iapeer/peer-profile.json` — the
|
|
14
|
+
* IAP code is explicit that the host index ~/.iapeer/peers-profiles.json is a
|
|
15
|
+
* DERIVED artifact regenerated from per-cwd profiles on connect/upsert, so it
|
|
16
|
+
* can lag. Voice is therefore read from the per-cwd profile; the host index is
|
|
17
|
+
* used only to look up the caller's stable `cwd` (the peer env carries
|
|
18
|
+
* PEER_PERSONALITY but not PEER_CWD).
|
|
19
|
+
*
|
|
20
|
+
* Resolution of the voice map:
|
|
21
|
+
* 1. PEER_PERSONALITY → host index entry → its `cwd`.
|
|
22
|
+
* 2. Read `<cwd>/.iapeer/peer-profile.json` (source of truth). If it is
|
|
23
|
+
* readable+parseable, its `interfaces.voice` (or {} when the field is
|
|
24
|
+
* absent — "no voice set") is authoritative; we do NOT fall back to the
|
|
25
|
+
* index, which could carry a stale value.
|
|
26
|
+
* 3. Only if the per-cwd profile is UNAVAILABLE (missing / empty / unparseable)
|
|
27
|
+
* do we defensively fall back to the host index entry's `interfaces.voice`.
|
|
28
|
+
* 4. Otherwise {} — the caller uses the built-in defaults.
|
|
29
|
+
*
|
|
30
|
+
* Reading the field is decoupled from populating it (the operator fills profiles
|
|
31
|
+
* separately).
|
|
32
|
+
*/
|
|
33
|
+
import { readFileSync } from 'node:fs';
|
|
34
|
+
import { homedir } from 'node:os';
|
|
35
|
+
import { join } from 'node:path';
|
|
36
|
+
|
|
37
|
+
/**
 * Path of the host-wide peer index file.
 * A non-empty PEER_VOICE_PEERS_PROFILES env var wins; otherwise the path is
 * `~/.iapeer/peers-profiles.json` under the current user's home directory.
 * @returns {string}
 */
function registryPath() {
  const override = process.env.PEER_VOICE_PEERS_PROFILES;
  if (override) return override;
  return join(homedir(), '.iapeer', 'peers-profiles.json');
}
|
|
41
|
+
|
|
42
|
+
/**
 * Look up a personality's entry in the host index file.
 *
 * Real index shape: { version, peers: [ { personality, cwd, ..., interfaces } ] }.
 * A bare top-level array of peers, and a map keyed by personality (at the top
 * level or under `peers`), are also accepted defensively.
 *
 * @param {string} personality
 * @returns {object|null} the matching entry, or null when the index file is
 *   unreadable / unparseable / not an object, or holds no entry for this
 *   personality
 */
function indexEntry(personality) {
  let registry;
  try {
    registry = JSON.parse(readFileSync(registryPath(), 'utf8'));
  } catch {
    return null;
  }
  if (!registry || typeof registry !== 'object') return null;

  const peers = registry.peers != null ? registry.peers : registry;
  if (Array.isArray(peers)) {
    return peers.find((p) => p && p.personality === personality) || null;
  }
  if (!peers || typeof peers !== 'object') return null;

  // Map form: only OWN keys are entries. Without this guard a personality such
  // as "constructor" or "toString" would resolve to an Object.prototype member.
  if (!Object.prototype.hasOwnProperty.call(peers, personality)) return null;
  return peers[personality] || null;
}
|
|
60
|
+
|
|
61
|
+
/**
 * Extract `interfaces.voice` from a profile or index entry.
 * @param {object|null|undefined} entry
 * @returns {object|null} the voice value when it is a non-null object,
 *   otherwise null (entry missing, no `interfaces`, or `voice` not an object)
 */
function voiceOf(entry) {
  const interfaces = entry ? entry.interfaces : undefined;
  const voice = interfaces ? interfaces.voice : undefined;
  if (voice && typeof voice === 'object') return voice;
  return null;
}
|
|
65
|
+
|
|
66
|
+
/**
 * Read the peer profile located at the PROCESS's own working directory:
 * `<process.cwd()>/.iapeer/peer-profile.json`.
 *
 * Per the original author's note (verified live 2026-06-12), both the claude
 * and the codex MCP launchers start this server with cwd set to the calling
 * peer's directory (~/Peers/<peer>), which is why this file is taken to be the
 * caller's source-of-truth profile.
 *
 * @returns {object|null} the parsed profile, or null when the file is missing,
 *   unreadable, not valid JSON, or does not parse to an object
 */
function cwdProfile() {
  try {
    const file = join(process.cwd(), '.iapeer', 'peer-profile.json');
    const parsed = JSON.parse(readFileSync(file, 'utf8'));
    if (parsed && typeof parsed === 'object') return parsed;
    return null;
  } catch {
    // Any read/parse failure simply means "no profile here".
    return null;
  }
}
|
|
85
|
+
|
|
86
|
+
/**
 * Identify the calling peer.
 *
 * Ladder: the PEER_PERSONALITY env var (trimmed) when non-blank, else the
 * `personality` string of the cwd profile (trimmed). The env var is how claude
 * sessions identify the caller; per the original author's note (verified live
 * 2026-06-12) the codex MCP launcher does not propagate the parent env to MCP
 * children, so in codex sessions the identity is recovered from the cwd
 * profile instead.
 *
 * @returns {string|null} the personality, or null when neither source yields
 *   a non-blank string
 */
export function callerPersonality() {
  const fromEnv = (process.env.PEER_PERSONALITY || '').trim();
  if (fromEnv) return fromEnv;

  const profile = cwdProfile();
  if (!profile || typeof profile.personality !== 'string') return null;
  return profile.personality.trim() || null;
}
|
|
104
|
+
|
|
105
|
+
/**
 * Read `interfaces.voice` from the per-cwd profile `<cwd>/.iapeer/peer-profile.json`.
 *
 * @param {string} [cwd] the peer's directory; falsy → reported as unavailable
 * @returns {{available: boolean, voice: Record<string,string>}}
 *   available=false → no cwd given, or the profile file is missing /
 *   unreadable / unparseable / not an object (the caller should fall back to
 *   the index); available=true → the profile was read, and `voice` is its
 *   voice map or {} when none is set (which is authoritative).
 */
function perCwdVoice(cwd) {
  const unavailable = () => ({ available: false, voice: {} });
  if (!cwd) return unavailable();

  let parsed;
  try {
    const raw = readFileSync(join(cwd, '.iapeer', 'peer-profile.json'), 'utf8');
    parsed = JSON.parse(raw);
  } catch {
    return unavailable();
  }

  if (!parsed || typeof parsed !== 'object') return unavailable();
  return { available: true, voice: voiceOf(parsed) || {} };
}
|
|
123
|
+
|
|
124
|
+
/**
 * The calling peer's `interfaces.voice` map (model-name → voice), or {}.
 *
 * The identity is trimmed first, so a padded PEER_PERSONALITY resolves the same
 * peer that callerPersonality() reports, and a blank / whitespace-only value is
 * treated as "no identity" rather than as a personality that can never match.
 *
 * @param {string} [personality] override the env-derived caller identity (tests)
 * @returns {Record<string,string>}
 */
export function callerVoiceMap(personality = process.env.PEER_PERSONALITY) {
  const who = typeof personality === 'string' ? personality.trim() : personality;

  if (!who) {
    // No identity (e.g. codex MCP children, where the parent env is not
    // propagated): read the voice straight from the process-cwd profile — no
    // index hop, so personality and voice cannot disagree. When the cwd is not
    // a peer dir there is no profile → {} (built-in defaults).
    const own = cwdProfile();
    return own ? (voiceOf(own) || {}) : {};
  }

  const entry = indexEntry(who);

  // 1. Source of truth: the per-cwd profile (cwd looked up from the index).
  const perCwd = perCwdVoice(entry && entry.cwd);
  if (perCwd.available) return perCwd.voice;

  // 2. Defensive: per-cwd profile unavailable → the host index entry's voice.
  return voiceOf(entry) || {};
}
|
|
147
|
+
|
|
148
|
+
/**
 * Resolve the voice for a model, applying the override → profile → default ladder.
 *
 * A profile entry counts only when it is an OWN key of `voiceMap` holding a
 * non-blank string (returned trimmed). Anything else — a number or object from
 * a hand-edited profile, a blank string, or an inherited Object.prototype
 * member — is ignored and the built-in default is used instead.
 *
 * @param {object} a
 * @param {string} a.modelName     full model name (the interfaces.voice key)
 * @param {string} a.def           built-in default voice for this model
 * @param {boolean} a.applyOverride whether the explicit override applies here
 *                                  (true only for the call's PRIMARY engine —
 *                                  an override must not leak across a fallback)
 * @param {string} [a.override]    explicit voice argument
 * @param {Record<string,string>} a.voiceMap caller's interfaces.voice
 * @returns {{voice: string, source: 'override'|'profile'|'default'}}
 */
export function resolveVoice({ modelName, def, applyOverride, override, voiceMap }) {
  if (applyOverride && override) return { voice: override, source: 'override' };

  const hasEntry = Boolean(voiceMap)
    && typeof voiceMap === 'object'
    && Object.prototype.hasOwnProperty.call(voiceMap, modelName);
  const raw = hasEntry ? voiceMap[modelName] : undefined;
  const fromProfile = typeof raw === 'string' ? raw.trim() : '';
  if (fromProfile) return { voice: fromProfile, source: 'profile' };

  return { voice: def, source: 'default' };
}
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TTS providers — the four engines behind one uniform contract, plus the
|
|
3
|
+
* declarative cascade table that the router runs.
|
|
4
|
+
*
|
|
5
|
+
* A provider is data + one method:
|
|
6
|
+
* {
|
|
7
|
+
* name, model, defaultVoice,
|
|
8
|
+
* capabilities: { style, langs, streaming }, // first-class, declared up front
|
|
9
|
+
* isFallbackError?(err): boolean, // known can't-serve → advance cascade
|
|
10
|
+
* async synthesize(ctx): { pcm, voice, lang?, finishReason? },
|
|
11
|
+
* }
|
|
12
|
+
* ctx carries { text, style, personality, tmpDir, voice?, lang? }. Each provider
|
|
13
|
+
* encapsulates its own quirk: Gemini mints the per-peer F5 ref on success, F5
|
|
14
|
+
* loads that ref and decodes wav→pcm, Supertonic decodes wav→pcm. Adding an
|
|
15
|
+
* engine = adding one object here; the router and voice.mjs stay untouched.
|
|
16
|
+
*
|
|
17
|
+
* Idea lineage (voice-gateway research): base class with one synth method
|
|
18
|
+
* (pipecat/livekit) + Capabilities as a first-class object (livekit).
|
|
19
|
+
*/
|
|
20
|
+
import { join } from 'node:path';
|
|
21
|
+
import {
|
|
22
|
+
geminiTTS,
|
|
23
|
+
GeminiQuotaError,
|
|
24
|
+
GeminiNoKeyError,
|
|
25
|
+
GeminiTruncatedError,
|
|
26
|
+
GeminiUnavailableError,
|
|
27
|
+
DEFAULT_GEMINI_VOICE,
|
|
28
|
+
GEMINI_MODEL,
|
|
29
|
+
} from './engines/gemini.mjs';
|
|
30
|
+
import {
|
|
31
|
+
gptAudioTTS,
|
|
32
|
+
GptAudioQuotaError,
|
|
33
|
+
GptAudioNoKeyError,
|
|
34
|
+
GptAudioUnavailableError,
|
|
35
|
+
DEFAULT_GPTAUDIO_VOICE,
|
|
36
|
+
GPTAUDIO_MODEL,
|
|
37
|
+
} from './engines/gptaudio.mjs';
|
|
38
|
+
import { f5TTS, F5UnavailableError, DEFAULT_F5_VOICE, F5_MODEL } from './engines/f5.mjs';
|
|
39
|
+
import {
|
|
40
|
+
supertonicTTS,
|
|
41
|
+
DEFAULT_SUPERTONIC_VOICE,
|
|
42
|
+
SUPERTONIC_MODEL,
|
|
43
|
+
} from './engines/supertonic.mjs';
|
|
44
|
+
import { decodeToPcmBuffer } from './audio.mjs';
|
|
45
|
+
import { saveRef, loadRef } from './ref.mjs';
|
|
46
|
+
import { speachesSTT, SpeachesUnavailableError, sttEndpoint } from './engines/speaches.mjs';
|
|
47
|
+
import { mlxWhisperSTT } from './engines/mlxwhisper.mjs';
|
|
48
|
+
|
|
49
|
+
/**
 * Gemini 3.1 Flash TTS — cloud primary, ru+en in one pass, supports style.
 *
 * `isFallbackError` marks the known can't-serve failures (quota, missing key,
 * truncated output, service unavailable) as reasons to advance the cascade.
 */
export const gemini = {
  name: 'gemini',
  model: GEMINI_MODEL,
  defaultVoice: DEFAULT_GEMINI_VOICE,
  capabilities: { style: true, langs: 'any', streaming: false },
  isFallbackError: (e) =>
    e instanceof GeminiQuotaError ||
    e instanceof GeminiNoKeyError ||
    e instanceof GeminiTruncatedError ||
    e instanceof GeminiUnavailableError,
  /**
   * Synthesize `text` in one Gemini pass, then opportunistically register the
   * output as this peer's F5 voice ref.
   *
   * `deps` is a test seam (engine + ref-store fns); the defaults are the real
   * modules, so a single-argument call is unchanged.
   *
   * @param {{text: string, voice: string, style?: string, personality?: string}} ctx
   * @param {{tts?: Function, saveRef?: Function}} [deps]
   * @returns {Promise<{pcm: *, voice: string, finishReason: *}>}
   */
  async synthesize({ text, voice, style, personality }, deps = {}) {
    const tts = deps.tts ?? geminiTTS;
    const save = deps.saveRef ?? saveRef;
    const { pcm, finishReason } = await tts(text, voice, style); // single pass
    // Gemini just succeeded — the one moment we can capture this peer's voice
    // for F5 (a ref can't be minted on the F5 path, where Gemini is down). Reuse
    // THIS output as the ref, keyed to the voice. Best-effort: try/catch (rather
    // than `.catch()` on the return value) also covers a save function that
    // throws synchronously or returns a non-promise, so it never affects the audio.
    try {
      await save(personality, pcm, text, voice);
    } catch {
      // Deliberately ignored: ref registration must not fail the synthesis.
    }
    return { pcm, voice, finishReason };
  },
};
|
|
74
|
+
|
|
75
|
+
/**
 * gpt-audio over OpenRouter — cloud second rung, multilingual in one pass,
 * supports style. Quota, missing-key and unavailable errors advance the cascade.
 */
export const gptAudio = {
  name: 'gpt-audio',
  model: GPTAUDIO_MODEL,
  defaultVoice: DEFAULT_GPTAUDIO_VOICE,
  capabilities: { style: true, langs: 'any', streaming: false },
  isFallbackError: (e) =>
    [GptAudioQuotaError, GptAudioNoKeyError, GptAudioUnavailableError]
      .some((Kind) => e instanceof Kind),
  /**
   * @param {{text: string, voice: string, style?: string}} ctx
   * @param {{tts?: Function}} [deps] test seam; defaults to the real engine
   * @returns {Promise<{pcm: *, voice: string}>}
   */
  async synthesize({ text, voice, style }, deps = {}) {
    const engine = deps.tts ?? gptAudioTTS;
    const reply = await engine(text, voice, style);
    return { pcm: reply.pcm, voice };
  },
};
|
|
91
|
+
|
|
92
|
+
/** F5-TTS — live-prosody Russian rung with per-peer voice cloning from a ref. */
export const f5 = {
  name: 'f5',
  model: F5_MODEL,
  defaultVoice: DEFAULT_F5_VOICE,
  capabilities: { style: false, langs: ['ru'], streaming: false },
  isFallbackError: (e) => e instanceof F5UnavailableError,
  /**
   * Synthesize Russian speech into `<tmpDir>/f5.wav`, then decode it to PCM.
   *
   * @param {{text: string, personality?: string, tmpDir: string}} ctx
   * @param {{tts?: Function, loadRef?: Function, decode?: Function}} [deps]
   *   test seam; defaults to the real modules
   * @returns {Promise<{pcm: *, voice: string, lang: 'ru'}>} `voice` is the
   *   personality when a cached ref was used, else the F5 default voice
   */
  async synthesize({ text, personality, tmpDir }, deps = {}) {
    const synth = deps.tts ?? f5TTS;
    const fetchRef = deps.loadRef ?? loadRef;
    const toPcm = deps.decode ?? decodeToPcmBuffer;

    // Per-peer voice: this peer's cached ref, if any, is what F5 clones. Nothing
    // is generated here; a null ref means F5 falls back to its default voice.
    const ref = await fetchRef(personality);

    const wavPath = join(tmpDir, 'f5.wav');
    await synth(text, 'ru', wavPath, ref); // F5 reads ruaccent '+' natively
    const pcm = await toPcm(wavPath);

    const voice = ref ? personality : DEFAULT_F5_VOICE;
    return { pcm, voice, lang: 'ru' };
  },
};
|
|
112
|
+
|
|
113
|
+
/** Supertonic 3 — local, offline floor. One pass in the given language. */
export const supertonic = {
  name: 'supertonic',
  model: SUPERTONIC_MODEL,
  defaultVoice: DEFAULT_SUPERTONIC_VOICE,
  capabilities: { style: false, langs: ['ru', 'en', 'na'], streaming: false },
  // Deliberately no isFallbackError: this is the floor, so any error raised
  // here propagates as a real failure.
  /**
   * Synthesize into `<tmpDir>/st.wav`, then decode it to PCM.
   *
   * @param {{text: string, voice: string, lang: string, tmpDir: string}} ctx
   * @param {{tts?: Function, decode?: Function}} [deps] test seam
   * @returns {Promise<{pcm: *, voice: string, lang: string}>}
   */
  async synthesize({ text, voice, lang, tmpDir }, deps = {}) {
    const synth = deps.tts ?? supertonicTTS;
    const toPcm = deps.decode ?? decodeToPcmBuffer;
    const wavPath = join(tmpDir, 'st.wav');
    await synth(text, lang, wavPath, voice);
    return { pcm: await toPcm(wavPath), voice, lang };
  },
};
|
|
129
|
+
|
|
130
|
+
/** All TTS provider objects defined in this module, keyed by export name. */
export const ttsProviders = { gemini, gptAudio, f5, supertonic };
|
|
131
|
+
|
|
132
|
+
/**
 * Map a forced TTS `engine` value to its provider.
 * @param {string} engine 'gemini' | 'gpt-audio' | 'supertonic'
 * @returns {object|null} the provider, or null for any other value
 *   (including 'auto'; note that 'f5' has no entry here either)
 */
export function ttsProviderByEngine(engine) {
  switch (engine) {
    case 'gemini':
      return gemini;
    case 'gpt-audio':
      return gptAudio;
    case 'supertonic':
      return supertonic;
    default:
      return null;
  }
}
|
|
139
|
+
|
|
140
|
+
/**
 * Build the declarative `auto` TTS cascade, in priority order:
 * gemini → gpt-audio → f5 → supertonic.
 *
 * Voices are baked into each entry. F5 is applicable ONLY on the ru route (its
 * English is weak, per the original note); Supertonic is the floor and runs in
 * the routed language. Each entry's `run` overrides the context's `voice` and
 * `lang` with the entry's own values (undefined when the entry sets none).
 *
 * @param {{route:'ru'|'en'|'na', gemVoice:string, gaVoice:string, stVoice:string}} a
 * @returns {Array<{name: string, applicable: (Function|undefined),
 *   isFallbackError: (Function|undefined), run: Function}>} entries shaped for runCascade()
 */
export function buildTtsCascade({ route, gemVoice, gaVoice, stVoice }) {
  function toEntry(provider, { voice, lang, applicable }) {
    return {
      name: provider.name,
      applicable,
      isFallbackError: provider.isFallbackError,
      run: (ctx) => provider.synthesize({ ...ctx, voice, lang }),
    };
  }

  const onlyRussian = () => route === 'ru';

  return [
    toEntry(gemini, { voice: gemVoice }),
    toEntry(gptAudio, { voice: gaVoice }),
    toEntry(f5, { lang: 'ru', applicable: onlyRussian }),
    toEntry(supertonic, { voice: stVoice, lang: route }),
  ];
}
|
|
160
|
+
|
|
161
|
+
// ── STT providers ───────────────────────────────────────────────────────────
|
|
162
|
+
// Mirror of the TTS side: uniform contract, declarative cascade. speaches (the
|
|
163
|
+
// OpenAI-compatible HTTP service telegram-runtime already uses) is the primary
|
|
164
|
+
// rung but applies ONLY when an endpoint is configured; mlx-whisper is the local
|
|
165
|
+
// floor. Adding spokenly / Gemini-Flash STT later = one more object here.
|
|
166
|
+
|
|
167
|
+
/** speaches (faster-whisper, OpenAI /v1/audio/transcriptions) — primary STT. */
export const speaches = {
  name: 'speaches',
  capabilities: { streaming: false, languages: 'any' },
  isFallbackError: (e) => e instanceof SpeachesUnavailableError,
  // Skipped entirely when no STT endpoint is configured.
  applicable: () => !!sttEndpoint(),
  /**
   * @param {{audioPath: string, lang?: string, prompt?: string}} ctx
   * @param {{stt?: Function}} [deps] test seam; defaults to the real engine
   * @returns {Promise<{text: string}>}
   */
  async transcribe({ audioPath, lang, prompt }, deps = {}) {
    const engine = deps.stt ?? speachesSTT;
    const reply = await engine(audioPath, { lang, prompt });
    return { text: reply.text };
  },
};
|
|
179
|
+
|
|
180
|
+
/** mlx_whisper CLI — local, offline STT floor. */
export const mlxWhisper = {
  name: 'mlx-whisper',
  capabilities: { streaming: false, languages: 'any' },
  // Deliberately no isFallbackError: this is the floor, so any error raised
  // here propagates as a real failure.
  /**
   * @param {{audioPath: string, lang?: string, prompt?: string}} ctx
   * @param {{stt?: Function}} [deps] test seam; defaults to the real engine
   * @returns {Promise<{text: string}>}
   */
  async transcribe({ audioPath, lang, prompt }, deps = {}) {
    const engine = deps.stt ?? mlxWhisperSTT;
    const reply = await engine(audioPath, { lang, prompt });
    return { text: reply.text };
  },
};
|
|
191
|
+
|
|
192
|
+
/** All STT provider objects defined in this module, keyed by export name. */
export const sttProviders = { speaches, mlxWhisper };
|
|
193
|
+
|
|
194
|
+
/**
 * Map a forced STT `engine` value to its provider.
 * @param {string} engine 'speaches' | 'mlx-whisper'
 * @returns {object|null} the provider, or null for any other value (e.g. 'auto')
 */
export function sttProviderByEngine(engine) {
  switch (engine) {
    case 'speaches':
      return speaches;
    case 'mlx-whisper':
      return mlxWhisper;
    default:
      return null;
  }
}
|
|
200
|
+
|
|
201
|
+
/**
 * Build the declarative STT cascade: speaches (applicable only when an endpoint
 * is configured) → mlx-whisper floor.
 * @returns {Array<{name: string, applicable: (Function|undefined),
 *   isFallbackError: (Function|undefined), run: Function}>} entries shaped for runCascade()
 */
export function buildSttCascade() {
  return [speaches, mlxWhisper].map((provider) => ({
    name: provider.name,
    applicable: provider.applicable,
    isFallbackError: provider.isFallbackError,
    run: (ctx) => provider.transcribe(ctx),
  }));
}
|
package/src/ref.mjs
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Per-peer F5 voice reference — voice-as-identity on the F5 ladder rung (6b).
|
|
3
|
+
*
|
|
4
|
+
* F5-TTS clones the voice of a reference sample: POST .../synthesize accepts
|
|
5
|
+
* {ref_audio_b64, ref_text} and speaks the request in THAT voice. So peer X
|
|
6
|
+
* sounds like itself on an F5 overflow if we hand F5 a sample of X's voice.
|
|
7
|
+
*
|
|
8
|
+
* The catch: a ref of X's voice can only be MINTED while Gemini works (Gemini is
|
|
9
|
+
* X's primary voice), but F5 runs precisely when Gemini is exhausted. So the ref
|
|
10
|
+
* is NOT generated on the F5 path. Instead it is REGISTERED opportunistically on
|
|
11
|
+
* a successful Gemini synthesis (saveRef), reusing that real Gemini output — the
|
|
12
|
+
* canonical PCM, before ogg encoding — as the sample, with the spoken text as
|
|
13
|
+
* its transcript. Zero extra Gemini calls. The F5 path only ever loads + sends
|
|
14
|
+
* (loadRef); if no ref exists yet (peer never spoke via Gemini), F5 uses its
|
|
15
|
+
* default voice — a self-healing edge case that resolves after X's first synth.
|
|
16
|
+
*
|
|
17
|
+
* Mutability: the ref is keyed to the peer's Gemini voice. saveRef stamps the
|
|
18
|
+
* voice into the meta; when the peer's profile voice later differs, the ref is
|
|
19
|
+
* STALE and the next well-sized Gemini output refreshes it automatically — so
|
|
20
|
+
* changing a peer's voice propagates to its F5 voice with no manual step. A
|
|
21
|
+
* manual reset is just deleting the ref files.
|
|
22
|
+
*
|
|
23
|
+
* Cache (per peer X), under $PEER_VOICE_HOME/refs/ (default ~/.iapeer/cache/peer-voice):
|
|
24
|
+
* X.wav — the voice sample (RIFF wav, from the Gemini PCM)
|
|
25
|
+
* X.json — { gemini_voice, ref_text, registered_at }
|
|
26
|
+
*/
|
|
27
|
+
import { mkdir, readFile, writeFile, access } from 'node:fs/promises';
|
|
28
|
+
import { constants as FS } from 'node:fs';
|
|
29
|
+
import { join } from 'node:path';
|
|
30
|
+
import { peerVoiceHome } from './home.mjs';
|
|
31
|
+
import {
|
|
32
|
+
encodePcmToWav,
|
|
33
|
+
SAMPLE_RATE,
|
|
34
|
+
CHANNELS,
|
|
35
|
+
BYTES_PER_SAMPLE,
|
|
36
|
+
} from './audio.mjs';
|
|
37
|
+
|
|
38
|
+
// Bytes of canonical PCM per second of audio: sample rate × channel count ×
// bytes per sample (all three constants come from audio.mjs).
const BYTES_PER_SEC = SAMPLE_RATE * CHANNELS * BYTES_PER_SAMPLE; // 48000 B/s
|
|
39
|
+
|
|
40
|
+
// A good voice clone wants a few seconds of clean speech: too short underdefines
// the timbre, too long bloats the request and risks artefacts. We register the
// first suitable Gemini output and skip the rest. Env-tunable.

/**
 * Read a seconds value from the environment.
 * An unset or empty variable, or one that does not parse as a number, yields
 * `fallback`. Returning NaN would make both `seconds < MIN_SEC` and
 * `seconds > MAX_SEC` false, silently disabling the duration window.
 * @param {string} name      env variable name
 * @param {number} fallback  value used when the variable is unusable
 * @returns {number}
 */
function envSeconds(name, fallback) {
  const raw = process.env[name];
  if (!raw) return fallback;
  const parsed = Number(raw);
  return Number.isNaN(parsed) ? fallback : parsed;
}

const MIN_SEC = envSeconds('PEER_VOICE_REF_MIN_SEC', 4);
const MAX_SEC = envSeconds('PEER_VOICE_REF_MAX_SEC', 15);
|
|
45
|
+
|
|
46
|
+
// IAP personalities are lowercase slugs; keep the cache filename to that shape so
// a stray/odd identity can never escape the refs dir. The pattern admits a
// lowercase letter followed by up to 63 lowercase letters, digits or hyphens —
// so no path separators and no dots.
const PERSONALITY_RE = /^[a-z][a-z0-9-]{0,63}$/;
|
|
49
|
+
|
|
50
|
+
/** Directory holding every peer's cached ref: `<peerVoiceHome()>/refs`. */
function refsDir() {
  const home = peerVoiceHome();
  return join(home, 'refs');
}
|
|
53
|
+
/**
 * File paths of a peer's cached ref inside the refs dir.
 * @param {string} personality
 * @returns {{wav: string, meta: string}} `<personality>.wav` (the sample) and
 *   `<personality>.json` (its meta)
 */
function refPaths(personality) {
  const stem = join(refsDir(), personality);
  return {
    wav: `${stem}.wav`,
    meta: `${stem}.json`,
  };
}
|
|
57
|
+
/**
 * Whether `personality` is a string matching PERSONALITY_RE, i.e. safe to use
 * as a cache filename stem.
 * @param {*} personality
 * @returns {boolean}
 */
function validPersonality(personality) {
  if (typeof personality !== 'string') return false;
  return PERSONALITY_RE.test(personality);
}
|
|
60
|
+
/**
 * Whether a filesystem path exists (F_OK access check).
 * @param {string} p
 * @returns {Promise<boolean>} false on any access error
 */
async function exists(p) {
  try {
    await access(p, FS.F_OK);
  } catch {
    return false;
  }
  return true;
}
|
|
63
|
+
/** Write one diagnostic line to stderr, prefixed with `[peer-voice/ref]`. */
function log(msg) {
  const line = `[peer-voice/ref] ${msg}\n`;
  process.stderr.write(line);
}
|
|
64
|
+
|
|
65
|
+
/**
 * Duration in seconds of a canonical-PCM buffer.
 * @param {Buffer|null|undefined} pcmBuffer
 * @returns {number} byte length divided by BYTES_PER_SEC; 0 for a missing or
 *   empty buffer
 */
export function pcmDurationSec(pcmBuffer) {
  const byteCount = pcmBuffer ? pcmBuffer.length : 0;
  return byteCount ? byteCount / BYTES_PER_SEC : 0;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Parsed meta for peer X ({ gemini_voice, ref_text, registered_at }).
 * @param {string} personality
 * @returns {Promise<object|null>} null when the personality fails validation,
 *   or the meta file is missing / unreadable / unparseable / not an object
 */
export async function refMeta(personality) {
  if (!validPersonality(personality)) return null;

  const metaFile = refPaths(personality).meta;
  let parsed;
  try {
    parsed = JSON.parse(await readFile(metaFile, 'utf8'));
  } catch {
    return null;
  }
  if (parsed && typeof parsed === 'object') return parsed;
  return null;
}
|
|
82
|
+
|
|
83
|
+
/**
 * Whether peer X has a usable cached ref, i.e. BOTH the wav sample and its meta
 * file exist.
 * @param {string} personality
 * @returns {Promise<boolean>} false when the personality fails validation
 */
export async function hasRef(personality) {
  if (!validPersonality(personality)) return false;

  const { wav, meta } = refPaths(personality);
  if (!(await exists(wav))) return false;
  return exists(meta);
}
|
|
89
|
+
|
|
90
|
+
/**
 * Opportunistically register or refresh peer X's F5 voice ref from a successful
 * Gemini synthesis. Intended for the Gemini-success path only (never F5).
 *
 * Nothing is written (so the wav encoder is never invoked) when: the
 * personality fails validation, the text is blank, no Gemini voice is given, a
 * CURRENT ref already exists, or the audio duration is outside
 * [MIN_SEC, MAX_SEC] (wait for a better-sized one).
 *
 * A ref is CURRENT only when its meta is stamped with the same voice AND the
 * wav sample is still on disk. A meta stamped with a DIFFERENT voice is stale
 * and refreshed. A meta whose wav is missing is re-registered — otherwise
 * loadRef (which needs both files) would return null indefinitely while this
 * function kept reporting 'exists-current'.
 *
 * Best-effort: write failures are logged and reported in the result, not thrown.
 *
 * @param {string} personality
 * @param {Buffer} pcmBuffer   canonical PCM (s16le 24 kHz mono), BEFORE ogg encode
 * @param {string} text        the text Gemini spoke — becomes ref_text (transcript)
 * @param {string} geminiVoice the peer's Gemini voice this sample was spoken in
 * @returns {Promise<{saved: boolean, refreshed?: boolean, reason?: string, seconds?: number}>}
 */
export async function saveRef(personality, pcmBuffer, text, geminiVoice) {
  if (!validPersonality(personality)) return { saved: false, reason: 'bad-personality' };
  if (!text || !String(text).trim()) return { saved: false, reason: 'empty-text' };
  if (!geminiVoice || !String(geminiVoice).trim()) return { saved: false, reason: 'no-voice' };

  const { wav, meta: metaPath } = refPaths(personality);

  // Same voice → current, registered once per (peer, voice) — but only while the
  // sample file is actually there. Different voice → the peer's voice changed.
  const meta = await refMeta(personality);
  const stale = Boolean(meta) && meta.gemini_voice !== geminiVoice;
  const current = Boolean(meta) && !stale && (await exists(wav));
  if (current) return { saved: false, reason: 'exists-current' };

  const seconds = pcmDurationSec(pcmBuffer);
  if (seconds < MIN_SEC || seconds > MAX_SEC) {
    return { saved: false, reason: 'duration-out-of-window', seconds };
  }

  try {
    await mkdir(refsDir(), { recursive: true });
    await encodePcmToWav(pcmBuffer, wav);
    await writeFile(metaPath, JSON.stringify({
      gemini_voice: geminiVoice,
      ref_text: String(text),
      registered_at: new Date().toISOString(),
    }, null, 2), 'utf8');
    log(`${stale ? 'refreshed' : 'registered'} ref for "${personality}" `
      + `(voice ${geminiVoice}, ${seconds.toFixed(1)}s).`);
    return { saved: true, refreshed: stale, seconds };
  } catch (e) {
    log(`saveRef failed for "${personality}" (${e.message}); will retry on a later synthesis.`);
    return { saved: false, reason: 'write-error' };
  }
}
|
|
137
|
+
|
|
138
|
+
/**
 * Load peer X's cached F5 ref. Never generates one — registration is saveRef's
 * job, on the Gemini path.
 * @param {string} personality
 * @returns {Promise<{personality: string, audioB64: string, text: string}|null>}
 *   null when there is no ref (sample or meta missing), the sample or the
 *   stored transcript is empty, or reading fails (logged)
 */
export async function loadRef(personality) {
  const present = await hasRef(personality);
  if (!present) return null;

  const { wav } = refPaths(personality);
  try {
    const stored = await refMeta(personality);
    const sample = await readFile(wav);
    const audioB64 = sample.toString('base64');
    const text = stored && stored.ref_text ? String(stored.ref_text).trim() : '';
    return audioB64 && text ? { personality, audioB64, text } : null;
  } catch (e) {
    log(`loadRef failed for "${personality}" (${e.message}); using default voice.`);
    return null;
  }
}
|
package/src/router.mjs
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Declarative provider router — the formalized engine cascade.
|
|
3
|
+
*
|
|
4
|
+
* Before this, voice.mjs hard-coded the fallback ladder as nested try/catch.
|
|
5
|
+
* The router turns that into data: an ordered list of entries, each with an
|
|
6
|
+
* `applicable` guard and an `isFallbackError` predicate. The cascade runs them
|
|
7
|
+
* in order, advancing to the next entry ONLY on a known "can't-serve" error;
|
|
8
|
+
* any other error propagates so real bugs surface instead of being masked.
|
|
9
|
+
*
|
|
10
|
+
* Idea lineage (see the voice-gateway research): a declarative table with
|
|
11
|
+
* priority + fallback + cooldown (litellm Router). Cooldown is implemented but
|
|
12
|
+
* INERT by default (cooldownSec = 0) — Phase 1 is a behavior-preserving refactor;
|
|
13
|
+
* flipping cooldown on is a later, deliberate config change.
|
|
14
|
+
*
|
|
15
|
+
* Modality-agnostic: it runs TTS and STT cascades alike. An entry is:
|
|
16
|
+
* {
|
|
17
|
+
* name: string, // 'gemini' | 'speaches' | ...
|
|
18
|
+
* applicable?: (ctx) => boolean, // skip this entry when false (e.g. F5 only for ru)
|
|
19
|
+
* isFallbackError?: (err) => boolean, // true → advance the cascade; false/absent → propagate
|
|
20
|
+
* run: (ctx) => Promise<object>, // do the work (synthesize / transcribe / …)
|
|
21
|
+
* }
|
|
22
|
+
*
|
|
23
|
+
* Returns the successful entry's result, augmented with:
|
|
24
|
+
* - name: the winning entry's name
|
|
25
|
+
* - fallbackFrom: 'gemini:GeminiQuotaError→gpt-audio:GptAudioNoKeyError' when the
|
|
26
|
+
* cascade fell through failures first, or undefined if the first
|
|
27
|
+
* applicable entry succeeded.
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
/**
 * Run an ordered provider cascade and return the first successful result.
 *
 * Entries are tried in order. An entry is skipped when its `applicable` guard
 * returns false, or — with cooldown enabled — while its recorded cooldown has
 * not expired. A thrown error advances to the next entry ONLY when the entry's
 * `isFallbackError` accepts it; any other error is rethrown immediately.
 *
 * @param {Array} entries  ordered cascade entries (highest priority first)
 * @param {object} ctx     context passed to each entry's `applicable` and `run`
 * @param {object} [opts]
 * @param {number} [opts.cooldownSec=0] seconds a failed entry is skipped (0 = inert)
 * @param {Map} [opts.cooldownStore] name → epoch-ms-until-usable (persist across calls to make cooldown stick)
 * @param {() => number} [opts.now] clock (injectable for tests)
 * @param {(name: string, err: Error) => void} [opts.onAdvance] notified each time the cascade falls through an entry
 * @returns {Promise<object>} the winning result + { name, fallbackFrom }
 * @throws the last fallback error when every tried entry failed; a generic
 *   Error when no entry ran at all (none applicable, or all in cooldown)
 */
export async function runCascade(entries, ctx, opts = {}) {
  const {
    cooldownSec = 0,
    cooldownStore = new Map(),
    now = () => Date.now(),
    onAdvance,
  } = opts;
  const cooldownActive = cooldownSec > 0;

  const failures = [];
  let finalError;
  let sawCooldownSkip = false;

  for (const candidate of entries) {
    if (candidate.applicable && !candidate.applicable(ctx)) continue;

    if (cooldownActive) {
      const usableAt = cooldownStore.get(candidate.name);
      if (usableAt && usableAt > now()) {
        sawCooldownSkip = true;
        continue;
      }
    }

    let outcome;
    try {
      outcome = await candidate.run(ctx);
    } catch (err) {
      // Only KNOWN can't-serve errors advance the cascade. Everything else is a
      // real failure and must surface rather than be masked by a fallback.
      const canAdvance = candidate.isFallbackError && candidate.isFallbackError(err);
      if (!canAdvance) throw err;

      failures.push(`${candidate.name}:${err.name}`);
      if (cooldownActive) cooldownStore.set(candidate.name, now() + cooldownSec * 1000);
      if (onAdvance) onAdvance(candidate.name, err);
      finalError = err;
      continue;
    }

    return {
      ...outcome,
      name: candidate.name,
      fallbackFrom: failures.length ? failures.join('→') : undefined,
    };
  }

  // No entry produced a result.
  if (finalError) throw finalError;
  const reason = sawCooldownSkip
    ? 'voice router: every applicable provider is in cooldown'
    : 'voice router: no applicable provider for this request';
  throw new Error(reason);
}
|