@agfpd/voice-connect 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/voice.mjs ADDED
@@ -0,0 +1,201 @@
1
+ /**
2
+ * voice_create — TTS routing & synthesis.
3
+ *
4
+ * Contract (per Artur, 2026-06-01): voice_create(text, voice?). Plain text is
5
+ * ALWAYS a single generation pass — no cutting, no joining, no artificial
6
+ * pauses — on every engine. Minimal load on the agent: give text, get an ogg.
7
+ *
8
+ * Engine routing is a four-rung fallback ladder, now expressed declaratively:
9
+ * the provider table lives in providers.mjs and the cascade is run by router.mjs.
10
+ * - Gemini 3.1 Flash TTS primary — direct Google API, one pass (ru+en).
11
+ * - gpt-audio second (OpenRouter) — cloud-quality fallback before the local
12
+ * rungs; multilingual one pass, tried for ALL languages.
13
+ * - F5-TTS third — LIVE-prosody Russian rung, per-peer voice from a cached ref.
14
+ * - Supertonic 3 local floor — offline, one pass in the routed language.
15
+ *
16
+ * A `style` directive (HOW to speak) rides on the cloud rungs (Gemini, gpt-audio)
17
+ * and is ignored by the local rungs. Each rung falls through ONLY on its known
18
+ * engine-failure classes; other errors propagate so real bugs surface.
19
+ *
20
+ * The `||` change-of-language marker is a LATER phase; Phase 1 flattens it away
21
+ * so it is never read aloud. The tool only PRODUCES the file — delivery
22
+ * (send_to_peer attachments) is the caller's job.
23
+ */
24
+ import { mkdtemp, rm, mkdir } from 'node:fs/promises';
25
+ import { tmpdir } from 'node:os';
26
+ import { join, dirname } from 'node:path';
27
+ import { randomBytes } from 'node:crypto';
28
+ import { classifyLangMode, supertonicLang } from './langsplit.mjs';
29
+ import { encodePcmToOgg, probe } from './audio.mjs';
30
+ import { callerPersonality, resolveVoice } from './profile.mjs';
31
+ import { voiceMap as configVoiceMap } from './config.mjs';
32
+ import { peerVoiceHome } from './home.mjs';
33
+ import { logSynthesis } from './synthlog.mjs';
34
+ import { runCascade } from './router.mjs';
35
+ import { ttsProviders, ttsProviderByEngine, buildTtsCascade } from './providers.mjs';
36
+
37
/**
 * Build the default output path: `<peerVoiceHome()>/out/voice-<ms>-<hex>.ogg`,
 * where `<ms>` is the current epoch time in milliseconds and `<hex>` is
 * 4 random bytes rendered as 8 hex characters.
 * @returns {string} path for the .ogg file
 */
function defaultOutPath() {
  const millis = Date.now();
  const suffix = randomBytes(4).toString('hex');
  const fileName = `voice-${millis}-${suffix}.ogg`;
  return join(peerVoiceHome(), 'out', fileName);
}
41
+
42
/**
 * Phase 1 text normalization: every reserved `||` marker becomes a space so it
 * is never spoken, runs of whitespace collapse to a single space, and the ends
 * are trimmed. Anything that is not a string yields ''.
 * @param {unknown} text
 * @returns {string}
 */
function flattenText(text) {
  if (typeof text !== 'string') {
    return '';
  }
  const withoutMarker = text.split('||').join(' ');
  // Splitting on whitespace runs and dropping the empty edge pieces is the
  // same as collapsing the runs and trimming.
  return withoutMarker.split(/\s+/).filter(Boolean).join(' ');
}
47
+
48
/**
 * Reduce a caller-supplied `lang` to one of the routing tags.
 * @param {unknown} lang
 * @returns {'ru'|'en'|'na'|undefined} the tag itself when it is exactly
 *   'ru', 'en' or 'na'; undefined for anything else
 */
function normalizeLang(lang) {
  switch (lang) {
    case 'ru':
    case 'en':
    case 'na':
      return lang;
    default:
      return undefined;
  }
}
52
+
53
/**
 * Pick the fallback route used when the cloud rungs are down:
 *   'ru' → F5 (Russian accent-tune; Supertonic-ru is the floor if F5 is down)
 *   'en' → Supertonic --lang en
 *   'na' → Supertonic --lang na (native multilingual, one pass)
 * An explicit caller `lang` wins. Without one the route comes from
 * classifyLangMode(text), whose 'mixed' answer is mapped to 'na'.
 * @param {string} text
 * @param {'ru'|'en'|'na'|undefined} langOpt already normalized
 * @returns {'ru'|'en'|'na'}
 */
function fallbackRoute(text, langOpt) {
  if (!langOpt) {
    const detected = classifyLangMode(text);
    if (detected === 'mixed') {
      return 'na';
    }
    return detected;
  }
  return langOpt;
}
69
+
70
/**
 * Write one synthesis trace entry without letting the trace affect the call.
 * A failing `logSynthesis` is reported on stderr and otherwise ignored, so it
 * can neither replace the real synthesis error nor fail a finished synthesis.
 * @param {object} entry record handed to logSynthesis as-is
 * @returns {Promise<void>}
 */
async function traceSynthesis(entry) {
  try {
    await logSynthesis(entry);
  } catch (logErr) {
    const why = logErr instanceof Error ? logErr.message : String(logErr);
    process.stderr.write(`[peer-voice] synthesis log write failed (${why}).\n`);
  }
}

/**
 * Create a voice file from text.
 *
 * The text is flattened first (the reserved `||` marker is dropped) and
 * rejected when nothing is left, before any profile/config lookup happens.
 * With engine 'auto' the provider cascade is run; a forced engine is called
 * directly and its failure propagates to the caller. The resulting PCM is
 * encoded to ogg at the output path and probed. Every outcome, success or
 * failure, leaves a best-effort trace via logSynthesis.
 *
 * @param {object} opts
 * @param {string} opts.text text to speak (mixed ru+en is fine)
 * @param {string} [opts.voice] voice for the call's PRIMARY engine (default Aoede on Gemini)
 * @param {'auto'|'gemini'|'gpt-audio'|'supertonic'} [opts.engine] default 'auto';
 *   any other value is treated as 'auto'
 * @param {'ru'|'en'|'na'} [opts.lang] caller-declared language for the F5/Supertonic rungs
 * @param {string} [opts.style] delivery directive for the cloud rungs
 * @param {string} [opts.out_path] output .ogg path
 * @returns {Promise<{path,engine,voice,lang?,probe,fallback_from?}>}
 * @throws {Error} when `text` is missing or empty after flattening; synthesis,
 *   encode and probe errors are rethrown unchanged
 */
export async function createVoice(opts = {}) {
  // The `= {}` default only covers undefined; an explicit null lands here too.
  const input = opts ?? {};

  // Validate first: bad input is rejected without touching profile or config.
  const text = flattenText(input.text);
  if (!text) {
    throw new Error('voice_create: `text` is required and must be non-empty.');
  }

  const engine = ['gemini', 'gpt-audio', 'supertonic', 'auto'].includes(input.engine) ? input.engine : 'auto';
  const langOpt = normalizeLang(input.lang);
  const style = typeof input.style === 'string' && input.style.trim() ? input.style.trim() : undefined;

  // Voice resolution: explicit override → caller's interfaces.voice[<model>] →
  // built-in default. The override is a single voice in the namespace of the
  // call's PRIMARY engine, so it applies ONLY there — never leaking across a
  // fallback (a Gemini voice name is invalid for Supertonic, etc.).
  const override = input.voice;
  const personality = callerPersonality(); // whose voice — for the F5 per-peer ref (env → cwd profile)
  const voiceMap = configVoiceMap(); // mode-aware: iapeer peer-profile → autonomous config file
  const gemVoice = resolveVoice({
    modelName: ttsProviders.gemini.model,
    def: ttsProviders.gemini.defaultVoice,
    applyOverride: engine === 'auto' || engine === 'gemini',
    override,
    voiceMap,
  }).voice;
  const gaVoice = resolveVoice({
    modelName: ttsProviders.gptAudio.model,
    def: ttsProviders.gptAudio.defaultVoice,
    applyOverride: engine === 'gpt-audio',
    override,
    voiceMap,
  }).voice;
  const stVoice = resolveVoice({
    modelName: ttsProviders.supertonic.model,
    def: ttsProviders.supertonic.defaultVoice,
    applyOverride: engine === 'supertonic',
    override,
    voiceMap,
  }).voice;

  const outPath = input.out_path || defaultOutPath();
  await mkdir(dirname(outPath), { recursive: true });
  const tmp = await mkdtemp(join(tmpdir(), 'peer-voice-'));

  // Filled in as the pipeline progresses so the failure trace can report
  // whatever was already known when the error happened.
  let usedEngine;
  let usedVoice;
  let usedLang;
  let usedFinishReason; // Gemini-only: how the (final, non-fallback) pass stopped
  let fallbackFrom;
  try {
    const ctx = { text, style, personality, tmpDir: tmp };

    let result;
    if (engine === 'auto') {
      // Cloud-quality first (Gemini → gpt-audio), then the language-routed local
      // rungs (ru → F5 → Supertonic-ru; en/na → Supertonic). An explicit caller
      // `lang` leads; otherwise the letter-share autodetect.
      const route = fallbackRoute(text, langOpt); // 'ru' | 'en' | 'na'
      result = await runCascade(buildTtsCascade({ route, gemVoice, gaVoice, stVoice }), ctx, {
        onAdvance: (name, err) =>
          process.stderr.write(`[peer-voice] ${name} unavailable (${err.message}); advancing cascade.\n`),
      });
    } else {
      // Forced single engine — no cascade; its failure propagates to the caller.
      const provider = ttsProviderByEngine(engine);
      const voice = engine === 'gemini' ? gemVoice : engine === 'gpt-audio' ? gaVoice : stVoice;
      // Forced Supertonic honors an explicit lang, else picks ru/en/na by share.
      const lang = engine === 'supertonic' ? (langOpt ?? supertonicLang(text)) : undefined;
      result = { ...(await provider.synthesize({ ...ctx, voice, lang })), name: provider.name };
    }

    usedEngine = result.name;
    usedVoice = result.voice;
    usedLang = result.lang;
    usedFinishReason = result.finishReason;
    fallbackFrom = result.fallbackFrom;

    await encodePcmToOgg(result.pcm, outPath);
    const info = await probe(outPath);

    // Structured trace of THIS synthesis — findable post-hoc by output path.
    // Best-effort: the ogg already exists, a broken log must not fail the call.
    await traceSynthesis({
      ok: true,
      engine: usedEngine,
      chars: text.length,
      voice: usedVoice,
      lang: usedLang ?? null,
      style: style ?? null,
      duration: info.duration ?? null,
      finishReason: usedFinishReason ?? null,
      fallback_from: fallbackFrom ?? null,
      path: outPath,
    });

    return {
      path: outPath,
      engine: usedEngine,
      voice: usedVoice,
      ...(usedLang ? { lang: usedLang } : {}),
      probe: info,
      ...(fallbackFrom ? { fallback_from: fallbackFrom } : {}),
    };
  } catch (err) {
    // Failures leave a trace too — a silent error was half of the original
    // diagnosis gap. Log what is known, then propagate unchanged; the guarded
    // trace guarantees a logging failure cannot replace `err`.
    await traceSynthesis({
      ok: false,
      engine: usedEngine ?? null,
      chars: text.length,
      voice: usedVoice ?? null,
      lang: usedLang ?? null,
      style: style ?? null,
      finishReason: usedFinishReason ?? null,
      fallback_from: fallbackFrom ?? null,
      error: err instanceof Error ? err.message : String(err),
    });
    throw err;
  } finally {
    // Scratch directory cleanup is best-effort and never masks the outcome.
    await rm(tmp, { recursive: true, force: true }).catch(() => {});
  }
}
package/src/worker.mjs ADDED
@@ -0,0 +1,120 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Detached background worker for async voice jobs (Phase 7, variant B).
4
+ *
5
+ * Spawned by jobs.mjs (detached + unref) with one argument: the job file path.
6
+ * It runs the normal synthesis pipeline OUT of the MCP request, then IAP-
7
+ * notifies the calling peer:
8
+ * - done: "voice job <id> done path=<path> note=<note>"
9
+ * - failed: "voice job <id> failed reason=<...>"
10
+ * The agent receives that IAP message and delivers the ogg itself
11
+ * (send_to_peer(<from note>, attachments=[path])).
12
+ *
13
+ * Notification goes through the IAP CLI (`iapeer send <personality> --message …`),
14
+ * resolved from PATH or PEER_VOICE_IAP_BIN — same posture as ffmpeg. The worker
15
+ * also writes <job>.result.json so completion is observable without the IAP.
16
+ */
17
+ import { readFile, writeFile } from 'node:fs/promises';
18
+ import { execFile } from 'node:child_process';
19
+ import { promisify } from 'node:util';
20
+ import { fileURLToPath, pathToFileURL } from 'node:url';
21
+ import { createVoice } from './voice.mjs';
22
+
23
// Promise-returning form of execFile, so the IAP CLI call can be awaited.
const pexecFile = promisify(execFile);

// IAP CLI executable: PEER_VOICE_IAP_BIN when it is set to a non-empty value,
// otherwise `iapeer`.
const IAP_BIN = process.env.PEER_VOICE_IAP_BIN ? process.env.PEER_VOICE_IAP_BIN : 'iapeer';
25
+
26
/**
 * Write one worker log line to stdout, tagged `[peer-voice/worker]` and
 * stamped with the current time in ISO-8601 form.
 * @param {string} msg
 */
function log(msg) {
  const stamp = new Date().toISOString();
  const line = `[peer-voice/worker] ${stamp} ${msg}\n`;
  process.stdout.write(line);
}
29
+
30
/**
 * Directory to run `iapeer` from when notifying.
 *
 * The worker lives inside the CALLING agent's context, but an IAP message TO
 * that caller has to come FROM a different identity: iapeer refuses
 * send-to-self and ties the sender to the cwd's .iapeer profile. So the
 * notification is sent AS peer-voice, from a directory whose profile is
 * peer-voice:
 *   - PEER_VOICE_NOTIFIER_CWD when set to a non-empty value (deployments where
 *     the package has no profile of its own);
 *   - otherwise the package root, one level up from this module's directory
 *     (the dev layout).
 * @returns {string} directory path
 */
function notifierCwd() {
  const configured = process.env.PEER_VOICE_NOTIFIER_CWD;
  if (configured) {
    return configured;
  }
  const packageRoot = new URL('..', import.meta.url); // one up from src/
  return fileURLToPath(packageRoot);
}
42
+
43
/**
 * Notify the calling peer over IAP, sent AS peer-voice (sending as the caller
 * would be a self-send). Best-effort: a failed send is logged, never thrown.
 * Without a personality there is nobody to address, so the message is only
 * logged.
 * @param {string|undefined} personality recipient of the IAP message
 * @param {string} message text passed to `--message`
 * @returns {Promise<void>}
 */
async function notify(personality, message) {
  if (!personality) {
    log(`no personality on job — cannot IAP-notify; message was: ${message}`);
    return;
  }

  const cwd = notifierCwd();

  // iapeer reads the sender from cwd's .iapeer profile and rejects an inherited
  // identity that disagrees with it (it checks PEER_IDENTITY == PEER_RUNTIME +
  // "-" + PEER_PERSONALITY against the profile). In production the worker
  // carries the CALLER's identity vars (e.g. claude-natalya), which would clash
  // with the peer-voice notifier profile. The child therefore gets a copy of
  // the environment without those three, and iapeer derives the sender purely
  // from the notifier cwd (→ peer-voice), TO the caller.
  const identityVars = ['PEER_PERSONALITY', 'PEER_IDENTITY', 'PEER_RUNTIME'];
  const env = Object.fromEntries(
    Object.entries(process.env).filter(([key]) => !identityVars.includes(key)),
  );

  log(`${IAP_BIN} send ${personality} --message ${JSON.stringify(message)} (from cwd ${cwd})`);
  const args = ['send', personality, '--message', message, '--topic', 'voice-job'];
  try {
    const { stdout: out, stderr: errOut } = await pexecFile(IAP_BIN, args, { cwd, env });
    const summary = String(out || errOut || '').trim().slice(0, 200);
    log(`${IAP_BIN} send ok: ${summary}`);
  } catch (e) {
    log(`${IAP_BIN} send FAILED: ${e.message}`);
  }
}
72
+
73
/**
 * Result-file path for a job file: `<name>.json` → `<name>.result.json`.
 * A job file without the `.json` suffix gets `.result.json` appended, so the
 * result can never be written over the job file itself.
 * @param {string} jobFile
 * @returns {string}
 */
function resultPathFor(jobFile) {
  return /\.json$/.test(jobFile)
    ? jobFile.replace(/\.json$/, '.result.json')
    : `${jobFile}.result.json`;
}

/**
 * Write the job outcome as pretty-printed JSON. Best-effort: a failed write is
 * logged, not thrown.
 * @param {string} resultFile
 * @param {object} payload
 * @returns {Promise<void>}
 */
async function writeResult(resultFile, payload) {
  try {
    await writeFile(resultFile, JSON.stringify(payload, null, 2), 'utf8');
  } catch (e) {
    log(`result file write FAILED (${resultFile}): ${e && e.message ? e.message : String(e)}`);
  }
}

/**
 * Run one voice job: read the job file, synthesize through createVoice, write
 * `<job>.result.json`, then IAP-notify the calling peer.
 *
 * The result file is written BEFORE the notification so completion stays
 * observable even when the IAP send is slow or fails. Only the synthesis
 * itself decides between `done` and `failed`; on failure `process.exitCode`
 * is set to 1.
 *
 * @param {string} jobFile path of the JSON job description
 * @returns {Promise<void>}
 * @throws when the job file cannot be read or parsed (nothing is notified then)
 */
export async function runJob(jobFile) {
  const job = JSON.parse(await readFile(jobFile, 'utf8'));
  // The per-peer F5 ref keys off PEER_PERSONALITY — carry the caller's identity
  // into this detached process so an overflow still speaks in their voice.
  if (job.personality) process.env.PEER_PERSONALITY = job.personality;
  const resultFile = resultPathFor(jobFile);
  const noteSuffix = job.note ? ` note=${job.note}` : '';

  let outcome;
  let message;
  try {
    log(`job ${job.job_id} start (chars=${(job.text || '').length}, engine=${job.engine})`);
    const r = await createVoice({
      text: job.text,
      voice: job.voice ?? undefined,
      engine: job.engine,
      lang: job.lang ?? undefined,
      style: job.style ?? undefined,
      out_path: job.out_path ?? undefined,
    });
    log(`job ${job.job_id} synthesized engine=${r.engine} path=${r.path}`);
    outcome = { ...r, job_id: job.job_id, status: 'done', note: job.note ?? null };
    message = `voice job ${job.job_id} done path=${r.path}${noteSuffix}`;
  } catch (e) {
    const reason = e && e.message ? e.message : String(e);
    log(`job ${job.job_id} FAILED: ${reason}`);
    outcome = { job_id: job.job_id, status: 'failed', reason, note: job.note ?? null };
    message = `voice job ${job.job_id} failed reason=${reason}`;
    process.exitCode = 1;
  }

  // Outside the try: a problem while reporting must not relabel a finished
  // synthesis as failed.
  await writeResult(resultFile, { ...outcome, finished_at: new Date().toISOString() });
  await notify(job.personality, message);
}
108
+
109
// Entry point for the detached spawn. The guard compares this module's URL with
// the script path Node was started with, so importing the module (e.g. from
// tests) has no side effects.
const launchedScript = process.argv[1];
if (launchedScript && pathToFileURL(launchedScript).href === import.meta.url) {
  const [, , jobFile] = process.argv;
  if (!jobFile) {
    // Usage error: the spawner must pass the job file path as the only argument.
    process.stderr.write('peer-voice worker: missing job file argument\n');
    process.exit(2);
  }
  runJob(jobFile).catch((err) => {
    // Reached when runJob itself rejects, e.g. the job file cannot be read or parsed.
    const detail = err && err.stack ? err.stack : err;
    process.stderr.write(`peer-voice worker fatal: ${detail}\n`);
    process.exit(1);
  });
}