@agfpd/voice-connect 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +72 -0
- package/bin/peer-voice-http.mjs +20 -0
- package/bin/peer-voice-mcp.mjs +18 -0
- package/package.json +61 -0
- package/src/apikey.mjs +61 -0
- package/src/audio.mjs +112 -0
- package/src/config.mjs +75 -0
- package/src/configfile.mjs +38 -0
- package/src/engines/f5.mjs +111 -0
- package/src/engines/gemini.mjs +199 -0
- package/src/engines/gptaudio.mjs +230 -0
- package/src/engines/mlxwhisper.mjs +70 -0
- package/src/engines/speaches.mjs +69 -0
- package/src/engines/supertonic.mjs +177 -0
- package/src/home.mjs +15 -0
- package/src/http.mjs +252 -0
- package/src/jobs.mjs +95 -0
- package/src/langsplit.mjs +129 -0
- package/src/profile.mjs +165 -0
- package/src/providers.mjs +210 -0
- package/src/ref.mjs +157 -0
- package/src/router.mjs +91 -0
- package/src/ruaccent.mjs +114 -0
- package/src/ruaccent_stress.py +66 -0
- package/src/server.mjs +278 -0
- package/src/stress.mjs +25 -0
- package/src/stt.mjs +48 -0
- package/src/synthlog.mjs +46 -0
- package/src/voice.mjs +201 -0
- package/src/worker.mjs +120 -0
package/src/voice.mjs
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* voice_create — TTS routing & synthesis.
|
|
3
|
+
*
|
|
4
|
+
* Contract (per Artur, 2026-06-01): voice_create(text, voice?). Plain text is
|
|
5
|
+
* ALWAYS a single generation pass — no cutting, no joining, no artificial
|
|
6
|
+
* pauses — on every engine. Minimal load on the agent: give text, get an ogg.
|
|
7
|
+
*
|
|
8
|
+
* Engine routing is a four-rung fallback ladder, now expressed declaratively:
|
|
9
|
+
* the provider table lives in providers.mjs and the cascade is run by router.mjs.
|
|
10
|
+
* - Gemini 3.1 Flash TTS primary — direct Google API, one pass (ru+en).
|
|
11
|
+
* - gpt-audio second (OpenRouter) — cloud-quality fallback before the local
|
|
12
|
+
* rungs; multilingual one pass, tried for ALL languages.
|
|
13
|
+
* - F5-TTS third — LIVE-prosody Russian rung, per-peer voice from a cached ref.
|
|
14
|
+
* - Supertonic 3 local floor — offline, one pass in the routed language.
|
|
15
|
+
*
|
|
16
|
+
* A `style` directive (HOW to speak) rides on the cloud rungs (Gemini, gpt-audio)
|
|
17
|
+
* and is ignored by the local rungs. Each rung falls through ONLY on its known
|
|
18
|
+
* engine-failure classes; other errors propagate so real bugs surface.
|
|
19
|
+
*
|
|
20
|
+
* The `||` change-of-language marker is a LATER phase; Phase 1 flattens it away
|
|
21
|
+
* so it is never read aloud. The tool only PRODUCES the file — delivery
|
|
22
|
+
* (send_to_peer attachments) is the caller's job.
|
|
23
|
+
*/
|
|
24
|
+
import { mkdtemp, rm, mkdir } from 'node:fs/promises';
|
|
25
|
+
import { tmpdir } from 'node:os';
|
|
26
|
+
import { join, dirname } from 'node:path';
|
|
27
|
+
import { randomBytes } from 'node:crypto';
|
|
28
|
+
import { classifyLangMode, supertonicLang } from './langsplit.mjs';
|
|
29
|
+
import { encodePcmToOgg, probe } from './audio.mjs';
|
|
30
|
+
import { callerPersonality, resolveVoice } from './profile.mjs';
|
|
31
|
+
import { voiceMap as configVoiceMap } from './config.mjs';
|
|
32
|
+
import { peerVoiceHome } from './home.mjs';
|
|
33
|
+
import { logSynthesis } from './synthlog.mjs';
|
|
34
|
+
import { runCascade } from './router.mjs';
|
|
35
|
+
import { ttsProviders, ttsProviderByEngine, buildTtsCascade } from './providers.mjs';
|
|
36
|
+
|
|
37
|
+
/**
 * Build a fresh default output path for a synthesized voice file.
 *
 * The file name combines the current epoch-millisecond timestamp with four
 * random bytes rendered as hex, and it is placed in the `out` directory under
 * the peer-voice home.
 *
 * @returns {string} path of the form `<home>/out/voice-<ms>-<hex>.ogg`
 */
function defaultOutPath() {
  const millis = Date.now();
  const suffix = randomBytes(4).toString('hex');
  const fileName = `voice-${millis}-${suffix}.ogg`;
  return join(peerVoiceHome(), 'out', fileName);
}
|
|
41
|
+
|
|
42
|
+
/**
 * Phase 1 text clean-up: replaces every reserved `||` marker with a space so
 * it is never spoken, squeezes each run of whitespace down to a single space,
 * and trims both ends.
 *
 * @param {unknown} text raw caller text
 * @returns {string} the flattened text; '' when `text` is not a string
 */
function flattenText(text) {
  if (typeof text !== 'string') {
    return '';
  }
  const withoutMarker = text.replace(/\|\|/g, ' ');
  const singleSpaced = withoutMarker.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
|
|
47
|
+
|
|
48
|
+
/**
 * Normalize the caller-supplied `lang` to a routing tag.
 *
 * @param {unknown} lang value taken from the caller's options
 * @returns {'ru'|'en'|'na'|undefined} the tag when it is one of the three
 *   recognized values, otherwise undefined
 */
function normalizeLang(lang) {
  switch (lang) {
    case 'ru':
    case 'en':
    case 'na':
      return lang;
    default:
      return undefined;
  }
}
|
|
52
|
+
|
|
53
|
+
/**
 * Choose the fallback engine/mode used when the cloud rungs are down:
 *   'ru' → F5 (Russian accent-tune; Supertonic-ru is the floor if F5 is down)
 *   'en' → Supertonic --lang en
 *   'na' → Supertonic --lang na (native multilingual, one pass)
 *
 * An explicit caller `lang` always wins. Without one, the share-based
 * autodetect (classifyLangMode) decides, and its 'mixed' verdict is routed
 * as 'na'.
 *
 * @param {string} text text to be spoken
 * @param {'ru'|'en'|'na'|undefined} langOpt caller language, already normalized
 * @returns {'ru'|'en'|'na'}
 */
function fallbackRoute(text, langOpt) {
  if (!langOpt) {
    const detected = classifyLangMode(text);
    if (detected === 'mixed') {
      return 'na';
    }
    return detected;
  }
  return langOpt;
}
|
|
69
|
+
|
|
70
|
+
/**
 * Create a voice file from text.
 *
 * Flow: normalize the options, resolve one voice per engine, flatten the text,
 * synthesize PCM (through the cascade for 'auto', or on one forced engine),
 * encode it to ogg at the output path, probe the result and write a structured
 * synthesis trace. A failure trace is written as well before the error is
 * rethrown. The temporary working directory is always removed.
 *
 * @param {object} opts
 * @param {string} opts.text       text to speak (mixed ru+en is fine)
 * @param {string} [opts.voice]    voice for the call's PRIMARY engine (default Aoede on Gemini)
 * @param {'auto'|'gemini'|'gpt-audio'|'supertonic'} [opts.engine]  default 'auto';
 *   any unrecognized value is treated as 'auto'
 * @param {'ru'|'en'|'na'} [opts.lang]  caller-declared language for the F5/Supertonic rungs
 * @param {string} [opts.style]    delivery directive for the cloud rungs
 * @param {string} [opts.out_path] output .ogg path
 * @returns {Promise<{path,engine,voice,lang?,probe,fallback_from?}>}
 * @throws {Error} when `text` is empty after flattening; any synthesis,
 *   encoding or probing error is rethrown unchanged
 */
export async function createVoice(opts = {}) {
  const engine = ['gemini', 'gpt-audio', 'supertonic', 'auto'].includes(opts.engine) ? opts.engine : 'auto';
  const langOpt = normalizeLang(opts.lang);
  // A blank or non-string style counts as "no style".
  const style = typeof opts.style === 'string' && opts.style.trim() ? opts.style.trim() : undefined;

  // Voice resolution: explicit override → caller's interfaces.voice[<model>] →
  // built-in default. The override is a single voice in the namespace of the
  // call's PRIMARY engine, so it applies ONLY there — never leaking across a
  // fallback (a Gemini voice name is invalid for Supertonic, etc.).
  const override = opts.voice;
  const personality = callerPersonality(); // whose voice — for the F5 per-peer ref (env → cwd profile)
  const voiceMap = configVoiceMap(); // mode-aware: iapeer peer-profile → autonomous config file
  const gemVoice = resolveVoice({
    modelName: ttsProviders.gemini.model,
    def: ttsProviders.gemini.defaultVoice,
    applyOverride: engine === 'auto' || engine === 'gemini',
    override,
    voiceMap,
  }).voice;
  const gaVoice = resolveVoice({
    modelName: ttsProviders.gptAudio.model,
    def: ttsProviders.gptAudio.defaultVoice,
    applyOverride: engine === 'gpt-audio',
    override,
    voiceMap,
  }).voice;
  const stVoice = resolveVoice({
    modelName: ttsProviders.supertonic.model,
    def: ttsProviders.supertonic.defaultVoice,
    applyOverride: engine === 'supertonic',
    override,
    voiceMap,
  }).voice;

  const text = flattenText(opts.text);
  if (!text) {
    throw new Error('voice_create: `text` is required and must be non-empty.');
  }

  const outPath = opts.out_path || defaultOutPath();
  await mkdir(dirname(outPath), { recursive: true });
  const tmp = await mkdtemp(join(tmpdir(), 'peer-voice-'));

  // Declared outside the try so the failure trace can report whatever was
  // already known when the error happened.
  let usedEngine;
  let usedVoice;
  let usedLang;
  let usedFinishReason; // Gemini-only: how the (final, non-fallback) pass stopped
  let fallbackFrom;
  try {
    const ctx = { text, style, personality, tmpDir: tmp };

    let result;
    if (engine === 'auto') {
      // Cloud-quality first (Gemini → gpt-audio), then the language-routed local
      // rungs (ru → F5 → Supertonic-ru; en/na → Supertonic). An explicit caller
      // `lang` leads; otherwise the letter-share autodetect.
      const route = fallbackRoute(text, langOpt); // 'ru' | 'en' | 'na'
      result = await runCascade(buildTtsCascade({ route, gemVoice, gaVoice, stVoice }), ctx, {
        onAdvance: (name, err) =>
          process.stderr.write(`[peer-voice] ${name} unavailable (${err.message}); advancing cascade.\n`),
      });
    } else {
      // Forced single engine — no cascade; its failure propagates to the caller.
      const provider = ttsProviderByEngine(engine);
      const voice = engine === 'gemini' ? gemVoice : engine === 'gpt-audio' ? gaVoice : stVoice;
      // Forced Supertonic honors an explicit lang, else picks ru/en/na by share.
      const lang = engine === 'supertonic' ? (langOpt ?? supertonicLang(text)) : undefined;
      result = { ...(await provider.synthesize({ ...ctx, voice, lang })), name: provider.name };
    }

    usedEngine = result.name;
    usedVoice = result.voice;
    usedLang = result.lang;
    usedFinishReason = result.finishReason;
    fallbackFrom = result.fallbackFrom;

    await encodePcmToOgg(result.pcm, outPath);
    const info = await probe(outPath);

    // Structured trace of THIS synthesis — findable post-hoc by output path.
    await logSynthesis({
      ok: true,
      engine: usedEngine,
      chars: text.length,
      voice: usedVoice,
      lang: usedLang ?? null,
      style: style ?? null,
      duration: info.duration ?? null,
      finishReason: usedFinishReason ?? null,
      fallback_from: fallbackFrom ?? null,
      path: outPath,
    });

    return {
      path: outPath,
      engine: usedEngine,
      voice: usedVoice,
      ...(usedLang ? { lang: usedLang } : {}),
      probe: info,
      ...(fallbackFrom ? { fallback_from: fallbackFrom } : {}),
    };
  } catch (err) {
    // Failures leave a trace too — a silent error was half of the original
    // diagnosis gap. Log what is known, then propagate unchanged. The trace is
    // best-effort: if writing it rejects, that must not replace the error that
    // actually broke the synthesis, so it is only reported on stderr.
    try {
      await logSynthesis({
        ok: false,
        engine: usedEngine ?? null,
        chars: text.length,
        voice: usedVoice ?? null,
        lang: usedLang ?? null,
        style: style ?? null,
        finishReason: usedFinishReason ?? null,
        fallback_from: fallbackFrom ?? null,
        error: err instanceof Error ? err.message : String(err),
      });
    } catch (logErr) {
      const why = logErr instanceof Error ? logErr.message : String(logErr);
      process.stderr.write(`[peer-voice] failure trace could not be written (${why}).\n`);
    }
    throw err;
  } finally {
    // Temp-dir cleanup is best-effort and never changes the call's outcome.
    await rm(tmp, { recursive: true, force: true }).catch(() => {});
  }
}
|
package/src/worker.mjs
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Detached background worker for async voice jobs (Phase 7, variant B).
|
|
4
|
+
*
|
|
5
|
+
* Spawned by jobs.mjs (detached + unref) with one argument: the job file path.
|
|
6
|
+
* It runs the normal synthesis pipeline OUT of the MCP request, then IAP-
|
|
7
|
+
* notifies the calling peer:
|
|
8
|
+
* - done: "voice job <id> done path=<path> note=<note>"
|
|
9
|
+
* - failed: "voice job <id> failed reason=<...>"
|
|
10
|
+
* The agent receives that IAP message and delivers the ogg itself
|
|
11
|
+
* (send_to_peer(<from note>, attachments=[path])).
|
|
12
|
+
*
|
|
13
|
+
* Notification goes through the IAP CLI (`iapeer send <personality> --message …`),
|
|
14
|
+
* resolved from PATH or PEER_VOICE_IAP_BIN — same posture as ffmpeg. The worker
|
|
15
|
+
* also writes <job>.result.json so completion is observable without the IAP.
|
|
16
|
+
*/
|
|
17
|
+
import { readFile, writeFile } from 'node:fs/promises';
|
|
18
|
+
import { execFile } from 'node:child_process';
|
|
19
|
+
import { promisify } from 'node:util';
|
|
20
|
+
import { fileURLToPath, pathToFileURL } from 'node:url';
|
|
21
|
+
import { createVoice } from './voice.mjs';
|
|
22
|
+
|
|
23
|
+
// Promise-returning wrapper around execFile, so the IAP CLI call can be awaited.
const pexecFile = promisify(execFile);

// Name or path of the IAP CLI: the PEER_VOICE_IAP_BIN environment variable
// when it is set to a non-empty value, otherwise `iapeer`.
const IAP_BIN = process.env.PEER_VOICE_IAP_BIN ? process.env.PEER_VOICE_IAP_BIN : 'iapeer';
|
|
25
|
+
|
|
26
|
+
/**
 * Write one worker log line to stdout, prefixed with the worker tag and the
 * current time as an ISO-8601 timestamp.
 *
 * @param {string} msg text to log
 */
function log(msg) {
  const timestamp = new Date().toISOString();
  const line = `[peer-voice/worker] ${timestamp} ${msg}\n`;
  process.stdout.write(line);
}
|
|
29
|
+
|
|
30
|
+
/**
 * Directory to run `iapeer` from when notifying the caller.
 *
 * The worker lives inside the CALLING agent's context, but an IAP message TO
 * that caller must come FROM a different identity — iapeer refuses
 * send-to-self and ties the sender to the cwd's .iapeer profile. So the
 * notification is sent AS peer-voice, from a directory whose profile is
 * peer-voice: PEER_VOICE_NOTIFIER_CWD when set (deployments where the package
 * has no profile of its own), otherwise the package root (dev).
 *
 * @returns {string} PEER_VOICE_NOTIFIER_CWD if non-empty, else the directory
 *   one level above this module's directory
 */
function notifierCwd() {
  const configured = process.env.PEER_VOICE_NOTIFIER_CWD;
  if (configured) {
    return configured;
  }
  // package root (one up from src/)
  const packageRoot = new URL('..', import.meta.url);
  return fileURLToPath(packageRoot);
}
|
|
42
|
+
|
|
43
|
+
/**
 * Notify the calling peer over IAP, sent AS peer-voice (not the caller — that
 * would be a self-send). Best-effort: a failed send is logged, never thrown.
 *
 * @param {string} [personality] recipient; when falsy nothing is sent and the
 *   undelivered message is logged instead
 * @param {string} message text passed to `iapeer send … --message`
 * @returns {Promise<void>}
 */
async function notify(personality, message) {
  if (!personality) {
    log(`no personality on job — cannot IAP-notify; message was: ${message}`);
    return;
  }

  const cwd = notifierCwd();

  // iapeer reads the sender from cwd's .iapeer profile and rejects an inherited
  // identity that disagrees with it (it checks PEER_IDENTITY == PEER_RUNTIME +
  // "-" + PEER_PERSONALITY against the profile). In production the worker
  // carries the CALLER's identity vars (e.g. claude-natalya), which would clash
  // with the peer-voice notifier profile — so all three are left out of the
  // child's environment and iapeer derives the sender purely from the notifier
  // cwd (→ peer-voice), TO the caller.
  const callerIdentityVars = ['PEER_PERSONALITY', 'PEER_IDENTITY', 'PEER_RUNTIME'];
  const env = Object.fromEntries(
    Object.entries(process.env).filter(([key]) => !callerIdentityVars.includes(key)),
  );

  log(`${IAP_BIN} send ${personality} --message ${JSON.stringify(message)} (from cwd ${cwd})`);

  const args = ['send', personality, '--message', message, '--topic', 'voice-job'];
  try {
    const sent = await pexecFile(IAP_BIN, args, { cwd, env });
    // Log a short excerpt of whatever the CLI printed (stdout preferred).
    const summary = String(sent.stdout || sent.stderr || '').trim().slice(0, 200);
    log(`${IAP_BIN} send ok: ${summary}`);
  } catch (sendErr) {
    log(`${IAP_BIN} send FAILED: ${sendErr.message}`);
  }
}
|
|
72
|
+
|
|
73
|
+
/**
 * Run one async voice job end to end.
 *
 * Reads the job spec (JSON) from `jobFile`, synthesizes the voice file with
 * createVoice, IAP-notifies the calling peer about the outcome, and writes a
 * `<job>.result.json` file next to the job file. A synthesis failure does not
 * reject: it is logged, notified, recorded in the result file, and
 * `process.exitCode` is set to 1.
 *
 * Job fields read here: `job_id`, `text`, `voice`, `engine`, `lang`, `style`,
 * `out_path`, `personality`, `note`.
 *
 * @param {string} jobFile path to the job's JSON file
 * @returns {Promise<void>}
 * @throws when the job file cannot be read or is not valid JSON
 */
export async function runJob(jobFile) {
  const job = JSON.parse(await readFile(jobFile, 'utf8'));
  // The per-peer F5 ref keys off PEER_PERSONALITY — carry the caller's identity
  // into this detached process so an overflow still speaks in their voice.
  if (job.personality) process.env.PEER_PERSONALITY = job.personality;

  // `<name>.json` → `<name>.result.json`. A job path without the `.json`
  // suffix gets `.result.json` appended, so the result can never be written
  // over the job file itself.
  const resultFile = jobFile.endsWith('.json')
    ? `${jobFile.slice(0, -'.json'.length)}.result.json`
    : `${jobFile}.result.json`;
  const noteSuffix = job.note ? ` note=${job.note}` : '';

  // The result file is a best-effort record: a failed write is logged and
  // does not change the job's outcome.
  const writeResult = (payload) =>
    writeFile(resultFile, JSON.stringify(payload, null, 2), 'utf8')
      .catch((writeErr) => {
        const why = writeErr && writeErr.message ? writeErr.message : String(writeErr);
        log(`job ${job.job_id} result file not written (${resultFile}): ${why}`);
      });

  try {
    log(`job ${job.job_id} start (chars=${(job.text || '').length}, engine=${job.engine})`);
    const r = await createVoice({
      text: job.text,
      voice: job.voice ?? undefined,
      engine: job.engine,
      lang: job.lang ?? undefined,
      style: job.style ?? undefined,
      out_path: job.out_path ?? undefined,
    });
    log(`job ${job.job_id} synthesized engine=${r.engine} path=${r.path}`);
    await notify(job.personality, `voice job ${job.job_id} done path=${r.path}${noteSuffix}`);
    await writeResult({
      ...r, job_id: job.job_id, status: 'done', note: job.note ?? null,
      finished_at: new Date().toISOString(),
    });
  } catch (e) {
    const reason = e && e.message ? e.message : String(e);
    log(`job ${job.job_id} FAILED: ${reason}`);
    await notify(job.personality, `voice job ${job.job_id} failed reason=${reason}`);
    await writeResult({
      job_id: job.job_id, status: 'failed', reason, note: job.note ?? null,
      finished_at: new Date().toISOString(),
    });
    // Signal the failure through the exit status without cutting the process short.
    process.exitCode = 1;
  }
}
|
|
108
|
+
|
|
109
|
+
// Entry point for the detached spawn. The job runs only when this module is
// the script Node was started with, so importing it (e.g. from tests) has no
// side effects. Exit status: 2 when the job file argument is missing, 1 when
// runJob rejects.
const isDirectRun = Boolean(process.argv[1])
  && pathToFileURL(process.argv[1]).href === import.meta.url;

if (isDirectRun) {
  const [, , jobFile] = process.argv;
  if (!jobFile) {
    process.stderr.write('peer-voice worker: missing job file argument\n');
    process.exit(2);
  }
  runJob(jobFile).catch((fatal) => {
    const detail = fatal && fatal.stack ? fatal.stack : fatal;
    process.stderr.write(`peer-voice worker fatal: ${detail}\n`);
    process.exit(1);
  });
}
|