@agfpd/voice-connect 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +72 -0
- package/bin/peer-voice-http.mjs +20 -0
- package/bin/peer-voice-mcp.mjs +18 -0
- package/package.json +61 -0
- package/src/apikey.mjs +61 -0
- package/src/audio.mjs +112 -0
- package/src/config.mjs +75 -0
- package/src/configfile.mjs +38 -0
- package/src/engines/f5.mjs +111 -0
- package/src/engines/gemini.mjs +199 -0
- package/src/engines/gptaudio.mjs +230 -0
- package/src/engines/mlxwhisper.mjs +70 -0
- package/src/engines/speaches.mjs +69 -0
- package/src/engines/supertonic.mjs +177 -0
- package/src/home.mjs +15 -0
- package/src/http.mjs +252 -0
- package/src/jobs.mjs +95 -0
- package/src/langsplit.mjs +129 -0
- package/src/profile.mjs +165 -0
- package/src/providers.mjs +210 -0
- package/src/ref.mjs +157 -0
- package/src/router.mjs +91 -0
- package/src/ruaccent.mjs +114 -0
- package/src/ruaccent_stress.py +66 -0
- package/src/server.mjs +278 -0
- package/src/stress.mjs +25 -0
- package/src/stt.mjs +48 -0
- package/src/synthlog.mjs +46 -0
- package/src/voice.mjs +201 -0
- package/src/worker.mjs +120 -0
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Fallback engine — Supertonic 3, local & offline.
|
|
3
|
+
*
|
|
4
|
+
* Triggered when Gemini hits its quota (or no key is configured). The caller
|
|
5
|
+
* passes an explicit --lang: 'ru'/'en' for a dominant single language, or 'na'
|
|
6
|
+
* (supertonic-3's native multilingual mode) for a balanced ru+en blend — 'na'
|
|
7
|
+
* reads both in ONE pass with no seams, so mixed text is NOT split by script.
|
|
8
|
+
* Prosody is flat (66M model — no natural pauses/intonation), but the audio is
|
|
9
|
+
* clean and it is free/offline, which is exactly what a fallback needs.
|
|
10
|
+
*
|
|
11
|
+
* Plug-and-play (§1): the venv and model are provisioned lazily and
|
|
12
|
+
* idempotently on first fallback use — no manual install step. Resolution
|
|
13
|
+
* order for the CLI:
|
|
14
|
+
* 1. PEER_VOICE_SUPERTONIC_BIN (explicit override)
|
|
15
|
+
* 2. managed venv at $PEER_VOICE_HOME/supertonic-venv (default ~/.iapeer/cache/peer-voice)
|
|
16
|
+
* 3. `supertonic` already on PATH
|
|
17
|
+
* 4. create the managed venv (python3 -m venv) + pip install supertonic
|
|
18
|
+
* The model (~/.cache/supertonic3) is fetched via `supertonic download` if absent.
|
|
19
|
+
*/
|
|
20
|
+
import { execFile } from 'node:child_process';
|
|
21
|
+
import { promisify } from 'node:util';
|
|
22
|
+
import { homedir } from 'node:os';
|
|
23
|
+
import { join, dirname } from 'node:path';
|
|
24
|
+
import { access, mkdir, readdir } from 'node:fs/promises';
|
|
25
|
+
import { constants as FS } from 'node:fs';
|
|
26
|
+
import { mapStressToUnicode } from '../stress.mjs';
|
|
27
|
+
import { accentPlus } from '../ruaccent.mjs';
|
|
28
|
+
import { hasCyrillic } from '../langsplit.mjs';
|
|
29
|
+
import { peerVoiceHome } from '../home.mjs';
|
|
30
|
+
|
|
31
|
+
const pexecFile = promisify(execFile);
|
|
32
|
+
|
|
33
|
+
/** Full model name — the key under interfaces.voice for the Supertonic voice. */
|
|
34
|
+
export const SUPERTONIC_MODEL = 'supertonic-3';
|
|
35
|
+
export const DEFAULT_SUPERTONIC_VOICE = process.env.PEER_VOICE_SUPERTONIC_VOICE || 'F3';
|
|
36
|
+
// 12 = Supertonic's documented quality ceiling (range 5–12; above it plateaus —
|
|
37
|
+
// the 16/32/64 differences Artur heard were generation nondeterminism, not
|
|
38
|
+
// quality). Override via env.
|
|
39
|
+
const STEPS = process.env.PEER_VOICE_SUPERTONIC_STEPS || '12';
|
|
40
|
+
// Supertonic's own default --speed is 1.05 (slightly rushed). 0.9 reads at a
|
|
41
|
+
// calmer, more natural pace — Artur's call on the palette. Override via env.
|
|
42
|
+
const SPEED = process.env.PEER_VOICE_SUPERTONIC_SPEED || '0.9';
|
|
43
|
+
|
|
44
|
+
/** Path of the `supertonic` executable inside the managed venv under the peer-voice home. */
function managedVenvBin() {
  const venvBinDir = join(peerVoiceHome(), 'supertonic-venv', 'bin');
  return join(venvBinDir, 'supertonic');
}
|
|
47
|
+
/** Model directory: SUPERTONIC_MODEL_DIR when set to a non-empty value, else ~/.cache/supertonic3. */
function modelDir() {
  const override = process.env.SUPERTONIC_MODEL_DIR;
  if (override) return override;
  return join(homedir(), '.cache', 'supertonic3');
}
|
|
50
|
+
|
|
51
|
+
/**
 * Resolve to true when `p` passes an execute-permission access check, false
 * when the check fails for any reason. Never rejects.
 */
async function exists(p) {
  return access(p, FS.X_OK).then(
    () => true,
    () => false,
  );
}
|
|
54
|
+
|
|
55
|
+
/**
 * Report whether at least one `*.onnx` file exists anywhere under `dir`.
 *
 * An empty or partially populated directory (for instance after an
 * interrupted download) therefore does not count as a present model.
 * Directories that cannot be listed are treated as containing nothing.
 *
 * @param {string} dir root directory to search
 * @returns {Promise<boolean>}
 */
async function modelPresent(dir) {
  // Explicit work list instead of recursion; only a yes/no answer is needed.
  const pending = [dir];
  while (pending.length > 0) {
    const current = pending.pop();
    let listing;
    try {
      listing = await readdir(current, { withFileTypes: true });
    } catch {
      continue; // unreadable / missing directory contributes nothing
    }
    for (const item of listing) {
      if (item.isDirectory()) {
        pending.push(join(current, item.name));
      } else if (item.name.endsWith('.onnx')) {
        return true;
      }
    }
  }
  return false;
}
|
|
74
|
+
|
|
75
|
+
/** Write one progress line to stderr, tagged with the module prefix. */
function log(msg) {
  const prefix = '[peer-voice/supertonic]';
  process.stderr.write(`${prefix} ${msg}\n`);
}
|
|
78
|
+
|
|
79
|
+
/**
 * Run `bin` with `args` through the promisified execFile.
 * The output buffer limit defaults to 64 MiB; any key in `opts` (including
 * `maxBuffer`) overrides the default.
 */
async function run(bin, args, opts = {}) {
  const execOptions = Object.assign({ maxBuffer: 64 * 1024 * 1024 }, opts);
  return pexecFile(bin, args, execOptions);
}
|
|
82
|
+
|
|
83
|
+
let cachedBin = null;
|
|
84
|
+
|
|
85
|
+
/**
 * Resolve (and if needed provision) the supertonic CLI. Idempotent.
 *
 * Resolution order:
 *   1. PEER_VOICE_SUPERTONIC_BIN — used as-is (no model check is performed here).
 *   2. the managed venv binary, if it is executable.
 *   3. `supertonic` on PATH (probed with `--version`).
 *   4. create the managed venv and pip-install supertonic + ruaccent.
 *
 * The resolved binary is cached in `cachedBin` ONLY after the model check for
 * it succeeded, so a failed model download is retried on the next call instead
 * of being remembered as a working install. A model-download failure for a
 * binary found on PATH is reported to the caller; it is not mistaken for
 * "not on PATH" and does not trigger venv provisioning.
 *
 * @returns {Promise<string>} path (or bare command name) of the supertonic CLI
 * @throws when provisioning or the model download fails
 */
export async function ensureSupertonic() {
  if (cachedBin) return cachedBin;

  // 1. explicit override
  const override = process.env.PEER_VOICE_SUPERTONIC_BIN;
  if (override) {
    cachedBin = override;
    return cachedBin;
  }

  // 2. managed venv
  const managed = managedVenvBin();
  if (await exists(managed)) {
    await ensureModel(managed);
    cachedBin = managed;
    return cachedBin;
  }

  // 3. already on PATH — only the probe itself is guarded, so errors from the
  //    model download below are not swallowed.
  let onPath = false;
  try {
    await run('supertonic', ['--version']);
    onPath = true;
  } catch { /* not on PATH — provision below */ }
  if (onPath) {
    await ensureModel('supertonic');
    cachedBin = 'supertonic';
    return cachedBin;
  }

  // 4. create managed venv + install
  const venvDir = join(peerVoiceHome(), 'supertonic-venv');
  log(`provisioning venv at ${venvDir} (one-time)…`);
  await mkdir(peerVoiceHome(), { recursive: true });
  const py = process.env.PEER_VOICE_PYTHON || 'python3';
  await run(py, ['-m', 'venv', venvDir]);
  const pip = join(venvDir, 'bin', 'pip');
  log('pip install supertonic + ruaccent (downloading onnxruntime — may take a few minutes)…');
  await run(pip, ['install', '--quiet', '--upgrade', 'pip']);
  // ruaccent is installed alongside supertonic; it is used for Russian stress.
  await run(pip, ['install', '--quiet', 'supertonic', 'ruaccent']);
  await ensureModel(managed);
  cachedBin = managed;
  return cachedBin;
}
|
|
125
|
+
|
|
126
|
+
/**
 * Make sure the model is on disk, downloading it with `<bin> download` when no
 * `*.onnx` file is found under the model dir. When SUPERTONIC_MODEL_DIR is
 * set, it is passed to the CLI as `--out`. Does nothing if the model is
 * already present.
 *
 * @param {string} bin supertonic CLI to invoke for the download
 */
async function ensureModel(bin) {
  const present = await modelPresent(modelDir());
  if (present) return;

  log('model not found — running `supertonic download` (one-time, from Hugging Face)…');
  const outDir = process.env.SUPERTONIC_MODEL_DIR;
  const args = outDir ? ['download', '--out', outDir] : ['download'];
  await run(bin, args);
}
|
|
135
|
+
|
|
136
|
+
// --- Russian stress via ruaccent ------------------------------------------
|
|
137
|
+
//
|
|
138
|
+
// Supertonic's weak Russian stress (синтЕза/сИнтеза, homographs замок/замок) is
|
|
139
|
+
// the main complaint. ruaccent marks stress with '+' before the stressed vowel
|
|
140
|
+
// (shared module ruaccent.mjs); Supertonic mis-reads the raw '+' as a sound, so
|
|
141
|
+
// stress.mjs maps '+vowel' -> 'vowel'+U+0301, which Supertonic honors. Gemini
|
|
142
|
+
// stresses Russian correctly on its own, so this is the Supertonic branch ONLY.
|
|
143
|
+
// Best-effort: any failure falls back to the original text — stress is an
|
|
144
|
+
// enhancement, never a hard dependency for synthesis.
|
|
145
|
+
|
|
146
|
+
/**
 * Add Russian stress marks to `text`, or return `text` unchanged on any failure.
 *
 * The stress pass is handed a python hint: the `python` sitting next to the
 * resolved CLI (`cachedBin`) when one is known, otherwise no hint.
 * Stress is an enhancement only, so an error from either helper is logged and
 * the original text is returned rather than failing the synthesis.
 *
 * @param {string} text
 * @returns {Promise<string>} stressed text, or the input text on failure
 */
async function accentRussian(text) {
  const hint = cachedBin ? join(dirname(cachedBin), 'python') : undefined;
  try {
    return mapStressToUnicode(await accentPlus(text, hint));
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    log(`russian stress unavailable — speaking text as-is (${reason})`);
    return text;
  }
}
|
|
153
|
+
|
|
154
|
+
/**
 * Synthesize one text to a wav file with the supertonic CLI.
 *
 * Text containing Cyrillic is first run through the Russian stress pass
 * (regardless of `lang`); other text is spoken as given. The model is pinned
 * explicitly via `--model` rather than relying on the CLI's default.
 *
 * @param {string} text
 * @param {'ru'|'en'|'na'|string} lang 'na' = native multilingual (balanced ru+en)
 * @param {string} wavPath output file
 * @param {string} [voice] defaults to DEFAULT_SUPERTONIC_VOICE
 * @returns {Promise<string>} wavPath
 */
export async function supertonicTTS(text, lang, wavPath, voice = DEFAULT_SUPERTONIC_VOICE) {
  const bin = await ensureSupertonic();

  let speakText = text;
  if (hasCyrillic(text)) {
    speakText = await accentRussian(text);
  }

  const cliArgs = ['tts', speakText, '-o', wavPath];
  cliArgs.push('--model', SUPERTONIC_MODEL);
  cliArgs.push('--lang', lang, '--voice', voice, '--steps', STEPS, '--speed', SPEED);
  await run(bin, cliArgs);
  return wavPath;
}
|
package/src/home.mjs
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Plugin data home.
|
|
3
|
+
*
|
|
4
|
+
* Team convention: agent/plugin data lives under ~/.iapeer/cache/<plugin>/
|
|
5
|
+
* (alongside mergemind, spawned-peer, persistent-peer). peer-voice keeps its
|
|
6
|
+
* managed Supertonic venv, ref cache, and output files there too — one place,
|
|
7
|
+
* consistent with the rest of the stand. Override the whole root via the
|
|
8
|
+
* PEER_VOICE_HOME env var.
|
|
9
|
+
*/
|
|
10
|
+
import { homedir } from 'node:os';
|
|
11
|
+
import { join } from 'node:path';
|
|
12
|
+
|
|
13
|
+
/** Data root: PEER_VOICE_HOME when set to a non-empty value, else ~/.iapeer/cache/peer-voice. */
export function peerVoiceHome() {
  const override = process.env.PEER_VOICE_HOME;
  if (override) return override;
  return join(homedir(), '.iapeer', 'cache', 'peer-voice');
}
|
package/src/http.mjs
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* voice-connect HTTP facade — an OpenAI-compatible local audio service over the
|
|
3
|
+
* SAME core as the MCP server (voice.mjs createVoice + stt.mjs transcribe via
|
|
4
|
+
* router/providers). This is the path runtimes (telegram-runtime, voicetalk)
|
|
5
|
+
* take directly, without MCP:
|
|
6
|
+
*
|
|
7
|
+
* POST /v1/audio/speech — TTS. JSON in, Ogg/Opus bytes out.
|
|
8
|
+
* POST /v1/audio/transcriptions — STT. multipart/form-data in, text/JSON out.
|
|
9
|
+
* GET /health — liveness for launchd / monitoring.
|
|
10
|
+
*
|
|
11
|
+
* The contracts mirror OpenAI's audio API so an off-the-shelf client (or the
|
|
12
|
+
* speaches/Kokoro shape telegram-runtime already drives) can point at this
|
|
13
|
+
* facade unchanged. The endpoints reuse the core verbatim — no synthesis or
|
|
14
|
+
* routing logic is duplicated here; this file is just HTTP wiring, the twin of
|
|
15
|
+
* server.mjs's MCP wiring.
|
|
16
|
+
*
|
|
17
|
+
* HTTP is synchronous request/response: a runtime holds the connection and
|
|
18
|
+
* wants bytes back. So /v1/audio/speech ALWAYS synthesizes inline (no async
|
|
19
|
+
* job / IAP-notify path — that belongs to the MCP tool, where an agent would
|
|
20
|
+
* otherwise block). Long input simply takes longer to return.
|
|
21
|
+
*
|
|
22
|
+
* Pure Node ESM, zero new deps: multipart is parsed by building a web `Request`
|
|
23
|
+
* from the raw body and calling `.formData()` (undici, built in since Node 18),
|
|
24
|
+
* not a hand-rolled boundary parser.
|
|
25
|
+
*/
|
|
26
|
+
import { createServer as nodeCreateServer } from 'node:http';
|
|
27
|
+
import { readFile, writeFile, rm } from 'node:fs/promises';
|
|
28
|
+
import { tmpdir } from 'node:os';
|
|
29
|
+
import { join } from 'node:path';
|
|
30
|
+
import { randomBytes } from 'node:crypto';
|
|
31
|
+
import { createVoice } from './voice.mjs';
|
|
32
|
+
import { transcribe as transcribeAudio } from './stt.mjs';
|
|
33
|
+
import { readVersion } from './server.mjs';
|
|
34
|
+
|
|
35
|
+
/** Upload cap — generous for a voice message, bounded so a bad request can't OOM us. */
|
|
36
|
+
const MAX_BODY = 64 * 1024 * 1024;
|
|
37
|
+
|
|
38
|
+
export const DEFAULT_HTTP_PORT = 8127;
|
|
39
|
+
|
|
40
|
+
/**
 * Listening port: env PEER_VOICE_HTTP_PORT when it parses to a valid TCP port
 * (1–65535), else the default. Out-of-range values fall back to the default
 * instead of being handed to `listen()`, which rejects ports above 65535.
 *
 * @returns {number}
 */
export function httpPort() {
  const MAX_TCP_PORT = 65535;
  const parsed = Number.parseInt(process.env.PEER_VOICE_HTTP_PORT || '', 10);
  const usable = Number.isInteger(parsed) && parsed > 0 && parsed <= MAX_TCP_PORT;
  return usable ? parsed : DEFAULT_HTTP_PORT;
}
|
|
45
|
+
|
|
46
|
+
/**
 * Bind host: env PEER_VOICE_HTTP_HOST with surrounding whitespace removed, or
 * loopback (127.0.0.1) when the variable is unset, empty, or blank.
 */
export function httpHost() {
  const LOOPBACK = '127.0.0.1';
  const configured = process.env.PEER_VOICE_HTTP_HOST || LOOPBACK;
  const trimmed = configured.trim();
  return trimmed || LOOPBACK;
}
|
|
50
|
+
|
|
51
|
+
const TTS_ENGINES = ['gemini', 'gpt-audio', 'supertonic', 'auto'];
|
|
52
|
+
const STT_ENGINES = ['speaches', 'mlx-whisper', 'auto'];
|
|
53
|
+
const LANGS = ['ru', 'en', 'na'];
|
|
54
|
+
|
|
55
|
+
/**
 * Read the full request body into a Buffer, enforcing the size cap.
 *
 * A request whose declared Content-Length already exceeds `limit` is rejected
 * before any of the body is read or buffered. Bodies without a usable
 * Content-Length are still counted chunk by chunk while streaming.
 *
 * @param {AsyncIterable<Buffer> & {headers?: object}} req incoming request stream
 * @param {number} [limit] maximum accepted body size in bytes
 * @returns {Promise<Buffer>}
 * @throws {Error} with `statusCode = 413` when the body exceeds `limit`
 */
async function readBody(req, limit = MAX_BODY) {
  const tooLarge = () => {
    const err = new Error('request body too large');
    err.statusCode = 413;
    return err;
  };

  // Number(undefined) is NaN, so a missing header simply skips this check.
  const declared = Number(req.headers?.['content-length']);
  if (Number.isFinite(declared) && declared > limit) throw tooLarge();

  const chunks = [];
  let size = 0;
  for await (const chunk of req) {
    size += chunk.length;
    if (size > limit) throw tooLarge();
    chunks.push(chunk);
  }
  return Buffer.concat(chunks);
}
|
|
70
|
+
|
|
71
|
+
/** Serialize `obj` as JSON and send it with the given status and a JSON content-type header. */
function sendJson(res, status, obj) {
  const headers = { 'content-type': 'application/json; charset=utf-8' };
  const payload = JSON.stringify(obj);
  res.writeHead(status, headers);
  res.end(payload);
}
|
|
76
|
+
|
|
77
|
+
/**
 * Send an error as JSON in the shape `{ error: { message, type } }`.
 * `type` defaults to 'invalid_request_error'.
 */
function sendError(res, status, message, type = 'invalid_request_error') {
  const envelope = { error: { message, type } };
  sendJson(res, status, envelope);
}
|
|
81
|
+
|
|
82
|
+
/** Return `v` trimmed when it is a string with non-whitespace content; otherwise undefined. */
function str(v) {
  if (typeof v !== 'string') return undefined;
  const trimmed = v.trim();
  return trimmed ? trimmed : undefined;
}
|
|
85
|
+
|
|
86
|
+
/**
 * POST /v1/audio/speech — text → Ogg/Opus bytes.
 *
 * Request JSON (OpenAI `audio/speech` plus extensions); the body must be a
 * JSON object:
 *   input (or text)   required — the text to speak
 *   voice             optional — voice override, passed through to the core
 *   model             optional — if it names one of our engines, selects it
 *   engine            optional — explicit engine (wins over model); default auto
 *   lang              optional — ru|en|na; any other value is ignored
 *   style             optional — delivery directive, passed through to the core
 *   response_format   not read — the response is always audio/ogg
 *
 * Always synchronous: the response body is the audio. The engine, voice and
 * fallback origin reported by the core are surfaced in X-Voice-* headers.
 *
 * Responds 400 for malformed JSON, a non-object JSON body (including `null`),
 * or missing text. Errors from reading the body or from synthesis propagate to
 * the caller.
 *
 * @param {object} req incoming request
 * @param {object} res server response
 * @param {{voice: Function}} deps `voice(opts)` synthesizes and resolves to
 *   `{ path, engine?, voice?, fallback_from? }`
 */
async function handleSpeech(req, res, { voice }) {
  const raw = await readBody(req);
  let body;
  try {
    body = raw.length ? JSON.parse(raw.toString('utf8')) : {};
  } catch {
    return sendError(res, 400, 'invalid JSON body');
  }
  // `null` is valid JSON but has no properties; reading body.input on it would
  // throw and surface as a 500. Reject every non-object payload up front.
  if (body === null || typeof body !== 'object' || Array.isArray(body)) {
    return sendError(res, 400, 'request body must be a JSON object');
  }
  const text = str(body.input) ?? str(body.text);
  if (!text) return sendError(res, 400, '`input` (the text to speak) is required');

  // engine: explicit `engine` leads; else `model` when it names one of ours; else auto.
  let engine = 'auto';
  if (TTS_ENGINES.includes(body.engine)) engine = body.engine;
  else if (TTS_ENGINES.includes(body.model)) engine = body.model;
  const voiceArg = str(body.voice);
  const lang = LANGS.includes(body.lang) ? body.lang : undefined;
  const style = str(body.style);

  // Synthesize to a throwaway temp file, send its bytes, then delete it.
  const outPath = join(tmpdir(), `peer-voice-http-${Date.now()}-${randomBytes(4).toString('hex')}.ogg`);
  try {
    const result = await voice({ text, voice: voiceArg, engine, lang, style, out_path: outPath });
    const bytes = await readFile(result.path);
    res.writeHead(200, {
      'content-type': 'audio/ogg',
      'content-length': bytes.length,
      'x-voice-format': 'opus',
      ...(result.engine ? { 'x-voice-engine': String(result.engine) } : {}),
      ...(result.voice ? { 'x-voice-voice': String(result.voice) } : {}),
      ...(result.fallback_from ? { 'x-voice-fallback-from': String(result.fallback_from) } : {}),
    });
    res.end(bytes);
  } finally {
    // Cleanup is best-effort: a failed delete must not mask the real outcome.
    await rm(outPath, { force: true }).catch(() => {});
  }
}
|
|
141
|
+
|
|
142
|
+
/**
 * POST /v1/audio/transcriptions — audio file → text.
 *
 * Request multipart/form-data (OpenAI `audio/transcriptions` plus extensions):
 *   file              required — the audio upload
 *   language          optional — passed to the core as `lang`
 *   prompt            optional — passed to the core as `prompt`
 *   engine            optional — speaches|mlx-whisper|auto (anything else → auto)
 *   response_format   optional — `text` → plain-text body; anything else → JSON { text }
 *
 * The multipart body is parsed by wrapping the raw bytes in a web `Request`
 * and calling `.formData()`. The upload is written to a temp file (keeping a
 * short alphanumeric extension from the upload's name, else `.ogg`) because
 * the core takes a path; the temp file is removed afterwards in all cases.
 *
 * @param {object} req incoming request
 * @param {object} res server response
 * @param {{transcribe: Function}} deps `transcribe(opts)` resolves to `{ text }`
 */
async function handleTranscriptions(req, res, { transcribe }) {
  const ctype = req.headers['content-type'] || '';
  const isMultipart = ctype.includes('multipart/form-data');
  if (!isMultipart) {
    return sendError(res, 400, 'expected multipart/form-data with a `file` field');
  }

  const raw = await readBody(req);
  let form;
  try {
    const init = { method: 'POST', headers: { 'content-type': ctype }, body: raw };
    form = await new Request('http://localhost/v1/audio/transcriptions', init).formData();
  } catch (err) {
    return sendError(res, 400, `could not parse multipart body: ${err && err.message ? err.message : err}`);
  }

  const file = form.get('file');
  const isUpload = Boolean(file) && typeof file !== 'string' && typeof file.arrayBuffer === 'function';
  if (!isUpload) {
    return sendError(res, 400, '`file` field (the audio upload) is required');
  }

  const lang = str(form.get('language'));
  const prompt = str(form.get('prompt'));
  const respFormat = str(form.get('response_format')) ?? 'json';
  const requestedEngine = form.get('engine');
  const engine = STT_ENGINES.includes(requestedEngine) ? requestedEngine : 'auto';

  // Keep the upload's extension (dot + 1–5 alphanumerics at the very end) so
  // the temp file name still hints at the container; default to .ogg.
  const uploadName = typeof file.name === 'string' ? file.name : '';
  const extMatch = /\.[a-z0-9]{1,5}$/i.exec(uploadName);
  const ext = extMatch ? extMatch[0] : '.ogg';
  const tmpPath = join(tmpdir(), `peer-voice-stt-${Date.now()}-${randomBytes(4).toString('hex')}${ext}`);

  try {
    const audioBytes = Buffer.from(await file.arrayBuffer());
    await writeFile(tmpPath, audioBytes);
    const result = await transcribe({ audioPath: tmpPath, lang, prompt, engine });
    if (respFormat !== 'text') {
      sendJson(res, 200, { text: result.text });
    } else {
      res.writeHead(200, { 'content-type': 'text/plain; charset=utf-8' });
      res.end(result.text);
    }
  } finally {
    await rm(tmpPath, { force: true }).catch(() => {});
  }
}
|
|
201
|
+
|
|
202
|
+
/**
 * Build the HTTP server without binding it.
 *
 * `voice`, `transcribe` and `version` are injectable; when omitted they
 * default to createVoice, transcribeAudio and readVersion().
 *
 * Routing (trailing slashes on the path are ignored):
 *   GET  /, /health, /healthz        → 200 status JSON
 *   POST /v1/audio/speech            → handleSpeech
 *   POST /v1/audio/transcriptions    → handleTranscriptions
 *   other methods on the two audio routes → 405; anything else → 404
 *
 * An error thrown while handling a request is sent as an error response using
 * `err.statusCode` when present (else 500); if headers were already sent the
 * response is simply ended.
 *
 * @returns {import('node:http').Server}
 */
export function createHttpServer({ voice, transcribe, version } = {}) {
  const voiceImpl = voice ?? createVoice;
  const transcribeImpl = transcribe ?? transcribeAudio;
  const ver = version ?? readVersion();
  const healthPaths = ['/', '/health', '/healthz'];

  const onRequest = async (req, res) => {
    try {
      const { pathname } = new URL(req.url, 'http://localhost');
      const path = pathname.replace(/\/+$/, '') || '/';

      if (req.method === 'GET' && healthPaths.includes(path)) {
        return sendJson(res, 200, { status: 'ok', service: 'voice-connect', version: ver });
      }

      switch (path) {
        case '/v1/audio/speech':
          if (req.method !== 'POST') return sendError(res, 405, 'method not allowed; use POST');
          return await handleSpeech(req, res, { voice: voiceImpl });
        case '/v1/audio/transcriptions':
          if (req.method !== 'POST') return sendError(res, 405, 'method not allowed; use POST');
          return await handleTranscriptions(req, res, { transcribe: transcribeImpl });
        default:
          return sendError(res, 404, `unknown route: ${req.method} ${path}`, 'not_found');
      }
    } catch (err) {
      const status = err && err.statusCode ? err.statusCode : 500;
      const msg = err instanceof Error ? err.message : String(err);
      if (res.headersSent) {
        res.end();
      } else {
        sendError(res, status, msg, status === 500 ? 'server_error' : 'invalid_request_error');
      }
    }
  };

  return nodeCreateServer(onRequest);
}
|
|
237
|
+
|
|
238
|
+
/**
 * Bootstrap: build the server, bind it to httpHost():httpPort(), announce the
 * address on stderr, and install SIGINT/SIGTERM handlers that close the server
 * and then exit with code 0. Rejects if binding fails.
 *
 * @returns {Promise<import('node:http').Server>} the listening server
 */
export async function main() {
  const server = createHttpServer();
  const port = httpPort();
  const host = httpHost();

  // Adapt listen()'s callback/'error' event pair to a promise.
  const listening = new Promise((resolve, reject) => {
    server.once('error', reject);
    server.listen(port, host, resolve);
  });
  await listening;

  process.stderr.write(`voice-connect http: listening on http://${host}:${port} (version ${readVersion()})\n`);

  const shutdown = () => server.close(() => process.exit(0));
  ['SIGINT', 'SIGTERM'].forEach((sig) => {
    process.on(sig, shutdown);
  });
  return server;
}
|
package/src/jobs.mjs
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Async voice jobs (Phase 7, variant B — notify-agent).
|
|
3
|
+
*
|
|
4
|
+
* Long synthesis (10–20 min of audio) takes minutes; a synchronous tool call
|
|
5
|
+
* would block the calling agent — and its billing — for that whole time. So
|
|
6
|
+
* voice_create splits by length:
|
|
7
|
+
* - short text (≤ threshold) → handled synchronously (returns {path,...}).
|
|
8
|
+
* - long text (> threshold) → dispatched here: a DETACHED worker process
|
|
9
|
+
* does the synthesis out-of-band and IAP-notifies the agent on completion,
|
|
10
|
+
* while the tool returns {job_id, status:"started"} in <1s.
|
|
11
|
+
*
|
|
12
|
+
* The worker (worker.mjs) survives the MCP handler's return (detached + unref,
|
|
13
|
+
* own session), reads the job file, runs the normal pipeline, and on done/fail
|
|
14
|
+
* sends the calling peer an IAP message. The agent then delivers the ogg itself.
|
|
15
|
+
*
|
|
16
|
+
* Job artifacts live under $PEER_VOICE_HOME/jobs/ (default ~/.iapeer/cache/peer-voice):
|
|
17
|
+
* <job_id>.json — the dispatched job (text, voice, note, personality…)
|
|
18
|
+
* <job_id>.log — the worker's stdout/stderr
|
|
19
|
+
* <job_id>.result.json — written by the worker on completion (status, path|reason)
|
|
20
|
+
*/
|
|
21
|
+
import { mkdir, writeFile } from 'node:fs/promises';
|
|
22
|
+
import { openSync, closeSync } from 'node:fs';
|
|
23
|
+
import { spawn } from 'node:child_process';
|
|
24
|
+
import { randomBytes } from 'node:crypto';
|
|
25
|
+
import { join } from 'node:path';
|
|
26
|
+
import { fileURLToPath } from 'node:url';
|
|
27
|
+
import { peerVoiceHome } from './home.mjs';
|
|
28
|
+
|
|
29
|
+
/**
 * Char count above which a request goes async. Read from
 * PEER_VOICE_ASYNC_THRESHOLD_CHARS at call time so it stays env-tunable;
 * defaults to 2000 when the variable is unset or empty.
 *
 * A value that does not parse as a number also falls back to 2000: otherwise
 * the threshold would be NaN, every `length > NaN` comparison would be false,
 * and no text would ever be treated as long.
 *
 * @returns {number}
 */
export function asyncThresholdChars() {
  const DEFAULT_THRESHOLD = 2000;
  const raw = process.env.PEER_VOICE_ASYNC_THRESHOLD_CHARS;
  if (!raw) return DEFAULT_THRESHOLD;
  const parsed = Number(raw);
  return Number.isNaN(parsed) ? DEFAULT_THRESHOLD : parsed;
}
|
|
34
|
+
|
|
35
|
+
/**
 * True when `text` is a string strictly longer than the async threshold.
 * Non-string input is never considered long.
 */
export function isLongText(text) {
  if (typeof text !== 'string') return false;
  return text.length > asyncThresholdChars();
}
|
|
39
|
+
|
|
40
|
+
/** Directory holding job artifacts: `jobs` under the peer-voice home. */
export function jobsDir() {
  const home = peerVoiceHome();
  return join(home, 'jobs');
}
|
|
43
|
+
|
|
44
|
+
/**
 * New job id: the current time in base 36, a dash, then 4 random bytes as
 * 8 hex characters.
 */
export function newJobId() {
  const stamp = Date.now().toString(36);
  const nonce = randomBytes(4).toString('hex');
  return `${stamp}-${nonce}`;
}
|
|
48
|
+
|
|
49
|
+
/**
 * Persist the job spec as pretty-printed JSON at `<jobsDir>/<job_id>.json`,
 * creating the jobs directory if needed.
 *
 * @param {{job_id: string}} job
 * @returns {Promise<string>} path of the written file
 */
export async function writeJobFile(job) {
  const dir = jobsDir();
  await mkdir(dir, { recursive: true });

  const jobFile = join(dir, `${job.job_id}.json`);
  const serialized = JSON.stringify(job, null, 2);
  await writeFile(jobFile, serialized, 'utf8');
  return jobFile;
}
|
|
57
|
+
|
|
58
|
+
/**
 * Dispatch a long synthesis to a detached background worker.
 *
 * Writes the job spec to disk, then spawns `worker.mjs` (resolved next to this
 * module) with the job file as its argument. The child is detached and
 * unref'd, with stdout/stderr appended to `<jobsDir>/<job_id>.log`.
 *
 * The call waits only until the worker process has actually been spawned. If
 * it cannot be started, the promise rejects instead of reporting "started"
 * for a job that will never run. The child always has an 'error' listener, so
 * a spawn failure cannot surface as an unhandled 'error' event in this process.
 *
 * @param {object} opts {text, voice?, engine?, lang?, style?, out_path?, note?, personality?}
 * @returns {Promise<{job_id: string, status: 'started'}>}
 * @throws {Error} when the job file cannot be written or the worker cannot be
 *   spawned (the spawn error is attached as `cause`)
 */
export async function dispatchVoiceJob(opts = {}) {
  const job_id = newJobId();
  const job = {
    job_id,
    text: opts.text,
    voice: opts.voice ?? null,
    engine: opts.engine ?? 'auto',
    lang: opts.lang ?? null,
    style: opts.style ?? null,
    out_path: opts.out_path ?? null,
    note: opts.note ?? null,
    personality: opts.personality ?? null,
    created_at: new Date().toISOString(),
  };
  const jobFile = await writeJobFile(job);

  // Detached so it outlives this handler's return: unref'd, stdio to a
  // per-job log. The worker reads jobFile.
  const logFile = join(jobsDir(), `${job_id}.log`);
  const out = openSync(logFile, 'a');
  const workerPath = fileURLToPath(new URL('./worker.mjs', import.meta.url));
  let child;
  try {
    child = spawn(process.execPath, [workerPath, jobFile], {
      detached: true,
      stdio: ['ignore', out, out],
      env: process.env,
    });
  } finally {
    closeSync(out); // the child holds its own dup of the fd
  }

  // 'spawn' fires once the process exists; 'error' fires if it could not be
  // created. The error listener stays attached for the child's lifetime
  // (a reject after resolve is a no-op).
  const started = new Promise((resolve, reject) => {
    child.once('spawn', resolve);
    child.on('error', (err) => {
      const reason = err && err.message ? err.message : String(err);
      process.stderr.write(`[peer-voice/jobs] worker error for job ${job_id}: ${reason}\n`);
      reject(new Error(`could not start voice worker for job ${job_id}`, { cause: err }));
    });
  });
  child.unref();
  await started;

  return { job_id, status: 'started' };
}
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Language helpers.
|
|
3
|
+
*
|
|
4
|
+
* `detectDominantLang` — plain text is ALWAYS one pass, so a monolingual call
|
|
5
|
+
* needs the single dominant language. Cyrillic vs Latin character count; ru on a
|
|
6
|
+
* tie or when neither is present (the team default).
|
|
7
|
+
*
|
|
8
|
+
* `classifyLangMode` / `supertonicLang` — language-aware fallback-engine routing
|
|
9
|
+
* (Artur 2026-06-01). The fallback engine (when Gemini is down) is picked by the
|
|
10
|
+
* text's language mix, not just its dominant script. See classifyLangMode.
|
|
11
|
+
*
|
|
12
|
+
* Phase 4 (reserved, not wired into the tool yet): `splitByLang` — when an
|
|
13
|
+
* explicit `||` change-of-language marker is present, text is cut by the marker
|
|
14
|
+
* and each chunk synthesized in its own language; the join produces a natural
|
|
15
|
+
* pause at the seam. Kept here for that phase; Phase 1 flattens `||` away.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
const CYR = /[Ѐ-ӿ]/;
|
|
19
|
+
const LAT = /[A-Za-z]/;
|
|
20
|
+
const CYR_G = /[Ѐ-ӿ]/g;
|
|
21
|
+
const LAT_G = /[A-Za-z]/g;
|
|
22
|
+
|
|
23
|
+
// A language is "dominant" only when it holds at least this share of the letters
|
|
24
|
+
// (Artur's ~80% guideline). Below it, neither side overwhelms → the text is a
|
|
25
|
+
// genuine blend. A monolingual --lang would mangle the minority script, so a
|
|
26
|
+
// blend goes to Supertonic's native multilingual ('na') pass instead. The same
|
|
27
|
+
// 0.8 cutoff guards the F5 (Russian) branch: F5's Russian is far better, and it
|
|
28
|
+
// reads a ≤20% English minority intelligibly, so ru ≥0.8 is worth F5; below that
|
|
29
|
+
// we don't risk F5's weak English. Env-tunable for live calibration.
|
|
30
|
+
const DOMINANT_SHARE = Number(process.env.PEER_VOICE_DOMINANT_SHARE || '0.8');
|
|
31
|
+
|
|
32
|
+
/**
 * Dominant language of the whole text, by counting Cyrillic vs Latin letters.
 *
 * Non-string input is coerced rather than crashing: `null`/`undefined` count
 * as empty text (so `fallback` is returned); other values are stringified.
 *
 * @param {string} text
 * @param {'ru'|'en'} [fallback] returned when neither script appears or on a tie
 * @returns {'ru'|'en'|string} 'ru', 'en', or `fallback`
 */
export function detectDominantLang(text, fallback = 'ru') {
  const s = typeof text === 'string' ? text : String(text ?? '');
  const cyr = (s.match(CYR_G) || []).length;
  const lat = (s.match(LAT_G) || []).length;
  // Equal counts cover both a genuine tie and the "no letters at all" case.
  if (cyr === lat) return fallback;
  return cyr > lat ? 'ru' : 'en';
}
|
|
45
|
+
|
|
46
|
+
/**
 * Three-way language mode of a text, by letter share:
 *   'ru'    — Cyrillic letters make up at least DOMINANT_SHARE of all
 *             Cyrillic+Latin letters, or the text has no such letters at all.
 *   'en'    — the Cyrillic share is at most 1 − DOMINANT_SHARE.
 *   'mixed' — anything in between.
 * Only Cyrillic and Latin letters are counted; digits, punctuation and
 * whitespace are ignored. The input is stringified first.
 *
 * @param {string} text
 * @returns {'ru'|'en'|'mixed'}
 */
export function classifyLangMode(text) {
  const s = String(text);
  const countOf = (pattern) => (s.match(pattern) || []).length;
  const cyr = countOf(CYR_G);
  const lat = countOf(LAT_G);
  const letters = cyr + lat;

  if (letters === 0) return 'ru'; // no letters → default language

  const ruShare = cyr / letters;
  if (ruShare >= DOMINANT_SHARE) return 'ru';
  return ruShare <= 1 - DOMINANT_SHARE ? 'en' : 'mixed';
}
|
|
70
|
+
|
|
71
|
+
/**
 * Supertonic --lang code for a text: the classifyLangMode result, with
 * 'mixed' mapped to 'na'.
 *
 * @param {string} text
 * @returns {'ru'|'en'|'na'}
 */
export function supertonicLang(text) {
  const mode = classifyLangMode(text);
  if (mode === 'mixed') return 'na';
  return mode;
}
|
|
81
|
+
|
|
82
|
+
/**
 * True iff the stringified input contains at least one Cyrillic character.
 */
export function hasCyrillic(text) {
  const s = String(text);
  return s.search(CYR) !== -1;
}
|
|
87
|
+
|
|
88
|
+
/**
 * Language of a single token: 'ru' if it contains any Cyrillic letter, else
 * 'en' if it contains any Latin letter, else null (neutral token).
 */
function classify(token) {
  const hasRu = CYR.test(token);
  if (hasRu) return 'ru';
  return LAT.test(token) ? 'en' : null;
}
|
|
93
|
+
|
|
94
|
+
/**
 * Split mixed text into consecutive single-language runs (Cyrillic → ru,
 * Latin → en).
 *
 * The text is tokenized into alternating non-whitespace / whitespace chunks.
 * A neutral token (no Cyrillic or Latin letters) is appended to the run in
 * progress; neutral tokens seen before the first run are prepended to that
 * first run. Text with no language-bearing token at all becomes a single run
 * in `fallbackLang`. Each run's text is trimmed and empty runs are dropped.
 *
 * @param {string} text
 * @param {'ru'|'en'} [fallbackLang] language for neutral-only text
 * @returns {Array<{text: string, lang: 'ru'|'en'}>}
 */
export function splitByLang(text, fallbackLang = 'ru') {
  const collected = [];
  let current = null;
  let leadingNeutral = '';

  for (const tok of text.match(/\S+|\s+/g) || []) {
    const lang = classify(tok);
    const startsNewRun = lang !== null && (current === null || current.lang !== lang);

    if (startsNewRun) {
      if (current !== null) collected.push(current);
      current = { text: leadingNeutral + tok, lang };
      leadingNeutral = '';
    } else if (current !== null) {
      // neutral token, or another token in the run's own language
      current.text += tok;
    } else {
      // neutral token before any run has started
      leadingNeutral += tok;
    }
  }

  if (current !== null) {
    collected.push(current);
  } else if (leadingNeutral.trim()) {
    collected.push({ text: leadingNeutral, lang: fallbackLang });
  }

  const result = [];
  for (const run of collected) {
    const trimmed = run.text.trim();
    if (trimmed.length > 0) result.push({ text: trimmed, lang: run.lang });
  }
  return result;
}
|