npm - @agfpd/voice-connect - Versions diffs - 0.1.11 - Mend

@agfpd/voice-connect 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

package/LICENSE +201 -0
package/README.md +72 -0
package/bin/peer-voice-http.mjs +20 -0
package/bin/peer-voice-mcp.mjs +18 -0
package/package.json +61 -0
package/src/apikey.mjs +61 -0
package/src/audio.mjs +112 -0
package/src/config.mjs +75 -0
package/src/configfile.mjs +38 -0
package/src/engines/f5.mjs +111 -0
package/src/engines/gemini.mjs +199 -0
package/src/engines/gptaudio.mjs +230 -0
package/src/engines/mlxwhisper.mjs +70 -0
package/src/engines/speaches.mjs +69 -0
package/src/engines/supertonic.mjs +177 -0
package/src/home.mjs +15 -0
package/src/http.mjs +252 -0
package/src/jobs.mjs +95 -0
package/src/langsplit.mjs +129 -0
package/src/profile.mjs +165 -0
package/src/providers.mjs +210 -0
package/src/ref.mjs +157 -0
package/src/router.mjs +91 -0
package/src/ruaccent.mjs +114 -0
package/src/ruaccent_stress.py +66 -0
package/src/server.mjs +278 -0
package/src/stress.mjs +25 -0
package/src/stt.mjs +48 -0
package/src/synthlog.mjs +46 -0
package/src/voice.mjs +201 -0
package/src/worker.mjs +120 -0

package/src/ruaccent.mjs ADDED Viewed

@@ -0,0 +1,114 @@
+/**
+ * Shared Russian-stress accentuation via ruaccent.
+ *
+ * ruaccent (an ML model) marks the stressed vowel with a '+' immediately BEFORE
+ * it — verified live 2026-06-01 ("Сл+ожная с+интеза. З+амок ..."). This module
+ * produces that raw '+'-marked text; the two TTS consumers differ in what they
+ * do with it:
+ *   - F5-TTS understands the '+' marker NATIVELY → uses accentPlus() output as-is.
+ *   - Supertonic mis-reads a raw '+' as a sound → maps '+<vowel>' → '<vowel>' +
+ *     U+0301 on top of accentPlus() (see stress.mjs).
+ * Gemini stresses Russian correctly on its own and uses neither.
+ *
+ * ruaccent is pip-installed into the managed Supertonic venv, so the python is
+ * resolved from there (or an explicit override / caller hint). Best-effort: any
+ * failure returns the text unchanged — stress is an enhancement, never a hard
+ * dependency for synthesis.
+ */
+import { execFile } from 'node:child_process';
+import { promisify } from 'node:util';
+import { join, dirname } from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { access } from 'node:fs/promises';
+import { constants as FS } from 'node:fs';
+import { peerVoiceHome } from './home.mjs';
+/** execFile wrapped by util.promisify, so a child process run can be awaited. */
+const pexecFile = promisify(execFile);
+/**
+ * The python CLI helper that actually runs ruaccent (shared by both engines).
+ * Resolved next to this module file, so it does not depend on the process cwd.
+ */
+const HELPER = join(dirname(fileURLToPath(import.meta.url)), 'ruaccent_stress.py');
+/** Canonical home of ruaccent: the managed Supertonic venv (pip installs it there). */
+function managedVenvPython() {
+  return join(peerVoiceHome(), 'supertonic-venv', 'bin', 'python');
+}
+async function exists(p) {
+  try { await access(p, FS.X_OK); return true; } catch { return false; }
+}
+function log(msg) { process.stderr.write(`[peer-voice/ruaccent] ${msg}\n`); }
+async function run(bin, args) {
+  return pexecFile(bin, args, { maxBuffer: 64 * 1024 * 1024 });
+}
+let checked = false;
+let resolvedPython = null;
+/**
+ * Resolve a python that can `import ruaccent` (installing it once if a sibling
+ * pip is present). Resolution order:
+ *   1. PEER_VOICE_RUACCENT_PYTHON  (explicit override)
+ *   2. caller `hint`               (e.g. supertonic's resolved-CLI sibling python)
+ *   3. managed venv python         ($PEER_VOICE_HOME/supertonic-venv/bin/python,
+ *                                   default ~/.iapeer/cache/peer-voice)
+ * Returns null — accentuation is skipped gracefully — when disabled, when no
+ * candidate python exists, or when install fails. Memoized after the first call,
+ * so `hint` only matters on that first resolution (env override covers the rest).
+ * @param {string} [hint] preferred python path, tried before the managed venv
+ * @returns {Promise<string|null>}
+ */
+export async function ensureRuaccentPython(hint) {
+  if (process.env.PEER_VOICE_RUACCENT === '0') return null; // operator opt-out
+  if (checked) return resolvedPython;
+  checked = true;
+  const candidates = [
+    process.env.PEER_VOICE_RUACCENT_PYTHON,
+    hint,
+    managedVenvPython(),
+  ].filter(Boolean);
+  for (const py of candidates) {
+    if (!(await exists(py))) continue;
+    try {
+      await run(py, ['-c', 'import ruaccent']);
+      resolvedPython = py;
+      return py;
+    } catch { /* python present but ruaccent missing — try to install below */ }
+    const pip = join(dirname(py), 'pip');
+    if (!(await exists(pip))) continue;
+    log('pip install ruaccent (one-time; transformers+onnxruntime, no torch)…');
+    try {
+      await run(pip, ['install', '--quiet', 'ruaccent']);
+      await run(py, ['-c', 'import ruaccent']);
+      resolvedPython = py;
+      return py;
+    } catch (e) {
+      log(`ruaccent install failed (${e.message}); trying next candidate.`);
+    }
+  }
+  log('no venv python with ruaccent found; Russian stress disabled (synthesis continues).');
+  return null;
+}
+/**
+ * Russian text with ruaccent '+' stress markers, or the original text on any
+ * failure. Does NOT map to U+0301 — that is Supertonic's concern (stress.mjs);
+ * F5 reads the '+' natively.
+ * @param {string} text
+ * @param {string} [hint] preferred python path (see ensureRuaccentPython)
+ * @returns {Promise<string>}
+ */
+export async function accentPlus(text, hint) {
+  const py = await ensureRuaccentPython(hint);
+  if (!py) return text;
+  try {
+    const { stdout } = await run(py, [HELPER, text]);
+    const marked = (stdout != null ? String(stdout) : '').trim();
+    return marked || text;
+  } catch (e) {
+    log(`ruaccent processing failed (${e.message}); using unaccented text.`);
+    return text;
+  }
+}

package/src/ruaccent_stress.py ADDED Viewed

@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+"""Russian stress marker for the Peer-Voice Supertonic branch.
+Reads UTF-8 text (argv[1], else stdin), runs ruaccent to place stress, and
+writes the '+'-marked text to stdout — '+' immediately before each stressed
+vowel (ruaccent's native format). The JS side maps '+<vowel>' -> '<vowel>'+
+U+0301 (combining acute): Supertonic mis-reads the raw '+' as a sound but
+honors the combining accent. Verified live 2026-06-01 (Artur confirmed by ear).
+Only the accented text goes to stdout — model-load/import chatter is redirected
+to stderr so the caller can read stdout cleanly.
+"""
+import sys
+import contextlib
+def _align_token_type_ids(accentizer):
+    """Re-add 'token_type_ids' to each ruaccent sub-model tokenizer that needs it.
+    transformers 5.x dropped 'token_type_ids' from the default
+    PreTrainedTokenizer.model_input_names (4.x had it). ruaccent's *accent*
+    model is a BERT-architecture ONNX graph (exported under transformers 4.29)
+    whose feed REQUIRES token_type_ids, but its CharTokenizer relies on that
+    base-class default and so stopped emitting them — onnxruntime then rejects
+    the run with "Required inputs (['token_type_ids']) are missing", ruaccent
+    raises, and the caller falls back to UNACCENTED text. This only bites words
+    absent from the dictionary (the neural path); all-dictionary text masks it.
+    Fix: for any sub-model whose ONNX graph requires token_type_ids, add it back
+    to that tokenizer's model_input_names. The tokenizer then fills them with
+    zeros (single sequence) — exactly what the 4.29-era export expects.
+    Defensive throughout: a probe failure must never block synthesis.
+    """
+    submodels = ('accent_model', 'omograph_model',
+                 'yo_homograph_model', 'stress_usage_predictor')
+    for name in submodels:
+        model = getattr(accentizer, name, None)
+        session = getattr(model, 'session', None)
+        tokenizer = getattr(model, 'tokenizer', None)
+        if session is None or tokenizer is None:
+            continue
+        try:
+            required = {i.name for i in session.get_inputs()}
+            names = list(tokenizer.model_input_names)
+            if 'token_type_ids' in required and 'token_type_ids' not in names:
+                tokenizer.model_input_names = names + ['token_type_ids']
+        except Exception:
+            continue
+def main():
+    text = sys.argv[1] if len(sys.argv) > 1 else sys.stdin.read()
+    if not text.strip():
+        sys.stdout.write(text)
+        return
+    with contextlib.redirect_stdout(sys.stderr):
+        from ruaccent import RUAccent
+        accentizer = RUAccent()
+        accentizer.load(omograph_model_size='turbo3.1', use_dictionary=True)
+        _align_token_type_ids(accentizer)
+        result = accentizer.process_all(text)
+    sys.stdout.write(result)
+if __name__ == '__main__':
+    main()

package/src/server.mjs ADDED Viewed

@@ -0,0 +1,278 @@
+/**
+ * voice-connect MCP server. Exposes the voice tools any peer can call:
+ *   - `tts`  — turn text into a ready-to-send .ogg/opus voice file.
+ *   - `stt`  — transcribe an audio file to text.
+ *   - `voice_create` — DEPRECATED alias for `tts` (one release, then removed).
+ *
+ * Engine routing with fallback lives in voice.mjs (tts) and stt.mjs (stt); this
+ * file is just the MCP wiring. Like spawned-peer's server, it separates the pure
+ * factory (createServer) from the side-effecting bootstrap (main), so tests can
+ * import createServer without touching stdio.
+ */
+import { Server } from '@modelcontextprotocol/sdk/server/index.js';
+import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
+import {
+  ListToolsRequestSchema,
+  CallToolRequestSchema,
+} from '@modelcontextprotocol/sdk/types.js';
+import { readFileSync } from 'node:fs';
+import { fileURLToPath } from 'node:url';
+import { dirname, join } from 'node:path';
+import { createVoice } from './voice.mjs';
+import { transcribe as transcribeAudio } from './stt.mjs';
+import { isLongText, dispatchVoiceJob } from './jobs.mjs';
+import { callerPersonality } from './profile.mjs';
+const here = dirname(fileURLToPath(import.meta.url));
+export function readVersion() {
+  try {
+    const pkg = JSON.parse(readFileSync(join(here, '..', 'package.json'), 'utf8'));
+    return pkg.version ?? '0.0.0';
+  } catch {
+    return '0.0.0';
+  }
+}
+// Shared voice (TTS) tool surface — used verbatim by `tts` and its deprecated
+// `voice_create` alias, so the two never drift.
+// This text is what a calling agent reads as the tool description: it covers the
+// sync/async split by text length, the IAP completion message format, and the
+// engine fallback order.
+const VOICE_DESCRIPTION =
+  `Synthesize speech from text into a ready-to-send .ogg/opus voice file. ` +
+  `Give text, get an ogg — one generation pass, smooth speech, no pauses.\n\n` +
+  `SYNC vs ASYNC (automatic, by length):\n` +
+  `• SHORT text → returns { path, engine, voice, lang?, probe, fallback_from? } ` +
+  `synchronously. Attach it: send_to_peer(personality, attachments=[path]).\n` +
+  `• LONG text → returns { job_id, status:"started" } in <1s — you are NOT ` +
+  `blocked for the (minutes-long) synthesis. A background worker finishes it ` +
+  `and sends YOU an IAP message: "voice job <id> done path=<path> note=<note>" ` +
+  `on success, or "voice job <id> failed reason=<...>" on error. When that ` +
+  `arrives, deliver the ogg yourself: send_to_peer(<recipient>, attachments=[path]). ` +
+  `Use the note arg to remind your future self who to send it to / a caption.\n\n` +
+  `Delivery is never automatic — the tool only produces the file (or the job).\n\n` +
+  `Engine routing (automatic, both modes): Gemini 3.1 Flash TTS primary (mixed ` +
+  `ru+en in one pass, natural prosody); gpt-audio over OpenRouter second (cloud ` +
+  `quality when the Gemini key is exhausted); F5-TTS third (live-prosody, per-peer ` +
+  `voice); Supertonic 3 local floor (offline). Transparent — you normally only ` +
+  `pass text (and optionally voice / style / note).`;
+/**
+ * JSON Schema for the arguments of `tts` (and its `voice_create` alias). Only
+ * `text` is required; unknown properties are rejected via
+ * `additionalProperties: false`.
+ */
+const VOICE_INPUT_SCHEMA = {
+  type: 'object',
+  properties: {
+    text: {
+      type: 'string',
+      description:
+        `Text to speak. Mixed ru+en is fine — Gemini detects language and ` +
+        `reads it in one smooth pass. Long text is synthesized asynchronously ` +
+        `(you get a job_id and an IAP "done" message later).`,
+      minLength: 1,
+    },
+    voice: {
+      type: 'string',
+      description:
+        `Gemini prebuilt voice. Default "Aoede" (chosen for the natalya tutor ` +
+        `personality). Other good female voices: Kore, Leda, Zephyr.`,
+    },
+    lang: {
+      type: 'string',
+      enum: ['ru', 'en', 'na'],
+      description:
+        `Optional language hint for the FALLBACK engine (Gemini stays primary ` +
+        `and reads any language itself — this only matters if Gemini is down). ` +
+        `You know your text, so set it: "ru" Russian, including Russian-dominant ` +
+        `with a few English words (F5 voices them intelligibly); "en" English; ` +
+        `"na" a balanced ru+en mix (Supertonic reads both in one native pass). ` +
+        `Omit to auto-detect by character share.`,
+    },
+    note: {
+      type: 'string',
+      description:
+        `Optional reminder for async (long) jobs, echoed back verbatim in the ` +
+        `IAP "done" message — e.g. who to deliver the ogg to or a caption. ` +
+        `Ignored for short (synchronous) text, which returns the path directly.`,
+    },
+    style: {
+      type: 'string',
+      description:
+        `Optional delivery directive — HOW to voice the text (tone, emotion, ` +
+        `tempo, accent), separate from WHAT to say. Applies to the cloud engines: ` +
+        `gpt-audio (folded into its prompt) and Gemini (natural-language style ` +
+        `prefix). The local fallbacks (F5/Supertonic) have no style layer and ` +
+        `ignore it gracefully. E.g. "спокойно и тепло", "медленно, шёпотом", ` +
+        `"as an excited sports announcer".`,
+    },
+    // The enum offers no way to force F5; it is only reachable through "auto".
+    engine: {
+      type: 'string',
+      enum: ['auto', 'gemini', 'gpt-audio', 'supertonic'],
+      description:
+        `Advanced/testing. "auto" (default): Gemini direct → gpt-audio (OpenRouter) ` +
+        `→ F5 → Supertonic, falling on quota/no-key/unavailable. "gemini", ` +
+        `"gpt-audio" or "supertonic" forces that engine.`,
+    },
+    out_path: {
+      type: 'string',
+      description:
+        `Advanced. Absolute output .ogg path. Default: a unique file under ~/.iapeer/cache/peer-voice/out/.`,
+    },
+  },
+  required: ['text'],
+  additionalProperties: false,
+};
+/**
+ * Tool annotations advertised for `tts` / `voice_create`: flagged as not
+ * read-only, not destructive, not idempotent, and open-world.
+ */
+const VOICE_ANNOTATIONS = {
+  title: 'Create a voice file from text',
+  readOnlyHint: false,
+  destructiveHint: false,
+  idempotentHint: false,
+  openWorldHint: true,
+};
+/**
+ * JSON Schema for the arguments of `stt`. Only `audio_path` is required;
+ * unknown properties are rejected via `additionalProperties: false`.
+ */
+const STT_INPUT_SCHEMA = {
+  type: 'object',
+  properties: {
+    audio_path: {
+      type: 'string',
+      description:
+        `Absolute path to the audio file to transcribe (e.g. a received voice ` +
+        `.ogg/.wav/.mp3). The transcript is returned as text.`,
+      minLength: 1,
+    },
+    lang: {
+      type: 'string',
+      description:
+        `Optional language hint (ISO-639-1: "en", "ru", …). Omit to auto-detect.`,
+    },
+    prompt: {
+      type: 'string',
+      description:
+        `Optional decoder-priming prompt — biases spelling/casing of terms ` +
+        `(e.g. keep "Claude Code"/"Gemini" in Latin). Not part of the output.`,
+    },
+    engine: {
+      type: 'string',
+      enum: ['auto', 'speaches', 'mlx-whisper'],
+      description:
+        `Advanced/testing. "auto" (default): speaches (when an endpoint is ` +
+        `configured) → mlx-whisper local floor. Force one with "speaches" or ` +
+        `"mlx-whisper".`,
+    },
+  },
+  required: ['audio_path'],
+  additionalProperties: false,
+};
+export function createServer({ version, voice, dispatch, transcribe } = {}) {
+  const mcp = new Server(
+    { name: 'voice-connect', version: version ?? readVersion() },
+    {
+      capabilities: { tools: {} },
+      instructions:
+        `voice-connect — voice for agents. Two tools over one core:\n` +
+        `• tts(text) — synthesize an .ogg/opus voice file. SHORT text → returns ` +
+        `{ path, ... } synchronously; LONG text → returns { job_id, status:"started" } ` +
+        `and a background worker IAP-messages you "voice job <id> done path=<path> ` +
+        `note=<note>" when ready. The tool does NOT deliver — attach the path ` +
+        `yourself: send_to_peer(personality, attachments=[path]).\n` +
+        `• stt(audio_path) — transcribe an audio file to text.\n` +
+        `(voice_create is a DEPRECATED alias for tts.)\n\n` +
+        `Routing is automatic with fallback inside each tool — TTS: Gemini → ` +
+        `gpt-audio (OpenRouter) → F5 → Supertonic local floor. STT: speaches ` +
+        `(when configured) → mlx-whisper local floor.`,
+    },
+  );
+  mcp.setRequestHandler(ListToolsRequestSchema, async () => ({
+    tools: [
+      { name: 'tts', description: VOICE_DESCRIPTION, inputSchema: VOICE_INPUT_SCHEMA, annotations: VOICE_ANNOTATIONS },
+      {
+        name: 'voice_create',
+        description: `DEPRECATED — alias for \`tts\`, kept one release for compatibility; use \`tts\`.\n\n${VOICE_DESCRIPTION}`,
+        inputSchema: VOICE_INPUT_SCHEMA,
+        annotations: VOICE_ANNOTATIONS,
+      },
+      {
+        name: 'stt',
+        description:
+          `Transcribe an audio file to text. Give a path to an audio file (e.g. a ` +
+          `received voice .ogg), get back the transcript. Engine cascade inside the ` +
+          `tool: speaches (OpenAI-compatible, when PEER_VOICE_STT_ENDPOINT is set) → ` +
+          `mlx-whisper local floor (offline). Returns { text, engine, fallback_from? }.`,
+        inputSchema: STT_INPUT_SCHEMA,
+        annotations: {
+          title: 'Transcribe audio to text',
+          readOnlyHint: true,
+          destructiveHint: false,
+          idempotentHint: true,
+          openWorldHint: true,
+        },
+      },
+    ],
+  }));
+  const voiceImpl = voice ?? createVoice;
+  const dispatchImpl = dispatch ?? dispatchVoiceJob;
+  const transcribeImpl = transcribe ?? transcribeAudio;
+  async function handleVoice(raw) {
+    const text = typeof raw.text === 'string' ? raw.text : undefined;
+    const voiceArg = typeof raw.voice === 'string' ? raw.voice : undefined;
+    const engine = ['gemini', 'gpt-audio', 'supertonic', 'auto'].includes(raw.engine) ? raw.engine : 'auto';
+    const lang = ['ru', 'en', 'na'].includes(raw.lang) ? raw.lang : undefined;
+    const style = typeof raw.style === 'string' ? raw.style : undefined;
+    const out_path = typeof raw.out_path === 'string' ? raw.out_path : undefined;
+    const note = typeof raw.note === 'string' ? raw.note : undefined;
+    // Long text → async: dispatch a detached worker and return {job_id} now, so
+    // the agent isn't blocked for the minutes-long synthesis. Short → sync.
+    if (isLongText(text)) {
+      const job = await dispatchImpl({
+        text, voice: voiceArg, engine, lang, style, out_path, note,
+        // env → cwd-profile ladder: codex MCP children get no parent env, so
+        // the IAP-notify target must be recoverable from the cwd profile.
+        personality: callerPersonality(),
+      });
+      return { content: [{ type: 'text', text: JSON.stringify(job, null, 2) }], structuredContent: job };
+    }
+    const result = await voiceImpl({ text, voice: voiceArg, engine, lang, style, out_path });
+    return { content: [{ type: 'text', text: JSON.stringify(result, null, 2) }], structuredContent: result };
+  }
+  async function handleStt(raw) {
+    const audioPath = typeof raw.audio_path === 'string' ? raw.audio_path : undefined;
+    const engine = ['speaches', 'mlx-whisper', 'auto'].includes(raw.engine) ? raw.engine : 'auto';
+    const lang = typeof raw.lang === 'string' ? raw.lang : undefined;
+    const prompt = typeof raw.prompt === 'string' ? raw.prompt : undefined;
+    const result = await transcribeImpl({ audioPath, engine, lang, prompt });
+    return { content: [{ type: 'text', text: JSON.stringify(result, null, 2) }], structuredContent: result };
+  }
+  mcp.setRequestHandler(CallToolRequestSchema, async req => {
+    const name = req.params.name;
+    const raw = req.params.arguments ?? {};
+    try {
+      if (name === 'tts' || name === 'voice_create') return await handleVoice(raw);
+      if (name === 'stt') return await handleStt(raw);
+      return { isError: true, content: [{ type: 'text', text: `unknown tool: ${name}` }] };
+    } catch (err) {
+      const msg = err instanceof Error ? err.message : String(err);
+      return { isError: true, content: [{ type: 'text', text: `${name} failed: ${msg}` }] };
+    }
+  });
+  return mcp;
+}
+export async function main() {
+  const version = readVersion();
+  const mcp = createServer({ version });
+  await mcp.connect(new StdioServerTransport());
+  process.stderr.write(`voice-connect: started (version ${version})\n`);
+  // Self-reap if orphaned (parent died / stdin closed) — same guard as spawned-peer.
+  const bootPpid = process.ppid;
+  setInterval(() => {
+    const orphaned =
+      (process.platform !== 'win32' && process.ppid !== bootPpid) ||
+      process.stdin.destroyed ||
+      process.stdin.readableEnded;
+    if (orphaned) process.exit(0);
+  }, 5000).unref();
+}

package/src/stress.mjs ADDED Viewed

@@ -0,0 +1,25 @@
+/**
+ * Russian stress mapping for the Supertonic branch.
+ *
+ * ruaccent emits '+' immediately BEFORE each stressed vowel — verified live
+ * 2026-06-01 against real ruaccent output ("Сл+ожная с+интеза. З+амок ...
+ * +орган т+ела." — every '+' precedes the stressed vowel). Supertonic mis-reads
+ * the raw '+' as a sound (garbles output), but honors a U+0301 combining acute.
+ * The combining acute attaches to the PRECEDING base character, so to accent the
+ * vowel that follows the '+', we drop the '+' and append U+0301 after that vowel:
+ * '+<vowel>' -> '<vowel>' + U+0301. Artur confirmed the U+0301 result by ear.
+ * Gemini does not need this (it stresses Russian correctly on its own).
+ *
+ * The map is intentionally narrow: only a '+' immediately followed by a Russian
+ * vowel is rewritten, so a stray '+' in source text ("C++", "2+2") is left
+ * untouched, and Latin/digits are never affected.
+ */
+const STRESS_VOWELS = 'аеёиоуыэюяАЕЁИОУЫЭЮЯ';
+const COMBINING_ACUTE = '́';
+const STRESS_RE = new RegExp(`\\+([${STRESS_VOWELS}])`, 'g');
+/** @param {string} text ruaccent '+'-marked text @returns {string} */
+export function mapStressToUnicode(text) {
+  if (typeof text !== 'string' || !text) return text;
+  return text.replace(STRESS_RE, (_, vowel) => vowel + COMBINING_ACUTE);
+}

package/src/stt.mjs ADDED Viewed

@@ -0,0 +1,48 @@
+/**
+ * stt — transcribe audio to text. The STT half of the core, symmetric with
+ * voice.mjs (tts): one call, an engine cascade with fallback INSIDE the tool.
+ *
+ *   auto: speaches (when an endpoint is configured) → mlx-whisper local floor.
+ *   forced: a single named engine, whose failure propagates to the caller.
+ *
+ * Audio in (a file path), text out. No delivery, no encoding — the caller (MCP
+ * stt tool in Ф4, HTTP /v1/audio/transcriptions in Ф5) owns I/O.
+ */
+import { runCascade } from './router.mjs';
+import { sttProviderByEngine, buildSttCascade } from './providers.mjs';
+/**
+ * @param {object} opts
+ * @param {string}  opts.audioPath        path to the audio file to transcribe
+ * @param {'auto'|'speaches'|'mlx-whisper'} [opts.engine] default 'auto'
+ * @param {string}  [opts.lang]           language hint (ISO-639-1: 'en', 'ru', …)
+ * @param {string}  [opts.prompt]         decoder-priming prompt (term spelling/casing)
+ * @returns {Promise<{text, engine, fallback_from?}>}
+ */
+export async function transcribe(opts = {}) {
+  const audioPath = opts.audioPath;
+  if (!audioPath || typeof audioPath !== 'string') {
+    throw new Error('stt: `audioPath` is required.');
+  }
+  const engine = ['speaches', 'mlx-whisper', 'auto'].includes(opts.engine) ? opts.engine : 'auto';
+  const lang = typeof opts.lang === 'string' && opts.lang.trim() ? opts.lang.trim() : undefined;
+  const prompt = typeof opts.prompt === 'string' && opts.prompt.trim() ? opts.prompt.trim() : undefined;
+  const ctx = { audioPath, lang, prompt };
+  if (engine === 'auto') {
+    const result = await runCascade(buildSttCascade(), ctx, {
+      onAdvance: (name, err) =>
+        process.stderr.write(`[peer-voice] STT ${name} unavailable (${err.message}); advancing cascade.\n`),
+    });
+    return {
+      text: result.text,
+      engine: result.name,
+      ...(result.fallbackFrom ? { fallback_from: result.fallbackFrom } : {}),
+    };
+  }
+  // Forced single engine — no cascade; its failure propagates to the caller.
+  const provider = sttProviderByEngine(engine);
+  const { text } = await provider.transcribe(ctx);
+  return { text, engine: provider.name };
+}

package/src/synthlog.mjs ADDED Viewed

@@ -0,0 +1,46 @@
+/**
+ * Structured synthesis log — one JSON line per createVoice() call.
+ *
+ * Why: the async worker logs richly to its per-job <job_id>.log, but the
+ * SYNCHRONOUS path (short text, ≤ async threshold) used to leave no trace at
+ * all — when Artur reported a "truncated Natalya voice-note", the engine and
+ * generation params had to be reconstructed from the author peer after the
+ * fact. This closes that gap on the right layer: createVoice (the single
+ * synthesis entry point for BOTH sync and async) appends a structured record
+ * here for every generation, success or failure.
+ *
+ * Sink: $PEER_VOICE_HOME/voice.log (default ~/.iapeer/cache/peer-voice/voice.log),
+ * append-only JSON Lines. Find the record for a specific call by the output
+ * `path` it returned, e.g.:
+ *   grep '"<that .ogg path>"' ~/.iapeer/cache/peer-voice/voice.log
+ *
+ * Each record carries: ts, ok, engine, chars (text length), voice, lang,
+ * duration (final audio seconds, from ffprobe), fallback_from, and — for Gemini
+ * — finishReason. Best-effort: a logging failure is swallowed and never affects
+ * the returned audio (same posture as saveRef).
+ */
+import { appendFile, mkdir } from 'node:fs/promises';
+import { dirname, join } from 'node:path';
+import { peerVoiceHome } from './home.mjs';
+/** Absolute path of the JSON-Lines synthesis log. Env-tunable via PEER_VOICE_HOME. */
+export function synthLogPath() {
+  return join(peerVoiceHome(), 'voice.log');
+}
+/**
+ * Append one synthesis record as a JSON line. Best-effort, never throws.
+ * @param {object} record fields to log (ts is added if absent)
+ * @param {string} [nowIso] ISO timestamp (injectable for tests)
+ * @returns {Promise<void>}
+ */
+export async function logSynthesis(record, nowIso) {
+  const line = JSON.stringify({ ts: nowIso ?? new Date().toISOString(), ...record }) + '\n';
+  const file = synthLogPath();
+  try {
+    await mkdir(dirname(file), { recursive: true });
+    await appendFile(file, line, 'utf8');
+  } catch {
+    // Observability must never break synthesis — drop the line silently.
+  }
+}