@glissade/narrate 0.5.0-pre.0 → 0.5.0-pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -111,23 +111,29 @@ declare function mapAsrToScript(timed: {
111
111
  start: number;
112
112
  end: number;
113
113
  }[];
114
- interface WavMono {
115
- /** mono samples in [-1, 1] */
116
- samples: Float32Array;
117
- sampleRate: number;
114
+ /** one word from vosk-align's JSON output */
115
+ interface VoskAlignWord {
116
+ word: string;
117
+ start: number;
118
+ end: number;
119
+ conf?: number;
118
120
  }
119
- /** Decode a 16-bit PCM RIFF/WAV to mono float samples (channels averaged). */
120
- declare function decodeWavMono(wav: Buffer): WavMono;
121
- /** Linear-resample mono float to a 16 kHz int16 LE PCM buffer (Vosk's input). */
122
- declare function resampleTo16kPcm(input: WavMono): Buffer;
123
121
  /**
124
- * Word timings via Vosk (alphacephei) — offline, Apache-2.0, ~50 MB model, a
125
- * real Node binding (no Python, no Docker, no multi-GB download). `vosk` is an
126
- * OPTIONAL peer: install it (`npm i vosk`) and point at a model
127
- * (`opts.model` / `VOSK_MODEL`) only if you use this aligner. ASR words are
128
- * mapped onto the script tokens by `mapAsrToScript`.
122
+ * Word timings via Vosk (alphacephei) — offline ASR, Apache-2.0. Shells out to
123
+ * a `vosk-align` command (the Python `vosk` binding + ffmpeg — deliberately NOT
124
+ * the npm `vosk` package, whose `ffi-napi` native build is broken on modern
125
+ * Node). The command reads any audio and writes
126
+ * { "words": [ { "word", "start", "end", "conf"? }, … ] }
127
+ * to stdout; its recognized words are LCS-mapped onto the script tokens by
128
+ * `mapAsrToScript`, so mis-recognitions (e.g. an unknown proper noun) just
129
+ * interpolate cleanly between the words around them.
130
+ *
131
+ * Provide the command via `opts.command` / `VOSK_ALIGN` (default `vosk-align`);
132
+ * the model is the command's own concern (its default, or `--model`/VOSK_MODEL),
133
+ * passed through with `opts.model`.
129
134
  */
130
135
  declare function voskAligner(opts?: {
136
+ command?: string;
131
137
  model?: string;
132
138
  }): Aligner;
133
139
  /** Resolve an aligner id; 'none' disables alignment (word-less segments). */
@@ -170,4 +176,4 @@ declare function synthesizeScript(scriptPath: string, opts?: SynthesizeOptions):
170
176
  /** Resolve `<scene>.narration.json` for a scene-module path (or accept the script itself). */
171
177
  declare function scriptPathFor(input: string): string;
172
178
  //#endregion
173
- export { AlignRequest, Aligner, SynthesizeOptions, SynthesizeResult, TtsProvider, TtsRequest, TtsResult, alignerById, cacheKey, decodeWavMono, espeakProvider, fakeProvider, heuristicAligner, heuristicWords, interpolateMissing, mapAsrToScript, openaiProvider, piperProvider, providerById, resampleTo16kPcm, scriptPathFor, synthesizeScript, voskAligner, wavDuration };
179
+ export { AlignRequest, Aligner, SynthesizeOptions, SynthesizeResult, TtsProvider, TtsRequest, TtsResult, VoskAlignWord, alignerById, cacheKey, espeakProvider, fakeProvider, heuristicAligner, heuristicWords, interpolateMissing, mapAsrToScript, openaiProvider, piperProvider, providerById, scriptPathFor, synthesizeScript, voskAligner, wavDuration };
package/dist/providers.js CHANGED
@@ -144,8 +144,12 @@ function piperProvider(opts = {}) {
144
144
  id: "piper",
145
145
  version: () => {
146
146
  const r = spawnSync("piper", ["--version"], { encoding: "utf8" });
147
- if (r.status !== 0) throw new NarrationError("piper not found on PATH — install rhasspy/piper, or use --provider fake/espeak/openai");
148
- const v = (r.stdout.trim() || r.stderr.trim() || "piper").split("\n")[0];
147
+ if (r.error) {
148
+ if (r.error.code === "ENOENT") throw new NarrationError("piper not found on PATH — `pip install piper-tts` (or the standalone rhasspy/piper), or use --provider fake/espeak/openai");
149
+ throw new NarrationError(`could not run piper: ${r.error.message}`);
150
+ }
151
+ const m = /\b\d+\.\d+\.\d+\b/.exec(r.stdout ?? "");
152
+ const v = m ? `piper ${m[0]}` : "piper (version unknown)";
149
153
  return Promise.resolve(opts.model ? `${v} ${basename(opts.model)}` : v);
150
154
  },
151
155
  synthesize: (req) => {
@@ -299,100 +303,51 @@ function mapAsrToScript(timed, scriptText) {
299
303
  };
300
304
  }));
301
305
  }
302
- /** Decode a 16-bit PCM RIFF/WAV to mono float samples (channels averaged). */
303
- function decodeWavMono(wav) {
304
- if (wav.length < 44 || wav.toString("ascii", 0, 4) !== "RIFF" || wav.toString("ascii", 8, 12) !== "WAVE") throw new NarrationError("not a RIFF/WAVE file");
305
- let channels = 1;
306
- let sampleRate = 16e3;
307
- let bits = 16;
308
- let dataOffset = -1;
309
- let dataSize = 0;
310
- let offset = 12;
311
- while (offset + 8 <= wav.length) {
312
- const id = wav.toString("ascii", offset, offset + 4);
313
- const size = wav.readUInt32LE(offset + 4);
314
- if (id === "fmt ") {
315
- channels = wav.readUInt16LE(offset + 10);
316
- sampleRate = wav.readUInt32LE(offset + 12);
317
- bits = wav.readUInt16LE(offset + 22);
318
- } else if (id === "data") {
319
- dataOffset = offset + 8;
320
- dataSize = size;
321
- }
322
- offset += 8 + size + size % 2;
323
- }
324
- if (bits !== 16) throw new NarrationError(`only 16-bit PCM WAV is supported (got ${bits}-bit)`);
325
- if (dataOffset < 0) throw new NarrationError("WAV has no data chunk");
326
- const frames = Math.floor(dataSize / 2 / Math.max(1, channels));
327
- const samples = new Float32Array(frames);
328
- for (let f = 0; f < frames; f++) {
329
- let acc = 0;
330
- for (let c = 0; c < channels; c++) acc += wav.readInt16LE(dataOffset + (f * channels + c) * 2);
331
- samples[f] = acc / channels / 32768;
332
- }
333
- return {
334
- samples,
335
- sampleRate
336
- };
337
- }
338
- /** Linear-resample mono float to a 16 kHz int16 LE PCM buffer (Vosk's input). */
339
- function resampleTo16kPcm(input) {
340
- const ratio = input.sampleRate / 16e3;
341
- const outLen = Math.max(1, Math.round(input.samples.length / ratio));
342
- const out = Buffer.alloc(outLen * 2);
343
- for (let i = 0; i < outLen; i++) {
344
- const src = i * ratio;
345
- const j = Math.floor(src);
346
- const frac = src - j;
347
- const a = input.samples[j] ?? 0;
348
- const b = input.samples[j + 1] ?? a;
349
- const v = Math.max(-1, Math.min(1, a + (b - a) * frac));
350
- out.writeInt16LE(Math.round(v * 32767), i * 2);
351
- }
352
- return out;
353
- }
354
306
  /**
355
- * Word timings via Vosk (alphacephei) — offline, Apache-2.0, ~50 MB model, a
356
- * real Node binding (no Python, no Docker, no multi-GB download). `vosk` is an
357
- * OPTIONAL peer: install it (`npm i vosk`) and point at a model
358
- * (`opts.model` / `VOSK_MODEL`) only if you use this aligner. ASR words are
359
- * mapped onto the script tokens by `mapAsrToScript`.
307
+ * Word timings via Vosk (alphacephei) — offline ASR, Apache-2.0. Shells out to
308
+ * a `vosk-align` command (the Python `vosk` binding + ffmpeg — deliberately NOT
309
+ * the npm `vosk` package, whose `ffi-napi` native build is broken on modern
310
+ * Node). The command reads any audio and writes
311
+ * { "words": [ { "word", "start", "end", "conf"? }, … ] }
312
+ * to stdout; its recognized words are LCS-mapped onto the script tokens by
313
+ * `mapAsrToScript`, so mis-recognitions (e.g. an unknown proper noun) just
314
+ * interpolate cleanly between the words around them.
315
+ *
316
+ * Provide the command via `opts.command` / `VOSK_ALIGN` (default `vosk-align`);
317
+ * the model is the command's own concern (its default, or `--model`/VOSK_MODEL),
318
+ * passed through with `opts.model`.
360
319
  */
361
320
  function voskAligner(opts = {}) {
362
- const modelPath = opts.model ?? process.env["VOSK_MODEL"];
363
- let vosk = null;
364
- const load = async () => {
365
- if (vosk) return vosk;
366
- try {
367
- vosk = await import("vosk");
368
- } catch {
369
- throw new NarrationError("vosk is not installed — `npm i vosk` and download a model, or use --align heuristic");
370
- }
371
- vosk.setLogLevel(-1);
372
- return vosk;
373
- };
321
+ const command = opts.command ?? process.env["VOSK_ALIGN"] ?? "vosk-align";
374
322
  return {
375
323
  id: "vosk",
376
- version: async () => {
377
- if (!modelPath) throw new NarrationError("vosk needs a model — set VOSK_MODEL or pass { model } (alphacephei.com/vosk/models)");
378
- if (!existsSync(modelPath)) throw new NarrationError(`vosk model not found at ${modelPath}`);
379
- await load();
380
- return `vosk:${basename(modelPath)}`;
324
+ version: () => {
325
+ const r = spawnSync(command, ["--help"], { encoding: "utf8" });
326
+ if (r.error) {
327
+ if (r.error.code === "ENOENT") throw new NarrationError(`'${command}' not found — provide a vosk-align command (Apache-2.0 Vosk + ffmpeg, JSON {words:[{word,start,end}]} on stdout), or use --align heuristic`);
328
+ throw new NarrationError(`could not run ${command}: ${r.error.message}`);
329
+ }
330
+ return Promise.resolve(opts.model ? `vosk ${basename(opts.model)}` : "vosk");
381
331
  },
382
- align: async (req) => {
383
- const v = await load();
384
- const model = new v.Model(modelPath);
385
- const rec = new v.Recognizer({
386
- model,
387
- sampleRate: 16e3
388
- });
332
+ align: (req) => {
333
+ const tag = createHash("sha256").update(req.text).digest("hex").slice(0, 8);
334
+ const wavPath = join(tmpdir(), `glissade-vosk-${process.pid}-${tag}.wav`);
389
335
  try {
390
- rec.setWords(true);
391
- rec.acceptWaveform(resampleTo16kPcm(decodeWavMono(req.wav)));
392
- return mapAsrToScript(rec.finalResult().result ?? [], req.text);
336
+ writeFileSync(wavPath, req.wav);
337
+ const r = spawnSync(command, [wavPath, ...opts.model ? ["--model", opts.model] : []], {
338
+ encoding: "utf8",
339
+ maxBuffer: 64 * 1024 * 1024
340
+ });
341
+ if (r.error) throw new NarrationError(`${command} failed to run: ${r.error.message}`);
342
+ if (r.status !== 0) throw new NarrationError(`${command} failed: ${(r.stderr || "").slice(0, 300)}`);
343
+ const timed = (JSON.parse(r.stdout).words ?? []).filter((w) => typeof w.start === "number" && typeof w.end === "number").map((w) => ({
344
+ word: w.word,
345
+ start: w.start,
346
+ end: w.end
347
+ }));
348
+ return Promise.resolve(mapAsrToScript(timed, req.text));
393
349
  } finally {
394
- rec.free();
395
- model.free();
350
+ if (existsSync(wavPath)) unlinkSync(wavPath);
396
351
  }
397
352
  }
398
353
  };
@@ -539,4 +494,4 @@ function scriptPathFor(input) {
539
494
  return candidate;
540
495
  }
541
496
  //#endregion
542
- export { alignerById, cacheKey, decodeWavMono, espeakProvider, fakeProvider, heuristicAligner, heuristicWords, interpolateMissing, mapAsrToScript, openaiProvider, piperProvider, providerById, resampleTo16kPcm, scriptPathFor, synthesizeScript, voskAligner, wavDuration };
497
+ export { alignerById, cacheKey, espeakProvider, fakeProvider, heuristicAligner, heuristicWords, interpolateMissing, mapAsrToScript, openaiProvider, piperProvider, providerById, scriptPathFor, synthesizeScript, voskAligner, wavDuration };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@glissade/narrate",
3
- "version": "0.5.0-pre.0",
3
+ "version": "0.5.0-pre.1",
4
4
  "description": "glissade narration + captions: TTS at prepare time (gs narrate), deterministic caching, narration-anchored timeline beats, and captions as plain tracks. Render stays offline.",
5
5
  "license": "Apache-2.0",
6
6
  "type": "module",
@@ -19,8 +19,8 @@
19
19
  "dist"
20
20
  ],
21
21
  "dependencies": {
22
- "@glissade/core": "0.5.0-pre.0",
23
- "@glissade/scene": "0.5.0-pre.0"
22
+ "@glissade/core": "0.5.0-pre.1",
23
+ "@glissade/scene": "0.5.0-pre.1"
24
24
  },
25
25
  "repository": {
26
26
  "type": "git",