npm - @glissade/narrate - Versions diffs - 0.5.0-pre.0 → 0.5.0-pre.1 - Mend

@glissade/narrate 0.5.0-pre.0 → 0.5.0-pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/providers.d.ts CHANGED

@@ -111,23 +111,29 @@ declare function mapAsrToScript(timed: {
   start: number;
   end: number;
 }[];
-interface WavMono {
-  /** mono samples in [-1, 1] */
-  samples: Float32Array;
-  sampleRate: number;
+/** one word from vosk-align's JSON output */
+interface VoskAlignWord {
+  word: string;
+  start: number;
+  end: number;
+  conf?: number;
 }
-/** Decode a 16-bit PCM RIFF/WAV to mono float samples (channels averaged). */
-declare function decodeWavMono(wav: Buffer): WavMono;
-/** Linear-resample mono float to a 16 kHz int16 LE PCM buffer (Vosk's input). */
-declare function resampleTo16kPcm(input: WavMono): Buffer;
 /**
- * Word timings via Vosk (alphacephei) — offline, Apache-2.0, ~50 MB model, a
- * real Node binding (no Python, no Docker, no multi-GB download). `vosk` is an
- * OPTIONAL peer: install it (`npm i vosk`) and point at a model
- * (`opts.model` / `VOSK_MODEL`) only if you use this aligner. ASR words are
- * mapped onto the script tokens by `mapAsrToScript`.
+ * Word timings via Vosk (alphacephei) — offline ASR, Apache-2.0. Shells out to
+ * a `vosk-align` command (the Python `vosk` binding + ffmpeg — deliberately NOT
+ * the npm `vosk` package, whose `ffi-napi` native build is broken on modern
+ * Node). The command reads any audio and writes
+ *   { "words": [ { "word", "start", "end", "conf"? }, … ] }
+ * to stdout; its recognized words are LCS-mapped onto the script tokens by
+ * `mapAsrToScript`, so mis-recognitions (e.g. an unknown proper noun) just
+ * interpolate cleanly between the words around them.
+ *
+ * Provide the command via `opts.command` / `VOSK_ALIGN` (default `vosk-align`);
+ * the model is the command's own concern (its default, or `--model`/VOSK_MODEL),
+ * passed through with `opts.model`.
  */
 declare function voskAligner(opts?: {
+  command?: string;
   model?: string;
 }): Aligner;
 /** Resolve an aligner id; 'none' disables alignment (word-less segments). */
@@ -170,4 +176,4 @@ declare function synthesizeScript(scriptPath: string, opts?: SynthesizeOptions):
 /** Resolve `<scene>.narration.json` for a scene-module path (or accept the script itself). */
 declare function scriptPathFor(input: string): string;
 //#endregion
-export { AlignRequest, Aligner, SynthesizeOptions, SynthesizeResult, TtsProvider, TtsRequest, TtsResult, alignerById, cacheKey, decodeWavMono, espeakProvider, fakeProvider, heuristicAligner, heuristicWords, interpolateMissing, mapAsrToScript, openaiProvider, piperProvider, providerById, resampleTo16kPcm, scriptPathFor, synthesizeScript, voskAligner, wavDuration };
+export { AlignRequest, Aligner, SynthesizeOptions, SynthesizeResult, TtsProvider, TtsRequest, TtsResult, VoskAlignWord, alignerById, cacheKey, espeakProvider, fakeProvider, heuristicAligner, heuristicWords, interpolateMissing, mapAsrToScript, openaiProvider, piperProvider, providerById, scriptPathFor, synthesizeScript, voskAligner, wavDuration };

package/dist/providers.js CHANGED

@@ -144,8 +144,12 @@ function piperProvider(opts = {}) {
 		id: "piper",
 		version: () => {
 			const r = spawnSync("piper", ["--version"], { encoding: "utf8" });
-			if (r.status !== 0) throw new NarrationError("piper not found on PATH — install rhasspy/piper, or use --provider fake/espeak/openai");
-			const v = (r.stdout.trim() || r.stderr.trim() || "piper").split("\n")[0];
+			if (r.error) {
+				if (r.error.code === "ENOENT") throw new NarrationError("piper not found on PATH — `pip install piper-tts` (or the standalone rhasspy/piper), or use --provider fake/espeak/openai");
+				throw new NarrationError(`could not run piper: ${r.error.message}`);
+			}
+			const m = /\b\d+\.\d+\.\d+\b/.exec(r.stdout ?? "");
+			const v = m ? `piper ${m[0]}` : "piper (version unknown)";
 			return Promise.resolve(opts.model ? `${v} ${basename(opts.model)}` : v);
 		},
 		synthesize: (req) => {
@@ -299,100 +303,51 @@ function mapAsrToScript(timed, scriptText) {
 		};
 	}));
 }
-/** Decode a 16-bit PCM RIFF/WAV to mono float samples (channels averaged). */
-function decodeWavMono(wav) {
-	if (wav.length < 44 || wav.toString("ascii", 0, 4) !== "RIFF" || wav.toString("ascii", 8, 12) !== "WAVE") throw new NarrationError("not a RIFF/WAVE file");
-	let channels = 1;
-	let sampleRate = 16e3;
-	let bits = 16;
-	let dataOffset = -1;
-	let dataSize = 0;
-	let offset = 12;
-	while (offset + 8 <= wav.length) {
-		const id = wav.toString("ascii", offset, offset + 4);
-		const size = wav.readUInt32LE(offset + 4);
-		if (id === "fmt ") {
-			channels = wav.readUInt16LE(offset + 10);
-			sampleRate = wav.readUInt32LE(offset + 12);
-			bits = wav.readUInt16LE(offset + 22);
-		} else if (id === "data") {
-			dataOffset = offset + 8;
-			dataSize = size;
-		}
-		offset += 8 + size + size % 2;
-	}
-	if (bits !== 16) throw new NarrationError(`only 16-bit PCM WAV is supported (got ${bits}-bit)`);
-	if (dataOffset < 0) throw new NarrationError("WAV has no data chunk");
-	const frames = Math.floor(dataSize / 2 / Math.max(1, channels));
-	const samples = new Float32Array(frames);
-	for (let f = 0; f < frames; f++) {
-		let acc = 0;
-		for (let c = 0; c < channels; c++) acc += wav.readInt16LE(dataOffset + (f * channels + c) * 2);
-		samples[f] = acc / channels / 32768;
-	}
-	return {
-		samples,
-		sampleRate
-	};
-}
-/** Linear-resample mono float to a 16 kHz int16 LE PCM buffer (Vosk's input). */
-function resampleTo16kPcm(input) {
-	const ratio = input.sampleRate / 16e3;
-	const outLen = Math.max(1, Math.round(input.samples.length / ratio));
-	const out = Buffer.alloc(outLen * 2);
-	for (let i = 0; i < outLen; i++) {
-		const src = i * ratio;
-		const j = Math.floor(src);
-		const frac = src - j;
-		const a = input.samples[j] ?? 0;
-		const b = input.samples[j + 1] ?? a;
-		const v = Math.max(-1, Math.min(1, a + (b - a) * frac));
-		out.writeInt16LE(Math.round(v * 32767), i * 2);
-	}
-	return out;
-}
 /**
-* Word timings via Vosk (alphacephei) — offline, Apache-2.0, ~50 MB model, a
-* real Node binding (no Python, no Docker, no multi-GB download). `vosk` is an
-* OPTIONAL peer: install it (`npm i vosk`) and point at a model
-* (`opts.model` / `VOSK_MODEL`) only if you use this aligner. ASR words are
-* mapped onto the script tokens by `mapAsrToScript`.
+* Word timings via Vosk (alphacephei) — offline ASR, Apache-2.0. Shells out to
+* a `vosk-align` command (the Python `vosk` binding + ffmpeg — deliberately NOT
+* the npm `vosk` package, whose `ffi-napi` native build is broken on modern
+* Node). The command reads any audio and writes
+*   { "words": [ { "word", "start", "end", "conf"? }, … ] }
+* to stdout; its recognized words are LCS-mapped onto the script tokens by
+* `mapAsrToScript`, so mis-recognitions (e.g. an unknown proper noun) just
+* interpolate cleanly between the words around them.
+*
+* Provide the command via `opts.command` / `VOSK_ALIGN` (default `vosk-align`);
+* the model is the command's own concern (its default, or `--model`/VOSK_MODEL),
+* passed through with `opts.model`.
 */
 function voskAligner(opts = {}) {
-	const modelPath = opts.model ?? process.env["VOSK_MODEL"];
-	let vosk = null;
-	const load = async () => {
-		if (vosk) return vosk;
-		try {
-			vosk = await import("vosk");
-		} catch {
-			throw new NarrationError("vosk is not installed — `npm i vosk` and download a model, or use --align heuristic");
-		}
-		vosk.setLogLevel(-1);
-		return vosk;
-	};
+	const command = opts.command ?? process.env["VOSK_ALIGN"] ?? "vosk-align";
 	return {
 		id: "vosk",
-		version: async () => {
-			if (!modelPath) throw new NarrationError("vosk needs a model — set VOSK_MODEL or pass { model } (alphacephei.com/vosk/models)");
-			if (!existsSync(modelPath)) throw new NarrationError(`vosk model not found at ${modelPath}`);
-			await load();
-			return `vosk:${basename(modelPath)}`;
+		version: () => {
+			const r = spawnSync(command, ["--help"], { encoding: "utf8" });
+			if (r.error) {
+				if (r.error.code === "ENOENT") throw new NarrationError(`'${command}' not found — provide a vosk-align command (Apache-2.0 Vosk + ffmpeg, JSON {words:[{word,start,end}]} on stdout), or use --align heuristic`);
+				throw new NarrationError(`could not run ${command}: ${r.error.message}`);
+			}
+			return Promise.resolve(opts.model ? `vosk ${basename(opts.model)}` : "vosk");
 		},
-		align: async (req) => {
-			const v = await load();
-			const model = new v.Model(modelPath);
-			const rec = new v.Recognizer({
-				model,
-				sampleRate: 16e3
-			});
+		align: (req) => {
+			const tag = createHash("sha256").update(req.text).digest("hex").slice(0, 8);
+			const wavPath = join(tmpdir(), `glissade-vosk-${process.pid}-${tag}.wav`);
 			try {
-				rec.setWords(true);
-				rec.acceptWaveform(resampleTo16kPcm(decodeWavMono(req.wav)));
-				return mapAsrToScript(rec.finalResult().result ?? [], req.text);
+				writeFileSync(wavPath, req.wav);
+				const r = spawnSync(command, [wavPath, ...opts.model ? ["--model", opts.model] : []], {
+					encoding: "utf8",
+					maxBuffer: 64 * 1024 * 1024
+				});
+				if (r.error) throw new NarrationError(`${command} failed to run: ${r.error.message}`);
+				if (r.status !== 0) throw new NarrationError(`${command} failed: ${(r.stderr || "").slice(0, 300)}`);
+				const timed = (JSON.parse(r.stdout).words ?? []).filter((w) => typeof w.start === "number" && typeof w.end === "number").map((w) => ({
+					word: w.word,
+					start: w.start,
+					end: w.end
+				}));
+				return Promise.resolve(mapAsrToScript(timed, req.text));
 			} finally {
-				rec.free();
-				model.free();
+				if (existsSync(wavPath)) unlinkSync(wavPath);
 			}
 		}
 	};
@@ -539,4 +494,4 @@ function scriptPathFor(input) {
 	return candidate;
 }
 //#endregion
-export { alignerById, cacheKey, decodeWavMono, espeakProvider, fakeProvider, heuristicAligner, heuristicWords, interpolateMissing, mapAsrToScript, openaiProvider, piperProvider, providerById, resampleTo16kPcm, scriptPathFor, synthesizeScript, voskAligner, wavDuration };
+export { alignerById, cacheKey, espeakProvider, fakeProvider, heuristicAligner, heuristicWords, interpolateMissing, mapAsrToScript, openaiProvider, piperProvider, providerById, scriptPathFor, synthesizeScript, voskAligner, wavDuration };

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@glissade/narrate",
-  "version": "0.5.0-pre.0",
+  "version": "0.5.0-pre.1",
   "description": "glissade narration + captions: TTS at prepare time (gs narrate), deterministic caching, narration-anchored timeline beats, and captions as plain tracks. Render stays offline.",
   "license": "Apache-2.0",
   "type": "module",
@@ -19,8 +19,8 @@
     "dist"
   ],
   "dependencies": {
-    "@glissade/core": "0.5.0-pre.0",
-    "@glissade/scene": "0.5.0-pre.0"
+    "@glissade/core": "0.5.0-pre.1",
+    "@glissade/scene": "0.5.0-pre.1"
   },
   "repository": {
     "type": "git",