npm - @glissade/narrate - Versions diffs - 0.5.0-pre.0 → 0.5.0-pre.2 - Mend

@glissade/narrate 0.5.0-pre.0 → 0.5.0-pre.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/dist/index.d.ts — CHANGED

@@ -12,6 +12,29 @@ interface NarrationSegment {
   /** silence after THIS segment (s); overrides the script default */
   gapAfter?: number;
 }
+/** What the music bed does across a pause window. */
+type BedMode = /** hold the current (ducked) level across the pause — no swell, the default */
+'hold'
+/** cut the bed to a floor for the window (a dramatic silence) */ | 'silence'
+/** let the bed breathe back up to base while the voice rests */ | 'swell';
+/**
+ * An explicit silence beat between segments — an addressable WINDOW, not just
+ * dead air. It shifts every later segment's start (re-flows on re-narrate) and
+ * gives you anchors (`beats.start/end/duration('id')`) to hang visuals and SFX
+ * on, plus a per-pause `bed` mode for the music. A pause supplies its own
+ * silence, so it suppresses the default inter-segment gap around it.
+ */
+interface NarrationPause {
+  id: string;
+  /** the silence length in seconds */
+  pause: number;
+  /** what the music bed does across this window; default 'hold' */
+  bed?: BedMode;
+}
+/** A script element: a spoken segment or an explicit pause beat. */
+type NarrationElement = NarrationSegment | NarrationPause;
+/** A pause element is the one carrying a numeric `pause` field. */
+declare function isPause(el: NarrationElement): el is NarrationPause;
 interface NarrationScript {
   narrationVersion: 1;
   provider?: string;
@@ -28,7 +51,8 @@ interface NarrationScript {
    * segments word-less. Providers that supply their own words ignore this.
    */
   align?: string;
-  segments: NarrationSegment[];
+  /** spoken segments and explicit pause beats, in playback order */
+  segments: NarrationElement[];
 }
 interface TimedWord {
   word: string;
@@ -46,24 +70,35 @@ interface TimedSegment {
   /** present only when the provider supplies word timestamps */
   words?: TimedWord[];
 }
+/** A resolved pause window in the committed manifest. */
+interface TimedPause {
+  id: string;
+  start: number;
+  duration: number;
+  bed: BedMode;
+}
 interface NarrationTiming {
   timingVersion: 1;
   provider: string;
   providerVersion: string;
   totalDuration: number;
   segments: TimedSegment[];
+  /** explicit pause windows, addressable like segments; omitted when none */
+  pauses?: TimedPause[];
 }
 declare class NarrationError extends Error {
   constructor(message: string);
 }
 interface NarrationAnchors {
-  /** segment start, absolute timeline seconds */
+  /** segment OR pause start, absolute timeline seconds */
   start(id: string): number;
-  /** segment end (start + duration) */
+  /** segment OR pause end (start + duration) */
   end(id: string): number;
   duration(id: string): number;
+  /** start + offset — a sub-beat inside a segment or pause window */
+  at(id: string, offset?: number): number;
   readonly totalDuration: number;
-  /** '<id>.start' / '<id>.end' labels — merge into the timeline for studio visibility */
+  /** '<id>.start' / '<id>.end' labels (segments + pauses) — merge into the timeline for studio visibility */
   labels(): Record<string, number>;
   /** narration clips on the existing AudioClip machinery; baseUrl prefixes each file */
   clips(baseUrl: string): AudioClip[];
@@ -115,11 +150,15 @@ interface DuckOptions {
   mergeGap?: number;
   /** the music clip's `at` on the timeline; gain keys are CLIP-local. Default 0. */
   clipAt?: number;
+  /** gain a 'silence' pause ducks the bed to; default 0 (a true cut). */
+  silence?: number;
 }
 /**
  * The bed-ducking envelope every narrated video needs: duck windows are the
- * narration segments, with attack/release ramps and near-window merging.
- * Pure function of the committed manifest — re-narrate and the ducking
+ * narration segments, with attack/release ramps and near-window merging. Pause
+ * beats join in by their `bed` mode — `hold` (default) keeps the bed ducked
+ * across the pause, `silence` cuts it to a floor, `swell` lets it breathe back
+ * to base. Pure function of the committed manifest — re-narrate and the ducking
  * re-flows. Returns a keys-only gain envelope for AudioClip.gain.
  */
 declare function duckEnvelope(timing: NarrationTiming, opts?: DuckOptions): {
@@ -184,4 +223,4 @@ declare function music(timing: MusicTiming, at?: number): MusicAnchors;
 declare function toSrt(timing: NarrationTiming): string;
 declare function toVtt(timing: NarrationTiming): string;
 //#endregion
-export { CaptionStyle, CaptionTrackOptions, DuckOptions, MusicAnchors, MusicClipOptions, MusicTiming, NarrationAnchors, NarrationError, NarrationScript, NarrationSegment, NarrationTiming, TimedSegment, TimedWord, captionNode, captionTrack, duckEnvelope, music, narration, toSrt, toVtt, validateMusicTiming };
+export { BedMode, CaptionStyle, CaptionTrackOptions, DuckOptions, MusicAnchors, MusicClipOptions, MusicTiming, NarrationAnchors, NarrationElement, NarrationError, NarrationPause, NarrationScript, NarrationSegment, NarrationTiming, TimedPause, TimedSegment, TimedWord, captionNode, captionTrack, duckEnvelope, isPause, music, narration, toSrt, toVtt, validateMusicTiming };

package/dist/index.js — CHANGED

@@ -8,6 +8,10 @@ import { Text, glow } from "@glissade/scene";
 * offline and deterministic. Captions are a plain string track driving a
 * Text node — they live in the timeline JSON and golden-frame CI covers them.
 */
+/** A pause element is the one carrying a numeric `pause` field. */
+function isPause(el) {
+	return typeof el.pause === "number";
+}
 var NarrationError = class extends Error {
 	constructor(message) {
 		super(message);
@@ -15,22 +19,37 @@ var NarrationError = class extends Error {
 	}
 };
 function narration(timing) {
-	const byId = new Map(timing.segments.map((s) => [s.id, s]));
-	const seg = (id) => {
-		const s = byId.get(id);
-		if (!s) throw new NarrationError(`no narration segment '${id}' (have: ${[...byId.keys()].join(", ")})`);
-		return s;
+	const byId = /* @__PURE__ */ new Map();
+	for (const s of timing.segments) {
+		if (byId.has(s.id)) throw new NarrationError(`duplicate narration id '${s.id}'`);
+		byId.set(s.id, {
+			start: s.start,
+			duration: s.duration
+		});
+	}
+	for (const p of timing.pauses ?? []) {
+		if (byId.has(p.id)) throw new NarrationError(`duplicate narration id '${p.id}' (segment and pause collide)`);
+		byId.set(p.id, {
+			start: p.start,
+			duration: p.duration
+		});
+	}
+	const beat = (id) => {
+		const b = byId.get(id);
+		if (!b) throw new NarrationError(`no narration beat '${id}' (have: ${[...byId.keys()].join(", ")})`);
+		return b;
 	};
 	return {
-		start: (id) => seg(id).start,
-		end: (id) => seg(id).start + seg(id).duration,
-		duration: (id) => seg(id).duration,
+		start: (id) => beat(id).start,
+		end: (id) => beat(id).start + beat(id).duration,
+		duration: (id) => beat(id).duration,
+		at: (id, offset = 0) => beat(id).start + offset,
 		totalDuration: timing.totalDuration,
 		labels: () => {
 			const out = {};
-			for (const s of timing.segments) {
-				out[`${s.id}.start`] = s.start;
-				out[`${s.id}.end`] = s.start + s.duration;
+			for (const [id, b] of byId) {
+				out[`${id}.start`] = b.start;
+				out[`${id}.end`] = b.start + b.duration;
 			}
 			return out;
 		},
@@ -89,8 +108,10 @@ function captionNode(size, style = {}) {
 }
 /**
 * The bed-ducking envelope every narrated video needs: duck windows are the
-* narration segments, with attack/release ramps and near-window merging.
-* Pure function of the committed manifest — re-narrate and the ducking
+* narration segments, with attack/release ramps and near-window merging. Pause
+* beats join in by their `bed` mode — `hold` (default) keeps the bed ducked
+* across the pause, `silence` cuts it to a floor, `swell` lets it breathe back
+* to base. Pure function of the committed manifest — re-narrate and the ducking
 * re-flows. Returns a keys-only gain envelope for AudioClip.gain.
 */
 function duckEnvelope(timing, opts = {}) {
@@ -100,31 +121,83 @@ function duckEnvelope(timing, opts = {}) {
 	const release = opts.release ?? .4;
 	const mergeGap = opts.mergeGap ?? .5;
 	const clipAt = opts.clipAt ?? 0;
-	const windows = [];
-	for (const s of [...timing.segments].sort((a, b) => a.start - b.start)) {
-		const last = windows[windows.length - 1];
-		if (last && s.start - last.end < attack + release + mergeGap) last.end = Math.max(last.end, s.start + s.duration);
-		else windows.push({
-			start: s.start,
-			end: s.start + s.duration
+	const silence = opts.silence ?? 0;
+	const levelOf = (bed) => bed === "silence" ? silence : bed === "swell" ? base : duck;
+	const raw = [...timing.segments.map((s) => ({
+		start: s.start,
+		end: s.start + s.duration,
+		level: duck
+	})), ...(timing.pauses ?? []).map((p) => ({
+		start: p.start,
+		end: p.start + p.duration,
+		level: levelOf(p.bed)
+	}))].sort((a, b) => a.start - b.start);
+	const merged = [];
+	for (const w of raw) {
+		const last = merged[merged.length - 1];
+		if (last && last.level === w.level && w.start - last.end < attack + release + mergeGap) last.end = Math.max(last.end, w.end);
+		else merged.push({ ...w });
+	}
+	const active = merged.filter((w) => w.level !== base);
+	if (active.length === 0) return { keys: [key(0, base)] };
+	const regions = [];
+	for (const w of active) {
+		const prev = regions[regions.length - 1];
+		if (prev && w.start > prev.end) regions.push({
+			start: prev.end,
+			end: w.start,
+			level: base
 		});
+		regions.push({ ...w });
+	}
+	const transitions = [{
+		t: regions[0].start,
+		from: base,
+		to: regions[0].level
+	}];
+	for (let i = 0; i < regions.length - 1; i++) if (regions[i].level !== regions[i + 1].level) transitions.push({
+		t: regions[i].end,
+		from: regions[i].level,
+		to: regions[i + 1].level
+	});
+	const lastRegion = regions[regions.length - 1];
+	transitions.push({
+		t: lastRegion.end,
+		from: lastRegion.level,
+		to: base
+	});
+	let keys = [];
+	for (const tr of transitions) {
+		if (tr.to === tr.from) continue;
+		if (tr.to < tr.from) keys.push(key(tr.t - attack, tr.from), key(tr.t, tr.to));
+		else keys.push(key(tr.t, tr.from), key(tr.t + release, tr.to));
 	}
-	const keys = [];
-	for (const w of windows) {
-		const rampStart = w.start - attack - clipAt;
-		const down = w.start - clipAt;
-		const up = w.end - clipAt;
-		const rampEnd = w.end + release - clipAt;
-		if (rampEnd <= 0) continue;
-		if (rampStart > 0) keys.push(key(rampStart, base));
-		if (down > 0) keys.push(key(down, duck));
-		else if (keys.length === 0) keys.push(key(0, duck));
-		keys.push(key(Math.max(up, 1e-6), duck));
-		keys.push(key(rampEnd, base));
+	keys = keys.map((k) => ({
+		t: k.t - clipAt,
+		value: k.value
+	})).sort((a, b) => a.t - b.t);
+	const ordered = [];
+	for (const k of keys) {
+		const prev = ordered[ordered.length - 1];
+		if (prev && k.t <= prev.t) prev.value = k.value;
+		else ordered.push(k);
+	}
+	const out = [];
+	for (let i = 0; i < ordered.length; i++) {
+		const k = ordered[i];
+		if (k.t < 0) {
+			const next = ordered[i + 1];
+			if (!next || next.t >= 0) {
+				const v = next && next.t > k.t ? k.value + (next.value - k.value) * ((0 - k.t) / (next.t - k.t)) : k.value;
+				out.push(key(0, v));
+			}
+			continue;
+		}
+		out.push(k);
 	}
-	if (keys.length === 0) keys.push(key(0, base));
-	if (keys[0].t > 0) keys.unshift(key(0, base));
-	return { keys };
+	if (out.length === 0) out.push(key(0, base));
+	if (out[0].t > 0) out.unshift(key(0, base));
+	return { keys: out };
 }
 function validateMusicTiming(timing) {
 	if (timing.musicVersion !== 1) throw new NarrationError(`unsupported musicVersion ${String(timing.musicVersion)}`);
@@ -191,4 +264,4 @@ function toVtt(timing) {
 	return "WEBVTT\n\n" + timing.segments.map((s) => `${srtTime(s.start, ".")} --> ${srtTime(s.start + s.duration, ".")}\n${s.text}`).join("\n\n") + "\n";
 }
 //#endregion
-export { NarrationError, captionNode, captionTrack, duckEnvelope, music, narration, toSrt, toVtt, validateMusicTiming };
+export { NarrationError, captionNode, captionTrack, duckEnvelope, isPause, music, narration, toSrt, toVtt, validateMusicTiming };

package/dist/providers.d.ts — CHANGED

@@ -41,9 +41,19 @@ declare function openaiProvider(opts?: {
  * offline. Needs a voice MODEL (`.onnx` + sibling `.onnx.json`) — pass its
  * path as `model`, or per-segment as `voice`. Emits no word timestamps; the
  * alignment step (below) fills them in.
+ *
+ * DETERMINISTIC by default: VITS adds noise (generator + the stochastic
+ * duration predictor), so the same text re-synthesizes to slightly different
+ * audio/durations. glissade zeroes both noise scales so re-synth is
+ * byte-identical — reproducible pipelines, glissade's determinism contract.
+ * For piper's more-natural (but drifting) prosody, pass its defaults
+ * (`{ noiseScale: 0.667, noiseWScale: 0.8 }`) and wire via `providerImpl`.
+ * The noise mode is part of `version()`, so changing it invalidates the cache.
  */
 declare function piperProvider(opts?: {
   model?: string;
+  noiseScale?: number;
+  noiseWScale?: number;
 }): TtsProvider;
 declare function providerById(id: string): TtsProvider;
 interface AlignRequest {
@@ -111,23 +121,29 @@ declare function mapAsrToScript(timed: {
   start: number;
   end: number;
 }[];
-interface WavMono {
-  /** mono samples in [-1, 1] */
-  samples: Float32Array;
-  sampleRate: number;
+/** one word from vosk-align's JSON output */
+interface VoskAlignWord {
+  word: string;
+  start: number;
+  end: number;
+  conf?: number;
 }
-/** Decode a 16-bit PCM RIFF/WAV to mono float samples (channels averaged). */
-declare function decodeWavMono(wav: Buffer): WavMono;
-/** Linear-resample mono float to a 16 kHz int16 LE PCM buffer (Vosk's input). */
-declare function resampleTo16kPcm(input: WavMono): Buffer;
 /**
- * Word timings via Vosk (alphacephei) — offline, Apache-2.0, ~50 MB model, a
- * real Node binding (no Python, no Docker, no multi-GB download). `vosk` is an
- * OPTIONAL peer: install it (`npm i vosk`) and point at a model
- * (`opts.model` / `VOSK_MODEL`) only if you use this aligner. ASR words are
- * mapped onto the script tokens by `mapAsrToScript`.
+ * Word timings via Vosk (alphacephei) — offline ASR, Apache-2.0. Shells out to
+ * a `vosk-align` command (the Python `vosk` binding + ffmpeg — deliberately NOT
+ * the npm `vosk` package, whose `ffi-napi` native build is broken on modern
+ * Node). The command reads any audio and writes
+ *   { "words": [ { "word", "start", "end", "conf"? }, … ] }
+ * to stdout; its recognized words are LCS-mapped onto the script tokens by
+ * `mapAsrToScript`, so mis-recognitions (e.g. an unknown proper noun) just
+ * interpolate cleanly between the words around them.
+ *
+ * Provide the command via `opts.command` / `VOSK_ALIGN` (default `vosk-align`);
+ * the model is the command's own concern (its default, or `--model`/VOSK_MODEL),
+ * passed through with `opts.model`.
  */
 declare function voskAligner(opts?: {
+  command?: string;
   model?: string;
 }): Aligner;
 /** Resolve an aligner id; 'none' disables alignment (word-less segments). */
@@ -170,4 +186,4 @@ declare function synthesizeScript(scriptPath: string, opts?: SynthesizeOptions):
 /** Resolve `<scene>.narration.json` for a scene-module path (or accept the script itself). */
 declare function scriptPathFor(input: string): string;
 //#endregion
-export { AlignRequest, Aligner, SynthesizeOptions, SynthesizeResult, TtsProvider, TtsRequest, TtsResult, alignerById, cacheKey, decodeWavMono, espeakProvider, fakeProvider, heuristicAligner, heuristicWords, interpolateMissing, mapAsrToScript, openaiProvider, piperProvider, providerById, resampleTo16kPcm, scriptPathFor, synthesizeScript, voskAligner, wavDuration };
+export { AlignRequest, Aligner, SynthesizeOptions, SynthesizeResult, TtsProvider, TtsRequest, TtsResult, VoskAlignWord, alignerById, cacheKey, espeakProvider, fakeProvider, heuristicAligner, heuristicWords, interpolateMissing, mapAsrToScript, openaiProvider, piperProvider, providerById, scriptPathFor, synthesizeScript, voskAligner, wavDuration };

package/dist/providers.js — CHANGED

@@ -1,4 +1,4 @@
-import { NarrationError } from "./index.js";
+import { NarrationError, isPause } from "./index.js";
 import { createHash } from "node:crypto";
 import { existsSync, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
 import { basename, dirname, join } from "node:path";
@@ -138,15 +138,34 @@ function openaiProvider(opts = {}) {
 * offline. Needs a voice MODEL (`.onnx` + sibling `.onnx.json`) — pass its
 * path as `model`, or per-segment as `voice`. Emits no word timestamps; the
 * alignment step (below) fills them in.
+*
+* DETERMINISTIC by default: VITS adds noise (generator + the stochastic
+* duration predictor), so the same text re-synthesizes to slightly different
+* audio/durations. glissade zeroes both noise scales so re-synth is
+* byte-identical — reproducible pipelines, glissade's determinism contract.
+* For piper's more-natural (but drifting) prosody, pass its defaults
+* (`{ noiseScale: 0.667, noiseWScale: 0.8 }`) and wire via `providerImpl`.
+* The noise mode is part of `version()`, so changing it invalidates the cache.
 */
 function piperProvider(opts = {}) {
+	const noiseScale = opts.noiseScale ?? 0;
+	const noiseWScale = opts.noiseWScale ?? 0;
 	return {
 		id: "piper",
 		version: () => {
 			const r = spawnSync("piper", ["--version"], { encoding: "utf8" });
-			if (r.status !== 0) throw new NarrationError("piper not found on PATH — install rhasspy/piper, or use --provider fake/espeak/openai");
-			const v = (r.stdout.trim() || r.stderr.trim() || "piper").split("\n")[0];
-			return Promise.resolve(opts.model ? `${v} ${basename(opts.model)}` : v);
+			if (r.error) {
+				if (r.error.code === "ENOENT") throw new NarrationError("piper not found on PATH — `pip install piper-tts` (or the standalone rhasspy/piper), or use --provider fake/espeak/openai");
+				throw new NarrationError(`could not run piper: ${r.error.message}`);
+			}
+			const m = /\b\d+\.\d+\.\d+\b/.exec(r.stdout ?? "");
+			const noise = `noise=${noiseScale}/${noiseWScale}`;
+			const v = m ? `piper ${m[0]}` : "piper";
+			return Promise.resolve([
+				v,
+				noise,
+				opts.model ? basename(opts.model) : null
+			].filter(Boolean).join(" "));
 		},
 		synthesize: (req) => {
 			const model = req.voice ?? opts.model;
@@ -157,7 +176,11 @@ function piperProvider(opts = {}) {
 				"--model",
 				model,
 				"--output_file",
-				out
+				out,
+				"--noise-scale",
+				String(noiseScale),
+				"--noise-w-scale",
+				String(noiseWScale)
 			];
 			if (req.rate !== void 0 && req.rate > 0) args.push("--length_scale", String(1 / req.rate));
 			const r = spawnSync("piper", args, {
@@ -299,100 +322,51 @@ function mapAsrToScript(timed, scriptText) {
 		};
 	}));
 }
-/** Decode a 16-bit PCM RIFF/WAV to mono float samples (channels averaged). */
-function decodeWavMono(wav) {
-	if (wav.length < 44 || wav.toString("ascii", 0, 4) !== "RIFF" || wav.toString("ascii", 8, 12) !== "WAVE") throw new NarrationError("not a RIFF/WAVE file");
-	let channels = 1;
-	let sampleRate = 16e3;
-	let bits = 16;
-	let dataOffset = -1;
-	let dataSize = 0;
-	let offset = 12;
-	while (offset + 8 <= wav.length) {
-		const id = wav.toString("ascii", offset, offset + 4);
-		const size = wav.readUInt32LE(offset + 4);
-		if (id === "fmt ") {
-			channels = wav.readUInt16LE(offset + 10);
-			sampleRate = wav.readUInt32LE(offset + 12);
-			bits = wav.readUInt16LE(offset + 22);
-		} else if (id === "data") {
-			dataOffset = offset + 8;
-			dataSize = size;
-		}
-		offset += 8 + size + size % 2;
-	}
-	if (bits !== 16) throw new NarrationError(`only 16-bit PCM WAV is supported (got ${bits}-bit)`);
-	if (dataOffset < 0) throw new NarrationError("WAV has no data chunk");
-	const frames = Math.floor(dataSize / 2 / Math.max(1, channels));
-	const samples = new Float32Array(frames);
-	for (let f = 0; f < frames; f++) {
-		let acc = 0;
-		for (let c = 0; c < channels; c++) acc += wav.readInt16LE(dataOffset + (f * channels + c) * 2);
-		samples[f] = acc / channels / 32768;
-	}
-	return {
-		samples,
-		sampleRate
-	};
-}
-/** Linear-resample mono float to a 16 kHz int16 LE PCM buffer (Vosk's input). */
-function resampleTo16kPcm(input) {
-	const ratio = input.sampleRate / 16e3;
-	const outLen = Math.max(1, Math.round(input.samples.length / ratio));
-	const out = Buffer.alloc(outLen * 2);
-	for (let i = 0; i < outLen; i++) {
-		const src = i * ratio;
-		const j = Math.floor(src);
-		const frac = src - j;
-		const a = input.samples[j] ?? 0;
-		const b = input.samples[j + 1] ?? a;
-		const v = Math.max(-1, Math.min(1, a + (b - a) * frac));
-		out.writeInt16LE(Math.round(v * 32767), i * 2);
-	}
-	return out;
-}
 /**
-* Word timings via Vosk (alphacephei) — offline, Apache-2.0, ~50 MB model, a
-* real Node binding (no Python, no Docker, no multi-GB download). `vosk` is an
-* OPTIONAL peer: install it (`npm i vosk`) and point at a model
-* (`opts.model` / `VOSK_MODEL`) only if you use this aligner. ASR words are
-* mapped onto the script tokens by `mapAsrToScript`.
+* Word timings via Vosk (alphacephei) — offline ASR, Apache-2.0. Shells out to
+* a `vosk-align` command (the Python `vosk` binding + ffmpeg — deliberately NOT
+* the npm `vosk` package, whose `ffi-napi` native build is broken on modern
+* Node). The command reads any audio and writes
+*   { "words": [ { "word", "start", "end", "conf"? }, … ] }
+* to stdout; its recognized words are LCS-mapped onto the script tokens by
+* `mapAsrToScript`, so mis-recognitions (e.g. an unknown proper noun) just
+* interpolate cleanly between the words around them.
+*
+* Provide the command via `opts.command` / `VOSK_ALIGN` (default `vosk-align`);
+* the model is the command's own concern (its default, or `--model`/VOSK_MODEL),
+* passed through with `opts.model`.
 */
 function voskAligner(opts = {}) {
-	const modelPath = opts.model ?? process.env["VOSK_MODEL"];
-	let vosk = null;
-	const load = async () => {
-		if (vosk) return vosk;
-		try {
-			vosk = await import("vosk");
-		} catch {
-			throw new NarrationError("vosk is not installed — `npm i vosk` and download a model, or use --align heuristic");
-		}
-		vosk.setLogLevel(-1);
-		return vosk;
-	};
+	const command = opts.command ?? process.env["VOSK_ALIGN"] ?? "vosk-align";
 	return {
 		id: "vosk",
-		version: async () => {
-			if (!modelPath) throw new NarrationError("vosk needs a model — set VOSK_MODEL or pass { model } (alphacephei.com/vosk/models)");
-			if (!existsSync(modelPath)) throw new NarrationError(`vosk model not found at ${modelPath}`);
-			await load();
-			return `vosk:${basename(modelPath)}`;
+		version: () => {
+			const r = spawnSync(command, ["--help"], { encoding: "utf8" });
+			if (r.error) {
+				if (r.error.code === "ENOENT") throw new NarrationError(`'${command}' not found — provide a vosk-align command (Apache-2.0 Vosk + ffmpeg, JSON {words:[{word,start,end}]} on stdout), or use --align heuristic`);
+				throw new NarrationError(`could not run ${command}: ${r.error.message}`);
+			}
+			return Promise.resolve(opts.model ? `vosk ${basename(opts.model)}` : "vosk");
 		},
-		align: async (req) => {
-			const v = await load();
-			const model = new v.Model(modelPath);
-			const rec = new v.Recognizer({
-				model,
-				sampleRate: 16e3
-			});
+		align: (req) => {
+			const tag = createHash("sha256").update(req.text).digest("hex").slice(0, 8);
+			const wavPath = join(tmpdir(), `glissade-vosk-${process.pid}-${tag}.wav`);
 			try {
-				rec.setWords(true);
-				rec.acceptWaveform(resampleTo16kPcm(decodeWavMono(req.wav)));
-				return mapAsrToScript(rec.finalResult().result ?? [], req.text);
+				writeFileSync(wavPath, req.wav);
+				const r = spawnSync(command, [wavPath, ...opts.model ? ["--model", opts.model] : []], {
+					encoding: "utf8",
+					maxBuffer: 64 * 1024 * 1024
+				});
+				if (r.error) throw new NarrationError(`${command} failed to run: ${r.error.message}`);
+				if (r.status !== 0) throw new NarrationError(`${command} failed: ${(r.stderr || "").slice(0, 300)}`);
+				const timed = (JSON.parse(r.stdout).words ?? []).filter((w) => typeof w.start === "number" && typeof w.end === "number").map((w) => ({
+					word: w.word,
+					start: w.start,
+					end: w.end
+				}));
+				return Promise.resolve(mapAsrToScript(timed, req.text));
 			} finally {
-				rec.free();
-				model.free();
+				if (existsSync(wavPath)) unlinkSync(wavPath);
 			}
 		}
 	};
@@ -424,9 +398,10 @@ async function synthesizeScript(scriptPath, opts = {}) {
 	const raw = JSON.parse(readFileSync(scriptPath, "utf8"));
 	if (raw.narrationVersion !== 1) throw new NarrationError(`unsupported narrationVersion ${String(raw.narrationVersion)}`);
 	const ids = /* @__PURE__ */ new Set();
-	for (const s of raw.segments) {
-		if (ids.has(s.id)) throw new NarrationError(`duplicate segment id '${s.id}'`);
-		ids.add(s.id);
+	for (const el of raw.segments) {
+		if (ids.has(el.id)) throw new NarrationError(`duplicate narration id '${el.id}'`);
+		ids.add(el.id);
+		if (isPause(el) && !(el.pause > 0)) throw new NarrationError(`pause '${el.id}' needs pause > 0`);
 	}
 	const provider = opts.providerImpl ?? providerById(opts.provider ?? raw.provider ?? "espeak");
 	const providerVersion = await provider.version();
@@ -449,8 +424,22 @@ async function synthesizeScript(scriptPath, opts = {}) {
 	const reused = [];
 	const aligned = [];
 	const segments = [];
+	const pauses = [];
 	let cursor = raw.leadIn ?? 0;
-	for (const seg of raw.segments) {
+	const elements = raw.segments;
+	for (let i = 0; i < elements.length; i++) {
+		const el = elements[i];
+		if (isPause(el)) {
+			pauses.push({
+				id: el.id,
+				start: cursor,
+				duration: el.pause,
+				bed: el.bed ?? "hold"
+			});
+			cursor += el.pause;
+			continue;
+		}
+		const seg = el;
 		const req = { text: seg.text };
 		const voice = seg.voice ?? raw.voice;
 		const rate = seg.rate ?? raw.rate;
@@ -508,16 +497,20 @@ async function synthesizeScript(scriptPath, opts = {}) {
 			end: cursor + w.end
 		}));
 		segments.push(timed);
-		cursor += duration + (seg.gapAfter ?? raw.gap ?? .35);
+		cursor += duration;
+		const next = elements[i + 1];
+		if (next && !isPause(next)) cursor += seg.gapAfter ?? raw.gap ?? .35;
 	}
 	cache.entries = Object.fromEntries(Object.entries(cache.entries).sort(([a], [b]) => a.localeCompare(b)));
 	writeFileSync(cachePath, JSON.stringify(cache, null, 2) + "\n");
+	const ends = [...segments.map((s) => s.start + s.duration), ...pauses.map((p) => p.start + p.duration)];
 	const timing = {
 		timingVersion: 1,
 		provider: provider.id,
 		providerVersion,
-		totalDuration: segments.length > 0 ? segments[segments.length - 1].start + segments[segments.length - 1].duration : 0,
-		segments
+		totalDuration: ends.length > 0 ? Math.max(...ends) : 0,
+		segments,
+		...pauses.length > 0 ? { pauses } : {}
 	};
 	const timingPath = `${base}.narration.timing.json`;
 	writeFileSync(timingPath, JSON.stringify(timing, null, 2) + "\n");
@@ -539,4 +532,4 @@ function scriptPathFor(input) {
 	return candidate;
 }
 //#endregion
-export { alignerById, cacheKey, decodeWavMono, espeakProvider, fakeProvider, heuristicAligner, heuristicWords, interpolateMissing, mapAsrToScript, openaiProvider, piperProvider, providerById, resampleTo16kPcm, scriptPathFor, synthesizeScript, voskAligner, wavDuration };
+export { alignerById, cacheKey, espeakProvider, fakeProvider, heuristicAligner, heuristicWords, interpolateMissing, mapAsrToScript, openaiProvider, piperProvider, providerById, scriptPathFor, synthesizeScript, voskAligner, wavDuration };

package/package.json — CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@glissade/narrate",
-  "version": "0.5.0-pre.0",
+  "version": "0.5.0-pre.2",
   "description": "glissade narration + captions: TTS at prepare time (gs narrate), deterministic caching, narration-anchored timeline beats, and captions as plain tracks. Render stays offline.",
   "license": "Apache-2.0",
   "type": "module",
@@ -19,8 +19,8 @@
     "dist"
   ],
   "dependencies": {
-    "@glissade/core": "0.5.0-pre.0",
-    "@glissade/scene": "0.5.0-pre.0"
+    "@glissade/core": "0.5.0-pre.2",
+    "@glissade/scene": "0.5.0-pre.2"
   },
   "repository": {
     "type": "git",