@glissade/narrate 0.5.0-pre.0 → 0.5.0-pre.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -12,6 +12,29 @@ interface NarrationSegment {
12
12
  /** silence after THIS segment (s); overrides the script default */
13
13
  gapAfter?: number;
14
14
  }
15
+ /** What the music bed does across a pause window. */
16
+ type BedMode = /** hold the current (ducked) level across the pause — no swell, the default */
17
+ 'hold'
18
+ /** cut the bed to a floor for the window (a dramatic silence) */ | 'silence'
19
+ /** let the bed breathe back up to base while the voice rests */ | 'swell';
20
+ /**
21
+ * An explicit silence beat between segments — an addressable WINDOW, not just
22
+ * dead air. It shifts every later segment's start (re-flows on re-narrate) and
23
+ * gives you anchors (`beats.start/end/duration('id')`) to hang visuals and SFX
24
+ * on, plus a per-pause `bed` mode for the music. A pause supplies its own
25
+ * silence, so it suppresses the default inter-segment gap around it.
26
+ */
27
+ interface NarrationPause {
28
+ id: string;
29
+ /** the silence length in seconds */
30
+ pause: number;
31
+ /** what the music bed does across this window; default 'hold' */
32
+ bed?: BedMode;
33
+ }
34
+ /** A script element: a spoken segment or an explicit pause beat. */
35
+ type NarrationElement = NarrationSegment | NarrationPause;
36
+ /** A pause element is the one carrying a numeric `pause` field. */
37
+ declare function isPause(el: NarrationElement): el is NarrationPause;
15
38
  interface NarrationScript {
16
39
  narrationVersion: 1;
17
40
  provider?: string;
@@ -28,7 +51,8 @@ interface NarrationScript {
28
51
  * segments word-less. Providers that supply their own words ignore this.
29
52
  */
30
53
  align?: string;
31
- segments: NarrationSegment[];
54
+ /** spoken segments and explicit pause beats, in playback order */
55
+ segments: NarrationElement[];
32
56
  }
33
57
  interface TimedWord {
34
58
  word: string;
@@ -46,24 +70,35 @@ interface TimedSegment {
46
70
  /** present only when the provider supplies word timestamps */
47
71
  words?: TimedWord[];
48
72
  }
73
+ /** A resolved pause window in the committed manifest. */
74
+ interface TimedPause {
75
+ id: string;
76
+ start: number;
77
+ duration: number;
78
+ bed: BedMode;
79
+ }
49
80
  interface NarrationTiming {
50
81
  timingVersion: 1;
51
82
  provider: string;
52
83
  providerVersion: string;
53
84
  totalDuration: number;
54
85
  segments: TimedSegment[];
86
+ /** explicit pause windows, addressable like segments; omitted when none */
87
+ pauses?: TimedPause[];
55
88
  }
56
89
  declare class NarrationError extends Error {
57
90
  constructor(message: string);
58
91
  }
59
92
  interface NarrationAnchors {
60
- /** segment start, absolute timeline seconds */
93
+ /** segment OR pause start, absolute timeline seconds */
61
94
  start(id: string): number;
62
- /** segment end (start + duration) */
95
+ /** segment OR pause end (start + duration) */
63
96
  end(id: string): number;
64
97
  duration(id: string): number;
98
+ /** start + offset — a sub-beat inside a segment or pause window */
99
+ at(id: string, offset?: number): number;
65
100
  readonly totalDuration: number;
66
- /** '<id>.start' / '<id>.end' labels — merge into the timeline for studio visibility */
101
+ /** '<id>.start' / '<id>.end' labels (segments + pauses) — merge into the timeline for studio visibility */
67
102
  labels(): Record<string, number>;
68
103
  /** narration clips on the existing AudioClip machinery; baseUrl prefixes each file */
69
104
  clips(baseUrl: string): AudioClip[];
@@ -115,11 +150,15 @@ interface DuckOptions {
115
150
  mergeGap?: number;
116
151
  /** the music clip's `at` on the timeline; gain keys are CLIP-local. Default 0. */
117
152
  clipAt?: number;
153
+ /** gain a 'silence' pause ducks the bed to; default 0 (a true cut). */
154
+ silence?: number;
118
155
  }
119
156
  /**
120
157
  * The bed-ducking envelope every narrated video needs: duck windows are the
121
- * narration segments, with attack/release ramps and near-window merging.
122
- * Pure function of the committed manifest — re-narrate and the ducking
158
+ * narration segments, with attack/release ramps and near-window merging. Pause
159
+ * beats join in by their `bed` mode — `hold` (default) keeps the bed ducked
160
+ * across the pause, `silence` cuts it to a floor, `swell` lets it breathe back
161
+ * to base. Pure function of the committed manifest — re-narrate and the ducking
123
162
  * re-flows. Returns a keys-only gain envelope for AudioClip.gain.
124
163
  */
125
164
  declare function duckEnvelope(timing: NarrationTiming, opts?: DuckOptions): {
@@ -184,4 +223,4 @@ declare function music(timing: MusicTiming, at?: number): MusicAnchors;
184
223
  declare function toSrt(timing: NarrationTiming): string;
185
224
  declare function toVtt(timing: NarrationTiming): string;
186
225
  //#endregion
187
- export { CaptionStyle, CaptionTrackOptions, DuckOptions, MusicAnchors, MusicClipOptions, MusicTiming, NarrationAnchors, NarrationError, NarrationScript, NarrationSegment, NarrationTiming, TimedSegment, TimedWord, captionNode, captionTrack, duckEnvelope, music, narration, toSrt, toVtt, validateMusicTiming };
226
+ export { BedMode, CaptionStyle, CaptionTrackOptions, DuckOptions, MusicAnchors, MusicClipOptions, MusicTiming, NarrationAnchors, NarrationElement, NarrationError, NarrationPause, NarrationScript, NarrationSegment, NarrationTiming, TimedPause, TimedSegment, TimedWord, captionNode, captionTrack, duckEnvelope, isPause, music, narration, toSrt, toVtt, validateMusicTiming };
package/dist/index.js CHANGED
@@ -8,6 +8,10 @@ import { Text, glow } from "@glissade/scene";
8
8
  * offline and deterministic. Captions are a plain string track driving a
9
9
  * Text node — they live in the timeline JSON and golden-frame CI covers them.
10
10
  */
11
+ /** A pause element is the one carrying a numeric `pause` field. */
12
+ function isPause(el) {
13
+ return typeof el.pause === "number";
14
+ }
11
15
  var NarrationError = class extends Error {
12
16
  constructor(message) {
13
17
  super(message);
@@ -15,22 +19,37 @@ var NarrationError = class extends Error {
15
19
  }
16
20
  };
17
21
  function narration(timing) {
18
- const byId = new Map(timing.segments.map((s) => [s.id, s]));
19
- const seg = (id) => {
20
- const s = byId.get(id);
21
- if (!s) throw new NarrationError(`no narration segment '${id}' (have: ${[...byId.keys()].join(", ")})`);
22
- return s;
22
+ const byId = /* @__PURE__ */ new Map();
23
+ for (const s of timing.segments) {
24
+ if (byId.has(s.id)) throw new NarrationError(`duplicate narration id '${s.id}'`);
25
+ byId.set(s.id, {
26
+ start: s.start,
27
+ duration: s.duration
28
+ });
29
+ }
30
+ for (const p of timing.pauses ?? []) {
31
+ if (byId.has(p.id)) throw new NarrationError(`duplicate narration id '${p.id}' (segment and pause collide)`);
32
+ byId.set(p.id, {
33
+ start: p.start,
34
+ duration: p.duration
35
+ });
36
+ }
37
+ const beat = (id) => {
38
+ const b = byId.get(id);
39
+ if (!b) throw new NarrationError(`no narration beat '${id}' (have: ${[...byId.keys()].join(", ")})`);
40
+ return b;
23
41
  };
24
42
  return {
25
- start: (id) => seg(id).start,
26
- end: (id) => seg(id).start + seg(id).duration,
27
- duration: (id) => seg(id).duration,
43
+ start: (id) => beat(id).start,
44
+ end: (id) => beat(id).start + beat(id).duration,
45
+ duration: (id) => beat(id).duration,
46
+ at: (id, offset = 0) => beat(id).start + offset,
28
47
  totalDuration: timing.totalDuration,
29
48
  labels: () => {
30
49
  const out = {};
31
- for (const s of timing.segments) {
32
- out[`${s.id}.start`] = s.start;
33
- out[`${s.id}.end`] = s.start + s.duration;
50
+ for (const [id, b] of byId) {
51
+ out[`${id}.start`] = b.start;
52
+ out[`${id}.end`] = b.start + b.duration;
34
53
  }
35
54
  return out;
36
55
  },
@@ -89,8 +108,10 @@ function captionNode(size, style = {}) {
89
108
  }
90
109
  /**
91
110
  * The bed-ducking envelope every narrated video needs: duck windows are the
92
- * narration segments, with attack/release ramps and near-window merging.
93
- * Pure function of the committed manifest — re-narrate and the ducking
111
+ * narration segments, with attack/release ramps and near-window merging. Pause
112
+ * beats join in by their `bed` mode — `hold` (default) keeps the bed ducked
113
+ * across the pause, `silence` cuts it to a floor, `swell` lets it breathe back
114
+ * to base. Pure function of the committed manifest — re-narrate and the ducking
94
115
  * re-flows. Returns a keys-only gain envelope for AudioClip.gain.
95
116
  */
96
117
  function duckEnvelope(timing, opts = {}) {
@@ -100,31 +121,83 @@ function duckEnvelope(timing, opts = {}) {
100
121
  const release = opts.release ?? .4;
101
122
  const mergeGap = opts.mergeGap ?? .5;
102
123
  const clipAt = opts.clipAt ?? 0;
103
- const windows = [];
104
- for (const s of [...timing.segments].sort((a, b) => a.start - b.start)) {
105
- const last = windows[windows.length - 1];
106
- if (last && s.start - last.end < attack + release + mergeGap) last.end = Math.max(last.end, s.start + s.duration);
107
- else windows.push({
108
- start: s.start,
109
- end: s.start + s.duration
124
+ const silence = opts.silence ?? 0;
125
+ const levelOf = (bed) => bed === "silence" ? silence : bed === "swell" ? base : duck;
126
+ const raw = [...timing.segments.map((s) => ({
127
+ start: s.start,
128
+ end: s.start + s.duration,
129
+ level: duck
130
+ })), ...(timing.pauses ?? []).map((p) => ({
131
+ start: p.start,
132
+ end: p.start + p.duration,
133
+ level: levelOf(p.bed)
134
+ }))].sort((a, b) => a.start - b.start);
135
+ const merged = [];
136
+ for (const w of raw) {
137
+ const last = merged[merged.length - 1];
138
+ if (last && last.level === w.level && w.start - last.end < attack + release + mergeGap) last.end = Math.max(last.end, w.end);
139
+ else merged.push({ ...w });
140
+ }
141
+ const active = merged.filter((w) => w.level !== base);
142
+ if (active.length === 0) return { keys: [key(0, base)] };
143
+ const regions = [];
144
+ for (const w of active) {
145
+ const prev = regions[regions.length - 1];
146
+ if (prev && w.start > prev.end) regions.push({
147
+ start: prev.end,
148
+ end: w.start,
149
+ level: base
110
150
  });
151
+ regions.push({ ...w });
152
+ }
153
+ const transitions = [{
154
+ t: regions[0].start,
155
+ from: base,
156
+ to: regions[0].level
157
+ }];
158
+ for (let i = 0; i < regions.length - 1; i++) if (regions[i].level !== regions[i + 1].level) transitions.push({
159
+ t: regions[i].end,
160
+ from: regions[i].level,
161
+ to: regions[i + 1].level
162
+ });
163
+ const lastRegion = regions[regions.length - 1];
164
+ transitions.push({
165
+ t: lastRegion.end,
166
+ from: lastRegion.level,
167
+ to: base
168
+ });
169
+ let keys = [];
170
+ for (const tr of transitions) {
171
+ if (tr.to === tr.from) continue;
172
+ if (tr.to < tr.from) keys.push(key(tr.t - attack, tr.from), key(tr.t, tr.to));
173
+ else keys.push(key(tr.t, tr.from), key(tr.t + release, tr.to));
111
174
  }
112
- const keys = [];
113
- for (const w of windows) {
114
- const rampStart = w.start - attack - clipAt;
115
- const down = w.start - clipAt;
116
- const up = w.end - clipAt;
117
- const rampEnd = w.end + release - clipAt;
118
- if (rampEnd <= 0) continue;
119
- if (rampStart > 0) keys.push(key(rampStart, base));
120
- if (down > 0) keys.push(key(down, duck));
121
- else if (keys.length === 0) keys.push(key(0, duck));
122
- keys.push(key(Math.max(up, 1e-6), duck));
123
- keys.push(key(rampEnd, base));
175
+ keys = keys.map((k) => ({
176
+ t: k.t - clipAt,
177
+ value: k.value
178
+ })).sort((a, b) => a.t - b.t);
179
+ const ordered = [];
180
+ for (const k of keys) {
181
+ const prev = ordered[ordered.length - 1];
182
+ if (prev && k.t <= prev.t) prev.value = k.value;
183
+ else ordered.push(k);
184
+ }
185
+ const out = [];
186
+ for (let i = 0; i < ordered.length; i++) {
187
+ const k = ordered[i];
188
+ if (k.t < 0) {
189
+ const next = ordered[i + 1];
190
+ if (!next || next.t >= 0) {
191
+ const v = next && next.t > k.t ? k.value + (next.value - k.value) * ((0 - k.t) / (next.t - k.t)) : k.value;
192
+ out.push(key(0, v));
193
+ }
194
+ continue;
195
+ }
196
+ out.push(k);
124
197
  }
125
- if (keys.length === 0) keys.push(key(0, base));
126
- if (keys[0].t > 0) keys.unshift(key(0, base));
127
- return { keys };
198
+ if (out.length === 0) out.push(key(0, base));
199
+ if (out[0].t > 0) out.unshift(key(0, base));
200
+ return { keys: out };
128
201
  }
129
202
  function validateMusicTiming(timing) {
130
203
  if (timing.musicVersion !== 1) throw new NarrationError(`unsupported musicVersion ${String(timing.musicVersion)}`);
@@ -191,4 +264,4 @@ function toVtt(timing) {
191
264
  return "WEBVTT\n\n" + timing.segments.map((s) => `${srtTime(s.start, ".")} --> ${srtTime(s.start + s.duration, ".")}\n${s.text}`).join("\n\n") + "\n";
192
265
  }
193
266
  //#endregion
194
- export { NarrationError, captionNode, captionTrack, duckEnvelope, music, narration, toSrt, toVtt, validateMusicTiming };
267
+ export { NarrationError, captionNode, captionTrack, duckEnvelope, isPause, music, narration, toSrt, toVtt, validateMusicTiming };
@@ -41,9 +41,19 @@ declare function openaiProvider(opts?: {
41
41
  * offline. Needs a voice MODEL (`.onnx` + sibling `.onnx.json`) — pass its
42
42
  * path as `model`, or per-segment as `voice`. Emits no word timestamps; the
43
43
  * alignment step (below) fills them in.
44
+ *
45
+ * DETERMINISTIC by default: VITS adds noise (generator + the stochastic
46
+ * duration predictor), so the same text re-synthesizes to slightly different
47
+ * audio/durations. glissade zeroes both noise scales so re-synth is
48
+ * byte-identical — reproducible pipelines, glissade's determinism contract.
49
+ * For piper's more-natural (but drifting) prosody, pass its defaults
50
+ * (`{ noiseScale: 0.667, noiseWScale: 0.8 }`) and wire via `providerImpl`.
51
+ * The noise mode is part of `version()`, so changing it invalidates the cache.
44
52
  */
45
53
  declare function piperProvider(opts?: {
46
54
  model?: string;
55
+ noiseScale?: number;
56
+ noiseWScale?: number;
47
57
  }): TtsProvider;
48
58
  declare function providerById(id: string): TtsProvider;
49
59
  interface AlignRequest {
@@ -111,23 +121,29 @@ declare function mapAsrToScript(timed: {
111
121
  start: number;
112
122
  end: number;
113
123
  }[];
114
- interface WavMono {
115
- /** mono samples in [-1, 1] */
116
- samples: Float32Array;
117
- sampleRate: number;
124
+ /** one word from vosk-align's JSON output */
125
+ interface VoskAlignWord {
126
+ word: string;
127
+ start: number;
128
+ end: number;
129
+ conf?: number;
118
130
  }
119
- /** Decode a 16-bit PCM RIFF/WAV to mono float samples (channels averaged). */
120
- declare function decodeWavMono(wav: Buffer): WavMono;
121
- /** Linear-resample mono float to a 16 kHz int16 LE PCM buffer (Vosk's input). */
122
- declare function resampleTo16kPcm(input: WavMono): Buffer;
123
131
  /**
124
- * Word timings via Vosk (alphacephei) — offline, Apache-2.0, ~50 MB model, a
125
- * real Node binding (no Python, no Docker, no multi-GB download). `vosk` is an
126
- * OPTIONAL peer: install it (`npm i vosk`) and point at a model
127
- * (`opts.model` / `VOSK_MODEL`) only if you use this aligner. ASR words are
128
- * mapped onto the script tokens by `mapAsrToScript`.
132
+ * Word timings via Vosk (alphacephei) — offline ASR, Apache-2.0. Shells out to
133
+ * a `vosk-align` command (the Python `vosk` binding + ffmpeg — deliberately NOT
134
+ * the npm `vosk` package, whose `ffi-napi` native build is broken on modern
135
+ * Node). The command reads any audio and writes
136
+ * { "words": [ { "word", "start", "end", "conf"? }, … ] }
137
+ * to stdout; its recognized words are LCS-mapped onto the script tokens by
138
+ * `mapAsrToScript`, so mis-recognitions (e.g. an unknown proper noun) just
139
+ * interpolate cleanly between the words around them.
140
+ *
141
+ * Provide the command via `opts.command` / `VOSK_ALIGN` (default `vosk-align`);
142
+ * the model is the command's own concern (its default, or `--model`/VOSK_MODEL),
143
+ * passed through with `opts.model`.
129
144
  */
130
145
  declare function voskAligner(opts?: {
146
+ command?: string;
131
147
  model?: string;
132
148
  }): Aligner;
133
149
  /** Resolve an aligner id; 'none' disables alignment (word-less segments). */
@@ -170,4 +186,4 @@ declare function synthesizeScript(scriptPath: string, opts?: SynthesizeOptions):
170
186
  /** Resolve `<scene>.narration.json` for a scene-module path (or accept the script itself). */
171
187
  declare function scriptPathFor(input: string): string;
172
188
  //#endregion
173
- export { AlignRequest, Aligner, SynthesizeOptions, SynthesizeResult, TtsProvider, TtsRequest, TtsResult, alignerById, cacheKey, decodeWavMono, espeakProvider, fakeProvider, heuristicAligner, heuristicWords, interpolateMissing, mapAsrToScript, openaiProvider, piperProvider, providerById, resampleTo16kPcm, scriptPathFor, synthesizeScript, voskAligner, wavDuration };
189
+ export { AlignRequest, Aligner, SynthesizeOptions, SynthesizeResult, TtsProvider, TtsRequest, TtsResult, VoskAlignWord, alignerById, cacheKey, espeakProvider, fakeProvider, heuristicAligner, heuristicWords, interpolateMissing, mapAsrToScript, openaiProvider, piperProvider, providerById, scriptPathFor, synthesizeScript, voskAligner, wavDuration };
package/dist/providers.js CHANGED
@@ -1,4 +1,4 @@
1
- import { NarrationError } from "./index.js";
1
+ import { NarrationError, isPause } from "./index.js";
2
2
  import { createHash } from "node:crypto";
3
3
  import { existsSync, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
4
4
  import { basename, dirname, join } from "node:path";
@@ -138,15 +138,34 @@ function openaiProvider(opts = {}) {
138
138
  * offline. Needs a voice MODEL (`.onnx` + sibling `.onnx.json`) — pass its
139
139
  * path as `model`, or per-segment as `voice`. Emits no word timestamps; the
140
140
  * alignment step (below) fills them in.
141
+ *
142
+ * DETERMINISTIC by default: VITS adds noise (generator + the stochastic
143
+ * duration predictor), so the same text re-synthesizes to slightly different
144
+ * audio/durations. glissade zeroes both noise scales so re-synth is
145
+ * byte-identical — reproducible pipelines, glissade's determinism contract.
146
+ * For piper's more-natural (but drifting) prosody, pass its defaults
147
+ * (`{ noiseScale: 0.667, noiseWScale: 0.8 }`) and wire via `providerImpl`.
148
+ * The noise mode is part of `version()`, so changing it invalidates the cache.
141
149
  */
142
150
  function piperProvider(opts = {}) {
151
+ const noiseScale = opts.noiseScale ?? 0;
152
+ const noiseWScale = opts.noiseWScale ?? 0;
143
153
  return {
144
154
  id: "piper",
145
155
  version: () => {
146
156
  const r = spawnSync("piper", ["--version"], { encoding: "utf8" });
147
- if (r.status !== 0) throw new NarrationError("piper not found on PATH — install rhasspy/piper, or use --provider fake/espeak/openai");
148
- const v = (r.stdout.trim() || r.stderr.trim() || "piper").split("\n")[0];
149
- return Promise.resolve(opts.model ? `${v} ${basename(opts.model)}` : v);
157
+ if (r.error) {
158
+ if (r.error.code === "ENOENT") throw new NarrationError("piper not found on PATH — `pip install piper-tts` (or the standalone rhasspy/piper), or use --provider fake/espeak/openai");
159
+ throw new NarrationError(`could not run piper: ${r.error.message}`);
160
+ }
161
+ const m = /\b\d+\.\d+\.\d+\b/.exec(r.stdout ?? "");
162
+ const noise = `noise=${noiseScale}/${noiseWScale}`;
163
+ const v = m ? `piper ${m[0]}` : "piper";
164
+ return Promise.resolve([
165
+ v,
166
+ noise,
167
+ opts.model ? basename(opts.model) : null
168
+ ].filter(Boolean).join(" "));
150
169
  },
151
170
  synthesize: (req) => {
152
171
  const model = req.voice ?? opts.model;
@@ -157,7 +176,11 @@ function piperProvider(opts = {}) {
157
176
  "--model",
158
177
  model,
159
178
  "--output_file",
160
- out
179
+ out,
180
+ "--noise-scale",
181
+ String(noiseScale),
182
+ "--noise-w-scale",
183
+ String(noiseWScale)
161
184
  ];
162
185
  if (req.rate !== void 0 && req.rate > 0) args.push("--length_scale", String(1 / req.rate));
163
186
  const r = spawnSync("piper", args, {
@@ -299,100 +322,51 @@ function mapAsrToScript(timed, scriptText) {
299
322
  };
300
323
  }));
301
324
  }
302
- /** Decode a 16-bit PCM RIFF/WAV to mono float samples (channels averaged). */
303
- function decodeWavMono(wav) {
304
- if (wav.length < 44 || wav.toString("ascii", 0, 4) !== "RIFF" || wav.toString("ascii", 8, 12) !== "WAVE") throw new NarrationError("not a RIFF/WAVE file");
305
- let channels = 1;
306
- let sampleRate = 16e3;
307
- let bits = 16;
308
- let dataOffset = -1;
309
- let dataSize = 0;
310
- let offset = 12;
311
- while (offset + 8 <= wav.length) {
312
- const id = wav.toString("ascii", offset, offset + 4);
313
- const size = wav.readUInt32LE(offset + 4);
314
- if (id === "fmt ") {
315
- channels = wav.readUInt16LE(offset + 10);
316
- sampleRate = wav.readUInt32LE(offset + 12);
317
- bits = wav.readUInt16LE(offset + 22);
318
- } else if (id === "data") {
319
- dataOffset = offset + 8;
320
- dataSize = size;
321
- }
322
- offset += 8 + size + size % 2;
323
- }
324
- if (bits !== 16) throw new NarrationError(`only 16-bit PCM WAV is supported (got ${bits}-bit)`);
325
- if (dataOffset < 0) throw new NarrationError("WAV has no data chunk");
326
- const frames = Math.floor(dataSize / 2 / Math.max(1, channels));
327
- const samples = new Float32Array(frames);
328
- for (let f = 0; f < frames; f++) {
329
- let acc = 0;
330
- for (let c = 0; c < channels; c++) acc += wav.readInt16LE(dataOffset + (f * channels + c) * 2);
331
- samples[f] = acc / channels / 32768;
332
- }
333
- return {
334
- samples,
335
- sampleRate
336
- };
337
- }
338
- /** Linear-resample mono float to a 16 kHz int16 LE PCM buffer (Vosk's input). */
339
- function resampleTo16kPcm(input) {
340
- const ratio = input.sampleRate / 16e3;
341
- const outLen = Math.max(1, Math.round(input.samples.length / ratio));
342
- const out = Buffer.alloc(outLen * 2);
343
- for (let i = 0; i < outLen; i++) {
344
- const src = i * ratio;
345
- const j = Math.floor(src);
346
- const frac = src - j;
347
- const a = input.samples[j] ?? 0;
348
- const b = input.samples[j + 1] ?? a;
349
- const v = Math.max(-1, Math.min(1, a + (b - a) * frac));
350
- out.writeInt16LE(Math.round(v * 32767), i * 2);
351
- }
352
- return out;
353
- }
354
325
  /**
355
- * Word timings via Vosk (alphacephei) — offline, Apache-2.0, ~50 MB model, a
356
- * real Node binding (no Python, no Docker, no multi-GB download). `vosk` is an
357
- * OPTIONAL peer: install it (`npm i vosk`) and point at a model
358
- * (`opts.model` / `VOSK_MODEL`) only if you use this aligner. ASR words are
359
- * mapped onto the script tokens by `mapAsrToScript`.
326
+ * Word timings via Vosk (alphacephei) — offline ASR, Apache-2.0. Shells out to
327
+ * a `vosk-align` command (the Python `vosk` binding + ffmpeg — deliberately NOT
328
+ * the npm `vosk` package, whose `ffi-napi` native build is broken on modern
329
+ * Node). The command reads any audio and writes
330
+ * { "words": [ { "word", "start", "end", "conf"? }, … ] }
331
+ * to stdout; its recognized words are LCS-mapped onto the script tokens by
332
+ * `mapAsrToScript`, so mis-recognitions (e.g. an unknown proper noun) just
333
+ * interpolate cleanly between the words around them.
334
+ *
335
+ * Provide the command via `opts.command` / `VOSK_ALIGN` (default `vosk-align`);
336
+ * the model is the command's own concern (its default, or `--model`/VOSK_MODEL),
337
+ * passed through with `opts.model`.
360
338
  */
361
339
  function voskAligner(opts = {}) {
362
- const modelPath = opts.model ?? process.env["VOSK_MODEL"];
363
- let vosk = null;
364
- const load = async () => {
365
- if (vosk) return vosk;
366
- try {
367
- vosk = await import("vosk");
368
- } catch {
369
- throw new NarrationError("vosk is not installed — `npm i vosk` and download a model, or use --align heuristic");
370
- }
371
- vosk.setLogLevel(-1);
372
- return vosk;
373
- };
340
+ const command = opts.command ?? process.env["VOSK_ALIGN"] ?? "vosk-align";
374
341
  return {
375
342
  id: "vosk",
376
- version: async () => {
377
- if (!modelPath) throw new NarrationError("vosk needs a model — set VOSK_MODEL or pass { model } (alphacephei.com/vosk/models)");
378
- if (!existsSync(modelPath)) throw new NarrationError(`vosk model not found at ${modelPath}`);
379
- await load();
380
- return `vosk:${basename(modelPath)}`;
343
+ version: () => {
344
+ const r = spawnSync(command, ["--help"], { encoding: "utf8" });
345
+ if (r.error) {
346
+ if (r.error.code === "ENOENT") throw new NarrationError(`'${command}' not found — provide a vosk-align command (Apache-2.0 Vosk + ffmpeg, JSON {words:[{word,start,end}]} on stdout), or use --align heuristic`);
347
+ throw new NarrationError(`could not run ${command}: ${r.error.message}`);
348
+ }
349
+ return Promise.resolve(opts.model ? `vosk ${basename(opts.model)}` : "vosk");
381
350
  },
382
- align: async (req) => {
383
- const v = await load();
384
- const model = new v.Model(modelPath);
385
- const rec = new v.Recognizer({
386
- model,
387
- sampleRate: 16e3
388
- });
351
+ align: (req) => {
352
+ const tag = createHash("sha256").update(req.text).digest("hex").slice(0, 8);
353
+ const wavPath = join(tmpdir(), `glissade-vosk-${process.pid}-${tag}.wav`);
389
354
  try {
390
- rec.setWords(true);
391
- rec.acceptWaveform(resampleTo16kPcm(decodeWavMono(req.wav)));
392
- return mapAsrToScript(rec.finalResult().result ?? [], req.text);
355
+ writeFileSync(wavPath, req.wav);
356
+ const r = spawnSync(command, [wavPath, ...opts.model ? ["--model", opts.model] : []], {
357
+ encoding: "utf8",
358
+ maxBuffer: 64 * 1024 * 1024
359
+ });
360
+ if (r.error) throw new NarrationError(`${command} failed to run: ${r.error.message}`);
361
+ if (r.status !== 0) throw new NarrationError(`${command} failed: ${(r.stderr || "").slice(0, 300)}`);
362
+ const timed = (JSON.parse(r.stdout).words ?? []).filter((w) => typeof w.start === "number" && typeof w.end === "number").map((w) => ({
363
+ word: w.word,
364
+ start: w.start,
365
+ end: w.end
366
+ }));
367
+ return Promise.resolve(mapAsrToScript(timed, req.text));
393
368
  } finally {
394
- rec.free();
395
- model.free();
369
+ if (existsSync(wavPath)) unlinkSync(wavPath);
396
370
  }
397
371
  }
398
372
  };
@@ -424,9 +398,10 @@ async function synthesizeScript(scriptPath, opts = {}) {
424
398
  const raw = JSON.parse(readFileSync(scriptPath, "utf8"));
425
399
  if (raw.narrationVersion !== 1) throw new NarrationError(`unsupported narrationVersion ${String(raw.narrationVersion)}`);
426
400
  const ids = /* @__PURE__ */ new Set();
427
- for (const s of raw.segments) {
428
- if (ids.has(s.id)) throw new NarrationError(`duplicate segment id '${s.id}'`);
429
- ids.add(s.id);
401
+ for (const el of raw.segments) {
402
+ if (ids.has(el.id)) throw new NarrationError(`duplicate narration id '${el.id}'`);
403
+ ids.add(el.id);
404
+ if (isPause(el) && !(el.pause > 0)) throw new NarrationError(`pause '${el.id}' needs pause > 0`);
430
405
  }
431
406
  const provider = opts.providerImpl ?? providerById(opts.provider ?? raw.provider ?? "espeak");
432
407
  const providerVersion = await provider.version();
@@ -449,8 +424,22 @@ async function synthesizeScript(scriptPath, opts = {}) {
449
424
  const reused = [];
450
425
  const aligned = [];
451
426
  const segments = [];
427
+ const pauses = [];
452
428
  let cursor = raw.leadIn ?? 0;
453
- for (const seg of raw.segments) {
429
+ const elements = raw.segments;
430
+ for (let i = 0; i < elements.length; i++) {
431
+ const el = elements[i];
432
+ if (isPause(el)) {
433
+ pauses.push({
434
+ id: el.id,
435
+ start: cursor,
436
+ duration: el.pause,
437
+ bed: el.bed ?? "hold"
438
+ });
439
+ cursor += el.pause;
440
+ continue;
441
+ }
442
+ const seg = el;
454
443
  const req = { text: seg.text };
455
444
  const voice = seg.voice ?? raw.voice;
456
445
  const rate = seg.rate ?? raw.rate;
@@ -508,16 +497,20 @@ async function synthesizeScript(scriptPath, opts = {}) {
508
497
  end: cursor + w.end
509
498
  }));
510
499
  segments.push(timed);
511
- cursor += duration + (seg.gapAfter ?? raw.gap ?? .35);
500
+ cursor += duration;
501
+ const next = elements[i + 1];
502
+ if (next && !isPause(next)) cursor += seg.gapAfter ?? raw.gap ?? .35;
512
503
  }
513
504
  cache.entries = Object.fromEntries(Object.entries(cache.entries).sort(([a], [b]) => a.localeCompare(b)));
514
505
  writeFileSync(cachePath, JSON.stringify(cache, null, 2) + "\n");
506
+ const ends = [...segments.map((s) => s.start + s.duration), ...pauses.map((p) => p.start + p.duration)];
515
507
  const timing = {
516
508
  timingVersion: 1,
517
509
  provider: provider.id,
518
510
  providerVersion,
519
- totalDuration: segments.length > 0 ? segments[segments.length - 1].start + segments[segments.length - 1].duration : 0,
520
- segments
511
+ totalDuration: ends.length > 0 ? Math.max(...ends) : 0,
512
+ segments,
513
+ ...pauses.length > 0 ? { pauses } : {}
521
514
  };
522
515
  const timingPath = `${base}.narration.timing.json`;
523
516
  writeFileSync(timingPath, JSON.stringify(timing, null, 2) + "\n");
@@ -539,4 +532,4 @@ function scriptPathFor(input) {
539
532
  return candidate;
540
533
  }
541
534
  //#endregion
542
- export { alignerById, cacheKey, decodeWavMono, espeakProvider, fakeProvider, heuristicAligner, heuristicWords, interpolateMissing, mapAsrToScript, openaiProvider, piperProvider, providerById, resampleTo16kPcm, scriptPathFor, synthesizeScript, voskAligner, wavDuration };
535
+ export { alignerById, cacheKey, espeakProvider, fakeProvider, heuristicAligner, heuristicWords, interpolateMissing, mapAsrToScript, openaiProvider, piperProvider, providerById, scriptPathFor, synthesizeScript, voskAligner, wavDuration };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@glissade/narrate",
3
- "version": "0.5.0-pre.0",
3
+ "version": "0.5.0-pre.2",
4
4
  "description": "glissade narration + captions: TTS at prepare time (gs narrate), deterministic caching, narration-anchored timeline beats, and captions as plain tracks. Render stays offline.",
5
5
  "license": "Apache-2.0",
6
6
  "type": "module",
@@ -19,8 +19,8 @@
19
19
  "dist"
20
20
  ],
21
21
  "dependencies": {
22
- "@glissade/core": "0.5.0-pre.0",
23
- "@glissade/scene": "0.5.0-pre.0"
22
+ "@glissade/core": "0.5.0-pre.2",
23
+ "@glissade/scene": "0.5.0-pre.2"
24
24
  },
25
25
  "repository": {
26
26
  "type": "git",