@glissade/narrate 0.4.5 → 0.5.0-pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +7 -0
- package/dist/providers.d.ts +114 -2
- package/dist/providers.js +257 -9
- package/package.json +3 -3
package/dist/index.d.ts
CHANGED
|
@@ -21,6 +21,13 @@ interface NarrationScript {
|
|
|
21
21
|
gap?: number;
|
|
22
22
|
/** silence before the first segment (s); default 0 */
|
|
23
23
|
leadIn?: number;
|
|
24
|
+
/**
|
|
25
|
+
* Word-timing aligner for providers that don't emit word timestamps
|
|
26
|
+
* (espeak / openai / piper). 'heuristic' (default) estimates from text;
|
|
27
|
+
* 'vosk' derives real timings from the audio (offline ASR); 'none' leaves
|
|
28
|
+
* segments word-less. Providers that supply their own words ignore this.
|
|
29
|
+
*/
|
|
30
|
+
align?: string;
|
|
24
31
|
segments: NarrationSegment[];
|
|
25
32
|
}
|
|
26
33
|
interface TimedWord {
|
package/dist/providers.d.ts
CHANGED
|
@@ -36,10 +36,118 @@ declare function espeakProvider(): TtsProvider;
|
|
|
36
36
|
declare function openaiProvider(opts?: {
|
|
37
37
|
model?: string;
|
|
38
38
|
}): TtsProvider;
|
|
39
|
+
/**
|
|
40
|
+
* VITS-based local TTS: far more natural than espeak, runs on CPU, fully
|
|
41
|
+
* offline. Needs a voice MODEL (`.onnx` + sibling `.onnx.json`) — pass its
|
|
42
|
+
* path as `model`, or per-segment as `voice`. Emits no word timestamps; the
|
|
43
|
+
* alignment step (below) fills them in.
|
|
44
|
+
*/
|
|
45
|
+
declare function piperProvider(opts?: {
|
|
46
|
+
model?: string;
|
|
47
|
+
}): TtsProvider;
|
|
39
48
|
declare function providerById(id: string): TtsProvider;
|
|
49
|
+
interface AlignRequest {
|
|
50
|
+
/** the synthesized RIFF/WAV bytes */
|
|
51
|
+
wav: Buffer;
|
|
52
|
+
/** the spoken text (the segment text) */
|
|
53
|
+
text: string;
|
|
54
|
+
}
|
|
55
|
+
/**
|
|
56
|
+
* Turns (audio, known text) into per-word timings — the provider-independent
|
|
57
|
+
* way to get word timestamps. Runs ONLY in the prepare step (heavy work is
|
|
58
|
+
* fine; it runs once and the result is cached). Same three-member shape as
|
|
59
|
+
* TtsProvider: `version()` participates in the cache so swapping aligners
|
|
60
|
+
* re-aligns the CACHED wav without re-synthesizing.
|
|
61
|
+
*/
|
|
62
|
+
interface Aligner {
|
|
63
|
+
readonly id: string;
|
|
64
|
+
version(): Promise<string>;
|
|
65
|
+
align(req: AlignRequest): Promise<{
|
|
66
|
+
word: string;
|
|
67
|
+
start: number;
|
|
68
|
+
end: number;
|
|
69
|
+
}[]>;
|
|
70
|
+
}
|
|
71
|
+
/**
|
|
72
|
+
* Distribute words across the clip by estimated spoken length (syllables, not
|
|
73
|
+
* characters — closer to real timing). Pure, deterministic, zero-dependency:
|
|
74
|
+
* the always-available floor. Good enough for captions; karaoke on a very
|
|
75
|
+
* slow/fast word wants a real aligner.
|
|
76
|
+
*/
|
|
77
|
+
declare function heuristicWords(text: string, duration: number): {
|
|
78
|
+
word: string;
|
|
79
|
+
start: number;
|
|
80
|
+
end: number;
|
|
81
|
+
}[];
|
|
82
|
+
declare function heuristicAligner(): Aligner;
|
|
83
|
+
/**
|
|
84
|
+
* Fill words whose start/end are NaN by linear interpolation between their
|
|
85
|
+
* known neighbours (edges clamp). Keeps the result monotonic. Used after
|
|
86
|
+
* mapping when some script words got no timing.
|
|
87
|
+
*/
|
|
88
|
+
declare function interpolateMissing(words: {
|
|
89
|
+
word: string;
|
|
90
|
+
start: number;
|
|
91
|
+
end: number;
|
|
92
|
+
}[]): {
|
|
93
|
+
word: string;
|
|
94
|
+
start: number;
|
|
95
|
+
end: number;
|
|
96
|
+
}[];
|
|
97
|
+
/**
|
|
98
|
+
* Transfer timed words (from an aligner) onto the script's own word tokens.
|
|
99
|
+
* Forced aligners return near-identical words; ASR (whisper) can differ
|
|
100
|
+
* (numbers spelled out, punctuation), so we LCS-align the normalized
|
|
101
|
+
* sequences and interpolate script words the aligner didn't time. Output
|
|
102
|
+
* length === script word count, in script order — what `wordBoxes()` indexes
|
|
103
|
+
* against. If nothing matched, distribute by syllable over the timed span.
|
|
104
|
+
*/
|
|
105
|
+
declare function mapAsrToScript(timed: {
|
|
106
|
+
word: string;
|
|
107
|
+
start: number;
|
|
108
|
+
end: number;
|
|
109
|
+
}[], scriptText: string): {
|
|
110
|
+
word: string;
|
|
111
|
+
start: number;
|
|
112
|
+
end: number;
|
|
113
|
+
}[];
|
|
114
|
+
/** one word from vosk-align's JSON output */
|
|
115
|
+
interface VoskAlignWord {
|
|
116
|
+
word: string;
|
|
117
|
+
start: number;
|
|
118
|
+
end: number;
|
|
119
|
+
conf?: number;
|
|
120
|
+
}
|
|
121
|
+
/**
|
|
122
|
+
* Word timings via Vosk (alphacephei) — offline ASR, Apache-2.0. Shells out to
|
|
123
|
+
* a `vosk-align` command (the Python `vosk` binding + ffmpeg — deliberately NOT
|
|
124
|
+
* the npm `vosk` package, whose `ffi-napi` native build is broken on modern
|
|
125
|
+
* Node). The command reads any audio and writes
|
|
126
|
+
* { "words": [ { "word", "start", "end", "conf"? }, … ] }
|
|
127
|
+
* to stdout; its recognized words are LCS-mapped onto the script tokens by
|
|
128
|
+
* `mapAsrToScript`, so mis-recognitions (e.g. an unknown proper noun) just
|
|
129
|
+
* interpolate cleanly between the words around them.
|
|
130
|
+
*
|
|
131
|
+
* Provide the command via `opts.command` / `VOSK_ALIGN` (default `vosk-align`);
|
|
132
|
+
* the model is the command's own concern (its default, or `--model`/VOSK_MODEL),
|
|
133
|
+
* passed through with `opts.model`.
|
|
134
|
+
*/
|
|
135
|
+
declare function voskAligner(opts?: {
|
|
136
|
+
command?: string;
|
|
137
|
+
model?: string;
|
|
138
|
+
}): Aligner;
|
|
139
|
+
/** Resolve an aligner id; 'none' disables alignment (word-less segments). */
|
|
140
|
+
declare function alignerById(id: string): Aligner | null;
|
|
40
141
|
interface SynthesizeOptions {
|
|
41
|
-
/** override the script's provider */
|
|
142
|
+
/** override the script's provider (by id) */
|
|
42
143
|
provider?: string;
|
|
144
|
+
/** override the script's aligner ('heuristic' | 'vosk' | 'none') */
|
|
145
|
+
aligner?: string;
|
|
146
|
+
/** a provider INSTANCE — wins over `provider`; the bring-your-own seam
|
|
147
|
+
* (e.g. a custom ElevenLabs/Azure TtsProvider) */
|
|
148
|
+
providerImpl?: TtsProvider;
|
|
149
|
+
/** an aligner INSTANCE (or null to disable) — wins over `aligner` */
|
|
150
|
+
alignerImpl?: Aligner | null;
|
|
43
151
|
/** ignore the cache and re-synthesize everything */
|
|
44
152
|
force?: boolean;
|
|
45
153
|
}
|
|
@@ -49,6 +157,10 @@ interface SynthesizeResult {
|
|
|
49
157
|
cacheDir: string;
|
|
50
158
|
synthesized: string[];
|
|
51
159
|
reused: string[];
|
|
160
|
+
/** segment ids whose words came from the aligner (not the provider) */
|
|
161
|
+
aligned: string[];
|
|
162
|
+
/** the aligner id used, or null when alignment was disabled */
|
|
163
|
+
aligner: string | null;
|
|
52
164
|
}
|
|
53
165
|
declare function cacheKey(seg: {
|
|
54
166
|
text: string;
|
|
@@ -64,4 +176,4 @@ declare function synthesizeScript(scriptPath: string, opts?: SynthesizeOptions):
|
|
|
64
176
|
/** Resolve `<scene>.narration.json` for a scene-module path (or accept the script itself). */
|
|
65
177
|
declare function scriptPathFor(input: string): string;
|
|
66
178
|
//#endregion
|
|
67
|
-
export { SynthesizeOptions, SynthesizeResult, TtsProvider, TtsRequest, TtsResult, cacheKey, espeakProvider, fakeProvider, openaiProvider, providerById, scriptPathFor, synthesizeScript, wavDuration };
|
|
179
|
+
export { AlignRequest, Aligner, SynthesizeOptions, SynthesizeResult, TtsProvider, TtsRequest, TtsResult, VoskAlignWord, alignerById, cacheKey, espeakProvider, fakeProvider, heuristicAligner, heuristicWords, interpolateMissing, mapAsrToScript, openaiProvider, piperProvider, providerById, scriptPathFor, synthesizeScript, voskAligner, wavDuration };
|
package/dist/providers.js
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import { NarrationError } from "./index.js";
|
|
2
2
|
import { createHash } from "node:crypto";
|
|
3
|
-
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
3
|
+
import { existsSync, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
|
|
4
4
|
import { basename, dirname, join } from "node:path";
|
|
5
|
+
import { tmpdir } from "node:os";
|
|
5
6
|
import { spawnSync } from "node:child_process";
|
|
6
7
|
//#region src/providers.ts
|
|
7
8
|
/**
|
|
@@ -132,12 +133,232 @@ function openaiProvider(opts = {}) {
|
|
|
132
133
|
}
|
|
133
134
|
};
|
|
134
135
|
}
|
|
136
|
+
/**
 * VITS-based local TTS: far more natural than espeak, runs on CPU, fully
 * offline. Needs a voice MODEL (`.onnx` + sibling `.onnx.json`) — pass its
 * path as `opts.model`, or per-segment as `req.voice` (the segment voice wins).
 * Emits no word timestamps; an aligner is expected to fill them in.
 *
 * Both methods are `async`, so every failure (missing binary, missing model,
 * non-zero exit) surfaces as a rejected promise rather than a synchronous
 * throw from a function that is typed to return a promise.
 *
 * @param opts.model default voice model path, used when a request has no `voice`
 * @returns a provider object with `id`, `version()` and `synthesize(req)`
 */
function piperProvider(opts = {}) {
	/** Translate a spawn-level failure (binary missing / not executable) into a NarrationError. */
	const spawnFailure = (error) => {
		if (error.code === "ENOENT") return new NarrationError("piper not found on PATH — `pip install piper-tts` (or the standalone rhasspy/piper), or use --provider fake/espeak/openai");
		return new NarrationError(`could not run piper: ${error.message}`);
	};
	return {
		id: "piper",
		/** `piper <x.y.z>` parsed from `piper --version`, suffixed with the model basename when one is configured. */
		version: async () => {
			const r = spawnSync("piper", ["--version"], { encoding: "utf8" });
			if (r.error) throw spawnFailure(r.error);
			const m = /\b\d+\.\d+\.\d+\b/.exec(r.stdout ?? "");
			const v = m ? `piper ${m[0]}` : "piper (version unknown)";
			return opts.model ? `${v} ${basename(opts.model)}` : v;
		},
		/** Pipe `req.text` to piper on stdin, read back the WAV it writes to a temp file, and return `{ wav, duration }`. */
		synthesize: async (req) => {
			const model = req.voice ?? opts.model;
			if (!model) throw new NarrationError("piper needs a voice model (.onnx) — pass { model }, or set the segment voice to its path");
			// Temp output is keyed by pid + text hash; spawnSync blocks, so calls never overlap in-process.
			const tag = createHash("sha256").update(req.text).digest("hex").slice(0, 8);
			const out = join(tmpdir(), `glissade-piper-${process.pid}-${tag}.wav`);
			const args = ["--model", model, "--output_file", out];
			// The reciprocal of `rate` is passed as piper's length scale — presumably rate > 1 means faster speech; confirm against piper's docs.
			if (req.rate !== void 0 && req.rate > 0) args.push("--length_scale", String(1 / req.rate));
			try {
				const r = spawnSync("piper", args, {
					input: req.text,
					maxBuffer: 64 * 1024 * 1024
				});
				// A spawn failure leaves status null; report it precisely instead of as "piper failed: no output".
				if (r.error) throw spawnFailure(r.error);
				if (r.status !== 0 || !existsSync(out)) {
					// `||` (not `??`) so an empty stderr still yields a readable message.
					const detail = r.stderr?.toString().slice(0, 300) || "no output";
					throw new NarrationError(`piper failed: ${detail}`);
				}
				const wav = readFileSync(out);
				return {
					wav,
					duration: wavDuration(wav)
				};
			} finally {
				// Always remove the temp file, including a partial one left by a failed run.
				if (existsSync(out)) unlinkSync(out);
			}
		}
	};
}
|
|
135
184
|
/**
 * Build the built-in TTS provider registered under `id`.
 * Throws a NarrationError listing the known ids when `id` is not one of them.
 */
function providerById(id) {
	if (id === "fake") return fakeProvider();
	if (id === "espeak") return espeakProvider();
	if (id === "piper") return piperProvider();
	if (id === "openai") return openaiProvider();
	throw new NarrationError(`unknown TTS provider '${id}' (have: fake, espeak, piper, openai)`);
}
|
|
193
|
+
/** ≈ syllable count: vowel groups, floored at 1 — a cheap spoken-length proxy. */
|
|
194
|
+
function syllableWeight(word) {
|
|
195
|
+
const groups = word.toLowerCase().match(/[aeiouy]+/g);
|
|
196
|
+
return Math.max(1, groups ? groups.length : 1);
|
|
197
|
+
}
|
|
198
|
+
/**
|
|
199
|
+
* Distribute words across the clip by estimated spoken length (syllables, not
|
|
200
|
+
* characters — closer to real timing). Pure, deterministic, zero-dependency:
|
|
201
|
+
* the always-available floor. Good enough for captions; karaoke on a very
|
|
202
|
+
* slow/fast word wants a real aligner.
|
|
203
|
+
*/
|
|
204
|
+
function heuristicWords(text, duration) {
|
|
205
|
+
const words = text.trim().split(/\s+/).filter(Boolean);
|
|
206
|
+
if (words.length === 0) return [];
|
|
207
|
+
const weights = words.map(syllableWeight);
|
|
208
|
+
const total = weights.reduce((a, b) => a + b, 0);
|
|
209
|
+
const out = [];
|
|
210
|
+
let cursor = 0;
|
|
211
|
+
for (let i = 0; i < words.length; i++) {
|
|
212
|
+
const span = weights[i] / total * duration;
|
|
213
|
+
out.push({
|
|
214
|
+
word: words[i],
|
|
215
|
+
start: cursor,
|
|
216
|
+
end: cursor + span
|
|
217
|
+
});
|
|
218
|
+
cursor += span;
|
|
219
|
+
}
|
|
220
|
+
return out;
|
|
221
|
+
}
|
|
222
|
+
/**
 * The zero-dependency aligner: estimates word timings from the text alone,
 * spread over the clip's duration as measured from the WAV bytes.
 */
function heuristicAligner() {
	const version = () => Promise.resolve("heuristic-1");
	const align = (req) => {
		const text = req.text;
		const clipSeconds = wavDuration(req.wav);
		return Promise.resolve(heuristicWords(text, clipSeconds));
	};
	return {
		id: "heuristic",
		version,
		align
	};
}
|
|
229
|
+
/** Matching key for a token: lower-cased, with every run of non-letter / non-digit characters removed. */
function normalizeWord(w) {
	const lowered = w.toLowerCase();
	return lowered.replace(/[^\p{L}\p{N}]+/gu, "");
}
|
|
230
|
+
/**
|
|
231
|
+
* Fill words whose start/end are NaN by linear interpolation between their
|
|
232
|
+
* known neighbours (edges clamp). Keeps the result monotonic. Used after
|
|
233
|
+
* mapping when some script words got no timing.
|
|
234
|
+
*/
|
|
235
|
+
function interpolateMissing(words) {
|
|
236
|
+
const out = words.map((w) => ({ ...w }));
|
|
237
|
+
const n = out.length;
|
|
238
|
+
let k = 0;
|
|
239
|
+
while (k < n) {
|
|
240
|
+
if (!Number.isNaN(out[k].start)) {
|
|
241
|
+
k++;
|
|
242
|
+
continue;
|
|
243
|
+
}
|
|
244
|
+
let j = k;
|
|
245
|
+
while (j < n && Number.isNaN(out[j].start)) j++;
|
|
246
|
+
const lo = k > 0 ? out[k - 1].end : j < n ? out[j].start : 0;
|
|
247
|
+
const hi = j < n ? out[j].start : lo;
|
|
248
|
+
const count = j - k;
|
|
249
|
+
const span = Math.max(0, hi - lo);
|
|
250
|
+
for (let t = 0; t < count; t++) {
|
|
251
|
+
out[k + t].start = lo + span * t / count;
|
|
252
|
+
out[k + t].end = lo + span * (t + 1) / count;
|
|
253
|
+
}
|
|
254
|
+
k = j;
|
|
255
|
+
}
|
|
256
|
+
return out;
|
|
257
|
+
}
|
|
258
|
+
/**
 * Project an aligner's timed words onto the script's own whitespace tokens.
 *
 * The aligner's words may not equal the script's (numbers spelled out,
 * punctuation, mis-recognitions), so both sides are normalized and paired by
 * a longest-common-subsequence alignment. Script tokens that got a partner
 * take its start/end; the rest are filled in by `interpolateMissing`. The
 * result has exactly one entry per script token, in script order — what
 * `wordBoxes()` indexes against. When no token pairs at all, the script is
 * distributed by syllable weight over the span the aligner did time. An empty
 * script or an empty `timed` list yields `[]`.
 */
function mapAsrToScript(timed, scriptText) {
	const tokens = scriptText.trim().split(/\s+/).filter(Boolean);
	if (tokens.length === 0 || timed.length === 0) return [];

	const scriptKeys = tokens.map(normalizeWord);
	const asrKeys = timed.map((w) => normalizeWord(w.word));
	const rows = scriptKeys.length;
	const cols = asrKeys.length;
	// A script token that normalizes to "" (pure punctuation) never pairs with anything.
	const pairs = (r, c) => scriptKeys[r] !== "" && scriptKeys[r] === asrKeys[c];

	// lcs[r][c] = LCS length of scriptKeys[r..] vs asrKeys[c..], filled bottom-up.
	const lcs = Array.from({ length: rows + 1 }, () => new Array(cols + 1).fill(0));
	for (let r = rows - 1; r >= 0; r--) {
		for (let c = cols - 1; c >= 0; c--) {
			lcs[r][c] = pairs(r, c) ? lcs[r + 1][c + 1] + 1 : Math.max(lcs[r + 1][c], lcs[r][c + 1]);
		}
	}

	// Walk the table from the top-left, recording which ASR index each script token pairs with.
	const partner = new Array(rows).fill(null);
	let hits = 0;
	let r = 0;
	let c = 0;
	while (r < rows && c < cols) {
		if (pairs(r, c)) {
			partner[r] = c;
			hits++;
			r++;
			c++;
		} else if (lcs[r + 1][c] >= lcs[r][c + 1]) {
			r++;
		} else {
			c++;
		}
	}

	if (hits === 0) {
		// Nothing paired: spread the script by syllable weight across the timed span.
		const lo = timed.reduce((acc, w) => Math.min(acc, w.start), Infinity);
		const hi = timed.reduce((acc, w) => Math.max(acc, w.end), -Infinity);
		const spread = heuristicWords(scriptText, Math.max(0, hi - lo));
		return spread.map((w) => ({
			...w,
			start: w.start + lo,
			end: w.end + lo
		}));
	}

	const draft = tokens.map((word, k) => {
		const idx = partner[k];
		if (idx != null) {
			return {
				word,
				start: timed[idx].start,
				end: timed[idx].end
			};
		}
		return {
			word,
			start: NaN,
			end: NaN
		};
	});
	return interpolateMissing(draft);
}
|
|
306
|
+
/**
 * Word timings via Vosk (alphacephei) — offline ASR, Apache-2.0. Shells out to
 * a `vosk-align` command (the Python `vosk` binding + ffmpeg — deliberately NOT
 * the npm `vosk` package, whose `ffi-napi` native build is broken on modern
 * Node). The command is given the path of a WAV file and must write
 *     { "words": [ { "word", "start", "end", "conf"? }, … ] }
 * to stdout; its recognized words are LCS-mapped onto the script tokens by
 * `mapAsrToScript`, so mis-recognitions (e.g. an unknown proper noun) are
 * interpolated from the words around them.
 *
 * Provide the command via `opts.command` / `VOSK_ALIGN` (default `vosk-align`);
 * the model is the command's own concern (its default, or `--model`/VOSK_MODEL),
 * passed through with `opts.model`.
 *
 * The command's stdout is external input: unparseable JSON is reported as a
 * NarrationError, a `words` value that is not an array is treated as no words,
 * and entries that are not `{ word: string, start: number, end: number }` with
 * finite numbers are dropped. Both methods are `async`, so every failure is a
 * rejected promise.
 *
 * @param opts.command executable to run (overrides `VOSK_ALIGN`)
 * @param opts.model   model path forwarded as `--model <path>`
 * @returns an aligner object with `id`, `version()` and `align(req)`
 */
function voskAligner(opts = {}) {
	const command = opts.command ?? process.env["VOSK_ALIGN"] ?? "vosk-align";
	return {
		id: "vosk",
		/** Probes the command with `--help` so a missing binary fails early; the tag is `vosk` plus the model basename, if any. */
		version: async () => {
			const r = spawnSync(command, ["--help"], { encoding: "utf8" });
			if (r.error) {
				if (r.error.code === "ENOENT") throw new NarrationError(`'${command}' not found — provide a vosk-align command (Apache-2.0 Vosk + ffmpeg, JSON {words:[{word,start,end}]} on stdout), or use --align heuristic`);
				throw new NarrationError(`could not run ${command}: ${r.error.message}`);
			}
			return opts.model ? `vosk ${basename(opts.model)}` : "vosk";
		},
		/** Write the WAV to a temp file, run the command on it, and map its timed words onto `req.text`. */
		align: async (req) => {
			// Temp input is keyed by pid + text hash; spawnSync blocks, so calls never overlap in-process.
			const tag = createHash("sha256").update(req.text).digest("hex").slice(0, 8);
			const wavPath = join(tmpdir(), `glissade-vosk-${process.pid}-${tag}.wav`);
			try {
				writeFileSync(wavPath, req.wav);
				const modelArgs = opts.model ? ["--model", opts.model] : [];
				const r = spawnSync(command, [wavPath, ...modelArgs], {
					encoding: "utf8",
					maxBuffer: 64 * 1024 * 1024
				});
				if (r.error) throw new NarrationError(`${command} failed to run: ${r.error.message}`);
				if (r.status !== 0) throw new NarrationError(`${command} failed: ${(r.stderr || "").slice(0, 300)}`);
				let parsed;
				try {
					parsed = JSON.parse(r.stdout);
				} catch (err) {
					// Surface a domain error instead of a bare SyntaxError from deep inside the aligner.
					throw new NarrationError(`${command} wrote invalid JSON to stdout: ${err instanceof Error ? err.message : String(err)}`);
				}
				const rawWords = Array.isArray(parsed?.words) ? parsed.words : [];
				const timed = rawWords
					.filter((w) => w !== null && typeof w === "object" && typeof w.word === "string" && Number.isFinite(w.start) && Number.isFinite(w.end))
					.map((w) => ({
						word: w.word,
						start: w.start,
						end: w.end
					}));
				return mapAsrToScript(timed, req.text);
			} finally {
				if (existsSync(wavPath)) unlinkSync(wavPath);
			}
		}
	};
}
|
|
355
|
+
/**
 * Look up a built-in aligner by id. `'none'` returns null, which disables
 * alignment (segments stay word-less); an unrecognized id throws a
 * NarrationError naming the valid choices.
 */
function alignerById(id) {
	if (id === "none") return null;
	if (id === "heuristic") return heuristicAligner();
	if (id === "vosk") return voskAligner();
	throw new NarrationError(`unknown aligner '${id}' (have: heuristic, vosk, none)`);
}
|
|
143
364
|
function cacheKey(seg, provider, providerVersion) {
|
|
@@ -162,8 +383,14 @@ async function synthesizeScript(scriptPath, opts = {}) {
|
|
|
162
383
|
if (ids.has(s.id)) throw new NarrationError(`duplicate segment id '${s.id}'`);
|
|
163
384
|
ids.add(s.id);
|
|
164
385
|
}
|
|
165
|
-
const provider = providerById(opts.provider ?? raw.provider ?? "espeak");
|
|
386
|
+
const provider = opts.providerImpl ?? providerById(opts.provider ?? raw.provider ?? "espeak");
|
|
166
387
|
const providerVersion = await provider.version();
|
|
388
|
+
const aligner = opts.alignerImpl !== void 0 ? opts.alignerImpl : alignerById(opts.aligner ?? raw.align ?? "heuristic");
|
|
389
|
+
let alignerTag = null;
|
|
390
|
+
const alignerTagFor = async () => {
|
|
391
|
+
if (alignerTag === null) alignerTag = `${aligner.id}@${await aligner.version()}`;
|
|
392
|
+
return alignerTag;
|
|
393
|
+
};
|
|
167
394
|
const base = scriptPath.replace(/\.narration\.json$/, "");
|
|
168
395
|
if (base === scriptPath) throw new NarrationError(`script path must end with .narration.json: ${scriptPath}`);
|
|
169
396
|
const cacheDir = `${base}.narration-cache`;
|
|
@@ -175,6 +402,7 @@ async function synthesizeScript(scriptPath, opts = {}) {
|
|
|
175
402
|
};
|
|
176
403
|
const synthesized = [];
|
|
177
404
|
const reused = [];
|
|
405
|
+
const aligned = [];
|
|
178
406
|
const segments = [];
|
|
179
407
|
let cursor = raw.leadIn ?? 0;
|
|
180
408
|
for (const seg of raw.segments) {
|
|
@@ -186,24 +414,42 @@ async function synthesizeScript(scriptPath, opts = {}) {
|
|
|
186
414
|
const hash = cacheKey(req, provider.id, providerVersion);
|
|
187
415
|
let entry = cache.entries[hash];
|
|
188
416
|
let duration;
|
|
417
|
+
let wavBuf;
|
|
189
418
|
let words;
|
|
190
419
|
if (entry !== void 0 && !opts.force && existsSync(join(cacheDir, entry.file))) {
|
|
191
|
-
|
|
192
|
-
|
|
420
|
+
wavBuf = readFileSync(join(cacheDir, entry.file));
|
|
421
|
+
duration = wavDuration(wavBuf);
|
|
193
422
|
reused.push(seg.id);
|
|
194
423
|
} else {
|
|
195
424
|
const result = await provider.synthesize(req);
|
|
196
425
|
const file = `${seg.id}-${hash.slice(0, 8)}.wav`;
|
|
197
426
|
writeFileSync(join(cacheDir, file), result.wav);
|
|
427
|
+
wavBuf = result.wav;
|
|
198
428
|
duration = wavDuration(result.wav);
|
|
199
|
-
words = result.words;
|
|
200
429
|
entry = {
|
|
201
430
|
file,
|
|
202
|
-
...words !== void 0 ? {
|
|
431
|
+
...result.words !== void 0 ? {
|
|
432
|
+
words: result.words,
|
|
433
|
+
wordsFrom: "provider"
|
|
434
|
+
} : {}
|
|
203
435
|
};
|
|
204
436
|
cache.entries[hash] = entry;
|
|
205
437
|
synthesized.push(seg.id);
|
|
206
438
|
}
|
|
439
|
+
if (entry.wordsFrom === "provider") words = entry.words;
|
|
440
|
+
else if (aligner !== null) {
|
|
441
|
+
const tag = await alignerTagFor();
|
|
442
|
+
if (entry.wordsFrom === tag && entry.words !== void 0) words = entry.words;
|
|
443
|
+
else {
|
|
444
|
+
words = await aligner.align({
|
|
445
|
+
wav: wavBuf,
|
|
446
|
+
text: seg.text
|
|
447
|
+
});
|
|
448
|
+
entry.words = words;
|
|
449
|
+
entry.wordsFrom = tag;
|
|
450
|
+
aligned.push(seg.id);
|
|
451
|
+
}
|
|
452
|
+
}
|
|
207
453
|
const timed = {
|
|
208
454
|
id: seg.id,
|
|
209
455
|
text: seg.text,
|
|
@@ -235,7 +481,9 @@ async function synthesizeScript(scriptPath, opts = {}) {
|
|
|
235
481
|
timingPath,
|
|
236
482
|
cacheDir,
|
|
237
483
|
synthesized,
|
|
238
|
-
reused
|
|
484
|
+
reused,
|
|
485
|
+
aligned,
|
|
486
|
+
aligner: aligner?.id ?? null
|
|
239
487
|
};
|
|
240
488
|
}
|
|
241
489
|
/** Resolve `<scene>.narration.json` for a scene-module path (or accept the script itself). */
|
|
@@ -246,4 +494,4 @@ function scriptPathFor(input) {
|
|
|
246
494
|
return candidate;
|
|
247
495
|
}
|
|
248
496
|
//#endregion
|
|
249
|
-
export { cacheKey, espeakProvider, fakeProvider, openaiProvider, providerById, scriptPathFor, synthesizeScript, wavDuration };
|
|
497
|
+
export { alignerById, cacheKey, espeakProvider, fakeProvider, heuristicAligner, heuristicWords, interpolateMissing, mapAsrToScript, openaiProvider, piperProvider, providerById, scriptPathFor, synthesizeScript, voskAligner, wavDuration };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@glissade/narrate",
|
|
3
|
-
"version": "0.4.5",
|
|
3
|
+
"version": "0.5.0-pre.1",
|
|
4
4
|
"description": "glissade narration + captions: TTS at prepare time (gs narrate), deterministic caching, narration-anchored timeline beats, and captions as plain tracks. Render stays offline.",
|
|
5
5
|
"license": "Apache-2.0",
|
|
6
6
|
"type": "module",
|
|
@@ -19,8 +19,8 @@
|
|
|
19
19
|
"dist"
|
|
20
20
|
],
|
|
21
21
|
"dependencies": {
|
|
22
|
-
"@glissade/core": "0.
|
|
23
|
-
"@glissade/scene": "0.
|
|
22
|
+
"@glissade/core": "0.5.0-pre.1",
|
|
23
|
+
"@glissade/scene": "0.5.0-pre.1"
|
|
24
24
|
},
|
|
25
25
|
"repository": {
|
|
26
26
|
"type": "git",
|