@glissade/narrate 0.8.0-pre.1 → 0.8.1-pre.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -46,7 +46,7 @@ interface NarrationScript {
46
46
  leadIn?: number;
47
47
  /**
48
48
  * Word-timing aligner for providers that don't emit word timestamps
49
- * (espeak / openai / piper). 'heuristic' (default) estimates from text;
49
+ * (espeak / openai / piper / kokoro). 'heuristic' (default) estimates from text;
50
50
  * 'vosk' derives real timings from the audio (offline ASR); 'none' leaves
51
51
  * segments word-less. Providers that supply their own words ignore this.
52
52
  */
@@ -68,6 +68,27 @@ declare function piperProvider(opts?: {
68
68
  noiseScale?: number;
69
69
  noiseWScale?: number;
70
70
  }): TtsProvider;
71
+ /** PCM16 mono WAV from float samples in [-1, 1]. Round-to-nearest → deterministic. */
72
+ declare function floatToWav(samples: Float32Array, sampleRate: number): Buffer;
73
+ type KokoroDtype = 'fp32' | 'fp16' | 'q8' | 'q4' | 'q4f16';
74
+ /**
75
+ * Apache-2.0 82M neural TTS — markedly more natural than espeak/piper, fully
76
+ * offline on CPU via onnxruntime, no API key. Pure-Node through `kokoro-js`
77
+ * (Transformers.js), so unlike piper there is no `pip install` / external
78
+ * binary; `kokoro-js` is an OPTIONAL peer dep, lazy-loaded here.
79
+ *
80
+ * DETERMINISTIC by construction: inference takes tokenized phonemes + a FIXED
81
+ * voice/style embedding (not diffusion-sampled per call), so the same text →
82
+ * byte-identical PCM — no noise to zero out (piper's trick). `version()` pins
83
+ * the lib version + model + dtype, so any of those moving invalidates the
84
+ * cache. The model (~q8 92MB / fp32 326MB) downloads + caches on first use; it
85
+ * stays out of the bundle and the determinism-critical path.
86
+ */
87
+ declare function kokoroProvider(opts?: {
88
+ model?: string;
89
+ voice?: string;
90
+ dtype?: KokoroDtype;
91
+ }): TtsProvider;
71
92
  declare function providerById(id: string): TtsProvider;
72
93
  interface AlignRequest {
73
94
  /** the synthesized RIFF/WAV bytes */
@@ -199,4 +220,4 @@ declare function synthesizeScript(scriptPath: string, opts?: SynthesizeOptions):
199
220
  /** Resolve `<scene>.narration.json` for a scene-module path (or accept the script itself). */
200
221
  declare function scriptPathFor(input: string): string;
201
222
  //#endregion
202
- export { AlignRequest, Aligner, SynthesizeOptions, SynthesizeResult, TtsProvider, TtsRequest, TtsResult, VoskAlignWord, alignerById, cacheKey, espeakProvider, fakeProvider, heuristicAligner, heuristicWords, interpolateMissing, mapAsrToScript, openaiProvider, piperProvider, providerById, resolvePiperVoice, scriptPathFor, stderrTail, synthesizeScript, voskAligner, wavDuration };
223
+ export { AlignRequest, Aligner, KokoroDtype, SynthesizeOptions, SynthesizeResult, TtsProvider, TtsRequest, TtsResult, VoskAlignWord, alignerById, cacheKey, espeakProvider, fakeProvider, floatToWav, heuristicAligner, heuristicWords, interpolateMissing, kokoroProvider, mapAsrToScript, openaiProvider, piperProvider, providerById, resolvePiperVoice, scriptPathFor, stderrTail, synthesizeScript, voskAligner, wavDuration };
package/dist/providers.js CHANGED
@@ -1,4 +1,5 @@
1
1
  import { NarrationError, isPause } from "./index.js";
2
+ import { createRequire } from "node:module";
2
3
  import { createHash } from "node:crypto";
3
4
  import { existsSync, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
4
5
  import { basename, dirname, isAbsolute, join, resolve } from "node:path";
@@ -225,13 +226,101 @@ function piperProvider(opts = {}) {
225
226
  }
226
227
  };
227
228
  }
229
+ /** PCM16 mono WAV from float samples in [-1, 1]. Round-to-nearest → deterministic. */
230
+ function floatToWav(samples, sampleRate) {
231
+ const data = Buffer.alloc(samples.length * 2);
232
+ for (let i = 0; i < samples.length; i++) {
233
+ const s = Math.max(-1, Math.min(1, samples[i]));
234
+ data.writeInt16LE(Math.round(s * 32767), i * 2);
235
+ }
236
+ const header = Buffer.alloc(44);
237
+ header.write("RIFF", 0, "ascii");
238
+ header.writeUInt32LE(36 + data.length, 4);
239
+ header.write("WAVE", 8, "ascii");
240
+ header.write("fmt ", 12, "ascii");
241
+ header.writeUInt32LE(16, 16);
242
+ header.writeUInt16LE(1, 20);
243
+ header.writeUInt16LE(1, 22);
244
+ header.writeUInt32LE(sampleRate, 24);
245
+ header.writeUInt32LE(sampleRate * 2, 28);
246
+ header.writeUInt16LE(2, 32);
247
+ header.writeUInt16LE(16, 34);
248
+ header.write("data", 36, "ascii");
249
+ header.writeUInt32LE(data.length, 40);
250
+ return Buffer.concat([header, data]);
251
+ }
252
+ const KOKORO_MODEL = "onnx-community/Kokoro-82M-v1.0-ONNX";
253
+ const KOKORO_DEFAULT_VOICE = "af_heart";
254
+ /**
255
+ * Apache-2.0 82M neural TTS — markedly more natural than espeak/piper, fully
256
+ * offline on CPU via onnxruntime, no API key. Pure-Node through `kokoro-js`
257
+ * (Transformers.js), so unlike piper there is no `pip install` / external
258
+ * binary; `kokoro-js` is an OPTIONAL peer dep, lazy-loaded here.
259
+ *
260
+ * DETERMINISTIC by construction: inference takes tokenized phonemes + a FIXED
261
+ * voice/style embedding (not diffusion-sampled per call), so the same text →
262
+ * byte-identical PCM — no noise to zero out (piper's trick). `version()` pins
263
+ * the lib version + model + dtype, so any of those moving invalidates the
264
+ * cache. The model (~q8 92MB / fp32 326MB) downloads + caches on first use; it
265
+ * stays out of the bundle and the determinism-critical path.
266
+ */
267
+ function kokoroProvider(opts = {}) {
268
+ const modelId = opts.model ?? KOKORO_MODEL;
269
+ const dtype = opts.dtype ?? "q8";
270
+ let loaded = null;
271
+ const loadLib = async () => {
272
+ try {
273
+ return await import("kokoro-js");
274
+ } catch {
275
+ throw new NarrationError("kokoro-js not found — `npm install kokoro-js` (it pulls onnxruntime-node), or use --provider piper/espeak/openai");
276
+ }
277
+ };
278
+ const getModel = () => loaded ??= loadLib().then((k) => k.KokoroTTS.from_pretrained(modelId, {
279
+ dtype,
280
+ device: "cpu"
281
+ }));
282
+ return {
283
+ id: "kokoro",
284
+ version: () => {
285
+ let lib = "kokoro-js";
286
+ try {
287
+ const req = createRequire(import.meta.url);
288
+ const pkg = JSON.parse(readFileSync(req.resolve("kokoro-js/package.json"), "utf8"));
289
+ if (pkg.version) lib = `kokoro-js ${pkg.version}`;
290
+ } catch {
291
+ throw new NarrationError("kokoro-js not found — `npm install kokoro-js` (it pulls onnxruntime-node), or use --provider piper/espeak/openai");
292
+ }
293
+ return Promise.resolve(`${lib} ${basename(modelId)} dtype=${dtype}`);
294
+ },
295
+ synthesize: async (req) => {
296
+ const tts = await getModel();
297
+ const voice = req.voice ?? opts.voice ?? KOKORO_DEFAULT_VOICE;
298
+ const genOpts = req.rate !== void 0 && req.rate > 0 ? {
299
+ voice,
300
+ speed: req.rate
301
+ } : { voice };
302
+ let audio;
303
+ try {
304
+ audio = await tts.generate(req.text, genOpts);
305
+ } catch (e) {
306
+ throw new NarrationError(`kokoro synthesis failed: ${e instanceof Error ? e.message : String(e)}`);
307
+ }
308
+ const wav = floatToWav(audio.audio, audio.sampling_rate);
309
+ return {
310
+ wav,
311
+ duration: wavDuration(wav)
312
+ };
313
+ }
314
+ };
315
+ }
228
316
  function providerById(id) {
229
317
  switch (id) {
230
318
  case "fake": return fakeProvider();
231
319
  case "espeak": return espeakProvider();
232
320
  case "piper": return piperProvider();
321
+ case "kokoro": return kokoroProvider();
233
322
  case "openai": return openaiProvider();
234
- default: throw new NarrationError(`unknown TTS provider '${id}' (have: fake, espeak, piper, openai)`);
323
+ default: throw new NarrationError(`unknown TTS provider '${id}' (have: fake, espeak, piper, kokoro, openai)`);
235
324
  }
236
325
  }
237
326
  /** ≈ syllable count: vowel groups, floored at 1 — a cheap spoken-length proxy. */
@@ -558,4 +647,4 @@ function scriptPathFor(input) {
558
647
  return candidate;
559
648
  }
560
649
  //#endregion
561
- export { alignerById, cacheKey, espeakProvider, fakeProvider, heuristicAligner, heuristicWords, interpolateMissing, mapAsrToScript, openaiProvider, piperProvider, providerById, resolvePiperVoice, scriptPathFor, stderrTail, synthesizeScript, voskAligner, wavDuration };
650
+ export { alignerById, cacheKey, espeakProvider, fakeProvider, floatToWav, heuristicAligner, heuristicWords, interpolateMissing, kokoroProvider, mapAsrToScript, openaiProvider, piperProvider, providerById, resolvePiperVoice, scriptPathFor, stderrTail, synthesizeScript, voskAligner, wavDuration };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@glissade/narrate",
3
- "version": "0.8.0-pre.1",
3
+ "version": "0.8.1-pre.0",
4
4
  "description": "glissade narration + captions: TTS at prepare time (gs narrate), deterministic caching, narration-anchored timeline beats, and captions as plain tracks. Render stays offline.",
5
5
  "license": "Apache-2.0",
6
6
  "type": "module",
@@ -19,14 +19,25 @@
19
19
  "dist"
20
20
  ],
21
21
  "dependencies": {
22
- "@glissade/core": "0.8.0-pre.1",
23
- "@glissade/scene": "0.8.0-pre.1"
22
+ "@glissade/core": "0.8.1-pre.0",
23
+ "@glissade/scene": "0.8.1-pre.0"
24
+ },
25
+ "peerDependencies": {
26
+ "kokoro-js": "^1.2.0"
27
+ },
28
+ "peerDependenciesMeta": {
29
+ "kokoro-js": {
30
+ "optional": true
31
+ }
24
32
  },
25
33
  "repository": {
26
34
  "type": "git",
27
35
  "url": "git+https://github.com/tyevco/glissade.git",
28
36
  "directory": "packages/narrate"
29
37
  },
38
+ "devDependencies": {
39
+ "kokoro-js": "^1.2.1"
40
+ },
30
41
  "scripts": {
31
42
  "build": "tsdown",
32
43
  "typecheck": "tsc --noEmit"