@uzen/kokoro-js 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json ADDED
@@ -0,0 +1,62 @@
1
+ {
2
+ "name": "@uzen/kokoro-js",
3
+ "version": "1.2.3",
4
+ "type": "module",
5
+ "exports": {
6
+ "types": "./types/kokoro.d.ts",
7
+ "node": {
8
+ "import": "./dist/kokoro.js",
9
+ "require": "./dist/kokoro.cjs"
10
+ },
11
+ "default": "./dist/kokoro.js"
12
+ },
13
+ "scripts": {
14
+ "build": "node -e \"fs.rmSync('dist',{recursive:true,force:true});fs.rmSync('types',{recursive:true,force:true})\" && rollup -c && tsc && node -e \"fs.copyFileSync('../LICENSE','LICENSE')\"",
15
+ "format": "prettier --write . --print-width 1000",
16
+ "test": "vitest run"
17
+ },
18
+ "keywords": [
19
+ "kokoro",
20
+ "tts",
21
+ "text-to-speech"
22
+ ],
23
+ "author": {
24
+ "name": "hexgrad",
25
+ "email": "hello@hexgrad.com"
26
+ },
27
+ "browser": {
28
+ "path": false,
29
+ "fs/promises": false
30
+ },
31
+ "contributors": [
32
+ "Xenova",
33
+ "uzen.zone"
34
+ ],
35
+ "license": "Apache-2.0",
36
+ "description": "High-quality text-to-speech for the web",
37
+ "dependencies": {
38
+ "@huggingface/transformers": "^4.2.0",
39
+ "phonemize": "^1.2.0",
40
+ "phonemizer": "^1.2.1",
41
+ "pinyin-pro": "^3.28.1"
42
+ },
43
+ "devDependencies": {
44
+ "@rollup/plugin-node-resolve": "^16.0.0",
45
+ "@rollup/plugin-terser": "^0.4.4",
46
+ "prettier": "3.4.2",
47
+ "rollup": "^4.30.1",
48
+ "typescript": "^5.7.3",
49
+ "vitest": "^3.1.2"
50
+ },
51
+ "files": [
52
+ "types",
53
+ "dist",
54
+ "README.md",
55
+ "LICENSE"
56
+ ],
57
+ "publishConfig": {
58
+ "access": "public"
59
+ },
60
+ "jsdelivr": "./dist/kokoro.web.js",
61
+ "unpkg": "./dist/kokoro.web.js"
62
+ }
@@ -0,0 +1,626 @@
1
+ /**
2
+ * @typedef {Object} GenerateOptions
3
+ * @property {keyof typeof VOICES} [voice="zf_001"] The id of the voice to use; must be one of the keys of VOICES.
4
+ * @property {number} [speed=1] The speaking speed
5
+ */
6
+ /**
7
+ * @typedef {Object} StreamProperties
8
+ * @property {RegExp} [split_pattern] The pattern to split the input text. If unset, the default sentence splitter will be used.
9
+ * @property {number} [maxChunkLength=200] Maximum character length per chunk. Longer chunks are split at punctuation boundaries.
10
+ * @typedef {GenerateOptions & StreamProperties} StreamGenerateOptions
11
+ */
12
+ export class KokoroTTS {
13
+ /**
14
+ * Load a KokoroTTS model from the Hugging Face Hub.
15
+ * @param {string} model_id The model id
16
+ * @param {Object} [options] Additional options; the whole object may be omitted.
17
+ * @param {"fp32"|"fp16"|"q8"|"q4"|"q4f16"} [options.dtype="fp32"] The data type to use.
18
+ * @param {"wasm"|"webgpu"|"cpu"|null} [options.device=null] The device to run the model on.
19
+ * @param {string|null} [options.model_file_name=null] Override the ONNX model file name, excluding the .onnx suffix.
20
+ * @param {string} [options.voicePath="/kokoro/voices"] Base path/directory for voice data files.
21
+ * @param {import("@huggingface/transformers").ProgressCallback} [options.progress_callback=null] A callback function that is called with progress information.
22
+ * @returns {Promise<KokoroTTS>} The loaded model
23
+ */
24
+ static from_pretrained(model_id: string, { dtype, device, model_file_name, voicePath, progress_callback }?: {
25
+ dtype?: "fp32" | "fp16" | "q8" | "q4" | "q4f16";
26
+ device?: "wasm" | "webgpu" | "cpu" | null;
27
+ model_file_name?: string | null;
28
+ voicePath?: string;
29
+ progress_callback?: import("@huggingface/transformers").ProgressCallback;
30
+ }): Promise<KokoroTTS>;
31
+ /**
32
+ * Create a new KokoroTTS instance.
33
+ * @param {import('@huggingface/transformers').StyleTextToSpeech2Model} model The model
34
+ * @param {import('@huggingface/transformers').PreTrainedTokenizer} tokenizer The tokenizer
35
+ */
36
+ constructor(model: import("@huggingface/transformers").StyleTextToSpeech2Model, tokenizer: import("@huggingface/transformers").PreTrainedTokenizer);
37
+ model: StyleTextToSpeech2Model;
38
+ tokenizer: import("@huggingface/transformers").PreTrainedTokenizer;
39
+ get voices(): Readonly<{
40
+ af_maple: {
41
+ name: string;
42
+ language: string;
43
+ gender: string;
44
+ };
45
+ af_sol: {
46
+ name: string;
47
+ language: string;
48
+ gender: string;
49
+ };
50
+ bf_vale: {
51
+ name: string;
52
+ language: string;
53
+ gender: string;
54
+ };
55
+ zf_001: {
56
+ name: string;
57
+ language: string;
58
+ gender: string;
59
+ };
60
+ zf_002: {
61
+ name: string;
62
+ language: string;
63
+ gender: string;
64
+ };
65
+ zf_003: {
66
+ name: string;
67
+ language: string;
68
+ gender: string;
69
+ };
70
+ zf_004: {
71
+ name: string;
72
+ language: string;
73
+ gender: string;
74
+ };
75
+ zf_005: {
76
+ name: string;
77
+ language: string;
78
+ gender: string;
79
+ };
80
+ zf_006: {
81
+ name: string;
82
+ language: string;
83
+ gender: string;
84
+ };
85
+ zf_007: {
86
+ name: string;
87
+ language: string;
88
+ gender: string;
89
+ };
90
+ zf_008: {
91
+ name: string;
92
+ language: string;
93
+ gender: string;
94
+ };
95
+ zf_017: {
96
+ name: string;
97
+ language: string;
98
+ gender: string;
99
+ };
100
+ zf_018: {
101
+ name: string;
102
+ language: string;
103
+ gender: string;
104
+ };
105
+ zf_019: {
106
+ name: string;
107
+ language: string;
108
+ gender: string;
109
+ };
110
+ zf_021: {
111
+ name: string;
112
+ language: string;
113
+ gender: string;
114
+ };
115
+ zf_022: {
116
+ name: string;
117
+ language: string;
118
+ gender: string;
119
+ };
120
+ zf_023: {
121
+ name: string;
122
+ language: string;
123
+ gender: string;
124
+ };
125
+ zf_024: {
126
+ name: string;
127
+ language: string;
128
+ gender: string;
129
+ };
130
+ zf_026: {
131
+ name: string;
132
+ language: string;
133
+ gender: string;
134
+ };
135
+ zf_027: {
136
+ name: string;
137
+ language: string;
138
+ gender: string;
139
+ };
140
+ zf_028: {
141
+ name: string;
142
+ language: string;
143
+ gender: string;
144
+ };
145
+ zf_032: {
146
+ name: string;
147
+ language: string;
148
+ gender: string;
149
+ };
150
+ zf_036: {
151
+ name: string;
152
+ language: string;
153
+ gender: string;
154
+ };
155
+ zf_038: {
156
+ name: string;
157
+ language: string;
158
+ gender: string;
159
+ };
160
+ zf_039: {
161
+ name: string;
162
+ language: string;
163
+ gender: string;
164
+ };
165
+ zf_040: {
166
+ name: string;
167
+ language: string;
168
+ gender: string;
169
+ };
170
+ zf_042: {
171
+ name: string;
172
+ language: string;
173
+ gender: string;
174
+ };
175
+ zf_043: {
176
+ name: string;
177
+ language: string;
178
+ gender: string;
179
+ };
180
+ zf_044: {
181
+ name: string;
182
+ language: string;
183
+ gender: string;
184
+ };
185
+ zf_046: {
186
+ name: string;
187
+ language: string;
188
+ gender: string;
189
+ };
190
+ zf_047: {
191
+ name: string;
192
+ language: string;
193
+ gender: string;
194
+ };
195
+ zf_048: {
196
+ name: string;
197
+ language: string;
198
+ gender: string;
199
+ };
200
+ zf_049: {
201
+ name: string;
202
+ language: string;
203
+ gender: string;
204
+ };
205
+ zf_051: {
206
+ name: string;
207
+ language: string;
208
+ gender: string;
209
+ };
210
+ zf_059: {
211
+ name: string;
212
+ language: string;
213
+ gender: string;
214
+ };
215
+ zf_060: {
216
+ name: string;
217
+ language: string;
218
+ gender: string;
219
+ };
220
+ zf_067: {
221
+ name: string;
222
+ language: string;
223
+ gender: string;
224
+ };
225
+ zf_070: {
226
+ name: string;
227
+ language: string;
228
+ gender: string;
229
+ };
230
+ zf_071: {
231
+ name: string;
232
+ language: string;
233
+ gender: string;
234
+ };
235
+ zf_072: {
236
+ name: string;
237
+ language: string;
238
+ gender: string;
239
+ };
240
+ zf_073: {
241
+ name: string;
242
+ language: string;
243
+ gender: string;
244
+ };
245
+ zf_074: {
246
+ name: string;
247
+ language: string;
248
+ gender: string;
249
+ };
250
+ zf_075: {
251
+ name: string;
252
+ language: string;
253
+ gender: string;
254
+ };
255
+ zf_076: {
256
+ name: string;
257
+ language: string;
258
+ gender: string;
259
+ };
260
+ zf_077: {
261
+ name: string;
262
+ language: string;
263
+ gender: string;
264
+ };
265
+ zf_078: {
266
+ name: string;
267
+ language: string;
268
+ gender: string;
269
+ };
270
+ zf_079: {
271
+ name: string;
272
+ language: string;
273
+ gender: string;
274
+ };
275
+ zf_083: {
276
+ name: string;
277
+ language: string;
278
+ gender: string;
279
+ };
280
+ zf_084: {
281
+ name: string;
282
+ language: string;
283
+ gender: string;
284
+ };
285
+ zf_085: {
286
+ name: string;
287
+ language: string;
288
+ gender: string;
289
+ };
290
+ zf_086: {
291
+ name: string;
292
+ language: string;
293
+ gender: string;
294
+ };
295
+ zf_087: {
296
+ name: string;
297
+ language: string;
298
+ gender: string;
299
+ };
300
+ zf_088: {
301
+ name: string;
302
+ language: string;
303
+ gender: string;
304
+ };
305
+ zf_090: {
306
+ name: string;
307
+ language: string;
308
+ gender: string;
309
+ };
310
+ zf_092: {
311
+ name: string;
312
+ language: string;
313
+ gender: string;
314
+ };
315
+ zf_093: {
316
+ name: string;
317
+ language: string;
318
+ gender: string;
319
+ };
320
+ zf_094: {
321
+ name: string;
322
+ language: string;
323
+ gender: string;
324
+ };
325
+ zf_099: {
326
+ name: string;
327
+ language: string;
328
+ gender: string;
329
+ };
330
+ zm_009: {
331
+ name: string;
332
+ language: string;
333
+ gender: string;
334
+ };
335
+ zm_010: {
336
+ name: string;
337
+ language: string;
338
+ gender: string;
339
+ };
340
+ zm_011: {
341
+ name: string;
342
+ language: string;
343
+ gender: string;
344
+ };
345
+ zm_012: {
346
+ name: string;
347
+ language: string;
348
+ gender: string;
349
+ };
350
+ zm_013: {
351
+ name: string;
352
+ language: string;
353
+ gender: string;
354
+ };
355
+ zm_014: {
356
+ name: string;
357
+ language: string;
358
+ gender: string;
359
+ };
360
+ zm_015: {
361
+ name: string;
362
+ language: string;
363
+ gender: string;
364
+ };
365
+ zm_016: {
366
+ name: string;
367
+ language: string;
368
+ gender: string;
369
+ };
370
+ zm_020: {
371
+ name: string;
372
+ language: string;
373
+ gender: string;
374
+ };
375
+ zm_025: {
376
+ name: string;
377
+ language: string;
378
+ gender: string;
379
+ };
380
+ zm_029: {
381
+ name: string;
382
+ language: string;
383
+ gender: string;
384
+ };
385
+ zm_030: {
386
+ name: string;
387
+ language: string;
388
+ gender: string;
389
+ };
390
+ zm_031: {
391
+ name: string;
392
+ language: string;
393
+ gender: string;
394
+ };
395
+ zm_033: {
396
+ name: string;
397
+ language: string;
398
+ gender: string;
399
+ };
400
+ zm_034: {
401
+ name: string;
402
+ language: string;
403
+ gender: string;
404
+ };
405
+ zm_035: {
406
+ name: string;
407
+ language: string;
408
+ gender: string;
409
+ };
410
+ zm_037: {
411
+ name: string;
412
+ language: string;
413
+ gender: string;
414
+ };
415
+ zm_041: {
416
+ name: string;
417
+ language: string;
418
+ gender: string;
419
+ };
420
+ zm_045: {
421
+ name: string;
422
+ language: string;
423
+ gender: string;
424
+ };
425
+ zm_050: {
426
+ name: string;
427
+ language: string;
428
+ gender: string;
429
+ };
430
+ zm_052: {
431
+ name: string;
432
+ language: string;
433
+ gender: string;
434
+ };
435
+ zm_053: {
436
+ name: string;
437
+ language: string;
438
+ gender: string;
439
+ };
440
+ zm_054: {
441
+ name: string;
442
+ language: string;
443
+ gender: string;
444
+ };
445
+ zm_055: {
446
+ name: string;
447
+ language: string;
448
+ gender: string;
449
+ };
450
+ zm_056: {
451
+ name: string;
452
+ language: string;
453
+ gender: string;
454
+ };
455
+ zm_057: {
456
+ name: string;
457
+ language: string;
458
+ gender: string;
459
+ };
460
+ zm_058: {
461
+ name: string;
462
+ language: string;
463
+ gender: string;
464
+ };
465
+ zm_061: {
466
+ name: string;
467
+ language: string;
468
+ gender: string;
469
+ };
470
+ zm_062: {
471
+ name: string;
472
+ language: string;
473
+ gender: string;
474
+ };
475
+ zm_063: {
476
+ name: string;
477
+ language: string;
478
+ gender: string;
479
+ };
480
+ zm_064: {
481
+ name: string;
482
+ language: string;
483
+ gender: string;
484
+ };
485
+ zm_065: {
486
+ name: string;
487
+ language: string;
488
+ gender: string;
489
+ };
490
+ zm_066: {
491
+ name: string;
492
+ language: string;
493
+ gender: string;
494
+ };
495
+ zm_068: {
496
+ name: string;
497
+ language: string;
498
+ gender: string;
499
+ };
500
+ zm_069: {
501
+ name: string;
502
+ language: string;
503
+ gender: string;
504
+ };
505
+ zm_080: {
506
+ name: string;
507
+ language: string;
508
+ gender: string;
509
+ };
510
+ zm_081: {
511
+ name: string;
512
+ language: string;
513
+ gender: string;
514
+ };
515
+ zm_082: {
516
+ name: string;
517
+ language: string;
518
+ gender: string;
519
+ };
520
+ zm_089: {
521
+ name: string;
522
+ language: string;
523
+ gender: string;
524
+ };
525
+ zm_091: {
526
+ name: string;
527
+ language: string;
528
+ gender: string;
529
+ };
530
+ zm_095: {
531
+ name: string;
532
+ language: string;
533
+ gender: string;
534
+ };
535
+ zm_096: {
536
+ name: string;
537
+ language: string;
538
+ gender: string;
539
+ };
540
+ zm_097: {
541
+ name: string;
542
+ language: string;
543
+ gender: string;
544
+ };
545
+ zm_098: {
546
+ name: string;
547
+ language: string;
548
+ gender: string;
549
+ };
550
+ zm_100: {
551
+ name: string;
552
+ language: string;
553
+ gender: string;
554
+ };
555
+ }>;
556
+ list_voices(): void;
557
+ _validate_voice(voice: any): "b" | "z" | "a";
558
+ /**
559
+ * Generate audio from text.
560
+ *
561
+ * @param {string} text The input text
562
+ * @param {GenerateOptions} [options] Additional options (voice, speed); may be omitted.
563
+ * @returns {Promise<RawAudio>} The generated audio
564
+ */
565
+ generate(text: string, { voice, speed }?: GenerateOptions): Promise<RawAudio>;
566
+ /**
567
+ * Generate audio from input ids.
568
+ * @param {Tensor} input_ids The input ids
569
+ * @param {GenerateOptions} [options] Additional options (voice, speed); may be omitted.
570
+ * @returns {Promise<RawAudio>} The generated audio
571
+ */
572
+ generate_from_ids(input_ids: Tensor, { voice, speed }?: GenerateOptions): Promise<RawAudio>;
573
+ /**
574
+ * Split a long text into smaller chunks at punctuation boundaries.
575
+ * Falls back to a hard split at maxLength if no suitable boundary is found.
576
+ * @param {string} text The text to split
577
+ * @param {number} maxLength Maximum length of each chunk
578
+ * @returns {string[]} Array of text chunks
579
+ * @private
580
+ */
581
+ private _splitLongText;
582
+ /**
583
+ * Generate audio from text in a streaming fashion.
584
+ * @param {string|TextSplitterStream} text The input text
585
+ * @param {StreamGenerateOptions} [options] Additional options (voice, speed, split_pattern, maxChunkLength); may be omitted.
586
+ * @returns {AsyncGenerator<{text: string, phonemes: string, audio: RawAudio}, void, void>} An async generator that yields an object with the text, its phonemes, and the generated audio.
587
+ */
588
+ stream(text: string | TextSplitterStream, { voice, speed, split_pattern, maxChunkLength }?: StreamGenerateOptions): AsyncGenerator<{
589
+ text: string;
590
+ phonemes: string;
591
+ audio: RawAudio;
592
+ }, void, void>;
593
+ }
594
+ export namespace env {
595
+ let cacheDir: string;
596
+ let allowLocalModels: boolean;
597
+ let wasmPaths: import("onnxruntime-common").Env.WasmPrefixOrFilePaths;
598
+ }
599
+ export { TextSplitterStream };
600
+ export type GenerateOptions = {
601
+ /**
602
+ * The id of the voice to use; must be one of the keys of VOICES.
603
+ */
604
+ voice?: keyof typeof VOICES;
605
+ /**
606
+ * The speaking speed
607
+ */
608
+ speed?: number;
609
+ };
610
+ export type StreamProperties = {
611
+ /**
612
+ * The pattern to split the input text. If unset, the default sentence splitter will be used.
613
+ */
614
+ split_pattern?: RegExp;
615
+ /**
616
+ * Maximum character length per chunk. Longer chunks are split at punctuation boundaries.
617
+ */
618
+ maxChunkLength?: number;
619
+ };
620
+ export type StreamGenerateOptions = GenerateOptions & StreamProperties;
621
+ import { StyleTextToSpeech2Model } from "@huggingface/transformers";
622
+ import { RawAudio } from "@huggingface/transformers";
623
+ import { Tensor } from "@huggingface/transformers";
624
+ import { TextSplitterStream } from "./splitter.js";
625
+ import { VOICES } from "./voices.js";
626
+ //# sourceMappingURL=kokoro.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"kokoro.d.ts","sourceRoot":"","sources":["../src/kokoro.js"],"names":[],"mappings":"AAQA;;;;GAIG;AAEH;;;;;GAKG;AAEH;IAWE;;;;;;;;;;OAUG;IACH,iCATW,MAAM,qEAEd;QAAkD,KAAK,GAA/C,MAAM,GAAC,MAAM,GAAC,IAAI,GAAC,IAAI,GAAC,OAAO;QACM,MAAM,GAA3C,MAAM,GAAC,QAAQ,GAAC,KAAK,GAAC,IAAI;QACJ,eAAe,GAArC,MAAM,GAAC,IAAI;QACM,SAAS,GAA1B,MAAM;QACyD,iBAAiB,GAAhF,OAAO,2BAA2B,EAAE,gBAAgB;KAC5D,GAAU,OAAO,CAAC,SAAS,CAAC,CAU9B;IA7BD;;;;OAIG;IACH,mBAHW,OAAO,2BAA2B,EAAE,uBAAuB,aAC3D,OAAO,2BAA2B,EAAE,mBAAmB,EAKjE;IAFC,+BAAkB;IAClB,mEAA0B;IAwB5B;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;OAEC;IAED,oBAEC;IAED,6CAQC;IAED;;;;;;OAMG;IACH,eAJW,MAAM,qBACN,eAAe,GACb,OAAO,CAAC,QAAQ,CAAC,CAW7B;IAED;;;;;OAKG;IACH,6BAJW,MAAM,qBACN,eAAe,GACb,OAAO,CAAC,QAAQ,CAAC,CAqB7B;IAED;;;;;;;OAOG;IACH,uBAsBC;IAED;;;;;OAKG;IACH,aAJW,MAAM,GAAC,kBAAkB,oDACzB,qBAAqB,GACnB,cAAc,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,QAAQ,CAAA;KAAC,EAAE,IAAI,EAAE,IAAI,CAAC,CAmCzF;CACF;;;;;;;;;;;YAlLa,MAAM,OAAO,MAAM;;;;YACnB,MAAM;;;;;;oBAKN,MAAM;;;;qBACN,MAAM;;oCACP,eAAe,GAAG,gBAAgB;wCAlBqC,2BAA2B;yBAA3B,2BAA2B;uBAA3B,2BAA2B;mCAE5E,eAAe;uBACC,aAAa"}
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Phonemize text using the language-specific phonemizer
3
+ * @param {string} text The text to phonemize
4
+ * @param {"a"|"b"|"z"} [language] The language code to use (optional).
5
+ * @param {boolean} [norm] Whether to normalize the text (optional).
6
+ * @returns {Promise<string>} The phonemized text
7
+ */
8
+ export function phonemize(text: string, language?: "a" | "b" | "z", norm?: boolean): Promise<string>;
9
+ //# sourceMappingURL=phonemize.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"phonemize.d.ts","sourceRoot":"","sources":["../src/phonemize.js"],"names":[],"mappings":"AA6+BA;;;;;;GAMG;AACH,gCALW,MAAM,aACN,GAAG,GAAC,GAAG,GAAC,GAAG,SACX,OAAO,GACL,OAAO,CAAC,MAAM,CAAC,CAoC3B"}