@polytts/node-adapters 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 DengQing dengqing0821@gmail.com
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,31 @@
1
+ # @polytts/node-adapters
2
+
3
+ [![npm version](https://img.shields.io/npm/v/@polytts/node-adapters)](https://www.npmjs.com/package/@polytts/node-adapters)
4
+
5
+ Official Node adapter implementations for [`polytts`](https://github.com/Dunqing/polytts).
6
+
7
+ Use this package when you want Node-capable adapters without the higher-level Node controller.
8
+
9
+ ## Install
10
+
11
+ ```bash
12
+ npm install @polytts/node-adapters
13
+ ```
14
+
15
+ ## Usage
16
+
17
+ ```ts
18
+ import { officialNodeAdapters, piperNodeAdapter } from "@polytts/node-adapters";
19
+
20
+ void officialNodeAdapters;
21
+ void piperNodeAdapter;
22
+ ```
23
+
24
+ ## Included adapters
25
+
26
+ - `kokoroNodeAdapter`
27
+ - `piperNodeAdapter`
28
+ - `kittenNodeAdapter`
29
+ - `supertonicNodeAdapter`
30
+
31
+ Catalog and model metadata live in `@polytts/presets`.
@@ -0,0 +1,17 @@
1
import { SynthesizingModelInstance, TTSAdapter } from "@polytts/core";

//#region src/kitten.d.ts
/** TTS adapter for KittenTTS models on Node.js using native ONNX Runtime. */
declare const kittenNodeAdapter: TTSAdapter<SynthesizingModelInstance>;
//#endregion
//#region src/supertonic.d.ts
/** TTS adapter for Supertonic models on Node.js using native ONNX Runtime. */
declare const supertonicNodeAdapter: TTSAdapter<SynthesizingModelInstance>;
//#endregion
//#region src/index.d.ts
/** TTS adapter for Kokoro models on Node.js using kokoro-js. */
declare const kokoroNodeAdapter: TTSAdapter<SynthesizingModelInstance>;
/** TTS adapter for Piper models on Node.js using native ONNX Runtime. */
declare const piperNodeAdapter: TTSAdapter<SynthesizingModelInstance>;
/** All official Node.js TTS adapters bundled for convenience. */
declare const officialNodeAdapters: TTSAdapter<SynthesizingModelInstance>[];
//#endregion
export { kittenNodeAdapter, kokoroNodeAdapter, officialNodeAdapters, piperNodeAdapter, supertonicNodeAdapter };
package/dist/index.mjs ADDED
@@ -0,0 +1,942 @@
1
+ import { getModelAssets, pcmToAudioData } from "@polytts/core";
2
+ import { inflateRawSync } from "node:zlib";
3
+ //#region src/kitten-npz-reader.ts
4
/**
 * Parses the header of a NumPy `.npy` buffer.
 * Returns { dtype, shape, dataOffset } where dataOffset is the byte index
 * at which the raw array payload begins.
 */
function parseNpyHeader(bytes) {
	// Magic prefix: \x93NUMPY.
	const magic = String.fromCharCode(bytes[1], bytes[2], bytes[3], bytes[4], bytes[5]);
	if (bytes[0] !== 147 || magic !== "NUMPY") throw new Error("Not a valid .npy file");
	// Format version 1 stores the header length as a uint16 at offset 8;
	// later versions use a uint32 (so the dict starts two bytes later).
	const isVersion1 = bytes[6] === 1;
	const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
	const headerLength = isVersion1 ? view.getUint16(8, true) : view.getUint32(8, true);
	const headerOffset = isVersion1 ? 10 : 12;
	const header = new TextDecoder().decode(bytes.slice(headerOffset, headerOffset + headerLength));
	// The header is a Python dict literal; pull fields out with regexes.
	const dtype = header.match(/'descr'\s*:\s*'([^']+)'/)?.[1];
	if (!dtype) throw new Error(`Could not parse dtype from .npy header: ${header}`);
	const shapeText = header.match(/'shape'\s*:\s*\(([^)]*)\)/)?.[1];
	const shape = shapeText
		? shapeText.split(",").map((part) => Number.parseInt(part.trim(), 10)).filter((value) => Number.isFinite(value))
		: [];
	return {
		dtype,
		shape,
		dataOffset: headerOffset + headerLength
	};
}
20
/**
 * Decodes a `.npy` payload into a Float32Array (converting float64 data
 * element-wise when necessary). Throws for any other dtype.
 */
function npyToFloat32(bytes) {
	const { dtype, shape, dataOffset } = parseNpyHeader(bytes);
	// Copy the payload into a fresh buffer so typed-array views are aligned
	// regardless of where the header ended.
	const raw = bytes.slice(dataOffset);
	const aligned = new ArrayBuffer(raw.length);
	new Uint8Array(aligned).set(raw);
	switch (dtype) {
		case "<f4":
		case "float32":
			return { data: new Float32Array(aligned), shape };
		case "<f8":
		case "float64": {
			const doubles = new Float64Array(aligned);
			return { data: Float32Array.from(doubles), shape };
		}
		default:
			throw new Error(`Unsupported npy dtype: ${dtype}`);
	}
}
40
/**
 * Minimal ZIP reader: walks the central directory and returns a Map of
 * file name -> decompressed bytes. Handles stored (method 0) and deflate
 * (method 8) entries; anything else is skipped. Assumes a non-ZIP64
 * archive (32-bit sizes/offsets, as produced for `.npz` files).
 */
function extractZipEntries(buffer) {
	const bytes = new Uint8Array(buffer);
	const view = new DataView(buffer);
	const entries = new Map();
	// Find the End Of Central Directory record (signature 0x06054b50),
	// scanning backwards from the smallest possible EOCD position.
	let eocdOffset = -1;
	for (let cursor = bytes.length - 22; cursor >= 0; cursor -= 1) {
		if (view.getUint32(cursor, true) === 101010256) {
			eocdOffset = cursor;
			break;
		}
	}
	if (eocdOffset === -1) throw new Error("Could not find End of Central Directory");
	const entryCount = view.getUint16(eocdOffset + 10, true);
	let offset = view.getUint32(eocdOffset + 16, true);
	for (let entry = 0; entry < entryCount; entry += 1) {
		// Central directory file header signature: 0x02014b50.
		if (view.getUint32(offset, true) !== 33639248) break;
		const compressionMethod = view.getUint16(offset + 10, true);
		const compressedSize = view.getUint32(offset + 20, true);
		const uncompressedSize = view.getUint32(offset + 24, true);
		const fileNameLength = view.getUint16(offset + 28, true);
		const extraLength = view.getUint16(offset + 30, true);
		const commentLength = view.getUint16(offset + 32, true);
		const localHeaderOffset = view.getUint32(offset + 42, true);
		const fileName = new TextDecoder().decode(bytes.slice(offset + 46, offset + 46 + fileNameLength));
		// Advance to the next central directory record up front so every
		// branch below (including skips) continues from the right place.
		offset += 46 + fileNameLength + extraLength + commentLength;
		// The local header repeats the name/extra lengths; data follows them.
		const localFileNameLength = view.getUint16(localHeaderOffset + 26, true);
		const localExtraLength = view.getUint16(localHeaderOffset + 28, true);
		const dataStart = localHeaderOffset + 30 + localFileNameLength + localExtraLength;
		const compressed = bytes.slice(dataStart, dataStart + compressedSize);
		let fileData;
		if (compressionMethod === 0) fileData = compressed;
		else if (compressionMethod === 8) fileData = inflateRawSync(compressed);
		else continue;
		if (fileData.length !== uncompressedSize) throw new Error(`Unexpected size for archive entry "${fileName}"`);
		entries.set(fileName, fileData);
	}
	return entries;
}
80
/**
 * Loads Kitten voice embeddings from an `.npz` archive (a ZIP of `.npy`
 * files). Returns a map of voice name -> { data, shape } with shape
 * normalized to two dimensions [referenceCount, styleDimension].
 */
function loadKittenVoicesFromArchive(archive) {
	const entries = extractZipEntries(archive);
	const voices = {};
	for (const [fileName, fileData] of entries) {
		if (!fileName.endsWith(".npy")) continue;
		const voiceName = fileName.replace(/\.npy$/, "");
		const { data, shape } = npyToFloat32(fileData);
		// Normalize to 2-D. A 1-D embedding is a single style row; the old
		// fallback `[shape[0] || 1, shape[1] || data.length]` turned shape
		// [N] into [N, N], whose product no longer matched the data size
		// and caused out-of-range style slicing downstream.
		voices[voiceName] = {
			data,
			shape: shape.length >= 2 ? [shape[0] || 1, shape[1] || data.length] : [1, data.length]
		};
	}
	return voices;
}
94
+ //#endregion
95
+ //#region src/kitten-tokenizer.ts
96
/**
 * Phoneme tokenizer for the KittenTTS ONNX model. Ported from the upstream Python TextCleaner
 * class: https://github.com/KittenML/KittenTTS/blob/main/kittentts/onnx_model.py
 */
const PAD = "$";
const PUNCTUATION = ";:,.!?¡¿—…\"«»\"\" ";
const LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
const LETTERS_IPA = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ";
// Vocabulary order fixes each symbol's id: PAD=0, then punctuation,
// then Latin letters, then IPA characters.
const symbols = [
	PAD,
	...PUNCTUATION,
	...LETTERS,
	...LETTERS_IPA
];
const charToIndex = {};
symbols.forEach((symbol, index) => {
	charToIndex[symbol] = index;
});
/** Maps each known character of `text` to its vocabulary id; unknown characters are dropped. */
function cleanText(text) {
	const ids = [];
	for (const character of text) {
		const id = charToIndex[character];
		if (id !== undefined) ids.push(id);
	}
	return ids;
}
/** Wraps the cleaned ids with the PAD token (0) and a trailing id 10, mirroring the upstream tokenizer. */
function tokenizePhonemes(phonemes) {
	return [0, ...cleanText(phonemes), 10, 0];
}
127
+ //#endregion
128
+ //#region src/kitten.shared.ts
129
/** Output sample rate (Hz) of the Kitten ONNX model. */
const KITTEN_SAMPLE_RATE = 24000;
const HF_BASE = "https://huggingface.co";
/** Builds the Hugging Face "resolve" URL for a file on the repo's main branch. */
function resolveKittenUrl(repoId, fileName) {
	return `${HF_BASE}/${repoId}/resolve/main/${fileName}`;
}
/** onnx-community mirrors keep the model at a fixed path; other repos declare it in their config. */
function resolveKittenModelFile(repoId, config) {
	if (repoId.startsWith("onnx-community/")) return "onnx/model.onnx";
	return config.model_file;
}
/** Resolves a voice alias (when the config declares one) to its canonical voice id. */
function resolveKittenVoiceId(config, voiceId) {
	return config.voice_aliases?.[voiceId] ?? voiceId;
}
140
+ //#endregion
141
+ //#region src/kitten.ts
142
// Asset names under which the Kitten bundle is cached in the asset store.
const KITTEN_CONFIG_ASSET_NAME = "kitten-config.json";
const KITTEN_MODEL_ASSET_NAME = "model.onnx";
const KITTEN_VOICES_ASSET_NAME = "voices.npz";
/** Derives the asset-store bundle key for a model spec. */
function bundleFor(spec) {
	const { adapterId, id, revision } = spec;
	return { adapterId, modelId: id, revision };
}
152
/** Trims `text` and appends a period unless it already ends with terminal punctuation. */
function ensurePunctuation(text) {
	const trimmed = text.trim();
	if (!trimmed) return trimmed;
	const lastCharacter = trimmed[trimmed.length - 1];
	if (".!?,;:".includes(lastCharacter)) return trimmed;
	return `${trimmed}.`;
}
157
/**
 * Splits `text` into sentence-based chunks of at most `maxLength`
 * characters, greedily packing words when one sentence is too long.
 * Every emitted chunk is given terminal punctuation. A single word longer
 * than maxLength is still emitted as its own (oversized) chunk.
 */
function chunkText$1(text, maxLength = 400) {
	// Sentences end at ./!/? runs; a trailing fragment keeps its text as-is.
	const sentences = text.match(/[^.!?]*[.!?]+|[^.!?]+$/g) || [text];
	const chunks = [];
	for (const rawSentence of sentences) {
		const sentence = rawSentence.trim();
		if (!sentence) continue;
		if (sentence.length <= maxLength) {
			chunks.push(ensurePunctuation(sentence));
			continue;
		}
		// Sentence too long: pack whitespace-separated words greedily.
		let pending = "";
		for (const word of sentence.split(/\s+/)) {
			if (pending.length + word.length + 1 <= maxLength) {
				pending += `${pending ? " " : ""}${word}`;
			} else {
				if (pending) chunks.push(ensurePunctuation(pending));
				pending = word;
			}
		}
		if (pending) chunks.push(ensurePunctuation(pending));
	}
	return chunks;
}
178
/** Splits text into word tokens (letter/digit/underscore runs) and single punctuation characters. */
function basicTokenize(text) {
	const matches = text.match(/[\p{L}\p{N}_]+|[^\p{L}\p{N}_\s]/gu);
	return matches ?? [];
}
181
/**
 * Synthesizing model instance for KittenTTS on Node.js. Downloads (or
 * reads from the asset store) the config, ONNX model and `.npz` voice
 * archive, then runs inference through onnxruntime-node with phonemizer
 * input preprocessing.
 */
var KittenNodeModel = class {
	kind = "synthesizing";
	modelId;
	adapterId;
	// Lazily populated by load(); reset to null/{} by dispose().
	session = null;
	voices = {};
	config = null;
	phonemize = null;
	constructor(spec, context) {
		this.spec = spec;
		this.context = context;
		this.modelId = spec.id;
		this.adapterId = spec.adapterId;
	}
	/** Loads config, model and voices; reports coarse progress in [0, 1]. */
	async load(signal, onProgress) {
		const repoId = this.spec.config?.modelId ?? "onnx-community/KittenTTS-Mini-v0.8-ONNX";
		// Dynamic imports keep these heavy dependencies out of module init.
		const phonemizerModule = await import("phonemizer");
		const ort = await import("onnxruntime-node");
		const configBuffer = await this.loadConfigBuffer(repoId, signal);
		this.config = JSON.parse(new TextDecoder().decode(configBuffer));
		onProgress?.(.15);
		const modelUrl = resolveKittenUrl(repoId, resolveKittenModelFile(repoId, this.config));
		const voicesUrl = resolveKittenUrl(repoId, this.config.voices);
		// Model and voice archive are independent: fetch them in parallel.
		const [modelBuffer, voicesArchive] = await Promise.all([this.loadNamedAsset(KITTEN_MODEL_ASSET_NAME, modelUrl, signal), this.loadNamedAsset(KITTEN_VOICES_ASSET_NAME, voicesUrl, signal)]);
		onProgress?.(.65);
		this.voices = loadKittenVoicesFromArchive(voicesArchive);
		this.session = await ort.InferenceSession.create(new Uint8Array(modelBuffer));
		this.phonemize = phonemizerModule.phonemize;
		onProgress?.(1);
	}
	/** Returns the cached config if present, else fetches kitten_config.json with config.json as fallback. */
	async loadConfigBuffer(repoId, signal) {
		const cached = await this.context.assetStore.getAsset(bundleFor(this.spec), KITTEN_CONFIG_ASSET_NAME);
		if (cached) return cached;
		const configResponse = await this.context.fetch(resolveKittenUrl(repoId, "kitten_config.json"), { signal });
		if (configResponse.ok) return configResponse.arrayBuffer();
		const fallback = await this.context.fetch(resolveKittenUrl(repoId, "config.json"), { signal });
		if (!fallback.ok) throw new Error(`Failed to fetch Kitten config: HTTP ${fallback.status}`);
		return fallback.arrayBuffer();
	}
	/** Synthesizes all chunks of `text` and concatenates the PCM into one AudioData. */
	async generate(text, voiceId, signal, speed) {
		if (!this.session || !this.config || !this.phonemize) throw new Error("Kitten model not loaded");
		const chunks = chunkText$1(text);
		const outputs = [];
		for (const chunk of chunks) outputs.push(await this.generateChunk(chunk, voiceId, speed ?? 1, signal));
		const totalLength = outputs.reduce((sum, chunk) => sum + chunk.length, 0);
		const combined = new Float32Array(totalLength);
		let offset = 0;
		for (const chunk of outputs) {
			combined.set(chunk, offset);
			offset += chunk.length;
		}
		return pcmToAudioData(combined, KITTEN_SAMPLE_RATE);
	}
	/** Like generate(), but yields one AudioData per text chunk as it is produced. */
	async *stream(text, voiceId, signal, speed) {
		if (!this.session || !this.config || !this.phonemize) throw new Error("Kitten model not loaded");
		const chunks = chunkText$1(text);
		for (const chunk of chunks) {
			if (signal.aborted) throw this.context.createAbortError();
			yield pcmToAudioData(await this.generateChunk(chunk, voiceId, speed ?? 1, signal), KITTEN_SAMPLE_RATE);
		}
	}
	listVoices() {
		return this.spec.voices ?? [];
	}
	/** Drops all loaded state; load() must be called again before use. */
	dispose() {
		this.session = null;
		this.voices = {};
		this.config = null;
		this.phonemize = null;
	}
	/** Runs one ONNX inference for a single text chunk and returns raw PCM samples. */
	async generateChunk(text, voiceKey, speed, signal) {
		if (!this.session || !this.config || !this.phonemize) throw new Error("Kitten model not loaded");
		const resolvedVoiceId = resolveKittenVoiceId(this.config, voiceKey);
		const voiceData = this.voices[resolvedVoiceId];
		if (!voiceData) throw new Error(`Voice "${voiceKey}" not found`);
		// Per-voice speed prior from the config scales the requested speed.
		if (this.config.speed_priors?.[resolvedVoiceId]) speed *= this.config.speed_priors[resolvedVoiceId];
		const phonemes = await this.phonemizeText(text, signal);
		if (signal.aborted) throw this.context.createAbortError();
		const inputIds = tokenizePhonemes(phonemes);
		// Pick the style row indexed by text length, clamped to the last row.
		const referenceIndex = Math.min(text.length, voiceData.shape[0] - 1);
		const styleDimension = voiceData.shape[1];
		const referenceStyle = voiceData.data.slice(referenceIndex * styleDimension, (referenceIndex + 1) * styleDimension);
		const ort = await import("onnxruntime-node");
		const inputs = {
			input_ids: new ort.Tensor("int64", BigInt64Array.from(inputIds.map(BigInt)), [1, inputIds.length]),
			style: new ort.Tensor("float32", referenceStyle, [1, styleDimension]),
			speed: new ort.Tensor("float32", new Float32Array([speed]), [1])
		};
		const audio = (await this.session.run(inputs))[this.session.outputNames[0]]?.data;
		// Trim a fixed tail from longer outputs — presumably model padding;
		// NOTE(review): 24e3/5e3 are magic values, confirm against upstream.
		if (audio.length > 24e3) return audio.slice(0, audio.length - 5e3);
		return audio;
	}
	/**
	 * Phonemizes `text` while passing punctuation runs through untouched,
	 * then re-tokenizes so punctuation is space-separated from words.
	 */
	async phonemizeText(text, signal) {
		if (!this.phonemize) throw new Error("Kitten phonemizer not loaded");
		const punctuation = /(\s*[;:,.!?¡¿—…"«»""()[\]{}]+\s*)+/g;
		const sections = [];
		let lastIndex = 0;
		// Split into alternating non-punctuation / punctuation sections.
		for (const match of text.matchAll(punctuation)) {
			if (lastIndex < match.index) sections.push({
				punctuation: false,
				text: text.slice(lastIndex, match.index)
			});
			sections.push({
				punctuation: true,
				text: match[0]
			});
			lastIndex = match.index + match[0].length;
		}
		if (lastIndex < text.length) sections.push({
			punctuation: false,
			text: text.slice(lastIndex)
		});
		// Phonemize word sections in parallel; punctuation is kept verbatim.
		return basicTokenize((await Promise.all(sections.map(async (section) => {
			if (section.punctuation) return section.text;
			const result = await this.phonemize(section.text, "en-us");
			if (signal.aborted) throw this.context.createAbortError();
			return result.join(" ");
		}))).join("")).join(" ");
	}
	async fetchArrayBuffer(url, signal) {
		const response = await this.context.fetch(url, { signal });
		if (!response.ok) throw new Error(`Failed to fetch Kitten asset: HTTP ${response.status}`);
		return response.arrayBuffer();
	}
	/** Reads an asset from the store when cached, otherwise downloads it. */
	async loadNamedAsset(assetName, url, signal) {
		const cached = await this.context.assetStore.getAsset(bundleFor(this.spec), assetName);
		if (cached) return cached;
		return this.fetchArrayBuffer(url, signal);
	}
};
311
/**
 * KittenTTS adapter for Node.js. `install` downloads config, ONNX model
 * and voice archive from Hugging Face into the asset store; `createModel`
 * produces KittenNodeModel instances that consume those assets.
 */
const kittenNodeAdapter = {
	id: "kitten-node",
	name: "KittenTTS Node",
	capabilities: {
		install: true,
		speak: false,
		synthesize: true,
		stream: true,
		dynamicVoices: false
	},
	async install(spec, context, signal, onProgress) {
		const repoId = spec.config?.modelId ?? "onnx-community/KittenTTS-Mini-v0.8-ONNX";
		const bundle = bundleFor(spec);
		// kitten_config.json is preferred; config.json is the fallback name.
		const configBuffer = await (async () => {
			const primary = await context.fetch(resolveKittenUrl(repoId, "kitten_config.json"), { signal });
			if (primary.ok) return primary.arrayBuffer();
			const fallback = await context.fetch(resolveKittenUrl(repoId, "config.json"), { signal });
			if (!fallback.ok) throw new Error(`Failed to fetch Kitten config: HTTP ${fallback.status}`);
			return fallback.arrayBuffer();
		})();
		await context.assetStore.stageAsset(bundle, KITTEN_CONFIG_ASSET_NAME, configBuffer);
		onProgress?.(.15);
		const config = JSON.parse(new TextDecoder().decode(configBuffer));
		const modelUrl = resolveKittenUrl(repoId, resolveKittenModelFile(repoId, config));
		const voicesUrl = resolveKittenUrl(repoId, config.voices);
		const modelBuffer = await context.fetch(modelUrl, { signal }).then(async (response) => {
			if (!response.ok) throw new Error(`Failed to fetch Kitten model: HTTP ${response.status}`);
			return response.arrayBuffer();
		});
		await context.assetStore.stageAsset(bundle, KITTEN_MODEL_ASSET_NAME, modelBuffer);
		onProgress?.(.8);
		const voicesBuffer = await context.fetch(voicesUrl, { signal }).then(async (response) => {
			if (!response.ok) throw new Error(`Failed to fetch Kitten voices: HTTP ${response.status}`);
			return response.arrayBuffer();
		});
		await context.assetStore.stageAsset(bundle, KITTEN_VOICES_ASSET_NAME, voicesBuffer);
		// Activation makes the staged assets visible atomically as a bundle.
		await context.assetStore.activateBundle(bundle, [
			KITTEN_CONFIG_ASSET_NAME,
			KITTEN_MODEL_ASSET_NAME,
			KITTEN_VOICES_ASSET_NAME
		]);
		onProgress?.(1);
	},
	createModel(spec, context) {
		return new KittenNodeModel(spec, context);
	}
};
358
+ //#endregion
359
+ //#region src/supertonic.runtime.ts
360
// Session keys for the four ONNX graphs that make up a Supertonic model.
const SUPERTONIC_MODEL_SESSION_KEYS = [
	"durationPredictor",
	"textEncoder",
	"vectorEstimator",
	"vocoder"
];
/** Standard-normal sample via the Box-Muller transform (u1 clamped away from 0 to keep log finite). */
function gaussianRandom() {
	const u1 = Math.max(1e-4, Math.random());
	const u2 = Math.random();
	return Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);
}
/** Flattens a [batch][rows][cols] nested array into one Float32Array. */
function flatten3d(data) {
	return new Float32Array(data.flat(2));
}
/** True when `value` is one of the Supertonic-supported language codes. */
function isValidLanguage(value) {
	return ["en", "ko", "es", "pt", "fr"].includes(value);
}
/** Abort error used when the caller does not supply a custom factory. */
function createDefaultAbortError() {
	const error = new Error("Aborted");
	error.name = "AbortError";
	return error;
}
382
/**
 * Pair of per-voice style tensors: `ttl` feeds the text encoder and
 * vector estimator (style_ttl input), `dp` the duration predictor
 * (style_dp input).
 */
var SupertonicStyle = class {
	constructor(ttl, dp) {
		this.ttl = ttl;
		this.dp = dp;
	}
};
388
/**
 * Converts raw text into the id/mask matrices the Supertonic graphs
 * consume, using a per-code-point lookup table (`indexer`).
 */
var SupertonicUnicodeProcessor = class {
	constructor(indexer) {
		this.indexer = indexer;
	}
	/**
	 * Normalizes each text and maps its characters to ids, padding every
	 * row to the longest text. Unknown/out-of-range code points become -1.
	 * NOTE(review): iteration is by UTF-16 code unit, so astral characters
	 * surviving normalization would occupy two slots — confirm intended.
	 */
	call(textList, langList) {
		const processedTexts = textList.map((text, index) => this.preprocessText(text, langList[index]));
		const lengths = processedTexts.map((text) => text.length);
		const maxLength = Math.max(...lengths);
		return {
			textIds: processedTexts.map((text) => {
				const row = Array.from({ length: maxLength }, () => 0);
				for (let index = 0; index < text.length; index += 1) {
					const codePoint = text.codePointAt(index);
					row[index] = codePoint != null && codePoint < this.indexer.length ? this.indexer[codePoint] : -1;
				}
				return row;
			}),
			textMask: this.lengthToMask(lengths, maxLength)
		};
	}
	/**
	 * NFKD-normalizes, strips emoji/symbols, canonicalizes quotes/dashes,
	 * expands a few abbreviations, collapses whitespace, guarantees a
	 * terminal punctuation mark, and wraps the text in <lang>...</lang>.
	 * Throws when `lang` is not a supported language code.
	 */
	preprocessText(text, lang) {
		let normalized = text.normalize("NFKD");
		// Remove emoji and pictographic ranges entirely.
		normalized = normalized.replace(/[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu, "");
		// Canonicalize dashes, quotes and a few separator characters.
		for (const [from, to] of Object.entries({
			"–": "-",
			"‑": "-",
			"—": "-",
			_: " ",
			"“": "\"",
			"”": "\"",
			"‘": "'",
			"’": "'",
			"´": "'",
			"`": "'",
			"[": " ",
			"]": " ",
			"|": " ",
			"/": " ",
			"#": " ",
			"→": " ",
			"←": " "
		})) normalized = normalized.replaceAll(from, to);
		normalized = normalized.replace(/[♥☆♡©\\]/g, "");
		// Spell out common abbreviations the model should read as words.
		for (const [from, to] of Object.entries({
			"@": " at ",
			"e.g.,": "for example, ",
			"i.e.,": "that is, "
		})) normalized = normalized.replaceAll(from, to);
		// Remove stray spaces before punctuation.
		normalized = normalized.replace(/ ,/g, ",").replace(/ \./g, ".").replace(/ !/g, "!").replace(/ \?/g, "?").replace(/ ;/g, ";").replace(/ :/g, ":").replace(/ '/g, "'");
		// Collapse doubled quote characters one pair at a time.
		while (normalized.includes("\"\"")) normalized = normalized.replace("\"\"", "\"");
		while (normalized.includes("''")) normalized = normalized.replace("''", "'");
		while (normalized.includes("``")) normalized = normalized.replace("``", "`");
		normalized = normalized.replace(/\s+/g, " ").trim();
		if (!/[.!?;:,'"')\]}…。」』】〉》›»]$/.test(normalized)) normalized += ".";
		if (!isValidLanguage(lang)) throw new Error(`Invalid Supertonic language: ${lang}`);
		return `<${lang}>${normalized}</${lang}>`;
	}
	/** Builds [1][maxLength] binary masks: 1 for positions inside each text, 0 for padding. */
	lengthToMask(lengths, maxLength) {
		return lengths.map((length) => {
			const row = Array.from({ length: maxLength }, () => 0);
			for (let index = 0; index < Math.min(length, maxLength); index += 1) row[index] = 1;
			return [row];
		});
	}
};
453
/** Builds a SupertonicStyle from a voice-style JSON payload ({ style_ttl, style_dp }, each with data + dims). */
function createSupertonicStyle(ortModule, json) {
	const ttlTensor = new ortModule.Tensor("float32", flatten3d(json.style_ttl.data), json.style_ttl.dims);
	const dpTensor = new ortModule.Tensor("float32", flatten3d(json.style_dp.data), json.style_dp.dims);
	return new SupertonicStyle(ttlTensor, dpTensor);
}
458
/**
 * Creates the four ONNX inference sessions sequentially, reporting
 * (completed, total) after each one finishes.
 */
async function loadSupertonicSessions(ortModule, modelBuffers, onProgress) {
	const sessions = {};
	let completed = 0;
	for (const key of SUPERTONIC_MODEL_SESSION_KEYS) {
		sessions[key] = await ortModule.InferenceSession.create(new Uint8Array(modelBuffers[key]));
		completed += 1;
		onProgress?.(completed, SUPERTONIC_MODEL_SESSION_KEYS.length);
	}
	return sessions;
}
467
/**
 * Supertonic synthesis pipeline: duration predictor -> text encoder ->
 * iterative vector estimator (denoising over `totalStep` passes) ->
 * vocoder. Operates on the four sessions created by
 * loadSupertonicSessions.
 */
var SupertonicTextToSpeech = class {
	// Output sample rate, taken from the autoencoder config (config.ae.sample_rate).
	sampleRate;
	constructor(ortModule, config, textProcessor, sessions) {
		this.ortModule = ortModule;
		this.config = config;
		this.textProcessor = textProcessor;
		this.sessions = sessions;
		this.sampleRate = config.ae.sample_rate;
	}
	/**
	 * Synthesizes `text` chunk by chunk, inserting `silenceDuration`
	 * seconds of silence between chunks. Returns { wav, duration }.
	 * `isAborted`/`createAbortError` allow cooperative cancellation.
	 */
	async generate(text, language, style, totalStep, speed = 1.05, silenceDuration = .3, onProgress, isAborted, createAbortError = createDefaultAbortError) {
		if (style.ttl.dims[0] !== 1) throw new Error("Supertonic only supports single-style synthesis in this adapter");
		// Korean text gets shorter chunks than other languages.
		const chunks = chunkText(text, language === "ko" ? 120 : 300);
		let wav = [];
		let duration = 0;
		for (let index = 0; index < chunks.length; index += 1) {
			if (isAborted?.()) throw createAbortError();
			const chunk = chunks[index];
			const result = await this.generateChunk(chunk, language, style, totalStep, speed, onProgress, isAborted, createAbortError);
			if (wav.length === 0) {
				wav = result.wav;
				duration = result.duration;
			} else {
				// Join subsequent chunks with a block of silence.
				const silenceLength = Math.floor(silenceDuration * this.sampleRate);
				wav = [
					...wav,
					...Array.from({ length: silenceLength }, () => 0),
					...result.wav
				];
				duration += result.duration + silenceDuration;
			}
		}
		return {
			wav: new Float32Array(wav),
			duration
		};
	}
	/** Runs the full four-stage pipeline for one text chunk. */
	async generateChunk(text, language, style, totalStep, speed, onProgress, isAborted, createAbortError = createDefaultAbortError) {
		const { textIds, textMask } = this.textProcessor.call([text], [language]);
		const textIdsTensor = new this.ortModule.Tensor("int64", new BigInt64Array(textIds.flat().map((value) => BigInt(value))), [1, textIds[0].length]);
		const textMaskTensor = new this.ortModule.Tensor("float32", new Float32Array(textMask.flat(2)), [
			1,
			1,
			textMask[0][0].length
		]);
		// Stage 1: predict the utterance duration, then scale it by speed.
		const durationOutputs = await this.sessions.durationPredictor.run({
			text_ids: textIdsTensor,
			style_dp: style.dp,
			text_mask: textMaskTensor
		});
		const durations = Array.from(durationOutputs.duration.data);
		for (let index = 0; index < durations.length; index += 1) durations[index] /= speed;
		// Stage 2: encode text into embeddings conditioned on the style.
		const textEncoderOutputs = await this.sessions.textEncoder.run({
			text_ids: textIdsTensor,
			style_ttl: style.ttl,
			text_mask: textMaskTensor
		});
		// Stage 3: start from Gaussian noise and refine over totalStep passes.
		let { xt, latentMask } = sampleNoisyLatent(durations, this.sampleRate, this.config.ae.base_chunk_size, this.config.ttl.chunk_compress_factor, this.config.ttl.latent_dim);
		const latentMaskTensor = new this.ortModule.Tensor("float32", new Float32Array(latentMask.flat(2)), [
			1,
			1,
			latentMask[0][0].length
		]);
		const totalStepTensor = new this.ortModule.Tensor("float32", new Float32Array([totalStep]), [1]);
		for (let step = 0; step < totalStep; step += 1) {
			if (isAborted?.()) throw createAbortError();
			onProgress?.(step + 1, totalStep);
			const xtTensor = new this.ortModule.Tensor("float32", new Float32Array(xt.flat(2)), [
				1,
				xt[0].length,
				xt[0][0].length
			]);
			const currentStepTensor = new this.ortModule.Tensor("float32", new Float32Array([step]), [1]);
			const vectorEstimatorOutputs = await this.sessions.vectorEstimator.run({
				noisy_latent: xtTensor,
				text_emb: textEncoderOutputs.text_emb,
				style_ttl: style.ttl,
				latent_mask: latentMaskTensor,
				text_mask: textMaskTensor,
				current_step: currentStepTensor,
				total_step: totalStepTensor
			});
			// Feed the denoised latent back in for the next refinement step.
			xt = reshapeLatent(Array.from(vectorEstimatorOutputs.denoised_latent.data), xt[0].length, xt[0][0].length);
		}
		// Stage 4: decode the final latent into a waveform.
		const latentTensor = new this.ortModule.Tensor("float32", new Float32Array(xt.flat(2)), [
			1,
			xt[0].length,
			xt[0][0].length
		]);
		const vocoderOutputs = await this.sessions.vocoder.run({ latent: latentTensor });
		return {
			wav: Array.from(vocoderOutputs.wav_tts.data),
			duration: durations[0] ?? 0
		};
	}
};
562
/**
 * Draws the initial Gaussian latent for the vector estimator plus a binary
 * mask marking latent frames covered by the first utterance's duration.
 * Returns xt shaped [1][latentDim*chunkCompressFactor][latentLength].
 */
function sampleNoisyLatent(durations, sampleRate, baseChunkSize, chunkCompressFactor, latentDim) {
	const maxDuration = Math.max(...durations);
	const wavLengthMax = Math.floor(maxDuration * sampleRate);
	const chunkSize = baseChunkSize * chunkCompressFactor;
	// Equivalent to ceil(wavLengthMax / chunkSize).
	const latentLength = Math.floor((wavLengthMax + chunkSize - 1) / chunkSize);
	const latentDimensions = latentDim * chunkCompressFactor;
	const wavLengths = durations.map((duration) => Math.floor(duration * sampleRate));
	const noise = [];
	for (let dimension = 0; dimension < latentDimensions; dimension += 1) {
		noise.push(Array.from({ length: latentLength }, () => gaussianRandom()));
	}
	const mask = Array.from({ length: latentLength }, (_, frame) => (frame * chunkSize < wavLengths[0] ? 1 : 0));
	return {
		xt: [noise],
		latentMask: [[mask]]
	};
}
582
/** Re-nests a flat array into [1][dimensions][length], row-major; missing trailing values become 0. */
function reshapeLatent(data, dimensions, length) {
	const rows = [];
	for (let dimension = 0; dimension < dimensions; dimension += 1) {
		const start = dimension * length;
		const row = [];
		for (let column = 0; column < length; column += 1) row.push(data[start + column] ?? 0);
		rows.push(row);
	}
	return [rows];
}
595
/**
 * Splits text into chunks of roughly `maxLength` characters: blank-line
 * separated paragraphs first, then sentence boundaries that do not follow
 * common abbreviations or single-initial periods. A sentence longer than
 * maxLength is kept whole.
 */
function chunkText(text, maxLength) {
	const sentenceBoundary = /(?<!Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.|Sr\.|Jr\.|Ph\.D\.|etc\.|e\.g\.|i\.e\.|vs\.|Inc\.|Ltd\.|Co\.|Corp\.|St\.|Ave\.|Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+/;
	const chunks = [];
	for (const rawParagraph of text.trim().split(/\n\s*\n+/).filter(Boolean)) {
		const paragraph = rawParagraph.trim();
		if (!paragraph) continue;
		const sentences = paragraph.split(sentenceBoundary) || [paragraph];
		let current = "";
		for (const sentence of sentences) {
			if (current.length + sentence.length + 1 <= maxLength) {
				// Sentence still fits: append it to the running chunk.
				current += `${current ? " " : ""}${sentence}`;
			} else {
				if (current) chunks.push(current.trim());
				current = sentence;
			}
		}
		if (current) chunks.push(current.trim());
	}
	// Fall back to the whole (trimmed) text when nothing was produced.
	return chunks.length ? chunks : [text.trim()];
}
615
+ //#endregion
616
+ //#region src/supertonic.shared.ts
617
// Voice style identifiers shipped with Supertonic (F1–F5, M1–M5).
const SUPERTONIC_STYLE_IDS = [
	"F1",
	"F2",
	"F3",
	"F4",
	"F5",
	"M1",
	"M2",
	"M3",
	"M4",
	"M5"
];
// Languages the Supertonic text processor accepts.
const SUPERTONIC_LANGUAGES = [
	{ id: "en", label: "English" },
	{ id: "ko", label: "Korean" },
	{ id: "es", label: "Spanish" },
	{ id: "pt", label: "Portuguese" },
	{ id: "fr", label: "French" }
];
// Bundler residue: the source module built a language lookup map whose
// binding was tree-shaken away; the expression itself was kept.
new Map(SUPERTONIC_LANGUAGES.map((language) => [language.id, language]));
/**
 * Parses a "lang:style" voice id (e.g. "ko:F2"). Unknown or missing parts
 * fall back to "en" / "M1"; a nullish voiceId defaults to "en:M1".
 */
function parseSupertonicVoiceId(voiceId) {
	const [languagePart, stylePart] = (voiceId ?? "en:M1").split(":");
	const language = SUPERTONIC_LANGUAGES.find((entry) => entry.id === languagePart)?.id ?? "en";
	const styleId = SUPERTONIC_STYLE_IDS.includes(stylePart) ? stylePart : "M1";
	return { language, styleId };
}
659
+ //#endregion
660
+ //#region src/supertonic.ts
661
// Asset paths of the four Supertonic ONNX graphs, keyed by session name.
const SUPERTONIC_MODEL_ASSET_NAMES = {
	durationPredictor: "onnx/duration_predictor.onnx",
	textEncoder: "onnx/text_encoder.onnx",
	vectorEstimator: "onnx/vector_estimator.onnx",
	vocoder: "onnx/vocoder.onnx"
};
/** Decodes a UTF-8 buffer and parses it as JSON. */
function readJson(buffer) {
	const decoded = new TextDecoder().decode(buffer);
	return JSON.parse(decoded);
}
670
/**
 * Synthesizing TTS model backed by Supertonic ONNX sessions running on
 * onnxruntime-node. Assets (config, unicode indexer, voice styles, and the
 * four model sessions) are resolved via loadAsset, which prefers the adapter
 * context's asset store and falls back to fetching the asset URL.
 */
var SupertonicNodeModel = class {
	kind = "synthesizing";
	modelId;
	adapterId;
	// SupertonicTextToSpeech instance; null until load() completes.
	tts = null;
	// ONNX sessions keyed per SUPERTONIC_MODEL_SESSION_KEYS; released in dispose().
	sessions = null;
	// styleId -> style tensor data created by createSupertonicStyle.
	styles = /* @__PURE__ */ new Map();
	constructor(spec, context) {
		this.spec = spec;
		this.context = context;
		this.modelId = spec.id;
		this.adapterId = spec.adapterId;
	}
	/**
	 * Load config, voice styles, and ONNX sessions.
	 * Progress phases reported via onProgress: styles finish at 0.2,
	 * model buffers fill 0.2–0.6, session creation fills 0.6–1.0.
	 */
	async load(signal, onProgress) {
		const config = readJson(await this.loadAsset("onnx/tts.json", signal));
		const unicodeIndexer = readJson(await this.loadAsset("onnx/unicode_indexer.json", signal));
		const ortModule = await import("onnxruntime-node");
		if (signal.aborted) throw this.context.createAbortError();
		const styles = /* @__PURE__ */ new Map();
		for (let index = 0; index < SUPERTONIC_STYLE_IDS.length; index += 1) {
			const styleId = SUPERTONIC_STYLE_IDS[index];
			const json = readJson(await this.loadAsset(`voice_styles/${styleId}.json`, signal));
			styles.set(styleId, createSupertonicStyle(ortModule, json));
		}
		onProgress?.(.2);
		const modelBuffers = {};
		for (let index = 0; index < SUPERTONIC_MODEL_SESSION_KEYS.length; index += 1) {
			const key = SUPERTONIC_MODEL_SESSION_KEYS[index];
			modelBuffers[key] = await this.loadAsset(SUPERTONIC_MODEL_ASSET_NAMES[key], signal);
			onProgress?.(.2 + (index + 1) / SUPERTONIC_MODEL_SESSION_KEYS.length * .4);
		}
		const sessions = await loadSupertonicSessions(ortModule, modelBuffers, (completed, total) => {
			onProgress?.(.6 + completed / total * .4);
		});
		if (signal.aborted) throw this.context.createAbortError();
		// Only commit instance state after everything loaded successfully.
		this.styles = styles;
		this.sessions = sessions;
		this.tts = new SupertonicTextToSpeech(ortModule, config, new SupertonicUnicodeProcessor(unicodeIndexer), sessions);
		onProgress?.(1);
	}
	/**
	 * Synthesize text to audio. voiceId is "<language>:<style>" (parsed with
	 * fallbacks by parseSupertonicVoiceId); totalStep defaults to 2 unless
	 * overridden via spec.config.totalStep; speed defaults to 1.
	 * The 0.3 argument's semantics are defined by SupertonicTextToSpeech.generate
	 * (not visible in this bundle region) — presumably a sampling parameter.
	 * Cancellation is polled through the () => signal.aborted callback.
	 */
	async generate(text, voiceId, signal, speed) {
		if (!this.tts) throw new Error("Supertonic model not loaded");
		const { language, styleId } = parseSupertonicVoiceId(voiceId);
		const style = this.styles.get(styleId);
		if (!style) throw new Error(`Unsupported Supertonic style: ${styleId}`);
		return pcmToAudioData((await this.tts.generate(text, language, style, Number(this.spec.config?.totalStep ?? 2), speed ?? 1, .3, void 0, () => signal.aborted, () => this.context.createAbortError())).wav, this.tts.sampleRate);
	}
	// Voices come from the static model spec; this adapter has no dynamic voices.
	listVoices() {
		return this.spec.voices ?? [];
	}
	/** Drop all references and release each ONNX session that exposes release(). */
	dispose() {
		this.tts = null;
		this.styles.clear();
		const sessions = this.sessions;
		this.sessions = null;
		if (!sessions) return;
		for (const session of Object.values(sessions)) session.release?.call(session);
	}
	/**
	 * Fetch a named model asset, trying the asset store first.
	 * NOTE(review): fetched bytes are not written back into the asset store
	 * here — caching appears to be read-only at this layer; confirm the store
	 * is populated elsewhere.
	 */
	async loadAsset(assetName, signal) {
		const bundle = {
			adapterId: this.spec.adapterId,
			modelId: this.spec.id,
			revision: this.spec.revision
		};
		const cached = await this.context.assetStore.getAsset(bundle, assetName);
		if (cached) return cached;
		const asset = getModelAssets(this.spec).find((entry) => entry.name === assetName);
		if (!asset) throw new Error(`Supertonic asset "${assetName}" is missing from model "${this.spec.id}"`);
		const response = await this.context.fetch(asset.url, { signal });
		if (!response.ok) throw new Error(`Failed to fetch Supertonic asset "${assetName}": HTTP ${response.status}`);
		return response.arrayBuffer();
	}
};
743
/**
 * TTS adapter descriptor for Supertonic models on Node.js.
 * Synthesize-only: no install step, no direct speak, no streaming, and
 * voices are declared statically on the model spec.
 */
const supertonicNodeAdapter = {
	id: "supertonic-node",
	name: "Supertonic Node",
	capabilities: {
		install: false,
		speak: false,
		synthesize: true,
		stream: false,
		dynamicVoices: false
	},
	createModel: (spec, context) => new SupertonicNodeModel(spec, context)
};
757
+ //#endregion
758
+ //#region src/index.ts
759
// Default model reference handed to KokoroTTS.from_pretrained when the
// model spec does not override it via spec.config.modelId.
const DEFAULT_KOKORO_MODEL_REF = "onnx-community/Kokoro-82M-v1.0-ONNX";
760
/**
 * Clamp a playback speed into Kokoro's supported range [0.5, 2].
 * Non-finite input (undefined, NaN, ±Infinity) yields undefined so the
 * engine falls back to its default speed.
 */
function normalizeSpeakSpeed(speed) {
	if (Number.isFinite(speed)) {
		return Math.max(0.5, Math.min(2, speed));
	}
	return undefined;
}
764
/**
 * Convert a sequence of phoneme characters into Piper model input ids using
 * the voice config's phoneme_id_map. Emits the "^" (begin) ids first and the
 * "$" (end) ids last when present, and appends the "_" separator ids after
 * every mapped phoneme. Phonemes absent from the map are skipped entirely,
 * including their separator.
 */
function phonemesToIds(phonemes, config) {
	const map = config.phoneme_id_map;
	const begin = map["^"];
	const separator = map["_"];
	const end = map["$"];
	const ids = [];
	if (begin) ids.push(...begin);
	for (const phoneme of phonemes) {
		const mapped = map[phoneme];
		if (mapped === undefined) continue;
		ids.push(...mapped);
		if (separator) ids.push(...separator);
	}
	if (end) ids.push(...end);
	return ids;
}
777
/**
 * Resolve a named asset for a model spec. The adapter context's asset store
 * is consulted first; on a miss, the asset's URL from the spec is fetched.
 *
 * @param spec Model spec carrying adapterId/id/revision and asset metadata.
 * @param assetName Name of the asset to resolve.
 * @param context Adapter context providing assetStore and fetch.
 * @param signal Abort signal forwarded to the network fetch.
 * @returns The asset bytes as an ArrayBuffer.
 * @throws If the asset is not declared on the spec, or the fetch is not ok.
 */
async function loadSpecAsset(spec, assetName, context, signal) {
	const { adapterId, id: modelId, revision } = spec;
	const fromStore = await context.assetStore.getAsset({
		adapterId,
		modelId,
		revision
	}, assetName);
	if (fromStore) return fromStore;
	const descriptor = getModelAssets(spec).find(({ name }) => name === assetName);
	if (!descriptor) {
		throw new Error(`Asset "${assetName}" is missing from model "${spec.id}"`);
	}
	const response = await context.fetch(descriptor.url, { signal });
	if (!response.ok) {
		throw new Error(`Failed to fetch ${descriptor.name}: HTTP ${response.status}`);
	}
	return response.arrayBuffer();
}
791
/**
 * Synthesizing TTS model backed by kokoro-js (dynamically imported).
 * Voices are discovered from the loaded KokoroTTS instance rather than
 * declared on the spec.
 */
var KokoroNodeModel = class {
	kind = "synthesizing";
	modelId;
	adapterId;
	// KokoroTTS instance; null until load() completes.
	tts = null;
	// Voice list derived from tts.voices during load().
	voices = [];
	constructor(spec, createAbortError) {
		this.spec = spec;
		this.createAbortError = createAbortError;
		this.modelId = spec.id;
		this.adapterId = spec.adapterId;
	}
	/**
	 * Download/load the Kokoro model (q8 quantization, CPU) and build the
	 * voice list. Progress is forwarded from kokoro-js download events.
	 * NOTE(review): the abort signal is only checked before and after
	 * from_pretrained — it is not passed through, so an in-flight download
	 * is not cancelled; confirm whether kokoro-js supports a signal option.
	 */
	async load(signal, onProgress) {
		if (signal.aborted) throw this.createAbortError();
		const kokoroModule = await import("kokoro-js");
		const modelRef = this.spec.config?.modelId ?? DEFAULT_KOKORO_MODEL_REF;
		this.tts = await kokoroModule.KokoroTTS.from_pretrained(modelRef, {
			dtype: "q8",
			device: "cpu",
			progress_callback(progress) {
				if (progress.status === "progress" && progress.loaded != null && progress.total) onProgress?.(progress.loaded / progress.total);
			}
		});
		if (signal.aborted) throw this.createAbortError();
		// Fallback language heuristic: voice ids beginning with "b" are mapped
		// to en-GB, everything else to en-US, when kokoro-js gives no language.
		this.voices = Object.entries(this.tts.voices ?? {}).map(([id, info]) => ({
			id,
			name: info?.name ?? id,
			language: info?.language ?? (id.startsWith("b") ? "en-GB" : "en-US"),
			gender: info?.gender
		}));
	}
	/**
	 * Synthesize text with the given voice. Speed is clamped to [0.5, 2] by
	 * normalizeSpeakSpeed (undefined when non-finite). Sample rate falls back
	 * to 24000 Hz when kokoro-js does not report one.
	 */
	async generate(text, voiceId, signal, speed) {
		if (!this.tts) throw new Error("Kokoro model not loaded");
		if (signal.aborted) throw this.createAbortError();
		const result = await this.tts.generate(text, {
			voice: voiceId,
			speed: normalizeSpeakSpeed(speed)
		});
		if (signal.aborted) throw this.createAbortError();
		return {
			sampleRate: result.sampling_rate ?? 24e3,
			channels: [result.audio]
		};
	}
	// Prefer dynamically discovered voices; fall back to the spec's static list.
	listVoices() {
		return this.voices.length ? this.voices : this.spec.voices ?? [];
	}
	dispose() {
		this.tts = null;
		this.voices = [];
	}
};
843
/**
 * Synthesizing TTS model for Piper voices using native ONNX Runtime
 * (onnxruntime-node) plus the phonemizer package for text-to-phoneme
 * conversion. One model = one voice: the voiceId argument to generate()
 * is ignored and multi-speaker models always use speaker 0.
 */
var PiperNodeModel = class {
	kind = "synthesizing";
	modelId;
	adapterId;
	// ONNX InferenceSession; null until load() completes.
	session = null;
	// phonemize(text, languageTag) function from the phonemizer package.
	phonemize = null;
	// Parsed Piper voice JSON (phoneme_id_map, inference scales, audio info).
	voiceConfig = null;
	constructor(spec, context) {
		this.spec = spec;
		this.context = context;
		this.modelId = spec.id;
		this.adapterId = spec.adapterId;
	}
	/**
	 * Load the voice config (.json) and model weights (.onnx) declared on the
	 * spec — fetched in parallel — then create the ONNX session and import
	 * the phonemizer. Instance state is only committed once everything loads.
	 */
	async load(signal) {
		const voiceAsset = getModelAssets(this.spec).find((asset) => asset.name.endsWith(".json"));
		const modelAsset = getModelAssets(this.spec).find((asset) => asset.name.endsWith(".onnx"));
		if (!voiceAsset || !modelAsset) throw new Error(`Piper model "${this.spec.id}" is missing required assets`);
		const [configData, modelData] = await Promise.all([loadSpecAsset(this.spec, voiceAsset.name, this.context, signal), loadSpecAsset(this.spec, modelAsset.name, this.context, signal)]);
		if (signal.aborted) throw this.context.createAbortError();
		const session = await (await import("onnxruntime-node")).InferenceSession.create(new Uint8Array(modelData));
		const phonemizerModule = await import("phonemizer");
		if (signal.aborted) throw this.context.createAbortError();
		this.voiceConfig = JSON.parse(new TextDecoder().decode(configData));
		this.session = session;
		this.phonemize = phonemizerModule.phonemize;
	}
	/**
	 * Synthesize text: phonemize -> map phonemes to ids -> run the ONNX
	 * session -> return raw samples at the config's sample rate.
	 * _voiceId is unused (the model itself is the voice).
	 */
	async generate(text, _voiceId, signal) {
		if (!this.session || !this.phonemize || !this.voiceConfig) throw new Error("Piper model not loaded");
		// Language tag for the phonemizer comes from the spec's first language.
		const languageTag = this.spec.languages[0]?.toLowerCase() ?? "en-us";
		const phonemeTokens = await this.phonemize(text, languageTag);
		if (signal.aborted) throw this.context.createAbortError();
		// Join token strings and split into individual characters, since the
		// Piper phoneme_id_map is keyed per character.
		const ids = phonemesToIds(Array.from(phonemeTokens.join("")), this.voiceConfig);
		if (!ids.length) throw new Error("Phoneme mapping produced an empty sequence");
		const ort = await import("onnxruntime-node");
		// Piper expects int64 id/length tensors and a float32 [noise, length,
		// noise_w] scales triple taken from the voice config.
		const feedsMap = {
			input: new ort.Tensor("int64", new BigInt64Array(ids.map(BigInt)), [1, ids.length]),
			input_lengths: new ort.Tensor("int64", new BigInt64Array([BigInt(ids.length)]), [1]),
			scales: new ort.Tensor("float32", new Float32Array([
				this.voiceConfig.inference.noise_scale,
				this.voiceConfig.inference.length_scale,
				this.voiceConfig.inference.noise_w
			]), [3])
		};
		// Multi-speaker models require a speaker id; hard-coded to speaker 0.
		if (this.voiceConfig.num_speakers > 1) feedsMap.sid = new ort.Tensor("int64", new BigInt64Array([BigInt(0)]), [1]);
		const results = await this.session.run(feedsMap);
		// Take the first (assumed only) output tensor's data as the audio.
		const audioData = results[Object.keys(results)[0]].data;
		if (signal.aborted) throw this.context.createAbortError();
		return {
			sampleRate: this.voiceConfig.audio.sample_rate,
			channels: [audioData]
		};
	}
	// Voices come from the static model spec; this adapter has no dynamic voices.
	listVoices() {
		return this.spec.voices ?? [];
	}
	dispose() {
		this.session = null;
		this.phonemize = null;
		this.voiceConfig = null;
	}
};
904
/**
 * TTS adapter descriptor for Kokoro models on Node.js using kokoro-js.
 * Synthesize-only; voices are discovered dynamically from the loaded model.
 */
const kokoroNodeAdapter = {
	id: "kokoro-node",
	name: "Kokoro Node",
	capabilities: {
		install: false,
		speak: false,
		synthesize: true,
		stream: false,
		dynamicVoices: true
	},
	createModel: (spec, context) => new KokoroNodeModel(spec, () => context.createAbortError())
};
919
/**
 * TTS adapter descriptor for Piper models on Node.js using native ONNX
 * Runtime. Synthesize-only; voices are declared statically on the model spec.
 */
const piperNodeAdapter = {
	id: "piper-node",
	name: "Piper Node",
	capabilities: {
		install: false,
		speak: false,
		synthesize: true,
		stream: false,
		dynamicVoices: false
	},
	createModel: (spec, context) => new PiperNodeModel(spec, context)
};
934
/** All official Node.js TTS adapters bundled for convenience. */
// NOTE: kittenNodeAdapter is defined in an earlier region of this bundle.
const officialNodeAdapters = [
	kokoroNodeAdapter,
	piperNodeAdapter,
	kittenNodeAdapter,
	supertonicNodeAdapter
];
941
+ //#endregion
942
+ export { kittenNodeAdapter, kokoroNodeAdapter, officialNodeAdapters, piperNodeAdapter, supertonicNodeAdapter };
package/package.json ADDED
@@ -0,0 +1,57 @@
1
+ {
2
+ "name": "@polytts/node-adapters",
3
+ "version": "0.1.0",
4
+ "description": "Official Node adapter implementations for polytts.",
5
+ "keywords": [
6
+ "adapters",
7
+ "electron",
8
+ "node",
9
+ "polytts",
10
+ "raycast",
11
+ "text-to-speech",
12
+ "tts"
13
+ ],
14
+ "homepage": "https://github.com/Dunqing/polytts/tree/main/packages/node-adapters#readme",
15
+ "bugs": {
16
+ "url": "https://github.com/Dunqing/polytts/issues"
17
+ },
18
+ "license": "MIT",
19
+ "repository": {
20
+ "type": "git",
21
+ "url": "git+https://github.com/Dunqing/polytts.git",
22
+ "directory": "packages/node-adapters"
23
+ },
24
+ "files": [
25
+ "dist"
26
+ ],
27
+ "type": "module",
28
+ "exports": {
29
+ ".": {
30
+ "types": "./dist/index.d.mts",
31
+ "default": "./dist/index.mjs"
32
+ },
33
+ "./package.json": "./package.json"
34
+ },
35
+ "publishConfig": {
36
+ "access": "public"
37
+ },
38
+ "dependencies": {
39
+ "kokoro-js": "latest",
40
+ "onnxruntime-node": "latest",
41
+ "phonemizer": "latest",
42
+ "@polytts/core": "0.1.0"
43
+ },
44
+ "devDependencies": {
45
+ "@types/node": "^25.6.0",
46
+ "vite-plus": "latest",
47
+ "@polytts/presets": "0.1.0"
48
+ },
49
+ "scripts": {
50
+ "build": "vp pack",
51
+ "test": "vp test",
52
+ "test:run": "vp test run"
53
+ },
54
+ "main": "./dist/index.mjs",
55
+ "module": "./dist/index.mjs",
56
+ "types": "./dist/index.d.mts"
57
+ }