@polytts/node-adapters 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +31 -0
- package/dist/index.d.mts +17 -0
- package/dist/index.mjs +942 -0
- package/package.json +57 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 DengQing dengqing0821@gmail.com
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# @polytts/node-adapters
|
|
2
|
+
|
|
3
|
+
[![npm version](https://img.shields.io/npm/v/@polytts/node-adapters.svg)](https://www.npmjs.com/package/@polytts/node-adapters)
|
|
4
|
+
|
|
5
|
+
Official Node adapter implementations for [`polytts`](https://github.com/Dunqing/polytts).
|
|
6
|
+
|
|
7
|
+
Use this package when you want Node-capable adapters without the higher-level Node controller.
|
|
8
|
+
|
|
9
|
+
## Install
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
npm install @polytts/node-adapters
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Usage
|
|
16
|
+
|
|
17
|
+
```ts
|
|
18
|
+
import { officialNodeAdapters, piperNodeAdapter } from "@polytts/node-adapters";
|
|
19
|
+
|
|
20
|
+
void officialNodeAdapters;
|
|
21
|
+
void piperNodeAdapter;
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Included adapters
|
|
25
|
+
|
|
26
|
+
- `kokoroNodeAdapter`
|
|
27
|
+
- `piperNodeAdapter`
|
|
28
|
+
- `kittenNodeAdapter`
|
|
29
|
+
- `supertonicNodeAdapter`
|
|
30
|
+
|
|
31
|
+
Catalog and model metadata live in `@polytts/presets`.
|
package/dist/index.d.mts
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import { SynthesizingModelInstance, TTSAdapter } from "@polytts/core";
|
|
2
|
+
|
|
3
|
+
//#region src/kitten.d.ts
/** TTS adapter for KittenTTS models on Node.js using native ONNX Runtime. */
declare const kittenNodeAdapter: TTSAdapter<SynthesizingModelInstance>;
//#endregion
//#region src/supertonic.d.ts
/** TTS adapter for Supertonic models on Node.js using native ONNX Runtime. */
declare const supertonicNodeAdapter: TTSAdapter<SynthesizingModelInstance>;
//#endregion
//#region src/index.d.ts
/** TTS adapter for Kokoro models on Node.js using kokoro-js. */
declare const kokoroNodeAdapter: TTSAdapter<SynthesizingModelInstance>;
/** TTS adapter for Piper models on Node.js using native ONNX Runtime. */
declare const piperNodeAdapter: TTSAdapter<SynthesizingModelInstance>;
/** All official Node.js TTS adapters bundled for convenience. */
declare const officialNodeAdapters: TTSAdapter<SynthesizingModelInstance>[];
//#endregion
export { kittenNodeAdapter, kokoroNodeAdapter, officialNodeAdapters, piperNodeAdapter, supertonicNodeAdapter };
|
package/dist/index.mjs
ADDED
|
@@ -0,0 +1,942 @@
|
|
|
1
|
+
import { getModelAssets, pcmToAudioData } from "@polytts/core";
|
|
2
|
+
import { inflateRawSync } from "node:zlib";
|
|
3
|
+
//#region src/kitten-npz-reader.ts
|
|
4
|
+
/**
 * Parse the header of a NumPy `.npy` buffer.
 *
 * Validates the `\x93NUMPY` magic, reads the version-dependent header length
 * (uint16 for format v1, uint32 for v2+), and extracts the dtype descriptor
 * and shape tuple from the Python-dict header text.
 *
 * Returns `{ dtype, shape, dataOffset }` where `dataOffset` is the byte
 * offset of the array payload. Throws on a bad magic or missing dtype.
 */
function parseNpyHeader(bytes) {
	const magic = String.fromCharCode(bytes[1], bytes[2], bytes[3], bytes[4], bytes[5]);
	if (bytes[0] !== 147 || magic !== "NUMPY") throw new Error("Not a valid .npy file");
	// Format v1 stores the header length as a little-endian uint16 at offset 8;
	// later versions widen it to uint32, pushing the header text to offset 12.
	const isV1 = bytes[6] === 1;
	const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
	const headerLength = isV1 ? view.getUint16(8, true) : view.getUint32(8, true);
	const headerOffset = isV1 ? 10 : 12;
	const header = new TextDecoder().decode(bytes.slice(headerOffset, headerOffset + headerLength));
	const dtypeMatch = header.match(/'descr'\s*:\s*'([^']+)'/);
	const dtype = dtypeMatch ? dtypeMatch[1] : void 0;
	const shapeText = header.match(/'shape'\s*:\s*\(([^)]*)\)/)?.[1];
	let shape = [];
	if (shapeText !== void 0) {
		shape = shapeText
			.split(",")
			.map((part) => Number.parseInt(part.trim(), 10))
			.filter((dim) => Number.isFinite(dim));
	}
	if (!dtype) throw new Error(`Could not parse dtype from .npy header: ${header}`);
	return {
		dtype,
		shape,
		dataOffset: headerOffset + headerLength
	};
}
|
|
20
|
+
/**
 * Decode a `.npy` buffer into `{ data: Float32Array, shape }`.
 *
 * Supports float32 payloads directly and down-converts float64; any other
 * dtype throws. The payload is copied into a fresh ArrayBuffer so the typed
 * array view starts at a properly aligned offset.
 */
function npyToFloat32(bytes) {
	const { dtype, shape, dataOffset } = parseNpyHeader(bytes);
	// Copy into a standalone buffer: slicing mid-buffer may leave the payload
	// misaligned for Float32Array/Float64Array views.
	const raw = bytes.slice(dataOffset);
	const aligned = new ArrayBuffer(raw.length);
	new Uint8Array(aligned).set(raw);
	switch (dtype) {
		case "<f4":
		case "float32":
			return {
				data: new Float32Array(aligned),
				shape
			};
		case "<f8":
		case "float64": {
			const doubles = new Float64Array(aligned);
			const floats = new Float32Array(doubles.length);
			for (let index = 0; index < doubles.length; index += 1) floats[index] = doubles[index];
			return {
				data: floats,
				shape
			};
		}
		default:
			throw new Error(`Unsupported npy dtype: ${dtype}`);
	}
}
|
|
40
|
+
/**
 * Minimal ZIP reader: returns a Map of file name -> decompressed bytes.
 *
 * Walks the central directory found via the End Of Central Directory record.
 * Supports stored (method 0) and deflate (method 8) entries; other methods
 * are skipped. NOTE(review): only 32-bit size/offset fields are read, so
 * ZIP64 archives are not supported — acceptable for voices.npz-sized files,
 * but verify if larger archives ever appear.
 */
function extractZipEntries(buffer) {
	const bytes = new Uint8Array(buffer);
	const view = new DataView(buffer);
	const entries = /* @__PURE__ */ new Map();
	let eocdOffset = -1;
	// Scan backwards for the EOCD signature 0x06054b50 (= 101010256).
	// Starting at length - 22 assumes an empty archive comment; a trailing
	// comment still works because the scan keeps moving backwards.
	for (let index = bytes.length - 22; index >= 0; index -= 1) if (view.getUint32(index, true) === 101010256) {
		eocdOffset = index;
		break;
	}
	if (eocdOffset === -1) throw new Error("Could not find End of Central Directory");
	const centralDirectoryOffset = view.getUint32(eocdOffset + 16, true);
	const centralDirectoryEntries = view.getUint16(eocdOffset + 10, true);
	let offset = centralDirectoryOffset;
	for (let index = 0; index < centralDirectoryEntries; index += 1) {
		// Central directory file header signature 0x02014b50 (= 33639248).
		if (view.getUint32(offset, true) !== 33639248) break;
		const compressionMethod = view.getUint16(offset + 10, true);
		const compressedSize = view.getUint32(offset + 20, true);
		const uncompressedSize = view.getUint32(offset + 24, true);
		const fileNameLength = view.getUint16(offset + 28, true);
		const extraLength = view.getUint16(offset + 30, true);
		const commentLength = view.getUint16(offset + 32, true);
		const localHeaderOffset = view.getUint32(offset + 42, true);
		const fileName = new TextDecoder().decode(bytes.slice(offset + 46, offset + 46 + fileNameLength));
		// The local header's name/extra lengths can differ from the central
		// directory's, so re-read them to locate the data start precisely.
		const localFileNameLength = view.getUint16(localHeaderOffset + 26, true);
		const localExtraLength = view.getUint16(localHeaderOffset + 28, true);
		const dataStart = localHeaderOffset + 30 + localFileNameLength + localExtraLength;
		const compressed = bytes.slice(dataStart, dataStart + compressedSize);
		let fileData;
		if (compressionMethod === 0) fileData = compressed;
		else if (compressionMethod === 8) fileData = inflateRawSync(compressed);
		else {
			// Unsupported compression: skip this entry but keep walking.
			offset += 46 + fileNameLength + extraLength + commentLength;
			continue;
		}
		// Sanity check: decompressed length must match the directory record.
		if (fileData.length !== uncompressedSize) throw new Error(`Unexpected size for archive entry "${fileName}"`);
		entries.set(fileName, fileData);
		offset += 46 + fileNameLength + extraLength + commentLength;
	}
	return entries;
}
|
|
80
|
+
/**
 * Extract every `<name>.npy` entry from the voices archive into a map of
 * voice name -> { data: Float32Array, shape: [rows, styleDim] }.
 * Non-.npy entries are ignored.
 */
function loadKittenVoicesFromArchive(archive) {
	const voices = {};
	for (const [fileName, fileData] of extractZipEntries(archive)) {
		if (!fileName.endsWith(".npy")) continue;
		const { data, shape } = npyToFloat32(fileData);
		// Fall back to a single 1 x N row when the stored shape is absent/zero.
		voices[fileName.replace(/\.npy$/, "")] = {
			data,
			shape: [shape[0] || 1, shape[1] || data.length]
		};
	}
	return voices;
}
|
|
94
|
+
//#endregion
|
|
95
|
+
//#region src/kitten-tokenizer.ts
|
|
96
|
+
/**
|
|
97
|
+
* Phoneme tokenizer for the KittenTTS ONNX model. Ported from the upstream Python TextCleaner
|
|
98
|
+
* class: https://github.com/KittenML/KittenTTS/blob/main/kittentts/onnx_model.py
|
|
99
|
+
*/
|
|
100
|
+
// Padding symbol; always index 0 of the vocabulary.
const PAD = "$";
// Punctuation symbols accepted by the model (includes non-breaking space at the end).
const PUNCTUATION = ";:,.!?¡¿—…\"«»\"\" ";
const LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
// IPA phoneme symbols plus stress/length marks and intonation arrows.
const LETTERS_IPA = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ";
// Vocabulary order must stay PAD, punctuation, letters, IPA — token ids are
// positional. Spreading a string iterates by code point, one symbol per slot.
const symbols = [
	PAD,
	...PUNCTUATION,
	...LETTERS,
	...LETTERS_IPA
];
// symbol -> vocabulary index; a symbol appearing twice keeps its last index.
const charToIndex = {};
for (let index = 0; index < symbols.length; index += 1) charToIndex[symbols[index]] = index;
|
|
112
|
+
/**
 * Map each character of `text` to its vocabulary index via `charToIndex`.
 * Characters outside the vocabulary are silently dropped.
 */
function cleanText(text) {
	const ids = [];
	for (const symbol of text) {
		const id = charToIndex[symbol];
		if (id === void 0) continue;
		ids.push(id);
	}
	return ids;
}
|
|
120
|
+
/**
 * Convert a phoneme string into model token ids, framed for inference:
 * a leading pad (0), then the cleaned tokens, then symbol index 10 and a
 * trailing pad (0) — matching the upstream KittenTTS tokenizer framing.
 */
function tokenizePhonemes(phonemes) {
	return [0, ...cleanText(phonemes), 10, 0];
}
|
|
127
|
+
//#endregion
|
|
128
|
+
//#region src/kitten.shared.ts
|
|
129
|
+
// Output sample rate (Hz) of the KittenTTS ONNX model.
const KITTEN_SAMPLE_RATE = 24e3;
// Root of the Hugging Face Hub used for direct file downloads.
const HF_BASE = "https://huggingface.co";
/** Build the `resolve/main` download URL for a file inside a Hugging Face repo. */
function resolveKittenUrl(repoId, fileName) {
	return [HF_BASE, repoId, "resolve/main", fileName].join("/");
}
/**
 * Pick the ONNX model path for a repo: onnx-community mirrors keep it under
 * `onnx/`, other repos declare it in their config's `model_file` field.
 */
function resolveKittenModelFile(repoId, config) {
	if (repoId.startsWith("onnx-community/")) return "onnx/model.onnx";
	return config.model_file;
}
/** Translate a requested voice id through the config's alias table, if present. */
function resolveKittenVoiceId(config, voiceId) {
	const alias = config.voice_aliases?.[voiceId];
	return alias ?? voiceId;
}
|
|
140
|
+
//#endregion
|
|
141
|
+
//#region src/kitten.ts
|
|
142
|
+
// Names under which the Kitten assets are stored in the adapter asset store.
const KITTEN_CONFIG_ASSET_NAME = "kitten-config.json";
const KITTEN_MODEL_ASSET_NAME = "model.onnx";
const KITTEN_VOICES_ASSET_NAME = "voices.npz";
/** Asset-store bundle descriptor (adapter + model + revision) for a model spec. */
function bundleFor(spec) {
	const { adapterId, id: modelId, revision } = spec;
	return {
		adapterId,
		modelId,
		revision
	};
}
|
|
152
|
+
/** Trim text and guarantee it ends in sentence punctuation (appends "." if not). */
function ensurePunctuation(text) {
	const trimmed = text.trim();
	if (!trimmed) return trimmed;
	const last = trimmed[trimmed.length - 1];
	if (".!?,;:".includes(last)) return trimmed;
	return `${trimmed}.`;
}
/**
 * Split `text` into sentence-aligned chunks of at most `maxLength` characters.
 * Sentences longer than `maxLength` are greedily packed word by word; a single
 * word longer than `maxLength` becomes its own (over-long) chunk.
 */
function chunkText$1(text, maxLength = 400) {
	// Sentences ending in .!? plus any unterminated trailing fragment.
	const sentences = text.match(/[^.!?]*[.!?]+|[^.!?]+$/g) ?? [text];
	const chunks = [];
	for (const rawSentence of sentences) {
		const sentence = rawSentence.trim();
		if (!sentence) continue;
		if (sentence.length <= maxLength) {
			chunks.push(ensurePunctuation(sentence));
			continue;
		}
		// Over-long sentence: pack whitespace-separated words greedily.
		let buffer = "";
		for (const word of sentence.split(/\s+/)) {
			if (buffer.length + word.length + 1 <= maxLength) {
				buffer = buffer ? `${buffer} ${word}` : word;
			} else {
				if (buffer) chunks.push(ensurePunctuation(buffer));
				buffer = word;
			}
		}
		if (buffer) chunks.push(ensurePunctuation(buffer));
	}
	return chunks;
}
|
|
178
|
+
/**
 * Split text into tokens: runs of letters/digits/underscores become word
 * tokens, and every other non-space character is its own token.
 */
function basicTokenize(text) {
	const tokens = text.match(/[\p{L}\p{N}_]+|[^\p{L}\p{N}_\s]/gu);
	return tokens ?? [];
}
|
|
181
|
+
/**
 * Synthesizing model instance for KittenTTS on Node.js.
 *
 * Lifecycle: `load()` obtains config/model/voices (asset-store cache first,
 * then Hugging Face), creates an onnxruntime-node InferenceSession, and binds
 * the `phonemizer` package; `generate()`/`stream()` synthesize text; `dispose()`
 * drops all loaded state so `load()` can be called again.
 */
var KittenNodeModel = class {
	kind = "synthesizing";
	modelId;
	adapterId;
	// onnxruntime-node InferenceSession; null until load() succeeds.
	session = null;
	// voice name -> { data: Float32Array, shape: [rows, styleDim] } from voices.npz.
	voices = {};
	// Parsed kitten_config.json (or config.json fallback); null until loaded.
	config = null;
	// phonemize() from the "phonemizer" package; null until loaded.
	phonemize = null;
	constructor(spec, context) {
		this.spec = spec;
		this.context = context;
		this.modelId = spec.id;
		this.adapterId = spec.adapterId;
	}
	/** Fetch (or read cached) config, model, and voices, then build the ONNX session. */
	async load(signal, onProgress) {
		const repoId = this.spec.config?.modelId ?? "onnx-community/KittenTTS-Mini-v0.8-ONNX";
		const phonemizerModule = await import("phonemizer");
		const ort = await import("onnxruntime-node");
		const configBuffer = await this.loadConfigBuffer(repoId, signal);
		this.config = JSON.parse(new TextDecoder().decode(configBuffer));
		onProgress?.(.15);
		const modelUrl = resolveKittenUrl(repoId, resolveKittenModelFile(repoId, this.config));
		const voicesUrl = resolveKittenUrl(repoId, this.config.voices);
		// Model and voice archive are independent downloads — fetch in parallel.
		const [modelBuffer, voicesArchive] = await Promise.all([this.loadNamedAsset(KITTEN_MODEL_ASSET_NAME, modelUrl, signal), this.loadNamedAsset(KITTEN_VOICES_ASSET_NAME, voicesUrl, signal)]);
		onProgress?.(.65);
		this.voices = loadKittenVoicesFromArchive(voicesArchive);
		this.session = await ort.InferenceSession.create(new Uint8Array(modelBuffer));
		this.phonemize = phonemizerModule.phonemize;
		onProgress?.(1);
	}
	/** Cached config if staged; otherwise kitten_config.json, then config.json fallback. */
	async loadConfigBuffer(repoId, signal) {
		const cached = await this.context.assetStore.getAsset(bundleFor(this.spec), KITTEN_CONFIG_ASSET_NAME);
		if (cached) return cached;
		const configResponse = await this.context.fetch(resolveKittenUrl(repoId, "kitten_config.json"), { signal });
		if (configResponse.ok) return configResponse.arrayBuffer();
		const fallback = await this.context.fetch(resolveKittenUrl(repoId, "config.json"), { signal });
		if (!fallback.ok) throw new Error(`Failed to fetch Kitten config: HTTP ${fallback.status}`);
		return fallback.arrayBuffer();
	}
	/** Synthesize every chunk sequentially and return one concatenated AudioData. */
	async generate(text, voiceId, signal, speed) {
		if (!this.session || !this.config || !this.phonemize) throw new Error("Kitten model not loaded");
		const chunks = chunkText$1(text);
		const outputs = [];
		for (const chunk of chunks) outputs.push(await this.generateChunk(chunk, voiceId, speed ?? 1, signal));
		// Concatenate per-chunk PCM into one buffer.
		const totalLength = outputs.reduce((sum, chunk) => sum + chunk.length, 0);
		const combined = new Float32Array(totalLength);
		let offset = 0;
		for (const chunk of outputs) {
			combined.set(chunk, offset);
			offset += chunk.length;
		}
		return pcmToAudioData(combined, KITTEN_SAMPLE_RATE);
	}
	/** Like generate(), but yields one AudioData per text chunk as it is produced. */
	async *stream(text, voiceId, signal, speed) {
		if (!this.session || !this.config || !this.phonemize) throw new Error("Kitten model not loaded");
		const chunks = chunkText$1(text);
		for (const chunk of chunks) {
			if (signal.aborted) throw this.context.createAbortError();
			yield pcmToAudioData(await this.generateChunk(chunk, voiceId, speed ?? 1, signal), KITTEN_SAMPLE_RATE);
		}
	}
	/** Static voice list from the model spec; this adapter has no dynamic voices. */
	listVoices() {
		return this.spec.voices ?? [];
	}
	/** Drop all loaded state; the instance may be re-loaded afterwards. */
	dispose() {
		this.session = null;
		this.voices = {};
		this.config = null;
		this.phonemize = null;
	}
	/** Run one ONNX inference for a single text chunk; returns raw Float32 PCM. */
	async generateChunk(text, voiceKey, speed, signal) {
		if (!this.session || !this.config || !this.phonemize) throw new Error("Kitten model not loaded");
		const resolvedVoiceId = resolveKittenVoiceId(this.config, voiceKey);
		const voiceData = this.voices[resolvedVoiceId];
		if (!voiceData) throw new Error(`Voice "${voiceKey}" not found`);
		// Per-voice speed prior from the config scales the requested speed.
		if (this.config.speed_priors?.[resolvedVoiceId]) speed *= this.config.speed_priors[resolvedVoiceId];
		const phonemes = await this.phonemizeText(text, signal);
		if (signal.aborted) throw this.context.createAbortError();
		const inputIds = tokenizePhonemes(phonemes);
		// Style row is selected by text length, clamped to the table's row count.
		const referenceIndex = Math.min(text.length, voiceData.shape[0] - 1);
		const styleDimension = voiceData.shape[1];
		const referenceStyle = voiceData.data.slice(referenceIndex * styleDimension, (referenceIndex + 1) * styleDimension);
		const ort = await import("onnxruntime-node");
		const inputs = {
			input_ids: new ort.Tensor("int64", BigInt64Array.from(inputIds.map(BigInt)), [1, inputIds.length]),
			style: new ort.Tensor("float32", referenceStyle, [1, styleDimension]),
			speed: new ort.Tensor("float32", new Float32Array([speed]), [1])
		};
		const audio = (await this.session.run(inputs))[this.session.outputNames[0]]?.data;
		// Trims 5000 trailing samples from outputs longer than one second —
		// presumably removes end-of-utterance artifacts; TODO confirm against
		// upstream KittenTTS behavior.
		if (audio.length > 24e3) return audio.slice(0, audio.length - 5e3);
		return audio;
	}
	/**
	 * Phonemize text while passing punctuation runs through verbatim, then
	 * re-space the result with basicTokenize so tokens are single-space separated.
	 */
	async phonemizeText(text, signal) {
		if (!this.phonemize) throw new Error("Kitten phonemizer not loaded");
		const punctuation = /(\s*[;:,.!?¡¿—…"«»""()[\]{}]+\s*)+/g;
		// Split the input into alternating non-punctuation / punctuation sections.
		const sections = [];
		let lastIndex = 0;
		for (const match of text.matchAll(punctuation)) {
			if (lastIndex < match.index) sections.push({
				punctuation: false,
				text: text.slice(lastIndex, match.index)
			});
			sections.push({
				punctuation: true,
				text: match[0]
			});
			lastIndex = match.index + match[0].length;
		}
		if (lastIndex < text.length) sections.push({
			punctuation: false,
			text: text.slice(lastIndex)
		});
		// Phonemize only the non-punctuation sections (en-us), in parallel.
		return basicTokenize((await Promise.all(sections.map(async (section) => {
			if (section.punctuation) return section.text;
			const result = await this.phonemize(section.text, "en-us");
			if (signal.aborted) throw this.context.createAbortError();
			return result.join(" ");
		}))).join("")).join(" ");
	}
	/** GET the URL via the host's fetch; throws on any non-2xx status. */
	async fetchArrayBuffer(url, signal) {
		const response = await this.context.fetch(url, { signal });
		if (!response.ok) throw new Error(`Failed to fetch Kitten asset: HTTP ${response.status}`);
		return response.arrayBuffer();
	}
	/** Prefer the locally cached asset; otherwise download from `url`. */
	async loadNamedAsset(assetName, url, signal) {
		const cached = await this.context.assetStore.getAsset(bundleFor(this.spec), assetName);
		if (cached) return cached;
		return this.fetchArrayBuffer(url, signal);
	}
};
|
|
311
|
+
/**
 * KittenTTS adapter for Node.js (onnxruntime-node + phonemizer).
 * `install()` pre-downloads config, model, and voices into the asset store;
 * `createModel()` returns a lazily-loading KittenNodeModel.
 */
const kittenNodeAdapter = {
	id: "kitten-node",
	name: "KittenTTS Node",
	capabilities: {
		install: true,
		speak: false,
		synthesize: true,
		stream: true,
		dynamicVoices: false
	},
	/** Fetch and stage config + model + voices, then activate the bundle. */
	async install(spec, context, signal, onProgress) {
		const repoId = spec.config?.modelId ?? "onnx-community/KittenTTS-Mini-v0.8-ONNX";
		const bundle = bundleFor(spec);
		// Try kitten_config.json first, then the generic config.json.
		// NOTE(review): duplicates KittenNodeModel.loadConfigBuffer's fallback logic.
		const configBuffer = await (async () => {
			const primary = await context.fetch(resolveKittenUrl(repoId, "kitten_config.json"), { signal });
			if (primary.ok) return primary.arrayBuffer();
			const fallback = await context.fetch(resolveKittenUrl(repoId, "config.json"), { signal });
			if (!fallback.ok) throw new Error(`Failed to fetch Kitten config: HTTP ${fallback.status}`);
			return fallback.arrayBuffer();
		})();
		await context.assetStore.stageAsset(bundle, KITTEN_CONFIG_ASSET_NAME, configBuffer);
		onProgress?.(.15);
		const config = JSON.parse(new TextDecoder().decode(configBuffer));
		const modelUrl = resolveKittenUrl(repoId, resolveKittenModelFile(repoId, config));
		const voicesUrl = resolveKittenUrl(repoId, config.voices);
		// Model and voices are fetched sequentially so onProgress stays monotonic.
		const modelBuffer = await context.fetch(modelUrl, { signal }).then(async (response) => {
			if (!response.ok) throw new Error(`Failed to fetch Kitten model: HTTP ${response.status}`);
			return response.arrayBuffer();
		});
		await context.assetStore.stageAsset(bundle, KITTEN_MODEL_ASSET_NAME, modelBuffer);
		onProgress?.(.8);
		const voicesBuffer = await context.fetch(voicesUrl, { signal }).then(async (response) => {
			if (!response.ok) throw new Error(`Failed to fetch Kitten voices: HTTP ${response.status}`);
			return response.arrayBuffer();
		});
		await context.assetStore.stageAsset(bundle, KITTEN_VOICES_ASSET_NAME, voicesBuffer);
		// Activation presumably promotes the staged assets for getAsset() readers —
		// verify against the assetStore contract in @polytts/core.
		await context.assetStore.activateBundle(bundle, [
			KITTEN_CONFIG_ASSET_NAME,
			KITTEN_MODEL_ASSET_NAME,
			KITTEN_VOICES_ASSET_NAME
		]);
		onProgress?.(1);
	},
	createModel(spec, context) {
		return new KittenNodeModel(spec, context);
	}
};
|
|
358
|
+
//#endregion
|
|
359
|
+
//#region src/supertonic.runtime.ts
|
|
360
|
+
// The four ONNX sessions composing the Supertonic pipeline, in load order.
const SUPERTONIC_MODEL_SESSION_KEYS = [
	"durationPredictor",
	"textEncoder",
	"vectorEstimator",
	"vocoder"
];
/** Draw one standard-normal sample via the Box–Muller transform. */
function gaussianRandom() {
	// Clamp the first uniform away from 0 so Math.log stays finite.
	const radius = Math.sqrt(-2 * Math.log(Math.max(1e-4, Math.random())));
	const angle = 2 * Math.PI * Math.random();
	return radius * Math.cos(angle);
}
/** Flatten a doubly nested number array into a Float32Array. */
function flatten3d(data) {
	return Float32Array.from(data.flat(2));
}
/** Whether `value` is one of the languages Supertonic supports. */
function isValidLanguage(value) {
	switch (value) {
		case "en":
		case "ko":
		case "es":
		case "pt":
		case "fr":
			return true;
		default:
			return false;
	}
}
/** Fallback AbortError factory used when the host does not supply one. */
function createDefaultAbortError() {
	const abortError = new Error("Aborted");
	abortError.name = "AbortError";
	return abortError;
}
|
|
382
|
+
/**
 * Pair of precomputed speaker-style tensors:
 * `ttl` is fed as `style_ttl` to the text encoder and vector estimator,
 * `dp` is fed as `style_dp` to the duration predictor.
 */
var SupertonicStyle = class {
	constructor(ttl, dp) {
		this.ttl = ttl;
		this.dp = dp;
	}
};
|
|
388
|
+
/**
 * Text preprocessor for Supertonic: normalizes unicode, strips emoji and
 * decorative symbols, wraps text in `<lang>` tags, and converts characters
 * to model ids via `indexer` (an array mapping code point -> id; characters
 * past its length map to -1).
 */
var SupertonicUnicodeProcessor = class {
	constructor(indexer) {
		this.indexer = indexer;
	}
	/** Convert a batch of texts into zero-padded id rows plus a [B][1][T] validity mask. */
	call(textList, langList) {
		const processedTexts = textList.map((text, index) => this.preprocessText(text, langList[index]));
		const lengths = processedTexts.map((text) => text.length);
		const maxLength = Math.max(...lengths);
		return {
			textIds: processedTexts.map((text) => {
				const row = Array.from({ length: maxLength }, () => 0);
				// NOTE(review): iterates by UTF-16 unit while reading codePointAt,
				// so astral characters fill two slots — presumably matches how the
				// indexer was built; confirm upstream.
				for (let index = 0; index < text.length; index += 1) {
					const codePoint = text.codePointAt(index);
					row[index] = codePoint != null && codePoint < this.indexer.length ? this.indexer[codePoint] : -1;
				}
				return row;
			}),
			textMask: this.lengthToMask(lengths, maxLength)
		};
	}
	/** Normalize punctuation and whitespace, drop emoji, ensure terminal punctuation, add <lang> tags. */
	preprocessText(text, lang) {
		let normalized = text.normalize("NFKD");
		// Strip emoji and pictographic blocks.
		normalized = normalized.replace(/[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu, "");
		// Fold typographic dashes/quotes and structural characters to plain forms.
		for (const [from, to] of Object.entries({
			"–": "-",
			"‑": "-",
			"—": "-",
			_: " ",
			"“": "\"",
			"”": "\"",
			"‘": "'",
			"’": "'",
			"´": "'",
			"`": "'",
			"[": " ",
			"]": " ",
			"|": " ",
			"/": " ",
			"#": " ",
			"→": " ",
			"←": " "
		})) normalized = normalized.replaceAll(from, to);
		// Remove decorative symbols and backslashes entirely.
		normalized = normalized.replace(/[♥☆♡©\\]/g, "");
		// Expand common abbreviations into speakable words.
		for (const [from, to] of Object.entries({
			"@": " at ",
			"e.g.,": "for example, ",
			"i.e.,": "that is, "
		})) normalized = normalized.replaceAll(from, to);
		// Drop stray spaces before punctuation.
		normalized = normalized.replace(/ ,/g, ",").replace(/ \./g, ".").replace(/ !/g, "!").replace(/ \?/g, "?").replace(/ ;/g, ";").replace(/ :/g, ":").replace(/ '/g, "'");
		// Collapse doubled quote characters.
		while (normalized.includes("\"\"")) normalized = normalized.replace("\"\"", "\"");
		while (normalized.includes("''")) normalized = normalized.replace("''", "'");
		while (normalized.includes("``")) normalized = normalized.replace("``", "`");
		normalized = normalized.replace(/\s+/g, " ").trim();
		// Ensure the text ends with punctuation (or a closing quote/bracket).
		if (!/[.!?;:,'"')\]}…。」』】〉》›»]$/.test(normalized)) normalized += ".";
		if (!isValidLanguage(lang)) throw new Error(`Invalid Supertonic language: ${lang}`);
		return `<${lang}>${normalized}</${lang}>`;
	}
	/** One [1][maxLength] row per item: 1 for positions < length, 0 for padding. */
	lengthToMask(lengths, maxLength) {
		return lengths.map((length) => {
			const row = Array.from({ length: maxLength }, () => 0);
			for (let index = 0; index < Math.min(length, maxLength); index += 1) row[index] = 1;
			return [row];
		});
	}
};
|
|
453
|
+
/**
 * Build a SupertonicStyle from downloaded style JSON, creating one float32
 * tensor each for the text-to-latent (`style_ttl`) and duration-predictor
 * (`style_dp`) styles via the supplied ONNX runtime module.
 */
function createSupertonicStyle(ortModule, json) {
	const ttlTensor = new ortModule.Tensor("float32", flatten3d(json.style_ttl.data), json.style_ttl.dims);
	const dpTensor = new ortModule.Tensor("float32", flatten3d(json.style_dp.data), json.style_dp.dims);
	return new SupertonicStyle(ttlTensor, dpTensor);
}
|
|
458
|
+
/**
 * Create the four Supertonic ONNX inference sessions one after another,
 * reporting (completed, total) after each so the host can show progress.
 */
async function loadSupertonicSessions(ortModule, modelBuffers, onProgress) {
	const sessions = {};
	let completed = 0;
	for (const key of SUPERTONIC_MODEL_SESSION_KEYS) {
		sessions[key] = await ortModule.InferenceSession.create(new Uint8Array(modelBuffers[key]));
		completed += 1;
		onProgress?.(completed, SUPERTONIC_MODEL_SESSION_KEYS.length);
	}
	return sessions;
}
|
|
467
|
+
/**
 * Supertonic inference pipeline: duration prediction -> text encoding ->
 * iterative latent denoising (vector estimator) -> vocoder, all run through
 * the four preloaded ONNX sessions.
 */
var SupertonicTextToSpeech = class {
	// Output sample rate (Hz), taken from config.ae.sample_rate.
	sampleRate;
	constructor(ortModule, config, textProcessor, sessions) {
		this.ortModule = ortModule;
		this.config = config;
		this.textProcessor = textProcessor;
		this.sessions = sessions;
		this.sampleRate = config.ae.sample_rate;
	}
	/**
	 * Synthesize `text` chunk by chunk, joining chunks with `silenceDuration`
	 * seconds of silence. Returns { wav: Float32Array, duration } where
	 * duration is in seconds. Throws via `createAbortError` when `isAborted`
	 * reports cancellation between chunks.
	 */
	async generate(text, language, style, totalStep, speed = 1.05, silenceDuration = .3, onProgress, isAborted, createAbortError = createDefaultAbortError) {
		// Batch styles are rejected: the pipeline below assumes batch size 1.
		if (style.ttl.dims[0] !== 1) throw new Error("Supertonic only supports single-style synthesis in this adapter");
		// Korean text gets shorter chunks than the other languages.
		const chunks = chunkText(text, language === "ko" ? 120 : 300);
		let wav = [];
		let duration = 0;
		for (let index = 0; index < chunks.length; index += 1) {
			if (isAborted?.()) throw createAbortError();
			const chunk = chunks[index];
			const result = await this.generateChunk(chunk, language, style, totalStep, speed, onProgress, isAborted, createAbortError);
			if (wav.length === 0) {
				wav = result.wav;
				duration = result.duration;
			} else {
				// Insert a silence gap between consecutive chunks.
				const silenceLength = Math.floor(silenceDuration * this.sampleRate);
				wav = [
					...wav,
					...Array.from({ length: silenceLength }, () => 0),
					...result.wav
				];
				duration += result.duration + silenceDuration;
			}
		}
		return {
			wav: new Float32Array(wav),
			duration
		};
	}
	/** Run the full four-stage pipeline for one chunk of already-split text. */
	async generateChunk(text, language, style, totalStep, speed, onProgress, isAborted, createAbortError = createDefaultAbortError) {
		const { textIds, textMask } = this.textProcessor.call([text], [language]);
		const textIdsTensor = new this.ortModule.Tensor("int64", new BigInt64Array(textIds.flat().map((value) => BigInt(value))), [1, textIds[0].length]);
		const textMaskTensor = new this.ortModule.Tensor("float32", new Float32Array(textMask.flat(2)), [
			1,
			1,
			textMask[0][0].length
		]);
		// Stage 1: predict per-item durations, then scale by the speed factor.
		const durationOutputs = await this.sessions.durationPredictor.run({
			text_ids: textIdsTensor,
			style_dp: style.dp,
			text_mask: textMaskTensor
		});
		const durations = Array.from(durationOutputs.duration.data);
		for (let index = 0; index < durations.length; index += 1) durations[index] /= speed;
		// Stage 2: encode text into embeddings conditioned on the ttl style.
		const textEncoderOutputs = await this.sessions.textEncoder.run({
			text_ids: textIdsTensor,
			style_ttl: style.ttl,
			text_mask: textMaskTensor
		});
		// Stage 3: start from Gaussian noise and iteratively denoise for totalStep steps.
		let { xt, latentMask } = sampleNoisyLatent(durations, this.sampleRate, this.config.ae.base_chunk_size, this.config.ttl.chunk_compress_factor, this.config.ttl.latent_dim);
		const latentMaskTensor = new this.ortModule.Tensor("float32", new Float32Array(latentMask.flat(2)), [
			1,
			1,
			latentMask[0][0].length
		]);
		const totalStepTensor = new this.ortModule.Tensor("float32", new Float32Array([totalStep]), [1]);
		for (let step = 0; step < totalStep; step += 1) {
			if (isAborted?.()) throw createAbortError();
			onProgress?.(step + 1, totalStep);
			const xtTensor = new this.ortModule.Tensor("float32", new Float32Array(xt.flat(2)), [
				1,
				xt[0].length,
				xt[0][0].length
			]);
			const currentStepTensor = new this.ortModule.Tensor("float32", new Float32Array([step]), [1]);
			const vectorEstimatorOutputs = await this.sessions.vectorEstimator.run({
				noisy_latent: xtTensor,
				text_emb: textEncoderOutputs.text_emb,
				style_ttl: style.ttl,
				latent_mask: latentMaskTensor,
				text_mask: textMaskTensor,
				current_step: currentStepTensor,
				total_step: totalStepTensor
			});
			// Re-nest the flat output so the next iteration can re-tensorize it.
			xt = reshapeLatent(Array.from(vectorEstimatorOutputs.denoised_latent.data), xt[0].length, xt[0][0].length);
		}
		// Stage 4: vocode the final latent into a waveform.
		const latentTensor = new this.ortModule.Tensor("float32", new Float32Array(xt.flat(2)), [
			1,
			xt[0].length,
			xt[0][0].length
		]);
		const vocoderOutputs = await this.sessions.vocoder.run({ latent: latentTensor });
		return {
			wav: Array.from(vocoderOutputs.wav_tts.data),
			duration: durations[0] ?? 0
		};
	}
};
|
|
562
|
+
function sampleNoisyLatent(durations, sampleRate, baseChunkSize, chunkCompressFactor, latentDim) {
  // Build the initial diffusion latent for a batch of one utterance:
  // standard-normal noise shaped [1, latentDim * compress, frames], plus a
  // frame mask covering the first utterance's audible span.
  const samplesPerFrame = baseChunkSize * chunkCompressFactor;
  const maxSamples = Math.floor(Math.max(...durations) * sampleRate);
  // Ceiling division: number of latent frames needed to cover the longest wav.
  const frameCount = Math.floor((maxSamples + samplesPerFrame - 1) / samplesPerFrame);
  const channelCount = latentDim * chunkCompressFactor;
  const sampleCounts = durations.map((seconds) => Math.floor(seconds * sampleRate));
  // Noise rows are generated channel-major so gaussianRandom() is drawn in the
  // same order as the original loop-based implementation.
  const noise = Array.from({ length: channelCount }, () =>
    Array.from({ length: frameCount }, () => gaussianRandom())
  );
  // Frame t is active while its first sample index lies inside utterance 0.
  const mask = Array.from({ length: frameCount }, (_, frame) =>
    frame * samplesPerFrame < sampleCounts[0] ? 1 : 0
  );
  return {
    xt: [noise],
    latentMask: [[mask]]
  };
}
|
|
582
|
+
function reshapeLatent(data, dimensions, length) {
  // Re-pack a flat row-major buffer into a nested [1, dimensions, length]
  // array; missing trailing entries are zero-filled.
  const rows = [];
  for (let dimension = 0; dimension < dimensions; dimension += 1) {
    const offset = dimension * length;
    const row = [];
    for (let index = 0; index < length; index += 1) {
      row.push(data[offset + index] ?? 0);
    }
    rows.push(row);
  }
  return [rows];
}
|
|
595
|
+
function chunkText(text, maxLength) {
  // Split the input on blank lines into paragraphs, then greedily pack whole
  // sentences into chunks of at most maxLength characters. A single sentence
  // longer than maxLength still becomes its own chunk (no hard split).
  // The lookbehinds keep common abbreviations and single initials ("J.")
  // from being treated as sentence boundaries.
  const SENTENCE_BOUNDARY = /(?<!Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.|Sr\.|Jr\.|Ph\.D\.|etc\.|e\.g\.|i\.e\.|vs\.|Inc\.|Ltd\.|Co\.|Corp\.|St\.|Ave\.|Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+/;
  const chunks = [];
  for (const rawParagraph of text.trim().split(/\n\s*\n+/).filter(Boolean)) {
    const paragraph = rawParagraph.trim();
    if (!paragraph) continue;
    const sentences = paragraph.split(SENTENCE_BOUNDARY) || [paragraph];
    let pending = "";
    for (const sentence of sentences) {
      // "+ 1" budgets for the joining space (slightly conservative when
      // pending is still empty, matching the original behavior).
      if (pending.length + sentence.length + 1 <= maxLength) {
        pending += `${pending ? " " : ""}${sentence}`;
      } else {
        if (pending) chunks.push(pending.trim());
        pending = sentence;
      }
    }
    if (pending) chunks.push(pending.trim());
  }
  // Fall back to the whole (trimmed) text when nothing was produced.
  return chunks.length ? chunks : [text.trim()];
}
|
|
615
|
+
//#endregion
|
|
616
|
+
//#region src/supertonic.shared.ts
|
|
617
|
+
// Built-in Supertonic speaker styles: five female (F*) and five male (M*).
const SUPERTONIC_STYLE_IDS = [
  "F1",
  "F2",
  "F3",
  "F4",
  "F5",
  "M1",
  "M2",
  "M3",
  "M4",
  "M5"
];
// Languages supported by the Supertonic voice-id scheme.
const SUPERTONIC_LANGUAGES = [
  { id: "en", label: "English" },
  { id: "ko", label: "Korean" },
  { id: "es", label: "Spanish" },
  { id: "pt", label: "Portuguese" },
  { id: "fr", label: "French" }
];
// Fix: this Map was previously built and immediately discarded (dead code).
// Name it and use it below for O(1) language validation.
const SUPERTONIC_LANGUAGE_BY_ID = new Map(SUPERTONIC_LANGUAGES.map((language) => [language.id, language]));
/**
 * Parse a "language:style" voice id (e.g. "ko:F3") into its parts.
 * Unknown or missing segments fall back to "en" / "M1"; a nullish voiceId
 * yields the full default.
 *
 * @param voiceId Voice id string, or null/undefined for the default.
 * @returns `{ language, styleId }` with both fields validated.
 */
function parseSupertonicVoiceId(voiceId) {
  const [rawLanguage, rawStyle] = (voiceId ?? "en:M1").split(":");
  return {
    language: SUPERTONIC_LANGUAGE_BY_ID.has(rawLanguage) ? rawLanguage : "en",
    styleId: SUPERTONIC_STYLE_IDS.includes(rawStyle) ? rawStyle : "M1"
  };
}
|
|
659
|
+
//#endregion
|
|
660
|
+
//#region src/supertonic.ts
|
|
661
|
+
// Repository-relative ONNX asset paths, keyed by inference-session name.
// Keys are indexed with entries of SUPERTONIC_MODEL_SESSION_KEYS during
// SupertonicNodeModel.load, so the two must stay in sync.
const SUPERTONIC_MODEL_ASSET_NAMES = {
  durationPredictor: "onnx/duration_predictor.onnx",
  textEncoder: "onnx/text_encoder.onnx",
  vectorEstimator: "onnx/vector_estimator.onnx",
  vocoder: "onnx/vocoder.onnx"
};
|
|
667
|
+
function readJson(buffer) {
  // Decode a UTF-8 byte buffer and parse the result as JSON.
  const decoded = new TextDecoder().decode(buffer);
  return JSON.parse(decoded);
}
|
|
670
|
+
var SupertonicNodeModel = class {
  // Marker consumed by the polytts core: this model synthesizes audio data.
  kind = "synthesizing";
  modelId;
  adapterId;
  // SupertonicTextToSpeech pipeline; null until load() completes.
  tts = null;
  // ONNX inference sessions (one per SUPERTONIC_MODEL_SESSION_KEYS entry);
  // null until load() completes and again after dispose().
  sessions = null;
  // Voice-style tensors keyed by style id ("F1".."M5").
  styles = /* @__PURE__ */ new Map();
  /**
   * @param spec    Model spec (id, adapterId, revision, assets, voices, config).
   * @param context Host services: assetStore cache, fetch, createAbortError.
   */
  constructor(spec, context) {
    this.spec = spec;
    this.context = context;
    this.modelId = spec.id;
    this.adapterId = spec.adapterId;
  }
  /**
   * Download/cache all assets and build the TTS pipeline.
   * Progress is reported as: styles+configs -> 0.2, model downloads -> 0.6,
   * session creation -> 1.0.
   *
   * @param signal     Abort signal; checked between phases (not mid-download).
   * @param onProgress Optional callback receiving a fraction in [0, 1].
   */
  async load(signal, onProgress) {
    const config = readJson(await this.loadAsset("onnx/tts.json", signal));
    const unicodeIndexer = readJson(await this.loadAsset("onnx/unicode_indexer.json", signal));
    const ortModule = await import("onnxruntime-node");
    if (signal.aborted) throw this.context.createAbortError();
    // Load every voice style sequentially before touching the model weights.
    const styles = /* @__PURE__ */ new Map();
    for (let index = 0; index < SUPERTONIC_STYLE_IDS.length; index += 1) {
      const styleId = SUPERTONIC_STYLE_IDS[index];
      const json = readJson(await this.loadAsset(`voice_styles/${styleId}.json`, signal));
      styles.set(styleId, createSupertonicStyle(ortModule, json));
    }
    onProgress?.(.2);
    const modelBuffers = {};
    for (let index = 0; index < SUPERTONIC_MODEL_SESSION_KEYS.length; index += 1) {
      const key = SUPERTONIC_MODEL_SESSION_KEYS[index];
      modelBuffers[key] = await this.loadAsset(SUPERTONIC_MODEL_ASSET_NAMES[key], signal);
      onProgress?.(.2 + (index + 1) / SUPERTONIC_MODEL_SESSION_KEYS.length * .4);
    }
    const sessions = await loadSupertonicSessions(ortModule, modelBuffers, (completed, total) => {
      onProgress?.(.6 + completed / total * .4);
    });
    if (signal.aborted) throw this.context.createAbortError();
    // Only commit instance state once everything loaded successfully.
    this.styles = styles;
    this.sessions = sessions;
    this.tts = new SupertonicTextToSpeech(ortModule, config, new SupertonicUnicodeProcessor(unicodeIndexer), sessions);
    onProgress?.(1);
  }
  /**
   * Synthesize `text` for the given "language:style" voice id.
   *
   * @param text    Text to synthesize.
   * @param voiceId Voice id parsed by parseSupertonicVoiceId (nullable).
   * @param signal  Abort signal, polled by the pipeline between steps.
   * @param speed   Playback speed multiplier; defaults to 1.
   * @returns Audio data produced by pcmToAudioData at the pipeline sample rate.
   * @throws Error when the model is not loaded or the style id is unknown.
   */
  async generate(text, voiceId, signal, speed) {
    if (!this.tts) throw new Error("Supertonic model not loaded");
    const { language, styleId } = parseSupertonicVoiceId(voiceId);
    const style = this.styles.get(styleId);
    if (!style) throw new Error(`Unsupported Supertonic style: ${styleId}`);
    // totalStep defaults to 2 diffusion steps; the .3 positional argument is
    // presumably a pause/silence scale — TODO confirm against the pipeline
    // signature, which is defined earlier in this bundle.
    return pcmToAudioData((await this.tts.generate(text, language, style, Number(this.spec.config?.totalStep ?? 2), speed ?? 1, .3, void 0, () => signal.aborted, () => this.context.createAbortError())).wav, this.tts.sampleRate);
  }
  // Static voice list from the spec; this adapter has no dynamic voices.
  listVoices() {
    return this.spec.voices ?? [];
  }
  /** Drop the pipeline and release every ONNX session (idempotent). */
  dispose() {
    this.tts = null;
    this.styles.clear();
    const sessions = this.sessions;
    this.sessions = null;
    if (!sessions) return;
    // release is optional on a session object, hence the ?. call.
    for (const session of Object.values(sessions)) session.release?.call(session);
  }
  /**
   * Fetch an asset by name, preferring the host's asset cache.
   *
   * @param assetName Asset path relative to the model bundle.
   * @param signal    Abort signal forwarded to fetch.
   * @returns ArrayBuffer with the asset bytes.
   * @throws Error when the asset is absent from the spec or the HTTP request fails.
   */
  async loadAsset(assetName, signal) {
    const bundle = {
      adapterId: this.spec.adapterId,
      modelId: this.spec.id,
      revision: this.spec.revision
    };
    const cached = await this.context.assetStore.getAsset(bundle, assetName);
    if (cached) return cached;
    const asset = getModelAssets(this.spec).find((entry) => entry.name === assetName);
    if (!asset) throw new Error(`Supertonic asset "${assetName}" is missing from model "${this.spec.id}"`);
    const response = await this.context.fetch(asset.url, { signal });
    if (!response.ok) throw new Error(`Failed to fetch Supertonic asset "${assetName}": HTTP ${response.status}`);
    return response.arrayBuffer();
  }
};
|
|
743
|
+
/** TTS adapter for Supertonic models on Node.js using native ONNX Runtime. */
const supertonicNodeAdapter = {
  id: "supertonic-node",
  name: "Supertonic Node",
  // Capability flags consumed by the polytts core.
  capabilities: {
    install: false,
    speak: false,
    synthesize: true,
    stream: false,
    dynamicVoices: false
  },
  createModel(spec, context) {
    return new SupertonicNodeModel(spec, context);
  }
};
|
|
757
|
+
//#endregion
|
|
758
|
+
//#region src/index.ts
|
|
759
|
+
// Model reference passed to kokoro-js when spec.config.modelId is not set
// (see KokoroNodeModel.load).
const DEFAULT_KOKORO_MODEL_REF = "onnx-community/Kokoro-82M-v1.0-ONNX";
|
|
760
|
+
function normalizeSpeakSpeed(speed) {
  // Clamp the requested rate into [0.5, 2]; any non-finite value (including
  // undefined and NaN) means "no explicit speed".
  if (!Number.isFinite(speed)) return void 0;
  if (speed < 0.5) return 0.5;
  if (speed > 2) return 2;
  return speed;
}
|
|
764
|
+
function phonemesToIds(phonemes, config) {
  // Translate phonemes into model ids: BOS marker ("^"), then each mapped
  // phoneme followed by the pad marker ("_"), then EOS marker ("$").
  // Phonemes absent from the map are silently dropped (no pad inserted).
  const map = config.phoneme_id_map;
  const ids = [];
  const appendIfPresent = (entry) => {
    if (entry) ids.push(...entry);
  };
  appendIfPresent(map["^"]);
  for (const phoneme of phonemes) {
    const mapped = map[phoneme];
    if (mapped !== void 0) {
      ids.push(...mapped);
      appendIfPresent(map["_"]);
    }
  }
  appendIfPresent(map["$"]);
  return ids;
}
|
|
777
|
+
async function loadSpecAsset(spec, assetName, context, signal) {
  // Serve the asset from the host's cache when available; otherwise resolve
  // its URL from the model spec and download it.
  const cached = await context.assetStore.getAsset({
    adapterId: spec.adapterId,
    modelId: spec.id,
    revision: spec.revision
  }, assetName);
  if (cached) return cached;
  const asset = getModelAssets(spec).find((entry) => entry.name === assetName);
  if (!asset) throw new Error(`Asset "${assetName}" is missing from model "${spec.id}"`);
  const response = await context.fetch(asset.url, { signal });
  if (!response.ok) throw new Error(`Failed to fetch ${asset.name}: HTTP ${response.status}`);
  return response.arrayBuffer();
}
|
|
791
|
+
var KokoroNodeModel = class {
  // Marker consumed by the polytts core: this model synthesizes audio data.
  kind = "synthesizing";
  modelId;
  adapterId;
  // kokoro-js KokoroTTS instance; null until load() completes.
  tts = null;
  // Voice list discovered from the loaded model; empty until load().
  voices = [];
  /**
   * @param spec             Model spec (id, adapterId, voices, config).
   * @param createAbortError Factory for the host's abort error.
   */
  constructor(spec, createAbortError) {
    this.spec = spec;
    this.createAbortError = createAbortError;
    this.modelId = spec.id;
    this.adapterId = spec.adapterId;
  }
  /**
   * Download the Kokoro model via kokoro-js and populate the voice list.
   *
   * @param signal     Abort signal; checked before and after the download.
   * @param onProgress Optional callback receiving a fraction in [0, 1].
   */
  async load(signal, onProgress) {
    if (signal.aborted) throw this.createAbortError();
    const kokoroModule = await import("kokoro-js");
    const modelRef = this.spec.config?.modelId ?? DEFAULT_KOKORO_MODEL_REF;
    this.tts = await kokoroModule.KokoroTTS.from_pretrained(modelRef, {
      // Quantized int8 weights on CPU — no GPU requirement.
      dtype: "q8",
      device: "cpu",
      progress_callback(progress) {
        // Only "progress" events with a known total are forwarded.
        if (progress.status === "progress" && progress.loaded != null && progress.total) onProgress?.(progress.loaded / progress.total);
      }
    });
    if (signal.aborted) throw this.createAbortError();
    // Normalize kokoro-js voice metadata; ids beginning with "b" appear to be
    // British voices — TODO confirm against kokoro-js voice naming.
    this.voices = Object.entries(this.tts.voices ?? {}).map(([id, info]) => ({
      id,
      name: info?.name ?? id,
      language: info?.language ?? (id.startsWith("b") ? "en-GB" : "en-US"),
      gender: info?.gender
    }));
  }
  /**
   * Synthesize `text` with the given voice.
   *
   * @param text    Text to synthesize.
   * @param voiceId kokoro-js voice id.
   * @param signal  Abort signal; checked before and after generation.
   * @param speed   Speed multiplier, clamped by normalizeSpeakSpeed.
   * @returns `{ sampleRate, channels }` with a single mono channel.
   */
  async generate(text, voiceId, signal, speed) {
    if (!this.tts) throw new Error("Kokoro model not loaded");
    if (signal.aborted) throw this.createAbortError();
    const result = await this.tts.generate(text, {
      voice: voiceId,
      speed: normalizeSpeakSpeed(speed)
    });
    if (signal.aborted) throw this.createAbortError();
    return {
      sampleRate: result.sampling_rate ?? 24e3,
      channels: [result.audio]
    };
  }
  // Prefer the dynamically discovered voices; fall back to the static spec.
  listVoices() {
    return this.voices.length ? this.voices : this.spec.voices ?? [];
  }
  /** Drop references so the model can be garbage collected. */
  dispose() {
    this.tts = null;
    this.voices = [];
  }
};
|
|
843
|
+
var PiperNodeModel = class {
  // Marker consumed by the polytts core: this model synthesizes audio data.
  kind = "synthesizing";
  modelId;
  adapterId;
  // ONNX InferenceSession for the Piper voice; null until load().
  session = null;
  // phonemizer's phonemize function; null until load().
  phonemize = null;
  // Parsed Piper voice JSON config (audio, inference, phoneme_id_map, ...).
  voiceConfig = null;
  /**
   * @param spec    Model spec (id, adapterId, languages, assets, voices).
   * @param context Host services: assetStore cache, fetch, createAbortError.
   */
  constructor(spec, context) {
    this.spec = spec;
    this.context = context;
    this.modelId = spec.id;
    this.adapterId = spec.adapterId;
  }
  /**
   * Load the voice config (.json) and model weights (.onnx) in parallel,
   * then create the inference session and bind the phonemizer.
   *
   * @param signal Abort signal; checked between phases.
   * @throws Error when the spec lacks a .json or .onnx asset.
   */
  async load(signal) {
    // The first .json / .onnx asset in the spec is assumed to be the voice
    // config / model respectively — TODO confirm for multi-asset specs.
    const voiceAsset = getModelAssets(this.spec).find((asset) => asset.name.endsWith(".json"));
    const modelAsset = getModelAssets(this.spec).find((asset) => asset.name.endsWith(".onnx"));
    if (!voiceAsset || !modelAsset) throw new Error(`Piper model "${this.spec.id}" is missing required assets`);
    const [configData, modelData] = await Promise.all([loadSpecAsset(this.spec, voiceAsset.name, this.context, signal), loadSpecAsset(this.spec, modelAsset.name, this.context, signal)]);
    if (signal.aborted) throw this.context.createAbortError();
    const session = await (await import("onnxruntime-node")).InferenceSession.create(new Uint8Array(modelData));
    const phonemizerModule = await import("phonemizer");
    if (signal.aborted) throw this.context.createAbortError();
    this.voiceConfig = JSON.parse(new TextDecoder().decode(configData));
    this.session = session;
    this.phonemize = phonemizerModule.phonemize;
  }
  /**
   * Synthesize `text`. The voice id is ignored: a Piper model bundles exactly
   * one voice (speaker 0 is used for multi-speaker models).
   *
   * @param text     Text to synthesize.
   * @param _voiceId Unused.
   * @param signal   Abort signal; checked between phases.
   * @returns `{ sampleRate, channels }` with a single mono channel.
   * @throws Error when not loaded or phoneme mapping yields no ids.
   */
  async generate(text, _voiceId, signal) {
    if (!this.session || !this.phonemize || !this.voiceConfig) throw new Error("Piper model not loaded");
    // Language tag for the phonemizer comes from the spec's first language.
    const languageTag = this.spec.languages[0]?.toLowerCase() ?? "en-us";
    const phonemeTokens = await this.phonemize(text, languageTag);
    if (signal.aborted) throw this.context.createAbortError();
    // Join the token array and map each character-phoneme to model ids.
    const ids = phonemesToIds(Array.from(phonemeTokens.join("")), this.voiceConfig);
    if (!ids.length) throw new Error("Phoneme mapping produced an empty sequence");
    const ort = await import("onnxruntime-node");
    // Standard Piper inputs: ids, their length, and the three scale knobs
    // (noise_scale, length_scale, noise_w) from the voice config.
    const feedsMap = {
      input: new ort.Tensor("int64", new BigInt64Array(ids.map(BigInt)), [1, ids.length]),
      input_lengths: new ort.Tensor("int64", new BigInt64Array([BigInt(ids.length)]), [1]),
      scales: new ort.Tensor("float32", new Float32Array([
        this.voiceConfig.inference.noise_scale,
        this.voiceConfig.inference.length_scale,
        this.voiceConfig.inference.noise_w
      ]), [3])
    };
    // Multi-speaker models additionally require a speaker id; always 0 here.
    if (this.voiceConfig.num_speakers > 1) feedsMap.sid = new ort.Tensor("int64", new BigInt64Array([BigInt(0)]), [1]);
    const results = await this.session.run(feedsMap);
    // The model's single output tensor carries the PCM samples; its name is
    // not assumed, the first result key is taken instead.
    const audioData = results[Object.keys(results)[0]].data;
    if (signal.aborted) throw this.context.createAbortError();
    return {
      sampleRate: this.voiceConfig.audio.sample_rate,
      channels: [audioData]
    };
  }
  // Static voice list from the spec; this adapter has no dynamic voices.
  listVoices() {
    return this.spec.voices ?? [];
  }
  /** Drop all loaded state so the model can be garbage collected. */
  dispose() {
    this.session = null;
    this.phonemize = null;
    this.voiceConfig = null;
  }
};
|
|
904
|
+
/** TTS adapter for Kokoro models on Node.js using kokoro-js. */
const kokoroNodeAdapter = {
  id: "kokoro-node",
  name: "Kokoro Node",
  // Capability flags consumed by the polytts core.
  capabilities: {
    install: false,
    speak: false,
    synthesize: true,
    stream: false,
    // Voices are discovered from the model at load time (KokoroNodeModel.load).
    dynamicVoices: true
  },
  createModel(spec, context) {
    // The model only needs the abort-error factory from the context.
    return new KokoroNodeModel(spec, () => context.createAbortError());
  }
};
|
|
919
|
+
/** TTS adapter for Piper models on Node.js using native ONNX Runtime. */
const piperNodeAdapter = {
  id: "piper-node",
  name: "Piper Node",
  // Capability flags consumed by the polytts core.
  capabilities: {
    install: false,
    speak: false,
    synthesize: true,
    stream: false,
    // Voices come from the static model spec (PiperNodeModel.listVoices).
    dynamicVoices: false
  },
  createModel(spec, context) {
    return new PiperNodeModel(spec, context);
  }
};
|
|
934
|
+
/** All official Node.js TTS adapters bundled for convenience. */
// Each entry is an adapter object defined earlier in this module.
const officialNodeAdapters = [
  kokoroNodeAdapter,
  piperNodeAdapter,
  kittenNodeAdapter,
  supertonicNodeAdapter
];
|
|
941
|
+
//#endregion
|
|
942
|
+
export { kittenNodeAdapter, kokoroNodeAdapter, officialNodeAdapters, piperNodeAdapter, supertonicNodeAdapter };
|
package/package.json
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@polytts/node-adapters",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Official Node adapter implementations for polytts.",
|
|
5
|
+
"keywords": [
|
|
6
|
+
"adapters",
|
|
7
|
+
"electron",
|
|
8
|
+
"node",
|
|
9
|
+
"polytts",
|
|
10
|
+
"raycast",
|
|
11
|
+
"text-to-speech",
|
|
12
|
+
"tts"
|
|
13
|
+
],
|
|
14
|
+
"homepage": "https://github.com/Dunqing/polytts/tree/main/packages/node-adapters#readme",
|
|
15
|
+
"bugs": {
|
|
16
|
+
"url": "https://github.com/Dunqing/polytts/issues"
|
|
17
|
+
},
|
|
18
|
+
"license": "MIT",
|
|
19
|
+
"repository": {
|
|
20
|
+
"type": "git",
|
|
21
|
+
"url": "git+https://github.com/Dunqing/polytts.git",
|
|
22
|
+
"directory": "packages/node-adapters"
|
|
23
|
+
},
|
|
24
|
+
"files": [
|
|
25
|
+
"dist"
|
|
26
|
+
],
|
|
27
|
+
"type": "module",
|
|
28
|
+
"exports": {
|
|
29
|
+
".": {
|
|
30
|
+
"types": "./dist/index.d.mts",
|
|
31
|
+
"default": "./dist/index.mjs"
|
|
32
|
+
},
|
|
33
|
+
"./package.json": "./package.json"
|
|
34
|
+
},
|
|
35
|
+
"publishConfig": {
|
|
36
|
+
"access": "public"
|
|
37
|
+
},
|
|
38
|
+
"dependencies": {
|
|
39
|
+
"kokoro-js": "latest",
|
|
40
|
+
"onnxruntime-node": "latest",
|
|
41
|
+
"phonemizer": "latest",
|
|
42
|
+
"@polytts/core": "0.1.0"
|
|
43
|
+
},
|
|
44
|
+
"devDependencies": {
|
|
45
|
+
"@types/node": "^25.6.0",
|
|
46
|
+
"vite-plus": "latest",
|
|
47
|
+
"@polytts/presets": "0.1.0"
|
|
48
|
+
},
|
|
49
|
+
"scripts": {
|
|
50
|
+
"build": "vp pack",
|
|
51
|
+
"test": "vp test",
|
|
52
|
+
"test:run": "vp test run"
|
|
53
|
+
},
|
|
54
|
+
"main": "./dist/index.mjs",
|
|
55
|
+
"module": "./dist/index.mjs",
|
|
56
|
+
"types": "./dist/index.d.mts"
|
|
57
|
+
}
|