speechflow 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. package/CHANGELOG.md +12 -0
  2. package/README.md +4 -4
  3. package/package.json +4 -4
  4. package/speechflow-cli/dst/speechflow-main-api.js.map +1 -1
  5. package/speechflow-cli/dst/speechflow-main-cli.js +1 -0
  6. package/speechflow-cli/dst/speechflow-main-cli.js.map +1 -1
  7. package/speechflow-cli/dst/speechflow-main-graph.js +2 -4
  8. package/speechflow-cli/dst/speechflow-main-graph.js.map +1 -1
  9. package/speechflow-cli/dst/speechflow-main-nodes.js +1 -0
  10. package/speechflow-cli/dst/speechflow-main-nodes.js.map +1 -1
  11. package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js +1 -0
  12. package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js.map +1 -1
  13. package/speechflow-cli/dst/speechflow-node-a2a-compressor.js +7 -9
  14. package/speechflow-cli/dst/speechflow-node-a2a-compressor.js.map +1 -1
  15. package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js +1 -0
  16. package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js.map +1 -1
  17. package/speechflow-cli/dst/speechflow-node-a2a-expander.js +8 -9
  18. package/speechflow-cli/dst/speechflow-node-a2a-expander.js.map +1 -1
  19. package/speechflow-cli/dst/speechflow-node-a2a-filler.js +2 -0
  20. package/speechflow-cli/dst/speechflow-node-a2a-filler.js.map +1 -1
  21. package/speechflow-cli/dst/speechflow-node-a2a-gender.js +1 -1
  22. package/speechflow-cli/dst/speechflow-node-a2a-gender.js.map +1 -1
  23. package/speechflow-cli/dst/speechflow-node-a2a-meter.js +1 -1
  24. package/speechflow-cli/dst/speechflow-node-a2a-pitch.js +11 -9
  25. package/speechflow-cli/dst/speechflow-node-a2a-pitch.js.map +1 -1
  26. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise-wt.js +1 -0
  27. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise-wt.js.map +1 -1
  28. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise.js.map +1 -1
  29. package/speechflow-cli/dst/speechflow-node-a2a-speex.js +4 -2
  30. package/speechflow-cli/dst/speechflow-node-a2a-speex.js.map +1 -1
  31. package/speechflow-cli/dst/speechflow-node-a2a-vad.js +19 -22
  32. package/speechflow-cli/dst/speechflow-node-a2a-vad.js.map +1 -1
  33. package/speechflow-cli/dst/speechflow-node-a2a-wav.js +7 -0
  34. package/speechflow-cli/dst/speechflow-node-a2a-wav.js.map +1 -1
  35. package/speechflow-cli/dst/speechflow-node-a2t-amazon.d.ts +0 -1
  36. package/speechflow-cli/dst/speechflow-node-a2t-amazon.js +2 -11
  37. package/speechflow-cli/dst/speechflow-node-a2t-amazon.js.map +1 -1
  38. package/speechflow-cli/dst/speechflow-node-a2t-google.d.ts +0 -1
  39. package/speechflow-cli/dst/speechflow-node-a2t-google.js +0 -6
  40. package/speechflow-cli/dst/speechflow-node-a2t-google.js.map +1 -1
  41. package/speechflow-cli/dst/speechflow-node-a2t-openai.js +6 -1
  42. package/speechflow-cli/dst/speechflow-node-a2t-openai.js.map +1 -1
  43. package/speechflow-cli/dst/speechflow-node-t2a-amazon.d.ts +1 -1
  44. package/speechflow-cli/dst/speechflow-node-t2a-amazon.js +27 -7
  45. package/speechflow-cli/dst/speechflow-node-t2a-amazon.js.map +1 -1
  46. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.d.ts +1 -1
  47. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js +5 -3
  48. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js.map +1 -1
  49. package/speechflow-cli/dst/speechflow-node-t2a-google.js +1 -4
  50. package/speechflow-cli/dst/speechflow-node-t2a-google.js.map +1 -1
  51. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.d.ts +1 -1
  52. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js +27 -6
  53. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js.map +1 -1
  54. package/speechflow-cli/dst/speechflow-node-t2a-openai.js +1 -4
  55. package/speechflow-cli/dst/speechflow-node-t2a-openai.js.map +1 -1
  56. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.d.ts +2 -3
  57. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js +97 -459
  58. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js.map +1 -1
  59. package/speechflow-cli/dst/speechflow-node-t2t-amazon.js +0 -2
  60. package/speechflow-cli/dst/speechflow-node-t2t-amazon.js.map +1 -1
  61. package/speechflow-cli/dst/speechflow-node-t2t-deepl.js.map +1 -1
  62. package/speechflow-cli/dst/speechflow-node-t2t-google.js.map +1 -1
  63. package/speechflow-cli/dst/speechflow-node-t2t-opus.js +18 -16
  64. package/speechflow-cli/dst/speechflow-node-t2t-opus.js.map +1 -1
  65. package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js +2 -3
  66. package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js.map +1 -1
  67. package/speechflow-cli/dst/speechflow-node-t2t-spellcheck.js +2 -3
  68. package/speechflow-cli/dst/speechflow-node-t2t-spellcheck.js.map +1 -1
  69. package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js +5 -2
  70. package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js.map +1 -1
  71. package/speechflow-cli/dst/speechflow-node-t2t-summary.js +2 -3
  72. package/speechflow-cli/dst/speechflow-node-t2t-summary.js.map +1 -1
  73. package/speechflow-cli/dst/speechflow-node-t2t-translate.js +1 -2
  74. package/speechflow-cli/dst/speechflow-node-t2t-translate.js.map +1 -1
  75. package/speechflow-cli/dst/speechflow-node-x2x-filter.js +2 -0
  76. package/speechflow-cli/dst/speechflow-node-x2x-filter.js.map +1 -1
  77. package/speechflow-cli/dst/speechflow-node-xio-exec.js +1 -0
  78. package/speechflow-cli/dst/speechflow-node-xio-exec.js.map +1 -1
  79. package/speechflow-cli/dst/speechflow-node-xio-file.js +3 -5
  80. package/speechflow-cli/dst/speechflow-node-xio-file.js.map +1 -1
  81. package/speechflow-cli/dst/speechflow-node-xio-mqtt.js.map +1 -1
  82. package/speechflow-cli/dst/speechflow-node-xio-vban.js.map +1 -1
  83. package/speechflow-cli/dst/speechflow-node-xio-webrtc.js +2 -0
  84. package/speechflow-cli/dst/speechflow-node-xio-webrtc.js.map +1 -1
  85. package/speechflow-cli/dst/speechflow-node-xio-websocket.js +9 -9
  86. package/speechflow-cli/dst/speechflow-node-xio-websocket.js.map +1 -1
  87. package/speechflow-cli/dst/speechflow-util-audio.js +4 -0
  88. package/speechflow-cli/dst/speechflow-util-audio.js.map +1 -1
  89. package/speechflow-cli/dst/speechflow-util-llm.d.ts +0 -1
  90. package/speechflow-cli/dst/speechflow-util-llm.js +4 -8
  91. package/speechflow-cli/dst/speechflow-util-llm.js.map +1 -1
  92. package/speechflow-cli/dst/speechflow-util-queue.js +2 -1
  93. package/speechflow-cli/dst/speechflow-util-queue.js.map +1 -1
  94. package/speechflow-cli/dst/speechflow-util.js +1 -0
  95. package/speechflow-cli/dst/speechflow-util.js.map +1 -1
  96. package/speechflow-cli/dst/test.d.ts +1 -0
  97. package/speechflow-cli/dst/test.js +18 -0
  98. package/speechflow-cli/dst/test.js.map +1 -0
  99. package/speechflow-cli/etc/oxlint.jsonc +3 -1
  100. package/speechflow-cli/package.json +16 -16
  101. package/speechflow-cli/src/speechflow-main-api.ts +16 -16
  102. package/speechflow-cli/src/speechflow-main-cli.ts +1 -0
  103. package/speechflow-cli/src/speechflow-main-graph.ts +7 -9
  104. package/speechflow-cli/src/speechflow-main-nodes.ts +1 -0
  105. package/speechflow-cli/src/speechflow-node-a2a-compressor-wt.ts +1 -0
  106. package/speechflow-cli/src/speechflow-node-a2a-compressor.ts +8 -10
  107. package/speechflow-cli/src/speechflow-node-a2a-expander-wt.ts +1 -0
  108. package/speechflow-cli/src/speechflow-node-a2a-expander.ts +9 -10
  109. package/speechflow-cli/src/speechflow-node-a2a-filler.ts +2 -0
  110. package/speechflow-cli/src/speechflow-node-a2a-gender.ts +3 -3
  111. package/speechflow-cli/src/speechflow-node-a2a-meter.ts +2 -2
  112. package/speechflow-cli/src/speechflow-node-a2a-pitch.ts +11 -9
  113. package/speechflow-cli/src/speechflow-node-a2a-rnnoise-wt.ts +1 -0
  114. package/speechflow-cli/src/speechflow-node-a2a-rnnoise.ts +1 -1
  115. package/speechflow-cli/src/speechflow-node-a2a-speex.ts +5 -3
  116. package/speechflow-cli/src/speechflow-node-a2a-vad.ts +20 -23
  117. package/speechflow-cli/src/speechflow-node-a2a-wav.ts +7 -0
  118. package/speechflow-cli/src/speechflow-node-a2t-amazon.ts +6 -18
  119. package/speechflow-cli/src/speechflow-node-a2t-google.ts +4 -11
  120. package/speechflow-cli/src/speechflow-node-a2t-openai.ts +12 -7
  121. package/speechflow-cli/src/speechflow-node-t2a-amazon.ts +32 -10
  122. package/speechflow-cli/src/speechflow-node-t2a-elevenlabs.ts +6 -4
  123. package/speechflow-cli/src/speechflow-node-t2a-google.ts +1 -4
  124. package/speechflow-cli/src/speechflow-node-t2a-kokoro.ts +33 -10
  125. package/speechflow-cli/src/speechflow-node-t2a-openai.ts +1 -4
  126. package/speechflow-cli/src/speechflow-node-t2a-supertonic.ts +106 -571
  127. package/speechflow-cli/src/speechflow-node-t2t-amazon.ts +1 -3
  128. package/speechflow-cli/src/speechflow-node-t2t-deepl.ts +2 -2
  129. package/speechflow-cli/src/speechflow-node-t2t-google.ts +1 -1
  130. package/speechflow-cli/src/speechflow-node-t2t-opus.ts +19 -18
  131. package/speechflow-cli/src/speechflow-node-t2t-punctuation.ts +2 -3
  132. package/speechflow-cli/src/speechflow-node-t2t-spellcheck.ts +2 -3
  133. package/speechflow-cli/src/speechflow-node-t2t-subtitle.ts +5 -2
  134. package/speechflow-cli/src/speechflow-node-t2t-summary.ts +2 -3
  135. package/speechflow-cli/src/speechflow-node-t2t-translate.ts +1 -2
  136. package/speechflow-cli/src/speechflow-node-x2x-filter.ts +2 -0
  137. package/speechflow-cli/src/speechflow-node-xio-exec.ts +1 -0
  138. package/speechflow-cli/src/speechflow-node-xio-file.ts +3 -5
  139. package/speechflow-cli/src/speechflow-node-xio-mqtt.ts +2 -2
  140. package/speechflow-cli/src/speechflow-node-xio-vban.ts +5 -5
  141. package/speechflow-cli/src/speechflow-node-xio-webrtc.ts +2 -0
  142. package/speechflow-cli/src/speechflow-node-xio-websocket.ts +9 -9
  143. package/speechflow-cli/src/speechflow-util-audio.ts +5 -0
  144. package/speechflow-cli/src/speechflow-util-llm.ts +4 -9
  145. package/speechflow-cli/src/speechflow-util-queue.ts +4 -4
  146. package/speechflow-cli/src/speechflow-util.ts +1 -0
  147. package/speechflow-ui-db/dst/index.js +14 -14
  148. package/speechflow-ui-db/package.json +6 -6
  149. package/speechflow-ui-st/dst/index.js +32 -32
  150. package/speechflow-ui-st/package.json +6 -6
@@ -42,398 +42,22 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
42
42
  };
43
43
  Object.defineProperty(exports, "__esModule", { value: true });
44
44
  /* standard dependencies */
45
- const node_fs_1 = __importDefault(require("node:fs"));
46
- const node_path_1 = __importDefault(require("node:path"));
47
45
  const node_stream_1 = __importDefault(require("node:stream"));
48
46
  /* external dependencies */
49
- const mkdirp_1 = require("mkdirp");
50
- const HF = __importStar(require("@huggingface/hub"));
47
+ const Transformers = __importStar(require("@huggingface/transformers"));
51
48
  const speex_resampler_1 = __importDefault(require("speex-resampler"));
52
49
  const luxon_1 = require("luxon");
53
- /* @ts-expect-error no type available */
54
- const ORT = __importStar(require("onnxruntime-node"));
55
50
  /* internal dependencies */
56
51
  const speechflow_node_1 = __importDefault(require("./speechflow-node"));
57
52
  const util = __importStar(require("./speechflow-util"));
58
- /* convert lengths to binary mask */
59
- function lengthToMask(lengths, maxLen = null) {
60
- /* handle empty input */
61
- if (lengths.length === 0)
62
- return [];
63
- /* determine maximum length */
64
- maxLen = maxLen ?? Math.max(...lengths);
65
- /* build mask array */
66
- const mask = [];
67
- for (let i = 0; i < lengths.length; i++) {
68
- const row = [];
69
- for (let j = 0; j < maxLen; j++)
70
- row.push(j < lengths[i] ? 1.0 : 0.0);
71
- mask.push([row]);
72
- }
73
- return mask;
74
- }
75
- /* get latent mask from wav lengths */
76
- function getLatentMask(wavLengths, baseChunkSize, chunkCompressFactor) {
77
- /* calculate latent size and lengths */
78
- const latentSize = baseChunkSize * chunkCompressFactor;
79
- const latentLengths = wavLengths.map((len) => Math.floor((len + latentSize - 1) / latentSize));
80
- /* generate mask from latent lengths */
81
- return lengthToMask(latentLengths);
82
- }
83
- /* convert array to ONNX tensor */
84
- function arrayToTensor(array, dims) {
85
- /* flatten array and create float32 tensor */
86
- const flat = array.flat(Infinity);
87
- return new ORT.Tensor("float32", Float32Array.from(flat), dims);
88
- }
89
- /* convert int array to ONNX tensor */
90
- function intArrayToTensor(array, dims) {
91
- /* flatten array and create int64 tensor */
92
- const flat = array.flat(Infinity);
93
- return new ORT.Tensor("int64", BigInt64Array.from(flat.map(BigInt)), dims);
94
- }
95
- /* chunk text into manageable segments */
96
- function chunkText(text, maxLen = 300) {
97
- /* validate input type */
98
- if (typeof text !== "string")
99
- throw new Error(`chunkText expects a string, got ${typeof text}`);
100
- /* split by paragraph (two or more newlines) */
101
- const paragraphs = text.trim().split(/\n\s*\n+/).filter((p) => p.trim());
102
- /* process each paragraph into chunks */
103
- const chunks = [];
104
- for (let paragraph of paragraphs) {
105
- paragraph = paragraph.trim();
106
- if (!paragraph)
107
- continue;
108
- /* split by sentence boundaries (period, question mark, exclamation mark followed by space)
109
- but exclude common abbreviations like Mr., Mrs., Dr., etc. and single capital letters like F. */
110
- const sentences = paragraph.split(/(?<!Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.|Sr\.|Jr\.|Ph\.D\.|etc\.|e\.g\.|i\.e\.|vs\.|Inc\.|Ltd\.|Co\.|Corp\.|St\.|Ave\.|Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+/);
111
- /* accumulate sentences into chunks respecting max length */
112
- let currentChunk = "";
113
- for (const sentence of sentences) {
114
- if (currentChunk.length + sentence.length + 1 <= maxLen)
115
- currentChunk += (currentChunk ? " " : "") + sentence;
116
- else {
117
- if (currentChunk)
118
- chunks.push(currentChunk.trim());
119
- currentChunk = sentence;
120
- }
121
- }
122
- /* push remaining chunk */
123
- if (currentChunk)
124
- chunks.push(currentChunk.trim());
125
- }
126
- return chunks;
127
- }
128
- /* unicode text processor class */
129
- class SupertonicTextProcessor {
130
- indexer;
131
- constructor(unicodeIndexerJsonPath) {
132
- /* load and parse unicode indexer JSON */
133
- try {
134
- this.indexer = JSON.parse(node_fs_1.default.readFileSync(unicodeIndexerJsonPath, "utf8"));
135
- }
136
- catch (err) {
137
- throw new Error(`failed to parse unicode indexer JSON "${unicodeIndexerJsonPath}"`, { cause: err });
138
- }
139
- }
140
- preprocessText(text) {
141
- /* normalize text */
142
- text = text.normalize("NFKD");
143
- /* remove emojis (wide Unicode range) */
144
- const emojiPattern = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu;
145
- text = text.replace(emojiPattern, "");
146
- /* replace various dashes and symbols */
147
- const replacements = {
148
- "–": "-",
149
- "‑": "-",
150
- "—": "-",
151
- "¯": " ",
152
- "_": " ",
153
- "\u201C": "\"",
154
- "\u201D": "\"",
155
- "\u2018": "'",
156
- "\u2019": "'",
157
- "´": "'",
158
- "`": "'",
159
- "[": " ",
160
- "]": " ",
161
- "|": " ",
162
- "/": " ",
163
- "#": " ",
164
- "→": " ",
165
- "←": " "
166
- };
167
- for (const [k, v] of Object.entries(replacements))
168
- text = text.replaceAll(k, v);
169
- /* remove combining diacritics */
170
- text = text.replace(/[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]/g, "");
171
- /* remove special symbols */
172
- text = text.replace(/[♥☆♡©\\]/g, "");
173
- /* replace known expressions */
174
- const exprReplacements = {
175
- "@": " at ",
176
- "e.g.,": "for example, ",
177
- "i.e.,": "that is, "
178
- };
179
- for (const [k, v] of Object.entries(exprReplacements))
180
- text = text.replaceAll(k, v);
181
- /* fix spacing around punctuation */
182
- text = text.replace(/ ,/g, ",");
183
- text = text.replace(/ \./g, ".");
184
- text = text.replace(/ !/g, "!");
185
- text = text.replace(/ \?/g, "?");
186
- text = text.replace(/ ;/g, ";");
187
- text = text.replace(/ :/g, ":");
188
- text = text.replace(/ '/g, "'");
189
- /* remove duplicate quotes */
190
- text = text.replace(/""+/g, "\"");
191
- text = text.replace(/''+/g, "'");
192
- text = text.replace(/``+/g, "`");
193
- /* remove extra spaces */
194
- text = text.replace(/\s+/g, " ").trim();
195
- /* if text doesn't end with punctuation, add a period */
196
- if (!/[.!?;:,'"')\]}…。」』】〉》›»]$/.test(text))
197
- text += ".";
198
- return text;
199
- }
200
- textToUnicodeValues(text) {
201
- /* convert text characters to unicode code points */
202
- return Array.from(text).map((char) => char.charCodeAt(0));
203
- }
204
- call(textList) {
205
- /* handle empty input */
206
- if (textList.length === 0)
207
- return { textIds: [], textMask: [] };
208
- /* preprocess all texts */
209
- const processedTexts = textList.map((t) => this.preprocessText(t));
210
- const textIdsLengths = processedTexts.map((t) => t.length);
211
- const maxLen = Math.max(...textIdsLengths);
212
- /* convert texts to indexed token arrays */
213
- const textIds = [];
214
- for (let i = 0; i < processedTexts.length; i++) {
215
- const row = Array.from({ length: maxLen }).fill(0);
216
- const unicodeVals = this.textToUnicodeValues(processedTexts[i]);
217
- for (let j = 0; j < unicodeVals.length; j++)
218
- row[j] = this.indexer[unicodeVals[j]] ?? 0;
219
- textIds.push(row);
220
- }
221
- /* generate text mask from lengths */
222
- const textMask = lengthToMask(textIdsLengths);
223
- return { textIds, textMask };
224
- }
225
- }
226
- /* Supertonic TTS engine class */
227
- class SupertonicTTS {
228
- sampleRate;
229
- cfgs;
230
- textProcessor;
231
- dpOrt;
232
- textEncOrt;
233
- vectorEstOrt;
234
- vocoderOrt;
235
- baseChunkSize;
236
- chunkCompressFactor;
237
- latentDim;
238
- constructor(cfgs, textProcessor, dpOrt, textEncOrt, vectorEstOrt, vocoderOrt) {
239
- /* store configuration and dependencies */
240
- this.cfgs = cfgs;
241
- this.textProcessor = textProcessor;
242
- this.dpOrt = dpOrt;
243
- this.textEncOrt = textEncOrt;
244
- this.vectorEstOrt = vectorEstOrt;
245
- this.vocoderOrt = vocoderOrt;
246
- /* extract configuration values */
247
- this.sampleRate = cfgs.ae.sample_rate;
248
- this.baseChunkSize = cfgs.ae.base_chunk_size;
249
- this.chunkCompressFactor = cfgs.ttl.chunk_compress_factor;
250
- this.latentDim = cfgs.ttl.latent_dim;
251
- }
252
- sampleNoisyLatent(duration) {
253
- /* calculate dimensions for latent space */
254
- const wavLenMax = Math.max(...duration) * this.sampleRate;
255
- const wavLengths = duration.map((d) => Math.floor(d * this.sampleRate));
256
- const chunkSize = this.baseChunkSize * this.chunkCompressFactor;
257
- const latentLen = Math.floor((wavLenMax + chunkSize - 1) / chunkSize);
258
- const latentDimExpanded = this.latentDim * this.chunkCompressFactor;
259
- /* generate random noise (pre-allocate arrays for performance) */
260
- const noisyLatent = Array.from({ length: duration.length });
261
- for (let b = 0; b < duration.length; b++) {
262
- const batch = Array.from({ length: latentDimExpanded });
263
- for (let d = 0; d < latentDimExpanded; d++) {
264
- const row = Array.from({ length: latentLen });
265
- for (let t = 0; t < latentLen; t++) {
266
- /* Box-Muller transform for normal distribution */
267
- const eps = 1e-10;
268
- const u1 = Math.max(eps, Math.random());
269
- const u2 = Math.random();
270
- row[t] = Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math.PI * u2);
271
- }
272
- batch[d] = row;
273
- }
274
- noisyLatent[b] = batch;
275
- }
276
- /* apply mask */
277
- const latentMask = getLatentMask(wavLengths, this.baseChunkSize, this.chunkCompressFactor);
278
- for (let b = 0; b < noisyLatent.length; b++) {
279
- for (let d = 0; d < noisyLatent[b].length; d++) {
280
- for (let t = 0; t < noisyLatent[b][d].length; t++)
281
- noisyLatent[b][d][t] *= latentMask[b][0][t];
282
- }
283
- }
284
- return { noisyLatent, latentMask };
285
- }
286
- async infer(textList, style, totalStep, speed) {
287
- /* validate batch size matches style vectors */
288
- if (textList.length !== style.ttl.dims[0])
289
- throw new Error("Number of texts must match number of style vectors");
290
- /* process text into token IDs and masks */
291
- const batchSize = textList.length;
292
- const { textIds, textMask } = this.textProcessor.call(textList);
293
- const textIdsShape = [batchSize, textIds[0].length];
294
- const textMaskShape = [batchSize, 1, textMask[0][0].length];
295
- const textMaskTensor = arrayToTensor(textMask, textMaskShape);
296
- /* run duration predictor model */
297
- const dpResult = await this.dpOrt.run({
298
- text_ids: intArrayToTensor(textIds, textIdsShape),
299
- style_dp: style.dp,
300
- text_mask: textMaskTensor
301
- });
302
- const predictedDurations = Array.from(dpResult.duration.data);
303
- /* apply speed factor to duration */
304
- for (let i = 0; i < predictedDurations.length; i++)
305
- predictedDurations[i] /= speed;
306
- /* run text encoder model */
307
- const textEncResult = await this.textEncOrt.run({
308
- text_ids: intArrayToTensor(textIds, textIdsShape),
309
- style_ttl: style.ttl,
310
- text_mask: textMaskTensor
311
- });
312
- const textEmbTensor = textEncResult.text_emb;
313
- /* sample initial noisy latent vectors */
314
- const { noisyLatent, latentMask } = this.sampleNoisyLatent(predictedDurations);
315
- const latentShape = [batchSize, noisyLatent[0].length, noisyLatent[0][0].length];
316
- const latentMaskShape = [batchSize, 1, latentMask[0][0].length];
317
- const latentMaskTensor = arrayToTensor(latentMask, latentMaskShape);
318
- /* prepare step tensors */
319
- const totalStepArray = Array.from({ length: batchSize }).fill(totalStep);
320
- const scalarShape = [batchSize];
321
- const totalStepTensor = arrayToTensor(totalStepArray, scalarShape);
322
- /* iteratively denoise latent vectors */
323
- for (let step = 0; step < totalStep; step++) {
324
- const currentStepArray = Array.from({ length: batchSize }).fill(step);
325
- /* run vector estimator model */
326
- const vectorEstResult = await this.vectorEstOrt.run({
327
- noisy_latent: arrayToTensor(noisyLatent, latentShape),
328
- text_emb: textEmbTensor,
329
- style_ttl: style.ttl,
330
- text_mask: textMaskTensor,
331
- latent_mask: latentMaskTensor,
332
- total_step: totalStepTensor,
333
- current_step: arrayToTensor(currentStepArray, scalarShape)
334
- });
335
- const denoisedLatent = Array.from(vectorEstResult.denoised_latent.data);
336
- /* update latent with the denoised output */
337
- let idx = 0;
338
- for (let b = 0; b < noisyLatent.length; b++)
339
- for (let d = 0; d < noisyLatent[b].length; d++)
340
- for (let t = 0; t < noisyLatent[b][d].length; t++)
341
- noisyLatent[b][d][t] = denoisedLatent[idx++];
342
- }
343
- /* run vocoder to generate audio waveform */
344
- const vocoderResult = await this.vocoderOrt.run({
345
- latent: arrayToTensor(noisyLatent, latentShape)
346
- });
347
- const wav = Array.from(vocoderResult.wav_tts.data);
348
- return { wav, duration: predictedDurations };
349
- }
350
- async synthesize(text, style, totalStep, speed, silenceDuration = 0.3) {
351
- /* validate single speaker mode */
352
- if (style.ttl.dims[0] !== 1)
353
- throw new Error("Single speaker text to speech only supports single style");
354
- /* chunk text into segments */
355
- const textList = chunkText(text);
356
- if (textList.length === 0)
357
- return { wav: [], duration: 0 };
358
- /* synthesize each chunk and concatenate with silence */
359
- const wavParts = [];
360
- let totalDuration = 0;
361
- for (const chunk of textList) {
362
- const { wav, duration } = await this.infer([chunk], style, totalStep, speed);
363
- /* insert silence between chunks */
364
- if (wavParts.length > 0) {
365
- const silenceLen = Math.floor(silenceDuration * this.sampleRate);
366
- wavParts.push(Array.from({ length: silenceLen }).fill(0));
367
- totalDuration += silenceDuration;
368
- }
369
- wavParts.push(wav);
370
- totalDuration += duration[0];
371
- }
372
- return { wav: wavParts.flat(), duration: totalDuration };
373
- }
374
- async release() {
375
- /* release all ONNX inference sessions */
376
- await Promise.all([
377
- this.dpOrt.release(),
378
- this.textEncOrt.release(),
379
- this.vectorEstOrt.release(),
380
- this.vocoderOrt.release()
381
- ]);
382
- }
383
- }
384
- /* load voice style from JSON file */
385
- async function loadVoiceStyle(voiceStylePath) {
386
- /* read and parse voice style JSON */
387
- let voiceStyle;
388
- try {
389
- voiceStyle = JSON.parse(await node_fs_1.default.promises.readFile(voiceStylePath, "utf8"));
390
- }
391
- catch (err) {
392
- throw new Error(`failed to parse voice style JSON "${voiceStylePath}"`, { cause: err });
393
- }
394
- /* extract dimensions and data */
395
- const ttlDims = voiceStyle.style_ttl.dims;
396
- const dpDims = voiceStyle.style_dp.dims;
397
- const ttlData = voiceStyle.style_ttl.data.flat(Infinity);
398
- const dpData = voiceStyle.style_dp.data.flat(Infinity);
399
- /* create ONNX tensors for style vectors */
400
- const ttlStyle = new ORT.Tensor("float32", Float32Array.from(ttlData), ttlDims);
401
- const dpStyle = new ORT.Tensor("float32", Float32Array.from(dpData), dpDims);
402
- return { ttl: ttlStyle, dp: dpStyle };
403
- }
404
- /* load TTS engine from ONNX models */
405
- async function loadSupertonic(assetsDir) {
406
- /* load configuration */
407
- const cfgPath = node_path_1.default.join(assetsDir, "onnx", "tts.json");
408
- let cfgs;
409
- try {
410
- cfgs = JSON.parse(await node_fs_1.default.promises.readFile(cfgPath, "utf8"));
411
- }
412
- catch (err) {
413
- throw new Error(`failed to parse TTS config JSON "${cfgPath}"`, { cause: err });
414
- }
415
- /* load text processor */
416
- const unicodeIndexerPath = node_path_1.default.join(assetsDir, "onnx", "unicode_indexer.json");
417
- const textProcessor = new SupertonicTextProcessor(unicodeIndexerPath);
418
- /* load ONNX models */
419
- const opts = {};
420
- const [dpOrt, textEncOrt, vectorEstOrt, vocoderOrt] = await Promise.all([
421
- ORT.InferenceSession.create(node_path_1.default.join(assetsDir, "onnx", "duration_predictor.onnx"), opts),
422
- ORT.InferenceSession.create(node_path_1.default.join(assetsDir, "onnx", "text_encoder.onnx"), opts),
423
- ORT.InferenceSession.create(node_path_1.default.join(assetsDir, "onnx", "vector_estimator.onnx"), opts),
424
- ORT.InferenceSession.create(node_path_1.default.join(assetsDir, "onnx", "vocoder.onnx"), opts)
425
- ]);
426
- return new SupertonicTTS(cfgs, textProcessor, dpOrt, textEncOrt, vectorEstOrt, vocoderOrt);
427
- }
428
- /* ==== SPEECHFLOW NODE IMPLEMENTATION ==== */
429
53
  /* SpeechFlow node for Supertonic text-to-speech conversion */
430
54
  class SpeechFlowNodeT2ASupertonic extends speechflow_node_1.default {
431
55
  /* declare official node name */
432
56
  static name = "t2a-supertonic";
433
57
  /* internal state */
434
- supertonic = null;
435
- style = null;
58
+ tts = null;
436
59
  resampler = null;
60
+ sampleRate = 44100;
437
61
  closing = false;
438
62
  /* construct node */
439
63
  constructor(id, cfg, opts, args) {
@@ -452,75 +76,98 @@ class SpeechFlowNodeT2ASupertonic extends speechflow_node_1.default {
452
76
  async status() {
453
77
  return {};
454
78
  }
455
- /* download HuggingFace assets */
456
- async downloadAssets() {
457
- /* define HuggingFace repository and required files */
458
- const assetRepo = "Supertone/supertonic";
459
- const assetFiles = [
460
- "voice_styles/F1.json",
461
- "voice_styles/F2.json",
462
- "voice_styles/M1.json",
463
- "voice_styles/M2.json",
464
- "onnx/tts.json",
465
- "onnx/duration_predictor.onnx",
466
- "onnx/text_encoder.onnx",
467
- "onnx/unicode_indexer.json",
468
- "onnx/vector_estimator.onnx",
469
- "onnx/vocoder.onnx",
470
- ];
471
- /* create asset directories */
472
- const assetDir = node_path_1.default.join(this.config.cacheDir, "supertonic");
473
- await (0, mkdirp_1.mkdirp)(node_path_1.default.join(assetDir, "voice_styles"), { mode: 0o750 });
474
- await (0, mkdirp_1.mkdirp)(node_path_1.default.join(assetDir, "onnx"), { mode: 0o750 });
475
- /* download missing asset files */
476
- for (const assetFile of assetFiles) {
477
- const url = `${assetRepo}/${assetFile}`;
478
- const file = node_path_1.default.join(assetDir, assetFile);
479
- const stat = await node_fs_1.default.promises.stat(file).catch((_err) => null);
480
- if (stat === null || !stat.isFile()) {
481
- this.log("info", `downloading from HuggingFace "${url}"`);
482
- const response = await HF.downloadFile({ repo: assetRepo, path: assetFile });
483
- if (!response)
484
- throw new Error(`failed to download from HuggingFace "${url}"`);
485
- const buffer = Buffer.from(await response.arrayBuffer());
486
- await node_fs_1.default.promises.writeFile(file, buffer);
487
- }
488
- }
489
- return assetDir;
490
- }
491
79
  /* open node */
492
80
  async open() {
493
81
  this.closing = false;
494
- /* download assets */
495
- const assetsDir = await this.downloadAssets();
496
- /* download ONNX models */
497
- this.log("info", `loading ONNX models (asset dir: "${assetsDir}")`);
498
- this.supertonic = await loadSupertonic(assetsDir);
499
- this.log("info", `loaded ONNX models (sample rate: ${this.supertonic.sampleRate}Hz)`);
500
- /* load voice style */
501
- const voiceStylePath = node_path_1.default.join(assetsDir, "voice_styles", `${this.params.voice}.json`);
502
- if (!node_fs_1.default.existsSync(voiceStylePath))
503
- throw new Error(`voice style not found: ${voiceStylePath}`);
504
- this.log("info", `loading voice style "${this.params.voice}"`);
505
- this.style = await loadVoiceStyle(voiceStylePath);
506
- this.log("info", `loaded voice style "${this.params.voice}"`);
82
+ /* load Supertonic TTS pipeline via transformers.js */
83
+ const model = "onnx-community/Supertonic-TTS-ONNX";
84
+ this.log("info", `loading Supertonic TTS model "${model}"`);
85
+ /* track download progress */
86
+ const progressState = new Map();
87
+ const progressCallback = (progress) => {
88
+ let artifact = model;
89
+ if (typeof progress.file === "string")
90
+ artifact += `:${progress.file}`;
91
+ let percent = 0;
92
+ if (typeof progress.loaded === "number" && typeof progress.total === "number")
93
+ percent = (progress.loaded / progress.total) * 100;
94
+ else if (typeof progress.progress === "number")
95
+ percent = progress.progress;
96
+ if (percent > 0)
97
+ progressState.set(artifact, percent);
98
+ };
99
+ let interval = setInterval(() => {
100
+ for (const [artifact, percent] of progressState) {
101
+ this.log("info", `downloaded ${percent.toFixed(2)}% of artifact "${artifact}"`);
102
+ if (percent >= 100.0)
103
+ progressState.delete(artifact);
104
+ }
105
+ if (progressState.size === 0 && interval !== null) {
106
+ clearInterval(interval);
107
+ interval = null;
108
+ }
109
+ }, 1000);
110
+ /* create TTS pipeline */
111
+ try {
112
+ const tts = Transformers.pipeline("text-to-speech", model, {
113
+ dtype: "fp32",
114
+ progress_callback: progressCallback
115
+ });
116
+ this.tts = await tts;
117
+ }
118
+ finally {
119
+ if (interval !== null) {
120
+ clearInterval(interval);
121
+ interval = null;
122
+ }
123
+ }
124
+ if (this.tts === null)
125
+ throw new Error("failed to instantiate Supertonic TTS pipeline");
126
+ /* determine sample rate from model config */
127
+ const config = this.tts.model?.config;
128
+ if (config?.sampling_rate)
129
+ this.sampleRate = config.sampling_rate;
130
+ this.log("info", `loaded Supertonic TTS model (sample rate: ${this.sampleRate}Hz)`);
507
131
  /* establish resampler from Supertonic's output sample rate to our standard audio sample rate (48kHz) */
508
- this.resampler = new speex_resampler_1.default(1, this.supertonic.sampleRate, this.config.audioSampleRate, 7);
132
+ this.resampler = new speex_resampler_1.default(1, this.sampleRate, this.config.audioSampleRate, 7);
133
+ /* map voice names to speaker embedding URLs */
134
+ const voiceUrls = {
135
+ "M1": "https://huggingface.co/onnx-community/Supertonic-TTS-ONNX/resolve/main/voices/M1.bin",
136
+ "M2": "https://huggingface.co/onnx-community/Supertonic-TTS-ONNX/resolve/main/voices/M2.bin",
137
+ "F1": "https://huggingface.co/onnx-community/Supertonic-TTS-ONNX/resolve/main/voices/F1.bin",
138
+ "F2": "https://huggingface.co/onnx-community/Supertonic-TTS-ONNX/resolve/main/voices/F2.bin"
139
+ };
140
+ const speakerEmbeddings = voiceUrls[this.params.voice];
141
+ if (speakerEmbeddings === undefined)
142
+ throw new Error(`invalid Supertonic voice "${this.params.voice}"`);
143
+ this.log("info", `using voice "${this.params.voice}"`);
509
144
  /* perform text-to-speech operation with Supertonic */
510
145
  const text2speech = async (text) => {
511
- /* synthesize speech from text */
512
146
  this.log("info", `Supertonic: input: "${text}"`);
513
- const { wav, duration } = await this.supertonic.synthesize(text, this.style, this.params.steps, this.params.speed);
147
+ /* generate speech using transformers.js pipeline */
148
+ const result = await this.tts(text, {
149
+ speaker_embeddings: speakerEmbeddings,
150
+ num_inference_steps: this.params.steps,
151
+ speed: this.params.speed
152
+ });
153
+ /* extract audio samples and sample rate */
154
+ if (!(result.audio instanceof Float32Array))
155
+ throw new Error("unexpected Supertonic result: audio is not a Float32Array");
156
+ if (typeof result.sampling_rate !== "number")
157
+ throw new Error("unexpected Supertonic result: sampling_rate is not a number");
158
+ const samples = result.audio;
159
+ const outputSampleRate = result.sampling_rate;
160
+ if (outputSampleRate !== this.sampleRate)
161
+ this.log("warn", `unexpected sample rate ${outputSampleRate}Hz (expected ${this.sampleRate}Hz)`);
162
+ /* calculate duration */
163
+ const duration = samples.length / outputSampleRate;
514
164
  this.log("info", `Supertonic: synthesized ${duration.toFixed(2)}s of audio`);
515
165
  /* convert audio samples from PCM/F32 to PCM/I16 */
516
- const buffer1 = Buffer.alloc(wav.length * 2);
517
- for (let i = 0; i < wav.length; i++) {
518
- const sample = Math.max(-1, Math.min(1, wav[i]));
519
- buffer1.writeInt16LE(sample * 0x7FFF, i * 2);
520
- }
521
- /* resample audio samples from 44.1kHz to 48kHz */
522
- const buffer2 = this.resampler.processChunk(buffer1);
523
- return buffer2;
166
+ const buffer1 = util.convertF32ToBuf(samples);
167
+ /* resample audio samples from Supertonic sample rate to 48kHz */
168
+ if (this.resampler === null)
169
+ throw new Error("resampler destroyed during TTS processing");
170
+ return this.resampler.processChunk(buffer1);
524
171
  };
525
172
  /* create transform stream and connect it to the Supertonic TTS */
526
173
  const self = this;
@@ -529,11 +176,13 @@ class SpeechFlowNodeT2ASupertonic extends speechflow_node_1.default {
529
176
  readableObjectMode: true,
530
177
  decodeStrings: false,
531
178
  highWaterMark: 1,
532
- async transform(chunk, encoding, callback) {
179
+ transform(chunk, encoding, callback) {
533
180
  if (self.closing)
534
181
  callback(new Error("stream already destroyed"));
535
182
  else if (Buffer.isBuffer(chunk.payload))
536
183
  callback(new Error("invalid chunk payload type"));
184
+ else if (chunk.payload === "")
185
+ callback();
537
186
  else {
538
187
  let processTimeout = setTimeout(() => {
539
188
  processTimeout = null;
@@ -545,13 +194,7 @@ class SpeechFlowNodeT2ASupertonic extends speechflow_node_1.default {
545
194
  processTimeout = null;
546
195
  }
547
196
  };
548
- try {
549
- if (self.closing) {
550
- clearProcessTimeout();
551
- callback(new Error("stream destroyed during processing"));
552
- return;
553
- }
554
- const buffer = await text2speech(chunk.payload);
197
+ text2speech(chunk.payload).then((buffer) => {
555
198
  if (self.closing) {
556
199
  clearProcessTimeout();
557
200
  callback(new Error("stream destroyed during processing"));
@@ -565,16 +208,13 @@ class SpeechFlowNodeT2ASupertonic extends speechflow_node_1.default {
565
208
  chunkNew.type = "audio";
566
209
  chunkNew.payload = buffer;
567
210
  chunkNew.timestampEnd = luxon_1.Duration.fromMillis(chunkNew.timestampStart.toMillis() + durationMs);
568
- /* push chunk and complete transform */
569
211
  clearProcessTimeout();
570
212
  this.push(chunkNew);
571
213
  callback();
572
- }
573
- catch (error) {
574
- /* handle processing errors */
214
+ }).catch((error) => {
575
215
  clearProcessTimeout();
576
216
  callback(util.ensureError(error, "Supertonic processing failed"));
577
- }
217
+ });
578
218
  }
579
219
  },
580
220
  final(callback) {
@@ -591,16 +231,14 @@ class SpeechFlowNodeT2ASupertonic extends speechflow_node_1.default {
591
231
  await util.destroyStream(this.stream);
592
232
  this.stream = null;
593
233
  }
594
- /* destroy voice style */
595
- if (this.style !== null)
596
- this.style = null;
597
234
  /* destroy resampler */
598
235
  if (this.resampler !== null)
599
236
  this.resampler = null;
600
- /* destroy Supertonic TTS */
601
- if (this.supertonic !== null) {
602
- await this.supertonic.release();
603
- this.supertonic = null;
237
+ /* destroy TTS pipeline */
238
+ if (this.tts !== null) {
239
+ /* dispose of the pipeline if possible */
240
+ await this.tts.dispose();
241
+ this.tts = null;
604
242
  }
605
243
  }
606
244
  }