speechflow 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. package/CHANGELOG.md +12 -0
  2. package/README.md +4 -4
  3. package/package.json +4 -4
  4. package/speechflow-cli/dst/speechflow-main-api.js.map +1 -1
  5. package/speechflow-cli/dst/speechflow-main-cli.js +1 -0
  6. package/speechflow-cli/dst/speechflow-main-cli.js.map +1 -1
  7. package/speechflow-cli/dst/speechflow-main-graph.js +2 -4
  8. package/speechflow-cli/dst/speechflow-main-graph.js.map +1 -1
  9. package/speechflow-cli/dst/speechflow-main-nodes.js +1 -0
  10. package/speechflow-cli/dst/speechflow-main-nodes.js.map +1 -1
  11. package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js +1 -0
  12. package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js.map +1 -1
  13. package/speechflow-cli/dst/speechflow-node-a2a-compressor.js +7 -9
  14. package/speechflow-cli/dst/speechflow-node-a2a-compressor.js.map +1 -1
  15. package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js +1 -0
  16. package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js.map +1 -1
  17. package/speechflow-cli/dst/speechflow-node-a2a-expander.js +8 -9
  18. package/speechflow-cli/dst/speechflow-node-a2a-expander.js.map +1 -1
  19. package/speechflow-cli/dst/speechflow-node-a2a-filler.js +2 -0
  20. package/speechflow-cli/dst/speechflow-node-a2a-filler.js.map +1 -1
  21. package/speechflow-cli/dst/speechflow-node-a2a-gender.js +1 -1
  22. package/speechflow-cli/dst/speechflow-node-a2a-gender.js.map +1 -1
  23. package/speechflow-cli/dst/speechflow-node-a2a-meter.js +1 -1
  24. package/speechflow-cli/dst/speechflow-node-a2a-pitch.js +11 -9
  25. package/speechflow-cli/dst/speechflow-node-a2a-pitch.js.map +1 -1
  26. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise-wt.js +1 -0
  27. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise-wt.js.map +1 -1
  28. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise.js.map +1 -1
  29. package/speechflow-cli/dst/speechflow-node-a2a-speex.js +4 -2
  30. package/speechflow-cli/dst/speechflow-node-a2a-speex.js.map +1 -1
  31. package/speechflow-cli/dst/speechflow-node-a2a-vad.js +19 -22
  32. package/speechflow-cli/dst/speechflow-node-a2a-vad.js.map +1 -1
  33. package/speechflow-cli/dst/speechflow-node-a2a-wav.js +7 -0
  34. package/speechflow-cli/dst/speechflow-node-a2a-wav.js.map +1 -1
  35. package/speechflow-cli/dst/speechflow-node-a2t-amazon.d.ts +0 -1
  36. package/speechflow-cli/dst/speechflow-node-a2t-amazon.js +2 -11
  37. package/speechflow-cli/dst/speechflow-node-a2t-amazon.js.map +1 -1
  38. package/speechflow-cli/dst/speechflow-node-a2t-google.d.ts +0 -1
  39. package/speechflow-cli/dst/speechflow-node-a2t-google.js +0 -6
  40. package/speechflow-cli/dst/speechflow-node-a2t-google.js.map +1 -1
  41. package/speechflow-cli/dst/speechflow-node-a2t-openai.js +6 -1
  42. package/speechflow-cli/dst/speechflow-node-a2t-openai.js.map +1 -1
  43. package/speechflow-cli/dst/speechflow-node-t2a-amazon.d.ts +1 -1
  44. package/speechflow-cli/dst/speechflow-node-t2a-amazon.js +27 -7
  45. package/speechflow-cli/dst/speechflow-node-t2a-amazon.js.map +1 -1
  46. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.d.ts +1 -1
  47. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js +5 -3
  48. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js.map +1 -1
  49. package/speechflow-cli/dst/speechflow-node-t2a-google.js +1 -4
  50. package/speechflow-cli/dst/speechflow-node-t2a-google.js.map +1 -1
  51. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.d.ts +1 -1
  52. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js +27 -6
  53. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js.map +1 -1
  54. package/speechflow-cli/dst/speechflow-node-t2a-openai.js +1 -4
  55. package/speechflow-cli/dst/speechflow-node-t2a-openai.js.map +1 -1
  56. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.d.ts +2 -3
  57. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js +97 -459
  58. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js.map +1 -1
  59. package/speechflow-cli/dst/speechflow-node-t2t-amazon.js +0 -2
  60. package/speechflow-cli/dst/speechflow-node-t2t-amazon.js.map +1 -1
  61. package/speechflow-cli/dst/speechflow-node-t2t-deepl.js.map +1 -1
  62. package/speechflow-cli/dst/speechflow-node-t2t-google.js.map +1 -1
  63. package/speechflow-cli/dst/speechflow-node-t2t-opus.js +18 -16
  64. package/speechflow-cli/dst/speechflow-node-t2t-opus.js.map +1 -1
  65. package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js +2 -3
  66. package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js.map +1 -1
  67. package/speechflow-cli/dst/speechflow-node-t2t-spellcheck.js +2 -3
  68. package/speechflow-cli/dst/speechflow-node-t2t-spellcheck.js.map +1 -1
  69. package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js +5 -2
  70. package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js.map +1 -1
  71. package/speechflow-cli/dst/speechflow-node-t2t-summary.js +2 -3
  72. package/speechflow-cli/dst/speechflow-node-t2t-summary.js.map +1 -1
  73. package/speechflow-cli/dst/speechflow-node-t2t-translate.js +1 -2
  74. package/speechflow-cli/dst/speechflow-node-t2t-translate.js.map +1 -1
  75. package/speechflow-cli/dst/speechflow-node-x2x-filter.js +2 -0
  76. package/speechflow-cli/dst/speechflow-node-x2x-filter.js.map +1 -1
  77. package/speechflow-cli/dst/speechflow-node-xio-exec.js +1 -0
  78. package/speechflow-cli/dst/speechflow-node-xio-exec.js.map +1 -1
  79. package/speechflow-cli/dst/speechflow-node-xio-file.js +3 -5
  80. package/speechflow-cli/dst/speechflow-node-xio-file.js.map +1 -1
  81. package/speechflow-cli/dst/speechflow-node-xio-mqtt.js.map +1 -1
  82. package/speechflow-cli/dst/speechflow-node-xio-vban.js.map +1 -1
  83. package/speechflow-cli/dst/speechflow-node-xio-webrtc.js +2 -0
  84. package/speechflow-cli/dst/speechflow-node-xio-webrtc.js.map +1 -1
  85. package/speechflow-cli/dst/speechflow-node-xio-websocket.js +9 -9
  86. package/speechflow-cli/dst/speechflow-node-xio-websocket.js.map +1 -1
  87. package/speechflow-cli/dst/speechflow-util-audio.js +4 -0
  88. package/speechflow-cli/dst/speechflow-util-audio.js.map +1 -1
  89. package/speechflow-cli/dst/speechflow-util-llm.d.ts +0 -1
  90. package/speechflow-cli/dst/speechflow-util-llm.js +4 -8
  91. package/speechflow-cli/dst/speechflow-util-llm.js.map +1 -1
  92. package/speechflow-cli/dst/speechflow-util-queue.js +2 -1
  93. package/speechflow-cli/dst/speechflow-util-queue.js.map +1 -1
  94. package/speechflow-cli/dst/speechflow-util.js +1 -0
  95. package/speechflow-cli/dst/speechflow-util.js.map +1 -1
  96. package/speechflow-cli/dst/test.d.ts +1 -0
  97. package/speechflow-cli/dst/test.js +18 -0
  98. package/speechflow-cli/dst/test.js.map +1 -0
  99. package/speechflow-cli/etc/oxlint.jsonc +3 -1
  100. package/speechflow-cli/package.json +16 -16
  101. package/speechflow-cli/src/speechflow-main-api.ts +16 -16
  102. package/speechflow-cli/src/speechflow-main-cli.ts +1 -0
  103. package/speechflow-cli/src/speechflow-main-graph.ts +7 -9
  104. package/speechflow-cli/src/speechflow-main-nodes.ts +1 -0
  105. package/speechflow-cli/src/speechflow-node-a2a-compressor-wt.ts +1 -0
  106. package/speechflow-cli/src/speechflow-node-a2a-compressor.ts +8 -10
  107. package/speechflow-cli/src/speechflow-node-a2a-expander-wt.ts +1 -0
  108. package/speechflow-cli/src/speechflow-node-a2a-expander.ts +9 -10
  109. package/speechflow-cli/src/speechflow-node-a2a-filler.ts +2 -0
  110. package/speechflow-cli/src/speechflow-node-a2a-gender.ts +3 -3
  111. package/speechflow-cli/src/speechflow-node-a2a-meter.ts +2 -2
  112. package/speechflow-cli/src/speechflow-node-a2a-pitch.ts +11 -9
  113. package/speechflow-cli/src/speechflow-node-a2a-rnnoise-wt.ts +1 -0
  114. package/speechflow-cli/src/speechflow-node-a2a-rnnoise.ts +1 -1
  115. package/speechflow-cli/src/speechflow-node-a2a-speex.ts +5 -3
  116. package/speechflow-cli/src/speechflow-node-a2a-vad.ts +20 -23
  117. package/speechflow-cli/src/speechflow-node-a2a-wav.ts +7 -0
  118. package/speechflow-cli/src/speechflow-node-a2t-amazon.ts +6 -18
  119. package/speechflow-cli/src/speechflow-node-a2t-google.ts +4 -11
  120. package/speechflow-cli/src/speechflow-node-a2t-openai.ts +12 -7
  121. package/speechflow-cli/src/speechflow-node-t2a-amazon.ts +32 -10
  122. package/speechflow-cli/src/speechflow-node-t2a-elevenlabs.ts +6 -4
  123. package/speechflow-cli/src/speechflow-node-t2a-google.ts +1 -4
  124. package/speechflow-cli/src/speechflow-node-t2a-kokoro.ts +33 -10
  125. package/speechflow-cli/src/speechflow-node-t2a-openai.ts +1 -4
  126. package/speechflow-cli/src/speechflow-node-t2a-supertonic.ts +106 -571
  127. package/speechflow-cli/src/speechflow-node-t2t-amazon.ts +1 -3
  128. package/speechflow-cli/src/speechflow-node-t2t-deepl.ts +2 -2
  129. package/speechflow-cli/src/speechflow-node-t2t-google.ts +1 -1
  130. package/speechflow-cli/src/speechflow-node-t2t-opus.ts +19 -18
  131. package/speechflow-cli/src/speechflow-node-t2t-punctuation.ts +2 -3
  132. package/speechflow-cli/src/speechflow-node-t2t-spellcheck.ts +2 -3
  133. package/speechflow-cli/src/speechflow-node-t2t-subtitle.ts +5 -2
  134. package/speechflow-cli/src/speechflow-node-t2t-summary.ts +2 -3
  135. package/speechflow-cli/src/speechflow-node-t2t-translate.ts +1 -2
  136. package/speechflow-cli/src/speechflow-node-x2x-filter.ts +2 -0
  137. package/speechflow-cli/src/speechflow-node-xio-exec.ts +1 -0
  138. package/speechflow-cli/src/speechflow-node-xio-file.ts +3 -5
  139. package/speechflow-cli/src/speechflow-node-xio-mqtt.ts +2 -2
  140. package/speechflow-cli/src/speechflow-node-xio-vban.ts +5 -5
  141. package/speechflow-cli/src/speechflow-node-xio-webrtc.ts +2 -0
  142. package/speechflow-cli/src/speechflow-node-xio-websocket.ts +9 -9
  143. package/speechflow-cli/src/speechflow-util-audio.ts +5 -0
  144. package/speechflow-cli/src/speechflow-util-llm.ts +4 -9
  145. package/speechflow-cli/src/speechflow-util-queue.ts +4 -4
  146. package/speechflow-cli/src/speechflow-util.ts +1 -0
  147. package/speechflow-ui-db/dst/index.js +14 -14
  148. package/speechflow-ui-db/package.json +6 -6
  149. package/speechflow-ui-st/dst/index.js +32 -32
  150. package/speechflow-ui-st/package.json +6 -6
@@ -5,500 +5,27 @@
5
5
  */
6
6
 
7
7
  /* standard dependencies */
8
- import fs from "node:fs"
9
- import path from "node:path"
10
8
  import Stream from "node:stream"
11
9
 
12
10
  /* external dependencies */
13
- import { mkdirp } from "mkdirp"
14
- import * as HF from "@huggingface/hub"
15
- import SpeexResampler from "speex-resampler"
16
- import { Duration } from "luxon"
17
-
18
- /* @ts-expect-error no type available */
19
- import * as ORT from "onnxruntime-node"
11
+ import * as Transformers from "@huggingface/transformers"
12
+ import SpeexResampler from "speex-resampler"
13
+ import { Duration } from "luxon"
20
14
 
21
15
  /* internal dependencies */
22
16
  import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
23
17
  import * as util from "./speechflow-util"
24
18
 
25
- /* ==== SUPERTONIC TTS IMPLEMENTATION ==== */
26
-
27
- /* type for voice style tensors */
28
- interface SupertonicStyle {
29
- ttl: ORT.Tensor
30
- dp: ORT.Tensor
31
- }
32
-
33
- /* type for TTS configuration */
34
- interface SupertonicConfig {
35
- ae: {
36
- sample_rate: number
37
- base_chunk_size: number
38
- chunk_compress_factor: number
39
- }
40
- ttl: {
41
- latent_dim: number
42
- chunk_compress_factor: number
43
- }
44
- }
45
-
46
- /* convert lengths to binary mask */
47
- function lengthToMask (lengths: number[], maxLen: number | null = null): number[][][] {
48
- /* handle empty input */
49
- if (lengths.length === 0)
50
- return []
51
-
52
- /* determine maximum length */
53
- maxLen = maxLen ?? Math.max(...lengths)
54
-
55
- /* build mask array */
56
- const mask: number[][][] = []
57
- for (let i = 0; i < lengths.length; i++) {
58
- const row: number[] = []
59
- for (let j = 0; j < maxLen; j++)
60
- row.push(j < lengths[i] ? 1.0 : 0.0)
61
- mask.push([ row ])
62
- }
63
- return mask
64
- }
65
-
66
- /* get latent mask from wav lengths */
67
- function getLatentMask (wavLengths: number[], baseChunkSize: number, chunkCompressFactor: number): number[][][] {
68
- /* calculate latent size and lengths */
69
- const latentSize = baseChunkSize * chunkCompressFactor
70
- const latentLengths = wavLengths.map((len) =>
71
- Math.floor((len + latentSize - 1) / latentSize))
72
-
73
- /* generate mask from latent lengths */
74
- return lengthToMask(latentLengths)
75
- }
76
-
77
- /* convert array to ONNX tensor */
78
- function arrayToTensor (array: number[] | number[][] | number[][][], dims: number[]): ORT.Tensor {
79
- /* flatten array and create float32 tensor */
80
- const flat = array.flat(Infinity) as number[]
81
- return new ORT.Tensor("float32", Float32Array.from(flat), dims)
82
- }
83
-
84
- /* convert int array to ONNX tensor */
85
- function intArrayToTensor (array: number[][], dims: number[]): ORT.Tensor {
86
- /* flatten array and create int64 tensor */
87
- const flat = array.flat(Infinity) as number[]
88
- return new ORT.Tensor("int64", BigInt64Array.from(flat.map(BigInt)), dims)
89
- }
90
-
91
- /* chunk text into manageable segments */
92
- function chunkText (text: string, maxLen = 300): string[] {
93
- /* validate input type */
94
- if (typeof text !== "string")
95
- throw new Error(`chunkText expects a string, got ${typeof text}`)
96
-
97
- /* split by paragraph (two or more newlines) */
98
- const paragraphs = text.trim().split(/\n\s*\n+/).filter((p) => p.trim())
99
-
100
- /* process each paragraph into chunks */
101
- const chunks: string[] = []
102
- for (let paragraph of paragraphs) {
103
- paragraph = paragraph.trim()
104
- if (!paragraph)
105
- continue
106
-
107
- /* split by sentence boundaries (period, question mark, exclamation mark followed by space)
108
- but exclude common abbreviations like Mr., Mrs., Dr., etc. and single capital letters like F. */
109
- const sentences = paragraph.split(/(?<!Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.|Sr\.|Jr\.|Ph\.D\.|etc\.|e\.g\.|i\.e\.|vs\.|Inc\.|Ltd\.|Co\.|Corp\.|St\.|Ave\.|Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+/)
110
-
111
- /* accumulate sentences into chunks respecting max length */
112
- let currentChunk = ""
113
- for (const sentence of sentences) {
114
- if (currentChunk.length + sentence.length + 1 <= maxLen)
115
- currentChunk += (currentChunk ? " " : "") + sentence
116
- else {
117
- if (currentChunk)
118
- chunks.push(currentChunk.trim())
119
- currentChunk = sentence
120
- }
121
- }
122
-
123
- /* push remaining chunk */
124
- if (currentChunk)
125
- chunks.push(currentChunk.trim())
126
- }
127
- return chunks
128
- }
129
-
130
- /* unicode text processor class */
131
- class SupertonicTextProcessor {
132
- private indexer: Record<number, number>
133
-
134
- constructor (unicodeIndexerJsonPath: string) {
135
- /* load and parse unicode indexer JSON */
136
- try {
137
- this.indexer = JSON.parse(fs.readFileSync(unicodeIndexerJsonPath, "utf8"))
138
- }
139
- catch (err) {
140
- throw new Error(`failed to parse unicode indexer JSON "${unicodeIndexerJsonPath}"`, { cause: err })
141
- }
142
- }
143
-
144
- private preprocessText (text: string): string {
145
- /* normalize text */
146
- text = text.normalize("NFKD")
147
-
148
- /* remove emojis (wide Unicode range) */
149
- const emojiPattern = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu
150
- text = text.replace(emojiPattern, "")
151
-
152
- /* replace various dashes and symbols */
153
- const replacements: Record<string, string> = {
154
- "–": "-",
155
- "‑": "-",
156
- "—": "-",
157
- "¯": " ",
158
- "_": " ",
159
- "\u201C": "\"",
160
- "\u201D": "\"",
161
- "\u2018": "'",
162
- "\u2019": "'",
163
- "´": "'",
164
- "`": "'",
165
- "[": " ",
166
- "]": " ",
167
- "|": " ",
168
- "/": " ",
169
- "#": " ",
170
- "→": " ",
171
- "←": " "
172
- }
173
- for (const [ k, v ] of Object.entries(replacements))
174
- text = text.replaceAll(k, v)
175
-
176
- /* remove combining diacritics */
177
- text = text.replace(/[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]/g, "")
178
-
179
- /* remove special symbols */
180
- text = text.replace(/[♥☆♡©\\]/g, "")
181
-
182
- /* replace known expressions */
183
- const exprReplacements: Record<string, string> = {
184
- "@": " at ",
185
- "e.g.,": "for example, ",
186
- "i.e.,": "that is, "
187
- }
188
- for (const [ k, v ] of Object.entries(exprReplacements))
189
- text = text.replaceAll(k, v)
190
-
191
- /* fix spacing around punctuation */
192
- text = text.replace(/ ,/g, ",")
193
- text = text.replace(/ \./g, ".")
194
- text = text.replace(/ !/g, "!")
195
- text = text.replace(/ \?/g, "?")
196
- text = text.replace(/ ;/g, ";")
197
- text = text.replace(/ :/g, ":")
198
- text = text.replace(/ '/g, "'")
199
-
200
- /* remove duplicate quotes */
201
- text = text.replace(/""+/g, "\"")
202
- text = text.replace(/''+/g, "'")
203
- text = text.replace(/``+/g, "`")
204
-
205
- /* remove extra spaces */
206
- text = text.replace(/\s+/g, " ").trim()
207
-
208
- /* if text doesn't end with punctuation, add a period */
209
- if (!/[.!?;:,'"')\]}…。」』】〉》›»]$/.test(text))
210
- text += "."
211
- return text
212
- }
213
-
214
- private textToUnicodeValues (text: string): number[] {
215
- /* convert text characters to unicode code points */
216
- return Array.from(text).map((char) => char.charCodeAt(0))
217
- }
218
-
219
- call (textList: string[]): { textIds: number[][], textMask: number[][][] } {
220
- /* handle empty input */
221
- if (textList.length === 0)
222
- return { textIds: [], textMask: [] }
223
-
224
- /* preprocess all texts */
225
- const processedTexts = textList.map((t) => this.preprocessText(t))
226
- const textIdsLengths = processedTexts.map((t) => t.length)
227
- const maxLen = Math.max(...textIdsLengths)
228
-
229
- /* convert texts to indexed token arrays */
230
- const textIds: number[][] = []
231
- for (let i = 0; i < processedTexts.length; i++) {
232
- const row = Array.from<number>({ length: maxLen }).fill(0)
233
- const unicodeVals = this.textToUnicodeValues(processedTexts[i])
234
- for (let j = 0; j < unicodeVals.length; j++)
235
- row[j] = this.indexer[unicodeVals[j]] ?? 0
236
- textIds.push(row)
237
- }
238
-
239
- /* generate text mask from lengths */
240
- const textMask = lengthToMask(textIdsLengths)
241
- return { textIds, textMask }
242
- }
243
- }
244
-
245
- /* Supertonic TTS engine class */
246
- class SupertonicTTS {
247
- public sampleRate: number
248
-
249
- private cfgs: SupertonicConfig
250
- private textProcessor: SupertonicTextProcessor
251
- private dpOrt: ORT.InferenceSession
252
- private textEncOrt: ORT.InferenceSession
253
- private vectorEstOrt: ORT.InferenceSession
254
- private vocoderOrt: ORT.InferenceSession
255
- private baseChunkSize: number
256
- private chunkCompressFactor: number
257
- private latentDim: number
258
-
259
- constructor (
260
- cfgs: SupertonicConfig,
261
- textProcessor: SupertonicTextProcessor,
262
- dpOrt: ORT.InferenceSession,
263
- textEncOrt: ORT.InferenceSession,
264
- vectorEstOrt: ORT.InferenceSession,
265
- vocoderOrt: ORT.InferenceSession
266
- ) {
267
- /* store configuration and dependencies */
268
- this.cfgs = cfgs
269
- this.textProcessor = textProcessor
270
- this.dpOrt = dpOrt
271
- this.textEncOrt = textEncOrt
272
- this.vectorEstOrt = vectorEstOrt
273
- this.vocoderOrt = vocoderOrt
274
-
275
- /* extract configuration values */
276
- this.sampleRate = cfgs.ae.sample_rate
277
- this.baseChunkSize = cfgs.ae.base_chunk_size
278
- this.chunkCompressFactor = cfgs.ttl.chunk_compress_factor
279
- this.latentDim = cfgs.ttl.latent_dim
280
- }
281
-
282
- private sampleNoisyLatent (duration: number[]): { noisyLatent: number[][][], latentMask: number[][][] } {
283
- /* calculate dimensions for latent space */
284
- const wavLenMax = Math.max(...duration) * this.sampleRate
285
- const wavLengths = duration.map((d) => Math.floor(d * this.sampleRate))
286
- const chunkSize = this.baseChunkSize * this.chunkCompressFactor
287
- const latentLen = Math.floor((wavLenMax + chunkSize - 1) / chunkSize)
288
- const latentDimExpanded = this.latentDim * this.chunkCompressFactor
289
-
290
- /* generate random noise (pre-allocate arrays for performance) */
291
- const noisyLatent: number[][][] = Array.from({ length: duration.length })
292
- for (let b = 0; b < duration.length; b++) {
293
- const batch: number[][] = Array.from({ length: latentDimExpanded })
294
- for (let d = 0; d < latentDimExpanded; d++) {
295
- const row: number[] = Array.from({ length: latentLen })
296
- for (let t = 0; t < latentLen; t++) {
297
-
298
- /* Box-Muller transform for normal distribution */
299
- const eps = 1e-10
300
- const u1 = Math.max(eps, Math.random())
301
- const u2 = Math.random()
302
- row[t] = Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math.PI * u2)
303
- }
304
- batch[d] = row
305
- }
306
- noisyLatent[b] = batch
307
- }
308
-
309
- /* apply mask */
310
- const latentMask = getLatentMask(wavLengths, this.baseChunkSize, this.chunkCompressFactor)
311
- for (let b = 0; b < noisyLatent.length; b++) {
312
- for (let d = 0; d < noisyLatent[b].length; d++) {
313
- for (let t = 0; t < noisyLatent[b][d].length; t++)
314
- noisyLatent[b][d][t] *= latentMask[b][0][t]
315
- }
316
- }
317
- return { noisyLatent, latentMask }
318
- }
319
-
320
- private async infer (textList: string[], style: SupertonicStyle, totalStep: number, speed: number): Promise<{ wav: number[], duration: number[] }> {
321
- /* validate batch size matches style vectors */
322
- if (textList.length !== style.ttl.dims[0])
323
- throw new Error("Number of texts must match number of style vectors")
324
-
325
- /* process text into token IDs and masks */
326
- const batchSize = textList.length
327
- const { textIds, textMask } = this.textProcessor.call(textList)
328
- const textIdsShape = [ batchSize, textIds[0].length ]
329
- const textMaskShape = [ batchSize, 1, textMask[0][0].length ]
330
- const textMaskTensor = arrayToTensor(textMask, textMaskShape)
331
-
332
- /* run duration predictor model */
333
- const dpResult = await this.dpOrt.run({
334
- text_ids: intArrayToTensor(textIds, textIdsShape),
335
- style_dp: style.dp,
336
- text_mask: textMaskTensor
337
- })
338
- const predictedDurations = Array.from(dpResult.duration.data as Float32Array)
339
-
340
- /* apply speed factor to duration */
341
- for (let i = 0; i < predictedDurations.length; i++)
342
- predictedDurations[i] /= speed
343
-
344
- /* run text encoder model */
345
- const textEncResult = await this.textEncOrt.run({
346
- text_ids: intArrayToTensor(textIds, textIdsShape),
347
- style_ttl: style.ttl,
348
- text_mask: textMaskTensor
349
- })
350
- const textEmbTensor = textEncResult.text_emb
351
-
352
- /* sample initial noisy latent vectors */
353
- const { noisyLatent, latentMask } = this.sampleNoisyLatent(predictedDurations)
354
- const latentShape = [ batchSize, noisyLatent[0].length, noisyLatent[0][0].length ]
355
- const latentMaskShape = [ batchSize, 1, latentMask[0][0].length ]
356
- const latentMaskTensor = arrayToTensor(latentMask, latentMaskShape)
357
-
358
- /* prepare step tensors */
359
- const totalStepArray = Array.from<number>({ length: batchSize }).fill(totalStep)
360
- const scalarShape = [ batchSize ]
361
- const totalStepTensor = arrayToTensor(totalStepArray, scalarShape)
362
-
363
- /* iteratively denoise latent vectors */
364
- for (let step = 0; step < totalStep; step++) {
365
- const currentStepArray = Array.from<number>({ length: batchSize }).fill(step)
366
-
367
- /* run vector estimator model */
368
- const vectorEstResult = await this.vectorEstOrt.run({
369
- noisy_latent: arrayToTensor(noisyLatent, latentShape),
370
- text_emb: textEmbTensor,
371
- style_ttl: style.ttl,
372
- text_mask: textMaskTensor,
373
- latent_mask: latentMaskTensor,
374
- total_step: totalStepTensor,
375
- current_step: arrayToTensor(currentStepArray, scalarShape)
376
- })
377
- const denoisedLatent = Array.from(vectorEstResult.denoised_latent.data as Float32Array)
378
-
379
- /* update latent with the denoised output */
380
- let idx = 0
381
- for (let b = 0; b < noisyLatent.length; b++)
382
- for (let d = 0; d < noisyLatent[b].length; d++)
383
- for (let t = 0; t < noisyLatent[b][d].length; t++)
384
- noisyLatent[b][d][t] = denoisedLatent[idx++]
385
- }
386
-
387
- /* run vocoder to generate audio waveform */
388
- const vocoderResult = await this.vocoderOrt.run({
389
- latent: arrayToTensor(noisyLatent, latentShape)
390
- })
391
- const wav = Array.from(vocoderResult.wav_tts.data as Float32Array)
392
- return { wav, duration: predictedDurations }
393
- }
394
-
395
- async synthesize (text: string, style: SupertonicStyle, totalStep: number, speed: number, silenceDuration = 0.3): Promise<{ wav: number[], duration: number }> {
396
- /* validate single speaker mode */
397
- if (style.ttl.dims[0] !== 1)
398
- throw new Error("Single speaker text to speech only supports single style")
399
-
400
- /* chunk text into segments */
401
- const textList = chunkText(text)
402
- if (textList.length === 0)
403
- return { wav: [], duration: 0 }
404
-
405
- /* synthesize each chunk and concatenate with silence */
406
- const wavParts: number[][] = []
407
- let totalDuration = 0
408
- for (const chunk of textList) {
409
- const { wav, duration } = await this.infer([ chunk ], style, totalStep, speed)
410
-
411
- /* insert silence between chunks */
412
- if (wavParts.length > 0) {
413
- const silenceLen = Math.floor(silenceDuration * this.sampleRate)
414
- wavParts.push(Array.from<number>({ length: silenceLen }).fill(0))
415
- totalDuration += silenceDuration
416
- }
417
- wavParts.push(wav)
418
- totalDuration += duration[0]
419
- }
420
- return { wav: wavParts.flat(), duration: totalDuration }
421
- }
422
-
423
- async release (): Promise<void> {
424
- /* release all ONNX inference sessions */
425
- await Promise.all([
426
- this.dpOrt.release(),
427
- this.textEncOrt.release(),
428
- this.vectorEstOrt.release(),
429
- this.vocoderOrt.release()
430
- ])
431
- }
432
- }
433
-
434
- /* type for voice style JSON file */
435
- interface VoiceStyleJSON {
436
- style_ttl: { dims: number[], data: number[][][] }
437
- style_dp: { dims: number[], data: number[][][] }
438
- }
439
-
440
- /* load voice style from JSON file */
441
- async function loadVoiceStyle (voiceStylePath: string): Promise<SupertonicStyle> {
442
- /* read and parse voice style JSON */
443
- let voiceStyle: VoiceStyleJSON
444
- try {
445
- voiceStyle = JSON.parse(await fs.promises.readFile(voiceStylePath, "utf8")) as VoiceStyleJSON
446
- }
447
- catch (err) {
448
- throw new Error(`failed to parse voice style JSON "${voiceStylePath}"`, { cause: err })
449
- }
450
-
451
- /* extract dimensions and data */
452
- const ttlDims = voiceStyle.style_ttl.dims
453
- const dpDims = voiceStyle.style_dp.dims
454
- const ttlData = voiceStyle.style_ttl.data.flat(Infinity) as number[]
455
- const dpData = voiceStyle.style_dp.data.flat(Infinity) as number[]
456
-
457
- /* create ONNX tensors for style vectors */
458
- const ttlStyle = new ORT.Tensor("float32", Float32Array.from(ttlData), ttlDims)
459
- const dpStyle = new ORT.Tensor("float32", Float32Array.from(dpData), dpDims)
460
- return { ttl: ttlStyle, dp: dpStyle }
461
- }
462
-
463
- /* load TTS engine from ONNX models */
464
- async function loadSupertonic (assetsDir: string): Promise<SupertonicTTS> {
465
- /* load configuration */
466
- const cfgPath = path.join(assetsDir, "onnx", "tts.json")
467
- let cfgs: SupertonicConfig
468
- try {
469
- cfgs = JSON.parse(await fs.promises.readFile(cfgPath, "utf8"))
470
- }
471
- catch (err) {
472
- throw new Error(`failed to parse TTS config JSON "${cfgPath}"`, { cause: err })
473
- }
474
-
475
- /* load text processor */
476
- const unicodeIndexerPath = path.join(assetsDir, "onnx", "unicode_indexer.json")
477
- const textProcessor = new SupertonicTextProcessor(unicodeIndexerPath)
478
-
479
- /* load ONNX models */
480
- const opts: ORT.InferenceSession.SessionOptions = {}
481
- const [ dpOrt, textEncOrt, vectorEstOrt, vocoderOrt ] = await Promise.all([
482
- ORT.InferenceSession.create(path.join(assetsDir, "onnx", "duration_predictor.onnx"), opts),
483
- ORT.InferenceSession.create(path.join(assetsDir, "onnx", "text_encoder.onnx"), opts),
484
- ORT.InferenceSession.create(path.join(assetsDir, "onnx", "vector_estimator.onnx"), opts),
485
- ORT.InferenceSession.create(path.join(assetsDir, "onnx", "vocoder.onnx"), opts)
486
- ])
487
- return new SupertonicTTS(cfgs, textProcessor, dpOrt, textEncOrt, vectorEstOrt, vocoderOrt)
488
- }
489
-
490
- /* ==== SPEECHFLOW NODE IMPLEMENTATION ==== */
491
-
492
19
  /* SpeechFlow node for Supertonic text-to-speech conversion */
493
20
  export default class SpeechFlowNodeT2ASupertonic extends SpeechFlowNode {
494
21
  /* declare official node name */
495
22
  public static name = "t2a-supertonic"
496
23
 
497
24
  /* internal state */
498
- private supertonic: SupertonicTTS | null = null
499
- private style: SupertonicStyle | null = null
500
- private resampler: SpeexResampler | null = null
501
- private closing = false
25
+ private tts: Transformers.TextToAudioPipeline | null = null
26
+ private resampler: SpeexResampler | null = null
27
+ private sampleRate = 44100
28
+ private closing = false
502
29
 
503
30
  /* construct node */
504
31
  constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
@@ -521,90 +48,110 @@ export default class SpeechFlowNodeT2ASupertonic extends SpeechFlowNode {
521
48
  return {}
522
49
  }
523
50
 
524
- /* download HuggingFace assets */
525
- private async downloadAssets () {
526
- /* define HuggingFace repository and required files */
527
- const assetRepo = "Supertone/supertonic"
528
- const assetFiles = [
529
- "voice_styles/F1.json",
530
- "voice_styles/F2.json",
531
- "voice_styles/M1.json",
532
- "voice_styles/M2.json",
533
- "onnx/tts.json",
534
- "onnx/duration_predictor.onnx",
535
- "onnx/text_encoder.onnx",
536
- "onnx/unicode_indexer.json",
537
- "onnx/vector_estimator.onnx",
538
- "onnx/vocoder.onnx",
539
- ]
540
-
541
- /* create asset directories */
542
- const assetDir = path.join(this.config.cacheDir, "supertonic")
543
- await mkdirp(path.join(assetDir, "voice_styles"), { mode: 0o750 })
544
- await mkdirp(path.join(assetDir, "onnx"), { mode: 0o750 })
545
-
546
- /* download missing asset files */
547
- for (const assetFile of assetFiles) {
548
- const url = `${assetRepo}/${assetFile}`
549
- const file = path.join(assetDir, assetFile)
550
- const stat = await fs.promises.stat(file).catch((_err) => null)
551
- if (stat === null || !stat.isFile()) {
552
- this.log("info", `downloading from HuggingFace "${url}"`)
553
- const response = await HF.downloadFile({ repo: assetRepo, path: assetFile })
554
- if (!response)
555
- throw new Error(`failed to download from HuggingFace "${url}"`)
556
- const buffer = Buffer.from(await response.arrayBuffer())
557
- await fs.promises.writeFile(file, buffer)
558
- }
559
- }
560
- return assetDir
561
- }
562
-
563
51
  /* open node */
564
52
  async open () {
565
53
  this.closing = false
566
54
 
567
- /* download assets */
568
- const assetsDir = await this.downloadAssets()
55
+ /* load Supertonic TTS pipeline via transformers.js */
56
+ const model = "onnx-community/Supertonic-TTS-ONNX"
57
+ this.log("info", `loading Supertonic TTS model "${model}"`)
58
+
59
+ /* track download progress */
60
+ const progressState = new Map<string, number>()
61
+ const progressCallback = (progress: any) => {
62
+ let artifact = model
63
+ if (typeof progress.file === "string")
64
+ artifact += `:${progress.file}`
65
+ let percent = 0
66
+ if (typeof progress.loaded === "number" && typeof progress.total === "number")
67
+ percent = (progress.loaded / progress.total) * 100
68
+ else if (typeof progress.progress === "number")
69
+ percent = progress.progress
70
+ if (percent > 0)
71
+ progressState.set(artifact, percent)
72
+ }
73
+ let interval: ReturnType<typeof setInterval> | null = setInterval(() => {
74
+ for (const [ artifact, percent ] of progressState) {
75
+ this.log("info", `downloaded ${percent.toFixed(2)}% of artifact "${artifact}"`)
76
+ if (percent >= 100.0)
77
+ progressState.delete(artifact)
78
+ }
79
+ if (progressState.size === 0 && interval !== null) {
80
+ clearInterval(interval)
81
+ interval = null
82
+ }
83
+ }, 1000)
569
84
 
570
- /* download ONNX models */
571
- this.log("info", `loading ONNX models (asset dir: "${assetsDir}")`)
572
- this.supertonic = await loadSupertonic(assetsDir)
573
- this.log("info", `loaded ONNX models (sample rate: ${this.supertonic.sampleRate}Hz)`)
85
+ /* create TTS pipeline */
86
+ try {
87
+ const tts = Transformers.pipeline("text-to-speech", model, {
88
+ dtype: "fp32",
89
+ progress_callback: progressCallback
90
+ })
91
+ this.tts = await tts
92
+ }
93
+ finally {
94
+ if (interval !== null) {
95
+ clearInterval(interval)
96
+ interval = null
97
+ }
98
+ }
99
+ if (this.tts === null)
100
+ throw new Error("failed to instantiate Supertonic TTS pipeline")
574
101
 
575
- /* load voice style */
576
- const voiceStylePath = path.join(assetsDir, "voice_styles", `${this.params.voice}.json`)
577
- if (!fs.existsSync(voiceStylePath))
578
- throw new Error(`voice style not found: ${voiceStylePath}`)
579
- this.log("info", `loading voice style "${this.params.voice}"`)
580
- this.style = await loadVoiceStyle(voiceStylePath)
581
- this.log("info", `loaded voice style "${this.params.voice}"`)
102
+ /* determine sample rate from model config */
103
+ const config = (this.tts as any).model?.config
104
+ if (config?.sampling_rate)
105
+ this.sampleRate = config.sampling_rate
106
+ this.log("info", `loaded Supertonic TTS model (sample rate: ${this.sampleRate}Hz)`)
582
107
 
583
108
  /* establish resampler from Supertonic's output sample rate to our standard audio sample rate (48kHz) */
584
- this.resampler = new SpeexResampler(1, this.supertonic.sampleRate, this.config.audioSampleRate, 7)
109
+ this.resampler = new SpeexResampler(1, this.sampleRate, this.config.audioSampleRate, 7)
110
+
111
+ /* map voice names to speaker embedding URLs */
112
+ const voiceUrls: Record<string, string> = {
113
+ "M1": "https://huggingface.co/onnx-community/Supertonic-TTS-ONNX/resolve/main/voices/M1.bin",
114
+ "M2": "https://huggingface.co/onnx-community/Supertonic-TTS-ONNX/resolve/main/voices/M2.bin",
115
+ "F1": "https://huggingface.co/onnx-community/Supertonic-TTS-ONNX/resolve/main/voices/F1.bin",
116
+ "F2": "https://huggingface.co/onnx-community/Supertonic-TTS-ONNX/resolve/main/voices/F2.bin"
117
+ }
118
+ const speakerEmbeddings = voiceUrls[this.params.voice]
119
+ if (speakerEmbeddings === undefined)
120
+ throw new Error(`invalid Supertonic voice "${this.params.voice}"`)
121
+ this.log("info", `using voice "${this.params.voice}"`)
585
122
 
586
123
  /* perform text-to-speech operation with Supertonic */
587
124
  const text2speech = async (text: string) => {
588
- /* synthesize speech from text */
589
125
  this.log("info", `Supertonic: input: "${text}"`)
590
- const { wav, duration } = await this.supertonic!.synthesize(
591
- text,
592
- this.style!,
593
- this.params.steps,
594
- this.params.speed
595
- )
126
+
127
+ /* generate speech using transformers.js pipeline */
128
+ const result = await this.tts!(text, {
129
+ speaker_embeddings: speakerEmbeddings,
130
+ num_inference_steps: this.params.steps,
131
+ speed: this.params.speed
132
+ })
133
+
134
+ /* extract audio samples and sample rate */
135
+ if (!(result.audio instanceof Float32Array))
136
+ throw new Error("unexpected Supertonic result: audio is not a Float32Array")
137
+ if (typeof result.sampling_rate !== "number")
138
+ throw new Error("unexpected Supertonic result: sampling_rate is not a number")
139
+ const samples = result.audio
140
+ const outputSampleRate = result.sampling_rate
141
+ if (outputSampleRate !== this.sampleRate)
142
+ this.log("warn", `unexpected sample rate ${outputSampleRate}Hz (expected ${this.sampleRate}Hz)`)
143
+
144
+ /* calculate duration */
145
+ const duration = samples.length / outputSampleRate
596
146
  this.log("info", `Supertonic: synthesized ${duration.toFixed(2)}s of audio`)
597
147
 
598
148
  /* convert audio samples from PCM/F32 to PCM/I16 */
599
- const buffer1 = Buffer.alloc(wav.length * 2)
600
- for (let i = 0; i < wav.length; i++) {
601
- const sample = Math.max(-1, Math.min(1, wav[i]))
602
- buffer1.writeInt16LE(sample * 0x7FFF, i * 2)
603
- }
149
+ const buffer1 = util.convertF32ToBuf(samples)
604
150
 
605
- /* resample audio samples from 44.1kHz to 48kHz */
606
- const buffer2 = this.resampler!.processChunk(buffer1)
607
- return buffer2
151
+ /* resample audio samples from Supertonic sample rate to 48kHz */
152
+ if (this.resampler === null)
153
+ throw new Error("resampler destroyed during TTS processing")
154
+ return this.resampler.processChunk(buffer1)
608
155
  }
609
156
 
610
157
  /* create transform stream and connect it to the Supertonic TTS */
@@ -614,11 +161,13 @@ export default class SpeechFlowNodeT2ASupertonic extends SpeechFlowNode {
614
161
  readableObjectMode: true,
615
162
  decodeStrings: false,
616
163
  highWaterMark: 1,
617
- async transform (chunk: SpeechFlowChunk, encoding, callback) {
164
+ transform (chunk: SpeechFlowChunk, encoding, callback) {
618
165
  if (self.closing)
619
166
  callback(new Error("stream already destroyed"))
620
167
  else if (Buffer.isBuffer(chunk.payload))
621
168
  callback(new Error("invalid chunk payload type"))
169
+ else if (chunk.payload === "")
170
+ callback()
622
171
  else {
623
172
  let processTimeout: ReturnType<typeof setTimeout> | null = setTimeout(() => {
624
173
  processTimeout = null
@@ -630,13 +179,7 @@ export default class SpeechFlowNodeT2ASupertonic extends SpeechFlowNode {
630
179
  processTimeout = null
631
180
  }
632
181
  }
633
- try {
634
- if (self.closing) {
635
- clearProcessTimeout()
636
- callback(new Error("stream destroyed during processing"))
637
- return
638
- }
639
- const buffer = await text2speech(chunk.payload as string)
182
+ text2speech(chunk.payload as string).then((buffer) => {
640
183
  if (self.closing) {
641
184
  clearProcessTimeout()
642
185
  callback(new Error("stream destroyed during processing"))
@@ -653,18 +196,13 @@ export default class SpeechFlowNodeT2ASupertonic extends SpeechFlowNode {
653
196
  chunkNew.type = "audio"
654
197
  chunkNew.payload = buffer
655
198
  chunkNew.timestampEnd = Duration.fromMillis(chunkNew.timestampStart.toMillis() + durationMs)
656
-
657
- /* push chunk and complete transform */
658
199
  clearProcessTimeout()
659
200
  this.push(chunkNew)
660
201
  callback()
661
- }
662
- catch (error) {
663
-
664
- /* handle processing errors */
202
+ }).catch((error: unknown) => {
665
203
  clearProcessTimeout()
666
204
  callback(util.ensureError(error, "Supertonic processing failed"))
667
- }
205
+ })
668
206
  }
669
207
  },
670
208
  final (callback) {
@@ -684,18 +222,15 @@ export default class SpeechFlowNodeT2ASupertonic extends SpeechFlowNode {
684
222
  this.stream = null
685
223
  }
686
224
 
687
- /* destroy voice style */
688
- if (this.style !== null)
689
- this.style = null
690
-
691
225
  /* destroy resampler */
692
226
  if (this.resampler !== null)
693
227
  this.resampler = null
694
228
 
695
- /* destroy Supertonic TTS */
696
- if (this.supertonic !== null) {
697
- await this.supertonic.release()
698
- this.supertonic = null
229
+ /* destroy TTS pipeline */
230
+ if (this.tts !== null) {
231
+ /* dispose of the pipeline if possible */
232
+ await this.tts.dispose()
233
+ this.tts = null
699
234
  }
700
235
  }
701
236
  }