speechflow 1.7.1 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177):
  1. package/CHANGELOG.md +24 -0
  2. package/README.md +388 -120
  3. package/etc/claude.md +5 -5
  4. package/etc/speechflow.yaml +2 -2
  5. package/package.json +3 -3
  6. package/speechflow-cli/dst/speechflow-main-api.js.map +1 -1
  7. package/speechflow-cli/dst/speechflow-main-cli.js +1 -0
  8. package/speechflow-cli/dst/speechflow-main-cli.js.map +1 -1
  9. package/speechflow-cli/dst/speechflow-main-graph.d.ts +1 -0
  10. package/speechflow-cli/dst/speechflow-main-graph.js +30 -9
  11. package/speechflow-cli/dst/speechflow-main-graph.js.map +1 -1
  12. package/speechflow-cli/dst/speechflow-main-nodes.js +1 -0
  13. package/speechflow-cli/dst/speechflow-main-nodes.js.map +1 -1
  14. package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js +1 -0
  15. package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js.map +1 -1
  16. package/speechflow-cli/dst/speechflow-node-a2a-compressor.js +7 -9
  17. package/speechflow-cli/dst/speechflow-node-a2a-compressor.js.map +1 -1
  18. package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js +1 -0
  19. package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js.map +1 -1
  20. package/speechflow-cli/dst/speechflow-node-a2a-expander.js +8 -9
  21. package/speechflow-cli/dst/speechflow-node-a2a-expander.js.map +1 -1
  22. package/speechflow-cli/dst/speechflow-node-a2a-filler.js +2 -0
  23. package/speechflow-cli/dst/speechflow-node-a2a-filler.js.map +1 -1
  24. package/speechflow-cli/dst/speechflow-node-a2a-gender.js +1 -1
  25. package/speechflow-cli/dst/speechflow-node-a2a-gender.js.map +1 -1
  26. package/speechflow-cli/dst/speechflow-node-a2a-meter.js +1 -1
  27. package/speechflow-cli/dst/speechflow-node-a2a-pitch.js +11 -9
  28. package/speechflow-cli/dst/speechflow-node-a2a-pitch.js.map +1 -1
  29. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise-wt.js +1 -0
  30. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise-wt.js.map +1 -1
  31. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise.js.map +1 -1
  32. package/speechflow-cli/dst/speechflow-node-a2a-speex.js +4 -2
  33. package/speechflow-cli/dst/speechflow-node-a2a-speex.js.map +1 -1
  34. package/speechflow-cli/dst/speechflow-node-a2a-vad.js +19 -22
  35. package/speechflow-cli/dst/speechflow-node-a2a-vad.js.map +1 -1
  36. package/speechflow-cli/dst/speechflow-node-a2a-wav.js +31 -4
  37. package/speechflow-cli/dst/speechflow-node-a2a-wav.js.map +1 -1
  38. package/speechflow-cli/dst/speechflow-node-a2t-amazon.d.ts +0 -1
  39. package/speechflow-cli/dst/speechflow-node-a2t-amazon.js +2 -11
  40. package/speechflow-cli/dst/speechflow-node-a2t-amazon.js.map +1 -1
  41. package/speechflow-cli/dst/speechflow-node-a2t-google.d.ts +16 -0
  42. package/speechflow-cli/dst/speechflow-node-a2t-google.js +314 -0
  43. package/speechflow-cli/dst/speechflow-node-a2t-google.js.map +1 -0
  44. package/speechflow-cli/dst/speechflow-node-a2t-openai.js +6 -1
  45. package/speechflow-cli/dst/speechflow-node-a2t-openai.js.map +1 -1
  46. package/speechflow-cli/dst/speechflow-node-t2a-amazon.d.ts +1 -1
  47. package/speechflow-cli/dst/speechflow-node-t2a-amazon.js +27 -7
  48. package/speechflow-cli/dst/speechflow-node-t2a-amazon.js.map +1 -1
  49. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.d.ts +1 -1
  50. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js +5 -3
  51. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js.map +1 -1
  52. package/speechflow-cli/dst/speechflow-node-t2a-google.d.ts +15 -0
  53. package/speechflow-cli/dst/speechflow-node-t2a-google.js +215 -0
  54. package/speechflow-cli/dst/speechflow-node-t2a-google.js.map +1 -0
  55. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.d.ts +1 -1
  56. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js +27 -6
  57. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js.map +1 -1
  58. package/speechflow-cli/dst/speechflow-node-t2a-openai.d.ts +15 -0
  59. package/speechflow-cli/dst/speechflow-node-t2a-openai.js +192 -0
  60. package/speechflow-cli/dst/speechflow-node-t2a-openai.js.map +1 -0
  61. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.d.ts +17 -0
  62. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js +619 -0
  63. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js.map +1 -0
  64. package/speechflow-cli/dst/speechflow-node-t2t-amazon.js +0 -2
  65. package/speechflow-cli/dst/speechflow-node-t2t-amazon.js.map +1 -1
  66. package/speechflow-cli/dst/speechflow-node-t2t-deepl.js.map +1 -1
  67. package/speechflow-cli/dst/speechflow-node-t2t-google.js.map +1 -1
  68. package/speechflow-cli/dst/{speechflow-node-t2t-transformers.d.ts → speechflow-node-t2t-opus.d.ts} +1 -3
  69. package/speechflow-cli/dst/speechflow-node-t2t-opus.js +161 -0
  70. package/speechflow-cli/dst/speechflow-node-t2t-opus.js.map +1 -0
  71. package/speechflow-cli/dst/speechflow-node-t2t-profanity.d.ts +11 -0
  72. package/speechflow-cli/dst/speechflow-node-t2t-profanity.js +118 -0
  73. package/speechflow-cli/dst/speechflow-node-t2t-profanity.js.map +1 -0
  74. package/speechflow-cli/dst/speechflow-node-t2t-punctuation.d.ts +13 -0
  75. package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js +220 -0
  76. package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js.map +1 -0
  77. package/speechflow-cli/dst/{speechflow-node-t2t-openai.d.ts → speechflow-node-t2t-spellcheck.d.ts} +2 -2
  78. package/speechflow-cli/dst/{speechflow-node-t2t-openai.js → speechflow-node-t2t-spellcheck.js} +48 -100
  79. package/speechflow-cli/dst/speechflow-node-t2t-spellcheck.js.map +1 -0
  80. package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js +8 -8
  81. package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js.map +1 -1
  82. package/speechflow-cli/dst/speechflow-node-t2t-summary.d.ts +16 -0
  83. package/speechflow-cli/dst/speechflow-node-t2t-summary.js +241 -0
  84. package/speechflow-cli/dst/speechflow-node-t2t-summary.js.map +1 -0
  85. package/speechflow-cli/dst/{speechflow-node-t2t-ollama.d.ts → speechflow-node-t2t-translate.d.ts} +2 -2
  86. package/speechflow-cli/dst/{speechflow-node-t2t-transformers.js → speechflow-node-t2t-translate.js} +53 -115
  87. package/speechflow-cli/dst/speechflow-node-t2t-translate.js.map +1 -0
  88. package/speechflow-cli/dst/speechflow-node-x2x-filter.js +2 -0
  89. package/speechflow-cli/dst/speechflow-node-x2x-filter.js.map +1 -1
  90. package/speechflow-cli/dst/speechflow-node-xio-exec.d.ts +12 -0
  91. package/speechflow-cli/dst/speechflow-node-xio-exec.js +224 -0
  92. package/speechflow-cli/dst/speechflow-node-xio-exec.js.map +1 -0
  93. package/speechflow-cli/dst/speechflow-node-xio-file.d.ts +1 -0
  94. package/speechflow-cli/dst/speechflow-node-xio-file.js +78 -67
  95. package/speechflow-cli/dst/speechflow-node-xio-file.js.map +1 -1
  96. package/speechflow-cli/dst/speechflow-node-xio-mqtt.js.map +1 -1
  97. package/speechflow-cli/dst/speechflow-node-xio-vban.d.ts +17 -0
  98. package/speechflow-cli/dst/speechflow-node-xio-vban.js +330 -0
  99. package/speechflow-cli/dst/speechflow-node-xio-vban.js.map +1 -0
  100. package/speechflow-cli/dst/speechflow-node-xio-webrtc.d.ts +39 -0
  101. package/speechflow-cli/dst/speechflow-node-xio-webrtc.js +502 -0
  102. package/speechflow-cli/dst/speechflow-node-xio-webrtc.js.map +1 -0
  103. package/speechflow-cli/dst/speechflow-node-xio-websocket.js +9 -9
  104. package/speechflow-cli/dst/speechflow-node-xio-websocket.js.map +1 -1
  105. package/speechflow-cli/dst/speechflow-util-audio.js +8 -5
  106. package/speechflow-cli/dst/speechflow-util-audio.js.map +1 -1
  107. package/speechflow-cli/dst/speechflow-util-error.d.ts +1 -0
  108. package/speechflow-cli/dst/speechflow-util-error.js +5 -0
  109. package/speechflow-cli/dst/speechflow-util-error.js.map +1 -1
  110. package/speechflow-cli/dst/speechflow-util-llm.d.ts +35 -0
  111. package/speechflow-cli/dst/speechflow-util-llm.js +363 -0
  112. package/speechflow-cli/dst/speechflow-util-llm.js.map +1 -0
  113. package/speechflow-cli/dst/speechflow-util-queue.js +2 -1
  114. package/speechflow-cli/dst/speechflow-util-queue.js.map +1 -1
  115. package/speechflow-cli/dst/speechflow-util.d.ts +1 -0
  116. package/speechflow-cli/dst/speechflow-util.js +2 -0
  117. package/speechflow-cli/dst/speechflow-util.js.map +1 -1
  118. package/speechflow-cli/etc/oxlint.jsonc +2 -1
  119. package/speechflow-cli/package.json +35 -18
  120. package/speechflow-cli/src/lib.d.ts +5 -0
  121. package/speechflow-cli/src/speechflow-main-api.ts +16 -16
  122. package/speechflow-cli/src/speechflow-main-cli.ts +1 -0
  123. package/speechflow-cli/src/speechflow-main-graph.ts +38 -14
  124. package/speechflow-cli/src/speechflow-main-nodes.ts +1 -0
  125. package/speechflow-cli/src/speechflow-node-a2a-compressor-wt.ts +1 -0
  126. package/speechflow-cli/src/speechflow-node-a2a-compressor.ts +8 -10
  127. package/speechflow-cli/src/speechflow-node-a2a-expander-wt.ts +1 -0
  128. package/speechflow-cli/src/speechflow-node-a2a-expander.ts +9 -10
  129. package/speechflow-cli/src/speechflow-node-a2a-filler.ts +2 -0
  130. package/speechflow-cli/src/speechflow-node-a2a-gender.ts +3 -3
  131. package/speechflow-cli/src/speechflow-node-a2a-meter.ts +2 -2
  132. package/speechflow-cli/src/speechflow-node-a2a-pitch.ts +11 -9
  133. package/speechflow-cli/src/speechflow-node-a2a-rnnoise-wt.ts +1 -0
  134. package/speechflow-cli/src/speechflow-node-a2a-rnnoise.ts +1 -1
  135. package/speechflow-cli/src/speechflow-node-a2a-speex.ts +5 -3
  136. package/speechflow-cli/src/speechflow-node-a2a-vad.ts +20 -23
  137. package/speechflow-cli/src/speechflow-node-a2a-wav.ts +31 -4
  138. package/speechflow-cli/src/speechflow-node-a2t-amazon.ts +6 -18
  139. package/speechflow-cli/src/speechflow-node-a2t-google.ts +315 -0
  140. package/speechflow-cli/src/speechflow-node-a2t-openai.ts +12 -7
  141. package/speechflow-cli/src/speechflow-node-t2a-amazon.ts +32 -10
  142. package/speechflow-cli/src/speechflow-node-t2a-elevenlabs.ts +6 -4
  143. package/speechflow-cli/src/speechflow-node-t2a-google.ts +203 -0
  144. package/speechflow-cli/src/speechflow-node-t2a-kokoro.ts +33 -10
  145. package/speechflow-cli/src/speechflow-node-t2a-openai.ts +176 -0
  146. package/speechflow-cli/src/speechflow-node-t2a-supertonic.ts +710 -0
  147. package/speechflow-cli/src/speechflow-node-t2t-amazon.ts +3 -4
  148. package/speechflow-cli/src/speechflow-node-t2t-deepl.ts +2 -2
  149. package/speechflow-cli/src/speechflow-node-t2t-google.ts +1 -1
  150. package/speechflow-cli/src/speechflow-node-t2t-opus.ts +137 -0
  151. package/speechflow-cli/src/speechflow-node-t2t-profanity.ts +93 -0
  152. package/speechflow-cli/src/speechflow-node-t2t-punctuation.ts +201 -0
  153. package/speechflow-cli/src/speechflow-node-t2t-spellcheck.ts +188 -0
  154. package/speechflow-cli/src/speechflow-node-t2t-subtitle.ts +8 -8
  155. package/speechflow-cli/src/speechflow-node-t2t-summary.ts +229 -0
  156. package/speechflow-cli/src/speechflow-node-t2t-translate.ts +181 -0
  157. package/speechflow-cli/src/speechflow-node-x2x-filter.ts +2 -0
  158. package/speechflow-cli/src/speechflow-node-xio-exec.ts +211 -0
  159. package/speechflow-cli/src/speechflow-node-xio-file.ts +91 -80
  160. package/speechflow-cli/src/speechflow-node-xio-mqtt.ts +2 -2
  161. package/speechflow-cli/src/speechflow-node-xio-vban.ts +325 -0
  162. package/speechflow-cli/src/speechflow-node-xio-webrtc.ts +535 -0
  163. package/speechflow-cli/src/speechflow-node-xio-websocket.ts +9 -9
  164. package/speechflow-cli/src/speechflow-util-audio.ts +10 -5
  165. package/speechflow-cli/src/speechflow-util-error.ts +9 -0
  166. package/speechflow-cli/src/speechflow-util-llm.ts +367 -0
  167. package/speechflow-cli/src/speechflow-util-queue.ts +3 -3
  168. package/speechflow-cli/src/speechflow-util.ts +2 -0
  169. package/speechflow-ui-db/package.json +9 -9
  170. package/speechflow-ui-st/package.json +9 -9
  171. package/speechflow-cli/dst/speechflow-node-t2t-ollama.js +0 -293
  172. package/speechflow-cli/dst/speechflow-node-t2t-ollama.js.map +0 -1
  173. package/speechflow-cli/dst/speechflow-node-t2t-openai.js.map +0 -1
  174. package/speechflow-cli/dst/speechflow-node-t2t-transformers.js.map +0 -1
  175. package/speechflow-cli/src/speechflow-node-t2t-ollama.ts +0 -281
  176. package/speechflow-cli/src/speechflow-node-t2t-openai.ts +0 -247
  177. package/speechflow-cli/src/speechflow-node-t2t-transformers.ts +0 -247
@@ -0,0 +1,710 @@
1
+ /*
2
+ ** SpeechFlow - Speech Processing Flow Graph
3
+ ** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
4
+ ** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
5
+ */
6
+
7
+ /* standard dependencies */
8
+ import fs from "node:fs"
9
+ import path from "node:path"
10
+ import Stream from "node:stream"
11
+
12
+ /* external dependencies */
13
+ import { mkdirp } from "mkdirp"
14
+ import * as HF from "@huggingface/hub"
15
+ import SpeexResampler from "speex-resampler"
16
+ import { Duration } from "luxon"
17
+
18
+ /* @ts-expect-error no type available */
19
+ import * as ORT from "onnxruntime-node"
20
+
21
+ /* internal dependencies */
22
+ import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
23
+ import * as util from "./speechflow-util"
24
+
25
+ /* ==== SUPERTONIC TTS IMPLEMENTATION ==== */
26
+
27
/*  pair of ONNX tensors describing one voice style  */
interface SupertonicStyle {
    /*  passed as "style_ttl" to the text encoder and the vector estimator;
        its first dimension is checked against the number of texts  */
    ttl: ORT.Tensor

    /*  passed as "style_dp" to the duration predictor  */
    dp: ORT.Tensor
}
32
+
33
/*  subset of the "onnx/tts.json" configuration which is read by the TTS engine  */
interface SupertonicConfig {
    /*  "ae" section  */
    ae: {
        /*  becomes the public sample rate of the engine  */
        sample_rate: number

        /*  multiplied with the "ttl" compress factor to get the
            number of waveform samples per latent frame  */
        base_chunk_size: number

        /*  declared for completeness (the engine reads the "ttl" variant)  */
        chunk_compress_factor: number
    }

    /*  "ttl" section  */
    ttl: {
        /*  multiplied with the compress factor to get the latent channel count  */
        latent_dim: number

        /*  compress factor actually used by the engine  */
        chunk_compress_factor: number
    }
}
45
+
46
/*  build one binary mask per sequence length: positions below the
    length are 1.0, all remaining positions up to the common width
    are 0.0; the result is shaped [ batch ][ 1 ][ width ]  */
function lengthToMask (lengths: number[], maxLen: number | null = null): number[][][] {
    /*  nothing to mask without any sequence  */
    if (lengths.length === 0)
        return []

    /*  fall back to the longest sequence as the common width  */
    const width = maxLen ?? Math.max(...lengths)

    /*  emit a single-row matrix per sequence  */
    return lengths.map((length) => {
        const row: number[] = []
        for (let pos = 0; pos < width; pos++)
            row.push(pos < length ? 1.0 : 0.0)
        return [ row ]
    })
}
65
+
66
/*  derive the latent-frame mask for a batch of waveform lengths  */
function getLatentMask (wavLengths: number[], baseChunkSize: number, chunkCompressFactor: number): number[][][] {
    /*  amount of waveform length covered by a single latent frame  */
    const perFrame = baseChunkSize * chunkCompressFactor

    /*  convert every waveform length into a latent frame count
        (for integer lengths this is a ceiling division)  */
    const frameCounts: number[] = []
    for (const wavLength of wavLengths)
        frameCounts.push(Math.floor((wavLength + perFrame - 1) / perFrame))

    /*  mask is as wide as the largest frame count  */
    return lengthToMask(frameCounts)
}
76
+
77
/*  wrap a (possibly nested) numeric array into a float32 ONNX tensor of shape "dims"  */
function arrayToTensor (array: number[] | number[][] | number[][][], dims: number[]): ORT.Tensor {
    /*  collapse all nesting levels and copy into a typed array  */
    const data = new Float32Array(array.flat(Infinity) as number[])
    return new ORT.Tensor("float32", data, dims)
}
83
+
84
/*  wrap a nested integer array into an int64 ONNX tensor of shape "dims"
    (BigInt conversion throws for non-integer values)  */
function intArrayToTensor (array: number[][], dims: number[]): ORT.Tensor {
    /*  collapse nesting and convert each value to a 64-bit integer  */
    const values = array.flat(Infinity) as number[]
    const data = new BigInt64Array(values.length)
    values.forEach((value, i) => {
        data[i] = BigInt(value)
    })
    return new ORT.Tensor("int64", data, dims)
}
90
+
91
/*  split text into chunks of preferably at most "maxLen" characters:
    chunks never span paragraphs and are only broken between sentences,
    so a single sentence longer than "maxLen" stays one oversized chunk  */
function chunkText (text: string, maxLen = 300): string[] {
    /*  guard against non-string input from untyped callers  */
    if (typeof text !== "string")
        throw new Error(`chunkText expects a string, got ${typeof text}`)

    /*  sentence boundary: whitespace after ".", "!" or "?", unless the
        punctuation belongs to a common abbreviation (Mr., Dr., etc.)
        or to a single capital letter like "F."  */
    const sentenceBoundary = /(?<!Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.|Sr\.|Jr\.|Ph\.D\.|etc\.|e\.g\.|i\.e\.|vs\.|Inc\.|Ltd\.|Co\.|Corp\.|St\.|Ave\.|Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+/

    /*  collect finished chunks, ignoring empty ones  */
    const chunks: string[] = []
    const flush = (chunk: string): void => {
        if (chunk)
            chunks.push(chunk.trim())
    }

    /*  paragraphs are separated by at least one blank line  */
    for (const block of text.trim().split(/\n\s*\n+/)) {
        const paragraph = block.trim()
        if (!paragraph)
            continue

        /*  greedily pack consecutive sentences into the pending chunk  */
        let pending = ""
        for (const sentence of paragraph.split(sentenceBoundary)) {
            const fits = pending.length + sentence.length + 1 <= maxLen
            if (fits)
                pending = pending ? `${pending} ${sentence}` : sentence
            else {
                flush(pending)
                pending = sentence
            }
        }

        /*  the paragraph end always terminates the pending chunk  */
        flush(pending)
    }
    return chunks
}
129
+
130
/*  text frontend of Supertonic: normalizes raw text and maps it to the
    token ids and the text mask which are fed into the ONNX models  */
class SupertonicTextProcessor {
    /*  lookup table from Unicode code point to model token id  */
    private indexer: Record<number, number>

    /*  construct text processor
        (throws if the indexer JSON file cannot be read or parsed)  */
    constructor (unicodeIndexerJsonPath: string) {
        /*  load and parse unicode indexer JSON  */
        try {
            this.indexer = JSON.parse(fs.readFileSync(unicodeIndexerJsonPath, "utf8"))
        }
        catch (err) {
            throw new Error(`failed to parse unicode indexer JSON "${unicodeIndexerJsonPath}"`, { cause: err })
        }
    }

    /*  preprocess text: normalize, strip unsupported symbols, tidy up
        spacing and ensure the text ends with a punctuation character  */
    private preprocessText (text: string): string {
        /*  normalize text (compatibility decomposition)  */
        text = text.normalize("NFKD")

        /*  remove emojis (wide Unicode range)  */
        const emojiPattern = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu
        text = text.replace(emojiPattern, "")

        /*  replace various dashes and symbols  */
        const replacements: Record<string, string> = {
            "–": "-",
            "‑": "-",
            "—": "-",
            "¯": " ",
            "_": " ",
            "\u201C": "\"",
            "\u201D": "\"",
            "\u2018": "'",
            "\u2019": "'",
            "´": "'",
            "`": "'",
            "[": " ",
            "]": " ",
            "|": " ",
            "/": " ",
            "#": " ",
            "→": " ",
            "←": " "
        }
        for (const [ k, v ] of Object.entries(replacements))
            text = text.replaceAll(k, v)

        /*  remove selected combining diacritics (left over from the decomposition)  */
        text = text.replace(/[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]/g, "")

        /*  remove special symbols  */
        text = text.replace(/[♥☆♡©\\]/g, "")

        /*  replace known expressions  */
        const exprReplacements: Record<string, string> = {
            "@": " at ",
            "e.g.,": "for example, ",
            "i.e.,": "that is, "
        }
        for (const [ k, v ] of Object.entries(exprReplacements))
            text = text.replaceAll(k, v)

        /*  fix spacing around punctuation  */
        text = text.replace(/ ,/g, ",")
        text = text.replace(/ \./g, ".")
        text = text.replace(/ !/g, "!")
        text = text.replace(/ \?/g, "?")
        text = text.replace(/ ;/g, ";")
        text = text.replace(/ :/g, ":")
        text = text.replace(/ '/g, "'")

        /*  remove duplicate quotes  */
        text = text.replace(/""+/g, "\"")
        text = text.replace(/''+/g, "'")
        text = text.replace(/``+/g, "`")

        /*  remove extra spaces  */
        text = text.replace(/\s+/g, " ").trim()

        /*  if text doesn't end with punctuation, add a period  */
        if (!/[.!?;:,'"')\]}…。」』】〉》›»]$/.test(text))
            text += "."
        return text
    }

    /*  convert text to Unicode code points: iterate by code point (not
        by UTF-16 code unit), so a character outside the Basic Multilingual
        Plane yields its real code point instead of a lone surrogate  */
    private textToUnicodeValues (text: string): number[] {
        return Array.from(text, (char) => char.codePointAt(0) ?? 0)
    }

    /*  process text list into zero-padded token id rows (one per text)
        and the matching text mask; unknown code points map to id 0  */
    call (textList: string[]): { textIds: number[][], textMask: number[][][] } {
        /*  handle empty input  */
        if (textList.length === 0)
            return { textIds: [], textMask: [] }

        /*  preprocess all texts and split them into code points; the
            lengths MUST be taken from the code point arrays (and not from
            String.length, which counts UTF-16 code units), or the mask
            would mark padding positions as valid for non-BMP characters  */
        const unicodeRows = textList.map((t) => this.textToUnicodeValues(this.preprocessText(t)))
        const textIdsLengths = unicodeRows.map((row) => row.length)
        const maxLen = Math.max(...textIdsLengths)

        /*  convert code points to indexed token arrays  */
        const textIds: number[][] = []
        for (const unicodeVals of unicodeRows) {
            const row = Array.from<number>({ length: maxLen }).fill(0)
            for (let j = 0; j < unicodeVals.length; j++)
                row[j] = this.indexer[unicodeVals[j]] ?? 0
            textIds.push(row)
        }

        /*  generate text mask from lengths  */
        const textMask = lengthToMask(textIdsLengths)
        return { textIds, textMask }
    }
}
248
+
249
/*  Supertonic TTS engine: chains the duration predictor, text encoder,
    vector estimator and vocoder ONNX sessions to turn text into samples  */
class SupertonicTTS {
    /*  sample rate as given by the "ae" configuration section  */
    public sampleRate: number

    /*  internal TTS state  */
    private cfgs: SupertonicConfig
    private textProcessor: SupertonicTextProcessor
    private dpOrt: ORT.InferenceSession
    private textEncOrt: ORT.InferenceSession
    private vectorEstOrt: ORT.InferenceSession
    private vocoderOrt: ORT.InferenceSession
    private baseChunkSize: number
    private chunkCompressFactor: number
    private latentDim: number

    /*  construct TTS engine from configuration, text frontend and the four model sessions  */
    constructor (
        cfgs: SupertonicConfig,
        textProcessor: SupertonicTextProcessor,
        dpOrt: ORT.InferenceSession,
        textEncOrt: ORT.InferenceSession,
        vectorEstOrt: ORT.InferenceSession,
        vocoderOrt: ORT.InferenceSession
    ) {
        /*  remember collaborators  */
        this.cfgs = cfgs
        this.textProcessor = textProcessor
        this.dpOrt = dpOrt
        this.textEncOrt = textEncOrt
        this.vectorEstOrt = vectorEstOrt
        this.vocoderOrt = vocoderOrt

        /*  cache the configuration values used during inference  */
        this.sampleRate = cfgs.ae.sample_rate
        this.baseChunkSize = cfgs.ae.base_chunk_size
        this.chunkCompressFactor = cfgs.ttl.chunk_compress_factor
        this.latentDim = cfgs.ttl.latent_dim
    }

    /*  sample the initial noisy latent for a batch of durations (in
        seconds) and return it together with the latent mask, which has
        already been applied to the noise  */
    private sampleNoisyLatent (duration: number[]): { noisyLatent: number[][][], latentMask: number[][][] } {
        /*  derive the latent dimensions from the longest waveform  */
        const longestWav = Math.max(...duration) * this.sampleRate
        const wavLengths = duration.map((seconds) => Math.floor(seconds * this.sampleRate))
        const perFrame = this.baseChunkSize * this.chunkCompressFactor
        const frames = Math.floor((longestWav + perFrame - 1) / perFrame)
        const channels = this.latentDim * this.chunkCompressFactor

        /*  standard-normal sample via Box-Muller transform
            (first uniform is clamped away from zero to keep the logarithm finite)  */
        const gaussian = (): number => {
            const u1 = Math.max(1e-10, Math.random())
            const u2 = Math.random()
            return Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math.PI * u2)
        }

        /*  draw noise shaped [ batch ][ channels ][ frames ]  */
        const noisyLatent: number[][][] = []
        for (let b = 0; b < duration.length; b++) {
            const batch: number[][] = []
            for (let d = 0; d < channels; d++)
                batch.push(Array.from({ length: frames }, gaussian))
            noisyLatent.push(batch)
        }

        /*  zero out the frames beyond each entry's own length  */
        const latentMask = getLatentMask(wavLengths, this.baseChunkSize, this.chunkCompressFactor)
        noisyLatent.forEach((batch, b) => {
            const maskRow = latentMask[b][0]
            for (const row of batch)
                for (let t = 0; t < row.length; t++)
                    row[t] *= maskRow[t]
        })
        return { noisyLatent, latentMask }
    }

    /*  run the full model chain for a batch of texts and return the raw
        vocoder samples plus the (speed-adjusted) predicted durations  */
    private async infer (textList: string[], style: SupertonicStyle, totalStep: number, speed: number): Promise<{ wav: number[], duration: number[] }> {
        /*  one style vector is required per text  */
        const batchSize = textList.length
        if (batchSize !== style.ttl.dims[0])
            throw new Error("Number of texts must match number of style vectors")

        /*  tokenize the texts  */
        const { textIds, textMask } = this.textProcessor.call(textList)
        const textIdsShape = [ batchSize, textIds[0].length ]
        const textMaskTensor = arrayToTensor(textMask, [ batchSize, 1, textMask[0][0].length ])

        /*  predict the durations and scale them by the speed factor  */
        const dpResult = await this.dpOrt.run({
            text_ids:  intArrayToTensor(textIds, textIdsShape),
            style_dp:  style.dp,
            text_mask: textMaskTensor
        })
        const predictedDurations = Array.from(dpResult.duration.data as Float32Array,
            (seconds) => seconds / speed)

        /*  encode the text  */
        const textEncResult = await this.textEncOrt.run({
            text_ids:  intArrayToTensor(textIds, textIdsShape),
            style_ttl: style.ttl,
            text_mask: textMaskTensor
        })
        const textEmbTensor = textEncResult.text_emb

        /*  start from masked Gaussian noise  */
        const { noisyLatent, latentMask } = this.sampleNoisyLatent(predictedDurations)
        const latentShape = [ batchSize, noisyLatent[0].length, noisyLatent[0][0].length ]
        const latentMaskTensor = arrayToTensor(latentMask, [ batchSize, 1, latentMask[0][0].length ])

        /*  per-batch step counters  */
        const scalarShape = [ batchSize ]
        const perBatch = (value: number): number[] =>
            Array.from<number>({ length: batchSize }).fill(value)
        const totalStepTensor = arrayToTensor(perBatch(totalStep), scalarShape)

        /*  iteratively denoise: each step replaces the latent
            with the output of the vector estimator  */
        for (let step = 0; step < totalStep; step++) {
            const vectorEstResult = await this.vectorEstOrt.run({
                noisy_latent: arrayToTensor(noisyLatent, latentShape),
                text_emb:     textEmbTensor,
                style_ttl:    style.ttl,
                text_mask:    textMaskTensor,
                latent_mask:  latentMaskTensor,
                total_step:   totalStepTensor,
                current_step: arrayToTensor(perBatch(step), scalarShape)
            })
            const denoisedLatent = vectorEstResult.denoised_latent.data as Float32Array

            /*  copy the flat output back into the nested latent  */
            let idx = 0
            for (const batch of noisyLatent)
                for (const row of batch)
                    for (let t = 0; t < row.length; t++)
                        row[t] = denoisedLatent[idx++]
        }

        /*  turn the final latent into a waveform  */
        const vocoderResult = await this.vocoderOrt.run({
            latent: arrayToTensor(noisyLatent, latentShape)
        })
        return {
            wav:      Array.from(vocoderResult.wav_tts.data as Float32Array),
            duration: predictedDurations
        }
    }

    /*  synthesize speech from text with a single voice style: the text
        is chunked, every chunk is synthesized on its own and the results
        are joined with "silenceDuration" seconds of silence in between  */
    async synthesize (text: string, style: SupertonicStyle, totalStep: number, speed: number, silenceDuration = 0.3): Promise<{ wav: number[], duration: number }> {
        /*  only a single style vector is supported here  */
        if (style.ttl.dims[0] !== 1)
            throw new Error("Single speaker text to speech only supports single style")

        /*  nothing to say, nothing to synthesize  */
        const textList = chunkText(text)
        if (textList.length === 0)
            return { wav: [], duration: 0 }

        /*  synthesize chunk by chunk (strictly sequentially)  */
        const wavParts: number[][] = []
        let totalDuration = 0
        for (const [ i, chunk ] of textList.entries()) {
            const { wav, duration } = await this.infer([ chunk ], style, totalStep, speed)

            /*  separate this chunk from the previous one  */
            if (i > 0) {
                const silenceLen = Math.floor(silenceDuration * this.sampleRate)
                wavParts.push(Array.from<number>({ length: silenceLen }).fill(0))
                totalDuration += silenceDuration
            }
            wavParts.push(wav)
            totalDuration += duration[0]
        }
        return { wav: wavParts.flat(), duration: totalDuration }
    }

    /*  release all ONNX inference sessions of the engine  */
    async release (): Promise<void> {
        const sessions = [ this.dpOrt, this.textEncOrt, this.vectorEstOrt, this.vocoderOrt ]
        await Promise.all(sessions.map((session) => session.release()))
    }
}
442
+
443
/*  on-disk layout of a voice style JSON file: each entry carries the
    tensor dimensions and the nested tensor values  */
interface VoiceStyleJSON {
    /*  becomes the "ttl" style tensor  */
    style_ttl: { dims: number[], data: number[][][] }

    /*  becomes the "dp" style tensor  */
    style_dp: { dims: number[], data: number[][][] }
}
448
+
449
/*  load a voice style JSON file and convert it into float32 ONNX tensors
    (throws, with the original error as cause, if reading or parsing fails)  */
async function loadVoiceStyle (voiceStylePath: string): Promise<SupertonicStyle> {
    /*  read and parse the file  */
    let parsed: VoiceStyleJSON
    try {
        const json = await fs.promises.readFile(voiceStylePath, "utf8")
        parsed = JSON.parse(json) as VoiceStyleJSON
    }
    catch (err) {
        throw new Error(`failed to parse voice style JSON "${voiceStylePath}"`, { cause: err })
    }

    /*  flatten the nested values of one entry into a tensor of its declared dimensions  */
    const toTensor = (entry: { dims: number[], data: number[][][] }): ORT.Tensor =>
        new ORT.Tensor("float32", Float32Array.from(entry.data.flat(Infinity) as number[]), entry.dims)

    return { ttl: toTensor(parsed.style_ttl), dp: toTensor(parsed.style_dp) }
}
471
+
472
/*  load the TTS engine from the assets directory: reads "onnx/tts.json"
    and "onnx/unicode_indexer.json" and creates the four ONNX sessions;
    throws if the configuration cannot be parsed or a model fails to
    load (already created sessions are released again in that case)  */
async function loadSupertonic (assetsDir: string): Promise<SupertonicTTS> {
    const onnxDir = path.join(assetsDir, "onnx")

    /*  load configuration  */
    const cfgPath = path.join(onnxDir, "tts.json")
    let cfgs: SupertonicConfig
    try {
        cfgs = JSON.parse(await fs.promises.readFile(cfgPath, "utf8"))
    }
    catch (err) {
        throw new Error(`failed to parse TTS config JSON "${cfgPath}"`, { cause: err })
    }

    /*  load text processor  */
    const unicodeIndexerPath = path.join(onnxDir, "unicode_indexer.json")
    const textProcessor = new SupertonicTextProcessor(unicodeIndexerPath)

    /*  load ONNX models in parallel, but wait for ALL of them to settle,
        so a partial failure cannot leave sessions behind unreleased  */
    const opts: ORT.InferenceSession.SessionOptions = {}
    const modelFiles = [
        "duration_predictor.onnx",
        "text_encoder.onnx",
        "vector_estimator.onnx",
        "vocoder.onnx"
    ]
    const results = await Promise.allSettled(modelFiles.map((file) =>
        ORT.InferenceSession.create(path.join(onnxDir, file), opts)))

    /*  on failure: release whatever was created, then report the first error  */
    const failure = results.find((r): r is PromiseRejectedResult => r.status === "rejected")
    if (failure !== undefined) {
        const created = results.filter((r): r is PromiseFulfilledResult<ORT.InferenceSession> =>
            r.status === "fulfilled")
        await Promise.allSettled(created.map((r) => r.value.release()))
        throw failure.reason
    }

    /*  all sessions are available (same order as the model file list)  */
    const [ dpOrt, textEncOrt, vectorEstOrt, vocoderOrt ] = results.map((r) =>
        (r as PromiseFulfilledResult<ORT.InferenceSession>).value)
    return new SupertonicTTS(cfgs, textProcessor, dpOrt, textEncOrt, vectorEstOrt, vocoderOrt)
}
498
+
499
+ /* ==== SPEECHFLOW NODE IMPLEMENTATION ==== */
500
+
501
/*  SpeechFlow node for Supertonic text-to-speech conversion:
    consumes "text" chunks and emits "audio" chunks carrying PCM/I16 samples
    resampled to the configured audio sample rate  */
export default class SpeechFlowNodeT2ASupertonic extends SpeechFlowNode {
    /*  declare official node name  */
    public static name = "t2a-supertonic"

    /*  internal state (all established in open() and dropped in close())  */
    private supertonic: SupertonicTTS  | null = null
    private style:      SupertonicStyle | null = null
    private resampler:  SpeexResampler  | null = null
    private closing = false

    /*  construct node  */
    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
        super(id, cfg, opts, args)

        /*  declare node configuration parameters  */
        this.configure({
            voice: { type: "string", val: "M1", pos: 0, match: /^(?:M1|M2|F1|F2)$/ },
            speed: { type: "number", val: 1.40, pos: 1, match: (n: number) => n >= 0.5 && n <= 2.0 },
            steps: { type: "number", val: 20,   pos: 2, match: (n: number) => n >= 1 && n <= 20 }
        })

        /*  declare node input/output format  */
        this.input  = "text"
        this.output = "audio"
    }

    /*  one-time status of node (this node reports nothing)  */
    async status () {
        return {}
    }

    /*  download the HuggingFace assets which are still missing in the
        local cache directory and return the asset directory  */
    private async downloadAssets () {
        /*  define HuggingFace repository and required files  */
        const assetRepo = "Supertone/supertonic"
        const assetFiles = [
            "voice_styles/F1.json",
            "voice_styles/F2.json",
            "voice_styles/M1.json",
            "voice_styles/M2.json",
            "onnx/tts.json",
            "onnx/duration_predictor.onnx",
            "onnx/text_encoder.onnx",
            "onnx/unicode_indexer.json",
            "onnx/vector_estimator.onnx",
            "onnx/vocoder.onnx"
        ]

        /*  create asset directories  */
        const assetDir = path.join(this.config.cacheDir, "supertonic")
        await mkdirp(path.join(assetDir, "voice_styles"), { mode: 0o750 })
        await mkdirp(path.join(assetDir, "onnx"), { mode: 0o750 })

        /*  download missing asset files  */
        for (const assetFile of assetFiles) {
            const url  = `${assetRepo}/${assetFile}`
            const file = path.join(assetDir, assetFile)
            const stat = await fs.promises.stat(file).catch((_err) => null)
            if (stat === null || !stat.isFile()) {
                this.log("info", `downloading from HuggingFace "${url}"`)
                const response = await HF.downloadFile({ repo: assetRepo, path: assetFile })
                if (!response)
                    throw new Error(`failed to download from HuggingFace "${url}"`)
                const buffer = Buffer.from(await response.arrayBuffer())

                /*  write to a temporary file and rename it into place, because
                    the existence check above would otherwise accept a truncated
                    file left behind by an interrupted write on every later run  */
                const fileTmp = `${file}.${process.pid}.tmp`
                try {
                    await fs.promises.writeFile(fileTmp, buffer)
                    await fs.promises.rename(fileTmp, file)
                }
                catch (err) {
                    await fs.promises.rm(fileTmp, { force: true }).catch(() => undefined)
                    throw err
                }
            }
        }
        return assetDir
    }

    /*  open node: fetch assets, load engine and voice style,
        and establish the text-to-audio transform stream  */
    async open () {
        this.closing = false

        /*  download assets  */
        const assetsDir = await this.downloadAssets()

        /*  load ONNX models  */
        this.log("info", `loading ONNX models (asset dir: "${assetsDir}")`)
        const supertonic = await loadSupertonic(assetsDir)
        this.supertonic = supertonic
        this.log("info", `loaded ONNX models (sample rate: ${supertonic.sampleRate}Hz)`)

        /*  load voice style  */
        const voiceStylePath = path.join(assetsDir, "voice_styles", `${this.params.voice}.json`)
        if (!fs.existsSync(voiceStylePath))
            throw new Error(`voice style not found: ${voiceStylePath}`)
        this.log("info", `loading voice style "${this.params.voice}"`)
        const style = await loadVoiceStyle(voiceStylePath)
        this.style = style
        this.log("info", `loaded voice style "${this.params.voice}"`)

        /*  establish resampler from Supertonic's output sample rate to our standard audio sample rate (48kHz)  */
        const resampler = new SpeexResampler(1, supertonic.sampleRate, this.config.audioSampleRate, 7)
        this.resampler = resampler

        /*  perform text-to-speech operation with Supertonic
            (operates on the local references above, so a close() during a
            pending synthesis cannot make it dereference nulled fields)  */
        const text2speech = async (text: string) => {
            /*  synthesize speech from text  */
            this.log("info", `Supertonic: input: "${text}"`)
            const { wav, duration } = await supertonic.synthesize(
                text,
                style,
                this.params.steps,
                this.params.speed
            )
            this.log("info", `Supertonic: synthesized ${duration.toFixed(2)}s of audio`)

            /*  convert audio samples from PCM/F32 to PCM/I16 (clamped to [-1, 1])  */
            const buffer1 = Buffer.alloc(wav.length * 2)
            for (let i = 0; i < wav.length; i++) {
                const sample = Math.max(-1, Math.min(1, wav[i]))
                buffer1.writeInt16LE(sample * 0x7FFF, i * 2)
            }

            /*  resample audio samples from Supertonic sample rate to 48kHz  */
            return resampler.processChunk(buffer1)
        }

        /*  create transform stream and connect it to the Supertonic TTS  */
        const self = this
        this.stream = new Stream.Transform({
            writableObjectMode: true,
            readableObjectMode: true,
            decodeStrings:      false,
            highWaterMark:      1,
            async transform (chunk: SpeechFlowChunk, encoding, callback) {
                if (self.closing)
                    callback(new Error("stream already destroyed"))
                else if (Buffer.isBuffer(chunk.payload))
                    callback(new Error("invalid chunk payload type"))
                else if (chunk.payload === "")
                    callback()
                else {
                    /*  the stream callback must be invoked exactly once, but the
                        timeout and the (uncancellable) synthesis race each other,
                        so funnel every completion through a single-shot guard  */
                    let settled = false
                    const complete = (error?: Error) => {
                        if (settled)
                            return
                        settled = true
                        clearTimeout(processTimeout)
                        if (error !== undefined)
                            callback(error)
                        else
                            callback()
                    }
                    const processTimeout = setTimeout(() => {
                        complete(new Error("Supertonic TTS timeout"))
                    }, 120 * 1000)

                    try {
                        const buffer = await text2speech(chunk.payload as string)

                        /*  the timeout was already reported: drop the late result  */
                        if (settled)
                            return
                        if (self.closing) {
                            complete(new Error("stream destroyed during processing"))
                            return
                        }
                        self.log("info", `Supertonic: received audio (buffer length: ${buffer.byteLength})`)

                        /*  calculate actual audio duration from PCM buffer size  */
                        const durationMs = util.audioBufferDuration(buffer,
                            self.config.audioSampleRate, self.config.audioBitDepth) * 1000

                        /*  create new chunk with recalculated timestamps  */
                        const chunkNew = chunk.clone()
                        chunkNew.type         = "audio"
                        chunkNew.payload      = buffer
                        chunkNew.timestampEnd = Duration.fromMillis(chunkNew.timestampStart.toMillis() + durationMs)

                        /*  push chunk and complete transform  */
                        this.push(chunkNew)
                        complete()
                    }
                    catch (error) {
                        /*  handle processing errors (ignored if already completed)  */
                        complete(util.ensureError(error, "Supertonic processing failed"))
                    }
                }
            },
            final (callback) {
                callback()
            }
        })
    }

    /*  close node: stop the stream first, then drop all engine resources  */
    async close () {
        /*  indicate closing  */
        this.closing = true

        /*  shutdown stream  */
        if (this.stream !== null) {
            await util.destroyStream(this.stream)
            this.stream = null
        }

        /*  destroy voice style  */
        this.style = null

        /*  destroy resampler  */
        this.resampler = null

        /*  destroy Supertonic TTS  */
        if (this.supertonic !== null) {
            await this.supertonic.release()
            this.supertonic = null
        }
    }
}