speechflow 2.0.1 → 2.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +15 -0
- package/etc/claude.md +1 -1
- package/package.json +4 -4
- package/speechflow-cli/dst/speechflow-main-api.js.map +1 -1
- package/speechflow-cli/dst/speechflow-main-graph.js +4 -4
- package/speechflow-cli/dst/speechflow-main-graph.js.map +1 -1
- package/speechflow-cli/dst/speechflow-main.js +1 -1
- package/speechflow-cli/dst/speechflow-main.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-compressor.js +6 -6
- package/speechflow-cli/dst/speechflow-node-a2a-compressor.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-filler.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-mute.js +2 -2
- package/speechflow-cli/dst/speechflow-node-a2a-mute.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-rnnoise.js +1 -1
- package/speechflow-cli/dst/speechflow-node-a2t-amazon.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2t-google.js +8 -8
- package/speechflow-cli/dst/speechflow-node-a2t-google.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js +9 -9
- package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-google.js +3 -3
- package/speechflow-cli/dst/speechflow-node-t2a-google.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-supertonic.d.ts +2 -3
- package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js +93 -466
- package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-amazon.js +4 -4
- package/speechflow-cli/dst/speechflow-node-t2t-amazon.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-deepl.js +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-deepl.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-format.js +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-format.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-google.js +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-google.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-modify.js +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-modify.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-opus.js +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-opus.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js +1 -2
- package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-spellcheck.js +1 -2
- package/speechflow-cli/dst/speechflow-node-t2t-spellcheck.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js +2 -2
- package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-summary.js +3 -4
- package/speechflow-cli/dst/speechflow-node-t2t-summary.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-translate.js +1 -2
- package/speechflow-cli/dst/speechflow-node-t2t-translate.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-x2x-filter.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-exec.js +2 -2
- package/speechflow-cli/dst/speechflow-node-xio-exec.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-file.js +2 -2
- package/speechflow-cli/dst/speechflow-node-xio-file.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-vban.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-webrtc.js +1 -1
- package/speechflow-cli/dst/speechflow-util-audio.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-util-audio.js +10 -3
- package/speechflow-cli/dst/speechflow-util-audio.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-llm.d.ts +0 -1
- package/speechflow-cli/dst/speechflow-util-llm.js +4 -8
- package/speechflow-cli/dst/speechflow-util-llm.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-queue.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-stream.js +4 -5
- package/speechflow-cli/dst/speechflow-util-stream.js.map +1 -1
- package/speechflow-cli/etc/eslint.mjs +1 -3
- package/speechflow-cli/etc/oxlint.jsonc +4 -1
- package/speechflow-cli/etc/stx.conf +0 -1
- package/speechflow-cli/package.json +16 -19
- package/speechflow-cli/src/lib.d.ts +5 -1
- package/speechflow-cli/src/speechflow-main-api.ts +4 -4
- package/speechflow-cli/src/speechflow-main-cli.ts +1 -1
- package/speechflow-cli/src/speechflow-main-graph.ts +16 -16
- package/speechflow-cli/src/speechflow-main-nodes.ts +1 -1
- package/speechflow-cli/src/speechflow-main-status.ts +2 -2
- package/speechflow-cli/src/speechflow-main.ts +1 -1
- package/speechflow-cli/src/speechflow-node-a2a-compressor-wt.ts +3 -3
- package/speechflow-cli/src/speechflow-node-a2a-compressor.ts +6 -6
- package/speechflow-cli/src/speechflow-node-a2a-expander-wt.ts +2 -2
- package/speechflow-cli/src/speechflow-node-a2a-filler.ts +4 -4
- package/speechflow-cli/src/speechflow-node-a2a-gender.ts +1 -1
- package/speechflow-cli/src/speechflow-node-a2a-mute.ts +2 -2
- package/speechflow-cli/src/speechflow-node-a2a-pitch.ts +1 -1
- package/speechflow-cli/src/speechflow-node-a2a-rnnoise-wt.ts +2 -2
- package/speechflow-cli/src/speechflow-node-a2a-rnnoise.ts +1 -1
- package/speechflow-cli/src/speechflow-node-a2t-amazon.ts +2 -2
- package/speechflow-cli/src/speechflow-node-a2t-google.ts +8 -8
- package/speechflow-cli/src/speechflow-node-t2a-elevenlabs.ts +9 -9
- package/speechflow-cli/src/speechflow-node-t2a-google.ts +3 -3
- package/speechflow-cli/src/speechflow-node-t2a-supertonic.ts +103 -577
- package/speechflow-cli/src/speechflow-node-t2t-amazon.ts +4 -4
- package/speechflow-cli/src/speechflow-node-t2t-deepl.ts +1 -1
- package/speechflow-cli/src/speechflow-node-t2t-format.ts +1 -1
- package/speechflow-cli/src/speechflow-node-t2t-google.ts +2 -2
- package/speechflow-cli/src/speechflow-node-t2t-modify.ts +2 -2
- package/speechflow-cli/src/speechflow-node-t2t-opus.ts +1 -1
- package/speechflow-cli/src/speechflow-node-t2t-punctuation.ts +1 -2
- package/speechflow-cli/src/speechflow-node-t2t-spellcheck.ts +1 -2
- package/speechflow-cli/src/speechflow-node-t2t-subtitle.ts +2 -2
- package/speechflow-cli/src/speechflow-node-t2t-summary.ts +3 -4
- package/speechflow-cli/src/speechflow-node-t2t-translate.ts +1 -2
- package/speechflow-cli/src/speechflow-node-x2x-filter.ts +4 -4
- package/speechflow-cli/src/speechflow-node-xio-exec.ts +2 -2
- package/speechflow-cli/src/speechflow-node-xio-file.ts +2 -2
- package/speechflow-cli/src/speechflow-node-xio-vban.ts +4 -2
- package/speechflow-cli/src/speechflow-node-xio-webrtc.ts +1 -1
- package/speechflow-cli/src/speechflow-util-audio.ts +11 -3
- package/speechflow-cli/src/speechflow-util-llm.ts +4 -9
- package/speechflow-cli/src/speechflow-util-queue.ts +1 -1
- package/speechflow-cli/src/speechflow-util-stream.ts +4 -5
- package/speechflow-ui-db/dst/index.js +13 -13
- package/speechflow-ui-db/etc/oxlint.jsonc +137 -0
- package/speechflow-ui-db/etc/stx.conf +4 -3
- package/speechflow-ui-db/package.json +9 -6
- package/speechflow-ui-st/dst/index.js +27 -27
- package/speechflow-ui-st/etc/oxlint.jsonc +137 -0
- package/speechflow-ui-st/etc/stx.conf +4 -3
- package/speechflow-ui-st/package.json +9 -6
- package/speechflow-cli/etc/biome.jsonc +0 -46
- package/speechflow-ui-db/src/lib.d.ts +0 -9
- package/speechflow-ui-st/src/lib.d.ts +0 -9
|
@@ -5,509 +5,27 @@
|
|
|
5
5
|
*/
|
|
6
6
|
|
|
7
7
|
/* standard dependencies */
|
|
8
|
-
import fs from "node:fs"
|
|
9
|
-
import path from "node:path"
|
|
10
8
|
import Stream from "node:stream"
|
|
11
9
|
|
|
12
10
|
/* external dependencies */
|
|
13
|
-
import
|
|
14
|
-
import
|
|
15
|
-
import
|
|
16
|
-
import { Duration } from "luxon"
|
|
17
|
-
|
|
18
|
-
/* @ts-expect-error no type available */
|
|
19
|
-
import * as ORT from "onnxruntime-node"
|
|
11
|
+
import * as Transformers from "@huggingface/transformers"
|
|
12
|
+
import SpeexResampler from "speex-resampler"
|
|
13
|
+
import { Duration } from "luxon"
|
|
20
14
|
|
|
21
15
|
/* internal dependencies */
|
|
22
16
|
import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
|
|
23
17
|
import * as util from "./speechflow-util"
|
|
24
18
|
|
|
25
|
-
/* ==== SUPERTONIC TTS IMPLEMENTATION ==== */
|
|
26
|
-
|
|
27
|
-
/* type for voice style tensors */
|
|
28
|
-
interface SupertonicStyle {
|
|
29
|
-
ttl: ORT.Tensor
|
|
30
|
-
dp: ORT.Tensor
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
/* type for TTS configuration */
|
|
34
|
-
interface SupertonicConfig {
|
|
35
|
-
ae: {
|
|
36
|
-
sample_rate: number
|
|
37
|
-
base_chunk_size: number
|
|
38
|
-
chunk_compress_factor: number
|
|
39
|
-
}
|
|
40
|
-
ttl: {
|
|
41
|
-
latent_dim: number
|
|
42
|
-
chunk_compress_factor: number
|
|
43
|
-
}
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
/* convert lengths to binary mask */
|
|
47
|
-
function lengthToMask (lengths: number[], maxLen: number | null = null): number[][][] {
|
|
48
|
-
/* handle empty input */
|
|
49
|
-
if (lengths.length === 0)
|
|
50
|
-
return []
|
|
51
|
-
|
|
52
|
-
/* determine maximum length */
|
|
53
|
-
maxLen = maxLen ?? Math.max(...lengths)
|
|
54
|
-
|
|
55
|
-
/* build mask array */
|
|
56
|
-
const mask: number[][][] = []
|
|
57
|
-
for (let i = 0; i < lengths.length; i++) {
|
|
58
|
-
const row: number[] = []
|
|
59
|
-
for (let j = 0; j < maxLen; j++)
|
|
60
|
-
row.push(j < lengths[i] ? 1.0 : 0.0)
|
|
61
|
-
mask.push([ row ])
|
|
62
|
-
}
|
|
63
|
-
return mask
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
/* get latent mask from wav lengths */
|
|
67
|
-
function getLatentMask (wavLengths: number[], baseChunkSize: number, chunkCompressFactor: number): number[][][] {
|
|
68
|
-
/* calculate latent size and lengths */
|
|
69
|
-
const latentSize = baseChunkSize * chunkCompressFactor
|
|
70
|
-
const latentLengths = wavLengths.map((len) =>
|
|
71
|
-
Math.floor((len + latentSize - 1) / latentSize))
|
|
72
|
-
|
|
73
|
-
/* generate mask from latent lengths */
|
|
74
|
-
return lengthToMask(latentLengths)
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
/* convert array to ONNX tensor */
|
|
78
|
-
function arrayToTensor (array: number[] | number[][] | number[][][], dims: number[]): ORT.Tensor {
|
|
79
|
-
/* flatten array and create float32 tensor */
|
|
80
|
-
const flat = array.flat(Infinity) as number[]
|
|
81
|
-
return new ORT.Tensor("float32", Float32Array.from(flat), dims)
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
/* convert int array to ONNX tensor */
|
|
85
|
-
function intArrayToTensor (array: number[][], dims: number[]): ORT.Tensor {
|
|
86
|
-
/* flatten array and create int64 tensor */
|
|
87
|
-
const flat = array.flat(Infinity) as number[]
|
|
88
|
-
return new ORT.Tensor("int64", BigInt64Array.from(flat.map(BigInt)), dims)
|
|
89
|
-
}
|
|
90
|
-
|
|
91
|
-
/* chunk text into manageable segments */
|
|
92
|
-
function chunkText (text: string, maxLen = 300): string[] {
|
|
93
|
-
/* validate input type */
|
|
94
|
-
if (typeof text !== "string")
|
|
95
|
-
throw new Error(`chunkText expects a string, got ${typeof text}`)
|
|
96
|
-
|
|
97
|
-
/* split by paragraph (two or more newlines) */
|
|
98
|
-
const paragraphs = text.trim().split(/\n\s*\n+/).filter((p) => p.trim())
|
|
99
|
-
|
|
100
|
-
/* process each paragraph into chunks */
|
|
101
|
-
const chunks: string[] = []
|
|
102
|
-
for (let paragraph of paragraphs) {
|
|
103
|
-
paragraph = paragraph.trim()
|
|
104
|
-
if (!paragraph)
|
|
105
|
-
continue
|
|
106
|
-
|
|
107
|
-
/* split by sentence boundaries (period, question mark, exclamation mark followed by space)
|
|
108
|
-
but exclude common abbreviations like Mr., Mrs., Dr., etc. and single capital letters like F. */
|
|
109
|
-
const sentences = paragraph.split(/(?<!Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.|Sr\.|Jr\.|Ph\.D\.|etc\.|e\.g\.|i\.e\.|vs\.|Inc\.|Ltd\.|Co\.|Corp\.|St\.|Ave\.|Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+/)
|
|
110
|
-
|
|
111
|
-
/* accumulate sentences into chunks respecting max length */
|
|
112
|
-
let currentChunk = ""
|
|
113
|
-
for (const sentence of sentences) {
|
|
114
|
-
if (currentChunk.length + sentence.length + 1 <= maxLen)
|
|
115
|
-
currentChunk += (currentChunk ? " " : "") + sentence
|
|
116
|
-
else {
|
|
117
|
-
if (currentChunk)
|
|
118
|
-
chunks.push(currentChunk.trim())
|
|
119
|
-
currentChunk = sentence
|
|
120
|
-
}
|
|
121
|
-
}
|
|
122
|
-
|
|
123
|
-
/* push remaining chunk */
|
|
124
|
-
if (currentChunk)
|
|
125
|
-
chunks.push(currentChunk.trim())
|
|
126
|
-
}
|
|
127
|
-
return chunks
|
|
128
|
-
}
|
|
129
|
-
|
|
130
|
-
/* unicode text processor class */
|
|
131
|
-
class SupertonicTextProcessor {
|
|
132
|
-
private indexer: Record<number, number>
|
|
133
|
-
|
|
134
|
-
/* construct text processor */
|
|
135
|
-
constructor (unicodeIndexerJsonPath: string) {
|
|
136
|
-
/* load and parse unicode indexer JSON */
|
|
137
|
-
try {
|
|
138
|
-
this.indexer = JSON.parse(fs.readFileSync(unicodeIndexerJsonPath, "utf8"))
|
|
139
|
-
}
|
|
140
|
-
catch (err) {
|
|
141
|
-
throw new Error(`failed to parse unicode indexer JSON "${unicodeIndexerJsonPath}"`, { cause: err })
|
|
142
|
-
}
|
|
143
|
-
}
|
|
144
|
-
|
|
145
|
-
/* preprocess text */
|
|
146
|
-
private preprocessText (text: string): string {
|
|
147
|
-
/* normalize text */
|
|
148
|
-
text = text.normalize("NFKD")
|
|
149
|
-
|
|
150
|
-
/* remove emojis (wide Unicode range) */
|
|
151
|
-
const emojiPattern = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F1E6}-\u{1F1FF}]+/gu
|
|
152
|
-
text = text.replace(emojiPattern, "")
|
|
153
|
-
|
|
154
|
-
/* replace various dashes and symbols */
|
|
155
|
-
const replacements: Record<string, string> = {
|
|
156
|
-
"–": "-",
|
|
157
|
-
"‑": "-",
|
|
158
|
-
"—": "-",
|
|
159
|
-
"¯": " ",
|
|
160
|
-
"_": " ",
|
|
161
|
-
"\u201C": "\"",
|
|
162
|
-
"\u201D": "\"",
|
|
163
|
-
"\u2018": "'",
|
|
164
|
-
"\u2019": "'",
|
|
165
|
-
"´": "'",
|
|
166
|
-
"`": "'",
|
|
167
|
-
"[": " ",
|
|
168
|
-
"]": " ",
|
|
169
|
-
"|": " ",
|
|
170
|
-
"/": " ",
|
|
171
|
-
"#": " ",
|
|
172
|
-
"→": " ",
|
|
173
|
-
"←": " "
|
|
174
|
-
}
|
|
175
|
-
for (const [ k, v ] of Object.entries(replacements))
|
|
176
|
-
text = text.replaceAll(k, v)
|
|
177
|
-
|
|
178
|
-
/* remove combining diacritics */
|
|
179
|
-
text = text.replace(/[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]/g, "")
|
|
180
|
-
|
|
181
|
-
/* remove special symbols */
|
|
182
|
-
text = text.replace(/[♥☆♡©\\]/g, "")
|
|
183
|
-
|
|
184
|
-
/* replace known expressions */
|
|
185
|
-
const exprReplacements: Record<string, string> = {
|
|
186
|
-
"@": " at ",
|
|
187
|
-
"e.g.,": "for example, ",
|
|
188
|
-
"i.e.,": "that is, "
|
|
189
|
-
}
|
|
190
|
-
for (const [ k, v ] of Object.entries(exprReplacements))
|
|
191
|
-
text = text.replaceAll(k, v)
|
|
192
|
-
|
|
193
|
-
/* fix spacing around punctuation */
|
|
194
|
-
text = text.replace(/ ,/g, ",")
|
|
195
|
-
text = text.replace(/ \./g, ".")
|
|
196
|
-
text = text.replace(/ !/g, "!")
|
|
197
|
-
text = text.replace(/ \?/g, "?")
|
|
198
|
-
text = text.replace(/ ;/g, ";")
|
|
199
|
-
text = text.replace(/ :/g, ":")
|
|
200
|
-
text = text.replace(/ '/g, "'")
|
|
201
|
-
|
|
202
|
-
/* remove duplicate quotes */
|
|
203
|
-
text = text.replace(/""+/g, "\"")
|
|
204
|
-
text = text.replace(/''+/g, "'")
|
|
205
|
-
text = text.replace(/``+/g, "`")
|
|
206
|
-
|
|
207
|
-
/* remove extra spaces */
|
|
208
|
-
text = text.replace(/\s+/g, " ").trim()
|
|
209
|
-
|
|
210
|
-
/* if text doesn't end with punctuation, add a period */
|
|
211
|
-
if (!/[.!?;:,'"')\]}…。」』】〉》›»]$/.test(text))
|
|
212
|
-
text += "."
|
|
213
|
-
return text
|
|
214
|
-
}
|
|
215
|
-
|
|
216
|
-
/* convert text to Unicode values */
|
|
217
|
-
private textToUnicodeValues (text: string): number[] {
|
|
218
|
-
/* convert text characters to unicode code points */
|
|
219
|
-
return Array.from(text).map((char) => char.charCodeAt(0))
|
|
220
|
-
}
|
|
221
|
-
|
|
222
|
-
/* process text list */
|
|
223
|
-
call (textList: string[]): { textIds: number[][], textMask: number[][][] } {
|
|
224
|
-
/* handle empty input */
|
|
225
|
-
if (textList.length === 0)
|
|
226
|
-
return { textIds: [], textMask: [] }
|
|
227
|
-
|
|
228
|
-
/* preprocess all texts */
|
|
229
|
-
const processedTexts = textList.map((t) => this.preprocessText(t))
|
|
230
|
-
const textIdsLengths = processedTexts.map((t) => t.length)
|
|
231
|
-
const maxLen = Math.max(...textIdsLengths)
|
|
232
|
-
|
|
233
|
-
/* convert texts to indexed token arrays */
|
|
234
|
-
const textIds: number[][] = []
|
|
235
|
-
for (let i = 0; i < processedTexts.length; i++) {
|
|
236
|
-
const row = Array.from<number>({ length: maxLen }).fill(0)
|
|
237
|
-
const unicodeVals = this.textToUnicodeValues(processedTexts[i])
|
|
238
|
-
for (let j = 0; j < unicodeVals.length; j++)
|
|
239
|
-
row[j] = this.indexer[unicodeVals[j]] ?? 0
|
|
240
|
-
textIds.push(row)
|
|
241
|
-
}
|
|
242
|
-
|
|
243
|
-
/* generate text mask from lengths */
|
|
244
|
-
const textMask = lengthToMask(textIdsLengths)
|
|
245
|
-
return { textIds, textMask }
|
|
246
|
-
}
|
|
247
|
-
}
|
|
248
|
-
|
|
249
|
-
/* Supertonic TTS engine class */
|
|
250
|
-
class SupertonicTTS {
|
|
251
|
-
public sampleRate: number
|
|
252
|
-
|
|
253
|
-
/* internal TTS state */
|
|
254
|
-
private cfgs: SupertonicConfig
|
|
255
|
-
private textProcessor: SupertonicTextProcessor
|
|
256
|
-
private dpOrt: ORT.InferenceSession
|
|
257
|
-
private textEncOrt: ORT.InferenceSession
|
|
258
|
-
private vectorEstOrt: ORT.InferenceSession
|
|
259
|
-
private vocoderOrt: ORT.InferenceSession
|
|
260
|
-
private baseChunkSize: number
|
|
261
|
-
private chunkCompressFactor: number
|
|
262
|
-
private latentDim: number
|
|
263
|
-
|
|
264
|
-
/* construct TTS engine */
|
|
265
|
-
constructor (
|
|
266
|
-
cfgs: SupertonicConfig,
|
|
267
|
-
textProcessor: SupertonicTextProcessor,
|
|
268
|
-
dpOrt: ORT.InferenceSession,
|
|
269
|
-
textEncOrt: ORT.InferenceSession,
|
|
270
|
-
vectorEstOrt: ORT.InferenceSession,
|
|
271
|
-
vocoderOrt: ORT.InferenceSession
|
|
272
|
-
) {
|
|
273
|
-
/* store configuration and dependencies */
|
|
274
|
-
this.cfgs = cfgs
|
|
275
|
-
this.textProcessor = textProcessor
|
|
276
|
-
this.dpOrt = dpOrt
|
|
277
|
-
this.textEncOrt = textEncOrt
|
|
278
|
-
this.vectorEstOrt = vectorEstOrt
|
|
279
|
-
this.vocoderOrt = vocoderOrt
|
|
280
|
-
|
|
281
|
-
/* extract configuration values */
|
|
282
|
-
this.sampleRate = cfgs.ae.sample_rate
|
|
283
|
-
this.baseChunkSize = cfgs.ae.base_chunk_size
|
|
284
|
-
this.chunkCompressFactor = cfgs.ttl.chunk_compress_factor
|
|
285
|
-
this.latentDim = cfgs.ttl.latent_dim
|
|
286
|
-
}
|
|
287
|
-
|
|
288
|
-
/* sample noisy latent vectors */
|
|
289
|
-
private sampleNoisyLatent (duration: number[]): { noisyLatent: number[][][], latentMask: number[][][] } {
|
|
290
|
-
/* calculate dimensions for latent space */
|
|
291
|
-
const wavLenMax = Math.max(...duration) * this.sampleRate
|
|
292
|
-
const wavLengths = duration.map((d) => Math.floor(d * this.sampleRate))
|
|
293
|
-
const chunkSize = this.baseChunkSize * this.chunkCompressFactor
|
|
294
|
-
const latentLen = Math.floor((wavLenMax + chunkSize - 1) / chunkSize)
|
|
295
|
-
const latentDimExpanded = this.latentDim * this.chunkCompressFactor
|
|
296
|
-
|
|
297
|
-
/* generate random noise (pre-allocate arrays for performance) */
|
|
298
|
-
const noisyLatent: number[][][] = Array.from({ length: duration.length })
|
|
299
|
-
for (let b = 0; b < duration.length; b++) {
|
|
300
|
-
const batch: number[][] = Array.from({ length: latentDimExpanded })
|
|
301
|
-
for (let d = 0; d < latentDimExpanded; d++) {
|
|
302
|
-
const row: number[] = Array.from({ length: latentLen })
|
|
303
|
-
for (let t = 0; t < latentLen; t++) {
|
|
304
|
-
/* Box-Muller transform for normal distribution */
|
|
305
|
-
const eps = 1e-10
|
|
306
|
-
const u1 = Math.max(eps, Math.random())
|
|
307
|
-
const u2 = Math.random()
|
|
308
|
-
row[t] = Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math.PI * u2)
|
|
309
|
-
}
|
|
310
|
-
batch[d] = row
|
|
311
|
-
}
|
|
312
|
-
noisyLatent[b] = batch
|
|
313
|
-
}
|
|
314
|
-
|
|
315
|
-
/* apply mask */
|
|
316
|
-
const latentMask = getLatentMask(wavLengths, this.baseChunkSize, this.chunkCompressFactor)
|
|
317
|
-
for (let b = 0; b < noisyLatent.length; b++) {
|
|
318
|
-
for (let d = 0; d < noisyLatent[b].length; d++) {
|
|
319
|
-
for (let t = 0; t < noisyLatent[b][d].length; t++)
|
|
320
|
-
noisyLatent[b][d][t] *= latentMask[b][0][t]
|
|
321
|
-
}
|
|
322
|
-
}
|
|
323
|
-
return { noisyLatent, latentMask }
|
|
324
|
-
}
|
|
325
|
-
|
|
326
|
-
/* perform inference */
|
|
327
|
-
private async infer (textList: string[], style: SupertonicStyle, totalStep: number, speed: number): Promise<{ wav: number[], duration: number[] }> {
|
|
328
|
-
/* validate batch size matches style vectors */
|
|
329
|
-
if (textList.length !== style.ttl.dims[0])
|
|
330
|
-
throw new Error("Number of texts must match number of style vectors")
|
|
331
|
-
|
|
332
|
-
/* process text into token IDs and masks */
|
|
333
|
-
const batchSize = textList.length
|
|
334
|
-
const { textIds, textMask } = this.textProcessor.call(textList)
|
|
335
|
-
const textIdsShape = [ batchSize, textIds[0].length ]
|
|
336
|
-
const textMaskShape = [ batchSize, 1, textMask[0][0].length ]
|
|
337
|
-
const textMaskTensor = arrayToTensor(textMask, textMaskShape)
|
|
338
|
-
|
|
339
|
-
/* run duration predictor model */
|
|
340
|
-
const dpResult = await this.dpOrt.run({
|
|
341
|
-
text_ids: intArrayToTensor(textIds, textIdsShape),
|
|
342
|
-
style_dp: style.dp,
|
|
343
|
-
text_mask: textMaskTensor
|
|
344
|
-
})
|
|
345
|
-
const predictedDurations = Array.from(dpResult.duration.data as Float32Array)
|
|
346
|
-
|
|
347
|
-
/* apply speed factor to duration */
|
|
348
|
-
for (let i = 0; i < predictedDurations.length; i++)
|
|
349
|
-
predictedDurations[i] /= speed
|
|
350
|
-
|
|
351
|
-
/* run text encoder model */
|
|
352
|
-
const textEncResult = await this.textEncOrt.run({
|
|
353
|
-
text_ids: intArrayToTensor(textIds, textIdsShape),
|
|
354
|
-
style_ttl: style.ttl,
|
|
355
|
-
text_mask: textMaskTensor
|
|
356
|
-
})
|
|
357
|
-
const textEmbTensor = textEncResult.text_emb
|
|
358
|
-
|
|
359
|
-
/* sample initial noisy latent vectors */
|
|
360
|
-
const { noisyLatent, latentMask } = this.sampleNoisyLatent(predictedDurations)
|
|
361
|
-
const latentShape = [ batchSize, noisyLatent[0].length, noisyLatent[0][0].length ]
|
|
362
|
-
const latentMaskShape = [ batchSize, 1, latentMask[0][0].length ]
|
|
363
|
-
const latentMaskTensor = arrayToTensor(latentMask, latentMaskShape)
|
|
364
|
-
|
|
365
|
-
/* prepare step tensors */
|
|
366
|
-
const totalStepArray = Array.from<number>({ length: batchSize }).fill(totalStep)
|
|
367
|
-
const scalarShape = [ batchSize ]
|
|
368
|
-
const totalStepTensor = arrayToTensor(totalStepArray, scalarShape)
|
|
369
|
-
|
|
370
|
-
/* iteratively denoise latent vectors */
|
|
371
|
-
for (let step = 0; step < totalStep; step++) {
|
|
372
|
-
const currentStepArray = Array.from<number>({ length: batchSize }).fill(step)
|
|
373
|
-
|
|
374
|
-
/* run vector estimator model */
|
|
375
|
-
const vectorEstResult = await this.vectorEstOrt.run({
|
|
376
|
-
noisy_latent: arrayToTensor(noisyLatent, latentShape),
|
|
377
|
-
text_emb: textEmbTensor,
|
|
378
|
-
style_ttl: style.ttl,
|
|
379
|
-
text_mask: textMaskTensor,
|
|
380
|
-
latent_mask: latentMaskTensor,
|
|
381
|
-
total_step: totalStepTensor,
|
|
382
|
-
current_step: arrayToTensor(currentStepArray, scalarShape)
|
|
383
|
-
})
|
|
384
|
-
const denoisedLatent = Array.from(vectorEstResult.denoised_latent.data as Float32Array)
|
|
385
|
-
|
|
386
|
-
/* update latent with the denoised output */
|
|
387
|
-
let idx = 0
|
|
388
|
-
for (let b = 0; b < noisyLatent.length; b++)
|
|
389
|
-
for (let d = 0; d < noisyLatent[b].length; d++)
|
|
390
|
-
for (let t = 0; t < noisyLatent[b][d].length; t++)
|
|
391
|
-
noisyLatent[b][d][t] = denoisedLatent[idx++]
|
|
392
|
-
}
|
|
393
|
-
|
|
394
|
-
/* run vocoder to generate audio waveform */
|
|
395
|
-
const vocoderResult = await this.vocoderOrt.run({
|
|
396
|
-
latent: arrayToTensor(noisyLatent, latentShape)
|
|
397
|
-
})
|
|
398
|
-
const wav = Array.from(vocoderResult.wav_tts.data as Float32Array)
|
|
399
|
-
return { wav, duration: predictedDurations }
|
|
400
|
-
}
|
|
401
|
-
|
|
402
|
-
/* synthesize speech from text */
|
|
403
|
-
async synthesize (text: string, style: SupertonicStyle, totalStep: number, speed: number, silenceDuration = 0.3): Promise<{ wav: number[], duration: number }> {
|
|
404
|
-
/* validate single speaker mode */
|
|
405
|
-
if (style.ttl.dims[0] !== 1)
|
|
406
|
-
throw new Error("Single speaker text to speech only supports single style")
|
|
407
|
-
|
|
408
|
-
/* chunk text into segments */
|
|
409
|
-
const textList = chunkText(text)
|
|
410
|
-
if (textList.length === 0)
|
|
411
|
-
return { wav: [], duration: 0 }
|
|
412
|
-
|
|
413
|
-
/* synthesize each chunk and concatenate with silence */
|
|
414
|
-
const wavParts: number[][] = []
|
|
415
|
-
let totalDuration = 0
|
|
416
|
-
for (const chunk of textList) {
|
|
417
|
-
const { wav, duration } = await this.infer([ chunk ], style, totalStep, speed)
|
|
418
|
-
|
|
419
|
-
/* insert silence between chunks */
|
|
420
|
-
if (wavParts.length > 0) {
|
|
421
|
-
const silenceLen = Math.floor(silenceDuration * this.sampleRate)
|
|
422
|
-
wavParts.push(Array.from<number>({ length: silenceLen }).fill(0))
|
|
423
|
-
totalDuration += silenceDuration
|
|
424
|
-
}
|
|
425
|
-
wavParts.push(wav)
|
|
426
|
-
totalDuration += duration[0]
|
|
427
|
-
}
|
|
428
|
-
return { wav: wavParts.flat(), duration: totalDuration }
|
|
429
|
-
}
|
|
430
|
-
|
|
431
|
-
/* release TTS engine resources */
|
|
432
|
-
async release (): Promise<void> {
|
|
433
|
-
/* release all ONNX inference sessions */
|
|
434
|
-
await Promise.all([
|
|
435
|
-
this.dpOrt.release(),
|
|
436
|
-
this.textEncOrt.release(),
|
|
437
|
-
this.vectorEstOrt.release(),
|
|
438
|
-
this.vocoderOrt.release()
|
|
439
|
-
])
|
|
440
|
-
}
|
|
441
|
-
}
|
|
442
|
-
|
|
443
|
-
/* type for voice style JSON file */
|
|
444
|
-
interface VoiceStyleJSON {
|
|
445
|
-
style_ttl: { dims: number[], data: number[][][] }
|
|
446
|
-
style_dp: { dims: number[], data: number[][][] }
|
|
447
|
-
}
|
|
448
|
-
|
|
449
|
-
/* load voice style from JSON file */
|
|
450
|
-
async function loadVoiceStyle (voiceStylePath: string): Promise<SupertonicStyle> {
|
|
451
|
-
/* read and parse voice style JSON */
|
|
452
|
-
let voiceStyle: VoiceStyleJSON
|
|
453
|
-
try {
|
|
454
|
-
voiceStyle = JSON.parse(await fs.promises.readFile(voiceStylePath, "utf8")) as VoiceStyleJSON
|
|
455
|
-
}
|
|
456
|
-
catch (err) {
|
|
457
|
-
throw new Error(`failed to parse voice style JSON "${voiceStylePath}"`, { cause: err })
|
|
458
|
-
}
|
|
459
|
-
|
|
460
|
-
/* extract dimensions and data */
|
|
461
|
-
const ttlDims = voiceStyle.style_ttl.dims
|
|
462
|
-
const dpDims = voiceStyle.style_dp.dims
|
|
463
|
-
const ttlData = voiceStyle.style_ttl.data.flat(Infinity) as number[]
|
|
464
|
-
const dpData = voiceStyle.style_dp.data.flat(Infinity) as number[]
|
|
465
|
-
|
|
466
|
-
/* create ONNX tensors for style vectors */
|
|
467
|
-
const ttlStyle = new ORT.Tensor("float32", Float32Array.from(ttlData), ttlDims)
|
|
468
|
-
const dpStyle = new ORT.Tensor("float32", Float32Array.from(dpData), dpDims)
|
|
469
|
-
return { ttl: ttlStyle, dp: dpStyle }
|
|
470
|
-
}
|
|
471
|
-
|
|
472
|
-
/* load TTS engine from ONNX models */
|
|
473
|
-
async function loadSupertonic (assetsDir: string): Promise<SupertonicTTS> {
|
|
474
|
-
/* load configuration */
|
|
475
|
-
const cfgPath = path.join(assetsDir, "onnx", "tts.json")
|
|
476
|
-
let cfgs: SupertonicConfig
|
|
477
|
-
try {
|
|
478
|
-
cfgs = JSON.parse(await fs.promises.readFile(cfgPath, "utf8"))
|
|
479
|
-
}
|
|
480
|
-
catch (err) {
|
|
481
|
-
throw new Error(`failed to parse TTS config JSON "${cfgPath}"`, { cause: err })
|
|
482
|
-
}
|
|
483
|
-
|
|
484
|
-
/* load text processor */
|
|
485
|
-
const unicodeIndexerPath = path.join(assetsDir, "onnx", "unicode_indexer.json")
|
|
486
|
-
const textProcessor = new SupertonicTextProcessor(unicodeIndexerPath)
|
|
487
|
-
|
|
488
|
-
/* load ONNX models */
|
|
489
|
-
const opts: ORT.InferenceSession.SessionOptions = {}
|
|
490
|
-
const [ dpOrt, textEncOrt, vectorEstOrt, vocoderOrt ] = await Promise.all([
|
|
491
|
-
ORT.InferenceSession.create(path.join(assetsDir, "onnx", "duration_predictor.onnx"), opts),
|
|
492
|
-
ORT.InferenceSession.create(path.join(assetsDir, "onnx", "text_encoder.onnx"), opts),
|
|
493
|
-
ORT.InferenceSession.create(path.join(assetsDir, "onnx", "vector_estimator.onnx"), opts),
|
|
494
|
-
ORT.InferenceSession.create(path.join(assetsDir, "onnx", "vocoder.onnx"), opts)
|
|
495
|
-
])
|
|
496
|
-
return new SupertonicTTS(cfgs, textProcessor, dpOrt, textEncOrt, vectorEstOrt, vocoderOrt)
|
|
497
|
-
}
|
|
498
|
-
|
|
499
|
-
/* ==== SPEECHFLOW NODE IMPLEMENTATION ==== */
|
|
500
|
-
|
|
501
19
|
/* SpeechFlow node for Supertonic text-to-speech conversion */
|
|
502
20
|
export default class SpeechFlowNodeT2ASupertonic extends SpeechFlowNode {
|
|
503
21
|
/* declare official node name */
|
|
504
22
|
public static name = "t2a-supertonic"
|
|
505
23
|
|
|
506
24
|
/* internal state */
|
|
507
|
-
private
|
|
508
|
-
private
|
|
509
|
-
private
|
|
510
|
-
private closing
|
|
25
|
+
private tts: Transformers.TextToAudioPipeline | null = null
|
|
26
|
+
private resampler: SpeexResampler | null = null
|
|
27
|
+
private sampleRate = 44100
|
|
28
|
+
private closing = false
|
|
511
29
|
|
|
512
30
|
/* construct node */
|
|
513
31
|
constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
|
|
@@ -530,89 +48,110 @@ export default class SpeechFlowNodeT2ASupertonic extends SpeechFlowNode {
|
|
|
530
48
|
return {}
|
|
531
49
|
}
|
|
532
50
|
|
|
533
|
-
/* download HuggingFace assets */
|
|
534
|
-
private async downloadAssets () {
|
|
535
|
-
/* define HuggingFace repository and required files */
|
|
536
|
-
const assetRepo = "Supertone/supertonic"
|
|
537
|
-
const assetFiles = [
|
|
538
|
-
"voice_styles/F1.json",
|
|
539
|
-
"voice_styles/F2.json",
|
|
540
|
-
"voice_styles/M1.json",
|
|
541
|
-
"voice_styles/M2.json",
|
|
542
|
-
"onnx/tts.json",
|
|
543
|
-
"onnx/duration_predictor.onnx",
|
|
544
|
-
"onnx/text_encoder.onnx",
|
|
545
|
-
"onnx/unicode_indexer.json",
|
|
546
|
-
"onnx/vector_estimator.onnx",
|
|
547
|
-
"onnx/vocoder.onnx"
|
|
548
|
-
]
|
|
549
|
-
|
|
550
|
-
/* create asset directories */
|
|
551
|
-
const assetDir = path.join(this.config.cacheDir, "supertonic")
|
|
552
|
-
await mkdirp(path.join(assetDir, "voice_styles"), { mode: 0o750 })
|
|
553
|
-
await mkdirp(path.join(assetDir, "onnx"), { mode: 0o750 })
|
|
554
|
-
|
|
555
|
-
/* download missing asset files */
|
|
556
|
-
for (const assetFile of assetFiles) {
|
|
557
|
-
const url = `${assetRepo}/${assetFile}`
|
|
558
|
-
const file = path.join(assetDir, assetFile)
|
|
559
|
-
const stat = await fs.promises.stat(file).catch((_err) => null)
|
|
560
|
-
if (stat === null || !stat.isFile()) {
|
|
561
|
-
this.log("info", `downloading from HuggingFace "${url}"`)
|
|
562
|
-
const response = await HF.downloadFile({ repo: assetRepo, path: assetFile })
|
|
563
|
-
if (!response)
|
|
564
|
-
throw new Error(`failed to download from HuggingFace "${url}"`)
|
|
565
|
-
const buffer = Buffer.from(await response.arrayBuffer())
|
|
566
|
-
await fs.promises.writeFile(file, buffer)
|
|
567
|
-
}
|
|
568
|
-
}
|
|
569
|
-
return assetDir
|
|
570
|
-
}
|
|
571
|
-
|
|
572
51
|
/* open node */
|
|
573
52
|
async open () {
|
|
574
53
|
this.closing = false
|
|
575
54
|
|
|
576
|
-
/*
|
|
577
|
-
const
|
|
55
|
+
/* load Supertonic TTS pipeline via transformers.js */
|
|
56
|
+
const model = "onnx-community/Supertonic-TTS-ONNX"
|
|
57
|
+
this.log("info", `loading Supertonic TTS model "${model}"`)
|
|
58
|
+
|
|
59
|
+
/* track download progress */
|
|
60
|
+
const progressState = new Map<string, number>()
|
|
61
|
+
const progressCallback = (progress: any) => {
|
|
62
|
+
let artifact = model
|
|
63
|
+
if (typeof progress.file === "string")
|
|
64
|
+
artifact += `:${progress.file}`
|
|
65
|
+
let percent = 0
|
|
66
|
+
if (typeof progress.loaded === "number" && typeof progress.total === "number")
|
|
67
|
+
percent = (progress.loaded / progress.total) * 100
|
|
68
|
+
else if (typeof progress.progress === "number")
|
|
69
|
+
percent = progress.progress
|
|
70
|
+
if (percent > 0)
|
|
71
|
+
progressState.set(artifact, percent)
|
|
72
|
+
}
|
|
73
|
+
let interval: ReturnType<typeof setInterval> | null = setInterval(() => {
|
|
74
|
+
for (const [ artifact, percent ] of progressState) {
|
|
75
|
+
this.log("info", `downloaded ${percent.toFixed(2)}% of artifact "${artifact}"`)
|
|
76
|
+
if (percent >= 100.0)
|
|
77
|
+
progressState.delete(artifact)
|
|
78
|
+
}
|
|
79
|
+
if (progressState.size === 0 && interval !== null) {
|
|
80
|
+
clearInterval(interval)
|
|
81
|
+
interval = null
|
|
82
|
+
}
|
|
83
|
+
}, 1000)
|
|
578
84
|
|
|
579
|
-
/*
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
85
|
+
/* create TTS pipeline */
|
|
86
|
+
try {
|
|
87
|
+
const tts = Transformers.pipeline("text-to-speech", model, {
|
|
88
|
+
dtype: "fp32",
|
|
89
|
+
progress_callback: progressCallback
|
|
90
|
+
})
|
|
91
|
+
this.tts = await tts
|
|
92
|
+
}
|
|
93
|
+
finally {
|
|
94
|
+
if (interval !== null) {
|
|
95
|
+
clearInterval(interval)
|
|
96
|
+
interval = null
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
if (this.tts === null)
|
|
100
|
+
throw new Error("failed to instantiate Supertonic TTS pipeline")
|
|
583
101
|
|
|
584
|
-
/*
|
|
585
|
-
const
|
|
586
|
-
if (
|
|
587
|
-
|
|
588
|
-
this.log("info", `
|
|
589
|
-
this.style = await loadVoiceStyle(voiceStylePath)
|
|
590
|
-
this.log("info", `loaded voice style "${this.params.voice}"`)
|
|
102
|
+
/* determine sample rate from model config */
|
|
103
|
+
const config = (this.tts as any).model?.config
|
|
104
|
+
if (config?.sampling_rate)
|
|
105
|
+
this.sampleRate = config.sampling_rate
|
|
106
|
+
this.log("info", `loaded Supertonic TTS model (sample rate: ${this.sampleRate}Hz)`)
|
|
591
107
|
|
|
592
108
|
/* establish resampler from Supertonic's output sample rate to our standard audio sample rate (48kHz) */
|
|
593
|
-
this.resampler = new SpeexResampler(1, this.
|
|
109
|
+
this.resampler = new SpeexResampler(1, this.sampleRate, this.config.audioSampleRate, 7)
|
|
110
|
+
|
|
111
|
+
/* map voice names to speaker embedding URLs */
|
|
112
|
+
const voiceUrls: Record<string, string> = {
|
|
113
|
+
"M1": "https://huggingface.co/onnx-community/Supertonic-TTS-ONNX/resolve/main/voices/M1.bin",
|
|
114
|
+
"M2": "https://huggingface.co/onnx-community/Supertonic-TTS-ONNX/resolve/main/voices/M2.bin",
|
|
115
|
+
"F1": "https://huggingface.co/onnx-community/Supertonic-TTS-ONNX/resolve/main/voices/F1.bin",
|
|
116
|
+
"F2": "https://huggingface.co/onnx-community/Supertonic-TTS-ONNX/resolve/main/voices/F2.bin"
|
|
117
|
+
}
|
|
118
|
+
const speakerEmbeddings = voiceUrls[this.params.voice]
|
|
119
|
+
if (speakerEmbeddings === undefined)
|
|
120
|
+
throw new Error(`invalid Supertonic voice "${this.params.voice}"`)
|
|
121
|
+
this.log("info", `using voice "${this.params.voice}"`)
|
|
594
122
|
|
|
595
123
|
/* perform text-to-speech operation with Supertonic */
|
|
596
124
|
const text2speech = async (text: string) => {
|
|
597
|
-
/* synthesize speech from text */
|
|
598
125
|
this.log("info", `Supertonic: input: "${text}"`)
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
this.params.
|
|
604
|
-
|
|
126
|
+
|
|
127
|
+
/* generate speech using transformers.js pipeline */
|
|
128
|
+
const result = await this.tts!(text, {
|
|
129
|
+
speaker_embeddings: speakerEmbeddings,
|
|
130
|
+
num_inference_steps: this.params.steps,
|
|
131
|
+
speed: this.params.speed
|
|
132
|
+
})
|
|
133
|
+
|
|
134
|
+
/* extract audio samples and sample rate */
|
|
135
|
+
if (!(result.audio instanceof Float32Array))
|
|
136
|
+
throw new Error("unexpected Supertonic result: audio is not a Float32Array")
|
|
137
|
+
if (typeof result.sampling_rate !== "number")
|
|
138
|
+
throw new Error("unexpected Supertonic result: sampling_rate is not a number")
|
|
139
|
+
const samples = result.audio
|
|
140
|
+
const outputSampleRate = result.sampling_rate
|
|
141
|
+
if (outputSampleRate !== this.sampleRate)
|
|
142
|
+
this.log("warn", `unexpected sample rate ${outputSampleRate}Hz (expected ${this.sampleRate}Hz)`)
|
|
143
|
+
|
|
144
|
+
/* calculate duration */
|
|
145
|
+
const duration = samples.length / outputSampleRate
|
|
605
146
|
this.log("info", `Supertonic: synthesized ${duration.toFixed(2)}s of audio`)
|
|
606
147
|
|
|
607
148
|
/* convert audio samples from PCM/F32 to PCM/I16 */
|
|
608
|
-
const buffer1 =
|
|
609
|
-
for (let i = 0; i < wav.length; i++) {
|
|
610
|
-
const sample = Math.max(-1, Math.min(1, wav[i]))
|
|
611
|
-
buffer1.writeInt16LE(sample * 0x7FFF, i * 2)
|
|
612
|
-
}
|
|
149
|
+
const buffer1 = util.convertF32ToBuf(samples)
|
|
613
150
|
|
|
614
151
|
/* resample audio samples from Supertonic sample rate to 48kHz */
|
|
615
|
-
|
|
152
|
+
if (this.resampler === null)
|
|
153
|
+
throw new Error("resampler destroyed during TTS processing")
|
|
154
|
+
return this.resampler.processChunk(buffer1)
|
|
616
155
|
}
|
|
617
156
|
|
|
618
157
|
/* create transform stream and connect it to the Supertonic TTS */
|
|
@@ -622,7 +161,7 @@ export default class SpeechFlowNodeT2ASupertonic extends SpeechFlowNode {
|
|
|
622
161
|
readableObjectMode: true,
|
|
623
162
|
decodeStrings: false,
|
|
624
163
|
highWaterMark: 1,
|
|
625
|
-
|
|
164
|
+
transform (chunk: SpeechFlowChunk, encoding, callback) {
|
|
626
165
|
if (self.closing)
|
|
627
166
|
callback(new Error("stream already destroyed"))
|
|
628
167
|
else if (Buffer.isBuffer(chunk.payload))
|
|
@@ -640,13 +179,7 @@ export default class SpeechFlowNodeT2ASupertonic extends SpeechFlowNode {
|
|
|
640
179
|
processTimeout = null
|
|
641
180
|
}
|
|
642
181
|
}
|
|
643
|
-
|
|
644
|
-
if (self.closing) {
|
|
645
|
-
clearProcessTimeout()
|
|
646
|
-
callback(new Error("stream destroyed during processing"))
|
|
647
|
-
return
|
|
648
|
-
}
|
|
649
|
-
const buffer = await text2speech(chunk.payload as string)
|
|
182
|
+
text2speech(chunk.payload as string).then((buffer) => {
|
|
650
183
|
if (self.closing) {
|
|
651
184
|
clearProcessTimeout()
|
|
652
185
|
callback(new Error("stream destroyed during processing"))
|
|
@@ -663,17 +196,13 @@ export default class SpeechFlowNodeT2ASupertonic extends SpeechFlowNode {
|
|
|
663
196
|
chunkNew.type = "audio"
|
|
664
197
|
chunkNew.payload = buffer
|
|
665
198
|
chunkNew.timestampEnd = Duration.fromMillis(chunkNew.timestampStart.toMillis() + durationMs)
|
|
666
|
-
|
|
667
|
-
/* push chunk and complete transform */
|
|
668
199
|
clearProcessTimeout()
|
|
669
200
|
this.push(chunkNew)
|
|
670
201
|
callback()
|
|
671
|
-
}
|
|
672
|
-
catch (error) {
|
|
673
|
-
/* handle processing errors */
|
|
202
|
+
}).catch((error: unknown) => {
|
|
674
203
|
clearProcessTimeout()
|
|
675
204
|
callback(util.ensureError(error, "Supertonic processing failed"))
|
|
676
|
-
}
|
|
205
|
+
})
|
|
677
206
|
}
|
|
678
207
|
},
|
|
679
208
|
final (callback) {
|
|
@@ -693,18 +222,15 @@ export default class SpeechFlowNodeT2ASupertonic extends SpeechFlowNode {
|
|
|
693
222
|
this.stream = null
|
|
694
223
|
}
|
|
695
224
|
|
|
696
|
-
/* destroy voice style */
|
|
697
|
-
if (this.style !== null)
|
|
698
|
-
this.style = null
|
|
699
|
-
|
|
700
225
|
/* destroy resampler */
|
|
701
226
|
if (this.resampler !== null)
|
|
702
227
|
this.resampler = null
|
|
703
228
|
|
|
704
|
-
/* destroy
|
|
705
|
-
if (this.
|
|
706
|
-
|
|
707
|
-
this.
|
|
229
|
+
/* destroy TTS pipeline */
|
|
230
|
+
if (this.tts !== null) {
|
|
231
|
+
/* dispose of the pipeline if possible */
|
|
232
|
+
await this.tts.dispose()
|
|
233
|
+
this.tts = null
|
|
708
234
|
}
|
|
709
235
|
}
|
|
710
236
|
}
|