speechflow 2.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/README.md +4 -4
- package/package.json +2 -2
- package/speechflow-cli/dst/speechflow-main-api.js.map +1 -1
- package/speechflow-cli/dst/speechflow-main-cli.js +1 -0
- package/speechflow-cli/dst/speechflow-main-cli.js.map +1 -1
- package/speechflow-cli/dst/speechflow-main-graph.js +2 -4
- package/speechflow-cli/dst/speechflow-main-graph.js.map +1 -1
- package/speechflow-cli/dst/speechflow-main-nodes.js +1 -0
- package/speechflow-cli/dst/speechflow-main-nodes.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js +1 -0
- package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-compressor.js +7 -9
- package/speechflow-cli/dst/speechflow-node-a2a-compressor.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js +1 -0
- package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-expander.js +8 -9
- package/speechflow-cli/dst/speechflow-node-a2a-expander.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-filler.js +2 -0
- package/speechflow-cli/dst/speechflow-node-a2a-filler.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-gender.js +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-gender.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-meter.js +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-pitch.js +11 -9
- package/speechflow-cli/dst/speechflow-node-a2a-pitch.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-rnnoise-wt.js +1 -0
- package/speechflow-cli/dst/speechflow-node-a2a-rnnoise-wt.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-rnnoise.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-speex.js +4 -2
- package/speechflow-cli/dst/speechflow-node-a2a-speex.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-vad.js +19 -22
- package/speechflow-cli/dst/speechflow-node-a2a-vad.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-wav.js +7 -0
- package/speechflow-cli/dst/speechflow-node-a2a-wav.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2t-amazon.d.ts +0 -1
- package/speechflow-cli/dst/speechflow-node-a2t-amazon.js +2 -11
- package/speechflow-cli/dst/speechflow-node-a2t-amazon.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2t-google.d.ts +0 -1
- package/speechflow-cli/dst/speechflow-node-a2t-google.js +0 -6
- package/speechflow-cli/dst/speechflow-node-a2t-google.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2t-openai.js +6 -1
- package/speechflow-cli/dst/speechflow-node-a2t-openai.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-amazon.d.ts +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-amazon.js +27 -7
- package/speechflow-cli/dst/speechflow-node-t2a-amazon.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.d.ts +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js +5 -3
- package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-google.js +1 -4
- package/speechflow-cli/dst/speechflow-node-t2a-google.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-kokoro.d.ts +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js +27 -6
- package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-openai.js +1 -4
- package/speechflow-cli/dst/speechflow-node-t2a-openai.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js +15 -4
- package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-amazon.js +0 -2
- package/speechflow-cli/dst/speechflow-node-t2t-amazon.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-deepl.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-google.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-opus.js +18 -16
- package/speechflow-cli/dst/speechflow-node-t2t-opus.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-spellcheck.js +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js +5 -2
- package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-summary.js +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-summary.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-x2x-filter.js +2 -0
- package/speechflow-cli/dst/speechflow-node-x2x-filter.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-exec.js +1 -0
- package/speechflow-cli/dst/speechflow-node-xio-exec.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-file.js +3 -5
- package/speechflow-cli/dst/speechflow-node-xio-file.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-mqtt.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-vban.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-webrtc.js +2 -0
- package/speechflow-cli/dst/speechflow-node-xio-webrtc.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-websocket.js +9 -9
- package/speechflow-cli/dst/speechflow-node-xio-websocket.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-audio.js +4 -0
- package/speechflow-cli/dst/speechflow-util-audio.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-queue.js +2 -1
- package/speechflow-cli/dst/speechflow-util-queue.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util.js +1 -0
- package/speechflow-cli/dst/speechflow-util.js.map +1 -1
- package/speechflow-cli/package.json +10 -10
- package/speechflow-cli/src/speechflow-main-api.ts +16 -16
- package/speechflow-cli/src/speechflow-main-cli.ts +1 -0
- package/speechflow-cli/src/speechflow-main-graph.ts +7 -9
- package/speechflow-cli/src/speechflow-main-nodes.ts +1 -0
- package/speechflow-cli/src/speechflow-node-a2a-compressor-wt.ts +1 -0
- package/speechflow-cli/src/speechflow-node-a2a-compressor.ts +8 -10
- package/speechflow-cli/src/speechflow-node-a2a-expander-wt.ts +1 -0
- package/speechflow-cli/src/speechflow-node-a2a-expander.ts +9 -10
- package/speechflow-cli/src/speechflow-node-a2a-filler.ts +2 -0
- package/speechflow-cli/src/speechflow-node-a2a-gender.ts +3 -3
- package/speechflow-cli/src/speechflow-node-a2a-meter.ts +2 -2
- package/speechflow-cli/src/speechflow-node-a2a-pitch.ts +11 -9
- package/speechflow-cli/src/speechflow-node-a2a-rnnoise-wt.ts +1 -0
- package/speechflow-cli/src/speechflow-node-a2a-rnnoise.ts +1 -1
- package/speechflow-cli/src/speechflow-node-a2a-speex.ts +5 -3
- package/speechflow-cli/src/speechflow-node-a2a-vad.ts +20 -23
- package/speechflow-cli/src/speechflow-node-a2a-wav.ts +7 -0
- package/speechflow-cli/src/speechflow-node-a2t-amazon.ts +6 -18
- package/speechflow-cli/src/speechflow-node-a2t-google.ts +4 -11
- package/speechflow-cli/src/speechflow-node-a2t-openai.ts +12 -7
- package/speechflow-cli/src/speechflow-node-t2a-amazon.ts +32 -10
- package/speechflow-cli/src/speechflow-node-t2a-elevenlabs.ts +6 -4
- package/speechflow-cli/src/speechflow-node-t2a-google.ts +1 -4
- package/speechflow-cli/src/speechflow-node-t2a-kokoro.ts +33 -10
- package/speechflow-cli/src/speechflow-node-t2a-openai.ts +1 -4
- package/speechflow-cli/src/speechflow-node-t2a-supertonic.ts +15 -6
- package/speechflow-cli/src/speechflow-node-t2t-amazon.ts +1 -3
- package/speechflow-cli/src/speechflow-node-t2t-deepl.ts +2 -2
- package/speechflow-cli/src/speechflow-node-t2t-google.ts +1 -1
- package/speechflow-cli/src/speechflow-node-t2t-opus.ts +19 -18
- package/speechflow-cli/src/speechflow-node-t2t-punctuation.ts +1 -1
- package/speechflow-cli/src/speechflow-node-t2t-spellcheck.ts +1 -1
- package/speechflow-cli/src/speechflow-node-t2t-subtitle.ts +5 -2
- package/speechflow-cli/src/speechflow-node-t2t-summary.ts +1 -1
- package/speechflow-cli/src/speechflow-node-x2x-filter.ts +2 -0
- package/speechflow-cli/src/speechflow-node-xio-exec.ts +1 -0
- package/speechflow-cli/src/speechflow-node-xio-file.ts +3 -5
- package/speechflow-cli/src/speechflow-node-xio-mqtt.ts +2 -2
- package/speechflow-cli/src/speechflow-node-xio-vban.ts +5 -5
- package/speechflow-cli/src/speechflow-node-xio-webrtc.ts +2 -0
- package/speechflow-cli/src/speechflow-node-xio-websocket.ts +9 -9
- package/speechflow-cli/src/speechflow-util-audio.ts +5 -0
- package/speechflow-cli/src/speechflow-util-queue.ts +3 -3
- package/speechflow-cli/src/speechflow-util.ts +1 -0
- package/speechflow-ui-db/package.json +4 -4
- package/speechflow-ui-st/package.json +4 -4
|
@@ -9,6 +9,7 @@ import Stream from "node:stream"
|
|
|
9
9
|
|
|
10
10
|
/* external dependencies */
|
|
11
11
|
import { getStreamAsBuffer } from "get-stream"
|
|
12
|
+
import { Duration } from "luxon"
|
|
12
13
|
import SpeexResampler from "speex-resampler"
|
|
13
14
|
import {
|
|
14
15
|
PollyClient, SynthesizeSpeechCommand,
|
|
@@ -25,9 +26,9 @@ export default class SpeechFlowNodeT2AAmazon extends SpeechFlowNode {
|
|
|
25
26
|
public static name = "t2a-amazon"
|
|
26
27
|
|
|
27
28
|
/* internal state */
|
|
28
|
-
private client:
|
|
29
|
-
private closing = false
|
|
29
|
+
private client: PollyClient | null = null
|
|
30
30
|
private resampler: SpeexResampler | null = null
|
|
31
|
+
private closing = false
|
|
31
32
|
|
|
32
33
|
/* construct node */
|
|
33
34
|
constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
|
|
@@ -129,22 +130,43 @@ export default class SpeechFlowNodeT2AAmazon extends SpeechFlowNode {
|
|
|
129
130
|
}
|
|
130
131
|
if (Buffer.isBuffer(chunk.payload))
|
|
131
132
|
callback(new Error("invalid chunk payload type"))
|
|
132
|
-
else if (chunk.payload
|
|
133
|
+
else if (chunk.payload === "")
|
|
134
|
+
callback()
|
|
135
|
+
else {
|
|
136
|
+
let processTimeout: ReturnType<typeof setTimeout> | null = setTimeout(() => {
|
|
137
|
+
processTimeout = null
|
|
138
|
+
callback(new Error("AWS Polly API timeout"))
|
|
139
|
+
}, 60 * 1000)
|
|
140
|
+
const clearProcessTimeout = () => {
|
|
141
|
+
if (processTimeout !== null) {
|
|
142
|
+
clearTimeout(processTimeout)
|
|
143
|
+
processTimeout = null
|
|
144
|
+
}
|
|
145
|
+
}
|
|
133
146
|
self.log("debug", `send data (${chunk.payload.length} bytes): "${chunk.payload}"`)
|
|
134
147
|
textToSpeech(chunk.payload as string).then((buffer) => {
|
|
135
|
-
if (self.closing)
|
|
136
|
-
|
|
148
|
+
if (self.closing) {
|
|
149
|
+
clearProcessTimeout()
|
|
150
|
+
callback(new Error("stream destroyed during processing"))
|
|
151
|
+
return
|
|
152
|
+
}
|
|
153
|
+
/* calculate actual audio duration from PCM buffer size */
|
|
154
|
+
const durationMs = util.audioBufferDuration(buffer,
|
|
155
|
+
self.config.audioSampleRate, self.config.audioBitDepth) * 1000
|
|
156
|
+
|
|
157
|
+
/* create new chunk with recalculated timestamps */
|
|
137
158
|
const chunkNew = chunk.clone()
|
|
138
|
-
chunkNew.type
|
|
139
|
-
chunkNew.payload
|
|
159
|
+
chunkNew.type = "audio"
|
|
160
|
+
chunkNew.payload = buffer
|
|
161
|
+
chunkNew.timestampEnd = Duration.fromMillis(chunkNew.timestampStart.toMillis() + durationMs)
|
|
162
|
+
clearProcessTimeout()
|
|
140
163
|
this.push(chunkNew)
|
|
141
164
|
callback()
|
|
142
165
|
}).catch((error: unknown) => {
|
|
143
|
-
|
|
166
|
+
clearProcessTimeout()
|
|
167
|
+
callback(util.ensureError(error, "AWS Polly processing failed"))
|
|
144
168
|
})
|
|
145
169
|
}
|
|
146
|
-
else
|
|
147
|
-
callback()
|
|
148
170
|
},
|
|
149
171
|
final (callback) {
|
|
150
172
|
callback()
|
|
@@ -24,8 +24,8 @@ export default class SpeechFlowNodeT2AElevenlabs extends SpeechFlowNode {
|
|
|
24
24
|
|
|
25
25
|
/* internal state */
|
|
26
26
|
private elevenlabs: ElevenLabs.ElevenLabsClient | null = null
|
|
27
|
-
private
|
|
28
|
-
private
|
|
27
|
+
private resampler: SpeexResampler | null = null
|
|
28
|
+
private closing = false
|
|
29
29
|
|
|
30
30
|
/* construct node */
|
|
31
31
|
constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
|
|
@@ -131,8 +131,8 @@ export default class SpeechFlowNodeT2AElevenlabs extends SpeechFlowNode {
|
|
|
131
131
|
})
|
|
132
132
|
}
|
|
133
133
|
|
|
134
|
-
/* establish resampler from ElevenLabs's
|
|
135
|
-
output to our standard audio sample rate (48KHz) */
|
|
134
|
+
/* establish resampler from ElevenLabs's tier-dependent
|
|
135
|
+
output sample rate to our standard audio sample rate (48KHz) */
|
|
136
136
|
this.resampler = new SpeexResampler(1, maxSampleRate, this.config.audioSampleRate, 7)
|
|
137
137
|
|
|
138
138
|
/* create transform stream and connect it to the ElevenLabs API */
|
|
@@ -147,6 +147,8 @@ export default class SpeechFlowNodeT2AElevenlabs extends SpeechFlowNode {
|
|
|
147
147
|
callback(new Error("stream already destroyed"))
|
|
148
148
|
else if (Buffer.isBuffer(chunk.payload))
|
|
149
149
|
callback(new Error("invalid chunk payload type"))
|
|
150
|
+
else if (chunk.payload === "")
|
|
151
|
+
callback()
|
|
150
152
|
else {
|
|
151
153
|
let processTimeout: ReturnType<typeof setTimeout> | null = setTimeout(() => {
|
|
152
154
|
processTimeout = null
|
|
@@ -126,11 +126,8 @@ export default class SpeechFlowNodeT2AGoogle extends SpeechFlowNode {
|
|
|
126
126
|
callback(new Error("stream already destroyed"))
|
|
127
127
|
else if (Buffer.isBuffer(chunk.payload))
|
|
128
128
|
callback(new Error("invalid chunk payload type"))
|
|
129
|
-
else if (chunk.payload === "")
|
|
130
|
-
/* pass through empty chunks */
|
|
131
|
-
this.push(chunk)
|
|
129
|
+
else if (chunk.payload === "")
|
|
132
130
|
callback()
|
|
133
|
-
}
|
|
134
131
|
else {
|
|
135
132
|
let processTimeout: ReturnType<typeof setTimeout> | null = setTimeout(() => {
|
|
136
133
|
processTimeout = null
|
|
@@ -9,6 +9,7 @@ import Stream from "node:stream"
|
|
|
9
9
|
|
|
10
10
|
/* external dependencies */
|
|
11
11
|
import { KokoroTTS } from "kokoro-js"
|
|
12
|
+
import { Duration } from "luxon"
|
|
12
13
|
import SpeexResampler from "speex-resampler"
|
|
13
14
|
|
|
14
15
|
/* internal dependencies */
|
|
@@ -21,9 +22,9 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
|
|
|
21
22
|
public static name = "t2a-kokoro"
|
|
22
23
|
|
|
23
24
|
/* internal state */
|
|
24
|
-
private kokoro:
|
|
25
|
-
private closing = false
|
|
25
|
+
private kokoro: KokoroTTS | null = null
|
|
26
26
|
private resampler: SpeexResampler | null = null
|
|
27
|
+
private closing = false
|
|
27
28
|
|
|
28
29
|
/* construct node */
|
|
29
30
|
constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
|
|
@@ -122,9 +123,7 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
|
|
|
122
123
|
}
|
|
123
124
|
|
|
124
125
|
/* resample audio samples from PCM/I16/24Khz to PCM/I16/48KHz */
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
return buffer2
|
|
126
|
+
return this.resampler!.processChunk(buffer1)
|
|
128
127
|
}
|
|
129
128
|
|
|
130
129
|
/* create transform stream and connect it to the Kokoro API */
|
|
@@ -139,18 +138,42 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
|
|
|
139
138
|
callback(new Error("stream already destroyed"))
|
|
140
139
|
else if (Buffer.isBuffer(chunk.payload))
|
|
141
140
|
callback(new Error("invalid chunk payload type"))
|
|
141
|
+
else if (chunk.payload === "")
|
|
142
|
+
callback()
|
|
142
143
|
else {
|
|
144
|
+
let processTimeout: ReturnType<typeof setTimeout> | null = setTimeout(() => {
|
|
145
|
+
processTimeout = null
|
|
146
|
+
callback(new Error("Kokoro TTS timeout"))
|
|
147
|
+
}, 60 * 1000)
|
|
148
|
+
const clearProcessTimeout = () => {
|
|
149
|
+
if (processTimeout !== null) {
|
|
150
|
+
clearTimeout(processTimeout)
|
|
151
|
+
processTimeout = null
|
|
152
|
+
}
|
|
153
|
+
}
|
|
143
154
|
text2speech(chunk.payload).then((buffer) => {
|
|
144
|
-
if (self.closing)
|
|
145
|
-
|
|
155
|
+
if (self.closing) {
|
|
156
|
+
clearProcessTimeout()
|
|
157
|
+
callback(new Error("stream destroyed during processing"))
|
|
158
|
+
return
|
|
159
|
+
}
|
|
146
160
|
self.log("info", `Kokoro: received audio (buffer length: ${buffer.byteLength})`)
|
|
161
|
+
|
|
162
|
+
/* calculate actual audio duration from PCM buffer size */
|
|
163
|
+
const durationMs = util.audioBufferDuration(buffer,
|
|
164
|
+
self.config.audioSampleRate, self.config.audioBitDepth) * 1000
|
|
165
|
+
|
|
166
|
+
/* create new chunk with recalculated timestamps */
|
|
147
167
|
const chunkNew = chunk.clone()
|
|
148
|
-
chunkNew.type
|
|
149
|
-
chunkNew.payload
|
|
168
|
+
chunkNew.type = "audio"
|
|
169
|
+
chunkNew.payload = buffer
|
|
170
|
+
chunkNew.timestampEnd = Duration.fromMillis(chunkNew.timestampStart.toMillis() + durationMs)
|
|
171
|
+
clearProcessTimeout()
|
|
150
172
|
this.push(chunkNew)
|
|
151
173
|
callback()
|
|
152
174
|
}).catch((error: unknown) => {
|
|
153
|
-
|
|
175
|
+
clearProcessTimeout()
|
|
176
|
+
callback(util.ensureError(error, "Kokoro processing failed"))
|
|
154
177
|
})
|
|
155
178
|
}
|
|
156
179
|
},
|
|
@@ -103,11 +103,8 @@ export default class SpeechFlowNodeT2AOpenAI extends SpeechFlowNode {
|
|
|
103
103
|
callback(new Error("stream already destroyed"))
|
|
104
104
|
else if (Buffer.isBuffer(chunk.payload))
|
|
105
105
|
callback(new Error("invalid chunk payload type"))
|
|
106
|
-
else if (chunk.payload === "")
|
|
107
|
-
/* pass through empty chunks */
|
|
108
|
-
this.push(chunk)
|
|
106
|
+
else if (chunk.payload === "")
|
|
109
107
|
callback()
|
|
110
|
-
}
|
|
111
108
|
else {
|
|
112
109
|
let processTimeout: ReturnType<typeof setTimeout> | null = setTimeout(() => {
|
|
113
110
|
processTimeout = null
|
|
@@ -131,6 +131,7 @@ function chunkText (text: string, maxLen = 300): string[] {
|
|
|
131
131
|
class SupertonicTextProcessor {
|
|
132
132
|
private indexer: Record<number, number>
|
|
133
133
|
|
|
134
|
+
/* construct text processor */
|
|
134
135
|
constructor (unicodeIndexerJsonPath: string) {
|
|
135
136
|
/* load and parse unicode indexer JSON */
|
|
136
137
|
try {
|
|
@@ -141,6 +142,7 @@ class SupertonicTextProcessor {
|
|
|
141
142
|
}
|
|
142
143
|
}
|
|
143
144
|
|
|
145
|
+
/* preprocess text */
|
|
144
146
|
private preprocessText (text: string): string {
|
|
145
147
|
/* normalize text */
|
|
146
148
|
text = text.normalize("NFKD")
|
|
@@ -211,11 +213,13 @@ class SupertonicTextProcessor {
|
|
|
211
213
|
return text
|
|
212
214
|
}
|
|
213
215
|
|
|
216
|
+
/* convert text to Unicode values */
|
|
214
217
|
private textToUnicodeValues (text: string): number[] {
|
|
215
218
|
/* convert text characters to unicode code points */
|
|
216
219
|
return Array.from(text).map((char) => char.charCodeAt(0))
|
|
217
220
|
}
|
|
218
221
|
|
|
222
|
+
/* process text list */
|
|
219
223
|
call (textList: string[]): { textIds: number[][], textMask: number[][][] } {
|
|
220
224
|
/* handle empty input */
|
|
221
225
|
if (textList.length === 0)
|
|
@@ -246,6 +250,7 @@ class SupertonicTextProcessor {
|
|
|
246
250
|
class SupertonicTTS {
|
|
247
251
|
public sampleRate: number
|
|
248
252
|
|
|
253
|
+
/* internal TTS state */
|
|
249
254
|
private cfgs: SupertonicConfig
|
|
250
255
|
private textProcessor: SupertonicTextProcessor
|
|
251
256
|
private dpOrt: ORT.InferenceSession
|
|
@@ -256,6 +261,7 @@ class SupertonicTTS {
|
|
|
256
261
|
private chunkCompressFactor: number
|
|
257
262
|
private latentDim: number
|
|
258
263
|
|
|
264
|
+
/* construct TTS engine */
|
|
259
265
|
constructor (
|
|
260
266
|
cfgs: SupertonicConfig,
|
|
261
267
|
textProcessor: SupertonicTextProcessor,
|
|
@@ -279,6 +285,7 @@ class SupertonicTTS {
|
|
|
279
285
|
this.latentDim = cfgs.ttl.latent_dim
|
|
280
286
|
}
|
|
281
287
|
|
|
288
|
+
/* sample noisy latent vectors */
|
|
282
289
|
private sampleNoisyLatent (duration: number[]): { noisyLatent: number[][][], latentMask: number[][][] } {
|
|
283
290
|
/* calculate dimensions for latent space */
|
|
284
291
|
const wavLenMax = Math.max(...duration) * this.sampleRate
|
|
@@ -294,7 +301,6 @@ class SupertonicTTS {
|
|
|
294
301
|
for (let d = 0; d < latentDimExpanded; d++) {
|
|
295
302
|
const row: number[] = Array.from({ length: latentLen })
|
|
296
303
|
for (let t = 0; t < latentLen; t++) {
|
|
297
|
-
|
|
298
304
|
/* Box-Muller transform for normal distribution */
|
|
299
305
|
const eps = 1e-10
|
|
300
306
|
const u1 = Math.max(eps, Math.random())
|
|
@@ -317,6 +323,7 @@ class SupertonicTTS {
|
|
|
317
323
|
return { noisyLatent, latentMask }
|
|
318
324
|
}
|
|
319
325
|
|
|
326
|
+
/* perform inference */
|
|
320
327
|
private async infer (textList: string[], style: SupertonicStyle, totalStep: number, speed: number): Promise<{ wav: number[], duration: number[] }> {
|
|
321
328
|
/* validate batch size matches style vectors */
|
|
322
329
|
if (textList.length !== style.ttl.dims[0])
|
|
@@ -392,6 +399,7 @@ class SupertonicTTS {
|
|
|
392
399
|
return { wav, duration: predictedDurations }
|
|
393
400
|
}
|
|
394
401
|
|
|
402
|
+
/* synthesize speech from text */
|
|
395
403
|
async synthesize (text: string, style: SupertonicStyle, totalStep: number, speed: number, silenceDuration = 0.3): Promise<{ wav: number[], duration: number }> {
|
|
396
404
|
/* validate single speaker mode */
|
|
397
405
|
if (style.ttl.dims[0] !== 1)
|
|
@@ -420,6 +428,7 @@ class SupertonicTTS {
|
|
|
420
428
|
return { wav: wavParts.flat(), duration: totalDuration }
|
|
421
429
|
}
|
|
422
430
|
|
|
431
|
+
/* release TTS engine resources */
|
|
423
432
|
async release (): Promise<void> {
|
|
424
433
|
/* release all ONNX inference sessions */
|
|
425
434
|
await Promise.all([
|
|
@@ -535,7 +544,7 @@ export default class SpeechFlowNodeT2ASupertonic extends SpeechFlowNode {
|
|
|
535
544
|
"onnx/text_encoder.onnx",
|
|
536
545
|
"onnx/unicode_indexer.json",
|
|
537
546
|
"onnx/vector_estimator.onnx",
|
|
538
|
-
"onnx/vocoder.onnx"
|
|
547
|
+
"onnx/vocoder.onnx"
|
|
539
548
|
]
|
|
540
549
|
|
|
541
550
|
/* create asset directories */
|
|
@@ -602,9 +611,8 @@ export default class SpeechFlowNodeT2ASupertonic extends SpeechFlowNode {
|
|
|
602
611
|
buffer1.writeInt16LE(sample * 0x7FFF, i * 2)
|
|
603
612
|
}
|
|
604
613
|
|
|
605
|
-
/* resample audio samples from
|
|
606
|
-
|
|
607
|
-
return buffer2
|
|
614
|
+
/* resample audio samples from Supertonic sample rate to 48kHz */
|
|
615
|
+
return this.resampler!.processChunk(buffer1)
|
|
608
616
|
}
|
|
609
617
|
|
|
610
618
|
/* create transform stream and connect it to the Supertonic TTS */
|
|
@@ -619,6 +627,8 @@ export default class SpeechFlowNodeT2ASupertonic extends SpeechFlowNode {
|
|
|
619
627
|
callback(new Error("stream already destroyed"))
|
|
620
628
|
else if (Buffer.isBuffer(chunk.payload))
|
|
621
629
|
callback(new Error("invalid chunk payload type"))
|
|
630
|
+
else if (chunk.payload === "")
|
|
631
|
+
callback()
|
|
622
632
|
else {
|
|
623
633
|
let processTimeout: ReturnType<typeof setTimeout> | null = setTimeout(() => {
|
|
624
634
|
processTimeout = null
|
|
@@ -660,7 +670,6 @@ export default class SpeechFlowNodeT2ASupertonic extends SpeechFlowNode {
|
|
|
660
670
|
callback()
|
|
661
671
|
}
|
|
662
672
|
catch (error) {
|
|
663
|
-
|
|
664
673
|
/* handle processing errors */
|
|
665
674
|
clearProcessTimeout()
|
|
666
675
|
callback(util.ensureError(error, "Supertonic processing failed"))
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
*/
|
|
6
6
|
|
|
7
7
|
/* standard dependencies */
|
|
8
|
-
import Stream
|
|
8
|
+
import Stream from "node:stream"
|
|
9
9
|
|
|
10
10
|
/* external dependencies */
|
|
11
11
|
import { TranslateClient, TranslateTextCommand } from "@aws-sdk/client-translate"
|
|
@@ -65,8 +65,6 @@ export default class SpeechFlowNodeT2TAmazon extends SpeechFlowNode {
|
|
|
65
65
|
secretAccessKey: this.params.secKey
|
|
66
66
|
}
|
|
67
67
|
})
|
|
68
|
-
if (this.client === null)
|
|
69
|
-
throw new Error("failed to establish Amazon Translate client")
|
|
70
68
|
|
|
71
69
|
/* provide text-to-text translation */
|
|
72
70
|
const maxRetries = 10
|
|
@@ -5,10 +5,10 @@
|
|
|
5
5
|
*/
|
|
6
6
|
|
|
7
7
|
/* standard dependencies */
|
|
8
|
-
import Stream
|
|
8
|
+
import Stream from "node:stream"
|
|
9
9
|
|
|
10
10
|
/* external dependencies */
|
|
11
|
-
import * as DeepL
|
|
11
|
+
import * as DeepL from "deepl-node"
|
|
12
12
|
|
|
13
13
|
/* internal dependencies */
|
|
14
14
|
import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
|
|
@@ -68,18 +68,19 @@ export default class SpeechFlowNodeT2TOPUS extends SpeechFlowNode {
|
|
|
68
68
|
}, 1000)
|
|
69
69
|
|
|
70
70
|
/* instantiate Transformers engine and model */
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
71
|
+
try {
|
|
72
|
+
const pipeline = Transformers.pipeline("translation", model, {
|
|
73
|
+
cache_dir: path.join(this.config.cacheDir, "transformers"),
|
|
74
|
+
dtype: "q4",
|
|
75
|
+
device: "auto",
|
|
76
|
+
progress_callback: progressCallback
|
|
77
|
+
})
|
|
78
|
+
this.translator = await pipeline
|
|
79
|
+
}
|
|
80
|
+
finally {
|
|
81
|
+
/* clear progress interval again */
|
|
82
|
+
clearInterval(interval)
|
|
83
|
+
}
|
|
83
84
|
|
|
84
85
|
/* provide text-to-text translation */
|
|
85
86
|
const translate = async (text: string) => {
|
|
@@ -120,17 +121,17 @@ export default class SpeechFlowNodeT2TOPUS extends SpeechFlowNode {
|
|
|
120
121
|
|
|
121
122
|
/* close node */
|
|
122
123
|
async close () {
|
|
123
|
-
/* shutdown stream */
|
|
124
|
-
if (this.stream !== null) {
|
|
125
|
-
await util.destroyStream(this.stream)
|
|
126
|
-
this.stream = null
|
|
127
|
-
}
|
|
128
|
-
|
|
129
124
|
/* shutdown Transformers */
|
|
130
125
|
if (this.translator !== null) {
|
|
131
126
|
this.translator.dispose()
|
|
132
127
|
this.translator = null
|
|
133
128
|
}
|
|
129
|
+
|
|
130
|
+
/* shutdown stream */
|
|
131
|
+
if (this.stream !== null) {
|
|
132
|
+
await util.destroyStream(this.stream)
|
|
133
|
+
this.stream = null
|
|
134
|
+
}
|
|
134
135
|
}
|
|
135
136
|
}
|
|
136
137
|
|
|
@@ -74,7 +74,7 @@ export default class SpeechFlowNodeT2TPunctuation extends SpeechFlowNode {
|
|
|
74
74
|
"Gib KEINE Erklärungen.\n" +
|
|
75
75
|
"Gib KEINE Einleitung.\n" +
|
|
76
76
|
"Gib KEINE Kommentare.\n" +
|
|
77
|
-
"Gib KEINE
|
|
77
|
+
"Gib KEINE Präambel.\n" +
|
|
78
78
|
"Gib KEINEN Prolog.\n" +
|
|
79
79
|
"Gib KEINEN Epilog.\n" +
|
|
80
80
|
"Ändere NICHT die Wörter.\n" +
|
|
@@ -66,7 +66,7 @@ export default class SpeechFlowNodeT2TSpellcheck extends SpeechFlowNode {
|
|
|
66
66
|
"Gib KEINE Erklärungen.\n" +
|
|
67
67
|
"Gib KEINE Einleitung.\n" +
|
|
68
68
|
"Gib KEINE Kommentare.\n" +
|
|
69
|
-
"Gib KEINE
|
|
69
|
+
"Gib KEINE Präambel.\n" +
|
|
70
70
|
"Gib KEINEN Prolog.\n" +
|
|
71
71
|
"Gib KEINEN Epilog.\n" +
|
|
72
72
|
"Ändere NICHT die Grammatik.\n" +
|
|
@@ -191,7 +191,7 @@ export default class SpeechFlowNodeT2TSubtitle extends SpeechFlowNode {
|
|
|
191
191
|
for (const block of blocks) {
|
|
192
192
|
const lines = block.trim().split(/\r?\n/)
|
|
193
193
|
if (lines.length < 2) {
|
|
194
|
-
this.log("warning", "SRT block contains
|
|
194
|
+
this.log("warning", "SRT block contains fewer than 2 lines")
|
|
195
195
|
continue
|
|
196
196
|
}
|
|
197
197
|
|
|
@@ -231,7 +231,7 @@ export default class SpeechFlowNodeT2TSubtitle extends SpeechFlowNode {
|
|
|
231
231
|
for (const block of blocks) {
|
|
232
232
|
const lines = block.trim().split(/\r?\n/)
|
|
233
233
|
if (lines.length < 1) {
|
|
234
|
-
this.log("warning", "VTT block contains
|
|
234
|
+
this.log("warning", "VTT block contains fewer than 1 line")
|
|
235
235
|
continue
|
|
236
236
|
}
|
|
237
237
|
|
|
@@ -394,15 +394,18 @@ export default class SpeechFlowNodeT2TSubtitle extends SpeechFlowNode {
|
|
|
394
394
|
h.response({}).code(204)
|
|
395
395
|
})
|
|
396
396
|
|
|
397
|
+
/* start HAPI server */
|
|
397
398
|
await this.hapi.start()
|
|
398
399
|
this.log("info", `HAPI: started REST/WebSocket network service: http://${this.params.addr}:${this.params.port}`)
|
|
399
400
|
|
|
401
|
+
/* helper to emit chunks to WebSocket peers */
|
|
400
402
|
const emit = (chunk: SpeechFlowChunk) => {
|
|
401
403
|
const data = JSON.stringify(chunk)
|
|
402
404
|
for (const info of wsPeers.values())
|
|
403
405
|
info.ws.send(data)
|
|
404
406
|
}
|
|
405
407
|
|
|
408
|
+
/* establish writable stream */
|
|
406
409
|
this.stream = new Stream.Writable({
|
|
407
410
|
objectMode: true,
|
|
408
411
|
decodeStrings: false,
|
|
@@ -60,7 +60,7 @@ export default class SpeechFlowNodeT2TSummary extends SpeechFlowNode {
|
|
|
60
60
|
"Gib KEINE Erklärungen.\n" +
|
|
61
61
|
"Gib KEINE Einleitung.\n" +
|
|
62
62
|
"Gib KEINE Kommentare.\n" +
|
|
63
|
-
"Gib KEINE
|
|
63
|
+
"Gib KEINE Präambel.\n" +
|
|
64
64
|
"Gib KEINEN Prolog.\n" +
|
|
65
65
|
"Gib KEINEN Epilog.\n" +
|
|
66
66
|
"Komme auf den Punkt.\n" +
|
|
@@ -120,6 +120,8 @@ export default class SpeechFlowNodeX2XFilter extends SpeechFlowNode {
|
|
|
120
120
|
val1 = chunk.timestampStart.toMillis()
|
|
121
121
|
else if (self.params.var === "time:end")
|
|
122
122
|
val1 = chunk.timestampEnd.toMillis()
|
|
123
|
+
else
|
|
124
|
+
val1 = undefined
|
|
123
125
|
if (comparison(val1, self.params.op, val2)) {
|
|
124
126
|
self.log("info", `[${self.params.name}]: passing through ${chunk.type} chunk`)
|
|
125
127
|
this.push(chunk)
|
|
@@ -54,7 +54,7 @@ export default class SpeechFlowNodeXIOFile extends SpeechFlowNode {
|
|
|
54
54
|
/* open node */
|
|
55
55
|
async open () {
|
|
56
56
|
/* determine how many bytes we need per chunk when
|
|
57
|
-
the chunk should be of the required duration/size
|
|
57
|
+
the chunk should be of the required duration/size */
|
|
58
58
|
const highWaterMarkAudio = (
|
|
59
59
|
this.config.audioSampleRate *
|
|
60
60
|
(this.config.audioBitDepth / 8)
|
|
@@ -139,11 +139,10 @@ export default class SpeechFlowNodeXIOFile extends SpeechFlowNode {
|
|
|
139
139
|
const payload = Buffer.isBuffer(chunk.payload) ?
|
|
140
140
|
chunk.payload : Buffer.from(chunk.payload)
|
|
141
141
|
const seekPosition = chunk.meta.get("chunk:seek") as number | undefined
|
|
142
|
-
if (seekPosition !== undefined)
|
|
142
|
+
if (seekPosition !== undefined)
|
|
143
143
|
/* seek to specified position and write (overload) */
|
|
144
144
|
fs.write(self.fd!, payload, 0, payload.byteLength, seekPosition, callback)
|
|
145
|
-
|
|
146
|
-
else {
|
|
145
|
+
else
|
|
147
146
|
/* append at current position */
|
|
148
147
|
fs.write(self.fd!, payload, 0, payload.byteLength, writePosition, (err) => {
|
|
149
148
|
if (err)
|
|
@@ -153,7 +152,6 @@ export default class SpeechFlowNodeXIOFile extends SpeechFlowNode {
|
|
|
153
152
|
callback()
|
|
154
153
|
}
|
|
155
154
|
})
|
|
156
|
-
}
|
|
157
155
|
},
|
|
158
156
|
final (callback) {
|
|
159
157
|
callback()
|
|
@@ -21,8 +21,8 @@ export default class SpeechFlowNodeXIOMQTT extends SpeechFlowNode {
|
|
|
21
21
|
public static name = "xio-mqtt"
|
|
22
22
|
|
|
23
23
|
/* internal state */
|
|
24
|
-
private broker:
|
|
25
|
-
private clientId:
|
|
24
|
+
private broker: MQTT.MqttClient | null = null
|
|
25
|
+
private clientId: string = (new UUID(1)).format()
|
|
26
26
|
private chunkQueue: util.SingleQueue<SpeechFlowChunk> | null = null
|
|
27
27
|
|
|
28
28
|
/* construct node */
|
|
@@ -29,11 +29,11 @@ export default class SpeechFlowNodeXIOVBAN extends SpeechFlowNode {
|
|
|
29
29
|
public static name = "xio-vban"
|
|
30
30
|
|
|
31
31
|
/* internal state */
|
|
32
|
-
private server:
|
|
33
|
-
private chunkQueue:
|
|
34
|
-
private frameCounter
|
|
35
|
-
private targetAddress
|
|
36
|
-
private targetPort
|
|
32
|
+
private server: VBANServer | null = null
|
|
33
|
+
private chunkQueue: util.SingleQueue<SpeechFlowChunk> | null = null
|
|
34
|
+
private frameCounter = 0
|
|
35
|
+
private targetAddress = ""
|
|
36
|
+
private targetPort = 0
|
|
37
37
|
|
|
38
38
|
/* construct node */
|
|
39
39
|
constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
|
|
@@ -154,6 +154,7 @@ export default class SpeechFlowNodeXIOWebRTC extends SpeechFlowNode {
|
|
|
154
154
|
this.pcmBuffer = this.pcmBuffer.subarray(this.pcmBuffer.length - maxBufferSize)
|
|
155
155
|
}
|
|
156
156
|
|
|
157
|
+
/* process full Opus frames from buffer */
|
|
157
158
|
while (this.pcmBuffer.length >= this.OPUS_FRAME_BYTES) {
|
|
158
159
|
const frame = this.pcmBuffer.subarray(0, this.OPUS_FRAME_BYTES)
|
|
159
160
|
this.pcmBuffer = this.pcmBuffer.subarray(this.OPUS_FRAME_BYTES)
|
|
@@ -418,6 +419,7 @@ export default class SpeechFlowNodeXIOWebRTC extends SpeechFlowNode {
|
|
|
418
419
|
const isPublisher = hasSendonly || hasSendrecv
|
|
419
420
|
const isViewer = hasRecvonly
|
|
420
421
|
|
|
422
|
+
/* handle protocol based on mode */
|
|
421
423
|
if (self.params.mode === "r" && isPublisher)
|
|
422
424
|
/* in read mode, accept WHIP publishers */
|
|
423
425
|
await self.handleWHIP(res, body)
|