speechflow 1.6.7 → 1.7.1
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +15 -0
- package/README.md +77 -52
- package/etc/secretlint.json +7 -0
- package/etc/speechflow.yaml +13 -4
- package/etc/stx.conf +3 -2
- package/package.json +8 -6
- package/speechflow-cli/dst/speechflow-main-api.js +9 -8
- package/speechflow-cli/dst/speechflow-main-api.js.map +1 -1
- package/speechflow-cli/dst/speechflow-main-graph.js +13 -14
- package/speechflow-cli/dst/speechflow-main-graph.js.map +1 -1
- package/speechflow-cli/dst/speechflow-main-status.js +38 -8
- package/speechflow-cli/dst/speechflow-main-status.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js +3 -0
- package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-compressor.js +4 -2
- package/speechflow-cli/dst/speechflow-node-a2a-compressor.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-expander.js +4 -2
- package/speechflow-cli/dst/speechflow-node-a2a-expander.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-ffmpeg.js +2 -2
- package/speechflow-cli/dst/speechflow-node-a2a-ffmpeg.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-filler.js +46 -17
- package/speechflow-cli/dst/speechflow-node-a2a-filler.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-gain.js +0 -5
- package/speechflow-cli/dst/speechflow-node-a2a-gain.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-gender.js +3 -4
- package/speechflow-cli/dst/speechflow-node-a2a-gender.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-mute.js +0 -5
- package/speechflow-cli/dst/speechflow-node-a2a-mute.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-pitch.js +1 -2
- package/speechflow-cli/dst/speechflow-node-a2a-pitch.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-rnnoise.js +0 -5
- package/speechflow-cli/dst/speechflow-node-a2a-rnnoise.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-speex.js +0 -5
- package/speechflow-cli/dst/speechflow-node-a2a-speex.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-wav.js +8 -2
- package/speechflow-cli/dst/speechflow-node-a2a-wav.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2t-amazon.d.ts +0 -1
- package/speechflow-cli/dst/speechflow-node-a2t-amazon.js +17 -19
- package/speechflow-cli/dst/speechflow-node-a2t-amazon.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2t-deepgram.d.ts +0 -1
- package/speechflow-cli/dst/speechflow-node-a2t-deepgram.js +30 -25
- package/speechflow-cli/dst/speechflow-node-a2t-deepgram.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2t-openai.js +79 -48
- package/speechflow-cli/dst/speechflow-node-a2t-openai.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-amazon.js +6 -11
- package/speechflow-cli/dst/speechflow-node-t2a-amazon.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js +45 -44
- package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-kokoro.d.ts +2 -0
- package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js +19 -7
- package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-amazon.js +1 -2
- package/speechflow-cli/dst/speechflow-node-t2t-amazon.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-deepl.js +0 -1
- package/speechflow-cli/dst/speechflow-node-t2t-deepl.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-format.js +0 -1
- package/speechflow-cli/dst/speechflow-node-t2t-format.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-google.js +0 -1
- package/speechflow-cli/dst/speechflow-node-t2t-google.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-modify.js +0 -1
- package/speechflow-cli/dst/speechflow-node-t2t-modify.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-ollama.js +0 -1
- package/speechflow-cli/dst/speechflow-node-t2t-ollama.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-openai.js +0 -1
- package/speechflow-cli/dst/speechflow-node-t2t-openai.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js +173 -29
- package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-x2x-filter.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-node-x2x-filter.js +10 -1
- package/speechflow-cli/dst/speechflow-node-x2x-filter.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-x2x-trace.js +0 -5
- package/speechflow-cli/dst/speechflow-node-x2x-trace.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-device.js +5 -5
- package/speechflow-cli/dst/speechflow-node-xio-device.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-file.js +4 -4
- package/speechflow-cli/dst/speechflow-node-xio-file.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-mqtt.js +9 -3
- package/speechflow-cli/dst/speechflow-node-xio-mqtt.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-websocket.js +16 -5
- package/speechflow-cli/dst/speechflow-node-xio-websocket.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-audio.js +3 -3
- package/speechflow-cli/dst/speechflow-util-audio.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-error.d.ts +0 -1
- package/speechflow-cli/dst/speechflow-util-error.js +0 -7
- package/speechflow-cli/dst/speechflow-util-error.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-misc.d.ts +2 -0
- package/speechflow-cli/dst/speechflow-util-misc.js +26 -0
- package/speechflow-cli/dst/speechflow-util-misc.js.map +1 -0
- package/speechflow-cli/dst/speechflow-util-queue.d.ts +9 -2
- package/speechflow-cli/dst/speechflow-util-queue.js +36 -15
- package/speechflow-cli/dst/speechflow-util-queue.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-stream.d.ts +2 -2
- package/speechflow-cli/dst/speechflow-util-stream.js +17 -19
- package/speechflow-cli/dst/speechflow-util-stream.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-util.js +1 -0
- package/speechflow-cli/dst/speechflow-util.js.map +1 -1
- package/speechflow-cli/etc/oxlint.jsonc +6 -1
- package/speechflow-cli/etc/stx.conf +1 -0
- package/speechflow-cli/package.json +28 -27
- package/speechflow-cli/src/speechflow-main-api.ts +9 -11
- package/speechflow-cli/src/speechflow-main-graph.ts +15 -16
- package/speechflow-cli/src/speechflow-main-status.ts +6 -10
- package/speechflow-cli/src/speechflow-node-a2a-compressor-wt.ts +4 -0
- package/speechflow-cli/src/speechflow-node-a2a-compressor.ts +4 -2
- package/speechflow-cli/src/speechflow-node-a2a-expander-wt.ts +1 -1
- package/speechflow-cli/src/speechflow-node-a2a-expander.ts +4 -2
- package/speechflow-cli/src/speechflow-node-a2a-ffmpeg.ts +4 -2
- package/speechflow-cli/src/speechflow-node-a2a-filler.ts +57 -20
- package/speechflow-cli/src/speechflow-node-a2a-gain.ts +0 -5
- package/speechflow-cli/src/speechflow-node-a2a-gender.ts +3 -4
- package/speechflow-cli/src/speechflow-node-a2a-mute.ts +0 -5
- package/speechflow-cli/src/speechflow-node-a2a-pitch.ts +1 -2
- package/speechflow-cli/src/speechflow-node-a2a-rnnoise.ts +0 -5
- package/speechflow-cli/src/speechflow-node-a2a-speex.ts +0 -5
- package/speechflow-cli/src/speechflow-node-a2a-wav.ts +9 -3
- package/speechflow-cli/src/speechflow-node-a2t-amazon.ts +27 -27
- package/speechflow-cli/src/speechflow-node-a2t-deepgram.ts +37 -28
- package/speechflow-cli/src/speechflow-node-a2t-openai.ts +92 -56
- package/speechflow-cli/src/speechflow-node-t2a-amazon.ts +7 -11
- package/speechflow-cli/src/speechflow-node-t2a-elevenlabs.ts +47 -43
- package/speechflow-cli/src/speechflow-node-t2a-kokoro.ts +22 -7
- package/speechflow-cli/src/speechflow-node-t2t-amazon.ts +1 -2
- package/speechflow-cli/src/speechflow-node-t2t-deepl.ts +0 -1
- package/speechflow-cli/src/speechflow-node-t2t-format.ts +0 -1
- package/speechflow-cli/src/speechflow-node-t2t-google.ts +0 -1
- package/speechflow-cli/src/speechflow-node-t2t-modify.ts +0 -1
- package/speechflow-cli/src/speechflow-node-t2t-ollama.ts +0 -1
- package/speechflow-cli/src/speechflow-node-t2t-openai.ts +0 -1
- package/speechflow-cli/src/speechflow-node-t2t-subtitle.ts +205 -33
- package/speechflow-cli/src/speechflow-node-x2x-filter.ts +16 -4
- package/speechflow-cli/src/speechflow-node-x2x-trace.ts +3 -8
- package/speechflow-cli/src/speechflow-node-xio-device.ts +6 -9
- package/speechflow-cli/src/speechflow-node-xio-file.ts +4 -4
- package/speechflow-cli/src/speechflow-node-xio-mqtt.ts +10 -4
- package/speechflow-cli/src/speechflow-node-xio-websocket.ts +16 -5
- package/speechflow-cli/src/speechflow-util-audio-wt.ts +4 -4
- package/speechflow-cli/src/speechflow-util-audio.ts +7 -7
- package/speechflow-cli/src/speechflow-util-error.ts +0 -7
- package/speechflow-cli/src/speechflow-util-misc.ts +23 -0
- package/speechflow-cli/src/speechflow-util-queue.ts +40 -20
- package/speechflow-cli/src/speechflow-util-stream.ts +29 -24
- package/speechflow-cli/src/speechflow-util.ts +1 -0
- package/speechflow-ui-db/dst/index.css +1 -5
- package/speechflow-ui-db/dst/index.js +14 -58
- package/speechflow-ui-db/etc/stx.conf +5 -16
- package/speechflow-ui-db/package.json +16 -15
- package/speechflow-ui-st/dst/index.css +1 -5
- package/speechflow-ui-st/dst/index.js +31 -160
- package/speechflow-ui-st/etc/stx.conf +5 -16
- package/speechflow-ui-st/package.json +17 -16
package/speechflow-cli/src/speechflow-node-a2t-openai.ts

@@ -9,7 +9,7 @@ import Stream from "node:stream"
 
 /* external dependencies */
 import OpenAI from "openai"
-import { DateTime } from "luxon"
+import { DateTime, Duration } from "luxon"
 import SpeexResampler from "speex-resampler"
 import ws from "ws"
 
@@ -23,11 +23,11 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
     public static name = "a2t-openai"
 
     /* internal state */
-    private openai:
-    private ws:
-    private queue:
-    private resampler:
-    private closing
+    private openai: OpenAI | null = null
+    private ws: ws.WebSocket | null = null
+    private queue: util.SingleQueue<SpeechFlowChunk | null> | null = null
+    private resampler: SpeexResampler | null = null
+    private closing = false
     private connectionTimeout: ReturnType<typeof setTimeout> | null = null
 
     /* construct node */
@@ -43,6 +43,10 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
             interim: { type: "boolean", val: false }
         })
 
+        /* sanity check parameters */
+        if (!this.params.key)
+            throw new Error("OpenAI API key not configured")
+
         /* declare node input/output format */
         this.input = "audio"
         this.output = "text"
@@ -141,11 +145,25 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
         })
         this.ws.on("close", () => {
             this.log("info", "WebSocket connection closed")
-            this.queue
+            if (!this.closing && this.queue !== null)
+                this.queue.write(null)
         })
         this.ws.on("error", (err) => {
             this.log("error", `WebSocket connection error: ${err}`)
         })
+
+        /* track speech timing by item_id (OpenAI provides timestamps via VAD events) */
+        const speechTiming = new Map<string, { startMs: number, endMs: number }>()
+
+        /* helper function for aggregating meta information */
+        const aggregateMeta = (start: Duration, end: Duration): Map<string, any> => {
+            const metas = metastore.fetch(start, end)
+            return metas.toReversed().reduce((prev: Map<string, any>, curr: Map<string, any>) => {
+                curr.forEach((val, key) => { prev.set(key, val) })
+                return prev
+            }, new Map<string, any>())
+        }
+
         let text = ""
         this.ws.on("message", (data) => {
             let ev: any
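A note on the aggregateMeta helper added above: it flattens all meta maps that metastore returns for the chunk's time range into a single map, and after toReversed() whichever map comes later in the iteration overwrites keys set earlier. A minimal TypeScript sketch of that fold, with hypothetical sample maps (metastore itself is a SpeechFlow internal not shown in this diff):

    const metas: Array<Map<string, any>> = [
        new Map([ [ "speaker", "A" ], [ "lang", "en" ] ]),
        new Map([ [ "speaker", "B" ] ])
    ]
    const merged = metas.toReversed().reduce((prev, curr) => {
        curr.forEach((val, key) => { prev.set(key, val) })
        return prev
    }, new Map<string, any>())
    /* toReversed() iterates [ { speaker: "B" }, { speaker: "A", lang: "en" } ],
       so merged ends up as { speaker: "A", lang: "en" }: the map listed first
       by metastore.fetch() wins every conflicting key */

Note that Array.prototype.toReversed() is ES2023, so this code path requires Node.js 20 or later.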
@@ -163,53 +181,63 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
             switch (ev.type) {
                 case "transcription_session.created":
                     break
-                case "conversation.item.created":
+                case "conversation.item.created": {
                     text = ""
                     break
+                }
                 case "conversation.item.input_audio_transcription.delta": {
                     text += ev.delta as string
-                    if (this.params.interim) {
-                        const
-                        const
-                        const
-                        const
-
-
-
-                        const chunk = new SpeechFlowChunk(start, end, "intermediate", "text", text)
-                        chunk.meta = meta
-                        this.queue!.write(chunk)
+                    if (this.params.interim && !this.closing && this.queue !== null) {
+                        const itemId = ev.item_id as string
+                        const timing = speechTiming.get(itemId)
+                        const start = timing ? Duration.fromMillis(timing.startMs) : DateTime.now().diff(this.timeOpen!)
+                        const end = timing ? Duration.fromMillis(timing.endMs) : start
+                        const chunk = new SpeechFlowChunk(start, end, "intermediate", "text", text)
+                        chunk.meta = aggregateMeta(start, end)
+                        this.queue.write(chunk)
                     }
                     break
                 }
                 case "conversation.item.input_audio_transcription.completed": {
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    if (!this.closing && this.queue !== null) {
+                        text = ev.transcript as string
+                        const itemId = ev.item_id as string
+                        const timing = speechTiming.get(itemId)
+                        const start = timing ? Duration.fromMillis(timing.startMs) : DateTime.now().diff(this.timeOpen!)
+                        const end = timing ? Duration.fromMillis(timing.endMs) : start
+                        const chunk = new SpeechFlowChunk(start, end, "final", "text", text)
+                        chunk.meta = aggregateMeta(start, end)
+                        metastore.prune(start)
+                        speechTiming.delete(itemId)
+                        this.queue.write(chunk)
+                        text = ""
+                    }
                     break
                 }
-                case "input_audio_buffer.speech_started":
+                case "input_audio_buffer.speech_started": {
                     this.log("info", "VAD: speech started")
+                    const itemId = ev.item_id as string
+                    const audioStartMs = ev.audio_start_ms as number
+                    speechTiming.set(itemId, { startMs: audioStartMs, endMs: audioStartMs })
                     break
-                case "input_audio_buffer.speech_stopped":
+                }
+                case "input_audio_buffer.speech_stopped": {
                     this.log("info", "VAD: speech stopped")
+                    const itemId = ev.item_id as string
+                    const audioEndMs = ev.audio_end_ms as number
+                    const timing = speechTiming.get(itemId)
+                    if (timing)
+                        timing.endMs = audioEndMs
                     break
-                case "input_audio_buffer.committed":
+                }
+                case "input_audio_buffer.committed": {
                     this.log("info", "input buffer committed")
                     break
-                case "error":
+                }
+                case "error": {
                     this.log("error", `error: ${ev.error?.message}`)
                     break
+                }
                 default:
                     break
             }
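The new speech-timing bookkeeping above works off the Realtime API's VAD events: speech_started records the item's audio offset in milliseconds, speech_stopped patches in the end offset, and the transcription handlers convert both to luxon Durations and drop the entry once the final transcript is emitted. A sketch of one item's life cycle, with invented field values:

    import { Duration } from "luxon"

    const speechTiming = new Map<string, { startMs: number, endMs: number }>()

    /* on "input_audio_buffer.speech_started" (ev.item_id, ev.audio_start_ms) */
    speechTiming.set("item_42", { startMs: 1200, endMs: 1200 })

    /* on "input_audio_buffer.speech_stopped" (ev.audio_end_ms) */
    speechTiming.get("item_42")!.endMs = 3450

    /* on "...input_audio_transcription.completed": stamp the chunk, then forget */
    const timing = speechTiming.get("item_42")!
    const start = Duration.fromMillis(timing.startMs)   /* 1.2 s into the session */
    const end   = Duration.fromMillis(timing.endMs)     /* 3.45 s into the session */
    speechTiming.delete("item_42")

When no timing entry exists yet (e.g. a delta arrives before the VAD event), the code falls back to DateTime.now().diff(this.timeOpen!), i.e. wall-clock time since the node opened.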
@@ -220,6 +248,7 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
 
         /* provide Duplex stream and internally attach to OpenAI API */
         const self = this
+        const reads = new util.PromiseSet<void>()
         this.stream = new Stream.Duplex({
             writableObjectMode: true,
             readableObjectMode: true,
@@ -255,12 +284,32 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
                     callback()
                 }
             },
+            async final (callback) {
+                if (self.closing || self.ws === null) {
+                    callback()
+                    return
+                }
+                try {
+                    sendMessage({ type: "input_audio_buffer.commit" })
+                    self.ws.close()
+                    await util.sleep(50)
+                }
+                catch (error) {
+                    self.log("warning", `error closing OpenAI connection: ${error}`)
+                }
+                await reads.awaitAll()
+                const chunks: Array<SpeechFlowChunk | null> = self.queue?.drain() ?? []
+                for (const chunk of chunks)
+                    this.push(chunk)
+                this.push(null)
+                callback()
+            },
             read (size) {
                 if (self.closing || self.queue === null) {
                     this.push(null)
                     return
                 }
-                self.queue.read().then((chunk) => {
+                reads.add(self.queue.read().then((chunk) => {
                     if (self.closing || self.queue === null) {
                         this.push(null)
                         return
@@ -276,23 +325,7 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
                 }).catch((error: unknown) => {
                     if (!self.closing && self.queue !== null)
                         self.log("error", `queue read error: ${util.ensureError(error).message}`)
-                })
-            },
-            final (callback) {
-                if (self.closing || self.ws === null) {
-                    callback()
-                    return
-                }
-                try {
-                    sendMessage({ type: "input_audio_buffer.commit" })
-                    self.ws.close()
-                    /* NOTICE: do not push null here -- let the OpenAI close event handle it */
-                    callback()
-                }
-                catch (error) {
-                    self.log("warning", `error closing OpenAI connection: ${error}`)
-                    callback(util.ensureError(error, "failed to close OpenAI connection"))
-                }
+                }))
             }
         })
     }
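The restructured final() above replaces the old "let the close event push null" approach: it commits the audio buffer, closes the socket, waits for every still-pending read() promise tracked in the reads PromiseSet, then drains the queue and pushes EOF itself. util.PromiseSet is a SpeechFlow utility whose implementation is not shown here (speechflow-util-misc grew by roughly 23 lines in this release, presumably housing it); a minimal sketch with the semantics the diff implies:

    /* illustration only -- not the actual speechflow-util-misc code */
    class PromiseSet<T> {
        private pending = new Set<Promise<T>>()
        add (p: Promise<T>): Promise<T> {
            this.pending.add(p)
            p.catch(() => {}).finally(() => { this.pending.delete(p) })
            return p
        }
        async awaitAll (): Promise<void> {
            while (this.pending.size > 0)
                await Promise.allSettled([ ...this.pending ])
        }
    }

Draining only after awaitAll() guarantees that chunks resolved by late WebSocket messages still reach the readable side before the final null.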
@@ -316,11 +349,14 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
 
         /* close OpenAI connection */
         if (this.ws !== null) {
+            this.ws.removeAllListeners()
             this.ws.close()
             this.ws = null
         }
-
-
+        this.openai = null
+
+        /* close resampler */
+        this.resampler = null
 
         /* shutdown stream */
         if (this.stream !== null) {
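Detaching listeners before close() is deliberate here: ws emits "close" asynchronously, so without removeAllListeners() the handler installed in open() would still fire mid-teardown and write a null EOF marker into a queue nobody reads anymore. A self-contained sketch of the hazard (socket and queue are hypothetical stand-ins):

    import ws from "ws"
    declare const socket: ws.WebSocket
    declare const queue: { write (chunk: unknown): void }

    socket.on("close", () => {
        queue.write(null)        /* stale EOF written during teardown */
    })
    socket.removeAllListeners()  /* detach first ... */
    socket.close()               /* ... so the callback above never runs */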
package/speechflow-cli/src/speechflow-node-t2a-amazon.ts

@@ -83,7 +83,7 @@ export default class SpeechFlowNodeT2AAmazon extends SpeechFlowNode {
         "Ruth": { language: "en", languageCode: "en-US", engine: "generative" },
         "Stephen": { language: "en", languageCode: "en-US", engine: "generative" },
         "Vicki": { language: "de", languageCode: "de-DE", engine: "generative" },
-        "Daniel": { language: "de", languageCode: "de-DE", engine: "generative" }
+        "Daniel": { language: "de", languageCode: "de-DE", engine: "generative" }
     }
     const voiceConfig = voices[this.params.voice as keyof typeof voices]
     if (voiceConfig === undefined)
@@ -147,11 +147,6 @@ export default class SpeechFlowNodeT2AAmazon extends SpeechFlowNode {
                 callback()
             },
             final (callback) {
-                if (self.closing) {
-                    callback()
-                    return
-                }
-                this.push(null)
                 callback()
             }
         })
@@ -162,6 +157,12 @@ export default class SpeechFlowNodeT2AAmazon extends SpeechFlowNode {
         /* indicate closing */
         this.closing = true
 
+        /* shutdown stream */
+        if (this.stream !== null) {
+            await util.destroyStream(this.stream)
+            this.stream = null
+        }
+
         /* destroy resampler */
         if (this.resampler !== null)
             this.resampler = null
@@ -171,11 +172,6 @@ export default class SpeechFlowNodeT2AAmazon extends SpeechFlowNode {
             this.client.destroy()
             this.client = null
         }
-        /* shutdown stream */
-        if (this.stream !== null) {
-            await util.destroyStream(this.stream)
-            this.stream = null
-        }
     }
 }
 
package/speechflow-cli/src/speechflow-node-t2a-elevenlabs.ts

@@ -10,6 +10,7 @@ import Stream from "node:stream"
 /* external dependencies */
 import * as ElevenLabs from "@elevenlabs/elevenlabs-js"
 import { getStreamAsBuffer } from "get-stream"
+import { Duration } from "luxon"
 import SpeexResampler from "speex-resampler"
 
 /* internal dependencies */
@@ -102,14 +103,15 @@ export default class SpeechFlowNodeT2AElevenlabs extends SpeechFlowNode {
             throw new Error(`invalid ElevenLabs voice "${this.params.voice}"`)
         }
         const labels = voice.labels ?? {}
-        const info = Object.keys(labels).length > 0
-            ", " + Object.entries(labels).map(([ key, val ]) => `${key}: "${val}"`).join(", ")
+        const info = Object.keys(labels).length > 0
+            ? ", " + Object.entries(labels).map(([ key, val ]) => `${key}: "${val}"`).join(", ")
+            : ""
         this.log("info", `selected voice: name: "${voice.name}"${info}`)
 
         /* perform text-to-speech operation with Elevenlabs API */
-        const model = this.params.optimize === "quality"
-            "eleven_turbo_v2_5"
-            "eleven_flash_v2_5"
+        const model = this.params.optimize === "quality"
+            ? "eleven_turbo_v2_5"
+            : "eleven_flash_v2_5"
         const speechStream = (text: string) => {
             this.log("info", `ElevenLabs: send text "${text}"`)
             return this.elevenlabs!.textToSpeech.convert(voice.voiceId, {
@@ -140,58 +142,60 @@ export default class SpeechFlowNodeT2AElevenlabs extends SpeechFlowNode {
             readableObjectMode: true,
             decodeStrings: false,
             highWaterMark: 1,
-            transform (chunk: SpeechFlowChunk, encoding, callback) {
+            async transform (chunk: SpeechFlowChunk, encoding, callback) {
                 if (self.closing)
                     callback(new Error("stream already destroyed"))
                 else if (Buffer.isBuffer(chunk.payload))
                     callback(new Error("invalid chunk payload type"))
                 else {
-
-
+                    let processTimeout: ReturnType<typeof setTimeout> | null = setTimeout(() => {
+                        processTimeout = null
+                        callback(new Error("ElevenLabs API timeout"))
+                    }, 60 * 1000)
+                    const clearProcessTimeout = () => {
+                        if (processTimeout !== null) {
+                            clearTimeout(processTimeout)
                             processTimeout = null
-                        callback(new Error("ElevenLabs API timeout"))
-                    }, 60 * 1000)
-                    const clearProcessTimeout = () => {
-                        if (processTimeout !== null) {
-                            clearTimeout(processTimeout)
-                            processTimeout = null
-                        }
                         }
-
-
-
-                        callback(new Error("stream destroyed during processing"))
-                        return
-                    }
-                    const stream = await speechStream(chunk.payload as string)
-                    const buffer = await getStreamAsBuffer(stream)
-                    if (self.closing) {
-                        clearProcessTimeout()
-                        callback(new Error("stream destroyed during processing"))
-                        return
-                    }
-                    const bufferResampled = self.resampler!.processChunk(buffer)
-                    self.log("info", `ElevenLabs: received audio (buffer length: ${buffer.byteLength})`)
-                    const chunkNew = chunk.clone()
-                    chunkNew.type = "audio"
-                    chunkNew.payload = bufferResampled
+                    }
+                    try {
+                        if (self.closing) {
                             clearProcessTimeout()
-
-
+                            callback(new Error("stream destroyed during processing"))
+                            return
                         }
-
+                        const stream = await speechStream(chunk.payload as string)
+                        const buffer = await getStreamAsBuffer(stream)
+                        if (self.closing) {
                             clearProcessTimeout()
-                        callback(
+                            callback(new Error("stream destroyed during processing"))
+                            return
                         }
-
+                        self.log("info", `ElevenLabs: received audio (buffer length: ${buffer.byteLength})`)
+                        const bufferResampled = self.resampler!.processChunk(buffer)
+                        self.log("info", "ElevenLabs: forwarding resampled audio " +
+                            `(buffer length: ${bufferResampled.byteLength})`)
+
+                        /* calculate actual audio duration from PCM buffer size */
+                        const durationMs = util.audioBufferDuration(bufferResampled,
+                            self.config.audioSampleRate, self.config.audioBitDepth) * 1000
+
+                        /* create new chunk with recalculated timestamps */
+                        const chunkNew = chunk.clone()
+                        chunkNew.type = "audio"
+                        chunkNew.payload = bufferResampled
+                        chunkNew.timestampEnd = Duration.fromMillis(chunkNew.timestampStart.toMillis() + durationMs)
+                        clearProcessTimeout()
+                        this.push(chunkNew)
+                        callback()
+                    }
+                    catch (error) {
+                        clearProcessTimeout()
+                        callback(util.ensureError(error, "ElevenLabs processing failed"))
+                    }
                 }
             },
             final (callback) {
-                if (self.closing) {
-                    callback()
-                    return
-                }
-                this.push(null)
                 callback()
             }
         })
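The recalculated timestampEnd above derives the chunk's true duration from the resampled PCM payload rather than carrying over the incoming text chunk's time range. For raw PCM, duration in seconds is bytes / (sampleRate * bytesPerSample * channels). util.audioBufferDuration() is SpeechFlow's own helper; a plausible mono equivalent for illustration:

    /* sketch assuming mono PCM -- the real util.audioBufferDuration() may differ */
    function audioBufferDuration (buf: Buffer, sampleRate: number, bitDepth: number): number {
        const bytesPerSample = bitDepth / 8
        return buf.byteLength / (sampleRate * bytesPerSample)   /* seconds */
    }

    /* e.g. 96000 bytes at 48000 Hz, 16 bit: 96000 / (48000 * 2) = 1.0 s,
       which the caller multiplies by 1000 to obtain durationMs */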
package/speechflow-cli/src/speechflow-node-t2a-kokoro.ts

@@ -22,6 +22,7 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
 
     /* internal state */
     private kokoro: KokoroTTS | null = null
+    private closing = false
     private resampler: SpeexResampler | null = null
 
     /* construct node */
@@ -32,7 +33,7 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
         this.configure({
             voice: { type: "string", val: "Aoede", pos: 0, match: /^(?:Aoede|Heart|Puck|Fenrir)$/ },
             language: { type: "string", val: "en", pos: 1, match: /^(?:en)$/ },
-            speed: { type: "number", val: 1.25, pos: 2, match: (n: number) => n >= 1.0 && n <= 1.30 }
+            speed: { type: "number", val: 1.25, pos: 2, match: (n: number) => n >= 1.0 && n <= 1.30 }
         })
 
         /* declare node input/output format */
@@ -40,8 +41,16 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
         this.output = "audio"
     }
 
+    /* one-time status of node */
+    async status () {
+        return {}
+    }
+
     /* open node */
     async open () {
+        /* clear destruction flag */
+        this.closing = false
+
         /* establish Kokoro */
         const model = "onnx-community/Kokoro-82M-v1.0-ONNX"
         const progressState = new Map<string, number>()
@@ -126,15 +135,19 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
             decodeStrings: false,
             highWaterMark: 1,
             transform (chunk: SpeechFlowChunk, encoding, callback) {
-                if (
+                if (self.closing)
+                    callback(new Error("stream already destroyed"))
+                else if (Buffer.isBuffer(chunk.payload))
                     callback(new Error("invalid chunk payload type"))
                 else {
                     text2speech(chunk.payload).then((buffer) => {
+                        if (self.closing)
+                            throw new Error("stream destroyed during processing")
                         self.log("info", `Kokoro: received audio (buffer length: ${buffer.byteLength})`)
-
-
-
-                        this.push(
+                        const chunkNew = chunk.clone()
+                        chunkNew.type = "audio"
+                        chunkNew.payload = buffer
+                        this.push(chunkNew)
                         callback()
                     }).catch((error: unknown) => {
                         callback(util.ensureError(error))
@@ -142,7 +155,6 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
                 }
             },
             final (callback) {
-                this.push(null)
                 callback()
             }
         })
@@ -150,6 +162,9 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
 
     /* close node */
     async close () {
+        /* indicate closing */
+        this.closing = true
+
         /* shutdown stream */
         if (this.stream !== null) {
             await util.destroyStream(this.stream)
package/speechflow-cli/src/speechflow-node-t2t-amazon.ts

@@ -98,7 +98,7 @@ export default class SpeechFlowNodeT2TAmazon extends SpeechFlowNode {
                 if (!retriable || attempt >= maxRetries)
                     break
                 const delayMs = Math.min(1000 * Math.pow(2, attempt - 1), 5000)
-                await
+                await util.sleep(delayMs)
             }
         }
         throw util.ensureError(lastError)
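The retry delay above is a standard capped exponential backoff: 1000 * 2^(attempt - 1) milliseconds, clamped to 5000 ms, so successive retries wait 1 s, 2 s, 4 s, and then 5 s from the fourth attempt on. A self-contained equivalent of the schedule:

    const sleep = (ms: number) => new Promise<void>((resolve) => setTimeout(resolve, ms))
    const backoffMs = (attempt: number) => Math.min(1000 * Math.pow(2, attempt - 1), 5000)

    /* attempt 1 -> 1000 ms, 2 -> 2000 ms, 3 -> 4000 ms, 4+ -> 5000 ms (capped) */
    for (let attempt = 1; attempt <= 5; attempt++)
        await sleep(backoffMs(attempt))

(The top-level await assumes an ES-module context.)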
@@ -129,7 +129,6 @@ export default class SpeechFlowNodeT2TAmazon extends SpeechFlowNode {
                 }
             },
             final (callback) {
-                this.push(null)
                 callback()
             }
         })