speechflow 1.7.1 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +24 -0
- package/README.md +388 -120
- package/etc/claude.md +5 -5
- package/etc/speechflow.yaml +2 -2
- package/package.json +3 -3
- package/speechflow-cli/dst/speechflow-main-api.js.map +1 -1
- package/speechflow-cli/dst/speechflow-main-cli.js +1 -0
- package/speechflow-cli/dst/speechflow-main-cli.js.map +1 -1
- package/speechflow-cli/dst/speechflow-main-graph.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-main-graph.js +30 -9
- package/speechflow-cli/dst/speechflow-main-graph.js.map +1 -1
- package/speechflow-cli/dst/speechflow-main-nodes.js +1 -0
- package/speechflow-cli/dst/speechflow-main-nodes.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js +1 -0
- package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-compressor.js +7 -9
- package/speechflow-cli/dst/speechflow-node-a2a-compressor.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js +1 -0
- package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-expander.js +8 -9
- package/speechflow-cli/dst/speechflow-node-a2a-expander.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-filler.js +2 -0
- package/speechflow-cli/dst/speechflow-node-a2a-filler.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-gender.js +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-gender.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-meter.js +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-pitch.js +11 -9
- package/speechflow-cli/dst/speechflow-node-a2a-pitch.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-rnnoise-wt.js +1 -0
- package/speechflow-cli/dst/speechflow-node-a2a-rnnoise-wt.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-rnnoise.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-speex.js +4 -2
- package/speechflow-cli/dst/speechflow-node-a2a-speex.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-vad.js +19 -22
- package/speechflow-cli/dst/speechflow-node-a2a-vad.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-wav.js +31 -4
- package/speechflow-cli/dst/speechflow-node-a2a-wav.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2t-amazon.d.ts +0 -1
- package/speechflow-cli/dst/speechflow-node-a2t-amazon.js +2 -11
- package/speechflow-cli/dst/speechflow-node-a2t-amazon.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2t-google.d.ts +16 -0
- package/speechflow-cli/dst/speechflow-node-a2t-google.js +314 -0
- package/speechflow-cli/dst/speechflow-node-a2t-google.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-a2t-openai.js +6 -1
- package/speechflow-cli/dst/speechflow-node-a2t-openai.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-amazon.d.ts +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-amazon.js +27 -7
- package/speechflow-cli/dst/speechflow-node-t2a-amazon.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.d.ts +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js +5 -3
- package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-google.d.ts +15 -0
- package/speechflow-cli/dst/speechflow-node-t2a-google.js +215 -0
- package/speechflow-cli/dst/speechflow-node-t2a-google.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2a-kokoro.d.ts +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js +27 -6
- package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-openai.d.ts +15 -0
- package/speechflow-cli/dst/speechflow-node-t2a-openai.js +192 -0
- package/speechflow-cli/dst/speechflow-node-t2a-openai.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2a-supertonic.d.ts +17 -0
- package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js +619 -0
- package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2t-amazon.js +0 -2
- package/speechflow-cli/dst/speechflow-node-t2t-amazon.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-deepl.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-google.js.map +1 -1
- package/speechflow-cli/dst/{speechflow-node-t2t-transformers.d.ts → speechflow-node-t2t-opus.d.ts} +1 -3
- package/speechflow-cli/dst/speechflow-node-t2t-opus.js +161 -0
- package/speechflow-cli/dst/speechflow-node-t2t-opus.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2t-profanity.d.ts +11 -0
- package/speechflow-cli/dst/speechflow-node-t2t-profanity.js +118 -0
- package/speechflow-cli/dst/speechflow-node-t2t-profanity.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2t-punctuation.d.ts +13 -0
- package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js +220 -0
- package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js.map +1 -0
- package/speechflow-cli/dst/{speechflow-node-t2t-openai.d.ts → speechflow-node-t2t-spellcheck.d.ts} +2 -2
- package/speechflow-cli/dst/{speechflow-node-t2t-openai.js → speechflow-node-t2t-spellcheck.js} +48 -100
- package/speechflow-cli/dst/speechflow-node-t2t-spellcheck.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js +8 -8
- package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-summary.d.ts +16 -0
- package/speechflow-cli/dst/speechflow-node-t2t-summary.js +241 -0
- package/speechflow-cli/dst/speechflow-node-t2t-summary.js.map +1 -0
- package/speechflow-cli/dst/{speechflow-node-t2t-ollama.d.ts → speechflow-node-t2t-translate.d.ts} +2 -2
- package/speechflow-cli/dst/{speechflow-node-t2t-transformers.js → speechflow-node-t2t-translate.js} +53 -115
- package/speechflow-cli/dst/speechflow-node-t2t-translate.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-x2x-filter.js +2 -0
- package/speechflow-cli/dst/speechflow-node-x2x-filter.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-exec.d.ts +12 -0
- package/speechflow-cli/dst/speechflow-node-xio-exec.js +224 -0
- package/speechflow-cli/dst/speechflow-node-xio-exec.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-xio-file.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-node-xio-file.js +78 -67
- package/speechflow-cli/dst/speechflow-node-xio-file.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-mqtt.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-vban.d.ts +17 -0
- package/speechflow-cli/dst/speechflow-node-xio-vban.js +330 -0
- package/speechflow-cli/dst/speechflow-node-xio-vban.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-xio-webrtc.d.ts +39 -0
- package/speechflow-cli/dst/speechflow-node-xio-webrtc.js +502 -0
- package/speechflow-cli/dst/speechflow-node-xio-webrtc.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-xio-websocket.js +9 -9
- package/speechflow-cli/dst/speechflow-node-xio-websocket.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-audio.js +8 -5
- package/speechflow-cli/dst/speechflow-util-audio.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-error.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-util-error.js +5 -0
- package/speechflow-cli/dst/speechflow-util-error.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-llm.d.ts +35 -0
- package/speechflow-cli/dst/speechflow-util-llm.js +363 -0
- package/speechflow-cli/dst/speechflow-util-llm.js.map +1 -0
- package/speechflow-cli/dst/speechflow-util-queue.js +2 -1
- package/speechflow-cli/dst/speechflow-util-queue.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-util.js +2 -0
- package/speechflow-cli/dst/speechflow-util.js.map +1 -1
- package/speechflow-cli/etc/oxlint.jsonc +2 -1
- package/speechflow-cli/package.json +35 -18
- package/speechflow-cli/src/lib.d.ts +5 -0
- package/speechflow-cli/src/speechflow-main-api.ts +16 -16
- package/speechflow-cli/src/speechflow-main-cli.ts +1 -0
- package/speechflow-cli/src/speechflow-main-graph.ts +38 -14
- package/speechflow-cli/src/speechflow-main-nodes.ts +1 -0
- package/speechflow-cli/src/speechflow-node-a2a-compressor-wt.ts +1 -0
- package/speechflow-cli/src/speechflow-node-a2a-compressor.ts +8 -10
- package/speechflow-cli/src/speechflow-node-a2a-expander-wt.ts +1 -0
- package/speechflow-cli/src/speechflow-node-a2a-expander.ts +9 -10
- package/speechflow-cli/src/speechflow-node-a2a-filler.ts +2 -0
- package/speechflow-cli/src/speechflow-node-a2a-gender.ts +3 -3
- package/speechflow-cli/src/speechflow-node-a2a-meter.ts +2 -2
- package/speechflow-cli/src/speechflow-node-a2a-pitch.ts +11 -9
- package/speechflow-cli/src/speechflow-node-a2a-rnnoise-wt.ts +1 -0
- package/speechflow-cli/src/speechflow-node-a2a-rnnoise.ts +1 -1
- package/speechflow-cli/src/speechflow-node-a2a-speex.ts +5 -3
- package/speechflow-cli/src/speechflow-node-a2a-vad.ts +20 -23
- package/speechflow-cli/src/speechflow-node-a2a-wav.ts +31 -4
- package/speechflow-cli/src/speechflow-node-a2t-amazon.ts +6 -18
- package/speechflow-cli/src/speechflow-node-a2t-google.ts +315 -0
- package/speechflow-cli/src/speechflow-node-a2t-openai.ts +12 -7
- package/speechflow-cli/src/speechflow-node-t2a-amazon.ts +32 -10
- package/speechflow-cli/src/speechflow-node-t2a-elevenlabs.ts +6 -4
- package/speechflow-cli/src/speechflow-node-t2a-google.ts +203 -0
- package/speechflow-cli/src/speechflow-node-t2a-kokoro.ts +33 -10
- package/speechflow-cli/src/speechflow-node-t2a-openai.ts +176 -0
- package/speechflow-cli/src/speechflow-node-t2a-supertonic.ts +710 -0
- package/speechflow-cli/src/speechflow-node-t2t-amazon.ts +3 -4
- package/speechflow-cli/src/speechflow-node-t2t-deepl.ts +2 -2
- package/speechflow-cli/src/speechflow-node-t2t-google.ts +1 -1
- package/speechflow-cli/src/speechflow-node-t2t-opus.ts +137 -0
- package/speechflow-cli/src/speechflow-node-t2t-profanity.ts +93 -0
- package/speechflow-cli/src/speechflow-node-t2t-punctuation.ts +201 -0
- package/speechflow-cli/src/speechflow-node-t2t-spellcheck.ts +188 -0
- package/speechflow-cli/src/speechflow-node-t2t-subtitle.ts +8 -8
- package/speechflow-cli/src/speechflow-node-t2t-summary.ts +229 -0
- package/speechflow-cli/src/speechflow-node-t2t-translate.ts +181 -0
- package/speechflow-cli/src/speechflow-node-x2x-filter.ts +2 -0
- package/speechflow-cli/src/speechflow-node-xio-exec.ts +211 -0
- package/speechflow-cli/src/speechflow-node-xio-file.ts +91 -80
- package/speechflow-cli/src/speechflow-node-xio-mqtt.ts +2 -2
- package/speechflow-cli/src/speechflow-node-xio-vban.ts +325 -0
- package/speechflow-cli/src/speechflow-node-xio-webrtc.ts +535 -0
- package/speechflow-cli/src/speechflow-node-xio-websocket.ts +9 -9
- package/speechflow-cli/src/speechflow-util-audio.ts +10 -5
- package/speechflow-cli/src/speechflow-util-error.ts +9 -0
- package/speechflow-cli/src/speechflow-util-llm.ts +367 -0
- package/speechflow-cli/src/speechflow-util-queue.ts +3 -3
- package/speechflow-cli/src/speechflow-util.ts +2 -0
- package/speechflow-ui-db/package.json +9 -9
- package/speechflow-ui-st/package.json +9 -9
- package/speechflow-cli/dst/speechflow-node-t2t-ollama.js +0 -293
- package/speechflow-cli/dst/speechflow-node-t2t-ollama.js.map +0 -1
- package/speechflow-cli/dst/speechflow-node-t2t-openai.js.map +0 -1
- package/speechflow-cli/dst/speechflow-node-t2t-transformers.js.map +0 -1
- package/speechflow-cli/src/speechflow-node-t2t-ollama.ts +0 -281
- package/speechflow-cli/src/speechflow-node-t2t-openai.ts +0 -247
- package/speechflow-cli/src/speechflow-node-t2t-transformers.ts +0 -247

package/speechflow-cli/src/speechflow-node-t2a-amazon.ts

@@ -9,6 +9,7 @@ import Stream from "node:stream"
 
 /* external dependencies */
 import { getStreamAsBuffer } from "get-stream"
+import { Duration } from "luxon"
 import SpeexResampler from "speex-resampler"
 import {
     PollyClient, SynthesizeSpeechCommand,
@@ -25,9 +26,9 @@ export default class SpeechFlowNodeT2AAmazon extends SpeechFlowNode {
     public static name = "t2a-amazon"
 
     /* internal state */
-    private client:
-    private closing = false
+    private client: PollyClient | null = null
     private resampler: SpeexResampler | null = null
+    private closing = false
 
     /* construct node */
     constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
@@ -129,22 +130,43 @@ export default class SpeechFlowNodeT2AAmazon extends SpeechFlowNode {
                 }
                 if (Buffer.isBuffer(chunk.payload))
                     callback(new Error("invalid chunk payload type"))
-                else if (chunk.payload
+                else if (chunk.payload === "")
+                    callback()
+                else {
+                    let processTimeout: ReturnType<typeof setTimeout> | null = setTimeout(() => {
+                        processTimeout = null
+                        callback(new Error("AWS Polly API timeout"))
+                    }, 60 * 1000)
+                    const clearProcessTimeout = () => {
+                        if (processTimeout !== null) {
+                            clearTimeout(processTimeout)
+                            processTimeout = null
+                        }
+                    }
                     self.log("debug", `send data (${chunk.payload.length} bytes): "${chunk.payload}"`)
                     textToSpeech(chunk.payload as string).then((buffer) => {
-                        if (self.closing)
-
+                        if (self.closing) {
+                            clearProcessTimeout()
+                            callback(new Error("stream destroyed during processing"))
+                            return
+                        }
+                        /* calculate actual audio duration from PCM buffer size */
+                        const durationMs = util.audioBufferDuration(buffer,
+                            self.config.audioSampleRate, self.config.audioBitDepth) * 1000
+
+                        /* create new chunk with recalculated timestamps */
                         const chunkNew = chunk.clone()
-                        chunkNew.type
-                        chunkNew.payload
+                        chunkNew.type = "audio"
+                        chunkNew.payload = buffer
+                        chunkNew.timestampEnd = Duration.fromMillis(chunkNew.timestampStart.toMillis() + durationMs)
+                        clearProcessTimeout()
                         this.push(chunkNew)
                         callback()
                     }).catch((error: unknown) => {
-
+                        clearProcessTimeout()
+                        callback(util.ensureError(error, "AWS Polly processing failed"))
                     })
                 }
-                else
-                    callback()
             },
             final (callback) {
                 callback()
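
The t2a-amazon hunks above introduce a per-chunk watchdog that 2.0.1 applies uniformly across the text-to-speech nodes (the same code recurs in the ElevenLabs, Google, Kokoro and OpenAI nodes further down): a 60-second timeout that fails the chunk, a helper that clears the timer exactly once, and an end timestamp recalculated from the size of the produced PCM buffer. A minimal standalone sketch of that pattern follows; the `synthesize` stand-in, the simplified chunk shape, and the mono/16-bit duration formula are illustrative assumptions, not SpeechFlow's actual API.

```ts
/* minimal sketch of the 2.0.1 per-chunk watchdog pattern
   (illustrative names, not the actual SpeechFlow API) */

interface Chunk {
    payload: string | Buffer
    timestampStartMs: number
    timestampEndMs: number
}

async function processChunk (
    chunk: Chunk,
    synthesize: (text: string) => Promise<Buffer>,  /* stand-in for the cloud TTS call */
    push: (chunk: Chunk) => void,
    callback: (error?: Error) => void,
    sampleRate = 48000,                             /* assumed pipeline format: 48 kHz ... */
    bitDepth = 16                                   /* ... mono, 16-bit PCM */
): Promise<void> {
    /* empty payloads pass through without an API round-trip */
    if (chunk.payload === "") {
        callback()
        return
    }

    /* fail the chunk if the API does not answer within 60s */
    let processTimeout: ReturnType<typeof setTimeout> | null = setTimeout(() => {
        processTimeout = null
        callback(new Error("TTS API timeout"))
    }, 60 * 1000)
    const clearProcessTimeout = () => {
        if (processTimeout !== null) {
            clearTimeout(processTimeout)
            processTimeout = null
        }
    }

    try {
        const buffer = await synthesize(chunk.payload as string)
        if (processTimeout === null)
            return /* the timeout already failed this chunk */

        /* recalculate the end timestamp from the actual PCM duration */
        const durationMs = (buffer.byteLength / (sampleRate * (bitDepth / 8))) * 1000
        clearProcessTimeout()
        push({
            payload: buffer,
            timestampStartMs: chunk.timestampStartMs,
            timestampEndMs: chunk.timestampStartMs + durationMs
        })
        callback()
    }
    catch (error) {
        clearProcessTimeout()
        callback(error instanceof Error ? error : new Error("TTS processing failed"))
    }
}
```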

package/speechflow-cli/src/speechflow-node-t2a-elevenlabs.ts

@@ -24,8 +24,8 @@ export default class SpeechFlowNodeT2AElevenlabs extends SpeechFlowNode {
 
     /* internal state */
     private elevenlabs: ElevenLabs.ElevenLabsClient | null = null
-    private
-    private
+    private resampler: SpeexResampler | null = null
+    private closing = false
 
     /* construct node */
     constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
@@ -131,8 +131,8 @@ export default class SpeechFlowNodeT2AElevenlabs extends SpeechFlowNode {
             })
         }
 
-        /* establish resampler from ElevenLabs's
-           output to our standard audio sample rate (48KHz) */
+        /* establish resampler from ElevenLabs's tier-dependent
+           output sample rate to our standard audio sample rate (48KHz) */
         this.resampler = new SpeexResampler(1, maxSampleRate, this.config.audioSampleRate, 7)
 
         /* create transform stream and connect it to the ElevenLabs API */
@@ -147,6 +147,8 @@ export default class SpeechFlowNodeT2AElevenlabs extends SpeechFlowNode {
                     callback(new Error("stream already destroyed"))
                 else if (Buffer.isBuffer(chunk.payload))
                     callback(new Error("invalid chunk payload type"))
+                else if (chunk.payload === "")
+                    callback()
                 else {
                     let processTimeout: ReturnType<typeof setTimeout> | null = setTimeout(() => {
                         processTimeout = null

package/speechflow-cli/src/speechflow-node-t2a-google.ts (new file)

@@ -0,0 +1,203 @@
+/*
+** SpeechFlow - Speech Processing Flow Graph
+** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+*/
+
+/* standard dependencies */
+import Stream from "node:stream"
+
+/* external dependencies */
+import * as GoogleTTS from "@google-cloud/text-to-speech"
+import { Duration } from "luxon"
+import SpeexResampler from "speex-resampler"
+import * as arktype from "arktype"
+
+/* internal dependencies */
+import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
+import * as util from "./speechflow-util"
+
+/* SpeechFlow node for Google Cloud text-to-speech conversion */
+export default class SpeechFlowNodeT2AGoogle extends SpeechFlowNode {
+    /* declare official node name */
+    public static name = "t2a-google"
+
+    /* internal state */
+    private client: GoogleTTS.TextToSpeechClient | null = null
+    private resampler: SpeexResampler | null = null
+    private closing = false
+
+    /* construct node */
+    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
+        super(id, cfg, opts, args)
+
+        /* declare node configuration parameters */
+        this.configure({
+            key: { type: "string", val: process.env.SPEECHFLOW_GOOGLE_KEY ?? "" },
+            voice: { type: "string", pos: 0, val: "en-US-Neural2-J" },
+            language: { type: "string", pos: 1, val: "en-US" },
+            speed: { type: "number", pos: 2, val: 1.0, match: (n: number) => n >= 0.25 && n <= 4.0 },
+            pitch: { type: "number", pos: 3, val: 0.0, match: (n: number) => n >= -20.0 && n <= 20.0 }
+        })
+
+        /* validate API key */
+        if (this.params.key === "")
+            throw new Error("Google Cloud API credentials JSON key is required")
+
+        /* declare node input/output format */
+        this.input = "text"
+        this.output = "audio"
+    }
+
+    /* one-time status of node */
+    async status () {
+        return {}
+    }
+
+    /* open node */
+    async open () {
+        /* clear destruction flag */
+        this.closing = false
+
+        /* instantiate Google TTS client */
+        const data = util.run("Google Cloud API credentials key", () =>
+            JSON.parse(this.params.key))
+        const credentials = util.importObject("Google Cloud API credentials key",
+            data,
+            arktype.type({
+                project_id: "string",
+                private_key: "string",
+                client_email: "string"
+            })
+        )
+        this.client = new GoogleTTS.TextToSpeechClient({
+            credentials: {
+                private_key: credentials.private_key,
+                client_email: credentials.client_email
+            },
+            projectId: credentials.project_id
+        })
+
+        /* establish resampler from Google TTS's output sample rate
+           to our standard audio sample rate (48KHz) */
+        const googleSampleRate = 24000 /* Google TTS outputs 24kHz for LINEAR16 */
+        this.resampler = new SpeexResampler(1, googleSampleRate, this.config.audioSampleRate, 7)
+
+        /* perform text-to-speech operation with Google Cloud TTS API */
+        const textToSpeech = async (text: string) => {
+            this.log("info", `Google TTS: send text "${text}"`)
+            const [ response ] = await this.client!.synthesizeSpeech({
+                input: { text },
+                voice: {
+                    languageCode: this.params.language,
+                    name: this.params.voice
+                },
+                audioConfig: {
+                    audioEncoding: "LINEAR16",
+                    sampleRateHertz: googleSampleRate,
+                    speakingRate: this.params.speed,
+                    pitch: this.params.pitch
+                }
+            })
+            if (!response.audioContent)
+                throw new Error("no audio content returned from Google TTS")
+
+            /* convert response to buffer */
+            const buffer = Buffer.isBuffer(response.audioContent)
+                ? response.audioContent
+                : Buffer.from(response.audioContent)
+            this.log("info", `Google TTS: received audio (buffer length: ${buffer.byteLength})`)
+
+            /* resample from Google's sample rate to our standard rate */
+            const bufferResampled = this.resampler!.processChunk(buffer)
+            this.log("info", `Google TTS: forwarding resampled audio (buffer length: ${bufferResampled.byteLength})`)
+            return bufferResampled
+        }
+
+        /* create transform stream and connect it to the Google TTS API */
+        const self = this
+        this.stream = new Stream.Transform({
+            writableObjectMode: true,
+            readableObjectMode: true,
+            decodeStrings: false,
+            highWaterMark: 1,
+            async transform (chunk: SpeechFlowChunk, encoding, callback) {
+                if (self.closing)
+                    callback(new Error("stream already destroyed"))
+                else if (Buffer.isBuffer(chunk.payload))
+                    callback(new Error("invalid chunk payload type"))
+                else if (chunk.payload === "")
+                    callback()
+                else {
+                    let processTimeout: ReturnType<typeof setTimeout> | null = setTimeout(() => {
+                        processTimeout = null
+                        callback(new Error("Google TTS API timeout"))
+                    }, 60 * 1000)
+                    const clearProcessTimeout = () => {
+                        if (processTimeout !== null) {
+                            clearTimeout(processTimeout)
+                            processTimeout = null
+                        }
+                    }
+                    try {
+                        if (self.closing) {
+                            clearProcessTimeout()
+                            callback(new Error("stream destroyed during processing"))
+                            return
+                        }
+                        const buffer = await textToSpeech(chunk.payload as string)
+                        if (self.closing) {
+                            clearProcessTimeout()
+                            callback(new Error("stream destroyed during processing"))
+                            return
+                        }
+
+                        /* calculate actual audio duration from PCM buffer size */
+                        const durationMs = util.audioBufferDuration(buffer,
+                            self.config.audioSampleRate, self.config.audioBitDepth) * 1000
+
+                        /* create new chunk with recalculated timestamps */
+                        const chunkNew = chunk.clone()
+                        chunkNew.type = "audio"
+                        chunkNew.payload = buffer
+                        chunkNew.timestampEnd = Duration.fromMillis(chunkNew.timestampStart.toMillis() + durationMs)
+                        clearProcessTimeout()
+                        this.push(chunkNew)
+                        callback()
+                    }
+                    catch (error) {
+                        clearProcessTimeout()
+                        callback(util.ensureError(error, "Google TTS processing failed"))
+                    }
+                }
+            },
+            final (callback) {
+                callback()
+            }
+        })
+    }
+
+    /* close node */
+    async close () {
+        /* indicate closing */
+        this.closing = true
+
+        /* shutdown stream */
+        if (this.stream !== null) {
+            await util.destroyStream(this.stream)
+            this.stream = null
+        }
+
+        /* destroy resampler */
+        if (this.resampler !== null)
+            this.resampler = null
+
+        /* destroy Google TTS client */
+        if (this.client !== null) {
+            await this.client.close().catch((error) => {
+                this.log("warning", `error closing Google TTS client: ${error}`)
+            })
+            this.client = null
+        }
+    }
+}

package/speechflow-cli/src/speechflow-node-t2a-kokoro.ts

@@ -9,6 +9,7 @@ import Stream from "node:stream"
 
 /* external dependencies */
 import { KokoroTTS } from "kokoro-js"
+import { Duration } from "luxon"
 import SpeexResampler from "speex-resampler"
 
 /* internal dependencies */
@@ -21,9 +22,9 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
     public static name = "t2a-kokoro"
 
     /* internal state */
-    private kokoro:
-    private closing = false
+    private kokoro: KokoroTTS | null = null
     private resampler: SpeexResampler | null = null
+    private closing = false
 
     /* construct node */
     constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
@@ -122,9 +123,7 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
            }
 
            /* resample audio samples from PCM/I16/24Khz to PCM/I16/48KHz */
-
-
-            return buffer2
+            return this.resampler!.processChunk(buffer1)
         }
 
         /* create transform stream and connect it to the Kokoro API */
@@ -139,18 +138,42 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
                     callback(new Error("stream already destroyed"))
                 else if (Buffer.isBuffer(chunk.payload))
                     callback(new Error("invalid chunk payload type"))
+                else if (chunk.payload === "")
+                    callback()
                 else {
+                    let processTimeout: ReturnType<typeof setTimeout> | null = setTimeout(() => {
+                        processTimeout = null
+                        callback(new Error("Kokoro TTS timeout"))
+                    }, 60 * 1000)
+                    const clearProcessTimeout = () => {
+                        if (processTimeout !== null) {
+                            clearTimeout(processTimeout)
+                            processTimeout = null
+                        }
+                    }
                     text2speech(chunk.payload).then((buffer) => {
-                        if (self.closing)
-
+                        if (self.closing) {
+                            clearProcessTimeout()
+                            callback(new Error("stream destroyed during processing"))
+                            return
+                        }
                         self.log("info", `Kokoro: received audio (buffer length: ${buffer.byteLength})`)
+
+                        /* calculate actual audio duration from PCM buffer size */
+                        const durationMs = util.audioBufferDuration(buffer,
+                            self.config.audioSampleRate, self.config.audioBitDepth) * 1000
+
+                        /* create new chunk with recalculated timestamps */
                         const chunkNew = chunk.clone()
-                        chunkNew.type
-                        chunkNew.payload
+                        chunkNew.type = "audio"
+                        chunkNew.payload = buffer
+                        chunkNew.timestampEnd = Duration.fromMillis(chunkNew.timestampStart.toMillis() + durationMs)
+                        clearProcessTimeout()
                        this.push(chunkNew)
                        callback()
                     }).catch((error: unknown) => {
-
+                        clearProcessTimeout()
+                        callback(util.ensureError(error, "Kokoro processing failed"))
                     })
                 }
             },

package/speechflow-cli/src/speechflow-node-t2a-openai.ts (new file)

@@ -0,0 +1,176 @@
+/*
+** SpeechFlow - Speech Processing Flow Graph
+** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+*/
+
+/* standard dependencies */
+import Stream from "node:stream"
+
+/* external dependencies */
+import OpenAI from "openai"
+import { Duration } from "luxon"
+import SpeexResampler from "speex-resampler"
+
+/* internal dependencies */
+import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
+import * as util from "./speechflow-util"
+
+/* SpeechFlow node for OpenAI text-to-speech conversion */
+export default class SpeechFlowNodeT2AOpenAI extends SpeechFlowNode {
+    /* declare official node name */
+    public static name = "t2a-openai"
+
+    /* internal state */
+    private openai: OpenAI | null = null
+    private resampler: SpeexResampler | null = null
+    private closing = false
+
+    /* construct node */
+    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
+        super(id, cfg, opts, args)
+
+        /* declare node configuration parameters */
+        this.configure({
+            key: { type: "string", val: process.env.SPEECHFLOW_OPENAI_KEY },
+            api: { type: "string", val: "https://api.openai.com/v1", match: /^https?:\/\/.+/ },
+            voice: { type: "string", val: "alloy", pos: 0, match: /^(?:alloy|echo|fable|onyx|nova|shimmer)$/ },
+            model: { type: "string", val: "tts-1", pos: 1, match: /^(?:tts-1|tts-1-hd)$/ },
+            speed: { type: "number", val: 1.0, pos: 2, match: (n: number) => n >= 0.25 && n <= 4.0 }
+        })
+
+        /* sanity check parameters */
+        if (!this.params.key)
+            throw new Error("OpenAI API key not configured")
+
+        /* declare node input/output format */
+        this.input = "text"
+        this.output = "audio"
+    }
+
+    /* one-time status of node */
+    async status () {
+        return {}
+    }
+
+    /* open node */
+    async open () {
+        /* clear destruction flag */
+        this.closing = false
+
+        /* establish OpenAI API connection */
+        this.openai = new OpenAI({
+            baseURL: this.params.api,
+            apiKey: this.params.key,
+            timeout: 60000
+        })
+
+        /* establish resampler from OpenAI's 24Khz PCM output
+           to our standard audio sample rate (48KHz) */
+        this.resampler = new SpeexResampler(1, 24000, this.config.audioSampleRate, 7)
+
+        /* perform text-to-speech operation with OpenAI API */
+        const textToSpeech = async (text: string) => {
+            this.log("info", `OpenAI TTS: send text "${text}"`)
+            const response = await this.openai!.audio.speech.create({
+                model: this.params.model,
+                voice: this.params.voice,
+                input: text,
+                response_format: "pcm",
+                speed: this.params.speed
+            })
+
+            /* convert response to buffer (PCM 24kHz, 16-bit, little-endian) */
+            const arrayBuffer = await response.arrayBuffer()
+            const buffer = Buffer.from(arrayBuffer)
+            this.log("info", `OpenAI TTS: received audio (buffer length: ${buffer.byteLength})`)
+
+            /* resample from 24kHz to 48kHz */
+            const bufferResampled = this.resampler!.processChunk(buffer)
+            this.log("info", `OpenAI TTS: forwarding resampled audio (buffer length: ${bufferResampled.byteLength})`)
+            return bufferResampled
+        }
+
+        /* create transform stream and connect it to the OpenAI API */
+        const self = this
+        this.stream = new Stream.Transform({
+            writableObjectMode: true,
+            readableObjectMode: true,
+            decodeStrings: false,
+            highWaterMark: 1,
+            async transform (chunk: SpeechFlowChunk, encoding, callback) {
+                if (self.closing)
+                    callback(new Error("stream already destroyed"))
+                else if (Buffer.isBuffer(chunk.payload))
+                    callback(new Error("invalid chunk payload type"))
+                else if (chunk.payload === "")
+                    callback()
+                else {
+                    let processTimeout: ReturnType<typeof setTimeout> | null = setTimeout(() => {
+                        processTimeout = null
+                        callback(new Error("OpenAI TTS API timeout"))
+                    }, 60 * 1000)
+                    const clearProcessTimeout = () => {
+                        if (processTimeout !== null) {
+                            clearTimeout(processTimeout)
+                            processTimeout = null
+                        }
+                    }
+                    try {
+                        if (self.closing) {
+                            clearProcessTimeout()
+                            callback(new Error("stream destroyed during processing"))
+                            return
+                        }
+                        const buffer = await textToSpeech(chunk.payload as string)
+                        if (self.closing) {
+                            clearProcessTimeout()
+                            callback(new Error("stream destroyed during processing"))
+                            return
+                        }
+
+                        /* calculate actual audio duration from PCM buffer size */
+                        const durationMs = util.audioBufferDuration(buffer,
+                            self.config.audioSampleRate, self.config.audioBitDepth) * 1000
+
+                        /* create new chunk with recalculated timestamps */
+                        const chunkNew = chunk.clone()
+                        chunkNew.type = "audio"
+                        chunkNew.payload = buffer
+                        chunkNew.timestampEnd = Duration.fromMillis(chunkNew.timestampStart.toMillis() + durationMs)
+                        clearProcessTimeout()
+                        this.push(chunkNew)
+                        callback()
+                    }
+                    catch (error) {
+                        clearProcessTimeout()
+                        callback(util.ensureError(error, "OpenAI TTS processing failed"))
+                    }
+                }
+            },
+            final (callback) {
+                callback()
+            }
+        })
+    }
+
+    /* close node */
+    async close () {
+        /* indicate closing */
+        this.closing = true
+
+        /* shutdown stream */
+        if (this.stream !== null) {
+            await util.destroyStream(this.stream)
+            this.stream = null
+        }
+
+        /* destroy resampler */
+        if (this.resampler !== null)
+            this.resampler = null
+
+        /* destroy OpenAI API */
+        if (this.openai !== null)
+            this.openai = null
+    }
+}