speechflow 2.0.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +23 -0
- package/README.md +43 -14
- package/etc/speechflow.yaml +20 -48
- package/etc/stx.conf +2 -2
- package/package.json +5 -5
- package/speechflow-cli/dst/speechflow-node-a2a-gtcrn-wt.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-node-a2a-gtcrn-wt.js +60 -0
- package/speechflow-cli/dst/speechflow-node-a2a-gtcrn-wt.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-a2a-gtcrn.d.ts +15 -0
- package/speechflow-cli/dst/speechflow-node-a2a-gtcrn.js +234 -0
- package/speechflow-cli/dst/speechflow-node-a2a-gtcrn.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-a2a-meter.js +2 -2
- package/speechflow-cli/dst/speechflow-node-a2a-meter.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2t-amazon.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-node-a2t-amazon.js +19 -11
- package/speechflow-cli/dst/speechflow-node-a2t-amazon.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2t-assemblyai.d.ts +16 -0
- package/speechflow-cli/dst/speechflow-node-a2t-assemblyai.js +275 -0
- package/speechflow-cli/dst/speechflow-node-a2t-assemblyai.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-a2t-deepgram.js +32 -15
- package/speechflow-cli/dst/speechflow-node-a2t-deepgram.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2t-openai.js +7 -6
- package/speechflow-cli/dst/speechflow-node-a2t-openai.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-amazon.js +2 -4
- package/speechflow-cli/dst/speechflow-node-t2a-amazon.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js +3 -3
- package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-google.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-amazon.js +9 -8
- package/speechflow-cli/dst/speechflow-node-t2t-amazon.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-deepl.js +3 -3
- package/speechflow-cli/dst/speechflow-node-t2t-deepl.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-opus.js +5 -5
- package/speechflow-cli/dst/speechflow-node-t2t-opus.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-profanity.js +26 -6
- package/speechflow-cli/dst/speechflow-node-t2t-profanity.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-sentence.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-node-t2t-sentence.js +72 -5
- package/speechflow-cli/dst/speechflow-node-t2t-sentence.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-spellcheck.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-summary.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-translate.js +50 -25
- package/speechflow-cli/dst/speechflow-node-t2t-translate.js.map +1 -1
- package/speechflow-cli/etc/oxlint.jsonc +9 -1
- package/speechflow-cli/etc/stx.conf +1 -1
- package/speechflow-cli/package.d/sherpa-onnx+1.12.23.patch +12 -0
- package/speechflow-cli/package.json +23 -19
- package/speechflow-cli/src/lib.d.ts +30 -4
- package/speechflow-cli/src/speechflow-node-a2a-gtcrn-wt.ts +68 -0
- package/speechflow-cli/src/speechflow-node-a2a-gtcrn.ts +219 -0
- package/speechflow-cli/src/speechflow-node-a2a-meter.ts +2 -2
- package/speechflow-cli/src/speechflow-node-a2t-amazon.ts +21 -12
- package/speechflow-cli/src/speechflow-node-a2t-deepgram.ts +33 -15
- package/speechflow-cli/src/speechflow-node-a2t-openai.ts +9 -8
- package/speechflow-cli/src/speechflow-node-t2a-amazon.ts +2 -4
- package/speechflow-cli/src/speechflow-node-t2a-elevenlabs.ts +3 -3
- package/speechflow-cli/src/speechflow-node-t2a-google.ts +2 -2
- package/speechflow-cli/src/speechflow-node-t2a-supertonic.ts +1 -1
- package/speechflow-cli/src/speechflow-node-t2t-amazon.ts +11 -10
- package/speechflow-cli/src/speechflow-node-t2t-deepl.ts +3 -3
- package/speechflow-cli/src/speechflow-node-t2t-opus.ts +6 -6
- package/speechflow-cli/src/speechflow-node-t2t-profanity.ts +30 -11
- package/speechflow-cli/src/speechflow-node-t2t-punctuation.ts +1 -1
- package/speechflow-cli/src/speechflow-node-t2t-sentence.ts +86 -10
- package/speechflow-cli/src/speechflow-node-t2t-spellcheck.ts +1 -1
- package/speechflow-cli/src/speechflow-node-t2t-summary.ts +1 -1
- package/speechflow-cli/src/speechflow-node-t2t-translate.ts +54 -29
- package/speechflow-ui-db/dst/index.css +1 -1
- package/speechflow-ui-db/dst/index.js +13 -13
- package/speechflow-ui-db/package.json +16 -15
- package/speechflow-ui-db/src/app.vue +62 -17
- package/speechflow-ui-st/dst/index.css +1 -1
- package/speechflow-ui-st/dst/index.js +32 -32
- package/speechflow-ui-st/package.json +17 -16
- package/speechflow-ui-st/src/app.vue +9 -8
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/*
|
|
2
|
+
** SpeechFlow - Speech Processing Flow Graph
|
|
3
|
+
** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
|
|
4
|
+
** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
/* standard dependencies */
|
|
8
|
+
import { parentPort, workerData } from "node:worker_threads"
|
|
9
|
+
|
|
10
|
+
/* external dependencies */
|
|
11
|
+
import SherpaOnnx from "sherpa-onnx"
|
|
12
|
+
import type {
|
|
13
|
+
SherpaOnnxDenoiserConfig,
|
|
14
|
+
SherpaOnnxOfflineSpeechDenoiser
|
|
15
|
+
} from "sherpa-onnx"
|
|
16
|
+
|
|
17
|
+
/*  model path is handed over by the parent thread via workerData  */
const modelPath: string = workerData.modelPath

/*  GTCRN denoiser instance (assigned once during initialization)  */
let denoiser: SherpaOnnxOfflineSpeechDenoiser

/*  helper: forward a log message to the parent thread  */
function log (level: string, message: string): void {
    parentPort!.postMessage({ type: "log", level, message })
}

/*  one-time initialization: create the denoiser and report the
    outcome ("ready" or "failed") to the parent thread  */
async function initialize (): Promise<void> {
    try {
        /*  create denoiser (single-threaded, GTCRN model only)  */
        const config: SherpaOnnxDenoiserConfig = {
            model: { gtcrn: { model: modelPath } },
            numThreads: 1
        }
        denoiser = SherpaOnnx.createOfflineSpeechDenoiser(config)
        log("info", "GTCRN denoiser initialized")
        parentPort!.postMessage({ type: "ready" })
    }
    catch (err) {
        /*  report the failure and stop this worker thread  */
        parentPort!.postMessage({ type: "failed", message: `failed to initialize GTCRN: ${err}` })
        process.exit(1)
    }
}
void initialize()

/*  dispatch messages received from the parent thread  */
parentPort!.on("message", (msg) => {
    switch (msg.type) {
        case "process": {
            /*  process with GTCRN denoiser
                NOTICE: GTCRN can also resample our input, but will always
                produce 16KHz output, so we already fixate 16KHz input here!  */
            const denoised = denoiser.run(msg.samples, 16000)

            /*  copy into a fresh Float32Array, so its ArrayBuffer can be
                transferred (instead of cloned) back to the parent  */
            const data = new Float32Array(denoised.samples)
            parentPort!.postMessage({ type: "process-done", id: msg.id, data }, [ data.buffer ])
            break
        }
        case "close":
            /*  shutdown this worker thread  */
            process.exit(0)
    }
})
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
/*
|
|
2
|
+
** SpeechFlow - Speech Processing Flow Graph
|
|
3
|
+
** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
|
|
4
|
+
** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
/* standard dependencies */
|
|
8
|
+
import fs from "node:fs"
|
|
9
|
+
import path from "node:path"
|
|
10
|
+
import Stream from "node:stream"
|
|
11
|
+
import { Worker } from "node:worker_threads"
|
|
12
|
+
|
|
13
|
+
/* external dependencies */
|
|
14
|
+
import axios from "axios"
|
|
15
|
+
import SpeexResampler from "speex-resampler"
|
|
16
|
+
|
|
17
|
+
/* internal dependencies */
|
|
18
|
+
import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
|
|
19
|
+
import * as util from "./speechflow-util"
|
|
20
|
+
|
|
21
|
+
/*  SpeechFlow node for GTCRN based noise suppression in audio-to-audio passing.
    Audio chunks are resampled to 16KHz, denoised by a worker thread
    (speechflow-node-a2a-gtcrn-wt.js) and resampled back to the
    configured SpeechFlow sample rate.  */
export default class SpeechFlowNodeA2AGTCRN extends SpeechFlowNode {
    /*  declare official node name  */
    public static name = "a2a-gtcrn"

    /*  internal state  */
    private closing = false
    private worker: Worker | null = null
    private resamplerDown: SpeexResampler | null = null
    private resamplerUp: SpeexResampler | null = null

    /*  construct node  */
    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
        super(id, cfg, opts, args)

        /*  declare node configuration parameters (none)  */
        this.configure({})

        /*  declare node input/output format  */
        this.input  = "audio"
        this.output = "audio"
    }

    /*  ensure the GTCRN ONNX model is available in the cache directory
        (downloading it on first use) and return its absolute path  */
    private async ensureModelFile (): Promise<string> {
        const modelUrl = "https://github.com/k2-fsa/sherpa-onnx/" +
            "releases/download/speech-enhancement-models/gtcrn_simple.onnx"
        const modelDir  = path.join(this.config.cacheDir, "gtcrn")
        const modelPath = path.resolve(modelDir, "gtcrn_simple.onnx")

        /*  short-circuit if the model is already cached  */
        const stat = await fs.promises.stat(modelPath).catch(() => null)
        if (stat !== null)
            return modelPath

        /*  download the model into memory  */
        this.log("info", `GTCRN model downloading from "${modelUrl}"`)
        await fs.promises.mkdir(modelDir, { recursive: true })
        const response = await axios.get(modelUrl, {
            responseType: "arraybuffer",
            onDownloadProgress: (progressEvent) => {
                if (progressEvent.total) {
                    const percent = (progressEvent.loaded / progressEvent.total) * 100
                    this.log("info", `GTCRN model download: ${percent.toFixed(1)}%`)
                }
            }
        })

        /*  write to a temporary file first and rename afterwards, so an
            interrupted write never leaves a truncated model file behind
            which would pass the existence check above on the next start  */
        const modelPathTmp = `${modelPath}.${Date.now()}.tmp`
        try {
            await fs.promises.writeFile(modelPathTmp, Buffer.from(response.data))
            await fs.promises.rename(modelPathTmp, modelPath)
        }
        catch (err) {
            await fs.promises.unlink(modelPathTmp).catch(() => {})
            throw err
        }
        this.log("info", `GTCRN model downloaded to "${modelPath}"`)
        return modelPath
    }

    /*  open node  */
    async open () {
        /*  clear destruction flag  */
        this.closing = false

        /*  ensure GTCRN ONNX model is available  */
        const modelPath = await this.ensureModelFile()

        /*  establish resamplers from SpeechFlow's internal sample rate
            to GTCRN's required 16KHz format and back
            (kept also in local constants, so the stream closures below
            never dereference a field which close() already reset to null)  */
        const resamplerDown = new SpeexResampler(1, this.config.audioSampleRate, 16000, 7)
        const resamplerUp   = new SpeexResampler(1, 16000, this.config.audioSampleRate, 7)
        this.resamplerDown = resamplerDown
        this.resamplerUp   = resamplerUp

        /*  bookkeeping for requests currently processed by the worker  */
        type PendingRequest = {
            resolve: (arr: Float32Array<ArrayBuffer>) => void,
            reject:  (err: Error) => void
        }
        const pending = new Map<string, PendingRequest>()
        let workerExited = false

        /*  initialize worker  */
        const worker = new Worker(path.resolve(__dirname, "speechflow-node-a2a-gtcrn-wt.js"), {
            workerData: { modelPath }
        })
        this.worker = worker
        worker.on("error", (err) => {
            this.log("error", `GTCRN worker thread error: ${err}`)
            this.stream?.emit("error", err)
        })
        worker.on("exit", (code) => {
            workerExited = true
            if (code !== 0)
                this.log("error", `GTCRN worker thread exited with error code ${code}`)
            else
                this.log("info", `GTCRN worker thread exited with regular code ${code}`)

            /*  settle all in-flight requests: on a regular close they are just
                dropped (the stream is destroyed anyway), but on an unexpected
                exit they have to be rejected, or the stream would stall forever  */
            const inflight = [ ...pending.values() ]
            pending.clear()
            if (!this.closing)
                for (const request of inflight)
                    request.reject(new Error(`GTCRN worker thread exited with code ${code} while processing chunk`))
        })

        /*  wait for worker to be ready  */
        try {
            await new Promise<void>((resolve, reject) => {
                /*  settle the promise exactly once and drop all temporary listeners  */
                const finish = (err?: Error) => {
                    clearTimeout(timeout)
                    worker.off("message", onMessage)
                    worker.off("error", onError)
                    worker.off("exit", onExit)
                    if (err !== undefined)
                        reject(err)
                    else
                        resolve()
                }
                const onMessage = (msg: any) => {
                    if (typeof msg !== "object" || msg === null)
                        return
                    if (msg.type === "log")
                        this.log(msg.level, msg.message)
                    else if (msg.type === "ready")
                        finish()
                    else if (msg.type === "failed")
                        finish(new Error(msg.message ?? "GTCRN worker thread initialization failed"))
                }
                const onError = (err: Error) => {
                    finish(err)
                }
                const onExit = (code: number) => {
                    finish(new Error(`GTCRN worker thread exited with code ${code} during initialization`))
                }
                const timeout = setTimeout(() => {
                    finish(new Error("GTCRN worker thread initialization timeout"))
                }, 60 * 1000)
                worker.on("message", onMessage)
                worker.once("error", onError)
                worker.once("exit", onExit)
            })
        }
        catch (err) {
            /*  do not leave a half-initialized worker thread behind  */
            this.worker = null
            await worker.terminate().catch(() => {})
            throw err
        }

        /*  receive message from worker  */
        worker.on("message", (msg: any) => {
            if (typeof msg === "object" && msg !== null && msg.type === "process-done") {
                const request = pending.get(msg.id)
                pending.delete(msg.id)
                if (request)
                    request.resolve(msg.data)
                else
                    this.log("warning", `GTCRN worker thread sent back unexpected id: ${msg.id}`)
            }
            else if (typeof msg === "object" && msg !== null && msg.type === "log")
                this.log(msg.level, msg.message)
            else
                this.log("warning", `GTCRN worker thread sent unexpected message: ${JSON.stringify(msg)}`)
        })

        /*  send message to worker
            (resolves with the denoised 16KHz samples; the input buffer
            is transferred to the worker and is unusable afterwards)  */
        let seq = 0
        const workerProcess = async (samples: Float32Array<ArrayBuffer>) => {
            if (this.closing)
                return samples
            if (workerExited)
                throw new Error("GTCRN worker thread is no longer running")
            const id = `${seq++}`
            return new Promise<Float32Array<ArrayBuffer>>((resolve, reject) => {
                pending.set(id, { resolve, reject })
                try {
                    worker.postMessage({ type: "process", id, samples }, [ samples.buffer ])
                }
                catch (err) {
                    pending.delete(id)
                    reject(util.ensureError(err))
                }
            })
        }

        /*  denoise a single audio payload: all steps run inside one
            async function, so synchronous failures of the resamplers or
            converters are reported through the same rejection path  */
        const denoise = async (payload: Buffer): Promise<Buffer> => {
            /*  resample Buffer from SpeechFlow's sample rate to 16KHz (GTCRN)  */
            const resampledDown = resamplerDown.processChunk(payload)

            /*  convert Buffer into Float32Array  */
            const samples = util.convertBufToF32(resampledDown)

            /*  process with GTCRN  */
            const samplesDenoised = await workerProcess(samples)

            /*  convert Float32Array into Buffer  */
            const buf = util.convertF32ToBuf(samplesDenoised)

            /*  resample Buffer from 16KHz (GTCRN) back to SpeechFlow's sample rate  */
            return resamplerUp.processChunk(buf)
        }

        /*  establish a transform stream  */
        const self = this
        this.stream = new Stream.Transform({
            readableObjectMode: true,
            writableObjectMode: true,
            decodeStrings:      false,
            transform (chunk: SpeechFlowChunk & { payload: Buffer }, encoding, callback) {
                if (self.closing) {
                    callback(new Error("stream already destroyed"))
                    return
                }
                if (!Buffer.isBuffer(chunk.payload)) {
                    callback(new Error("invalid chunk payload type"))
                    return
                }
                denoise(chunk.payload).then((payload) => {
                    /*  update and forward chunk  */
                    chunk.payload = payload
                    this.push(chunk)
                    callback()
                }).catch((err: unknown) => {
                    const error = util.ensureError(err)
                    self.log("warning", `processing of chunk failed: ${error.message}`)
                    callback(error)
                })
            },
            final (callback) {
                callback()
            }
        })
    }

    /*  close node  */
    async close () {
        /*  indicate closing  */
        this.closing = true

        /*  shutdown worker  */
        if (this.worker !== null) {
            this.worker.terminate()
            this.worker = null
        }

        /*  shutdown stream  */
        if (this.stream !== null) {
            await util.destroyStream(this.stream)
            this.stream = null
        }

        /*  release resamplers  */
        this.resamplerDown = null
        this.resamplerUp   = null
    }
}
|
|
@@ -81,11 +81,11 @@ export default class SpeechFlowNodeA2AMeter extends SpeechFlowNode {
|
|
|
81
81
|
|
|
82
82
|
/* grab the accumulated chunk data */
|
|
83
83
|
const chunkData = this.chunkBuffer
|
|
84
|
-
this.chunkBuffer =
|
|
84
|
+
this.chunkBuffer = chunkData.subarray(samplesPerChunk)
|
|
85
85
|
|
|
86
86
|
/* update internal audio sample sliding window for LUFS-M */
|
|
87
87
|
if (chunkData.length > sampleWindow.length)
|
|
88
|
-
sampleWindow.set(chunkData.subarray(
|
|
88
|
+
sampleWindow.set(chunkData.subarray(0, sampleWindow.length), 0)
|
|
89
89
|
else {
|
|
90
90
|
sampleWindow.set(sampleWindow.subarray(chunkData.length), 0)
|
|
91
91
|
sampleWindow.set(chunkData, sampleWindow.length - chunkData.length)
|
|
@@ -29,7 +29,7 @@ class AsyncQueue<T> {
|
|
|
29
29
|
const resolve = this.resolvers.shift()
|
|
30
30
|
if (resolve) {
|
|
31
31
|
if (v !== null)
|
|
32
|
-
resolve({ value: v })
|
|
32
|
+
resolve({ value: v, done: false })
|
|
33
33
|
else
|
|
34
34
|
resolve({ value: null, done: true })
|
|
35
35
|
}
|
|
@@ -70,6 +70,7 @@ export default class SpeechFlowNodeA2TAmazon extends SpeechFlowNode {
|
|
|
70
70
|
/* internal state */
|
|
71
71
|
private client: TranscribeStreamingClient | null = null
|
|
72
72
|
private clientStream: AsyncIterable<TranscriptResultStream> | null = null
|
|
73
|
+
private audioQueue: AsyncQueue<Uint8Array> | null = null
|
|
73
74
|
private closing = false
|
|
74
75
|
private queue: util.SingleQueue<SpeechFlowChunk | null> | null = null
|
|
75
76
|
|
|
@@ -127,7 +128,8 @@ export default class SpeechFlowNodeA2TAmazon extends SpeechFlowNode {
|
|
|
127
128
|
})
|
|
128
129
|
|
|
129
130
|
/* create an AudioStream for Amazon Transcribe */
|
|
130
|
-
|
|
131
|
+
this.audioQueue = new AsyncQueue<Uint8Array>()
|
|
132
|
+
const audioQueue = this.audioQueue
|
|
131
133
|
const audioStream = (async function * (q: AsyncQueue<Uint8Array>): AsyncIterable<AudioStream> {
|
|
132
134
|
for await (const chunk of q) {
|
|
133
135
|
yield { AudioEvent: { AudioChunk: chunk } }
|
|
@@ -173,11 +175,11 @@ export default class SpeechFlowNodeA2TAmazon extends SpeechFlowNode {
|
|
|
173
175
|
return prev
|
|
174
176
|
}, new Map<string, any>())
|
|
175
177
|
if (this.params.interim) {
|
|
176
|
-
const words = []
|
|
178
|
+
const words: { word: string, start: Duration, end: Duration }[] = []
|
|
177
179
|
for (const item of alt.Items ?? []) {
|
|
178
180
|
if (item.Type === "pronunciation") {
|
|
179
181
|
words.push({
|
|
180
|
-
word: item.Content,
|
|
182
|
+
word: item.Content ?? "",
|
|
181
183
|
start: Duration.fromMillis((item.StartTime ?? 0) * 1000).plus(this.timeZeroOffset),
|
|
182
184
|
end: Duration.fromMillis((item.EndTime ?? 0) * 1000).plus(this.timeZeroOffset)
|
|
183
185
|
})
|
|
@@ -273,10 +275,10 @@ export default class SpeechFlowNodeA2TAmazon extends SpeechFlowNode {
|
|
|
273
275
|
/* indicate closing first to stop all async operations */
|
|
274
276
|
this.closing = true
|
|
275
277
|
|
|
276
|
-
/*
|
|
277
|
-
if (this.
|
|
278
|
-
this.
|
|
279
|
-
this.
|
|
278
|
+
/* shutdown stream */
|
|
279
|
+
if (this.stream !== null) {
|
|
280
|
+
await util.destroyStream(this.stream)
|
|
281
|
+
this.stream = null
|
|
280
282
|
}
|
|
281
283
|
|
|
282
284
|
/* close Amazon Transcribe connection */
|
|
@@ -285,10 +287,17 @@ export default class SpeechFlowNodeA2TAmazon extends SpeechFlowNode {
|
|
|
285
287
|
this.client = null
|
|
286
288
|
}
|
|
287
289
|
|
|
288
|
-
/*
|
|
289
|
-
if (this.
|
|
290
|
-
|
|
291
|
-
this.
|
|
290
|
+
/* close audio queue */
|
|
291
|
+
if (this.audioQueue !== null) {
|
|
292
|
+
this.audioQueue.push(null)
|
|
293
|
+
this.audioQueue.destroy()
|
|
294
|
+
this.audioQueue = null
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
/* signal EOF to any pending read operations */
|
|
298
|
+
if (this.queue !== null) {
|
|
299
|
+
this.queue.write(null)
|
|
300
|
+
this.queue = null
|
|
292
301
|
}
|
|
293
302
|
}
|
|
294
303
|
}
|
|
@@ -37,7 +37,8 @@ export default class SpeechFlowNodeA2TDeepgram extends SpeechFlowNode {
|
|
|
37
37
|
model: { type: "string", val: "nova-2", pos: 0 },
|
|
38
38
|
version: { type: "string", val: "latest", pos: 1 },
|
|
39
39
|
language: { type: "string", val: "multi", pos: 2 },
|
|
40
|
-
interim: { type: "boolean", val: false, pos: 3 }
|
|
40
|
+
interim: { type: "boolean", val: false, pos: 3 },
|
|
41
|
+
keywords: { type: "string", val: "", pos: 4 }
|
|
41
42
|
})
|
|
42
43
|
|
|
43
44
|
/* sanity check parameters */
|
|
@@ -86,34 +87,51 @@ export default class SpeechFlowNodeA2TDeepgram extends SpeechFlowNode {
|
|
|
86
87
|
/* create a store for the meta information */
|
|
87
88
|
const metastore = new util.TimeStore<Map<string, any>>()
|
|
88
89
|
|
|
89
|
-
/*
|
|
90
|
-
const
|
|
91
|
-
let language = "en"
|
|
92
|
-
if (this.params.language !== "en") {
|
|
93
|
-
if (this.params.model.match(/^nova-2/))
|
|
94
|
-
language = this.params.language
|
|
95
|
-
else if (this.params.model.match(/^nova-3/))
|
|
96
|
-
language = "multi"
|
|
97
|
-
}
|
|
98
|
-
this.dg = deepgram.listen.live({
|
|
90
|
+
/* configure Deepgram connection options */
|
|
91
|
+
const options: Deepgram.LiveSchema = {
|
|
99
92
|
mip_opt_out: true,
|
|
100
93
|
model: this.params.model,
|
|
101
94
|
version: this.params.version,
|
|
102
|
-
language,
|
|
103
95
|
channels: this.config.audioChannels,
|
|
104
96
|
sample_rate: this.config.audioSampleRate,
|
|
105
97
|
encoding: "linear16",
|
|
106
98
|
multichannel: false,
|
|
107
99
|
endpointing: false,
|
|
108
100
|
interim_results: this.params.interim,
|
|
109
|
-
smart_format:
|
|
101
|
+
smart_format: false,
|
|
110
102
|
punctuate: true,
|
|
111
103
|
filler_words: true,
|
|
112
|
-
numerals:
|
|
104
|
+
numerals: false,
|
|
113
105
|
diarize: false,
|
|
114
106
|
profanity_filter: false,
|
|
115
107
|
redact: false
|
|
116
|
-
}
|
|
108
|
+
}
|
|
109
|
+
const model = this.params.model as string
|
|
110
|
+
const language = this.params.language as string
|
|
111
|
+
const keywords = this.params.keywords as string
|
|
112
|
+
if (model.match(/^nova-2/) && language !== "en")
|
|
113
|
+
options.language = this.params.language
|
|
114
|
+
else if (model.match(/^nova-3/) && language !== "en")
|
|
115
|
+
options.language = "multi"
|
|
116
|
+
else
|
|
117
|
+
options.language = "en"
|
|
118
|
+
if (keywords !== "") {
|
|
119
|
+
if (model.match(/^nova-2/))
|
|
120
|
+
options.keywords = keywords.split(/(?:\s+|\s*,\s*)/).map((kw) => {
|
|
121
|
+
let boost = 2
|
|
122
|
+
if (kw.startsWith("-")) {
|
|
123
|
+
kw = kw.slice(1)
|
|
124
|
+
boost = -4
|
|
125
|
+
}
|
|
126
|
+
return `${kw}:${boost}`
|
|
127
|
+
})
|
|
128
|
+
else if (model.match(/^nova-3/))
|
|
129
|
+
options.keyterm = keywords.split(/(?:\s+|\s*,\s*)/).join(" ")
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
/* connect to Deepgram API */
|
|
133
|
+
const deepgram = Deepgram.createClient(this.params.key)
|
|
134
|
+
this.dg = deepgram.listen.live(options)
|
|
117
135
|
|
|
118
136
|
/* hook onto Deepgram API events */
|
|
119
137
|
this.dg.on(Deepgram.LiveTranscriptionEvents.Transcript, async (data) => {
|
|
@@ -170,9 +170,9 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
|
|
|
170
170
|
/* track transcription text */
|
|
171
171
|
let text = ""
|
|
172
172
|
this.ws.on("message", (data) => {
|
|
173
|
-
let ev:
|
|
173
|
+
let ev: Record<string, unknown>
|
|
174
174
|
try {
|
|
175
|
-
ev = JSON.parse(data.toString())
|
|
175
|
+
ev = JSON.parse(data.toString()) as Record<string, unknown>
|
|
176
176
|
}
|
|
177
177
|
catch (err) {
|
|
178
178
|
this.log("warning", `failed to parse WebSocket message: ${err}`)
|
|
@@ -194,8 +194,8 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
|
|
|
194
194
|
if (this.params.interim && !this.closing && this.queue !== null) {
|
|
195
195
|
const itemId = ev.item_id as string
|
|
196
196
|
const timing = speechTiming.get(itemId)
|
|
197
|
-
const start = timing ? Duration.fromMillis(timing.startMs) : DateTime.now().diff(this.timeOpen!)
|
|
198
|
-
const end = timing ? Duration.fromMillis(timing.endMs) : start
|
|
197
|
+
const start = timing !== undefined ? Duration.fromMillis(timing.startMs) : DateTime.now().diff(this.timeOpen!)
|
|
198
|
+
const end = timing !== undefined ? Duration.fromMillis(timing.endMs) : start
|
|
199
199
|
const chunk = new SpeechFlowChunk(start, end, "intermediate", "text", text)
|
|
200
200
|
chunk.meta = aggregateMeta(start, end)
|
|
201
201
|
this.queue.write(chunk)
|
|
@@ -207,8 +207,8 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
|
|
|
207
207
|
text = ev.transcript as string
|
|
208
208
|
const itemId = ev.item_id as string
|
|
209
209
|
const timing = speechTiming.get(itemId)
|
|
210
|
-
const start = timing ? Duration.fromMillis(timing.startMs) : DateTime.now().diff(this.timeOpen!)
|
|
211
|
-
const end = timing ? Duration.fromMillis(timing.endMs) : start
|
|
210
|
+
const start = timing !== undefined ? Duration.fromMillis(timing.startMs) : DateTime.now().diff(this.timeOpen!)
|
|
211
|
+
const end = timing !== undefined ? Duration.fromMillis(timing.endMs) : start
|
|
212
212
|
const chunk = new SpeechFlowChunk(start, end, "final", "text", text)
|
|
213
213
|
chunk.meta = aggregateMeta(start, end)
|
|
214
214
|
metastore.prune(start)
|
|
@@ -230,7 +230,7 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
|
|
|
230
230
|
const itemId = ev.item_id as string
|
|
231
231
|
const audioEndMs = ev.audio_end_ms as number
|
|
232
232
|
const timing = speechTiming.get(itemId)
|
|
233
|
-
if (timing)
|
|
233
|
+
if (timing !== undefined)
|
|
234
234
|
timing.endMs = audioEndMs
|
|
235
235
|
break
|
|
236
236
|
}
|
|
@@ -239,7 +239,8 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
|
|
|
239
239
|
break
|
|
240
240
|
}
|
|
241
241
|
case "error": {
|
|
242
|
-
|
|
242
|
+
const error = ev.error as { message?: string } | undefined
|
|
243
|
+
this.log("error", `error: ${error?.message ?? "unknown error"}`)
|
|
243
244
|
break
|
|
244
245
|
}
|
|
245
246
|
default:
|
|
@@ -124,11 +124,9 @@ export default class SpeechFlowNodeT2AAmazon extends SpeechFlowNode {
|
|
|
124
124
|
decodeStrings: false,
|
|
125
125
|
highWaterMark: 1,
|
|
126
126
|
transform (chunk: SpeechFlowChunk, encoding, callback) {
|
|
127
|
-
if (self.closing)
|
|
127
|
+
if (self.closing)
|
|
128
128
|
callback(new Error("stream already destroyed"))
|
|
129
|
-
|
|
130
|
-
}
|
|
131
|
-
if (Buffer.isBuffer(chunk.payload))
|
|
129
|
+
else if (Buffer.isBuffer(chunk.payload))
|
|
132
130
|
callback(new Error("invalid chunk payload type"))
|
|
133
131
|
else if (chunk.payload === "")
|
|
134
132
|
callback()
|
|
@@ -98,7 +98,7 @@ export default class SpeechFlowNodeT2AElevenlabs extends SpeechFlowNode {
|
|
|
98
98
|
const voices = await this.elevenlabs.voices.getAll()
|
|
99
99
|
let voice = voices.voices.find((v) => v.name === this.params.voice)
|
|
100
100
|
if (voice === undefined) {
|
|
101
|
-
voice = voices.voices.find((v) =>
|
|
101
|
+
voice = voices.voices.find((v) => v.name?.startsWith(this.params.voice))
|
|
102
102
|
if (voice === undefined)
|
|
103
103
|
throw new Error(`invalid ElevenLabs voice "${this.params.voice}"`)
|
|
104
104
|
}
|
|
@@ -108,7 +108,7 @@ export default class SpeechFlowNodeT2AElevenlabs extends SpeechFlowNode {
|
|
|
108
108
|
""
|
|
109
109
|
this.log("info", `selected voice: name: "${voice.name}"${info}`)
|
|
110
110
|
|
|
111
|
-
/* perform text-to-speech operation with
|
|
111
|
+
/* perform text-to-speech operation with ElevenLabs API */
|
|
112
112
|
const model = this.params.optimize === "quality" ?
|
|
113
113
|
"eleven_turbo_v2_5" :
|
|
114
114
|
"eleven_flash_v2_5"
|
|
@@ -131,7 +131,7 @@ export default class SpeechFlowNodeT2AElevenlabs extends SpeechFlowNode {
|
|
|
131
131
|
})
|
|
132
132
|
}
|
|
133
133
|
|
|
134
|
-
/* establish resampler from ElevenLabs
|
|
134
|
+
/* establish resampler from ElevenLabs tier-dependent
|
|
135
135
|
output sample rate to our standard audio sample rate (48KHz) */
|
|
136
136
|
this.resampler = new SpeexResampler(1, maxSampleRate, this.config.audioSampleRate, 7)
|
|
137
137
|
|
|
@@ -36,8 +36,8 @@ export default class SpeechFlowNodeT2AGoogle extends SpeechFlowNode {
|
|
|
36
36
|
key: { type: "string", val: process.env.SPEECHFLOW_GOOGLE_KEY ?? "" },
|
|
37
37
|
voice: { type: "string", pos: 0, val: "en-US-Neural2-J" },
|
|
38
38
|
language: { type: "string", pos: 1, val: "en-US" },
|
|
39
|
-
speed: { type: "number", pos: 2, val: 1.0, match: (n: number) => n >=
|
|
40
|
-
pitch: { type: "number", pos: 3, val: 0.0, match: (n: number) => n >= -20.0
|
|
39
|
+
speed: { type: "number", pos: 2, val: 1.0, match: (n: number) => n >= 0.25 && n <= 4.0 },
|
|
40
|
+
pitch: { type: "number", pos: 3, val: 0.0, match: (n: number) => n >= -20.0 && n <= 20.0 }
|
|
41
41
|
})
|
|
42
42
|
|
|
43
43
|
/* validate API key */
|
|
@@ -139,7 +139,7 @@ export default class SpeechFlowNodeT2ASupertonic extends SpeechFlowNode {
|
|
|
139
139
|
const samples = result.audio
|
|
140
140
|
const outputSampleRate = result.sampling_rate
|
|
141
141
|
if (outputSampleRate !== this.sampleRate)
|
|
142
|
-
this.log("
|
|
142
|
+
this.log("warning", `unexpected sample rate ${outputSampleRate}Hz (expected ${this.sampleRate}Hz)`)
|
|
143
143
|
|
|
144
144
|
/* calculate duration */
|
|
145
145
|
const duration = samples.length / outputSampleRate
|
|
@@ -85,15 +85,16 @@ export default class SpeechFlowNodeT2TAmazon extends SpeechFlowNode {
|
|
|
85
85
|
const out = await this.client!.send(cmd)
|
|
86
86
|
return (out.TranslatedText ?? "").trim()
|
|
87
87
|
}
|
|
88
|
-
catch (e:
|
|
88
|
+
catch (e: unknown) {
|
|
89
89
|
lastError = e
|
|
90
90
|
attempt += 1
|
|
91
91
|
|
|
92
92
|
/* simple backoff for transient errors */
|
|
93
|
+
const err = e as { name?: string, $retryable?: boolean }
|
|
93
94
|
const retriable =
|
|
94
|
-
|
|
95
|
-
||
|
|
96
|
-
||
|
|
95
|
+
err?.name === "ThrottlingException"
|
|
96
|
+
|| err?.name === "ServiceUnavailableException"
|
|
97
|
+
|| err?.$retryable === true
|
|
97
98
|
if (!retriable || attempt >= maxRetries)
|
|
98
99
|
break
|
|
99
100
|
const delayMs = Math.min(1000 * Math.pow(2, attempt - 1), 5000)
|
|
@@ -135,17 +136,17 @@ export default class SpeechFlowNodeT2TAmazon extends SpeechFlowNode {
|
|
|
135
136
|
|
|
136
137
|
/* close node */
|
|
137
138
|
async close () {
|
|
138
|
-
/* close Amazon Translate connection */
|
|
139
|
-
if (this.client !== null) {
|
|
140
|
-
this.client.destroy()
|
|
141
|
-
this.client = null
|
|
142
|
-
}
|
|
143
|
-
|
|
144
139
|
/* shutdown stream */
|
|
145
140
|
if (this.stream !== null) {
|
|
146
141
|
await util.destroyStream(this.stream)
|
|
147
142
|
this.stream = null
|
|
148
143
|
}
|
|
144
|
+
|
|
145
|
+
/* close Amazon Translate connection */
|
|
146
|
+
if (this.client !== null) {
|
|
147
|
+
this.client.destroy()
|
|
148
|
+
this.client = null
|
|
149
|
+
}
|
|
149
150
|
}
|
|
150
151
|
}
|
|
151
152
|
|
|
@@ -53,7 +53,7 @@ export default class SpeechFlowNodeT2TDeepL extends SpeechFlowNode {
|
|
|
53
53
|
const usage = await deepl.getUsage()
|
|
54
54
|
const limit = usage?.character?.limit ?? 1
|
|
55
55
|
const percent = limit > 0 ? ((usage?.character?.count ?? 0) / limit * 100) : 0
|
|
56
|
-
return { usage: `${percent.toFixed(
|
|
56
|
+
return { usage: `${percent.toFixed(2)}%` }
|
|
57
57
|
}
|
|
58
58
|
|
|
59
59
|
/* open node */
|
|
@@ -63,7 +63,7 @@ export default class SpeechFlowNodeT2TDeepL extends SpeechFlowNode {
|
|
|
63
63
|
|
|
64
64
|
/* provide text-to-text translation */
|
|
65
65
|
const translate = async (text: string) => {
|
|
66
|
-
const src = this.params.src
|
|
66
|
+
const src = this.params.src
|
|
67
67
|
const dst = this.params.dst === "en" ? "en-US" : this.params.dst
|
|
68
68
|
const result = await this.deepl!.translateText(text, src, dst, {
|
|
69
69
|
splitSentences: "off",
|
|
@@ -95,7 +95,7 @@ export default class SpeechFlowNodeT2TDeepL extends SpeechFlowNode {
|
|
|
95
95
|
this.push(chunkNew)
|
|
96
96
|
callback()
|
|
97
97
|
}).catch((error: unknown) => {
|
|
98
|
-
callback(util.ensureError(error))
|
|
98
|
+
callback(util.ensureError(error, "DeepL translation failed"))
|
|
99
99
|
})
|
|
100
100
|
}
|
|
101
101
|
},
|