speechflow 0.9.8 → 1.0.0
This diff shows the content changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- package/CHANGELOG.md +18 -0
- package/LICENSE.txt +674 -0
- package/README.md +114 -17
- package/dst/speechflow-node-a2a-ffmpeg.js +1 -0
- package/dst/speechflow-node-a2a-ffmpeg.js.map +1 -0
- package/dst/{speechflow-node-deepl.d.ts → speechflow-node-a2a-meter.d.ts} +2 -2
- package/dst/speechflow-node-a2a-meter.js +147 -0
- package/dst/speechflow-node-a2a-meter.js.map +1 -0
- package/dst/speechflow-node-a2a-mute.d.ts +16 -0
- package/dst/speechflow-node-a2a-mute.js +90 -0
- package/dst/speechflow-node-a2a-mute.js.map +1 -0
- package/dst/{speechflow-node-whisper.d.ts → speechflow-node-a2a-vad.d.ts} +2 -5
- package/dst/speechflow-node-a2a-vad.js +272 -0
- package/dst/speechflow-node-a2a-vad.js.map +1 -0
- package/dst/speechflow-node-a2a-wav.js +1 -0
- package/dst/speechflow-node-a2a-wav.js.map +1 -0
- package/dst/speechflow-node-a2t-deepgram.js +2 -1
- package/dst/speechflow-node-a2t-deepgram.js.map +1 -0
- package/dst/speechflow-node-t2a-elevenlabs.js +1 -0
- package/dst/speechflow-node-t2a-elevenlabs.js.map +1 -0
- package/dst/{speechflow-node-elevenlabs.d.ts → speechflow-node-t2a-kokoro.d.ts} +2 -2
- package/dst/speechflow-node-t2a-kokoro.js +148 -0
- package/dst/speechflow-node-t2a-kokoro.js.map +1 -0
- package/dst/speechflow-node-t2t-deepl.js +1 -0
- package/dst/speechflow-node-t2t-deepl.js.map +1 -0
- package/dst/speechflow-node-t2t-format.js +1 -0
- package/dst/speechflow-node-t2t-format.js.map +1 -0
- package/dst/{speechflow-node-gemma.d.ts → speechflow-node-t2t-ollama.d.ts} +1 -1
- package/dst/{speechflow-node-gemma.js → speechflow-node-t2t-ollama.js} +41 -8
- package/dst/speechflow-node-t2t-ollama.js.map +1 -0
- package/dst/{speechflow-node-t2t-gemma.d.ts → speechflow-node-t2t-openai.d.ts} +2 -2
- package/dst/{speechflow-node-t2t-gemma.js → speechflow-node-t2t-openai.js} +43 -30
- package/dst/speechflow-node-t2t-openai.js.map +1 -0
- package/dst/speechflow-node-t2t-subtitle.js +1 -0
- package/dst/speechflow-node-t2t-subtitle.js.map +1 -0
- package/dst/{speechflow-node-opus.d.ts → speechflow-node-t2t-transformers.d.ts} +3 -1
- package/dst/speechflow-node-t2t-transformers.js +264 -0
- package/dst/speechflow-node-t2t-transformers.js.map +1 -0
- package/dst/speechflow-node-x2x-trace.js +3 -2
- package/dst/speechflow-node-x2x-trace.js.map +1 -0
- package/dst/speechflow-node-xio-device.js +1 -0
- package/dst/speechflow-node-xio-device.js.map +1 -0
- package/dst/speechflow-node-xio-file.js +1 -0
- package/dst/speechflow-node-xio-file.js.map +1 -0
- package/dst/speechflow-node-xio-mqtt.js +1 -0
- package/dst/speechflow-node-xio-mqtt.js.map +1 -0
- package/dst/speechflow-node-xio-websocket.js +1 -0
- package/dst/speechflow-node-xio-websocket.js.map +1 -0
- package/dst/speechflow-node.d.ts +3 -0
- package/dst/speechflow-node.js +10 -0
- package/dst/speechflow-node.js.map +1 -0
- package/dst/speechflow-utils.d.ts +33 -0
- package/dst/speechflow-utils.js +183 -1
- package/dst/speechflow-utils.js.map +1 -0
- package/dst/speechflow.js +295 -46
- package/dst/speechflow.js.map +1 -0
- package/etc/speechflow.yaml +14 -5
- package/etc/stx.conf +1 -1
- package/etc/tsconfig.json +2 -2
- package/package.json +17 -10
- package/src/speechflow-node-a2a-meter.ts +125 -0
- package/src/speechflow-node-a2a-mute.ts +101 -0
- package/src/speechflow-node-a2a-vad.ts +266 -0
- package/src/speechflow-node-a2t-deepgram.ts +1 -1
- package/src/speechflow-node-t2a-kokoro.ts +160 -0
- package/src/{speechflow-node-t2t-gemma.ts → speechflow-node-t2t-ollama.ts} +44 -10
- package/src/speechflow-node-t2t-openai.ts +246 -0
- package/src/speechflow-node-t2t-transformers.ts +249 -0
- package/src/speechflow-node-x2x-trace.ts +2 -2
- package/src/speechflow-node-xio-websocket.ts +5 -5
- package/src/speechflow-node.ts +12 -0
- package/src/speechflow-utils.ts +195 -0
- package/src/speechflow.ts +279 -46
- package/dst/speechflow-node-deepgram.d.ts +0 -12
- package/dst/speechflow-node-deepgram.js +0 -220
- package/dst/speechflow-node-deepl.js +0 -128
- package/dst/speechflow-node-device.d.ts +0 -13
- package/dst/speechflow-node-device.js +0 -205
- package/dst/speechflow-node-elevenlabs.js +0 -182
- package/dst/speechflow-node-ffmpeg.d.ts +0 -13
- package/dst/speechflow-node-ffmpeg.js +0 -152
- package/dst/speechflow-node-file.d.ts +0 -11
- package/dst/speechflow-node-file.js +0 -176
- package/dst/speechflow-node-format.d.ts +0 -11
- package/dst/speechflow-node-format.js +0 -80
- package/dst/speechflow-node-mqtt.d.ts +0 -13
- package/dst/speechflow-node-mqtt.js +0 -181
- package/dst/speechflow-node-opus.js +0 -135
- package/dst/speechflow-node-subtitle.d.ts +0 -12
- package/dst/speechflow-node-subtitle.js +0 -96
- package/dst/speechflow-node-t2t-opus.d.ts +0 -12
- package/dst/speechflow-node-t2t-opus.js +0 -135
- package/dst/speechflow-node-trace.d.ts +0 -11
- package/dst/speechflow-node-trace.js +0 -88
- package/dst/speechflow-node-wav.d.ts +0 -11
- package/dst/speechflow-node-wav.js +0 -170
- package/dst/speechflow-node-websocket.d.ts +0 -13
- package/dst/speechflow-node-websocket.js +0 -275
- package/dst/speechflow-node-whisper-common.d.ts +0 -34
- package/dst/speechflow-node-whisper-common.js +0 -7
- package/dst/speechflow-node-whisper-ggml.d.ts +0 -1
- package/dst/speechflow-node-whisper-ggml.js +0 -97
- package/dst/speechflow-node-whisper-onnx.d.ts +0 -1
- package/dst/speechflow-node-whisper-onnx.js +0 -131
- package/dst/speechflow-node-whisper-worker-ggml.d.ts +0 -1
- package/dst/speechflow-node-whisper-worker-ggml.js +0 -97
- package/dst/speechflow-node-whisper-worker-onnx.d.ts +0 -1
- package/dst/speechflow-node-whisper-worker-onnx.js +0 -131
- package/dst/speechflow-node-whisper-worker.d.ts +0 -1
- package/dst/speechflow-node-whisper-worker.js +0 -116
- package/dst/speechflow-node-whisper-worker2.d.ts +0 -1
- package/dst/speechflow-node-whisper-worker2.js +0 -82
- package/dst/speechflow-node-whisper.js +0 -604
- package/src/speechflow-node-t2t-opus.ts +0 -111

package/package.json
CHANGED

@@ -1,8 +1,8 @@
 {
   "name": "speechflow",
-  "version": "0.9.8",
-  "x-stdver": "0.…",
-  "x-release": "2025-07-…",
+  "version": "1.0.0",
+  "x-stdver": "1.0.0-GA",
+  "x-release": "2025-07-16",
   "homepage": "https://github.com/rse/speechflow",
   "description": "Speech Processing Flow Graph",
   "license": "GPL-3.0-only",
@@ -26,15 +26,20 @@
     "@elevenlabs/elevenlabs-js": "2.6.0",
     "stream-transform": "3.4.0",
     "get-stream": "9.0.1",
-    "@dotenvx/dotenvx": "1.47.…",
+    "@dotenvx/dotenvx": "1.47.6",
     "speex-resampler": "3.0.1",
     "pcm-convert": "1.6.5",
     "object-path": "0.11.8",
     "ws": "8.18.3",
     "bufferutil": "4.0.9",
     "utf-8-validate": "6.0.5",
+    "@hapi/hapi": "21.4.0",
+    "@hapi/boom": "10.0.1",
+    "hapi-plugin-header": "1.1.8",
+    "hapi-plugin-websocket": "2.4.11",
     "@opensumi/reconnecting-websocket": "4.4.0",
     "ollama": "0.5.16",
+    "openai": "5.10.0",
     "@rse/ffmpeg": "1.4.2",
     "ffmpeg-stream": "1.0.1",
     "installed-packages": "1.0.13",
@@ -42,13 +47,15 @@
     "wav": "1.0.2",
     "mqtt": "5.13.2",
     "cbor2": "2.0.1",
+    "arktype": "2.1.20",
     "pure-uuid": "1.8.1",
     "wavefile": "11.0.0",
+    "audio-inspect": "0.0.2",
     "@huggingface/transformers": "3.6.3",
+    "kokoro-js": "1.2.1",
     "@ericedouard/vad-node-realtime": "0.2.0",
     "luxon": "3.7.1",
-    "wrap-text": "1.0.10"
-    "smart-whisper": "0.8.1"
+    "wrap-text": "1.0.10"
   },
   "devDependencies": {
     "eslint": "9.31.0",
@@ -57,14 +64,14 @@
     "eslint-plugin-promise": "7.2.1",
     "eslint-plugin-import": "2.32.0",
     "eslint-plugin-node": "11.1.0",
-    "@typescript-eslint/eslint-plugin": "8.…",
-    "@typescript-eslint/parser": "8.…",
+    "@typescript-eslint/eslint-plugin": "8.37.0",
+    "@typescript-eslint/parser": "8.37.0",
     "oxlint": "1.6.0",
     "eslint-plugin-oxlint": "1.6.0",
     "@biomejs/biome": "2.0.6",
     "eslint-config-biome": "1.9.4",
 
-    "@types/node": "24.0.…",
+    "@types/node": "24.0.14",
     "@types/yargs": "17.0.33",
     "@types/js-yaml": "4.0.9",
     "@types/object-path": "0.11.4",
@@ -84,7 +91,7 @@
     "cross-env": "7.0.3"
   },
   "overrides": {
-    "onnxruntime-node": "…"
+    "@huggingface/transformers": { "onnxruntime-node": "1.23.0-dev.20250703-7fc6235861" }
   },
   "upd": [ "!@biomejs/biome" ],
   "engines": {
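
Worth noting in the dependency changes: "smart-whisper" is dropped (together with the removed whisper node files under dst/), while "@hapi/*", "openai", "arktype", "audio-inspect" and "kokoro-js" arrive for the new nodes. The reworked "overrides" entry uses npm's nested form, which scopes the "onnxruntime-node" pin to where it is resolved underneath "@huggingface/transformers" instead of forcing that version across the whole dependency tree.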

package/src/speechflow-node-a2a-meter.ts
ADDED

@@ -0,0 +1,125 @@
+/*
+** SpeechFlow - Speech Processing Flow Graph
+** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+*/
+
+/* standard dependencies */
+import Stream from "node:stream"
+
+/* external dependencies */
+import { getLUFS, getRMS, AudioData } from "audio-inspect"
+
+/* internal dependencies */
+import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
+import * as utils from "./speechflow-utils"
+
+/* SpeechFlow node for audio metering */
+export default class SpeechFlowNodeMeter extends SpeechFlowNode {
+    /* declare official node name */
+    public static name = "meter"
+
+    /* internal state */
+    interval: ReturnType<typeof setInterval> | null = null
+
+    /* construct node */
+    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
+        super(id, cfg, opts, args)
+
+        /* declare node configuration parameters */
+        this.configure({
+            interval: { type: "number", val: 250 }
+        })
+
+        /* declare node input/output format */
+        this.input = "audio"
+        this.output = "audio"
+    }
+
+    /* open node */
+    async open () {
+        /* sanity check situation */
+        if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
+            throw new Error("VAD node currently supports PCM-S16LE audio only")
+
+        /* internal state */
+        const sampleWindowDuration = 3 /* LUFS-S requires 3s */
+        const sampleWindowSize = this.config.audioSampleRate * sampleWindowDuration
+        let sampleWindow = new Float32Array(sampleWindowSize)
+        sampleWindow.fill(0, 0, sampleWindowSize)
+        let lufss = 0
+        let rms = 0
+
+        /* setup loundess emitting interval */
+        this.interval = setInterval(() => {
+            this.log("info", `LUFS-S: ${lufss.toFixed(1)} dB, RMS: ${rms.toFixed(1)} dB`)
+            this.sendResponse([ "meter", "LUFS-S", lufss ])
+            this.sendResponse([ "meter", "RMS", rms ])
+        }, this.params.interval)
+
+        /* provide Duplex stream and internally attach to VAD */
+        const self = this
+        this.stream = new Stream.Transform({
+            writableObjectMode: true,
+            readableObjectMode: true,
+            decodeStrings: false,
+
+            /* transform audio chunk */
+            transform (chunk: SpeechFlowChunk, encoding, callback) {
+                if (!Buffer.isBuffer(chunk.payload))
+                    callback(new Error("expected audio input as Buffer chunks"))
+                else if (chunk.payload.byteLength === 0)
+                    callback()
+                else {
+                    /* convert audio samples from PCM/I16 to PCM/F32 */
+                    const data = utils.convertBufToF32(chunk.payload, self.config.audioLittleEndian)
+
+                    /* update internal audio sample sliding window */
+                    const fusion = new Float32Array(sampleWindow.length + data.length)
+                    fusion.set(sampleWindow, 0)
+                    fusion.set(data, sampleWindow.length)
+                    sampleWindow = fusion.slice(fusion.length - sampleWindowSize)
+
+                    /* asynchronously calculate the LUFS-S metric */
+                    setTimeout(() => {
+                        const audioData = {
+                            sampleRate: self.config.audioSampleRate,
+                            numberOfChannels: self.config.audioChannels,
+                            channelData: [ sampleWindow ],
+                            duration: sampleWindowDuration,
+                            length: sampleWindow.length
+                        } satisfies AudioData
+                        const lufs = getLUFS(audioData, {
+                            channelMode: self.config.audioChannels === 1 ? "mono" : "stereo",
+                            calculateShortTerm: true,
+                            calculateMomentary: false,
+                            calculateLoudnessRange: false,
+                            calculateTruePeak: false
+                        })
+                        lufss = lufs.shortTerm ? lufs.shortTerm[0] : 0
+                        rms = getRMS(audioData, { asDB: true })
+                    }, 0)
+
+                    /* pass-through original audio chunk */
+                    this.push(chunk)
+                    callback()
+                }
+            }
+        })
+    }
+
+    /* close node */
+    async close () {
+        /* close stream */
+        if (this.stream !== null) {
+            this.stream.destroy()
+            this.stream = null
+        }
+
+        /* stop interval */
+        if (this.interval !== null) {
+            clearInterval(this.interval)
+            this.interval = null
+        }
+    }
+}
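
The meter node feeds audio-inspect with normalized Float32 samples produced by utils.convertBufToF32(), whose implementation lives in the speechflow-utils.ts changes that are not expanded in this diff. For orientation only, a minimal sketch of what such a PCM-S16LE to Float32 conversion typically looks like (a hypothetical stand-in, not the package's actual helper):

    /* hypothetical sketch: convert a PCM-S16LE Buffer into Float32 samples in [-1, 1) */
    function convertBufToF32Sketch (buf: Buffer, littleEndian = true): Float32Array {
        const samples = new Float32Array(buf.byteLength / 2)
        for (let i = 0; i < samples.length; i++) {
            const int16 = littleEndian ? buf.readInt16LE(i * 2) : buf.readInt16BE(i * 2)
            samples[i] = int16 / 32768   /* map the signed 16-bit range onto [-1, 1) */
        }
        return samples
    }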

package/src/speechflow-node-a2a-mute.ts
ADDED

@@ -0,0 +1,101 @@
+/*
+** SpeechFlow - Speech Processing Flow Graph
+** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+*/
+
+/* standard dependencies */
+import Stream from "node:stream"
+
+/* internal dependencies */
+import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
+
+/* the type of muting */
+type MuteMode =
+    "none"      |  /* not muted */
+    "silenced"  |  /* muted by changing audio samples to silence */
+    "unplugged"    /* muted by unplugging the audio sample flow */
+
+/* SpeechFlow node for muting in audio-to-audio passing */
+export default class SpeechFlowNodeMute extends SpeechFlowNode {
+    /* declare official node name */
+    public static name = "mute"
+
+    /* internal state */
+    private muteMode: MuteMode = "none"
+
+    /* construct node */
+    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
+        super(id, cfg, opts, args)
+
+        /* declare node configuration parameters */
+        this.configure({})
+
+        /* declare node input/output format */
+        this.input = "audio"
+        this.output = "audio"
+    }
+
+    /* receive external request */
+    async receiveRequest (params: any[]) {
+        if (params.length === 2 && params[0] === "mode") {
+            if (!params[1].match(/^(?:none|silenced|unplugged)$/))
+                throw new Error("mute: invalid mode argument in external request")
+            const muteMode: MuteMode = params[1] as MuteMode
+            this.setMuteMode(muteMode)
+            this.sendResponse([ "mute", "mode", muteMode ])
+        }
+        else
+            throw new Error("mute: invalid arguments in external request")
+    }
+
+    /* change mute mode */
+    setMuteMode (mode: MuteMode) {
+        this.log("info", `setting mute mode to "${mode}"`)
+        this.muteMode = mode
+    }
+
+    /* open node */
+    async open () {
+        /* establish a transform stream */
+        const self = this
+        this.stream = new Stream.Transform({
+            readableObjectMode: true,
+            writableObjectMode: true,
+            decodeStrings: false,
+            transform (chunk: SpeechFlowChunk, encoding, callback) {
+                if (!Buffer.isBuffer(chunk.payload))
+                    callback(new Error("invalid chunk payload type"))
+                else if (self.muteMode === "unplugged")
+                    /* pass-through nothing */
+                    callback()
+                else if (self.muteMode === "silenced") {
+                    /* pass-through a silenced chunk */
+                    chunk = chunk.clone()
+                    const buffer = chunk.payload as Buffer
+                    buffer.fill(0)
+                    callback()
+                }
+                else {
+                    /* pass-through original chunk */
+                    this.push(chunk)
+                    callback()
+                }
+            },
+            final (callback) {
+                this.push(null)
+                callback()
+            }
+        })
+    }
+
+    /* close node */
+    async close () {
+        /* close stream */
+        if (this.stream !== null) {
+            this.stream.destroy()
+            this.stream = null
+        }
+    }
+}
+
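
Both control paths of the mute node are visible above: direct calls to setMuteMode() and the [ "mode", ... ] form handled by receiveRequest() for external callers. A small usage sketch (how the node instance is obtained and wired into the flow graph is assumed, not taken from the package):

    import SpeechFlowNodeMute from "./speechflow-node-a2a-mute"

    /* sketch only: switch an already-opened mute node between modes */
    async function silenceThenUnmute (mute: SpeechFlowNodeMute) {
        mute.setMuteMode("silenced")                   /* zero out audio payloads */
        await mute.receiveRequest([ "mode", "none" ])  /* the same switch via the external request path */
    }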

package/src/speechflow-node-a2a-vad.ts
ADDED

@@ -0,0 +1,266 @@
+/*
+** SpeechFlow - Speech Processing Flow Graph
+** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+*/
+
+/* standard dependencies */
+import Stream from "node:stream"
+
+/* external dependencies */
+import { RealTimeVAD } from "@ericedouard/vad-node-realtime"
+import { Duration } from "luxon"
+
+/* internal dependencies */
+import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
+import * as utils from "./speechflow-utils"
+
+/* audio stream queue element */
+type AudioQueueElement = {
+    type: "audio-frame",
+    chunk: SpeechFlowChunk,
+    isSpeech?: boolean
+} | {
+    type: "audio-eof"
+}
+
+/* SpeechFlow node for VAD speech-to-speech processing */
+export default class SpeechFlowNodeVAD extends SpeechFlowNode {
+    /* declare official node name */
+    public static name = "vad"
+
+    /* internal state */
+    private vad: RealTimeVAD | null = null
+    private queue = new utils.Queue<AudioQueueElement>()
+    private queueRecv = this.queue.pointerUse("recv")
+    private queueVAD = this.queue.pointerUse("vad")
+    private queueSend = this.queue.pointerUse("send")
+
+    /* construct node */
+    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
+        super(id, cfg, opts, args)
+
+        /* declare node configuration parameters */
+        this.configure({
+            mode: { type: "string", val: "unplugged", match: /^(?:silenced|unplugged)$/ },
+            posSpeechThreshold: { type: "number", val: 0.50 },
+            negSpeechThreshold: { type: "number", val: 0.35 },
+            minSpeechFrames: { type: "number", val: 2 },
+            redemptionFrames: { type: "number", val: 12 },
+            preSpeechPadFrames: { type: "number", val: 1 }
+        })
+
+        /* declare node input/output format */
+        this.input = "audio"
+        this.output = "audio"
+    }
+
+    /* open node */
+    async open () {
+        /* sanity check situation */
+        if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
+            throw new Error("VAD node currently supports PCM-S16LE audio only")
+
+        /* pass-through logging */
+        const log = (level: string, msg: string) => { this.log(level, msg) }
+
+        /* internal processing constants */
+        const vadSampleRateTarget = 16000 /* internal target of VAD */
+        const vadSamplesPerFrame = 512 /* required for VAD v5 */
+
+        /* establish Voice Activity Detection (VAD) facility */
+        this.vad = await RealTimeVAD.new({
+            model: "v5",
+            sampleRate: this.config.audioSampleRate, /* before resampling to 16KHz */
+            frameSamples: vadSamplesPerFrame, /* after resampling to 16KHz */
+            positiveSpeechThreshold: this.params.posSpeechThreshold,
+            negativeSpeechThreshold: this.params.negSpeechThreshold,
+            minSpeechFrames: this.params.minSpeechFrames,
+            redemptionFrames: this.params.redemptionFrames,
+            preSpeechPadFrames: this.params.preSpeechPadFrames,
+            onSpeechStart: () => {
+                log("info", "VAD: speech start")
+            },
+            onSpeechEnd: (audio) => {
+                const duration = utils.audioArrayDuration(audio, vadSampleRateTarget)
+                log("info", `VAD: speech end (duration: ${duration.toFixed(2)}s)`)
+            },
+            onVADMisfire: () => {
+                log("info", "VAD: speech end (segment too short)")
+            },
+            onFrameProcessed: (audio) => {
+                /* annotate the current audio frame */
+                const element = this.queueVAD.peek()
+                if (element !== undefined && element.type === "audio-frame") {
+                    const isSpeech = audio.isSpeech > audio.notSpeech
+                    element.isSpeech = isSpeech
+                    this.queueVAD.touch()
+                    this.queueVAD.walk(+1)
+                }
+            }
+        })
+        this.vad.start()
+
+        /* provide Duplex stream and internally attach to VAD */
+        const vad = this.vad
+        const cfg = this.config
+        const queue = this.queue
+        const queueRecv = this.queueRecv
+        const queueSend = this.queueSend
+        const mode = this.params.mode
+        let carrySamples = new Float32Array()
+        let carryStart = Duration.fromDurationLike(0)
+        this.stream = new Stream.Duplex({
+            writableObjectMode: true,
+            readableObjectMode: true,
+            decodeStrings: false,
+
+            /* receive audio chunk (writable side of stream) */
+            write (chunk: SpeechFlowChunk, encoding, callback) {
+                if (!Buffer.isBuffer(chunk.payload))
+                    callback(new Error("expected audio input as Buffer chunks"))
+                else if (chunk.payload.byteLength === 0)
+                    callback()
+                else {
+                    /* convert audio samples from PCM/I16 to PCM/F32 */
+                    let data = utils.convertBufToF32(chunk.payload, cfg.audioLittleEndian)
+                    let start = chunk.timestampStart
+
+                    /* merge previous carry samples */
+                    if (carrySamples.length > 0) {
+                        start = carryStart
+                        const merged = new Float32Array(carrySamples.length + data.length)
+                        merged.set(carrySamples)
+                        merged.set(data, carrySamples.length)
+                        data = merged
+                        carrySamples = new Float32Array()
+                    }
+
+                    /* queue audio samples as individual VAD-sized frames
+                       and in parallel send it into the Voice Activity Detection (VAD) */
+                    const chunkSize = (vadSamplesPerFrame * (cfg.audioSampleRate / vadSampleRateTarget))
+                    const chunks = Math.trunc(data.length / chunkSize)
+                    for (let i = 0; i < chunks; i++) {
+                        const frame = data.slice(i * chunkSize, (i + 1) * chunkSize)
+                        const buf = utils.convertF32ToBuf(frame)
+                        const duration = utils.audioBufferDuration(buf)
+                        const end = start.plus(duration)
+                        const chunk = new SpeechFlowChunk(start, end, "final", "audio", buf)
+                        queueRecv.append({ type: "audio-frame", chunk })
+                        vad.processAudio(frame)
+                        start = end
+                    }
+
+                    /* remember new carry samples */
+                    const bulkLen = chunks * chunkSize
+                    carrySamples = data.slice(bulkLen)
+                    carryStart = start
+
+                    callback()
+                }
+            },
+
+            /* receive no more audio chunks (writable side of stream) */
+            final (callback) {
+                /* flush pending audio chunks */
+                if (carrySamples.length > 0) {
+                    const chunkSize = (vadSamplesPerFrame * (cfg.audioSampleRate / vadSampleRateTarget))
+                    if (carrySamples.length < chunkSize) {
+                        const merged = new Float32Array(chunkSize)
+                        merged.set(carrySamples)
+                        merged.fill(0.0, carrySamples.length, chunkSize)
+                        carrySamples = merged
+                    }
+                    const buf = utils.convertF32ToBuf(carrySamples)
+                    const duration = utils.audioBufferDuration(buf)
+                    const end = carryStart.plus(duration)
+                    const chunk = new SpeechFlowChunk(carryStart, end, "final", "audio", buf)
+                    queueRecv.append({ type: "audio-frame", chunk })
+                    vad.processAudio(carrySamples)
+                }
+
+                /* signal end of file */
+                queueRecv.append({ type: "audio-eof" })
+                callback()
+            },
+
+            /* send audio chunk(s) (readable side of stream) */
+            read (_size) {
+                /* try to perform read operation from scratch */
+                const tryToRead = () => {
+                    /* flush pending audio chunks */
+                    const flushPendingChunks = () => {
+                        let pushed = 0
+                        while (true) {
+                            const element = queueSend.peek()
+                            if (element === undefined)
+                                break
+                            else if (element.type === "audio-eof") {
+                                this.push(null)
+                                break
+                            }
+                            else if (element.type === "audio-frame"
+                                && element.isSpeech === undefined)
+                                break
+                            queueSend.walk(+1)
+                            if (element.isSpeech) {
+                                this.push(element.chunk)
+                                pushed++
+                            }
+                            else if (mode === "silenced") {
+                                const chunk = element.chunk.clone()
+                                const buffer = chunk.payload as Buffer
+                                buffer.fill(0)
+                                this.push(chunk)
+                                pushed++
+                            }
+                            else if (mode === "unplugged" && pushed === 0)
+                                /* we have to await chunks now, as in unplugged
+                                   mode we else would be never called again until
+                                   we at least once push a new chunk as the result */
+                                tryToRead()
+                        }
+                    }
+
+                    /* await forthcoming audio chunks */
+                    const awaitForthcomingChunks = () => {
+                        const element = queueSend.peek()
+                        if (element !== undefined
+                            && element.type === "audio-frame"
+                            && element.isSpeech !== undefined)
+                            flushPendingChunks()
+                        else
+                            queue.once("write", awaitForthcomingChunks)
+                    }
+
+                    const element = queueSend.peek()
+                    if (element !== undefined && element.type === "audio-eof")
+                        this.push(null)
+                    else if (element !== undefined
+                        && element.type === "audio-frame"
+                        && element.isSpeech !== undefined)
+                        flushPendingChunks()
+                    else
+                        queue.once("write", awaitForthcomingChunks)
+                }
+                tryToRead()
+            }
+        })
+    }
+
+    /* close node */
+    async close () {
+        /* close stream */
+        if (this.stream !== null) {
+            this.stream.destroy()
+            this.stream = null
+        }
+
+        /* close VAD */
+        if (this.vad !== null) {
+            await this.vad.flush()
+            this.vad.destroy()
+            this.vad = null
+        }
+    }
+}
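
One detail of the VAD node worth spelling out is the frame sizing: the detector runs internally on 512-sample frames at 16 kHz, so incoming audio is cut into slices of 512 * (inputRate / 16000) samples before being queued and handed to vad.processAudio(). A quick worked example (48 kHz is an assumed input rate; the real value comes from config.audioSampleRate):

    const vadSampleRateTarget = 16000   /* VAD-internal sample rate */
    const vadSamplesPerFrame  = 512     /* frame size required by the v5 model */
    const audioSampleRate     = 48000   /* assumed input rate for this example */

    const chunkSize = vadSamplesPerFrame * (audioSampleRate / vadSampleRateTarget)  /* = 1536 samples */
    const frameSecs = chunkSize / audioSampleRate                                   /* = 0.032 s, i.e. 32 ms per frame */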

package/src/speechflow-node-a2t-deepgram.ts
CHANGED

@@ -164,7 +164,7 @@ export default class SpeechFlowNodeDeepgram extends SpeechFlowNode {
                 if (chunk.payload.byteLength > 0) {
                     log("info", `Deepgram: send data (${chunk.payload.byteLength} bytes)`)
                     initTimeoutStart()
-                    dg.send(chunk.payload) /* intentionally discard all time information */
+                    dg.send(chunk.payload.buffer) /* intentionally discard all time information */
                 }
                 callback()
             }
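
The only change to the Deepgram node hands the listener the payload's underlying ArrayBuffer instead of the Node Buffer itself. A general Node.js caveat, independent of this package: buf.buffer spans the whole backing allocation, which for small pooled Buffers can be larger than the Buffer's own view, so code that needs the exact byte range usually slices it out explicitly, for example:

    /* hypothetical helper: obtain an ArrayBuffer covering exactly the Buffer's bytes */
    function toArrayBuffer (buf: Buffer): ArrayBufferLike {
        return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength)
    }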