speechflow 1.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +18 -0
- package/README.md +37 -3
- package/dst/speechflow-node-a2a-gender.d.ts +17 -0
- package/dst/speechflow-node-a2a-gender.js +272 -0
- package/dst/speechflow-node-a2a-gender.js.map +1 -0
- package/dst/speechflow-node-a2a-meter.js +2 -2
- package/dst/speechflow-node-a2a-meter.js.map +1 -1
- package/dst/speechflow-node-a2a-mute.js +1 -0
- package/dst/speechflow-node-a2a-mute.js.map +1 -1
- package/dst/speechflow-node-a2a-vad.js +47 -63
- package/dst/speechflow-node-a2a-vad.js.map +1 -1
- package/dst/speechflow-node-a2a-wav.js +145 -122
- package/dst/speechflow-node-a2a-wav.js.map +1 -1
- package/dst/speechflow-node-a2t-deepgram.js +13 -3
- package/dst/speechflow-node-a2t-deepgram.js.map +1 -1
- package/dst/speechflow-node-t2a-elevenlabs.js +10 -5
- package/dst/speechflow-node-t2a-elevenlabs.js.map +1 -1
- package/dst/speechflow-node-t2a-kokoro.js.map +1 -1
- package/dst/speechflow-node-t2t-deepl.js.map +1 -1
- package/dst/speechflow-node-t2t-format.js.map +1 -1
- package/dst/speechflow-node-t2t-ollama.js.map +1 -1
- package/dst/speechflow-node-t2t-openai.js.map +1 -1
- package/dst/speechflow-node-t2t-subtitle.js.map +1 -1
- package/dst/speechflow-node-t2t-transformers.js.map +1 -1
- package/dst/speechflow-node-x2x-filter.d.ts +11 -0
- package/dst/speechflow-node-x2x-filter.js +113 -0
- package/dst/speechflow-node-x2x-filter.js.map +1 -0
- package/dst/speechflow-node-x2x-trace.js +24 -10
- package/dst/speechflow-node-x2x-trace.js.map +1 -1
- package/dst/speechflow-node-xio-device.js +14 -5
- package/dst/speechflow-node-xio-device.js.map +1 -1
- package/dst/speechflow-node-xio-file.js +58 -27
- package/dst/speechflow-node-xio-file.js.map +1 -1
- package/dst/speechflow-node-xio-mqtt.js.map +1 -1
- package/dst/speechflow-node-xio-websocket.js.map +1 -1
- package/dst/speechflow-node.js +1 -0
- package/dst/speechflow-node.js.map +1 -1
- package/dst/speechflow-utils.d.ts +14 -1
- package/dst/speechflow-utils.js +110 -2
- package/dst/speechflow-utils.js.map +1 -1
- package/dst/speechflow.js +56 -53
- package/dst/speechflow.js.map +1 -1
- package/etc/speechflow.yaml +51 -24
- package/package.json +6 -5
- package/src/speechflow-node-a2a-gender.ts +272 -0
- package/src/speechflow-node-a2a-meter.ts +3 -3
- package/src/speechflow-node-a2a-mute.ts +1 -0
- package/src/speechflow-node-a2a-vad.ts +58 -68
- package/src/speechflow-node-a2a-wav.ts +128 -91
- package/src/speechflow-node-a2t-deepgram.ts +15 -4
- package/src/speechflow-node-t2a-elevenlabs.ts +13 -8
- package/src/speechflow-node-t2a-kokoro.ts +3 -3
- package/src/speechflow-node-t2t-deepl.ts +2 -2
- package/src/speechflow-node-t2t-format.ts +2 -2
- package/src/speechflow-node-t2t-ollama.ts +2 -2
- package/src/speechflow-node-t2t-openai.ts +2 -2
- package/src/speechflow-node-t2t-subtitle.ts +1 -1
- package/src/speechflow-node-t2t-transformers.ts +2 -2
- package/src/speechflow-node-x2x-filter.ts +122 -0
- package/src/speechflow-node-x2x-trace.ts +28 -11
- package/src/speechflow-node-xio-device.ts +20 -8
- package/src/speechflow-node-xio-file.ts +74 -36
- package/src/speechflow-node-xio-mqtt.ts +3 -3
- package/src/speechflow-node-xio-websocket.ts +1 -1
- package/src/speechflow-node.ts +2 -0
- package/src/speechflow-utils.ts +81 -2
- package/src/speechflow.ts +84 -81
package/src/speechflow-node-a2a-gender.ts

@@ -0,0 +1,272 @@
+/*
+** SpeechFlow - Speech Processing Flow Graph
+** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+*/
+
+/* standard dependencies */
+import path from "node:path"
+import Stream from "node:stream"
+
+/* external dependencies */
+import * as Transformers from "@huggingface/transformers"
+import { WaveFile } from "wavefile"
+
+/* internal dependencies */
+import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
+import * as utils from "./speechflow-utils"
+
+/* audio stream queue element */
+type AudioQueueElement = {
+    type: "audio-frame",
+    chunk: SpeechFlowChunk,
+    data: Float32Array,
+    gender?: "male" | "female"
+} | {
+    type: "audio-eof"
+}
+
+/* SpeechFlow node for Gender recognition */
+export default class SpeechFlowNodeGender extends SpeechFlowNode {
+    /* declare official node name */
+    public static name = "gender"
+
+    /* internal state */
+    private static speexInitialized = false
+    private classifier: Transformers.AudioClassificationPipeline | null = null
+    private queue = new utils.Queue<AudioQueueElement>()
+    private queueRecv = this.queue.pointerUse("recv")
+    private queueAC = this.queue.pointerUse("ac")
+    private queueSend = this.queue.pointerUse("send")
+
+    /* construct node */
+    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
+        super(id, cfg, opts, args)
+
+        /* declare node configuration parameters */
+        this.configure({
+            window: { type: "number", pos: 0, val: 500 }
+        })
+
+        /* declare node input/output format */
+        this.input = "audio"
+        this.output = "audio"
+    }
+
+    /* open node */
+    async open () {
+        /* sanity check situation */
+        if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
+            throw new Error("Gender node currently supports PCM-S16LE audio only")
+
+        /* pass-through logging */
+        const log = (level: string, msg: string) => { this.log(level, msg) }
+
+        /* the used model */
+        const model = "Xenova/wav2vec2-large-xlsr-53-gender-recognition-librispeech"
+
+        /* track download progress when instantiating Transformers engine and model */
+        const progressState = new Map<string, number>()
+        const progressCallback: Transformers.ProgressCallback = (progress: any) => {
+            let artifact = model
+            if (typeof progress.file === "string")
+                artifact += `:${progress.file}`
+            let percent = 0
+            if (typeof progress.loaded === "number" && typeof progress.total === "number")
+                percent = (progress.loaded as number / progress.total as number) * 100
+            else if (typeof progress.progress === "number")
+                percent = progress.progress
+            if (percent > 0)
+                progressState.set(artifact, percent)
+        }
+        const interval = setInterval(() => {
+            for (const [ artifact, percent ] of progressState) {
+                this.log("info", `downloaded ${percent.toFixed(2)}% of artifact "${artifact}"`)
+                if (percent >= 1.0)
+                    progressState.delete(artifact)
+            }
+        }, 1000)
+
+        /* instantiate Transformers engine and model */
+        const pipeline = Transformers.pipeline("audio-classification", model, {
+            cache_dir: path.join(this.config.cacheDir, "gender"),
+            dtype: "q4",
+            device: "auto",
+            progress_callback: progressCallback
+        })
+        this.classifier = await pipeline
+        clearInterval(interval)
+        if (this.classifier === null)
+            throw new Error("failed to instantiate classifier pipeline")
+
+        /* classify a single large-enough concatenated audio frame */
+        const classify = async (data: Float32Array) => {
+            const result = await this.classifier!(data)
+            const classified: Transformers.AudioClassificationOutput =
+                Array.isArray(result) ? result as Transformers.AudioClassificationOutput : [ result ]
+            const c1 = classified.find((c: any) => c.label === "male")
+            const c2 = classified.find((c: any) => c.label === "female")
+            const male = c1 ? c1.score : 0.0
+            const female = c2 ? c2.score : 0.0
+            return (male > female ? "male" : "female")
+        }
+
+        /* work off queued audio frames */
+        const frameWindowDuration = 0.5
+        const frameWindowSamples = frameWindowDuration * this.config.audioSampleRate
+        let lastGender = ""
+        let workingOffTimer: ReturnType<typeof setTimeout> | null = null
+        let workingOff = false
+        const workOffQueue = async () => {
+            /* control working off round */
+            if (workingOff)
+                return
+            workingOff = true
+            if (workingOffTimer !== null) {
+                clearTimeout(workingOffTimer)
+                workingOffTimer = null
+            }
+
+            let pos0 = this.queueAC.position()
+            const posL = this.queueAC.maxPosition()
+            const data = new Float32Array(frameWindowSamples)
+            data.fill(0)
+            let samples = 0
+            let pos = pos0
+            while (pos < posL && samples < frameWindowSamples) {
+                const element = this.queueAC.peek(pos)
+                if (element === undefined || element.type !== "audio-frame")
+                    break
+                if ((samples + element.data.length) < frameWindowSamples) {
+                    data.set(element.data, samples)
+                    samples += element.data.length
+                }
+                pos++
+            }
+            if (pos0 < pos && samples > frameWindowSamples * 0.75) {
+                const gender = await classify(data)
+                const posM = pos0 + Math.trunc((pos - pos0) * 0.25)
+                while (pos0 < posM && pos0 < posL) {
+                    const element = this.queueAC.peek(pos0)
+                    if (element === undefined || element.type !== "audio-frame")
+                        break
+                    element.gender = gender
+                    this.queueAC.touch()
+                    this.queueAC.walk(+1)
+                    pos0++
+                }
+                if (lastGender !== gender) {
+                    log("info", `gender now recognized as <${gender}>`)
+                    lastGender = gender
+                }
+            }
+
+            /* re-initiate working off round */
+            workingOff = false
+            workingOffTimer = setTimeout(() => { workOffQueue() }, 100)
+            this.queue.once("write", () => { workOffQueue() })
+        }
+        this.queue.once("write", () => { workOffQueue() })
+
+        /* define sample rate required by model */
+        const sampleRateTarget = 16000
+
+        /* provide Duplex stream and internally attach to classifier */
+        const self = this
+        this.stream = new Stream.Duplex({
+            writableObjectMode: true,
+            readableObjectMode: true,
+            decodeStrings: false,
+
+            /* receive audio chunk (writable side of stream) */
+            write (chunk: SpeechFlowChunk, encoding, callback) {
+                if (!Buffer.isBuffer(chunk.payload))
+                    callback(new Error("expected audio input as Buffer chunks"))
+                else if (chunk.payload.byteLength === 0)
+                    callback()
+                else {
+                    /* convert audio samples from PCM/I16/48KHz to PCM/F32/16KHz */
+                    let data = utils.convertBufToF32(chunk.payload, self.config.audioLittleEndian)
+                    const wav = new WaveFile()
+                    wav.fromScratch(self.config.audioChannels, self.config.audioSampleRate, "32f", data)
+                    wav.toSampleRate(sampleRateTarget, { method: "cubic" })
+                    data = wav.getSamples(false, Float32Array<ArrayBuffer>) as
+                        any as Float32Array<ArrayBuffer>
+
+                    /* queue chunk and converted data */
+                    self.queueRecv.append({ type: "audio-frame", chunk, data })
+
+                    callback()
+                }
+            },
+
+            /* receive no more audio chunks (writable side of stream) */
+            final (callback) {
+                /* signal end of file */
+                self.queueRecv.append({ type: "audio-eof" })
+                callback()
+            },
+
+            /* send audio chunk(s) (readable side of stream) */
+            read (_size) {
+                /* flush pending audio chunks */
+                const flushPendingChunks = () => {
+                    while (true) {
+                        const element = self.queueSend.peek()
+                        if (element === undefined)
+                            break
+                        else if (element.type === "audio-eof") {
+                            this.push(null)
+                            break
+                        }
+                        else if (element.type === "audio-frame"
+                            && element.gender === undefined)
+                            break
+                        const duration = utils.audioArrayDuration(element.data)
+                        log("info", `send chunk (${duration.toFixed(3)}s) with gender <${element.gender}>`)
+                        element.chunk.meta.set("gender", element.gender)
+                        this.push(element.chunk)
+                        self.queueSend.walk(+1)
+                        self.queue.trim()
+                    }
+                }
+
+                /* await forthcoming audio chunks */
+                const awaitForthcomingChunks = () => {
+                    const element = self.queueSend.peek()
+                    if (element !== undefined
+                        && element.type === "audio-frame"
+                        && element.gender !== undefined)
+                        flushPendingChunks()
+                    else
+                        self.queue.once("write", awaitForthcomingChunks)
+                }
+
+                const element = self.queueSend.peek()
+                if (element !== undefined && element.type === "audio-eof")
+                    this.push(null)
+                else if (element !== undefined
+                    && element.type === "audio-frame"
+                    && element.gender !== undefined)
+                    flushPendingChunks()
+                else
+                    self.queue.once("write", awaitForthcomingChunks)
+            }
+        })
+    }
+
+    /* close node */
+    async close () {
+        /* close stream */
+        if (this.stream !== null) {
+            this.stream.destroy()
+            this.stream = null
+        }
+
+        /* close classifier */
+        if (this.classifier !== null) {
+            this.classifier.dispose()
+            this.classifier = null
+        }
+    }
+}

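The new gender node tags every outgoing chunk with its classification result via chunk.meta.set("gender", ...). As a minimal, hypothetical sketch of how a downstream stage could consume that metadata (SpeechFlowChunk and its meta map are taken from the hunk above; the Transform wrapper, the .get() accessor, and the keep-female policy are illustrative assumptions, not part of this release):

    /* hypothetical consumer of the "gender" metadata set by the gender node above;
       drops every chunk whose recognized gender is not "female" (illustrative policy) */
    import Stream from "node:stream"
    import { SpeechFlowChunk } from "./speechflow-node"

    const keepFemaleOnly = new Stream.Transform({
        writableObjectMode: true,
        readableObjectMode: true,
        transform (chunk: SpeechFlowChunk, _encoding, callback) {
            const gender = chunk.meta.get("gender")  /* assumed Map-style accessor */
            if (gender === undefined || gender === "female")
                this.push(chunk)                     /* pass the chunk through */
            callback()                               /* otherwise drop it silently */
        }
    })
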
package/src/speechflow-node-a2a-meter.ts

@@ -5,7 +5,7 @@
 */
 
 /* standard dependencies */
-import Stream
+import Stream from "node:stream"
 
 /* external dependencies */
 import { getLUFS, getRMS, AudioData } from "audio-inspect"

@@ -40,7 +40,7 @@ export default class SpeechFlowNodeMeter extends SpeechFlowNode
     async open () {
         /* sanity check situation */
         if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
-            throw new Error("
+            throw new Error("meter node currently supports PCM-S16LE audio only")
 
         /* internal state */
         const sampleWindowDuration = 3 /* LUFS-S requires 3s */

@@ -50,7 +50,7 @@ export default class SpeechFlowNodeMeter extends SpeechFlowNode
         let lufss = 0
         let rms = 0
 
-        /* setup
+        /* setup loudness emitting interval */
         this.interval = setInterval(() => {
             this.log("info", `LUFS-S: ${lufss.toFixed(1)} dB, RMS: ${rms.toFixed(1)} dB`)
             this.sendResponse([ "meter", "LUFS-S", lufss ])

package/src/speechflow-node-a2a-mute.ts

@@ -72,6 +72,7 @@ export default class SpeechFlowNodeMute extends SpeechFlowNode
                 else if (self.muteMode === "silenced") {
                     /* pass-through a silenced chunk */
                     chunk = chunk.clone()
+                    chunk.meta.set("muted", true)
                     const buffer = chunk.payload as Buffer
                     buffer.fill(0)
                     callback()

package/src/speechflow-node-a2a-vad.ts

@@ -9,19 +9,24 @@ import Stream from "node:stream"
 
 /* external dependencies */
 import { RealTimeVAD } from "@ericedouard/vad-node-realtime"
-import { Duration } from "luxon"
 
 /* internal dependencies */
 import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
 import * as utils from "./speechflow-utils"
 
 /* audio stream queue element */
+type AudioQueueElementSegment = {
+    data: Float32Array,
+    isSpeech?: boolean
+}
 type AudioQueueElement = {
-    type:
-    chunk:
-
+    type: "audio-frame",
+    chunk: SpeechFlowChunk,
+    segmentIdx: number,
+    segmentData: AudioQueueElementSegment[],
+    isSpeech?: boolean
 } | {
-    type:
+    type: "audio-eof"
 }
 
 /* SpeechFlow node for VAD speech-to-speech processing */

@@ -89,10 +94,22 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
                 log("info", "VAD: speech end (segment too short)")
             },
             onFrameProcessed: (audio) => {
-                /* annotate the current audio
+                /* annotate the current audio segment */
                 const element = this.queueVAD.peek()
-                if (element
-
+                if (element === undefined || element.type !== "audio-frame")
+                    throw new Error("internal error which cannot happen: no more queued element")
+                const segment = element.segmentData[element.segmentIdx++]
+                segment.isSpeech = (audio.isSpeech > audio.notSpeech)
+
+                /* annotate the entire audio chunk */
+                if (element.segmentIdx >= element.segmentData.length) {
+                    let isSpeech = false
+                    for (const segment of element.segmentData) {
+                        if (segment.isSpeech) {
+                            isSpeech = true
+                            break
+                        }
+                    }
                     element.isSpeech = isSpeech
                     this.queueVAD.touch()
                     this.queueVAD.walk(+1)

@@ -102,14 +119,7 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
         this.vad.start()
 
         /* provide Duplex stream and internally attach to VAD */
-        const
-        const cfg = this.config
-        const queue = this.queue
-        const queueRecv = this.queueRecv
-        const queueSend = this.queueSend
-        const mode = this.params.mode
-        let carrySamples = new Float32Array()
-        let carryStart = Duration.fromDurationLike(0)
+        const self = this
         this.stream = new Stream.Duplex({
             writableObjectMode: true,
             readableObjectMode: true,

@@ -123,38 +133,34 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
                     callback()
                 else {
                     /* convert audio samples from PCM/I16 to PCM/F32 */
-
-                    let start = chunk.timestampStart
-
-                    /* merge previous carry samples */
-                    if (carrySamples.length > 0) {
-                        start = carryStart
-                        const merged = new Float32Array(carrySamples.length + data.length)
-                        merged.set(carrySamples)
-                        merged.set(data, carrySamples.length)
-                        data = merged
-                        carrySamples = new Float32Array()
-                    }
+                    const data = utils.convertBufToF32(chunk.payload, self.config.audioLittleEndian)
 
-                    /*
-
-                    const chunkSize =
+                    /* segment audio samples as individual VAD-sized frames */
+                    const segmentData: AudioQueueElementSegment[] = []
+                    const chunkSize = vadSamplesPerFrame * (self.config.audioSampleRate / vadSampleRateTarget)
                     const chunks = Math.trunc(data.length / chunkSize)
                     for (let i = 0; i < chunks; i++) {
                         const frame = data.slice(i * chunkSize, (i + 1) * chunkSize)
-                        const
-
-                        const end = start.plus(duration)
-                        const chunk = new SpeechFlowChunk(start, end, "final", "audio", buf)
-                        queueRecv.append({ type: "audio-frame", chunk })
-                        vad.processAudio(frame)
-                        start = end
+                        const segment: AudioQueueElementSegment = { data: frame }
+                        segmentData.push(segment)
                     }
+                    if ((chunks * chunkSize) < data.length) {
+                        const frame = new Float32Array(chunkSize)
+                        frame.fill(0)
+                        frame.set(data.slice(chunks * chunkSize, data.length))
+                        const segment: AudioQueueElementSegment = { data: frame }
+                        segmentData.push(segment)
+                    }
+
+                    /* queue the results */
+                    self.queueRecv.append({
+                        type: "audio-frame", chunk,
+                        segmentIdx: 0, segmentData
+                    })
 
-                    /*
-                    const
-
-                    carryStart = start
+                    /* push segments through Voice Activity Detection (VAD) */
+                    for (const segment of segmentData)
+                        self.vad!.processAudio(segment.data)
 
                     callback()
                 }

@@ -162,25 +168,8 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
 
             /* receive no more audio chunks (writable side of stream) */
             final (callback) {
-                /* flush pending audio chunks */
-                if (carrySamples.length > 0) {
-                    const chunkSize = (vadSamplesPerFrame * (cfg.audioSampleRate / vadSampleRateTarget))
-                    if (carrySamples.length < chunkSize) {
-                        const merged = new Float32Array(chunkSize)
-                        merged.set(carrySamples)
-                        merged.fill(0.0, carrySamples.length, chunkSize)
-                        carrySamples = merged
-                    }
-                    const buf = utils.convertF32ToBuf(carrySamples)
-                    const duration = utils.audioBufferDuration(buf)
-                    const end = carryStart.plus(duration)
-                    const chunk = new SpeechFlowChunk(carryStart, end, "final", "audio", buf)
-                    queueRecv.append({ type: "audio-frame", chunk })
-                    vad.processAudio(carrySamples)
-                }
-
                 /* signal end of file */
-                queueRecv.append({ type: "audio-eof" })
+                self.queueRecv.append({ type: "audio-eof" })
                 callback()
             },
 

@@ -192,7 +181,7 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
                 const flushPendingChunks = () => {
                     let pushed = 0
                     while (true) {
-                        const element = queueSend.peek()
+                        const element = self.queueSend.peek()
                         if (element === undefined)
                             break
                         else if (element.type === "audio-eof") {

@@ -202,19 +191,20 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
                         else if (element.type === "audio-frame"
                             && element.isSpeech === undefined)
                             break
-                        queueSend.walk(+1)
+                        self.queueSend.walk(+1)
+                        self.queue.trim()
                         if (element.isSpeech) {
                             this.push(element.chunk)
                             pushed++
                         }
-                        else if (mode === "silenced") {
+                        else if (self.params.mode === "silenced") {
                             const chunk = element.chunk.clone()
                             const buffer = chunk.payload as Buffer
                             buffer.fill(0)
                             this.push(chunk)
                             pushed++
                         }
-                        else if (mode === "unplugged" && pushed === 0)
+                        else if (self.params.mode === "unplugged" && pushed === 0)
                             /* we have to await chunks now, as in unplugged
                                mode we else would be never called again until
                                we at least once push a new chunk as the result */

@@ -224,16 +214,16 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
 
                 /* await forthcoming audio chunks */
                 const awaitForthcomingChunks = () => {
-                    const element = queueSend.peek()
+                    const element = self.queueSend.peek()
                     if (element !== undefined
                         && element.type === "audio-frame"
                         && element.isSpeech !== undefined)
                         flushPendingChunks()
                     else
-                        queue.once("write", awaitForthcomingChunks)
+                        self.queue.once("write", awaitForthcomingChunks)
                 }
 
-                const element = queueSend.peek()
+                const element = self.queueSend.peek()
                 if (element !== undefined && element.type === "audio-eof")
                     this.push(null)
                 else if (element !== undefined

@@ -241,7 +231,7 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
                     && element.isSpeech !== undefined)
                     flushPendingChunks()
                 else
-                    queue.once("write", awaitForthcomingChunks)
+                    self.queue.once("write", awaitForthcomingChunks)
             }
             tryToRead()
         }

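The rewritten VAD write handler above no longer carries partial samples over to the next chunk: each incoming chunk is sliced into fixed-size VAD frames and a short trailing remainder is zero-padded. A standalone sketch of that frame-size arithmetic follows (vadSamplesPerFrame, the 16 kHz VAD target rate, and the 48 kHz pipeline rate are assumed example values, not read from this hunk):

    /* sketch of the segmentation logic from the VAD hunk above (assumed constants) */
    const vadSamplesPerFrame  = 512      /* assumed VAD frame size at the target rate */
    const vadSampleRateTarget = 16000    /* assumed VAD model sample rate */
    const audioSampleRate     = 48000    /* assumed pipeline sample rate */

    function segmentIntoVADFrames (data: Float32Array): Float32Array[] {
        /* one VAD frame at 16 kHz corresponds to 512 * (48000 / 16000) = 1536 input samples */
        const chunkSize = vadSamplesPerFrame * (audioSampleRate / vadSampleRateTarget)
        const chunks = Math.trunc(data.length / chunkSize)
        const frames: Float32Array[] = []
        for (let i = 0; i < chunks; i++)
            frames.push(data.slice(i * chunkSize, (i + 1) * chunkSize))
        if (chunks * chunkSize < data.length) {
            const frame = new Float32Array(chunkSize)       /* zero-initialized padding */
            frame.set(data.slice(chunks * chunkSize))       /* copy the trailing remainder */
            frames.push(frame)
        }
        return frames
    }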