speechflow 0.9.9 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/README.md +48 -1
- package/dst/speechflow-node-a2a-ffmpeg.js +1 -0
- package/dst/speechflow-node-a2a-ffmpeg.js.map +1 -0
- package/dst/{speechflow-node-gemma.d.ts → speechflow-node-a2a-meter.d.ts} +2 -3
- package/dst/speechflow-node-a2a-meter.js +147 -0
- package/dst/speechflow-node-a2a-meter.js.map +1 -0
- package/dst/speechflow-node-a2a-mute.d.ts +16 -0
- package/dst/speechflow-node-a2a-mute.js +90 -0
- package/dst/speechflow-node-a2a-mute.js.map +1 -0
- package/dst/speechflow-node-a2a-vad.js +130 -289
- package/dst/speechflow-node-a2a-vad.js.map +1 -0
- package/dst/speechflow-node-a2a-wav.js +1 -0
- package/dst/speechflow-node-a2a-wav.js.map +1 -0
- package/dst/speechflow-node-a2t-deepgram.js +2 -1
- package/dst/speechflow-node-a2t-deepgram.js.map +1 -0
- package/dst/speechflow-node-t2a-elevenlabs.js +1 -0
- package/dst/speechflow-node-t2a-elevenlabs.js.map +1 -0
- package/dst/speechflow-node-t2a-kokoro.js +1 -0
- package/dst/speechflow-node-t2a-kokoro.js.map +1 -0
- package/dst/speechflow-node-t2t-deepl.js +1 -0
- package/dst/speechflow-node-t2t-deepl.js.map +1 -0
- package/dst/speechflow-node-t2t-format.js +1 -0
- package/dst/speechflow-node-t2t-format.js.map +1 -0
- package/dst/speechflow-node-t2t-ollama.js +1 -0
- package/dst/speechflow-node-t2t-ollama.js.map +1 -0
- package/dst/speechflow-node-t2t-openai.js +1 -0
- package/dst/speechflow-node-t2t-openai.js.map +1 -0
- package/dst/speechflow-node-t2t-subtitle.js +1 -0
- package/dst/speechflow-node-t2t-subtitle.js.map +1 -0
- package/dst/speechflow-node-t2t-transformers.js +10 -6
- package/dst/speechflow-node-t2t-transformers.js.map +1 -0
- package/dst/speechflow-node-x2x-trace.js +1 -0
- package/dst/speechflow-node-x2x-trace.js.map +1 -0
- package/dst/speechflow-node-xio-device.js +1 -0
- package/dst/speechflow-node-xio-device.js.map +1 -0
- package/dst/speechflow-node-xio-file.js +1 -0
- package/dst/speechflow-node-xio-file.js.map +1 -0
- package/dst/speechflow-node-xio-mqtt.js +1 -0
- package/dst/speechflow-node-xio-mqtt.js.map +1 -0
- package/dst/speechflow-node-xio-websocket.js +1 -0
- package/dst/speechflow-node-xio-websocket.js.map +1 -0
- package/dst/speechflow-node.d.ts +3 -0
- package/dst/speechflow-node.js +10 -0
- package/dst/speechflow-node.js.map +1 -0
- package/dst/speechflow-utils.d.ts +33 -0
- package/dst/speechflow-utils.js +183 -1
- package/dst/speechflow-utils.js.map +1 -0
- package/dst/speechflow.js +209 -6
- package/dst/speechflow.js.map +1 -0
- package/etc/speechflow.yaml +5 -3
- package/etc/stx.conf +1 -1
- package/etc/tsconfig.json +2 -2
- package/package.json +14 -8
- package/src/speechflow-node-a2a-meter.ts +125 -0
- package/src/speechflow-node-a2a-mute.ts +101 -0
- package/src/speechflow-node-a2a-vad.ts +266 -0
- package/src/speechflow-node-a2t-deepgram.ts +1 -1
- package/src/speechflow-node-t2t-transformers.ts +12 -7
- package/src/speechflow-node-xio-websocket.ts +5 -5
- package/src/speechflow-node.ts +12 -0
- package/src/speechflow-utils.ts +195 -0
- package/src/speechflow.ts +193 -6
- package/dst/speechflow-node-deepgram.d.ts +0 -12
- package/dst/speechflow-node-deepgram.js +0 -220
- package/dst/speechflow-node-deepl.d.ts +0 -12
- package/dst/speechflow-node-deepl.js +0 -128
- package/dst/speechflow-node-device.d.ts +0 -13
- package/dst/speechflow-node-device.js +0 -205
- package/dst/speechflow-node-elevenlabs.d.ts +0 -13
- package/dst/speechflow-node-elevenlabs.js +0 -182
- package/dst/speechflow-node-ffmpeg.d.ts +0 -13
- package/dst/speechflow-node-ffmpeg.js +0 -152
- package/dst/speechflow-node-file.d.ts +0 -11
- package/dst/speechflow-node-file.js +0 -176
- package/dst/speechflow-node-format.d.ts +0 -11
- package/dst/speechflow-node-format.js +0 -80
- package/dst/speechflow-node-gemma.js +0 -213
- package/dst/speechflow-node-mqtt.d.ts +0 -13
- package/dst/speechflow-node-mqtt.js +0 -181
- package/dst/speechflow-node-opus.d.ts +0 -12
- package/dst/speechflow-node-opus.js +0 -135
- package/dst/speechflow-node-subtitle.d.ts +0 -12
- package/dst/speechflow-node-subtitle.js +0 -96
- package/dst/speechflow-node-t2t-gemma.d.ts +0 -13
- package/dst/speechflow-node-t2t-gemma.js +0 -233
- package/dst/speechflow-node-t2t-opus.d.ts +0 -12
- package/dst/speechflow-node-t2t-opus.js +0 -135
- package/dst/speechflow-node-trace.d.ts +0 -11
- package/dst/speechflow-node-trace.js +0 -88
- package/dst/speechflow-node-wav.d.ts +0 -11
- package/dst/speechflow-node-wav.js +0 -170
- package/dst/speechflow-node-websocket.d.ts +0 -13
- package/dst/speechflow-node-websocket.js +0 -275
- package/dst/speechflow-node-whisper-common.d.ts +0 -34
- package/dst/speechflow-node-whisper-common.js +0 -7
- package/dst/speechflow-node-whisper-ggml.d.ts +0 -1
- package/dst/speechflow-node-whisper-ggml.js +0 -97
- package/dst/speechflow-node-whisper-onnx.d.ts +0 -1
- package/dst/speechflow-node-whisper-onnx.js +0 -131
- package/dst/speechflow-node-whisper-worker-ggml.d.ts +0 -1
- package/dst/speechflow-node-whisper-worker-ggml.js +0 -97
- package/dst/speechflow-node-whisper-worker-onnx.d.ts +0 -1
- package/dst/speechflow-node-whisper-worker-onnx.js +0 -131
- package/dst/speechflow-node-whisper-worker.d.ts +0 -1
- package/dst/speechflow-node-whisper-worker.js +0 -116
- package/dst/speechflow-node-whisper-worker2.d.ts +0 -1
- package/dst/speechflow-node-whisper-worker2.js +0 -82
- package/dst/speechflow-node-whisper.d.ts +0 -19
- package/dst/speechflow-node-whisper.js +0 -604
package/src/speechflow-node-a2a-mute.ts (ADDED)

@@ -0,0 +1,101 @@
+/*
+** SpeechFlow - Speech Processing Flow Graph
+** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+*/
+
+/* standard dependencies */
+import Stream from "node:stream"
+
+/* internal dependencies */
+import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
+
+/* the type of muting */
+type MuteMode =
+    "none"      | /* not muted */
+    "silenced"  | /* muted by changing audio samples to silence */
+    "unplugged"   /* muted by unplugging the audio sample flow */
+
+/* SpeechFlow node for muting in audio-to-audio passing */
+export default class SpeechFlowNodeMute extends SpeechFlowNode {
+    /* declare official node name */
+    public static name = "mute"
+
+    /* internal state */
+    private muteMode: MuteMode = "none"
+
+    /* construct node */
+    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
+        super(id, cfg, opts, args)
+
+        /* declare node configuration parameters */
+        this.configure({})
+
+        /* declare node input/output format */
+        this.input = "audio"
+        this.output = "audio"
+    }
+
+    /* receive external request */
+    async receiveRequest (params: any[]) {
+        if (params.length === 2 && params[0] === "mode") {
+            if (!params[1].match(/^(?:none|silenced|unplugged)$/))
+                throw new Error("mute: invalid mode argument in external request")
+            const muteMode: MuteMode = params[1] as MuteMode
+            this.setMuteMode(muteMode)
+            this.sendResponse([ "mute", "mode", muteMode ])
+        }
+        else
+            throw new Error("mute: invalid arguments in external request")
+    }
+
+    /* change mute mode */
+    setMuteMode (mode: MuteMode) {
+        this.log("info", `setting mute mode to "${mode}"`)
+        this.muteMode = mode
+    }
+
+    /* open node */
+    async open () {
+        /* establish a transform stream */
+        const self = this
+        this.stream = new Stream.Transform({
+            readableObjectMode: true,
+            writableObjectMode: true,
+            decodeStrings: false,
+            transform (chunk: SpeechFlowChunk, encoding, callback) {
+                if (!Buffer.isBuffer(chunk.payload))
+                    callback(new Error("invalid chunk payload type"))
+                else if (self.muteMode === "unplugged")
+                    /* pass-through nothing */
+                    callback()
+                else if (self.muteMode === "silenced") {
+                    /* pass-through a silenced chunk */
+                    chunk = chunk.clone()
+                    const buffer = chunk.payload as Buffer
+                    buffer.fill(0)
+                    callback()
+                }
+                else {
+                    /* pass-through original chunk */
+                    this.push(chunk)
+                    callback()
+                }
+            },
+            final (callback) {
+                this.push(null)
+                callback()
+            }
+        })
+    }
+
+    /* close node */
+    async close () {
+        /* close stream */
+        if (this.stream !== null) {
+            this.stream.destroy()
+            this.stream = null
+        }
+    }
+}
+
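Taken together with the `receiveRequest()`/`sendResponse()` additions to the base class (see the `speechflow-node.ts` hunk further below), the new node can be driven at runtime roughly as follows. This is a hedged usage sketch, not code from the package; how an application obtains the node instance from the flow graph is assumed here.

```ts
/* Hypothetical usage sketch: switch a running "mute" node between modes.
   Assumes `node` is a SpeechFlowNodeMute instance already obtained from
   the flow graph by application-specific means (not shown in this diff). */
import SpeechFlowNodeMute from "./speechflow-node-a2a-mute"

async function setMute (node: SpeechFlowNodeMute, mode: "none" | "silenced" | "unplugged") {
    /* responses are emitted as "send-response" events on the node */
    node.once("send-response", (args: any[]) => {
        console.log("mute node responded:", args) /* e.g. [ "mute", "mode", "silenced" ] */
    })

    /* a request is a plain argument array: [ "mode", <mode> ] */
    await node.receiveRequest([ "mode", mode ])
}
```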
package/src/speechflow-node-a2a-vad.ts (ADDED)

@@ -0,0 +1,266 @@
+/*
+** SpeechFlow - Speech Processing Flow Graph
+** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+*/
+
+/* standard dependencies */
+import Stream from "node:stream"
+
+/* external dependencies */
+import { RealTimeVAD } from "@ericedouard/vad-node-realtime"
+import { Duration } from "luxon"
+
+/* internal dependencies */
+import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
+import * as utils from "./speechflow-utils"
+
+/* audio stream queue element */
+type AudioQueueElement = {
+    type: "audio-frame",
+    chunk: SpeechFlowChunk,
+    isSpeech?: boolean
+} | {
+    type: "audio-eof"
+}
+
+/* SpeechFlow node for VAD speech-to-speech processing */
+export default class SpeechFlowNodeVAD extends SpeechFlowNode {
+    /* declare official node name */
+    public static name = "vad"
+
+    /* internal state */
+    private vad: RealTimeVAD | null = null
+    private queue = new utils.Queue<AudioQueueElement>()
+    private queueRecv = this.queue.pointerUse("recv")
+    private queueVAD = this.queue.pointerUse("vad")
+    private queueSend = this.queue.pointerUse("send")
+
+    /* construct node */
+    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
+        super(id, cfg, opts, args)
+
+        /* declare node configuration parameters */
+        this.configure({
+            mode: { type: "string", val: "unplugged", match: /^(?:silenced|unplugged)$/ },
+            posSpeechThreshold: { type: "number", val: 0.50 },
+            negSpeechThreshold: { type: "number", val: 0.35 },
+            minSpeechFrames: { type: "number", val: 2 },
+            redemptionFrames: { type: "number", val: 12 },
+            preSpeechPadFrames: { type: "number", val: 1 }
+        })
+
+        /* declare node input/output format */
+        this.input = "audio"
+        this.output = "audio"
+    }
+
+    /* open node */
+    async open () {
+        /* sanity check situation */
+        if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
+            throw new Error("VAD node currently supports PCM-S16LE audio only")
+
+        /* pass-through logging */
+        const log = (level: string, msg: string) => { this.log(level, msg) }
+
+        /* internal processing constants */
+        const vadSampleRateTarget = 16000 /* internal target of VAD */
+        const vadSamplesPerFrame = 512    /* required for VAD v5 */
+
+        /* establish Voice Activity Detection (VAD) facility */
+        this.vad = await RealTimeVAD.new({
+            model: "v5",
+            sampleRate: this.config.audioSampleRate, /* before resampling to 16KHz */
+            frameSamples: vadSamplesPerFrame,        /* after resampling to 16KHz */
+            positiveSpeechThreshold: this.params.posSpeechThreshold,
+            negativeSpeechThreshold: this.params.negSpeechThreshold,
+            minSpeechFrames: this.params.minSpeechFrames,
+            redemptionFrames: this.params.redemptionFrames,
+            preSpeechPadFrames: this.params.preSpeechPadFrames,
+            onSpeechStart: () => {
+                log("info", "VAD: speech start")
+            },
+            onSpeechEnd: (audio) => {
+                const duration = utils.audioArrayDuration(audio, vadSampleRateTarget)
+                log("info", `VAD: speech end (duration: ${duration.toFixed(2)}s)`)
+            },
+            onVADMisfire: () => {
+                log("info", "VAD: speech end (segment too short)")
+            },
+            onFrameProcessed: (audio) => {
+                /* annotate the current audio frame */
+                const element = this.queueVAD.peek()
+                if (element !== undefined && element.type === "audio-frame") {
+                    const isSpeech = audio.isSpeech > audio.notSpeech
+                    element.isSpeech = isSpeech
+                    this.queueVAD.touch()
+                    this.queueVAD.walk(+1)
+                }
+            }
+        })
+        this.vad.start()
+
+        /* provide Duplex stream and internally attach to VAD */
+        const vad = this.vad
+        const cfg = this.config
+        const queue = this.queue
+        const queueRecv = this.queueRecv
+        const queueSend = this.queueSend
+        const mode = this.params.mode
+        let carrySamples = new Float32Array()
+        let carryStart = Duration.fromDurationLike(0)
+        this.stream = new Stream.Duplex({
+            writableObjectMode: true,
+            readableObjectMode: true,
+            decodeStrings: false,
+
+            /* receive audio chunk (writable side of stream) */
+            write (chunk: SpeechFlowChunk, encoding, callback) {
+                if (!Buffer.isBuffer(chunk.payload))
+                    callback(new Error("expected audio input as Buffer chunks"))
+                else if (chunk.payload.byteLength === 0)
+                    callback()
+                else {
+                    /* convert audio samples from PCM/I16 to PCM/F32 */
+                    let data = utils.convertBufToF32(chunk.payload, cfg.audioLittleEndian)
+                    let start = chunk.timestampStart
+
+                    /* merge previous carry samples */
+                    if (carrySamples.length > 0) {
+                        start = carryStart
+                        const merged = new Float32Array(carrySamples.length + data.length)
+                        merged.set(carrySamples)
+                        merged.set(data, carrySamples.length)
+                        data = merged
+                        carrySamples = new Float32Array()
+                    }
+
+                    /* queue audio samples as individual VAD-sized frames
+                       and in parallel send it into the Voice Activity Detection (VAD) */
+                    const chunkSize = (vadSamplesPerFrame * (cfg.audioSampleRate / vadSampleRateTarget))
+                    const chunks = Math.trunc(data.length / chunkSize)
+                    for (let i = 0; i < chunks; i++) {
+                        const frame = data.slice(i * chunkSize, (i + 1) * chunkSize)
+                        const buf = utils.convertF32ToBuf(frame)
+                        const duration = utils.audioBufferDuration(buf)
+                        const end = start.plus(duration)
+                        const chunk = new SpeechFlowChunk(start, end, "final", "audio", buf)
+                        queueRecv.append({ type: "audio-frame", chunk })
+                        vad.processAudio(frame)
+                        start = end
+                    }
+
+                    /* remember new carry samples */
+                    const bulkLen = chunks * chunkSize
+                    carrySamples = data.slice(bulkLen)
+                    carryStart = start
+
+                    callback()
+                }
+            },
+
+            /* receive no more audio chunks (writable side of stream) */
+            final (callback) {
+                /* flush pending audio chunks */
+                if (carrySamples.length > 0) {
+                    const chunkSize = (vadSamplesPerFrame * (cfg.audioSampleRate / vadSampleRateTarget))
+                    if (carrySamples.length < chunkSize) {
+                        const merged = new Float32Array(chunkSize)
+                        merged.set(carrySamples)
+                        merged.fill(0.0, carrySamples.length, chunkSize)
+                        carrySamples = merged
+                    }
+                    const buf = utils.convertF32ToBuf(carrySamples)
+                    const duration = utils.audioBufferDuration(buf)
+                    const end = carryStart.plus(duration)
+                    const chunk = new SpeechFlowChunk(carryStart, end, "final", "audio", buf)
+                    queueRecv.append({ type: "audio-frame", chunk })
+                    vad.processAudio(carrySamples)
+                }
+
+                /* signal end of file */
+                queueRecv.append({ type: "audio-eof" })
+                callback()
+            },
+
+            /* send audio chunk(s) (readable side of stream) */
+            read (_size) {
+                /* try to perform read operation from scratch */
+                const tryToRead = () => {
+                    /* flush pending audio chunks */
+                    const flushPendingChunks = () => {
+                        let pushed = 0
+                        while (true) {
+                            const element = queueSend.peek()
+                            if (element === undefined)
+                                break
+                            else if (element.type === "audio-eof") {
+                                this.push(null)
+                                break
+                            }
+                            else if (element.type === "audio-frame"
+                                && element.isSpeech === undefined)
+                                break
+                            queueSend.walk(+1)
+                            if (element.isSpeech) {
+                                this.push(element.chunk)
+                                pushed++
+                            }
+                            else if (mode === "silenced") {
+                                const chunk = element.chunk.clone()
+                                const buffer = chunk.payload as Buffer
+                                buffer.fill(0)
+                                this.push(chunk)
+                                pushed++
+                            }
+                            else if (mode === "unplugged" && pushed === 0)
+                                /* we have to await chunks now, as in unplugged
+                                   mode we else would be never called again until
+                                   we at least once push a new chunk as the result */
+                                tryToRead()
+                        }
+                    }
+
+                    /* await forthcoming audio chunks */
+                    const awaitForthcomingChunks = () => {
+                        const element = queueSend.peek()
+                        if (element !== undefined
+                            && element.type === "audio-frame"
+                            && element.isSpeech !== undefined)
+                            flushPendingChunks()
+                        else
+                            queue.once("write", awaitForthcomingChunks)
+                    }
+
+                    const element = queueSend.peek()
+                    if (element !== undefined && element.type === "audio-eof")
+                        this.push(null)
+                    else if (element !== undefined
+                        && element.type === "audio-frame"
+                        && element.isSpeech !== undefined)
+                        flushPendingChunks()
+                    else
+                        queue.once("write", awaitForthcomingChunks)
+                }
+                tryToRead()
+            }
+        })
+    }
+
+    /* close node */
+    async close () {
+        /* close stream */
+        if (this.stream !== null) {
+            this.stream.destroy()
+            this.stream = null
+        }
+
+        /* close VAD */
+        if (this.vad !== null) {
+            await this.vad.flush()
+            this.vad.destroy()
+            this.vad = null
+        }
+    }
+}
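The frame-size arithmetic in the `write()` handler above may be easier to see with numbers plugged in. The VAD runs internally at 16 kHz on 512-sample frames, so the node cuts its input into frames of `512 * (audioSampleRate / 16000)` input samples and carries any remainder over to the next `write()`. A small illustrative calculation, assuming a 48 kHz mono PCM-S16LE input (the 48 kHz figure is an assumption for illustration, not a value taken from this diff):

```ts
/* Illustrative only: frame sizing as performed by the VAD node, for an
   assumed 48 kHz, mono, 16-bit little-endian input stream. */
const audioSampleRate     = 48000  /* assumed input sample rate */
const vadSampleRateTarget = 16000  /* internal target of the VAD */
const vadSamplesPerFrame  = 512    /* frame size required by VAD model v5 */

const chunkSize = vadSamplesPerFrame * (audioSampleRate / vadSampleRateTarget)
console.log(chunkSize)                             /* 1536 input samples per queued frame */
console.log(chunkSize * 2)                         /* 3072 bytes of PCM-S16LE per frame   */
console.log((chunkSize / audioSampleRate) * 1000)  /* 32 ms of audio per frame            */
```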
package/src/speechflow-node-a2t-deepgram.ts (CHANGED)

@@ -164,7 +164,7 @@ export default class SpeechFlowNodeDeepgram extends SpeechFlowNode {
                 if (chunk.payload.byteLength > 0) {
                     log("info", `Deepgram: send data (${chunk.payload.byteLength} bytes)`)
                     initTimeoutStart()
-                    dg.send(chunk.payload) /* intentionally discard all time information */
+                    dg.send(chunk.payload.buffer) /* intentionally discard all time information */
                 }
                 callback()
             }
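Some background on the one-line change above: `chunk.payload` is a Node.js `Buffer`, while `chunk.payload.buffer` is the `ArrayBuffer` underneath it. A `Buffer` may be a view into a larger pooled `ArrayBuffer`, so the two are not always byte-for-byte equivalent. The following helper is a hedged sketch (not part of the package) of how to copy exactly the viewed bytes when that distinction matters for a transport:

```ts
/* Hypothetical helper, not from the package: copy exactly the bytes a
   Buffer views, since buf.buffer may be a larger, shared ArrayBuffer
   (e.g. due to Node.js Buffer pooling for small allocations). */
function viewedBytes (buf: Buffer): ArrayBufferLike {
    return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength)
}
```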
package/src/speechflow-node-t2t-transformers.ts (CHANGED)

@@ -104,10 +104,11 @@ export default class SpeechFlowNodeTransformers extends SpeechFlowNode {
 
     /* open node */
     async open () {
-        /* instantiate Transformers engine and model */
         let model: string = ""
+
+        /* track download progress when instantiating Transformers engine and model */
         const progressState = new Map<string, number>()
-        const progressCallback = (progress: any) => {
+        const progressCallback: Transformers.ProgressCallback = (progress: any) => {
             let artifact = model
             if (typeof progress.file === "string")
                 artifact += `:${progress.file}`
@@ -126,26 +127,30 @@ export default class SpeechFlowNodeTransformers extends SpeechFlowNode {
                 progressState.delete(artifact)
             }
         }, 1000)
+
+        /* instantiate Transformers engine and model */
         if (this.params.model === "OPUS") {
             model = `onnx-community/opus-mt-${this.params.src}-${this.params.dst}`
-
-            cache_dir: path.join(this.config.cacheDir, "
+            const pipeline = Transformers.pipeline("translation", model, {
+                cache_dir: path.join(this.config.cacheDir, "transformers"),
                 dtype: "q4",
-                device: "
+                device: "auto",
                 progress_callback: progressCallback
             })
+            this.translator = await pipeline
             clearInterval(interval)
             if (this.translator === null)
                 throw new Error("failed to instantiate translator pipeline")
         }
         else if (this.params.model === "SmolLM3") {
             model = "HuggingFaceTB/SmolLM3-3B-ONNX"
-
+            const pipeline = Transformers.pipeline("text-generation", model, {
                 cache_dir: path.join(this.config.cacheDir, "transformers"),
                 dtype: "q4",
-                device: "
+                device: "auto",
                 progress_callback: progressCallback
             })
+            this.generator = await pipeline
             clearInterval(interval)
             if (this.generator === null)
                 throw new Error("failed to instantiate generator pipeline")

(Several removed lines in this hunk were truncated in the source listing and are reproduced here as truncated.)
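The two hunks above show only the edges of the progress-reporting logic: the callback's opening lines, the `}, 1000)` that closes a one-second interval, and the `clearInterval(interval)` calls after each pipeline resolves. As a rough, hypothetical sketch of that overall pattern (simplified names, assumed `@huggingface/transformers` import; the package's own in-between code is not shown in this diff):

```ts
/* Hypothetical, condensed sketch of the download-progress pattern:
   per-artifact percentages are collected in a Map by the progress
   callback, logged at most once per second by an interval timer, and
   the timer is cleared once the pipeline promise resolves. */
import * as Transformers from "@huggingface/transformers" /* assumed import */

async function loadWithProgress (model: string) {
    const progressState = new Map<string, number>()
    const progressCallback = (progress: any) => {
        let artifact = model
        if (typeof progress.file === "string")
            artifact += `:${progress.file}`
        if (typeof progress.progress === "number")
            progressState.set(artifact, progress.progress)
    }
    const interval = setInterval(() => {
        for (const [ artifact, percent ] of progressState.entries())
            console.log(`downloading ${artifact}: ${percent.toFixed(1)}%`)
    }, 1000)
    const translator = await Transformers.pipeline("translation", model, {
        dtype: "q4",
        device: "auto",
        progress_callback: progressCallback
    })
    clearInterval(interval)
    return translator
}
```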
package/src/speechflow-node-xio-websocket.ts (CHANGED)

@@ -9,7 +9,7 @@ import Stream from "node:stream"
 
 /* external dependencies */
 import ws from "ws"
-import
+import ReconnWebSocket, { ErrorEvent } from "@opensumi/reconnecting-websocket"
 
 /* internal dependencies */
 import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
@@ -22,7 +22,7 @@ export default class SpeechFlowNodeWebsocket extends SpeechFlowNode {
 
     /* internal state */
     private server: ws.WebSocketServer | null = null
-    private client:
+    private client: ReconnWebSocket | null = null
 
     /* construct node */
     constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
@@ -153,7 +153,7 @@ export default class SpeechFlowNodeWebsocket extends SpeechFlowNode {
         }
         else if (this.params.connect !== "") {
             /* connect remotely to a Websocket port */
-            this.client = new
+            this.client = new ReconnWebSocket(this.params.connect, [], {
                 WebSocket: ws,
                 WebSocketOptions: {},
                 reconnectionDelayGrowFactor: 1.3,
@@ -162,10 +162,10 @@ export default class SpeechFlowNodeWebsocket extends SpeechFlowNode {
                 connectionTimeout: 4000,
                 minUptime: 5000
             })
-            this.client.addEventListener("open", (ev
+            this.client.addEventListener("open", (ev) => {
                 this.log("info", `connection opened to URL ${this.params.connect}`)
             })
-            this.client.addEventListener("close", (ev
+            this.client.addEventListener("close", (ev) => {
                 this.log("info", `connection closed to URL ${this.params.connect}`)
             })
             this.client.addEventListener("error", (ev: ErrorEvent) => {

(Several removed lines in these hunks were truncated in the source listing and are reproduced here as truncated.)
package/src/speechflow-node.ts (CHANGED)

@@ -36,6 +36,8 @@ export class SpeechFlowChunk {
 
 /* the base class for all SpeechFlow nodes */
 export default class SpeechFlowNode extends Events.EventEmitter {
+    public static name: string | undefined
+
     /* general constant configuration (for reference) */
     config = {
         audioChannels: 1, /* audio mono channel */
@@ -80,6 +82,16 @@ export default class SpeechFlowNode extends Events.EventEmitter {
         this.timeZeroOffset = this.timeZero.diff(this.timeOpen)
     }
 
+    /* receive external request */
+    async receiveRequest (args: any[]) {
+        /* no-op */
+    }
+
+    /* send external response */
+    sendResponse (args: any[]) {
+        this.emit("send-response", args)
+    }
+
     /* INTERNAL: utility function: create "params" attribute from constructor of sub-classes */
     configure (spec: { [ id: string ]: { type: string, pos?: number, val?: any, match?: RegExp | ((x: any) => boolean) } }) {
         for (const name of Object.keys(spec)) {