speechflow 1.1.0 → 1.2.0
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- package/CHANGELOG.md +11 -0
- package/README.md +37 -3
- package/dst/speechflow-node-a2a-gender.d.ts +17 -0
- package/dst/speechflow-node-a2a-gender.js +272 -0
- package/dst/speechflow-node-a2a-gender.js.map +1 -0
- package/dst/speechflow-node-a2a-meter.js +2 -2
- package/dst/speechflow-node-a2a-meter.js.map +1 -1
- package/dst/speechflow-node-a2a-mute.js +1 -0
- package/dst/speechflow-node-a2a-mute.js.map +1 -1
- package/dst/speechflow-node-a2a-vad.js +47 -63
- package/dst/speechflow-node-a2a-vad.js.map +1 -1
- package/dst/speechflow-node-a2a-wav.js +145 -122
- package/dst/speechflow-node-a2a-wav.js.map +1 -1
- package/dst/speechflow-node-a2t-deepgram.js +13 -3
- package/dst/speechflow-node-a2t-deepgram.js.map +1 -1
- package/dst/speechflow-node-t2a-elevenlabs.js +10 -5
- package/dst/speechflow-node-t2a-elevenlabs.js.map +1 -1
- package/dst/speechflow-node-t2a-kokoro.js.map +1 -1
- package/dst/speechflow-node-t2t-deepl.js.map +1 -1
- package/dst/speechflow-node-t2t-format.js.map +1 -1
- package/dst/speechflow-node-t2t-ollama.js.map +1 -1
- package/dst/speechflow-node-t2t-openai.js.map +1 -1
- package/dst/speechflow-node-t2t-subtitle.js.map +1 -1
- package/dst/speechflow-node-t2t-transformers.js.map +1 -1
- package/dst/speechflow-node-x2x-filter.d.ts +11 -0
- package/dst/speechflow-node-x2x-filter.js +113 -0
- package/dst/speechflow-node-x2x-filter.js.map +1 -0
- package/dst/speechflow-node-x2x-trace.js +24 -10
- package/dst/speechflow-node-x2x-trace.js.map +1 -1
- package/dst/speechflow-node-xio-device.js +14 -5
- package/dst/speechflow-node-xio-device.js.map +1 -1
- package/dst/speechflow-node-xio-file.js +58 -27
- package/dst/speechflow-node-xio-file.js.map +1 -1
- package/dst/speechflow-node-xio-mqtt.js.map +1 -1
- package/dst/speechflow-node-xio-websocket.js.map +1 -1
- package/dst/speechflow-node.js +1 -0
- package/dst/speechflow-node.js.map +1 -1
- package/dst/speechflow-utils.d.ts +14 -1
- package/dst/speechflow-utils.js +110 -2
- package/dst/speechflow-utils.js.map +1 -1
- package/dst/speechflow.js +23 -4
- package/dst/speechflow.js.map +1 -1
- package/etc/speechflow.yaml +51 -24
- package/package.json +6 -5
- package/src/speechflow-node-a2a-gender.ts +272 -0
- package/src/speechflow-node-a2a-meter.ts +3 -3
- package/src/speechflow-node-a2a-mute.ts +1 -0
- package/src/speechflow-node-a2a-vad.ts +58 -68
- package/src/speechflow-node-a2a-wav.ts +128 -91
- package/src/speechflow-node-a2t-deepgram.ts +15 -4
- package/src/speechflow-node-t2a-elevenlabs.ts +13 -8
- package/src/speechflow-node-t2a-kokoro.ts +3 -3
- package/src/speechflow-node-t2t-deepl.ts +2 -2
- package/src/speechflow-node-t2t-format.ts +2 -2
- package/src/speechflow-node-t2t-ollama.ts +2 -2
- package/src/speechflow-node-t2t-openai.ts +2 -2
- package/src/speechflow-node-t2t-subtitle.ts +1 -1
- package/src/speechflow-node-t2t-transformers.ts +2 -2
- package/src/speechflow-node-x2x-filter.ts +122 -0
- package/src/speechflow-node-x2x-trace.ts +28 -11
- package/src/speechflow-node-xio-device.ts +20 -8
- package/src/speechflow-node-xio-file.ts +74 -36
- package/src/speechflow-node-xio-mqtt.ts +3 -3
- package/src/speechflow-node-xio-websocket.ts +1 -1
- package/src/speechflow-node.ts +2 -0
- package/src/speechflow-utils.ts +81 -2
- package/src/speechflow.ts +46 -27
package/src/speechflow-node-a2a-vad.ts:

@@ -9,19 +9,24 @@ import Stream from "node:stream"
 
 /* external dependencies */
 import { RealTimeVAD } from "@ericedouard/vad-node-realtime"
-import { Duration } from "luxon"
 
 /* internal dependencies */
 import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
 import * as utils from "./speechflow-utils"
 
 /* audio stream queue element */
+type AudioQueueElementSegment = {
+    data: Float32Array,
+    isSpeech?: boolean
+}
 type AudioQueueElement = {
-    type:
-    chunk:
-
+    type: "audio-frame",
+    chunk: SpeechFlowChunk,
+    segmentIdx: number,
+    segmentData: AudioQueueElementSegment[],
+    isSpeech?: boolean
 } | {
-    type:
+    type: "audio-eof"
 }
 
 /* SpeechFlow node for VAD speech-to-speech processing */
@@ -89,10 +94,22 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
         log("info", "VAD: speech end (segment too short)")
     },
     onFrameProcessed: (audio) => {
-        /* annotate the current audio
+        /* annotate the current audio segment */
         const element = this.queueVAD.peek()
-        if (element
-
+        if (element === undefined || element.type !== "audio-frame")
+            throw new Error("internal error which cannot happen: no more queued element")
+        const segment = element.segmentData[element.segmentIdx++]
+        segment.isSpeech = (audio.isSpeech > audio.notSpeech)
+
+        /* annotate the entire audio chunk */
+        if (element.segmentIdx >= element.segmentData.length) {
+            let isSpeech = false
+            for (const segment of element.segmentData) {
+                if (segment.isSpeech) {
+                    isSpeech = true
+                    break
+                }
+            }
             element.isSpeech = isSpeech
             this.queueVAD.touch()
             this.queueVAD.walk(+1)
@@ -102,14 +119,7 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
     this.vad.start()
 
     /* provide Duplex stream and internally attach to VAD */
-    const
-    const cfg = this.config
-    const queue = this.queue
-    const queueRecv = this.queueRecv
-    const queueSend = this.queueSend
-    const mode = this.params.mode
-    let carrySamples = new Float32Array()
-    let carryStart = Duration.fromDurationLike(0)
+    const self = this
     this.stream = new Stream.Duplex({
         writableObjectMode: true,
         readableObjectMode: true,
@@ -123,38 +133,34 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
             callback()
         else {
             /* convert audio samples from PCM/I16 to PCM/F32 */
-
-            let start = chunk.timestampStart
-
-            /* merge previous carry samples */
-            if (carrySamples.length > 0) {
-                start = carryStart
-                const merged = new Float32Array(carrySamples.length + data.length)
-                merged.set(carrySamples)
-                merged.set(data, carrySamples.length)
-                data = merged
-                carrySamples = new Float32Array()
-            }
+            const data = utils.convertBufToF32(chunk.payload, self.config.audioLittleEndian)
 
-            /*
-
-            const chunkSize =
+            /* segment audio samples as individual VAD-sized frames */
+            const segmentData: AudioQueueElementSegment[] = []
+            const chunkSize = vadSamplesPerFrame * (self.config.audioSampleRate / vadSampleRateTarget)
             const chunks = Math.trunc(data.length / chunkSize)
             for (let i = 0; i < chunks; i++) {
                 const frame = data.slice(i * chunkSize, (i + 1) * chunkSize)
-                const
-
-                const end = start.plus(duration)
-                const chunk = new SpeechFlowChunk(start, end, "final", "audio", buf)
-                queueRecv.append({ type: "audio-frame", chunk })
-                vad.processAudio(frame)
-                start = end
+                const segment: AudioQueueElementSegment = { data: frame }
+                segmentData.push(segment)
             }
+            if ((chunks * chunkSize) < data.length) {
+                const frame = new Float32Array(chunkSize)
+                frame.fill(0)
+                frame.set(data.slice(chunks * chunkSize, data.length))
+                const segment: AudioQueueElementSegment = { data: frame }
+                segmentData.push(segment)
+            }
+
+            /* queue the results */
+            self.queueRecv.append({
+                type: "audio-frame", chunk,
+                segmentIdx: 0, segmentData
+            })
 
-            /*
-            const
-
-            carryStart = start
+            /* push segments through Voice Activity Detection (VAD) */
+            for (const segment of segmentData)
+                self.vad!.processAudio(segment.data)
 
             callback()
         }
@@ -162,25 +168,8 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
 
         /* receive no more audio chunks (writable side of stream) */
         final (callback) {
-            /* flush pending audio chunks */
-            if (carrySamples.length > 0) {
-                const chunkSize = (vadSamplesPerFrame * (cfg.audioSampleRate / vadSampleRateTarget))
-                if (carrySamples.length < chunkSize) {
-                    const merged = new Float32Array(chunkSize)
-                    merged.set(carrySamples)
-                    merged.fill(0.0, carrySamples.length, chunkSize)
-                    carrySamples = merged
-                }
-                const buf = utils.convertF32ToBuf(carrySamples)
-                const duration = utils.audioBufferDuration(buf)
-                const end = carryStart.plus(duration)
-                const chunk = new SpeechFlowChunk(carryStart, end, "final", "audio", buf)
-                queueRecv.append({ type: "audio-frame", chunk })
-                vad.processAudio(carrySamples)
-            }
-
             /* signal end of file */
-            queueRecv.append({ type: "audio-eof" })
+            self.queueRecv.append({ type: "audio-eof" })
             callback()
         },
 
@@ -192,7 +181,7 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
         const flushPendingChunks = () => {
             let pushed = 0
             while (true) {
-                const element = queueSend.peek()
+                const element = self.queueSend.peek()
                 if (element === undefined)
                     break
                 else if (element.type === "audio-eof") {
@@ -202,19 +191,20 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
                 else if (element.type === "audio-frame"
                     && element.isSpeech === undefined)
                     break
-                queueSend.walk(+1)
+                self.queueSend.walk(+1)
+                self.queue.trim()
                 if (element.isSpeech) {
                     this.push(element.chunk)
                     pushed++
                 }
-                else if (mode === "silenced") {
+                else if (self.params.mode === "silenced") {
                     const chunk = element.chunk.clone()
                     const buffer = chunk.payload as Buffer
                     buffer.fill(0)
                     this.push(chunk)
                     pushed++
                 }
-                else if (mode === "unplugged" && pushed === 0)
+                else if (self.params.mode === "unplugged" && pushed === 0)
                     /* we have to await chunks now, as in unplugged
                        mode we else would be never called again until
                        we at least once push a new chunk as the result */
@@ -224,16 +214,16 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
 
         /* await forthcoming audio chunks */
         const awaitForthcomingChunks = () => {
-            const element = queueSend.peek()
+            const element = self.queueSend.peek()
            if (element !== undefined
                 && element.type === "audio-frame"
                 && element.isSpeech !== undefined)
                 flushPendingChunks()
             else
-                queue.once("write", awaitForthcomingChunks)
+                self.queue.once("write", awaitForthcomingChunks)
         }
 
-        const element = queueSend.peek()
+        const element = self.queueSend.peek()
         if (element !== undefined && element.type === "audio-eof")
             this.push(null)
         else if (element !== undefined
@@ -241,7 +231,7 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
             && element.isSpeech !== undefined)
             flushPendingChunks()
         else
-            queue.once("write", awaitForthcomingChunks)
+            self.queue.once("write", awaitForthcomingChunks)
         }
         tryToRead()
     }
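The VAD node now slices each incoming audio chunk into fixed-size frames and zero-pads the last partial frame, instead of carrying leftover samples over to the next chunk. Below is a minimal, self-contained sketch of just that segmentation step; the function name, frame size, and sample counts are illustrative, not the module's actual API.

```ts
/* Sketch: split a Float32Array of samples into fixed-size VAD frames,
   zero-padding the last partial frame — mirroring the segmentation
   loop added to the VAD node above. */
function segmentSamples (data: Float32Array, frameSize: number): Float32Array[] {
    const frames: Float32Array[] = []
    const full = Math.trunc(data.length / frameSize)
    for (let i = 0; i < full; i++)
        frames.push(data.slice(i * frameSize, (i + 1) * frameSize))
    if (full * frameSize < data.length) {
        const tail = new Float32Array(frameSize)            /* implicitly zero-filled */
        tail.set(data.slice(full * frameSize, data.length))
        frames.push(tail)
    }
    return frames
}

/* example: 48kHz input, 512-sample VAD frames scaled up to the input rate */
const frameSize = 512 * (48000 / 16000)                      /* = 1536 samples */
const frames = segmentSamples(new Float32Array(4000), frameSize)
console.log(frames.length)                                   /* 3 (two full frames + one padded) */
```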
package/src/speechflow-node-a2a-wav.ts:

@@ -7,52 +7,69 @@
 /* standard dependencies */
 import Stream from "node:stream"
 
-/* external dependencies */
-import wav from "wav"
-
 /* internal dependencies */
-import SpeechFlowNode
-[… old lines 15-55 removed; their content is not shown by the registry's diff rendering …]
+import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
+
+/* write WAV header */
+const writeWavHeader = (
+    length: number,
+    options?: { audioFormat?: number, channels?: number, sampleRate?: number, bitDepth?: number }
+) => {
+    const audioFormat = options?.audioFormat ?? 0x001 /* PCM */
+    const channels    = options?.channels    ?? 1     /* mono */
+    const sampleRate  = options?.sampleRate  ?? 44100 /* 44KHz */
+    const bitDepth    = options?.bitDepth    ?? 16    /* 16-Bit */
+
+    const headerLength = 44
+    const dataLength   = length || (4294967295 - 100)
+    const fileSize     = dataLength + headerLength
+    const header       = Buffer.alloc(headerLength)
+
+    const RIFF = Buffer.alloc(4, "RIFF")
+    const WAVE = Buffer.alloc(4, "WAVE")
+    const fmt  = Buffer.alloc(4, "fmt ")
+    const data = Buffer.alloc(4, "data")
+    const byteRate   = (sampleRate * channels * bitDepth) / 8
+    const blockAlign = (channels * bitDepth) / 8
+
+    let offset = 0
+    RIFF.copy(header, offset); offset += RIFF.length
+    header.writeUInt32LE(fileSize - 8, offset); offset += 4
+    WAVE.copy(header, offset); offset += WAVE.length
+    fmt.copy(header, offset); offset += fmt.length
+    header.writeUInt32LE(16, offset); offset += 4
+    header.writeUInt16LE(audioFormat, offset); offset += 2
+    header.writeUInt16LE(channels, offset); offset += 2
+    header.writeUInt32LE(sampleRate, offset); offset += 4
+    header.writeUInt32LE(byteRate, offset); offset += 4
+    header.writeUInt16LE(blockAlign, offset); offset += 2
+    header.writeUInt16LE(bitDepth, offset); offset += 2
+    data.copy(header, offset); offset += data.length
+    header.writeUInt32LE(dataLength, offset); offset += 4
+
+    return header
+}
+
+/* read WAV header */
+const readWavHeader = (buffer: Buffer) => {
+    let offset = 0
+    const riffHead     = buffer.subarray(offset, offset + 4).toString(); offset += 4
+    const fileSize     = buffer.readUInt32LE(offset); offset += 4
+    const waveHead     = buffer.subarray(offset, offset + 4).toString(); offset += 4
+    const fmtHead      = buffer.subarray(offset, offset + 4).toString(); offset += 4
+    const formatLength = buffer.readUInt32LE(offset); offset += 4
+    const audioFormat  = buffer.readUInt16LE(offset); offset += 2
+    const channels     = buffer.readUInt16LE(offset); offset += 2
+    const sampleRate   = buffer.readUInt32LE(offset); offset += 4
+    const byteRate     = buffer.readUInt32LE(offset); offset += 4
+    const blockAlign   = buffer.readUInt16LE(offset); offset += 2
+    const bitDepth     = buffer.readUInt16LE(offset); offset += 2
+    const data         = buffer.subarray(offset, offset + 4).toString(); offset += 4
+    const dataLength   = buffer.readUInt32LE(offset); offset += 4
+
+    return {
+        riffHead, fileSize, waveHead, fmtHead, formatLength, audioFormat,
+        channels, sampleRate, byteRate, blockAlign, bitDepth, data, dataLength
     }
 }
 
@@ -77,52 +94,72 @@ export default class SpeechFlowNodeWAV extends SpeechFlowNode
 
     /* open node */
    async open () {
-[… old lines 80-125 removed; their content is not shown by the registry's diff rendering …]
+        /* establish a transform stream */
+        const self = this
+        let firstChunk = true
+        this.stream = new Stream.Transform({
+            readableObjectMode: true,
+            writableObjectMode: true,
+            decodeStrings: false,
+            transform (chunk: SpeechFlowChunk, encoding, callback) {
+                if (!Buffer.isBuffer(chunk.payload))
+                    callback(new Error("invalid chunk payload type"))
+                else if (firstChunk) {
+                    if (self.params.mode === "encode") {
+                        /* convert raw/PCM to WAV/PCM
+                           (NOTICE: as this is a continuous stream, the
+                           resulting WAV header is not 100% conforming
+                           to the WAV standard, as it has to use a zero
+                           duration information. This cannot be changed in
+                           a stream-based processing.) */
+                        const headerBuffer = writeWavHeader(0, {
+                            audioFormat: 0x0001 /* PCM */,
+                            channels:    self.config.audioChannels,
+                            sampleRate:  self.config.audioSampleRate,
+                            bitDepth:    self.config.audioBitDepth
+                        })
+                        const headerChunk = chunk.clone()
+                        headerChunk.payload = headerBuffer
+                        this.push(headerChunk)
+                        this.push(chunk)
+                        callback()
+                    }
+                    else if (self.params.mode === "decode") {
+                        /* convert WAV/PCM to raw/PCM */
+                        const header = readWavHeader(chunk.payload)
+                        self.log("info", "WAV audio stream: " +
+                            `audioFormat=${header.audioFormat === 0x0001 ? "PCM" :
+                                "0x" + (header.audioFormat as number).toString(16).padStart(4, "0")} ` +
+                            `channels=${header.channels} ` +
+                            `sampleRate=${header.sampleRate} ` +
+                            `bitDepth=${header.bitDepth}`)
+                        if (header.audioFormat !== 0x0001 /* PCM */)
+                            throw new Error("WAV not based on PCM format")
+                        if (header.bitDepth !== 16)
+                            throw new Error("WAV not based on 16 bit samples")
+                        if (header.sampleRate !== 48000)
+                            throw new Error("WAV not based on 48Khz sample rate")
+                        if (header.channels !== 1)
+                            throw new Error("WAV not based on mono channel")
+                        chunk.payload = chunk.payload.subarray(44)
+                        this.push(chunk)
+                        callback()
+                    }
+                    else
+                        throw new Error(`invalid operation mode "${self.params.mode}"`)
+                }
+                else {
+                    /* pass-through original chunk */
+                    this.push(chunk)
+                    callback()
+                }
+                firstChunk = false
+            },
+            final (callback) {
+                this.push(null)
+                callback()
+            }
+        })
     }
 
     /* close node */
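The WAV node drops the external `wav` package in favor of hand-rolled read/write helpers for the canonical 44-byte PCM header. As a plausibility check of that fixed layout, here is a small standalone sketch that writes such a header and reads the fields back at their documented offsets; it re-implements the layout locally rather than importing the package's own helpers, and the helper names are illustrative.

```ts
/* Sketch: 44-byte canonical WAV/PCM header — write it, then read the fields back. */
const writeHeader = (sampleRate: number, channels: number, bitDepth: number, dataLength: number): Buffer => {
    const h = Buffer.alloc(44)
    h.write("RIFF", 0); h.writeUInt32LE(36 + dataLength, 4); h.write("WAVE", 8)
    h.write("fmt ", 12); h.writeUInt32LE(16, 16)                  /* "fmt " sub-chunk size */
    h.writeUInt16LE(0x0001, 20)                                   /* audio format: PCM */
    h.writeUInt16LE(channels, 22)
    h.writeUInt32LE(sampleRate, 24)
    h.writeUInt32LE((sampleRate * channels * bitDepth) / 8, 28)   /* byte rate */
    h.writeUInt16LE((channels * bitDepth) / 8, 32)                /* block align */
    h.writeUInt16LE(bitDepth, 34)
    h.write("data", 36); h.writeUInt32LE(dataLength, 40)
    return h
}
const readHeader = (b: Buffer) => ({
    audioFormat: b.readUInt16LE(20),
    channels:    b.readUInt16LE(22),
    sampleRate:  b.readUInt32LE(24),
    bitDepth:    b.readUInt16LE(34),
    dataLength:  b.readUInt32LE(40)
})

const header = writeHeader(48000, 1, 16, 0)
console.log(readHeader(header))
/* { audioFormat: 1, channels: 1, sampleRate: 48000, bitDepth: 16, dataLength: 0 } */
```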
package/src/speechflow-node-a2t-deepgram.ts:

@@ -5,7 +5,7 @@
 */
 
 /* standard dependencies */
-import Stream
+import Stream from "node:stream"
 
 /* external dependencies */
 import * as Deepgram from "@deepgram/sdk"
@@ -65,6 +65,9 @@ export default class SpeechFlowNodeDeepgram extends SpeechFlowNode
         /* create queue for results */
         const queue = new utils.SingleQueue<SpeechFlowChunk>()
 
+        /* create a store for the meta information */
+        const metastore = new utils.TimeStore<Map<string, any>>()
+
         /* connect to Deepgram API */
         const deepgram = Deepgram.createClient(this.params.key)
         let language = "en"
@@ -86,21 +89,27 @@ export default class SpeechFlowNodeDeepgram extends SpeechFlowNode
             smart_format: true,
             punctuate: true,
             filler_words: true,
-            diarize:
+            diarize: false,
             numerals: true,
             profanity_filter: false
         })
 
         /* hook onto Deepgram API events */
         this.dg.on(Deepgram.LiveTranscriptionEvents.Transcript, async (data) => {
-            const text = (data.channel?.alternatives[0]
+            const text = (data.channel?.alternatives[0]?.transcript as string) ?? ""
             if (text === "")
                 this.log("info", `Deepgram: empty/dummy text received (start: ${data.start}s, duration: ${data.duration}s)`)
             else {
                 this.log("info", `Deepgram: text received (start: ${data.start}s, duration: ${data.duration}s): "${text}"`)
                 const start = Duration.fromMillis(data.start * 1000).plus(this.timeZeroOffset)
                 const end = start.plus({ seconds: data.duration })
-                const
+                const metas = metastore.fetch(start, end)
+                const meta = metas.reduce((prev: Map<string, any>, curr: Map<string, any>) => {
+                    curr.forEach((val, key) => { prev.set(key, val) })
+                    return prev
+                }, new Map<string, any>())
+                metastore.prune(start)
+                const chunk = new SpeechFlowChunk(start, end, "final", "text", text, meta)
                 queue.write(chunk)
             }
         })
@@ -180,6 +189,8 @@ export default class SpeechFlowNodeDeepgram extends SpeechFlowNode
                 if (chunk.payload.byteLength > 0) {
                     log("info", `Deepgram: send data (${chunk.payload.byteLength} bytes)`)
                     initTimeoutStart()
+                    if (chunk.meta.size > 0)
+                        metastore.store(chunk.timestampStart, chunk.timestampEnd, chunk.meta)
                     dg.send(chunk.payload.buffer) /* intentionally discard all time information */
                 }
                 callback()
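The Deepgram node now buffers per-chunk metadata in a time-indexed store (`utils.TimeStore`) while the audio is in flight, then merges every stored map whose time range overlaps the transcript Deepgram returns. Only the call sites (`store`, `fetch`, `prune`) appear in this diff, so the following is a hedged sketch of what such a store might look like, using plain millisecond timestamps instead of the package's Luxon Durations; class name and metadata keys are illustrative.

```ts
/* Sketch: a minimal time-indexed metadata store with store/fetch/prune,
   approximating the utils.TimeStore usage visible in the Deepgram node. */
type Entry<T> = { start: number, end: number, value: T }

class TimeStoreSketch<T> {
    private entries: Entry<T>[] = []
    store (start: number, end: number, value: T): void {
        this.entries.push({ start, end, value })
    }
    /* return all values whose time range overlaps [start, end] */
    fetch (start: number, end: number): T[] {
        return this.entries
            .filter((e) => e.start < end && e.end > start)
            .map((e) => e.value)
    }
    /* drop entries that ended before the given time */
    prune (before: number): void {
        this.entries = this.entries.filter((e) => e.end >= before)
    }
}

/* usage, mirroring how the node merges overlapping metadata maps */
const store = new TimeStoreSketch<Map<string, any>>()
store.store(0, 500, new Map([[ "speaker", "A" ]]))          /* illustrative keys */
store.store(500, 1000, new Map([[ "gender", "female" ]]))
const merged = store.fetch(0, 1000).reduce((prev, curr) => {
    curr.forEach((val, key) => { prev.set(key, val) })
    return prev
}, new Map<string, any>())
store.prune(1000)
console.log(merged)  /* Map { "speaker" => "A", "gender" => "female" } */
```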
package/src/speechflow-node-t2a-elevenlabs.ts:

@@ -5,7 +5,7 @@
 */
 
 /* standard dependencies */
-import Stream
+import Stream from "node:stream"
 
 /* external dependencies */
 import * as ElevenLabs from "@elevenlabs/elevenlabs-js"
@@ -30,11 +30,13 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode
 
         /* declare node configuration parameters */
         this.configure({
-            key:
-            voice:
-            language:
-            speed:
-
+            key:        { type: "string", val: process.env.SPEECHFLOW_ELEVENLABS_KEY },
+            voice:      { type: "string", val: "Brian", pos: 0, match: /^(?:Brittney|Cassidy|Leonie|Mark|Brian)$/ },
+            language:   { type: "string", val: "en", pos: 1, match: /^(?:de|en)$/ },
+            speed:      { type: "number", val: 1.00, pos: 2, match: (n: number) => n >= 0.7 && n <= 1.2 },
+            stability:  { type: "number", val: 0.5, pos: 3, match: (n: number) => n >= 0.0 && n <= 1.0 },
+            similarity: { type: "number", val: 0.75, pos: 4, match: (n: number) => n >= 0.0 && n <= 1.0 },
+            optimize:   { type: "string", val: "latency", pos: 5, match: /^(?:latency|quality)$/ }
         })
 
         /* declare node input/output format */
@@ -90,7 +92,7 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode
 
         /* perform text-to-speech operation with Elevenlabs API */
         const model = this.params.optimize === "quality" ?
-            "
+            "eleven_turbo_v2_5" :
             "eleven_flash_v2_5"
         const speechStream = (text: string) => {
             this.log("info", `ElevenLabs: send text "${text}"`)
@@ -101,7 +103,9 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode
                 outputFormat: `pcm_${maxSampleRate}` as ElevenLabs.ElevenLabs.OutputFormat,
                 seed: 815, /* arbitrary, but fixated by us */
                 voiceSettings: {
-                    speed:
+                    speed:           this.params.speed,
+                    stability:       this.params.stability,
+                    similarityBoost: this.params.similarity
                 }
             }, {
                 timeoutInSeconds: 30,
@@ -128,6 +132,7 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode
                 if (Buffer.isBuffer(chunk.payload))
                     callback(new Error("invalid chunk payload type"))
                 else {
+                    log("info", `ElevenLabs: send text: ${JSON.stringify(chunk.payload)}`)
                     speechStream(chunk.payload).then((stream) => {
                         getStreamAsBuffer(stream).then((buffer) => {
                             const bufferResampled = resampler.processChunk(buffer)
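With the new `stability`, `similarity`, and `optimize` parameters, the ElevenLabs node now forwards explicit voice settings and picks the model by a latency/quality trade-off. Below is a hedged sketch of that mapping as a plain helper; it uses only the option names visible in this diff, the surrounding SDK call is elided, and the helper itself is hypothetical rather than part of the node's API.

```ts
/* Sketch: map the node's new parameters onto the request shape
   visible in this diff (model choice plus voiceSettings). */
type ElevenLabsParams = {
    speed: number                     /* 0.7 .. 1.2 */
    stability: number                 /* 0.0 .. 1.0 */
    similarity: number                /* 0.0 .. 1.0 */
    optimize: "latency" | "quality"
}

const requestOptions = (params: ElevenLabsParams) => ({
    model: params.optimize === "quality" ? "eleven_turbo_v2_5" : "eleven_flash_v2_5",
    voiceSettings: {
        speed:           params.speed,
        stability:       params.stability,
        similarityBoost: params.similarity
    }
})

console.log(requestOptions({ speed: 1.0, stability: 0.5, similarity: 0.75, optimize: "latency" }))
```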
package/src/speechflow-node-t2a-kokoro.ts:

@@ -5,11 +5,11 @@
 */
 
 /* standard dependencies */
-import Stream
+import Stream from "node:stream"
 
 /* external dependencies */
-import { KokoroTTS }
-import SpeexResampler
+import { KokoroTTS } from "kokoro-js"
+import SpeexResampler from "speex-resampler"
 
 /* internal dependencies */
 import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
package/src/speechflow-node-t2t-deepl.ts:

@@ -5,10 +5,10 @@
 */
 
 /* standard dependencies */
-import Stream
+import Stream from "node:stream"
 
 /* external dependencies */
-import * as DeepL
+import * as DeepL from "deepl-node"
 
 /* internal dependencies */
 import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
package/src/speechflow-node-t2t-format.ts:

@@ -5,10 +5,10 @@
 */
 
 /* standard dependencies */
-import Stream
+import Stream from "node:stream"
 
 /* external dependencies */
-import wrapText
+import wrapText from "wrap-text"
 
 /* internal dependencies */
 import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
package/src/speechflow-node-t2t-ollama.ts:

@@ -5,10 +5,10 @@
 */
 
 /* standard dependencies */
-import Stream
+import Stream from "node:stream"
 
 /* external dependencies */
-import { Ollama }
+import { Ollama } from "ollama"
 
 /* internal dependencies */
 import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
package/src/speechflow-node-t2t-openai.ts:

@@ -5,10 +5,10 @@
 */
 
 /* standard dependencies */
-import Stream
+import Stream from "node:stream"
 
 /* external dependencies */
-import OpenAI
+import OpenAI from "openai"
 
 /* internal dependencies */
 import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"