speechflow 1.1.0 → 1.2.0

Files changed (67)
  1. package/CHANGELOG.md +11 -0
  2. package/README.md +37 -3
  3. package/dst/speechflow-node-a2a-gender.d.ts +17 -0
  4. package/dst/speechflow-node-a2a-gender.js +272 -0
  5. package/dst/speechflow-node-a2a-gender.js.map +1 -0
  6. package/dst/speechflow-node-a2a-meter.js +2 -2
  7. package/dst/speechflow-node-a2a-meter.js.map +1 -1
  8. package/dst/speechflow-node-a2a-mute.js +1 -0
  9. package/dst/speechflow-node-a2a-mute.js.map +1 -1
  10. package/dst/speechflow-node-a2a-vad.js +47 -63
  11. package/dst/speechflow-node-a2a-vad.js.map +1 -1
  12. package/dst/speechflow-node-a2a-wav.js +145 -122
  13. package/dst/speechflow-node-a2a-wav.js.map +1 -1
  14. package/dst/speechflow-node-a2t-deepgram.js +13 -3
  15. package/dst/speechflow-node-a2t-deepgram.js.map +1 -1
  16. package/dst/speechflow-node-t2a-elevenlabs.js +10 -5
  17. package/dst/speechflow-node-t2a-elevenlabs.js.map +1 -1
  18. package/dst/speechflow-node-t2a-kokoro.js.map +1 -1
  19. package/dst/speechflow-node-t2t-deepl.js.map +1 -1
  20. package/dst/speechflow-node-t2t-format.js.map +1 -1
  21. package/dst/speechflow-node-t2t-ollama.js.map +1 -1
  22. package/dst/speechflow-node-t2t-openai.js.map +1 -1
  23. package/dst/speechflow-node-t2t-subtitle.js.map +1 -1
  24. package/dst/speechflow-node-t2t-transformers.js.map +1 -1
  25. package/dst/speechflow-node-x2x-filter.d.ts +11 -0
  26. package/dst/speechflow-node-x2x-filter.js +113 -0
  27. package/dst/speechflow-node-x2x-filter.js.map +1 -0
  28. package/dst/speechflow-node-x2x-trace.js +24 -10
  29. package/dst/speechflow-node-x2x-trace.js.map +1 -1
  30. package/dst/speechflow-node-xio-device.js +14 -5
  31. package/dst/speechflow-node-xio-device.js.map +1 -1
  32. package/dst/speechflow-node-xio-file.js +58 -27
  33. package/dst/speechflow-node-xio-file.js.map +1 -1
  34. package/dst/speechflow-node-xio-mqtt.js.map +1 -1
  35. package/dst/speechflow-node-xio-websocket.js.map +1 -1
  36. package/dst/speechflow-node.js +1 -0
  37. package/dst/speechflow-node.js.map +1 -1
  38. package/dst/speechflow-utils.d.ts +14 -1
  39. package/dst/speechflow-utils.js +110 -2
  40. package/dst/speechflow-utils.js.map +1 -1
  41. package/dst/speechflow.js +23 -4
  42. package/dst/speechflow.js.map +1 -1
  43. package/etc/speechflow.yaml +51 -24
  44. package/package.json +6 -5
  45. package/src/speechflow-node-a2a-gender.ts +272 -0
  46. package/src/speechflow-node-a2a-meter.ts +3 -3
  47. package/src/speechflow-node-a2a-mute.ts +1 -0
  48. package/src/speechflow-node-a2a-vad.ts +58 -68
  49. package/src/speechflow-node-a2a-wav.ts +128 -91
  50. package/src/speechflow-node-a2t-deepgram.ts +15 -4
  51. package/src/speechflow-node-t2a-elevenlabs.ts +13 -8
  52. package/src/speechflow-node-t2a-kokoro.ts +3 -3
  53. package/src/speechflow-node-t2t-deepl.ts +2 -2
  54. package/src/speechflow-node-t2t-format.ts +2 -2
  55. package/src/speechflow-node-t2t-ollama.ts +2 -2
  56. package/src/speechflow-node-t2t-openai.ts +2 -2
  57. package/src/speechflow-node-t2t-subtitle.ts +1 -1
  58. package/src/speechflow-node-t2t-transformers.ts +2 -2
  59. package/src/speechflow-node-x2x-filter.ts +122 -0
  60. package/src/speechflow-node-x2x-trace.ts +28 -11
  61. package/src/speechflow-node-xio-device.ts +20 -8
  62. package/src/speechflow-node-xio-file.ts +74 -36
  63. package/src/speechflow-node-xio-mqtt.ts +3 -3
  64. package/src/speechflow-node-xio-websocket.ts +1 -1
  65. package/src/speechflow-node.ts +2 -0
  66. package/src/speechflow-utils.ts +81 -2
  67. package/src/speechflow.ts +46 -27
package/src/speechflow-node-a2a-vad.ts
@@ -9,19 +9,24 @@ import Stream from "node:stream"
 
 /* external dependencies */
 import { RealTimeVAD } from "@ericedouard/vad-node-realtime"
-import { Duration } from "luxon"
 
 /* internal dependencies */
 import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
 import * as utils from "./speechflow-utils"
 
 /* audio stream queue element */
+type AudioQueueElementSegment = {
+    data: Float32Array,
+    isSpeech?: boolean
+}
 type AudioQueueElement = {
-    type: "audio-frame",
-    chunk: SpeechFlowChunk,
-    isSpeech?: boolean
+    type: "audio-frame",
+    chunk: SpeechFlowChunk,
+    segmentIdx: number,
+    segmentData: AudioQueueElementSegment[],
+    isSpeech?: boolean
 } | {
-    type: "audio-eof"
+    type: "audio-eof"
 }
 
 /* SpeechFlow node for VAD speech-to-speech processing */
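Note: with this change a queued element carries its VAD-sized segments alongside the original chunk. For orientation, a hypothetical element as the writable side below constructs it (the 512-sample frame is an assumption; the real size derives from vadSamplesPerFrame and the sample-rate ratio):

    const element: AudioQueueElement = {
        type: "audio-frame",
        chunk,                                            // the original, untouched SpeechFlowChunk
        segmentIdx: 0,                                    // next segment awaiting VAD annotation
        segmentData: [ { data: new Float32Array(512) } ]  // per-frame samples, isSpeech still unset
    }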
@@ -89,10 +94,22 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
                 log("info", "VAD: speech end (segment too short)")
             },
             onFrameProcessed: (audio) => {
-                /* annotate the current audio frame */
+                /* annotate the current audio segment */
                 const element = this.queueVAD.peek()
-                if (element !== undefined && element.type === "audio-frame") {
-                    const isSpeech = audio.isSpeech > audio.notSpeech
+                if (element === undefined || element.type !== "audio-frame")
+                    throw new Error("internal error which cannot happen: no more queued element")
+                const segment = element.segmentData[element.segmentIdx++]
+                segment.isSpeech = (audio.isSpeech > audio.notSpeech)
+
+                /* annotate the entire audio chunk */
+                if (element.segmentIdx >= element.segmentData.length) {
+                    let isSpeech = false
+                    for (const segment of element.segmentData) {
+                        if (segment.isSpeech) {
+                            isSpeech = true
+                            break
+                        }
+                    }
                     element.isSpeech = isSpeech
                     this.queueVAD.touch()
                     this.queueVAD.walk(+1)
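Note: the loop above merely ORs the per-segment flags once the last segment has been processed; an equivalent one-liner, shown only for clarity:

    element.isSpeech = element.segmentData.some((segment) => segment.isSpeech === true)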
@@ -102,14 +119,7 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
         this.vad.start()
 
         /* provide Duplex stream and internally attach to VAD */
-        const vad = this.vad
-        const cfg = this.config
-        const queue = this.queue
-        const queueRecv = this.queueRecv
-        const queueSend = this.queueSend
-        const mode = this.params.mode
-        let carrySamples = new Float32Array()
-        let carryStart = Duration.fromDurationLike(0)
+        const self = this
         this.stream = new Stream.Duplex({
             writableObjectMode: true,
             readableObjectMode: true,
@@ -123,38 +133,34 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
                     callback()
                 else {
                     /* convert audio samples from PCM/I16 to PCM/F32 */
-                    let data = utils.convertBufToF32(chunk.payload, cfg.audioLittleEndian)
-                    let start = chunk.timestampStart
-
-                    /* merge previous carry samples */
-                    if (carrySamples.length > 0) {
-                        start = carryStart
-                        const merged = new Float32Array(carrySamples.length + data.length)
-                        merged.set(carrySamples)
-                        merged.set(data, carrySamples.length)
-                        data = merged
-                        carrySamples = new Float32Array()
-                    }
+                    const data = utils.convertBufToF32(chunk.payload, self.config.audioLittleEndian)
 
-                    /* queue audio samples as individual VAD-sized frames
-                       and in parallel send it into the Voice Activity Detection (VAD) */
-                    const chunkSize = (vadSamplesPerFrame * (cfg.audioSampleRate / vadSampleRateTarget))
+                    /* segment audio samples as individual VAD-sized frames */
+                    const segmentData: AudioQueueElementSegment[] = []
+                    const chunkSize = vadSamplesPerFrame * (self.config.audioSampleRate / vadSampleRateTarget)
                     const chunks = Math.trunc(data.length / chunkSize)
                     for (let i = 0; i < chunks; i++) {
                         const frame = data.slice(i * chunkSize, (i + 1) * chunkSize)
-                        const buf = utils.convertF32ToBuf(frame)
-                        const duration = utils.audioBufferDuration(buf)
-                        const end = start.plus(duration)
-                        const chunk = new SpeechFlowChunk(start, end, "final", "audio", buf)
-                        queueRecv.append({ type: "audio-frame", chunk })
-                        vad.processAudio(frame)
-                        start = end
+                        const segment: AudioQueueElementSegment = { data: frame }
+                        segmentData.push(segment)
                     }
+                    if ((chunks * chunkSize) < data.length) {
+                        const frame = new Float32Array(chunkSize)
+                        frame.fill(0)
+                        frame.set(data.slice(chunks * chunkSize, data.length))
+                        const segment: AudioQueueElementSegment = { data: frame }
+                        segmentData.push(segment)
+                    }
+
+                    /* queue the results */
+                    self.queueRecv.append({
+                        type: "audio-frame", chunk,
+                        segmentIdx: 0, segmentData
+                    })
 
-                    /* remember new carry samples */
-                    const bulkLen = chunks * chunkSize
-                    carrySamples = data.slice(bulkLen)
-                    carryStart = start
+                    /* push segments through Voice Activity Detection (VAD) */
+                    for (const segment of segmentData)
+                        self.vad!.processAudio(segment.data)
 
                     callback()
                 }
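Note: the segmentation above is the core of this rewrite: a chunk is cut into VAD-sized frames and the trailing remainder is zero-padded instead of being carried into the next chunk. A minimal standalone sketch of that arithmetic (vadSamplesPerFrame = 512 and a 48 kHz to 16 kHz ratio are assumptions matching a typical Silero VAD setup):

    const vadSamplesPerFrame = 512                    // assumed VAD frame size
    const ratio = 48000 / 16000                       // assumed sample-rate ratio
    const chunkSize = vadSamplesPerFrame * ratio      // samples per segment (here: 1536)

    function segment (data: Float32Array): Float32Array[] {
        const frames: Float32Array[] = []
        const chunks = Math.trunc(data.length / chunkSize)
        for (let i = 0; i < chunks; i++)
            frames.push(data.slice(i * chunkSize, (i + 1) * chunkSize))
        if (chunks * chunkSize < data.length) {
            const frame = new Float32Array(chunkSize)  // zero-filled by default
            frame.set(data.slice(chunks * chunkSize))
            frames.push(frame)
        }
        return frames
    }

    segment(new Float32Array(4000)).length             // 3: two full frames plus one padded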
@@ -162,25 +168,8 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
 
             /* receive no more audio chunks (writable side of stream) */
             final (callback) {
-                /* flush pending audio chunks */
-                if (carrySamples.length > 0) {
-                    const chunkSize = (vadSamplesPerFrame * (cfg.audioSampleRate / vadSampleRateTarget))
-                    if (carrySamples.length < chunkSize) {
-                        const merged = new Float32Array(chunkSize)
-                        merged.set(carrySamples)
-                        merged.fill(0.0, carrySamples.length, chunkSize)
-                        carrySamples = merged
-                    }
-                    const buf = utils.convertF32ToBuf(carrySamples)
-                    const duration = utils.audioBufferDuration(buf)
-                    const end = carryStart.plus(duration)
-                    const chunk = new SpeechFlowChunk(carryStart, end, "final", "audio", buf)
-                    queueRecv.append({ type: "audio-frame", chunk })
-                    vad.processAudio(carrySamples)
-                }
-
                 /* signal end of file */
-                queueRecv.append({ type: "audio-eof" })
+                self.queueRecv.append({ type: "audio-eof" })
                 callback()
             },
 
@@ -192,7 +181,7 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
                 const flushPendingChunks = () => {
                     let pushed = 0
                     while (true) {
-                        const element = queueSend.peek()
+                        const element = self.queueSend.peek()
                         if (element === undefined)
                             break
                         else if (element.type === "audio-eof") {
@@ -202,19 +191,20 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
                         else if (element.type === "audio-frame"
                             && element.isSpeech === undefined)
                             break
-                        queueSend.walk(+1)
+                        self.queueSend.walk(+1)
+                        self.queue.trim()
                         if (element.isSpeech) {
                             this.push(element.chunk)
                             pushed++
                         }
-                        else if (mode === "silenced") {
+                        else if (self.params.mode === "silenced") {
                             const chunk = element.chunk.clone()
                             const buffer = chunk.payload as Buffer
                             buffer.fill(0)
                             this.push(chunk)
                             pushed++
                         }
-                        else if (mode === "unplugged" && pushed === 0)
+                        else if (self.params.mode === "unplugged" && pushed === 0)
                             /* we have to await chunks now, as in unplugged
                                mode we else would be never called again until
                                we at least once push a new chunk as the result */
@@ -224,16 +214,16 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
 
                 /* await forthcoming audio chunks */
                 const awaitForthcomingChunks = () => {
-                    const element = queueSend.peek()
+                    const element = self.queueSend.peek()
                     if (element !== undefined
                         && element.type === "audio-frame"
                         && element.isSpeech !== undefined)
                         flushPendingChunks()
                     else
-                        queue.once("write", awaitForthcomingChunks)
+                        self.queue.once("write", awaitForthcomingChunks)
                 }
 
-                const element = queueSend.peek()
+                const element = self.queueSend.peek()
                 if (element !== undefined && element.type === "audio-eof")
                     this.push(null)
                 else if (element !== undefined
@@ -241,7 +231,7 @@ export default class SpeechFlowNodeVAD extends SpeechFlowNode
                     && element.isSpeech !== undefined)
                     flushPendingChunks()
                 else
-                    queue.once("write", awaitForthcomingChunks)
            }
             tryToRead()
         }
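Note: the read side now has three outcomes per annotated element. A compact sketch of the assumed semantics of the two non-speech modes, condensed from the branches above (the real code clones the chunk before zeroing it):

    function handleNonSpeech (chunk: { payload: Buffer }, mode: "silenced" | "unplugged") {
        if (mode === "silenced") {
            chunk.payload.fill(0)   // keep the chunk and its timing, but silence the samples
            return chunk
        }
        return null                 // "unplugged": suppress the chunk entirely
    }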
package/src/speechflow-node-a2a-wav.ts
@@ -7,52 +7,69 @@
 /* standard dependencies */
 import Stream from "node:stream"
 
-/* external dependencies */
-import wav from "wav"
-
 /* internal dependencies */
-import SpeechFlowNode from "./speechflow-node"
-import * as utils from "./speechflow-utils"
-
-/* utility class for wrapping a custom stream into a regular Transform stream */
-class StreamWrapper extends Stream.Transform {
-    private foreignStream: any
-    constructor (foreignStream: any, options: Stream.TransformOptions = {}) {
-        options.readableObjectMode = true
-        options.writableObjectMode = true
-        super(options)
-        this.foreignStream = foreignStream
-        this.foreignStream.on("data", (chunk: any) => {
-            this.push(chunk)
-        })
-        this.foreignStream.on("error", (err: Error) => {
-            this.emit("error", err)
-        })
-        this.foreignStream.on("end", () => {
-            this.push(null)
-        })
-    }
-    _transform (chunk: any, encoding: BufferEncoding, callback: Stream.TransformCallback): void {
-        try {
-            const canContinue = this.foreignStream.write(chunk)
-            if (canContinue)
-                callback()
-            else
-                this.foreignStream.once("drain", callback)
-        }
-        catch (err) {
-            callback(err as Error)
-        }
-    }
-    _flush (callback: Stream.TransformCallback): void {
-        try {
-            if (typeof this.foreignStream.end === "function")
-                this.foreignStream.end()
-            callback()
-        }
-        catch (err) {
-            callback(err as Error)
-        }
+import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
+
+/* write WAV header */
+const writeWavHeader = (
+    length: number,
+    options?: { audioFormat?: number, channels?: number, sampleRate?: number, bitDepth?: number }
+) => {
+    const audioFormat = options?.audioFormat ?? 0x001 /* PCM */
+    const channels = options?.channels ?? 1 /* mono */
+    const sampleRate = options?.sampleRate ?? 44100 /* 44KHz */
+    const bitDepth = options?.bitDepth ?? 16 /* 16-Bit */
+
+    const headerLength = 44
+    const dataLength = length || (4294967295 - 100)
+    const fileSize = dataLength + headerLength
+    const header = Buffer.alloc(headerLength)
+
+    const RIFF = Buffer.alloc(4, "RIFF")
+    const WAVE = Buffer.alloc(4, "WAVE")
+    const fmt = Buffer.alloc(4, "fmt ")
+    const data = Buffer.alloc(4, "data")
+    const byteRate = (sampleRate * channels * bitDepth) / 8
+    const blockAlign = (channels * bitDepth) / 8
+
+    let offset = 0
+    RIFF.copy(header, offset); offset += RIFF.length
+    header.writeUInt32LE(fileSize - 8, offset); offset += 4
+    WAVE.copy(header, offset); offset += WAVE.length
+    fmt.copy(header, offset); offset += fmt.length
+    header.writeUInt32LE(16, offset); offset += 4
+    header.writeUInt16LE(audioFormat, offset); offset += 2
+    header.writeUInt16LE(channels, offset); offset += 2
+    header.writeUInt32LE(sampleRate, offset); offset += 4
+    header.writeUInt32LE(byteRate, offset); offset += 4
+    header.writeUInt16LE(blockAlign, offset); offset += 2
+    header.writeUInt16LE(bitDepth, offset); offset += 2
+    data.copy(header, offset); offset += data.length
+    header.writeUInt32LE(dataLength, offset); offset += 4
+
+    return header
+}
+
+/* read WAV header */
+const readWavHeader = (buffer: Buffer) => {
+    let offset = 0
+    const riffHead = buffer.subarray(offset, offset + 4).toString(); offset += 4
+    const fileSize = buffer.readUInt32LE(offset); offset += 4
+    const waveHead = buffer.subarray(offset, offset + 4).toString(); offset += 4
+    const fmtHead = buffer.subarray(offset, offset + 4).toString(); offset += 4
+    const formatLength = buffer.readUInt32LE(offset); offset += 4
+    const audioFormat = buffer.readUInt16LE(offset); offset += 2
+    const channels = buffer.readUInt16LE(offset); offset += 2
+    const sampleRate = buffer.readUInt32LE(offset); offset += 4
+    const byteRate = buffer.readUInt32LE(offset); offset += 4
+    const blockAlign = buffer.readUInt16LE(offset); offset += 2
+    const bitDepth = buffer.readUInt16LE(offset); offset += 2
+    const data = buffer.subarray(offset, offset + 4).toString(); offset += 4
+    const dataLength = buffer.readUInt32LE(offset); offset += 4
+
+    return {
+        riffHead, fileSize, waveHead, fmtHead, formatLength, audioFormat,
+        channels, sampleRate, byteRate, blockAlign, bitDepth, data, dataLength
     }
 }
 
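Note: a quick round trip shows the wire format; this usage example only calls the two helpers defined above:

    const header = writeWavHeader(0, { channels: 1, sampleRate: 48000, bitDepth: 16 })
    header.length              // 44, the canonical PCM WAV header size
    const parsed = readWavHeader(header)
    parsed.riffHead            // "RIFF"
    parsed.sampleRate          // 48000
    parsed.dataLength          // 4294967195, i.e. 4294967295 - 100, the unknown-length placeholder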
@@ -77,52 +94,72 @@ export default class SpeechFlowNodeWAV extends SpeechFlowNode
 
     /* open node */
     async open () {
-        if (this.params.mode === "encode") {
-            /* convert raw/PCM to WAV/PCM */
-            /* NOTICE: as this is a continuous stream, the resulting WAV header is not 100%
-               conforming to the WAV standard, as it has to use a zero duration information.
-               This cannot be changed in a stream-based processing. */
-            const writer = new wav.Writer({
-                format: 0x0001 /* PCM */,
-                channels: this.config.audioChannels,
-                sampleRate: this.config.audioSampleRate,
-                bitDepth: this.config.audioBitDepth
-            })
-            this.stream = new StreamWrapper(writer)
-        }
-        else if (this.params.mode === "decode") {
-            /* convert WAV/PCM to raw/PCM */
-            const reader = new wav.Reader()
-            reader.on("format", (format: any) => {
-                this.log("info", `WAV audio stream: format=${format.audioFormat === 0x0001 ? "PCM" :
-                    "0x" + (format.audioFormat as number).toString(16).padStart(4, "0")} ` +
-                    `bitDepth=${format.bitDepth} ` +
-                    `signed=${format.signed ? "yes" : "no"} ` +
-                    `endian=${format.endianness} ` +
-                    `sampleRate=${format.sampleRate} ` +
-                    `channels=${format.channels}`)
-                if (format.audioFormat !== 0x0001 /* PCM */)
-                    throw new Error("WAV not based on PCM format")
-                if (format.bitDepth !== 16)
-                    throw new Error("WAV not based on 16 bit samples")
-                if (!format.signed)
-                    throw new Error("WAV not based on signed integers")
-                if (format.endianness !== "LE")
-                    throw new Error("WAV not based on little endianness")
-                if (format.sampleRate !== 48000)
-                    throw new Error("WAV not based on 48Khz sample rate")
-                if (format.channels !== 1)
-                    throw new Error("WAV not based on mono channel")
-            })
-            this.stream = new StreamWrapper(reader)
-        }
-        else
-            throw new Error(`invalid operation mode "${this.params.mode}"`)
-
-        /* convert regular stream into object-mode stream */
-        const wrapper1 = utils.createTransformStreamForWritableSide()
-        const wrapper2 = utils.createTransformStreamForReadableSide("audio", () => this.timeZero)
-        this.stream = Stream.compose(wrapper1, this.stream, wrapper2)
+        /* establish a transform stream */
+        const self = this
+        let firstChunk = true
+        this.stream = new Stream.Transform({
+            readableObjectMode: true,
+            writableObjectMode: true,
+            decodeStrings: false,
+            transform (chunk: SpeechFlowChunk, encoding, callback) {
+                if (!Buffer.isBuffer(chunk.payload))
+                    callback(new Error("invalid chunk payload type"))
+                else if (firstChunk) {
+                    if (self.params.mode === "encode") {
+                        /* convert raw/PCM to WAV/PCM
+                           (NOTICE: as this is a continuous stream, the
+                           resulting WAV header is not 100% conforming
+                           to the WAV standard, as it has to use a zero
+                           duration information. This cannot be changed in
+                           a stream-based processing.) */
+                        const headerBuffer = writeWavHeader(0, {
+                            audioFormat: 0x0001 /* PCM */,
+                            channels: self.config.audioChannels,
+                            sampleRate: self.config.audioSampleRate,
+                            bitDepth: self.config.audioBitDepth
+                        })
+                        const headerChunk = chunk.clone()
+                        headerChunk.payload = headerBuffer
+                        this.push(headerChunk)
+                        this.push(chunk)
+                        callback()
+                    }
+                    else if (self.params.mode === "decode") {
+                        /* convert WAV/PCM to raw/PCM */
+                        const header = readWavHeader(chunk.payload)
+                        self.log("info", "WAV audio stream: " +
+                            `audioFormat=${header.audioFormat === 0x0001 ? "PCM" :
+                                "0x" + (header.audioFormat as number).toString(16).padStart(4, "0")} ` +
+                            `channels=${header.channels} ` +
+                            `sampleRate=${header.sampleRate} ` +
+                            `bitDepth=${header.bitDepth}`)
+                        if (header.audioFormat !== 0x0001 /* PCM */)
+                            throw new Error("WAV not based on PCM format")
+                        if (header.bitDepth !== 16)
+                            throw new Error("WAV not based on 16 bit samples")
+                        if (header.sampleRate !== 48000)
+                            throw new Error("WAV not based on 48Khz sample rate")
+                        if (header.channels !== 1)
+                            throw new Error("WAV not based on mono channel")
+                        chunk.payload = chunk.payload.subarray(44)
+                        this.push(chunk)
+                        callback()
+                    }
+                    else
+                        throw new Error(`invalid operation mode "${self.params.mode}"`)
+                }
+                else {
+                    /* pass-through original chunk */
+                    this.push(chunk)
+                    callback()
+                }
+                firstChunk = false
+            },
+            final (callback) {
+                this.push(null)
+                callback()
+            }
+        })
     }
 
     /* close node */
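Note: the decode branch applies readWavHeader to the first chunk only, so that chunk must already contain the complete 44-byte header. A hypothetical guard for short first chunks, not part of the package:

    if (self.params.mode === "decode" && chunk.payload.length < 44)
        return callback(new Error("first chunk too short to contain a WAV header"))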
package/src/speechflow-node-a2t-deepgram.ts
@@ -5,7 +5,7 @@
 */
 
 /* standard dependencies */
-import Stream from "node:stream"
+import Stream from "node:stream"
 
 /* external dependencies */
 import * as Deepgram from "@deepgram/sdk"
@@ -65,6 +65,9 @@ export default class SpeechFlowNodeDeepgram extends SpeechFlowNode
         /* create queue for results */
         const queue = new utils.SingleQueue<SpeechFlowChunk>()
 
+        /* create a store for the meta information */
+        const metastore = new utils.TimeStore<Map<string, any>>()
+
         /* connect to Deepgram API */
         const deepgram = Deepgram.createClient(this.params.key)
         let language = "en"
@@ -86,21 +89,27 @@ export default class SpeechFlowNodeDeepgram extends SpeechFlowNode
             smart_format: true,
             punctuate: true,
             filler_words: true,
-            diarize: true, /* still not used by us */
+            diarize: false,
             numerals: true,
             profanity_filter: false
         })
 
         /* hook onto Deepgram API events */
         this.dg.on(Deepgram.LiveTranscriptionEvents.Transcript, async (data) => {
-            const text = (data.channel?.alternatives[0].transcript as string) ?? ""
+            const text = (data.channel?.alternatives[0]?.transcript as string) ?? ""
             if (text === "")
                 this.log("info", `Deepgram: empty/dummy text received (start: ${data.start}s, duration: ${data.duration}s)`)
             else {
                 this.log("info", `Deepgram: text received (start: ${data.start}s, duration: ${data.duration}s): "${text}"`)
                 const start = Duration.fromMillis(data.start * 1000).plus(this.timeZeroOffset)
                 const end = start.plus({ seconds: data.duration })
-                const chunk = new SpeechFlowChunk(start, end, "final", "text", text)
+                const metas = metastore.fetch(start, end)
+                const meta = metas.reduce((prev: Map<string, any>, curr: Map<string, any>) => {
+                    curr.forEach((val, key) => { prev.set(key, val) })
+                    return prev
+                }, new Map<string, any>())
+                metastore.prune(start)
+                const chunk = new SpeechFlowChunk(start, end, "final", "text", text, meta)
                 queue.write(chunk)
             }
         })
@@ -180,6 +189,8 @@ export default class SpeechFlowNodeDeepgram extends SpeechFlowNode
                 if (chunk.payload.byteLength > 0) {
                     log("info", `Deepgram: send data (${chunk.payload.byteLength} bytes)`)
                     initTimeoutStart()
+                    if (chunk.meta.size > 0)
+                        metastore.store(chunk.timestampStart, chunk.timestampEnd, chunk.meta)
                     dg.send(chunk.payload.buffer) /* intentionally discard all time information */
                 }
                 callback()
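Note: utils.TimeStore itself is not shown in this diff. Judging only from the calls above (store, fetch, prune), it behaves like an interval store over Luxon Durations; a hypothetical minimal implementation with that shape, purely for illustration:

    import { Duration } from "luxon"

    class TimeStore<T> {
        private entries: { start: Duration, end: Duration, data: T }[] = []
        store (start: Duration, end: Duration, data: T) {
            this.entries.push({ start, end, data })
        }
        fetch (start: Duration, end: Duration): T[] {
            /* all entries overlapping the interval [start, end] */
            return this.entries
                .filter((e) => e.start.toMillis() < end.toMillis()
                    && e.end.toMillis() > start.toMillis())
                .map((e) => e.data)
        }
        prune (before: Duration) {
            /* drop entries which ended before the cutoff */
            this.entries = this.entries.filter((e) => e.end.toMillis() >= before.toMillis())
        }
    }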
package/src/speechflow-node-t2a-elevenlabs.ts
@@ -5,7 +5,7 @@
 */
 
 /* standard dependencies */
-import Stream from "node:stream"
+import Stream from "node:stream"
 
 /* external dependencies */
 import * as ElevenLabs from "@elevenlabs/elevenlabs-js"
@@ -30,11 +30,13 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode
 
         /* declare node configuration parameters */
        this.configure({
-            key:      { type: "string", val: process.env.SPEECHFLOW_ELEVENLABS_KEY },
-            voice:    { type: "string", val: "Brian", pos: 0, match: /^(?:.+)$/ },
-            language: { type: "string", val: "en", pos: 1, match: /^(?:de|en)$/ },
-            speed:    { type: "number", val: 1.05, pos: 2, match: (n: number) => n >= 0.7 && n <= 1.2 },
-            optimize: { type: "string", val: "latency", pos: 3, match: /^(?:latency|quality)$/ }
+            key:        { type: "string", val: process.env.SPEECHFLOW_ELEVENLABS_KEY },
+            voice:      { type: "string", val: "Brian", pos: 0, match: /^(?:Brittney|Cassidy|Leonie|Mark|Brian)$/ },
+            language:   { type: "string", val: "en", pos: 1, match: /^(?:de|en)$/ },
+            speed:      { type: "number", val: 1.00, pos: 2, match: (n: number) => n >= 0.7 && n <= 1.2 },
+            stability:  { type: "number", val: 0.5, pos: 3, match: (n: number) => n >= 0.0 && n <= 1.0 },
+            similarity: { type: "number", val: 0.75, pos: 4, match: (n: number) => n >= 0.0 && n <= 1.0 },
+            optimize:   { type: "string", val: "latency", pos: 5, match: /^(?:latency|quality)$/ }
         })
 
         /* declare node input/output format */
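Note: the matchers are plain regexes and predicates, evaluated against positional or named parameters. Two hand-evaluated examples of the tightened rules (values are illustrations only):

    /^(?:Brittney|Cassidy|Leonie|Mark|Brian)$/.test("Brian")    // true:  voice is now restricted to a fixed list
    /^(?:Brittney|Cassidy|Leonie|Mark|Brian)$/.test("Rachel")   // false: the old catch-all /^(?:.+)$/ is gone
    ((n: number) => n >= 0.0 && n <= 1.0)(0.75)                 // true:  default of the new "similarity" parameter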
@@ -90,7 +92,7 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode
 
         /* perform text-to-speech operation with Elevenlabs API */
         const model = this.params.optimize === "quality" ?
-            "eleven_multilingual_v2" :
+            "eleven_turbo_v2_5" :
             "eleven_flash_v2_5"
         const speechStream = (text: string) => {
             this.log("info", `ElevenLabs: send text "${text}"`)
@@ -101,7 +103,9 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode
                 outputFormat: `pcm_${maxSampleRate}` as ElevenLabs.ElevenLabs.OutputFormat,
                 seed: 815, /* arbitrary, but fixated by us */
                 voiceSettings: {
-                    speed: this.params.speed
+                    speed: this.params.speed,
+                    stability: this.params.stability,
+                    similarityBoost: this.params.similarity
                 }
             }, {
                 timeoutInSeconds: 30,
@@ -128,6 +132,7 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode
                 if (Buffer.isBuffer(chunk.payload))
                     callback(new Error("invalid chunk payload type"))
                 else {
+                    log("info", `ElevenLabs: send text: ${JSON.stringify(chunk.payload)}`)
                     speechStream(chunk.payload).then((stream) => {
                         getStreamAsBuffer(stream).then((buffer) => {
                             const bufferResampled = resampler.processChunk(buffer)
package/src/speechflow-node-t2a-kokoro.ts
@@ -5,11 +5,11 @@
 */
 
 /* standard dependencies */
-import Stream from "node:stream"
+import Stream from "node:stream"
 
 /* external dependencies */
-import { KokoroTTS } from "kokoro-js"
-import SpeexResampler from "speex-resampler"
+import { KokoroTTS } from "kokoro-js"
+import SpeexResampler from "speex-resampler"
 
 /* internal dependencies */
 import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
package/src/speechflow-node-t2t-deepl.ts
@@ -5,10 +5,10 @@
 */
 
 /* standard dependencies */
-import Stream from "node:stream"
+import Stream from "node:stream"
 
 /* external dependencies */
-import * as DeepL from "deepl-node"
+import * as DeepL from "deepl-node"
 
 /* internal dependencies */
 import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
package/src/speechflow-node-t2t-format.ts
@@ -5,10 +5,10 @@
 */
 
 /* standard dependencies */
-import Stream from "node:stream"
+import Stream from "node:stream"
 
 /* external dependencies */
-import wrapText from "wrap-text"
+import wrapText from "wrap-text"
 
 /* internal dependencies */
 import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
package/src/speechflow-node-t2t-ollama.ts
@@ -5,10 +5,10 @@
 */
 
 /* standard dependencies */
-import Stream from "node:stream"
+import Stream from "node:stream"
 
 /* external dependencies */
-import { Ollama } from "ollama"
+import { Ollama } from "ollama"
 
 /* internal dependencies */
 import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
package/src/speechflow-node-t2t-openai.ts
@@ -5,10 +5,10 @@
 */
 
 /* standard dependencies */
-import Stream from "node:stream"
+import Stream from "node:stream"
 
 /* external dependencies */
-import OpenAI from "openai"
+import OpenAI from "openai"
 
 /* internal dependencies */
 import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"