speechflow 0.9.8 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. package/CHANGELOG.md +18 -0
  2. package/LICENSE.txt +674 -0
  3. package/README.md +114 -17
  4. package/dst/speechflow-node-a2a-ffmpeg.js +1 -0
  5. package/dst/speechflow-node-a2a-ffmpeg.js.map +1 -0
  6. package/dst/{speechflow-node-deepl.d.ts → speechflow-node-a2a-meter.d.ts} +2 -2
  7. package/dst/speechflow-node-a2a-meter.js +147 -0
  8. package/dst/speechflow-node-a2a-meter.js.map +1 -0
  9. package/dst/speechflow-node-a2a-mute.d.ts +16 -0
  10. package/dst/speechflow-node-a2a-mute.js +90 -0
  11. package/dst/speechflow-node-a2a-mute.js.map +1 -0
  12. package/dst/{speechflow-node-whisper.d.ts → speechflow-node-a2a-vad.d.ts} +2 -5
  13. package/dst/speechflow-node-a2a-vad.js +272 -0
  14. package/dst/speechflow-node-a2a-vad.js.map +1 -0
  15. package/dst/speechflow-node-a2a-wav.js +1 -0
  16. package/dst/speechflow-node-a2a-wav.js.map +1 -0
  17. package/dst/speechflow-node-a2t-deepgram.js +2 -1
  18. package/dst/speechflow-node-a2t-deepgram.js.map +1 -0
  19. package/dst/speechflow-node-t2a-elevenlabs.js +1 -0
  20. package/dst/speechflow-node-t2a-elevenlabs.js.map +1 -0
  21. package/dst/{speechflow-node-elevenlabs.d.ts → speechflow-node-t2a-kokoro.d.ts} +2 -2
  22. package/dst/speechflow-node-t2a-kokoro.js +148 -0
  23. package/dst/speechflow-node-t2a-kokoro.js.map +1 -0
  24. package/dst/speechflow-node-t2t-deepl.js +1 -0
  25. package/dst/speechflow-node-t2t-deepl.js.map +1 -0
  26. package/dst/speechflow-node-t2t-format.js +1 -0
  27. package/dst/speechflow-node-t2t-format.js.map +1 -0
  28. package/dst/{speechflow-node-gemma.d.ts → speechflow-node-t2t-ollama.d.ts} +1 -1
  29. package/dst/{speechflow-node-gemma.js → speechflow-node-t2t-ollama.js} +41 -8
  30. package/dst/speechflow-node-t2t-ollama.js.map +1 -0
  31. package/dst/{speechflow-node-t2t-gemma.d.ts → speechflow-node-t2t-openai.d.ts} +2 -2
  32. package/dst/{speechflow-node-t2t-gemma.js → speechflow-node-t2t-openai.js} +43 -30
  33. package/dst/speechflow-node-t2t-openai.js.map +1 -0
  34. package/dst/speechflow-node-t2t-subtitle.js +1 -0
  35. package/dst/speechflow-node-t2t-subtitle.js.map +1 -0
  36. package/dst/{speechflow-node-opus.d.ts → speechflow-node-t2t-transformers.d.ts} +3 -1
  37. package/dst/speechflow-node-t2t-transformers.js +264 -0
  38. package/dst/speechflow-node-t2t-transformers.js.map +1 -0
  39. package/dst/speechflow-node-x2x-trace.js +3 -2
  40. package/dst/speechflow-node-x2x-trace.js.map +1 -0
  41. package/dst/speechflow-node-xio-device.js +1 -0
  42. package/dst/speechflow-node-xio-device.js.map +1 -0
  43. package/dst/speechflow-node-xio-file.js +1 -0
  44. package/dst/speechflow-node-xio-file.js.map +1 -0
  45. package/dst/speechflow-node-xio-mqtt.js +1 -0
  46. package/dst/speechflow-node-xio-mqtt.js.map +1 -0
  47. package/dst/speechflow-node-xio-websocket.js +1 -0
  48. package/dst/speechflow-node-xio-websocket.js.map +1 -0
  49. package/dst/speechflow-node.d.ts +3 -0
  50. package/dst/speechflow-node.js +10 -0
  51. package/dst/speechflow-node.js.map +1 -0
  52. package/dst/speechflow-utils.d.ts +33 -0
  53. package/dst/speechflow-utils.js +183 -1
  54. package/dst/speechflow-utils.js.map +1 -0
  55. package/dst/speechflow.js +295 -46
  56. package/dst/speechflow.js.map +1 -0
  57. package/etc/speechflow.yaml +14 -5
  58. package/etc/stx.conf +1 -1
  59. package/etc/tsconfig.json +2 -2
  60. package/package.json +17 -10
  61. package/src/speechflow-node-a2a-meter.ts +125 -0
  62. package/src/speechflow-node-a2a-mute.ts +101 -0
  63. package/src/speechflow-node-a2a-vad.ts +266 -0
  64. package/src/speechflow-node-a2t-deepgram.ts +1 -1
  65. package/src/speechflow-node-t2a-kokoro.ts +160 -0
  66. package/src/{speechflow-node-t2t-gemma.ts → speechflow-node-t2t-ollama.ts} +44 -10
  67. package/src/speechflow-node-t2t-openai.ts +246 -0
  68. package/src/speechflow-node-t2t-transformers.ts +249 -0
  69. package/src/speechflow-node-x2x-trace.ts +2 -2
  70. package/src/speechflow-node-xio-websocket.ts +5 -5
  71. package/src/speechflow-node.ts +12 -0
  72. package/src/speechflow-utils.ts +195 -0
  73. package/src/speechflow.ts +279 -46
  74. package/dst/speechflow-node-deepgram.d.ts +0 -12
  75. package/dst/speechflow-node-deepgram.js +0 -220
  76. package/dst/speechflow-node-deepl.js +0 -128
  77. package/dst/speechflow-node-device.d.ts +0 -13
  78. package/dst/speechflow-node-device.js +0 -205
  79. package/dst/speechflow-node-elevenlabs.js +0 -182
  80. package/dst/speechflow-node-ffmpeg.d.ts +0 -13
  81. package/dst/speechflow-node-ffmpeg.js +0 -152
  82. package/dst/speechflow-node-file.d.ts +0 -11
  83. package/dst/speechflow-node-file.js +0 -176
  84. package/dst/speechflow-node-format.d.ts +0 -11
  85. package/dst/speechflow-node-format.js +0 -80
  86. package/dst/speechflow-node-mqtt.d.ts +0 -13
  87. package/dst/speechflow-node-mqtt.js +0 -181
  88. package/dst/speechflow-node-opus.js +0 -135
  89. package/dst/speechflow-node-subtitle.d.ts +0 -12
  90. package/dst/speechflow-node-subtitle.js +0 -96
  91. package/dst/speechflow-node-t2t-opus.d.ts +0 -12
  92. package/dst/speechflow-node-t2t-opus.js +0 -135
  93. package/dst/speechflow-node-trace.d.ts +0 -11
  94. package/dst/speechflow-node-trace.js +0 -88
  95. package/dst/speechflow-node-wav.d.ts +0 -11
  96. package/dst/speechflow-node-wav.js +0 -170
  97. package/dst/speechflow-node-websocket.d.ts +0 -13
  98. package/dst/speechflow-node-websocket.js +0 -275
  99. package/dst/speechflow-node-whisper-common.d.ts +0 -34
  100. package/dst/speechflow-node-whisper-common.js +0 -7
  101. package/dst/speechflow-node-whisper-ggml.d.ts +0 -1
  102. package/dst/speechflow-node-whisper-ggml.js +0 -97
  103. package/dst/speechflow-node-whisper-onnx.d.ts +0 -1
  104. package/dst/speechflow-node-whisper-onnx.js +0 -131
  105. package/dst/speechflow-node-whisper-worker-ggml.d.ts +0 -1
  106. package/dst/speechflow-node-whisper-worker-ggml.js +0 -97
  107. package/dst/speechflow-node-whisper-worker-onnx.d.ts +0 -1
  108. package/dst/speechflow-node-whisper-worker-onnx.js +0 -131
  109. package/dst/speechflow-node-whisper-worker.d.ts +0 -1
  110. package/dst/speechflow-node-whisper-worker.js +0 -116
  111. package/dst/speechflow-node-whisper-worker2.d.ts +0 -1
  112. package/dst/speechflow-node-whisper-worker2.js +0 -82
  113. package/dst/speechflow-node-whisper.js +0 -604
  114. package/src/speechflow-node-t2t-opus.ts +0 -111
package/package.json CHANGED
@@ -1,8 +1,8 @@
 {
   "name": "speechflow",
-  "version": "0.9.8",
-  "x-stdver": "0.9.8-EA",
-  "x-release": "2025-07-12",
+  "version": "1.0.0",
+  "x-stdver": "1.0.0-GA",
+  "x-release": "2025-07-16",
   "homepage": "https://github.com/rse/speechflow",
   "description": "Speech Processing Flow Graph",
   "license": "GPL-3.0-only",
@@ -26,15 +26,20 @@
     "@elevenlabs/elevenlabs-js": "2.6.0",
     "stream-transform": "3.4.0",
     "get-stream": "9.0.1",
-    "@dotenvx/dotenvx": "1.47.5",
+    "@dotenvx/dotenvx": "1.47.6",
     "speex-resampler": "3.0.1",
     "pcm-convert": "1.6.5",
     "object-path": "0.11.8",
     "ws": "8.18.3",
     "bufferutil": "4.0.9",
     "utf-8-validate": "6.0.5",
+    "@hapi/hapi": "21.4.0",
+    "@hapi/boom": "10.0.1",
+    "hapi-plugin-header": "1.1.8",
+    "hapi-plugin-websocket": "2.4.11",
     "@opensumi/reconnecting-websocket": "4.4.0",
     "ollama": "0.5.16",
+    "openai": "5.10.0",
     "@rse/ffmpeg": "1.4.2",
     "ffmpeg-stream": "1.0.1",
     "installed-packages": "1.0.13",
@@ -42,13 +47,15 @@
     "wav": "1.0.2",
     "mqtt": "5.13.2",
     "cbor2": "2.0.1",
+    "arktype": "2.1.20",
     "pure-uuid": "1.8.1",
     "wavefile": "11.0.0",
+    "audio-inspect": "0.0.2",
     "@huggingface/transformers": "3.6.3",
+    "kokoro-js": "1.2.1",
     "@ericedouard/vad-node-realtime": "0.2.0",
     "luxon": "3.7.1",
-    "wrap-text": "1.0.10",
-    "smart-whisper": "0.8.1"
+    "wrap-text": "1.0.10"
   },
   "devDependencies": {
     "eslint": "9.31.0",
@@ -57,14 +64,14 @@
     "eslint-plugin-promise": "7.2.1",
     "eslint-plugin-import": "2.32.0",
     "eslint-plugin-node": "11.1.0",
-    "@typescript-eslint/eslint-plugin": "8.36.0",
-    "@typescript-eslint/parser": "8.36.0",
+    "@typescript-eslint/eslint-plugin": "8.37.0",
+    "@typescript-eslint/parser": "8.37.0",
     "oxlint": "1.6.0",
     "eslint-plugin-oxlint": "1.6.0",
     "@biomejs/biome": "2.0.6",
     "eslint-config-biome": "1.9.4",

-    "@types/node": "24.0.13",
+    "@types/node": "24.0.14",
     "@types/yargs": "17.0.33",
     "@types/js-yaml": "4.0.9",
     "@types/object-path": "0.11.4",
@@ -84,7 +91,7 @@
     "cross-env": "7.0.3"
   },
   "overrides": {
-    "onnxruntime-node": "1.22.0-dev.20250418-c19a49615b"
+    "@huggingface/transformers": { "onnxruntime-node": "1.23.0-dev.20250703-7fc6235861" }
   },
   "upd": [ "!@biomejs/biome" ],
   "engines": {
package/src/speechflow-node-a2a-meter.ts ADDED
@@ -0,0 +1,125 @@
+/*
+** SpeechFlow - Speech Processing Flow Graph
+** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+*/
+
+/* standard dependencies */
+import Stream from "node:stream"
+
+/* external dependencies */
+import { getLUFS, getRMS, AudioData } from "audio-inspect"
+
+/* internal dependencies */
+import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
+import * as utils from "./speechflow-utils"
+
+/* SpeechFlow node for audio metering */
+export default class SpeechFlowNodeMeter extends SpeechFlowNode {
+    /* declare official node name */
+    public static name = "meter"
+
+    /* internal state */
+    interval: ReturnType<typeof setInterval> | null = null
+
+    /* construct node */
+    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
+        super(id, cfg, opts, args)
+
+        /* declare node configuration parameters */
+        this.configure({
+            interval: { type: "number", val: 250 }
+        })
+
+        /* declare node input/output format */
+        this.input  = "audio"
+        this.output = "audio"
+    }
+
+    /* open node */
+    async open () {
+        /* sanity check situation */
+        if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
+            throw new Error("meter node currently supports PCM-S16LE audio only")
+
+        /* internal state */
+        const sampleWindowDuration = 3 /* LUFS-S requires 3s */
+        const sampleWindowSize = this.config.audioSampleRate * sampleWindowDuration
+        let sampleWindow = new Float32Array(sampleWindowSize)
+        sampleWindow.fill(0, 0, sampleWindowSize)
+        let lufss = 0
+        let rms = 0
+
+        /* setup loudness emitting interval */
+        this.interval = setInterval(() => {
+            this.log("info", `LUFS-S: ${lufss.toFixed(1)} dB, RMS: ${rms.toFixed(1)} dB`)
+            this.sendResponse([ "meter", "LUFS-S", lufss ])
+            this.sendResponse([ "meter", "RMS", rms ])
+        }, this.params.interval)
+
+        /* provide Transform stream which performs the metering as a side-effect */
+        const self = this
+        this.stream = new Stream.Transform({
+            writableObjectMode: true,
+            readableObjectMode: true,
+            decodeStrings: false,
+
+            /* transform audio chunk */
+            transform (chunk: SpeechFlowChunk, encoding, callback) {
+                if (!Buffer.isBuffer(chunk.payload))
+                    callback(new Error("expected audio input as Buffer chunks"))
+                else if (chunk.payload.byteLength === 0)
+                    callback()
+                else {
+                    /* convert audio samples from PCM/I16 to PCM/F32 */
+                    const data = utils.convertBufToF32(chunk.payload, self.config.audioLittleEndian)
+
+                    /* update internal audio sample sliding window */
+                    const fusion = new Float32Array(sampleWindow.length + data.length)
+                    fusion.set(sampleWindow, 0)
+                    fusion.set(data, sampleWindow.length)
+                    sampleWindow = fusion.slice(fusion.length - sampleWindowSize)
+
+                    /* asynchronously calculate the LUFS-S metric */
+                    setTimeout(() => {
+                        const audioData = {
+                            sampleRate: self.config.audioSampleRate,
+                            numberOfChannels: self.config.audioChannels,
+                            channelData: [ sampleWindow ],
+                            duration: sampleWindowDuration,
+                            length: sampleWindow.length
+                        } satisfies AudioData
+                        const lufs = getLUFS(audioData, {
+                            channelMode: self.config.audioChannels === 1 ? "mono" : "stereo",
+                            calculateShortTerm: true,
+                            calculateMomentary: false,
+                            calculateLoudnessRange: false,
+                            calculateTruePeak: false
+                        })
+                        lufss = lufs.shortTerm ? lufs.shortTerm[0] : 0
+                        rms = getRMS(audioData, { asDB: true })
+                    }, 0)
+
+                    /* pass-through original audio chunk */
+                    this.push(chunk)
+                    callback()
+                }
+            }
+        })
+    }
+
+    /* close node */
+    async close () {
+        /* close stream */
+        if (this.stream !== null) {
+            this.stream.destroy()
+            this.stream = null
+        }
+
+        /* stop interval */
+        if (this.interval !== null) {
+            clearInterval(this.interval)
+            this.interval = null
+        }
+    }
+}
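
The heart of the meter node is a fixed-length sliding window: every incoming chunk is appended to the window, which is then re-truncated to the most recent 3 seconds of samples before `getLUFS` and `getRMS` are applied to it. The update step in isolation (an illustrative sketch; `windowSize` stands for `audioSampleRate * sampleWindowDuration` and the window is assumed to start out at exactly that length):

    /* sliding-window update as performed by the meter node (illustrative sketch) */
    function slideWindow (window: Float32Array, data: Float32Array, windowSize: number): Float32Array {
        const fusion = new Float32Array(window.length + data.length)
        fusion.set(window, 0)                           /* old window first...              */
        fusion.set(data, window.length)                 /* ...new samples appended          */
        return fusion.slice(fusion.length - windowSize) /* keep trailing windowSize samples */
    }
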
package/src/speechflow-node-a2a-mute.ts ADDED
@@ -0,0 +1,101 @@
+/*
+** SpeechFlow - Speech Processing Flow Graph
+** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+*/
+
+/* standard dependencies */
+import Stream from "node:stream"
+
+/* internal dependencies */
+import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
+
+/* the type of muting */
+type MuteMode =
+    "none"      | /* not muted */
+    "silenced"  | /* muted by changing audio samples to silence */
+    "unplugged"   /* muted by unplugging the audio sample flow */
+
+/* SpeechFlow node for muting in audio-to-audio passing */
+export default class SpeechFlowNodeMute extends SpeechFlowNode {
+    /* declare official node name */
+    public static name = "mute"
+
+    /* internal state */
+    private muteMode: MuteMode = "none"
+
+    /* construct node */
+    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
+        super(id, cfg, opts, args)
+
+        /* declare node configuration parameters */
+        this.configure({})
+
+        /* declare node input/output format */
+        this.input  = "audio"
+        this.output = "audio"
+    }
+
+    /* receive external request */
+    async receiveRequest (params: any[]) {
+        if (params.length === 2 && params[0] === "mode") {
+            if (!params[1].match(/^(?:none|silenced|unplugged)$/))
+                throw new Error("mute: invalid mode argument in external request")
+            const muteMode: MuteMode = params[1] as MuteMode
+            this.setMuteMode(muteMode)
+            this.sendResponse([ "mute", "mode", muteMode ])
+        }
+        else
+            throw new Error("mute: invalid arguments in external request")
+    }
+
+    /* change mute mode */
+    setMuteMode (mode: MuteMode) {
+        this.log("info", `setting mute mode to "${mode}"`)
+        this.muteMode = mode
+    }
+
+    /* open node */
+    async open () {
+        /* establish a transform stream */
+        const self = this
+        this.stream = new Stream.Transform({
+            readableObjectMode: true,
+            writableObjectMode: true,
+            decodeStrings: false,
+            transform (chunk: SpeechFlowChunk, encoding, callback) {
+                if (!Buffer.isBuffer(chunk.payload))
+                    callback(new Error("invalid chunk payload type"))
+                else if (self.muteMode === "unplugged")
+                    /* pass-through nothing */
+                    callback()
+                else if (self.muteMode === "silenced") {
+                    /* pass-through a silenced chunk */
+                    chunk = chunk.clone()
+                    const buffer = chunk.payload as Buffer
+                    buffer.fill(0)
+                    callback(null, chunk)
+                }
+                else {
+                    /* pass-through original chunk */
+                    this.push(chunk)
+                    callback()
+                }
+            },
+            final (callback) {
+                this.push(null)
+                callback()
+            }
+        })
+    }
+
+    /* close node */
+    async close () {
+        /* close stream */
+        if (this.stream !== null) {
+            this.stream.destroy()
+            this.stream = null
+        }
+    }
+}
+
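As the `receiveRequest` handler above shows, the mute node is controlled at runtime through a two-element `[ "mode", <value> ]` request and acknowledges with a `[ "mute", "mode", <value> ]` response. A hypothetical call site (assuming `node` holds a mute node instance):

    /* switch the mute node to silence-based muting (hypothetical call site) */
    await node.receiveRequest([ "mode", "silenced" ])
    /* ...and back to regular pass-through */
    await node.receiveRequest([ "mode", "none" ])
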
package/src/speechflow-node-a2a-vad.ts ADDED
@@ -0,0 +1,266 @@
+/*
+** SpeechFlow - Speech Processing Flow Graph
+** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+*/
+
+/* standard dependencies */
+import Stream from "node:stream"
+
+/* external dependencies */
+import { RealTimeVAD } from "@ericedouard/vad-node-realtime"
+import { Duration } from "luxon"
+
+/* internal dependencies */
+import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
+import * as utils from "./speechflow-utils"
+
+/* audio stream queue element */
+type AudioQueueElement = {
+    type: "audio-frame",
+    chunk: SpeechFlowChunk,
+    isSpeech?: boolean
+} | {
+    type: "audio-eof"
+}
+
+/* SpeechFlow node for VAD speech-to-speech processing */
+export default class SpeechFlowNodeVAD extends SpeechFlowNode {
+    /* declare official node name */
+    public static name = "vad"
+
+    /* internal state */
+    private vad: RealTimeVAD | null = null
+    private queue = new utils.Queue<AudioQueueElement>()
+    private queueRecv = this.queue.pointerUse("recv")
+    private queueVAD  = this.queue.pointerUse("vad")
+    private queueSend = this.queue.pointerUse("send")
+
+    /* construct node */
+    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
+        super(id, cfg, opts, args)
+
+        /* declare node configuration parameters */
+        this.configure({
+            mode:               { type: "string", val: "unplugged", match: /^(?:silenced|unplugged)$/ },
+            posSpeechThreshold: { type: "number", val: 0.50 },
+            negSpeechThreshold: { type: "number", val: 0.35 },
+            minSpeechFrames:    { type: "number", val: 2 },
+            redemptionFrames:   { type: "number", val: 12 },
+            preSpeechPadFrames: { type: "number", val: 1 }
+        })
+
+        /* declare node input/output format */
+        this.input  = "audio"
+        this.output = "audio"
+    }
+
+    /* open node */
+    async open () {
+        /* sanity check situation */
+        if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
+            throw new Error("VAD node currently supports PCM-S16LE audio only")
+
+        /* pass-through logging */
+        const log = (level: string, msg: string) => { this.log(level, msg) }
+
+        /* internal processing constants */
+        const vadSampleRateTarget = 16000 /* internal target of VAD */
+        const vadSamplesPerFrame  = 512   /* required for VAD v5 */
+
+        /* establish Voice Activity Detection (VAD) facility */
+        this.vad = await RealTimeVAD.new({
+            model: "v5",
+            sampleRate: this.config.audioSampleRate, /* before resampling to 16KHz */
+            frameSamples: vadSamplesPerFrame,        /* after resampling to 16KHz */
+            positiveSpeechThreshold: this.params.posSpeechThreshold,
+            negativeSpeechThreshold: this.params.negSpeechThreshold,
+            minSpeechFrames:         this.params.minSpeechFrames,
+            redemptionFrames:        this.params.redemptionFrames,
+            preSpeechPadFrames:      this.params.preSpeechPadFrames,
+            onSpeechStart: () => {
+                log("info", "VAD: speech start")
+            },
+            onSpeechEnd: (audio) => {
+                const duration = utils.audioArrayDuration(audio, vadSampleRateTarget)
+                log("info", `VAD: speech end (duration: ${duration.toFixed(2)}s)`)
+            },
+            onVADMisfire: () => {
+                log("info", "VAD: speech end (segment too short)")
+            },
+            onFrameProcessed: (audio) => {
+                /* annotate the current audio frame */
+                const element = this.queueVAD.peek()
+                if (element !== undefined && element.type === "audio-frame") {
+                    const isSpeech = audio.isSpeech > audio.notSpeech
+                    element.isSpeech = isSpeech
+                    this.queueVAD.touch()
+                    this.queueVAD.walk(+1)
+                }
+            }
+        })
+        this.vad.start()
+
+        /* provide Duplex stream and internally attach it to the VAD */
+        const vad       = this.vad
+        const cfg       = this.config
+        const queue     = this.queue
+        const queueRecv = this.queueRecv
+        const queueSend = this.queueSend
+        const mode      = this.params.mode
+        let carrySamples = new Float32Array()
+        let carryStart   = Duration.fromDurationLike(0)
+        this.stream = new Stream.Duplex({
+            writableObjectMode: true,
+            readableObjectMode: true,
+            decodeStrings: false,
+
+            /* receive audio chunk (writable side of stream) */
+            write (chunk: SpeechFlowChunk, encoding, callback) {
+                if (!Buffer.isBuffer(chunk.payload))
+                    callback(new Error("expected audio input as Buffer chunks"))
+                else if (chunk.payload.byteLength === 0)
+                    callback()
+                else {
+                    /* convert audio samples from PCM/I16 to PCM/F32 */
+                    let data  = utils.convertBufToF32(chunk.payload, cfg.audioLittleEndian)
+                    let start = chunk.timestampStart
+
+                    /* merge previous carry samples */
+                    if (carrySamples.length > 0) {
+                        start = carryStart
+                        const merged = new Float32Array(carrySamples.length + data.length)
+                        merged.set(carrySamples)
+                        merged.set(data, carrySamples.length)
+                        data = merged
+                        carrySamples = new Float32Array()
+                    }
+
+                    /* queue audio samples as individual VAD-sized frames
+                       and in parallel send them into the Voice Activity Detection (VAD) */
+                    const chunkSize = (vadSamplesPerFrame * (cfg.audioSampleRate / vadSampleRateTarget))
+                    const chunks = Math.trunc(data.length / chunkSize)
+                    for (let i = 0; i < chunks; i++) {
+                        const frame = data.slice(i * chunkSize, (i + 1) * chunkSize)
+                        const buf = utils.convertF32ToBuf(frame)
+                        const duration = utils.audioBufferDuration(buf)
+                        const end = start.plus(duration)
+                        const chunk = new SpeechFlowChunk(start, end, "final", "audio", buf)
+                        queueRecv.append({ type: "audio-frame", chunk })
+                        vad.processAudio(frame)
+                        start = end
+                    }
+
+                    /* remember new carry samples */
+                    const bulkLen = chunks * chunkSize
+                    carrySamples = data.slice(bulkLen)
+                    carryStart = start
+
+                    callback()
+                }
+            },
+
+            /* receive no more audio chunks (writable side of stream) */
+            final (callback) {
+                /* flush pending audio chunks */
+                if (carrySamples.length > 0) {
+                    const chunkSize = (vadSamplesPerFrame * (cfg.audioSampleRate / vadSampleRateTarget))
+                    if (carrySamples.length < chunkSize) {
+                        const merged = new Float32Array(chunkSize)
+                        merged.set(carrySamples)
+                        merged.fill(0.0, carrySamples.length, chunkSize)
+                        carrySamples = merged
+                    }
+                    const buf = utils.convertF32ToBuf(carrySamples)
+                    const duration = utils.audioBufferDuration(buf)
+                    const end = carryStart.plus(duration)
+                    const chunk = new SpeechFlowChunk(carryStart, end, "final", "audio", buf)
+                    queueRecv.append({ type: "audio-frame", chunk })
+                    vad.processAudio(carrySamples)
+                }
+
+                /* signal end of file */
+                queueRecv.append({ type: "audio-eof" })
+                callback()
+            },
+
+            /* send audio chunk(s) (readable side of stream) */
+            read (_size) {
+                /* try to perform read operation from scratch */
+                const tryToRead = () => {
+                    /* flush pending audio chunks */
+                    const flushPendingChunks = () => {
+                        let pushed = 0
+                        while (true) {
+                            const element = queueSend.peek()
+                            if (element === undefined)
+                                break
+                            else if (element.type === "audio-eof") {
+                                this.push(null)
+                                break
+                            }
+                            else if (element.type === "audio-frame"
+                                && element.isSpeech === undefined)
+                                break
+                            queueSend.walk(+1)
+                            if (element.isSpeech) {
+                                this.push(element.chunk)
+                                pushed++
+                            }
+                            else if (mode === "silenced") {
+                                const chunk = element.chunk.clone()
+                                const buffer = chunk.payload as Buffer
+                                buffer.fill(0)
+                                this.push(chunk)
+                                pushed++
+                            }
+                            else if (mode === "unplugged" && pushed === 0)
+                                /* in "unplugged" mode we have to keep awaiting chunks
+                                   here, as read() would otherwise never be called again
+                                   until we push at least one new chunk as the result */
+                                tryToRead()
+                        }
+                    }
+
+                    /* await forthcoming audio chunks */
+                    const awaitForthcomingChunks = () => {
+                        const element = queueSend.peek()
+                        if (element !== undefined
+                            && element.type === "audio-frame"
+                            && element.isSpeech !== undefined)
+                            flushPendingChunks()
+                        else
+                            queue.once("write", awaitForthcomingChunks)
+                    }
+
+                    const element = queueSend.peek()
+                    if (element !== undefined && element.type === "audio-eof")
+                        this.push(null)
+                    else if (element !== undefined
+                        && element.type === "audio-frame"
+                        && element.isSpeech !== undefined)
+                        flushPendingChunks()
+                    else
+                        queue.once("write", awaitForthcomingChunks)
+                }
+                tryToRead()
+            }
+        })
+    }
+
+    /* close node */
+    async close () {
+        /* close stream */
+        if (this.stream !== null) {
+            this.stream.destroy()
+            this.stream = null
+        }
+
+        /* close VAD */
+        if (this.vad !== null) {
+            await this.vad.flush()
+            this.vad.destroy()
+            this.vad = null
+        }
+    }
+}
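
The frame sizing in the VAD node deserves a worked example: the VAD model consumes 512-sample frames at its internal 16 kHz rate, so the node slices the incoming stream into `vadSamplesPerFrame * (audioSampleRate / vadSampleRateTarget)` samples per frame and carries any remainder over to the next write. For a 48 kHz input stream this gives (simple arithmetic, assuming a 48 kHz input):

    const vadSamplesPerFrame  = 512    /* fixed requirement of VAD model v5 */
    const vadSampleRateTarget = 16000  /* VAD-internal sample rate          */
    const audioSampleRate     = 48000  /* assumed input sample rate         */
    const chunkSize = vadSamplesPerFrame * (audioSampleRate / vadSampleRateTarget)
    /* chunkSize === 1536 samples, i.e. 1536 / 48000 = 32ms of audio per VAD frame */
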
package/src/speechflow-node-a2t-deepgram.ts CHANGED
@@ -164,7 +164,7 @@ export default class SpeechFlowNodeDeepgram extends SpeechFlowNode {
         if (chunk.payload.byteLength > 0) {
             log("info", `Deepgram: send data (${chunk.payload.byteLength} bytes)`)
             initTimeoutStart()
-            dg.send(chunk.payload) /* intentionally discard all time information */
+            dg.send(chunk.payload.buffer) /* intentionally discard all time information */
         }
         callback()
     }
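
The one-line Deepgram change passes the `Buffer`'s backing `ArrayBuffer` instead of the `Buffer` itself. Worth keeping in mind: a Node.js `Buffer` is only a view which can start at a nonzero `byteOffset` within a larger (possibly pooled) `ArrayBuffer`, so the exact bytes of a view are obtained by slicing (a general Node.js sketch, unrelated to this package's code):

    /* extract exactly the bytes of a Buffer view from its backing ArrayBuffer */
    const buf = Buffer.from("example")  /* small Buffers come from a shared pool */
    const exact = buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength)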