speechflow 0.9.4 → 0.9.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +19 -0
- package/README.md +227 -54
- package/dst/speechflow-node-a2a-ffmpeg.d.ts +13 -0
- package/dst/speechflow-node-a2a-ffmpeg.js +152 -0
- package/dst/speechflow-node-a2a-wav.d.ts +11 -0
- package/dst/speechflow-node-a2a-wav.js +170 -0
- package/dst/speechflow-node-a2t-deepgram.d.ts +12 -0
- package/dst/speechflow-node-a2t-deepgram.js +220 -0
- package/dst/speechflow-node-deepgram.d.ts +3 -1
- package/dst/speechflow-node-deepgram.js +86 -22
- package/dst/speechflow-node-deepl.d.ts +3 -1
- package/dst/speechflow-node-deepl.js +25 -20
- package/dst/speechflow-node-device.d.ts +3 -1
- package/dst/speechflow-node-device.js +53 -2
- package/dst/speechflow-node-elevenlabs.d.ts +4 -1
- package/dst/speechflow-node-elevenlabs.js +88 -49
- package/dst/speechflow-node-ffmpeg.d.ts +3 -1
- package/dst/speechflow-node-ffmpeg.js +42 -4
- package/dst/speechflow-node-file.d.ts +3 -1
- package/dst/speechflow-node-file.js +84 -13
- package/dst/speechflow-node-format.d.ts +11 -0
- package/dst/speechflow-node-format.js +80 -0
- package/dst/speechflow-node-gemma.d.ts +3 -1
- package/dst/speechflow-node-gemma.js +84 -23
- package/dst/speechflow-node-mqtt.d.ts +13 -0
- package/dst/speechflow-node-mqtt.js +181 -0
- package/dst/speechflow-node-opus.d.ts +12 -0
- package/dst/speechflow-node-opus.js +135 -0
- package/dst/speechflow-node-subtitle.d.ts +12 -0
- package/dst/speechflow-node-subtitle.js +96 -0
- package/dst/speechflow-node-t2a-elevenlabs.d.ts +13 -0
- package/dst/speechflow-node-t2a-elevenlabs.js +182 -0
- package/dst/speechflow-node-t2t-deepl.d.ts +12 -0
- package/dst/speechflow-node-t2t-deepl.js +133 -0
- package/dst/speechflow-node-t2t-format.d.ts +11 -0
- package/dst/speechflow-node-t2t-format.js +80 -0
- package/dst/speechflow-node-t2t-gemma.d.ts +13 -0
- package/dst/speechflow-node-t2t-gemma.js +213 -0
- package/dst/speechflow-node-t2t-opus.d.ts +12 -0
- package/dst/speechflow-node-t2t-opus.js +135 -0
- package/dst/speechflow-node-t2t-subtitle.d.ts +12 -0
- package/dst/speechflow-node-t2t-subtitle.js +96 -0
- package/dst/speechflow-node-trace.d.ts +11 -0
- package/dst/speechflow-node-trace.js +88 -0
- package/dst/speechflow-node-wav.d.ts +11 -0
- package/dst/speechflow-node-wav.js +170 -0
- package/dst/speechflow-node-websocket.d.ts +3 -1
- package/dst/speechflow-node-websocket.js +149 -49
- package/dst/speechflow-node-whisper-common.d.ts +34 -0
- package/dst/speechflow-node-whisper-common.js +7 -0
- package/dst/speechflow-node-whisper-ggml.d.ts +1 -0
- package/dst/speechflow-node-whisper-ggml.js +97 -0
- package/dst/speechflow-node-whisper-onnx.d.ts +1 -0
- package/dst/speechflow-node-whisper-onnx.js +131 -0
- package/dst/speechflow-node-whisper-worker-ggml.d.ts +1 -0
- package/dst/speechflow-node-whisper-worker-ggml.js +97 -0
- package/dst/speechflow-node-whisper-worker-onnx.d.ts +1 -0
- package/dst/speechflow-node-whisper-worker-onnx.js +131 -0
- package/dst/speechflow-node-whisper-worker.d.ts +1 -0
- package/dst/speechflow-node-whisper-worker.js +116 -0
- package/dst/speechflow-node-whisper-worker2.d.ts +1 -0
- package/dst/speechflow-node-whisper-worker2.js +82 -0
- package/dst/speechflow-node-whisper.d.ts +19 -0
- package/dst/speechflow-node-whisper.js +604 -0
- package/dst/speechflow-node-x2x-trace.d.ts +11 -0
- package/dst/speechflow-node-x2x-trace.js +88 -0
- package/dst/speechflow-node-xio-device.d.ts +13 -0
- package/dst/speechflow-node-xio-device.js +205 -0
- package/dst/speechflow-node-xio-file.d.ts +11 -0
- package/dst/speechflow-node-xio-file.js +176 -0
- package/dst/speechflow-node-xio-mqtt.d.ts +13 -0
- package/dst/speechflow-node-xio-mqtt.js +181 -0
- package/dst/speechflow-node-xio-websocket.d.ts +13 -0
- package/dst/speechflow-node-xio-websocket.js +275 -0
- package/dst/speechflow-node.d.ts +25 -7
- package/dst/speechflow-node.js +74 -9
- package/dst/speechflow-utils.d.ts +23 -0
- package/dst/speechflow-utils.js +194 -0
- package/dst/speechflow.js +146 -43
- package/etc/biome.jsonc +12 -4
- package/etc/stx.conf +65 -0
- package/package.d/@ericedouard+vad-node-realtime+0.2.0.patch +18 -0
- package/package.json +49 -31
- package/sample.yaml +61 -23
- package/src/lib.d.ts +6 -1
- package/src/{speechflow-node-ffmpeg.ts → speechflow-node-a2a-ffmpeg.ts} +10 -4
- package/src/speechflow-node-a2a-wav.ts +143 -0
- package/src/speechflow-node-a2t-deepgram.ts +199 -0
- package/src/speechflow-node-t2a-elevenlabs.ts +160 -0
- package/src/{speechflow-node-deepl.ts → speechflow-node-t2t-deepl.ts} +36 -25
- package/src/speechflow-node-t2t-format.ts +85 -0
- package/src/{speechflow-node-gemma.ts → speechflow-node-t2t-gemma.ts} +89 -25
- package/src/speechflow-node-t2t-opus.ts +111 -0
- package/src/speechflow-node-t2t-subtitle.ts +101 -0
- package/src/speechflow-node-x2x-trace.ts +92 -0
- package/src/{speechflow-node-device.ts → speechflow-node-xio-device.ts} +25 -3
- package/src/speechflow-node-xio-file.ts +153 -0
- package/src/speechflow-node-xio-mqtt.ts +154 -0
- package/src/speechflow-node-xio-websocket.ts +248 -0
- package/src/speechflow-node.ts +78 -13
- package/src/speechflow-utils.ts +212 -0
- package/src/speechflow.ts +150 -43
- package/etc/nps.yaml +0 -40
- package/src/speechflow-node-deepgram.ts +0 -133
- package/src/speechflow-node-elevenlabs.ts +0 -116
- package/src/speechflow-node-file.ts +0 -108
- package/src/speechflow-node-websocket.ts +0 -179
package/package.json
CHANGED

@@ -1,8 +1,8 @@
 {
     "name": "speechflow",
-    "version": "0.9.4",
-    "x-stdver": "0.9.…
-    "x-release": "2025-…
+    "version": "0.9.7",
+    "x-stdver": "0.9.7-EA",
+    "x-release": "2025-07-12",
     "homepage": "https://github.com/rse/speechflow",
     "description": "Speech Processing Flow Graph",
     "license": "GPL-3.0-only",
@@ -17,58 +17,76 @@
     },
     "dependencies": {
         "cli-io": "0.9.13",
-        "yargs": "…
-        "flowlink": "0.9.…
+        "yargs": "18.0.0",
+        "flowlink": "0.9.11",
         "js-yaml": "4.1.0",
-        "@gpeng/naudiodon": "2.4.…
-        "@deepgram/sdk": "…
-        "deepl-node": "1.…
-        "elevenlabs": …
-        "stream-transform": "3.…
+        "@gpeng/naudiodon": "2.4.1",
+        "@deepgram/sdk": "4.9.1",
+        "deepl-node": "1.19.0",
+        "@elevenlabs/elevenlabs-js": "2.6.0",
+        "stream-transform": "3.4.0",
         "get-stream": "9.0.1",
-        "@dotenvx/dotenvx": "1.…
+        "@dotenvx/dotenvx": "1.47.5",
         "speex-resampler": "3.0.1",
         "pcm-convert": "1.6.5",
         "object-path": "0.11.8",
-        "ws": "8.18.…
+        "ws": "8.18.3",
         "bufferutil": "4.0.9",
         "utf-8-validate": "6.0.5",
         "@opensumi/reconnecting-websocket": "4.4.0",
-        "ollama": "0.5.…
+        "ollama": "0.5.16",
         "@rse/ffmpeg": "1.4.2",
-        "ffmpeg-stream": "1.0.…
-        "installed-packages": "1.0.13"
+        "ffmpeg-stream": "1.0.1",
+        "installed-packages": "1.0.13",
+        "syspath": "1.0.8",
+        "wav": "1.0.2",
+        "mqtt": "5.13.2",
+        "cbor2": "2.0.1",
+        "pure-uuid": "1.8.1",
+        "wavefile": "11.0.0",
+        "@huggingface/transformers": "3.6.3",
+        "@ericedouard/vad-node-realtime": "0.2.0",
+        "luxon": "3.7.1",
+        "wrap-text": "1.0.10",
+        "smart-whisper": "0.8.1"
     },
     "devDependencies": {
-        "eslint": "9.…
-        "@eslint/js": "9.…
-        "neostandard": "0.12.…
+        "eslint": "9.31.0",
+        "@eslint/js": "9.31.0",
+        "neostandard": "0.12.2",
         "eslint-plugin-promise": "7.2.1",
-        "eslint-plugin-import": "2.…
+        "eslint-plugin-import": "2.32.0",
         "eslint-plugin-node": "11.1.0",
-        "@typescript-eslint/eslint-plugin": "8.…
-        "@typescript-eslint/parser": "8.…
-        "oxlint": "…
-        "eslint-plugin-oxlint": "…
-        "@biomejs/biome": "…
+        "@typescript-eslint/eslint-plugin": "8.36.0",
+        "@typescript-eslint/parser": "8.36.0",
+        "oxlint": "1.6.0",
+        "eslint-plugin-oxlint": "1.6.0",
+        "@biomejs/biome": "2.0.6",
         "eslint-config-biome": "1.9.4",
         …
-        "@types/node": "…
+        "@types/node": "24.0.13",
         "@types/yargs": "17.0.33",
         "@types/js-yaml": "4.0.9",
         "@types/object-path": "0.11.4",
         "@types/ws": "8.18.1",
         "@types/resolve": "1.20.6",
+        "@types/wav": "1.0.4",
+        "@types/luxon": "3.6.2",
+        "@types/wrap-text": "1.0.2",
         …
-        "…
+        "patch-package": "8.0.0",
+        "stmux": "1.8.11",
         "nodemon": "3.1.10",
-        "…
+        "shx": "0.4.0",
         "typescript": "5.8.3",
         "delay-cli": "2.0.0",
-        "…
+        "@rse/stx": "1.0.2",
         "cross-env": "7.0.3"
     },
-    "…
+    "overrides": {
+        "onnxruntime-node": "1.22.0-dev.20250418-c19a49615b"
+    },
+    "upd": [ "!@biomejs/biome" ],
     "engines": {
         "node": ">=22.0.0"
     },
@@ -83,7 +101,7 @@
         }
     },
     "scripts": {
-        "…
-        "…
+        "postinstall": "npm start patch-apply",
+        "start": "stx -v4 -c etc/stx.conf"
     }
 }
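Note on the reworked "scripts" section: "start" now delegates to the stx task runner ("stx -v4 -c etc/stx.conf", see the new etc/stx.conf above, which replaces the removed nps configuration etc/nps.yaml). The "postinstall" hook "npm start patch-apply" therefore presumably runs a patch-apply task defined in that configuration, applying the patch-package patch shipped under package.d/ (@ericedouard+vad-node-realtime+0.2.0.patch in the file list).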
package/sample.yaml
CHANGED

@@ -2,32 +2,70 @@
 ## sample.yaml -- Speechflow Sample Audio Processing Graphs
 ##
 
-# …
-…
-…
-…
+# Capture audio from microphone device into WAV audio file
+capturing: |
+    device(device: "wasapi:VoiceMeeter Out B1", mode: "r") |
+    wav(mode: "encode") |
+    file(path: "capture.wav", mode: "w", type: "audio")
 
-# …
-…
-…
-…
-…
+# Pass-through audio from microphone device to speaker
+# device and in parallel record it to WAV audio file
+pass-through: |
+    device(device: "wasapi:VoiceMeeter Out B1", mode: "r") | {
+        wav(mode: "encode") |
+        file(path: "capture.wav", mode: "w", type: "audio"),
+        device(device: "wasapi:VoiceMeeter VAIO3 Input", mode: "w")
+    }
 
-# …
-…
-…
-…
-…
-…
+# Generate text file with German narration of MP3 audio file
+narration: |
+    file(path: argv.0, mode: "r", type: "audio") |
+    ffmpeg(src: "mp3", dst: "pcm") |
+    deepgram(language: "de", key: env.SPEECHFLOW_KEY_DEEPGRAM) |
+    format(width: 80) |
+    file(path: argv.1, mode: "w", type: "text")
 
-# …
+# Generate text file with German subtitles of MP3 audio file
+subtitling: |
+    file(path: argv.0, mode: "r", type: "audio") |
+    ffmpeg(src: "mp3", dst: "pcm") |
+    deepgram(language: "de", key: env.SPEECHFLOW_KEY_DEEPGRAM) |
+    subtitle(format: "vtt") |
+    file(path: argv.1, mode: "w", type: "text")
+
+# Ad-Hoc text translation from German to English
 translation: |
-…
-…
-…
+    file(path: "-", mode: "r", type: "text") |
+    deepl(src: "de", dst: "en") |
+    file(path: "-", mode: "w", type: "text")
 
-# …
-…
-…
-…
+# Real-time studio translation from German to English,
+# including the capturing of all involved inputs and outputs:
+studio: |
+    device(device: "coreaudio:Elgato Wave:3", mode: "r") | {
+        wav(mode: "encode") |
+        file(path: "program-de.wav", mode: "w", type: "audio"),
+        deepgram(key: env.SPEECHFLOW_KEY_DEEPGRAM, language: "de") | {
+            format(width: 80) |
+            file(path: "program-de.txt", mode: "w", type: "text"),
+            deepl(key: env.SPEECHFLOW_KEY_DEEPL, src: "de", dst: "en") | {
+                format(width: 80) |
+                file(path: "program-en.txt", mode: "w", type: "text"),
+                subtitle(format: "vtt") | {
+                    file(path: "program-en.vtt", mode: "w", type: "text"),
+                    mqtt(url: "mqtt://10.1.0.10:1883",
+                        username: env.SPEECHFLOW_MQTT_USER,
+                        password: env.SPEECHFLOW_MQTT_PASS,
+                        topicWrite: "stream/studio/sender")
+                },
+                subtitle(format: "srt") |
+                file(path: "program-en.srt", mode: "w", type: "text"),
+                elevenlabs(voice: "Mark", speed: 1.05, language: "en") | {
+                    wav(mode: "encode") |
+                    file(path: "program-en.wav", mode: "w", type: "audio"),
+                    device(device: "coreaudio:USBAudio2.0", mode: "w")
+                }
+            }
+        }
+    }
 
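As these samples illustrate, a SpeechFlow graph is written as a chain of node(param: value, ...) expressions connected by "|", where "{ ..., ... }" fans a node's output out into parallel branches, and "env.*" / "argv.*" references resolve to environment variables and command-line arguments when the graph is instantiated.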
package/src/lib.d.ts
CHANGED

@@ -15,6 +15,11 @@ declare module "pcm-convert" {
         data: Buffer,
         srcFormat: Format,
         dstFormat: Format
-    ): …
+    ): any
+}
+
+declare module "node:stream" {
+    import { Stream, Duplex } from "node:stream"
+    export function compose (...streams: Stream[]): Duplex
 }
 
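The "node:stream" augmentation above exists because Node.js ships stream.compose() at runtime while the bundled TypeScript typings did not yet declare it. A minimal illustrative sketch (not part of the package) of what the augmentation enables:

    import Stream from "node:stream"

    /* two trivial Transform stages */
    const upper = new Stream.Transform({
        transform (chunk, _enc, cb) { cb(null, chunk.toString().toUpperCase()) }
    })
    const exclaim = new Stream.Transform({
        transform (chunk, _enc, cb) { cb(null, chunk.toString() + "!") }
    })

    /* fuse both stages into a single Duplex -- now type-checked
       thanks to the module augmentation above */
    const duplex = Stream.compose(upper, exclaim)
    duplex.on("data", (data) => { console.log(data.toString()) }) /* prints "HELLO!" */
    duplex.end("hello")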
package/src/{speechflow-node-ffmpeg.ts → speechflow-node-a2a-ffmpeg.ts}
RENAMED

@@ -13,6 +13,7 @@ import { Converter as FFmpegStream } from "ffmpeg-stream"
 
 /* internal dependencies */
 import SpeechFlowNode from "./speechflow-node"
+import * as utils from "./speechflow-utils"
 
 /* SpeechFlow node for FFmpeg */
 export default class SpeechFlowNodeFFmpeg extends SpeechFlowNode {
@@ -24,8 +25,8 @@ export default class SpeechFlowNodeFFmpeg extends SpeechFlowNode {
     private ffmpeg: FFmpegStream | null = null
 
     /* construct node */
-    constructor (id: string, opts: { [ id: string ]: any }, args: any[]) {
-        super(id, opts, args)
+    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
+        super(id, cfg, opts, args)
 
         /* declare node configuration parameters */
         this.configure({
@@ -93,9 +94,14 @@
 
         /* establish a duplex stream and connect it to FFmpeg */
         this.stream = Stream.Duplex.from({
-            …
-            …
+            writable: streamInput,
+            readable: streamOutput
         })
+
+        /* wrap streams with conversions for chunk vs plain audio */
+        const wrapper1 = utils.createTransformStreamForWritableSide()
+        const wrapper2 = utils.createTransformStreamForReadableSide("audio", () => this.timeZero)
+        this.stream = Stream.compose(wrapper1, this.stream, wrapper2)
     }
 
     /* close node */
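For orientation: createTransformStreamForWritableSide() and createTransformStreamForReadableSide() come from the new speechflow-utils module (+212 lines, not shown in this excerpt). Judging from their call sites, the writable-side wrapper unwraps SpeechFlowChunk objects into raw Buffers for the wrapped stream, while the readable-side wrapper re-wraps raw output into time-stamped chunks. A rough sketch of the idea, assuming the chunk layout visible in the Deepgram node below (the actual implementation may differ):

    import Stream from "node:stream"
    import { DateTime, Duration } from "luxon"
    import { SpeechFlowChunk } from "./speechflow-node"

    /* object-mode in, plain Buffers out: strip the chunk envelope */
    function createTransformStreamForWritableSide () {
        return new Stream.Transform({
            writableObjectMode: true,
            transform (chunk: SpeechFlowChunk, _enc, cb) {
                cb(null, chunk.payload)
            }
        })
    }

    /* plain Buffers in, object-mode out: re-wrap into chunks,
       time-stamped relative to the node's time zero */
    function createTransformStreamForReadableSide (type: "audio" | "text", timeZero: () => DateTime) {
        return new Stream.Transform({
            readableObjectMode: true,
            transform (payload: Buffer, _enc, cb) {
                const start: Duration = DateTime.now().diff(timeZero())
                cb(null, new SpeechFlowChunk(start, start, "final", type, payload))
            }
        })
    }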
package/src/speechflow-node-a2a-wav.ts
ADDED

@@ -0,0 +1,143 @@
+/*
+** SpeechFlow - Speech Processing Flow Graph
+** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+*/
+
+/* standard dependencies */
+import Stream from "node:stream"
+
+/* external dependencies */
+import wav from "wav"
+
+/* internal dependencies */
+import SpeechFlowNode from "./speechflow-node"
+import * as utils from "./speechflow-utils"
+
+/* utility class for wrapping a custom stream into a regular Transform stream */
+class StreamWrapper extends Stream.Transform {
+    private foreignStream: any
+    constructor (foreignStream: any, options: Stream.TransformOptions = {}) {
+        options.readableObjectMode = true
+        options.writableObjectMode = true
+        super(options)
+        this.foreignStream = foreignStream
+        this.foreignStream.on("data", (chunk: any) => {
+            this.push(chunk)
+        })
+        this.foreignStream.on("error", (err: Error) => {
+            this.emit("error", err)
+        })
+        this.foreignStream.on("end", () => {
+            this.push(null)
+        })
+    }
+    _transform (chunk: any, encoding: BufferEncoding, callback: Stream.TransformCallback): void {
+        try {
+            const canContinue = this.foreignStream.write(chunk)
+            if (canContinue)
+                callback()
+            else
+                this.foreignStream.once("drain", callback)
+        }
+        catch (err) {
+            callback(err as Error)
+        }
+    }
+    _flush (callback: Stream.TransformCallback): void {
+        try {
+            if (typeof this.foreignStream.end === "function")
+                this.foreignStream.end()
+            callback()
+        }
+        catch (err) {
+            callback(err as Error)
+        }
+    }
+}
+
+/* SpeechFlow node for WAV format conversion */
+export default class SpeechFlowNodeWAV extends SpeechFlowNode {
+    /* declare official node name */
+    public static name = "wav"
+
+    /* construct node */
+    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
+        super(id, cfg, opts, args)
+
+        /* declare node configuration parameters */
+        this.configure({
+            mode: { type: "string", pos: 1, val: "encode", match: /^(?:encode|decode)$/ }
+        })
+
+        /* declare node input/output format */
+        this.input = "audio"
+        this.output = "audio"
+    }
+
+    /* open node */
+    async open () {
+        if (this.params.mode === "encode") {
+            /* convert raw/PCM to WAV/PCM */
+            /* NOTICE: as this is a continuous stream, the resulting WAV header is not 100%
+               conforming to the WAV standard, as it has to use a zero duration information.
+               This cannot be changed in a stream-based processing. */
+            const writer = new wav.Writer({
+                format: 0x0001 /* PCM */,
+                channels: this.config.audioChannels,
+                sampleRate: this.config.audioSampleRate,
+                bitDepth: this.config.audioBitDepth
+            })
+            this.stream = new StreamWrapper(writer)
+        }
+        else if (this.params.mode === "decode") {
+            /* convert WAV/PCM to raw/PCM */
+            const reader = new wav.Reader()
+            reader.on("format", (format: any) => {
+                this.log("info", `WAV audio stream: format=${format.audioFormat === 0x0001 ? "PCM" :
+                    "0x" + (format.audioFormat as number).toString(16).padStart(4, "0")} ` +
+                    `bitDepth=${format.bitDepth} ` +
+                    `signed=${format.signed ? "yes" : "no"} ` +
+                    `endian=${format.endianness} ` +
+                    `sampleRate=${format.sampleRate} ` +
+                    `channels=${format.channels}`)
+                if (format.audioFormat !== 0x0001 /* PCM */)
+                    throw new Error("WAV not based on PCM format")
+                if (format.bitDepth !== 16)
+                    throw new Error("WAV not based on 16 bit samples")
+                if (!format.signed)
+                    throw new Error("WAV not based on signed integers")
+                if (format.endianness !== "LE")
+                    throw new Error("WAV not based on little endianness")
+                if (format.sampleRate !== 48000)
+                    throw new Error("WAV not based on 48Khz sample rate")
+                if (format.channels !== 1)
+                    throw new Error("WAV not based on mono channel")
+            })
+            this.stream = new StreamWrapper(reader)
+        }
+        else
+            throw new Error(`invalid operation mode "${this.params.mode}"`)
+
+        /* convert regular stream into object-mode stream */
+        const wrapper1 = utils.createTransformStreamForWritableSide()
+        const wrapper2 = utils.createTransformStreamForReadableSide("audio", () => this.timeZero)
+        this.stream = Stream.compose(wrapper1, this.stream, wrapper2)
+    }
+
+    /* close node */
+    async close () {
+        /* shutdown stream */
+        if (this.stream !== null) {
+            await new Promise<void>((resolve) => {
+                if (this.stream instanceof Stream.Duplex)
+                    this.stream.end(() => { resolve() })
+                else
+                    resolve()
+            })
+            this.stream.destroy()
+            this.stream = null
+        }
+    }
+}
+

package/src/speechflow-node-a2t-deepgram.ts
ADDED

@@ -0,0 +1,199 @@
+/*
+** SpeechFlow - Speech Processing Flow Graph
+** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+*/
+
+/* standard dependencies */
+import Stream from "node:stream"
+
+/* external dependencies */
+import * as Deepgram from "@deepgram/sdk"
+import { DateTime, Duration } from "luxon"
+
+/* internal dependencies */
+import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
+import * as utils from "./speechflow-utils"
+
+/* SpeechFlow node for Deepgram speech-to-text conversion */
+export default class SpeechFlowNodeDeepgram extends SpeechFlowNode {
+    /* declare official node name */
+    public static name = "deepgram"
+
+    /* internal state */
+    private dg: Deepgram.LiveClient | null = null
+
+    /* construct node */
+    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
+        super(id, cfg, opts, args)
+
+        /* declare node configuration parameters */
+        this.configure({
+            key: { type: "string", val: process.env.SPEECHFLOW_KEY_DEEPGRAM },
+            model: { type: "string", val: "nova-3", pos: 0 },
+            version: { type: "string", val: "latest", pos: 1 },
+            language: { type: "string", val: "multi", pos: 2 }
+        })
+
+        /* declare node input/output format */
+        this.input = "audio"
+        this.output = "text"
+    }
+
+    /* open node */
+    async open () {
+        /* sanity check situation */
+        if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
+            throw new Error("Deepgram node currently supports PCM-S16LE audio only")
+
+        /* create queue for results */
+        const queue = new utils.SingleQueue<SpeechFlowChunk>()
+
+        /* connect to Deepgram API */
+        const deepgram = Deepgram.createClient(this.params.key)
+        let language = "en"
+        if (this.params.model.match(/^nova-2/) && this.params.language !== "en")
+            language = this.params.language
+        else if (this.params.model.match(/^nova-3/) && this.params.language !== "en")
+            language = "multi"
+        this.dg = deepgram.listen.live({
+            mip_opt_out: true,
+            model: this.params.model,
+            version: this.params.version,
+            language,
+            channels: this.config.audioChannels,
+            sample_rate: this.config.audioSampleRate,
+            encoding: "linear16",
+            multichannel: false,
+            endpointing: 10,
+            interim_results: false,
+            smart_format: true,
+            punctuate: true,
+            filler_words: true,
+            diarize: true, /* still not used by us */
+            numerals: true,
+            profanity_filter: false
+        })
+
+        /* hook onto Deepgram API events */
+        this.dg.on(Deepgram.LiveTranscriptionEvents.Transcript, async (data) => {
+            const text = (data.channel?.alternatives[0].transcript as string) ?? ""
+            if (text === "")
+                this.log("info", `Deepgram: empty/dummy text received (start: ${data.start}s, duration: ${data.duration}s)`)
+            else {
+                this.log("info", `Deepgram: text received (start: ${data.start}s, duration: ${data.duration}s): "${text}"`)
+                const start = Duration.fromMillis(data.start * 1000).plus(this.timeZeroOffset)
+                const end = start.plus({ seconds: data.duration })
+                const chunk = new SpeechFlowChunk(start, end, "final", "text", text)
+                queue.write(chunk)
+            }
+        })
+        this.dg.on(Deepgram.LiveTranscriptionEvents.Metadata, (data) => {
+            this.log("info", "Deepgram: metadata received")
+        })
+        this.dg.on(Deepgram.LiveTranscriptionEvents.Close, () => {
+            this.log("info", "Deepgram: connection close")
+        })
+        this.dg.on(Deepgram.LiveTranscriptionEvents.Error, (error: Error) => {
+            this.log("error", `Deepgram: ${error.message}`)
+            this.emit("error")
+        })
+
+        /* wait for Deepgram API to be available */
+        await new Promise((resolve, reject) => {
+            let timer: ReturnType<typeof setTimeout> | null = setTimeout(() => {
+                if (timer !== null) {
+                    timer = null
+                    reject(new Error("Deepgram: timeout waiting for connection open"))
+                }
+            }, 3000)
+            this.dg!.once(Deepgram.LiveTranscriptionEvents.Open, () => {
+                this.log("info", "Deepgram: connection open")
+                if (timer !== null) {
+                    clearTimeout(timer)
+                    timer = null
+                }
+                resolve(true)
+            })
+        })
+
+        /* remember opening time to receive time zero offset */
+        this.timeOpen = DateTime.now()
+
+        /* workaround Deepgram initialization problems */
+        let initDone = false
+        let initTimeout: ReturnType<typeof setTimeout> | null = null
+        const initTimeoutStart = () => {
+            if (initDone)
+                return
+            initTimeout = setTimeout(async () => {
+                if (initTimeout === null)
+                    return
+                initTimeout = null
+                this.log("warning", "Deepgram: initialization timeout -- restarting service usage")
+                await this.close()
+                this.open()
+            }, 3000)
+        }
+        const initTimeoutStop = () => {
+            if (initDone)
+                return
+            initDone = true
+            if (initTimeout !== null) {
+                clearTimeout(initTimeout)
+                initTimeout = null
+            }
+        }
+
+        /* provide Duplex stream and internally attach to Deepgram API */
+        const dg = this.dg
+        const log = (level: string, msg: string) => {
+            this.log(level, msg)
+        }
+        const encoding = this.config.textEncoding
+        this.stream = new Stream.Duplex({
+            writableObjectMode: true,
+            readableObjectMode: true,
+            decodeStrings: false,
+            write (chunk: SpeechFlowChunk, encoding, callback) {
+                if (chunk.type !== "audio")
+                    callback(new Error("expected audio input chunk"))
+                else if (!Buffer.isBuffer(chunk.payload))
+                    callback(new Error("expected Buffer input chunk"))
+                else {
+                    if (chunk.payload.byteLength > 0) {
+                        log("info", `Deepgram: send data (${chunk.payload.byteLength} bytes)`)
+                        initTimeoutStart()
+                        dg.send(chunk.payload) /* intentionally discard all time information */
+                    }
+                    callback()
+                }
+            },
+            read (size) {
+                queue.read().then((chunk) => {
+                    log("info", `Deepgram: receive data (${chunk.payload.length} bytes)`)
+                    initTimeoutStop()
+                    this.push(chunk, encoding)
+                })
+            },
+            final (callback) {
+                dg.requestClose()
+                this.push(null)
+                callback()
+            }
+        })
+    }
+
+    /* close node */
+    async close () {
+        /* close stream */
+        if (this.stream !== null) {
+            this.stream.destroy()
+            this.stream = null
+        }
+
+        /* shutdown Deepgram API */
+        if (this.dg !== null)
+            this.dg.requestClose()
+    }
+}
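The utils.SingleQueue<T> used above is another helper from the new speechflow-utils module whose implementation is not part of this excerpt. From its usage (push-style write() in the Transcript event handler, pull-style promise-returning read() in the Duplex read() method) it is presumably a single-consumer queue along these lines (a sketch, not the actual implementation):

    /* bridge push-style producers with a single pull-style consumer */
    class SingleQueue<T> {
        private items: T[] = []
        private waiter: ((item: T) => void) | null = null
        write (item: T): void {
            if (this.waiter !== null) {
                const waiter = this.waiter
                this.waiter = null
                waiter(item)           /* hand over directly to the pending read() */
            }
            else
                this.items.push(item)  /* buffer until the next read() */
        }
        read (): Promise<T> {
            if (this.items.length > 0)
                return Promise.resolve(this.items.shift()!)
            return new Promise<T>((resolve) => { this.waiter = resolve })
        }
    }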