speechflow 0.9.5 → 0.9.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +19 -0
- package/README.md +221 -53
- package/dst/speechflow-node-a2a-ffmpeg.d.ts +13 -0
- package/dst/speechflow-node-a2a-ffmpeg.js +152 -0
- package/dst/speechflow-node-a2a-wav.d.ts +11 -0
- package/dst/speechflow-node-a2a-wav.js +170 -0
- package/dst/speechflow-node-a2t-deepgram.d.ts +12 -0
- package/dst/speechflow-node-a2t-deepgram.js +220 -0
- package/dst/speechflow-node-deepgram.d.ts +3 -1
- package/dst/speechflow-node-deepgram.js +86 -22
- package/dst/speechflow-node-deepl.d.ts +3 -1
- package/dst/speechflow-node-deepl.js +25 -20
- package/dst/speechflow-node-device.d.ts +3 -1
- package/dst/speechflow-node-device.js +53 -2
- package/dst/speechflow-node-elevenlabs.d.ts +3 -1
- package/dst/speechflow-node-elevenlabs.js +37 -42
- package/dst/speechflow-node-ffmpeg.d.ts +3 -1
- package/dst/speechflow-node-ffmpeg.js +42 -4
- package/dst/speechflow-node-file.d.ts +3 -1
- package/dst/speechflow-node-file.js +84 -13
- package/dst/speechflow-node-format.d.ts +11 -0
- package/dst/speechflow-node-format.js +80 -0
- package/dst/speechflow-node-gemma.d.ts +3 -1
- package/dst/speechflow-node-gemma.js +84 -23
- package/dst/speechflow-node-mqtt.d.ts +13 -0
- package/dst/speechflow-node-mqtt.js +181 -0
- package/dst/speechflow-node-opus.d.ts +12 -0
- package/dst/speechflow-node-opus.js +135 -0
- package/dst/speechflow-node-subtitle.d.ts +12 -0
- package/dst/speechflow-node-subtitle.js +96 -0
- package/dst/speechflow-node-t2a-elevenlabs.d.ts +13 -0
- package/dst/speechflow-node-t2a-elevenlabs.js +182 -0
- package/dst/speechflow-node-t2t-deepl.d.ts +12 -0
- package/dst/speechflow-node-t2t-deepl.js +133 -0
- package/dst/speechflow-node-t2t-format.d.ts +11 -0
- package/dst/speechflow-node-t2t-format.js +80 -0
- package/dst/speechflow-node-t2t-gemma.d.ts +13 -0
- package/dst/speechflow-node-t2t-gemma.js +213 -0
- package/dst/speechflow-node-t2t-opus.d.ts +12 -0
- package/dst/speechflow-node-t2t-opus.js +135 -0
- package/dst/speechflow-node-t2t-subtitle.d.ts +12 -0
- package/dst/speechflow-node-t2t-subtitle.js +96 -0
- package/dst/speechflow-node-trace.d.ts +11 -0
- package/dst/speechflow-node-trace.js +88 -0
- package/dst/speechflow-node-wav.d.ts +11 -0
- package/dst/speechflow-node-wav.js +170 -0
- package/dst/speechflow-node-websocket.d.ts +3 -1
- package/dst/speechflow-node-websocket.js +149 -49
- package/dst/speechflow-node-whisper-common.d.ts +34 -0
- package/dst/speechflow-node-whisper-common.js +7 -0
- package/dst/speechflow-node-whisper-ggml.d.ts +1 -0
- package/dst/speechflow-node-whisper-ggml.js +97 -0
- package/dst/speechflow-node-whisper-onnx.d.ts +1 -0
- package/dst/speechflow-node-whisper-onnx.js +131 -0
- package/dst/speechflow-node-whisper-worker-ggml.d.ts +1 -0
- package/dst/speechflow-node-whisper-worker-ggml.js +97 -0
- package/dst/speechflow-node-whisper-worker-onnx.d.ts +1 -0
- package/dst/speechflow-node-whisper-worker-onnx.js +131 -0
- package/dst/speechflow-node-whisper-worker.d.ts +1 -0
- package/dst/speechflow-node-whisper-worker.js +116 -0
- package/dst/speechflow-node-whisper-worker2.d.ts +1 -0
- package/dst/speechflow-node-whisper-worker2.js +82 -0
- package/dst/speechflow-node-whisper.d.ts +19 -0
- package/dst/speechflow-node-whisper.js +604 -0
- package/dst/speechflow-node-x2x-trace.d.ts +11 -0
- package/dst/speechflow-node-x2x-trace.js +88 -0
- package/dst/speechflow-node-xio-device.d.ts +13 -0
- package/dst/speechflow-node-xio-device.js +205 -0
- package/dst/speechflow-node-xio-file.d.ts +11 -0
- package/dst/speechflow-node-xio-file.js +176 -0
- package/dst/speechflow-node-xio-mqtt.d.ts +13 -0
- package/dst/speechflow-node-xio-mqtt.js +181 -0
- package/dst/speechflow-node-xio-websocket.d.ts +13 -0
- package/dst/speechflow-node-xio-websocket.js +275 -0
- package/dst/speechflow-node.d.ts +24 -6
- package/dst/speechflow-node.js +63 -6
- package/dst/speechflow-utils.d.ts +23 -0
- package/dst/speechflow-utils.js +194 -0
- package/dst/speechflow.js +146 -43
- package/etc/biome.jsonc +12 -4
- package/etc/stx.conf +65 -0
- package/package.d/@ericedouard+vad-node-realtime+0.2.0.patch +18 -0
- package/package.json +49 -31
- package/sample.yaml +59 -27
- package/src/lib.d.ts +6 -1
- package/src/{speechflow-node-ffmpeg.ts → speechflow-node-a2a-ffmpeg.ts} +10 -4
- package/src/speechflow-node-a2a-wav.ts +143 -0
- package/src/speechflow-node-a2t-deepgram.ts +199 -0
- package/src/{speechflow-node-elevenlabs.ts → speechflow-node-t2a-elevenlabs.ts} +38 -45
- package/src/{speechflow-node-deepl.ts → speechflow-node-t2t-deepl.ts} +36 -25
- package/src/speechflow-node-t2t-format.ts +85 -0
- package/src/{speechflow-node-gemma.ts → speechflow-node-t2t-gemma.ts} +89 -25
- package/src/speechflow-node-t2t-opus.ts +111 -0
- package/src/speechflow-node-t2t-subtitle.ts +101 -0
- package/src/speechflow-node-x2x-trace.ts +92 -0
- package/src/{speechflow-node-device.ts → speechflow-node-xio-device.ts} +25 -3
- package/src/speechflow-node-xio-file.ts +153 -0
- package/src/speechflow-node-xio-mqtt.ts +154 -0
- package/src/speechflow-node-xio-websocket.ts +248 -0
- package/src/speechflow-node.ts +63 -6
- package/src/speechflow-utils.ts +212 -0
- package/src/speechflow.ts +150 -43
- package/etc/nps.yaml +0 -40
- package/src/speechflow-node-deepgram.ts +0 -133
- package/src/speechflow-node-file.ts +0 -108
- package/src/speechflow-node-websocket.ts +0 -179
package/CHANGELOG.md
ADDED

@@ -0,0 +1,19 @@
+
+ChangeLog
+=========
+
+0.9.7 (2025-07-12)
+------------------
+
+- IMPROVEMENT: replace "nps" with "stx" for NPM scripting
+
+0.9.6 (2025-07-12)
+------------------
+
+- IMPROVEMENT: major refactoring to object-mode streaming for supporting timestamps
+
+0.9.5 (2025-04-27)
+------------------
+
+(first rough cut of program)
+
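The 0.9.6 entry's "object-mode streaming for supporting timestamps" refers to the Node.js technique of passing structured chunks instead of raw `Buffer`s through a stream, so that every payload can carry timing metadata. The following is a minimal illustrative sketch of that technique only; the chunk shape and all names in it are assumptions for illustration, not speechflow's actual internals:

```ts
import Stream from "node:stream"

/*  hypothetical timestamped chunk shape (illustration only)  */
interface TimedChunk {
    kind:    "audio" | "text"
    payload: Buffer
    start:   number  /*  ms since stream time zero  */
    end:     number  /*  ms since stream time zero  */
}

/*  object-mode transform: stamp raw s16le mono PCM buffers with
    start/end times derived from a running sample counter  */
const sampleRate  = 48000
let   samplesSeen = 0
const stamper = new Stream.Transform({
    readableObjectMode: true,
    transform (chunk: Buffer, _enc, callback) {
        const samples = chunk.length / 2  /*  2 bytes per 16-bit sample  */
        const start   = (samplesSeen / sampleRate) * 1000
        samplesSeen  += samples
        const end     = (samplesSeen / sampleRate) * 1000
        const timed: TimedChunk = { kind: "audio", payload: chunk, start, end }
        callback(null, timed)
    }
})
```

Downstream object-mode nodes can then transform `payload` while leaving `start`/`end` intact, which plain `Buffer` streaming cannot express.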
package/README.md
CHANGED

@@ -18,13 +18,24 @@ About
 directed data flow graph of audio and text processing nodes. This way,
 it allows to perform various speech processing tasks in a flexible way.

-**SpeechFlow** comes with built-in graph nodes for
-
-
-
-
-
-
+**SpeechFlow** comes with built-in graph nodes for
+local file I/O,
+local audio device I/O,
+remote WebSocket network I/O,
+remote MQTT network I/O,
+cloud-based [Deepgram](https://deepgram.com) speech-to-text conversion,
+cloud-based [ElevenLabs](https://elevenlabs.io/) text-to-speech conversion,
+cloud-based [DeepL](https://deepl.com) text-to-text translation,
+local [Gemma/Ollama](https://ollama.com/library/gemma3) text-to-text translation,
+local [Gemma/Ollama](https://ollama.com/library/gemma3) text-to-text spelling correction,
+local [OPUS/ONNX](https://github.com/Helsinki-NLP/Opus-MT) text-to-text translation,
+local [FFmpeg](https://ffmpeg.org/) speech-to-speech encoding,
+local WAV speech-to-speech encoding,
+local text-to-text formatting,
+local text-to-text subtitle generation, and
+local text or audio tracing.
+
+Additional **SpeechFlow** graph nodes can be provided externally
 by NPM packages named `speechflow-node-xxx` which expose a class
 derived from the exported `SpeechFlowNode` class of the `speechflow` package.

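As a sketch of what such an external node might look like: the shape below is inferred from the node sources visible later in this diff (`static name`, the four-argument constructor, `open()`/`close()`, and the `input`, `output`, and `stream` members); the import specifier and the node's behavior are assumptions:

```ts
import Stream from "node:stream"
import SpeechFlowNode from "speechflow"  /*  assumed import shape of the exported class  */

/*  hypothetical external node "shout": an object-mode
    text-to-text pass-through which upper-cases each payload  */
export default class SpeechFlowNodeShout extends SpeechFlowNode {
    static name = "shout"
    constructor (id: string, cfg: { [ id: string ]: any },
                 opts: { [ id: string ]: any }, args: any[]) {
        super(id, cfg, opts, args)
        this.input  = "text"
        this.output = "text"
    }
    async open () {
        this.stream = new Stream.Transform({
            objectMode: true,
            transform (chunk: any, _enc, callback) {
                chunk.payload = String(chunk.payload).toUpperCase()
                callback(null, chunk)
            }
        })
    }
    async close () {
        this.stream?.destroy()
        this.stream = null
    }
}
```

Published as an NPM package named, e.g., `speechflow-node-shout`, such a class would match the discovery convention described above.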
@@ -47,60 +58,94 @@ $ speechflow
 [-V|--version]
 [-v|--verbose <level>]
 [-e|--expression <expression>]
-[-f|--
-[-c|--config <
+[-f|--file <file>]
+[-c|--config <id>@<yaml-config-file>]
 [<argument> [...]]
 ```

 Processing Graph Examples
 -------------------------

-
+The following are examples of **SpeechFlow** processing graphs.
+They can also be found in the [sample.yaml](./sample.yaml) file
+for easy consumption with `speechflow -c <id>@sample.yaml`.
+
+- **Capturing**: Capture audio from microphone device into WAV audio file:

 ```
 device(device: "wasapi:VoiceMeeter Out B1", mode: "r") |
-
+wav(mode: "encode") |
+file(path: "capture.wav", mode: "w", type: "audio")
 ```

-
+- **Pass-Through**: Pass-through audio from microphone device to speaker
+device and in parallel record it to WAV audio file:
+
+```
+device(device: "wasapi:VoiceMeeter Out B1", mode: "r") | {
+    wav(mode: "encode") |
+    file(path: "capture.wav", mode: "w", type: "audio"),
+    device(device: "wasapi:VoiceMeeter VAIO3 Input", mode: "w")
+}
+```
+
+- **Narration**: Generate text file with German narration of MP3 audio file:

 ```
 file(path: argv.0, mode: "r", type: "audio") |
-
-
+ffmpeg(src: "mp3", dst: "pcm") |
+deepgram(language: "de", key: env.SPEECHFLOW_KEY_DEEPGRAM) |
+format(width: 80) |
+file(path: argv.1, mode: "w", type: "text")
 ```

-
+- **Subtitling**: Generate text file with German subtitles of MP3 audio file:

 ```
-file(path:
-
-
+file(path: argv.0, mode: "r", type: "audio") |
+ffmpeg(src: "mp3", dst: "pcm") |
+deepgram(language: "de", key: env.SPEECHFLOW_KEY_DEEPGRAM) |
+subtitle(format: "vtt") |
+file(path: argv.1, mode: "w", type: "text")
 ```

-
+- **Ad-Hoc Translation**: Ad-Hoc text translation from German to English
+via stdin/stdout:

 ```
-
-
-
-}
+file(path: "-", mode: "r", type: "text") |
+deepl(src: "de", dst: "en") |
+file(path: "-", mode: "w", type: "text")
 ```

-- Real-time translation from
+- **Studio Translation**: Real-time studio translation from German to English,
+including the capturing of all involved inputs and outputs:

 ```
-device(device: "
-
-
-
-
-
-
-
-
-
-
+device(device: "coreaudio:Elgato Wave:3", mode: "r") | {
+    wav(mode: "encode") |
+    file(path: "program-de.wav", mode: "w", type: "audio"),
+    deepgram(key: env.SPEECHFLOW_KEY_DEEPGRAM, language: "de") | {
+        format(width: 80) |
+        file(path: "program-de.txt", mode: "w", type: "text"),
+        deepl(key: env.SPEECHFLOW_KEY_DEEPL, src: "de", dst: "en") | {
+            format(width: 80) |
+            file(path: "program-en.txt", mode: "w", type: "text"),
+            subtitle(format: "vtt") | {
+                file(path: "program-en.vtt", mode: "w", type: "text"),
+                mqtt(url: "mqtt://10.1.0.10:1883",
+                    username: env.SPEECHFLOW_MQTT_USER,
+                    password: env.SPEECHFLOW_MQTT_PASS,
+                    topicWrite: "stream/studio/sender")
+            },
+            subtitle(format: "srt") |
+            file(path: "program-en.srt", mode: "w", type: "text"),
+            elevenlabs(voice: "Mark", speed: 1.05, language: "en") | {
+                wav(mode: "encode") |
+                file(path: "program-en.wav", mode: "w", type: "audio"),
+                device(device: "coreaudio:USBAudio2.0", mode: "w")
+            }
+        }
+    }
 }
 }
 ```
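The studio example above publishes VTT subtitle fragments to an MQTT broker (`topicWrite: "stream/studio/sender"`), and the node list below notes that the mqtt node requires a peer broker. That consuming peer is outside the package; a minimal subscriber sketch, assuming the common `mqtt` NPM package on the receiving side:

```ts
import mqtt from "mqtt"

/*  minimal peer for the studio example: subscribe to the topic
    the mqtt() node writes to and print each subtitle payload  */
const client = mqtt.connect("mqtt://10.1.0.10:1883", {
    username: process.env.SPEECHFLOW_MQTT_USER,
    password: process.env.SPEECHFLOW_MQTT_PASS
})
client.on("connect", () => {
    client.subscribe("stream/studio/sender")
})
client.on("message", (_topic, message) => {
    process.stdout.write(message.toString())
})
```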
@@ -108,7 +153,30 @@ Processing Graph Examples
 Processing Node Types
 ---------------------

-
+First a short overview of the available processing nodes:
+
+- Input/Output nodes:
+  **file**,
+  **device**,
+  **websocket**,
+  **mqtt**.
+- Audio-to-Audio nodes:
+  **ffmpeg**,
+  **wav**.
+- Audio-to-Text nodes:
+  **deepgram**.
+- Text-to-Text nodes:
+  **deepl**,
+  **gemma**,
+  **opus**,
+  **subtitle**,
+  **format**.
+- Text-to-Audio nodes:
+  **elevenlabs**.
+- Any-to-Any nodes:
+  **trace**.
+
+### Input/Output Nodes:

 - Node: **file**<br/>
 Purpose: **File and StdIO source/sink**<br/>
@@ -125,9 +193,24 @@ Currently **SpeechFlow** provides the following processing nodes:
 | **mode** | 1 | "r" | `/^(?:r\|w\|rw)$/` |
 | **type** | 2 | "audio" | `/^(?:audio\|text)$/` |

+- Node: **device**<br/>
+Purpose: **Microphone/speaker device source/sink**<br/>
+Example: `device(device: "wasapi:VoiceMeeter Out B1", mode: "r")`
+
+| Port | Payload |
+| ------- | ----------- |
+| input | audio |
+| output | audio |
+
+| Parameter | Position | Default | Requirement |
+| ----------- | --------- | -------- | ------------------ |
+| **device** | 0 | *none* | `/^(.+?):(.+)$/` |
+| **mode** | 1 | "rw" | `/^(?:r\|w\|rw)$/` |
+
 - Node: **websocket**<br/>
 Purpose: **WebSocket source/sink**<br/>
-Example: `websocket(connect: "ws://127.0.0.1:12345"
+Example: `websocket(connect: "ws://127.0.0.1:12345", type: "text")`
+Notice: this node requires a peer WebSocket service!

 | Port | Payload |
 | ------- | ----------- |
@@ -140,19 +223,24 @@ Currently **SpeechFlow** provides the following processing nodes:
 | **connect** | *none* | *none* | `/^(?:\|ws:\/\/(.+?):(\d+)(?:\/.*)?)$/` |
 | **type** | *none* | "audio" | `/^(?:audio\|text)$/` |

-- Node: **
-Purpose: **
-Example: `
+- Node: **mqtt**<br/>
+Purpose: **MQTT sink**<br/>
+Example: `mqtt(url: "mqtt://127.0.0.1:1883", username: "foo", password: "bar", topic: "quux")`
+Notice: this node requires a peer MQTT broker!

 | Port | Payload |
 | ------- | ----------- |
-| input |
-| output |
+| input | text |
+| output | none |

-| Parameter
-|
-| **
-| **
+| Parameter | Position | Default | Requirement |
+| ------------ | --------- | -------- | --------------------- |
+| **url** | 0 | *none* | `/^(?:\|(?:ws\|mqtt):\/\/(.+?):(\d+))$/` |
+| **username** | 1 | *none* | `/^.+$/` |
+| **password** | 2 | *none* | `/^.+$/` |
+| **topic** | 3 | *none* | `/^.+$/` |
+
+### Audio-to-Audio Nodes:

 - Node: **ffmpeg**<br/>
 Purpose: **FFmpeg audio format conversion**<br/>
@@ -168,6 +256,21 @@ Currently **SpeechFlow** provides the following processing nodes:
 | **src** | 0 | "pcm" | `/^(?:pcm\|wav\|mp3\|opus)$/` |
 | **dst** | 1 | "wav" | `/^(?:pcm\|wav\|mp3\|opus)$/` |

+- Node: **wav**<br/>
+Purpose: **WAV audio format conversion**<br/>
+Example: `wav(mode: "encode")`
+
+| Port | Payload |
+| ------- | ----------- |
+| input | audio |
+| output | audio |
+
+| Parameter | Position | Default | Requirement |
+| ----------- | --------- | -------- | ------------------------ |
+| **mode** | 0 | "encode" | `/^(?:encode\|decode)$/` |
+
+### Audio-to-Text Nodes:
+
 - Node: **deepgram**<br/>
 Purpose: **Deepgram Speech-to-Text conversion**<br/>
 Example: `deepgram(language: "de")`<br/>
@@ -181,13 +284,15 @@ Currently **SpeechFlow** provides the following processing nodes:
 | Parameter | Position | Default | Requirement |
 | ------------ | --------- | -------- | ------------------ |
 | **key** | *none* | env.SPEECHFLOW\_KEY\_DEEPGRAM | *none* |
-| **model** | 0 | "nova-
+| **model** | 0 | "nova-3" | *none* |
 | **version** | 1 | "latest" | *none* |
-| **language** | 2 | "
+| **language** | 2 | "multi" | *none* |
+
+### Text-to-Text Nodes:

 - Node: **deepl**<br/>
 Purpose: **DeepL Text-to-Text translation**<br/>
-Example: `deepl(src: "de", dst: "en
+Example: `deepl(src: "de", dst: "en")`<br/>
 Notice: this node requires an API key!

 | Port | Payload |
@@ -198,11 +303,11 @@ Currently **SpeechFlow** provides the following processing nodes:
 | Parameter | Position | Default | Requirement |
 | ------------ | --------- | -------- | ------------------ |
 | **key** | *none* | env.SPEECHFLOW\_KEY\_DEEPL | *none* |
-| **src** | 0 | "de" | `/^(?:de\|en
-| **dst** | 1 | "en
+| **src** | 0 | "de" | `/^(?:de\|en)$/` |
+| **dst** | 1 | "en" | `/^(?:de\|en)$/` |

 - Node: **gemma**<br/>
-Purpose: **Google Gemma Text-to-Text translation**<br/>
+Purpose: **Google Gemma Text-to-Text translation and spelling correction**<br/>
 Example: `gemma(src: "de", dst: "en")`<br/>
 Notice: this node requires the Ollama API!

@@ -217,6 +322,48 @@ Currently **SpeechFlow** provides the following processing nodes:
 | **src** | 0 | "de" | `/^(?:de\|en)$/` |
 | **dst** | 1 | "en" | `/^(?:de\|en)$/` |

+- Node: **opus**<br/>
+Purpose: **OPUS Text-to-Text translation**<br/>
+Example: `opus(src: "de", dst: "en")`<br/>
+
+| Port | Payload |
+| ------- | ----------- |
+| input | text |
+| output | text |
+
+| Parameter | Position | Default | Requirement |
+| ------------ | --------- | -------- | ---------------- |
+| **src** | 0 | "de" | `/^(?:de\|en)$/` |
+| **dst** | 1 | "en" | `/^(?:de\|en)$/` |
+
+- Node: **subtitle**<br/>
+Purpose: **SRT/VTT Subtitle Generation**<br/>
+Example: `subtitle(format: "srt")`<br/>
+
+| Port | Payload |
+| ------- | ----------- |
+| input | text |
+| output | text |
+
+| Parameter | Position | Default | Requirement |
+| ------------ | --------- | -------- | ------------------ |
+| **format** | *none* | "srt" | `/^(?:srt\|vtt)$/` |
+
+- Node: **format**<br/>
+Purpose: **text paragraph formatting**<br/>
+Example: `format(width: 80)`<br/>
+
+| Port | Payload |
+| ------- | ----------- |
+| input | text |
+| output | text |
+
+| Parameter | Position | Default | Requirement |
+| ------------ | --------- | -------- | --------------------- |
+| **width** | 0 | 80 | *none* |
+
+### Text-to-Audio Nodes:
+
 - Node: **elevenlabs**<br/>
 Purpose: **ElevenLabs Text-to-Speech conversion**<br/>
 Example: `elevenlabs(language: "en")`<br/>
@@ -233,6 +380,22 @@ Currently **SpeechFlow** provides the following processing nodes:
 | **voice** | 0 | "Brian" | *none* |
 | **language** | 1 | "de" | *none* |

+### Any-to-Any Nodes:
+
+- Node: **trace**<br/>
+Purpose: **data flow tracing**<br/>
+Example: `trace(type: "audio")`<br/>
+
+| Port | Payload |
+| ------- | ----------- |
+| input | text, audio |
+| output | text, audio |
+
+| Parameter | Position | Default | Requirement |
+| ------------ | --------- | -------- | --------------------- |
+| **type** | 0 | "audio" | `/^(?:audio\|text)$/` |
+| **name** | 1 | *none* | *none* |
+
 Graph Expression Language
 -------------------------

@@ -266,13 +429,18 @@ number-value ::= "0b" /[01]+/
 value ::= "true" | "false" | "null" | "NaN" | "undefined"
 ```

+**SpeechFlow** makes available to **FlowLink** all **SpeechFlow** nodes as
+`node`, the CLI arguments under the array `variable` named `argv`, and all
+environment variables under the object `variable` named `env`.
+
 History
 -------

 **Speechflow**, as a technical cut-through, was initially created in
 March 2024 for use in the msg Filmstudio context. It was later refined
 into a more complete toolkit in April 2025 and this way could be used
-in production for the first time.
+in production for the first time. It was fully refactored in July 2025
+in order to support timestamps in the stream processing.

 Copyright & License
 -------------------
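The README's websocket node is marked as requiring a peer WebSocket service. What that peer does is application-specific; a minimal text-mode counterpart sketch, assuming the common `ws` NPM package (the node's exact wire format beyond plain payloads is not specified in this diff):

```ts
import { WebSocketServer } from "ws"

/*  minimal peer for websocket(connect: "ws://127.0.0.1:12345", type: "text"):
    accept connections and echo every received text payload back  */
const wss = new WebSocketServer({ host: "127.0.0.1", port: 12345 })
wss.on("connection", (ws) => {
    ws.on("message", (data) => {
        ws.send(data.toString())  /*  echo the text back into the graph  */
    })
})
```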
package/dst/speechflow-node-a2a-ffmpeg.d.ts
ADDED

@@ -0,0 +1,13 @@
+import SpeechFlowNode from "./speechflow-node";
+export default class SpeechFlowNodeFFmpeg extends SpeechFlowNode {
+    static name: string;
+    private ffmpegBinary;
+    private ffmpeg;
+    constructor(id: string, cfg: {
+        [id: string]: any;
+    }, opts: {
+        [id: string]: any;
+    }, args: any[]);
+    open(): Promise<void>;
+    close(): Promise<void>;
+}
package/dst/speechflow-node-a2a-ffmpeg.js
ADDED

@@ -0,0 +1,152 @@
+"use strict";
+/*
+**  SpeechFlow - Speech Processing Flow Graph
+**  Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+**  Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+*/
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+        desc = { enumerable: true, get: function() { return m[k]; } };
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+    Object.defineProperty(o, "default", { enumerable: true, value: v });
+}) : function(o, v) {
+    o["default"] = v;
+});
+var __importStar = (this && this.__importStar) || (function () {
+    var ownKeys = function(o) {
+        ownKeys = Object.getOwnPropertyNames || function (o) {
+            var ar = [];
+            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+            return ar;
+        };
+        return ownKeys(o);
+    };
+    return function (mod) {
+        if (mod && mod.__esModule) return mod;
+        var result = {};
+        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+        __setModuleDefault(result, mod);
+        return result;
+    };
+})();
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+/* standard dependencies */
+const node_stream_1 = __importDefault(require("node:stream"));
+/* external dependencies */
+const ffmpeg_1 = __importDefault(require("@rse/ffmpeg"));
+const ffmpeg_stream_1 = require("ffmpeg-stream");
+/* internal dependencies */
+const speechflow_node_1 = __importDefault(require("./speechflow-node"));
+const utils = __importStar(require("./speechflow-utils"));
+/* SpeechFlow node for FFmpeg */
+class SpeechFlowNodeFFmpeg extends speechflow_node_1.default {
+    /* declare official node name */
+    static name = "ffmpeg";
+    /* internal state */
+    ffmpegBinary = ffmpeg_1.default.supported ? ffmpeg_1.default.binary : "ffmpeg";
+    ffmpeg = null;
+    /* construct node */
+    constructor(id, cfg, opts, args) {
+        super(id, cfg, opts, args);
+        /* declare node configuration parameters */
+        this.configure({
+            src: { type: "string", pos: 0, val: "pcm", match: /^(?:pcm|wav|mp3|opus)$/ },
+            dst: { type: "string", pos: 1, val: "wav", match: /^(?:pcm|wav|mp3|opus)$/ }
+        });
+        /* declare node input/output format */
+        this.input = "audio";
+        this.output = "audio";
+    }
+    /* open node */
+    async open() {
+        /* sanity check situation */
+        if (this.params.src === this.params.dst)
+            throw new Error("source and destination formats should not be the same");
+        /* instantiate FFmpeg sub-process */
+        this.ffmpeg = new ffmpeg_stream_1.Converter(this.ffmpegBinary);
+        const streamInput = this.ffmpeg.createInputStream({
+            /* FFmpeg input options */
+            "fflags": "nobuffer",
+            "flags": "low_delay",
+            "probesize": 32,
+            "analyzeduration": 0,
+            ...(this.params.src === "pcm" ? {
+                "f": "s16le",
+                "ar": this.config.audioSampleRate,
+                "ac": this.config.audioChannels
+            } : {}),
+            ...(this.params.src === "wav" ? {
+                "f": "wav"
+            } : {}),
+            ...(this.params.src === "mp3" ? {
+                "f": "mp3"
+            } : {}),
+            ...(this.params.src === "opus" ? {
+                "f": "opus"
+            } : {})
+        });
+        const streamOutput = this.ffmpeg.createOutputStream({
+            /* FFmpeg output options */
+            "flush_packets": 1,
+            ...(this.params.dst === "pcm" ? {
+                "c:a": "pcm_s16le",
+                "ar": this.config.audioSampleRate,
+                "ac": this.config.audioChannels,
+                "f": "s16le",
+            } : {}),
+            ...(this.params.dst === "wav" ? {
+                "f": "wav"
+            } : {}),
+            ...(this.params.dst === "mp3" ? {
+                "c:a": "libmp3lame",
+                "b:a": "192k",
+                "f": "mp3"
+            } : {}),
+            ...(this.params.dst === "opus" ? {
+                "acodec": "libopus",
+                "f": "opus"
+            } : {})
+        });
+        this.ffmpeg.run();
+        /* establish a duplex stream and connect it to FFmpeg */
+        this.stream = node_stream_1.default.Duplex.from({
+            writable: streamInput,
+            readable: streamOutput
+        });
+        /* wrap streams with conversions for chunk vs plain audio */
+        const wrapper1 = utils.createTransformStreamForWritableSide();
+        const wrapper2 = utils.createTransformStreamForReadableSide("audio", () => this.timeZero);
+        this.stream = node_stream_1.default.compose(wrapper1, this.stream, wrapper2);
+    }
+    /* close node */
+    async close() {
+        /* close duplex stream */
+        if (this.stream !== null) {
+            await new Promise((resolve) => {
+                if (this.stream instanceof node_stream_1.default.Duplex)
+                    this.stream.end(() => { resolve(); });
+                else
+                    resolve();
+            });
+            this.stream.destroy();
+            this.stream = null;
+        }
+        /* shutdown FFmpeg */
+        if (this.ffmpeg !== null) {
+            this.ffmpeg.kill();
+            this.ffmpeg = null;
+        }
+    }
+}
+exports.default = SpeechFlowNodeFFmpeg;
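The `open()` method above uses a wiring pattern worth noting: `Stream.Duplex.from({ writable, readable })` fuses FFmpeg's input and output streams into one duplex, and `Stream.compose()` then sandwiches that duplex between the two wrapper transforms. A stripped-down sketch of the same pattern, with pass-through streams standing in for the FFmpeg process:

```ts
import Stream from "node:stream"

/*  stand-in for the external process: whatever is written to
    one side re-appears on the other  */
const writableSide = new Stream.PassThrough()
const readableSide = new Stream.PassThrough()
writableSide.pipe(readableSide)

/*  fuse both sides into a single duplex stream  */
const core = Stream.Duplex.from({ writable: writableSide, readable: readableSide })

/*  compose pre/post transforms around the core, as the node does
    with its chunk/plain-audio wrapper transforms  */
const pre      = new Stream.PassThrough()
const post     = new Stream.PassThrough()
const pipeline = Stream.compose(pre, core, post)

pipeline.on("data", (chunk) => console.log(chunk.toString()))  /*  prints "ping"  */
pipeline.end("ping")
```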
package/dst/speechflow-node-a2a-wav.d.ts
ADDED

@@ -0,0 +1,11 @@
+import SpeechFlowNode from "./speechflow-node";
+export default class SpeechFlowNodeWAV extends SpeechFlowNode {
+    static name: string;
+    constructor(id: string, cfg: {
+        [id: string]: any;
+    }, opts: {
+        [id: string]: any;
+    }, args: any[]);
+    open(): Promise<void>;
+    close(): Promise<void>;
+}