speechflow 0.9.4 → 0.9.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. package/CHANGELOG.md +19 -0
  2. package/README.md +227 -54
  3. package/dst/speechflow-node-a2a-ffmpeg.d.ts +13 -0
  4. package/dst/speechflow-node-a2a-ffmpeg.js +152 -0
  5. package/dst/speechflow-node-a2a-wav.d.ts +11 -0
  6. package/dst/speechflow-node-a2a-wav.js +170 -0
  7. package/dst/speechflow-node-a2t-deepgram.d.ts +12 -0
  8. package/dst/speechflow-node-a2t-deepgram.js +220 -0
  9. package/dst/speechflow-node-deepgram.d.ts +3 -1
  10. package/dst/speechflow-node-deepgram.js +86 -22
  11. package/dst/speechflow-node-deepl.d.ts +3 -1
  12. package/dst/speechflow-node-deepl.js +25 -20
  13. package/dst/speechflow-node-device.d.ts +3 -1
  14. package/dst/speechflow-node-device.js +53 -2
  15. package/dst/speechflow-node-elevenlabs.d.ts +4 -1
  16. package/dst/speechflow-node-elevenlabs.js +88 -49
  17. package/dst/speechflow-node-ffmpeg.d.ts +3 -1
  18. package/dst/speechflow-node-ffmpeg.js +42 -4
  19. package/dst/speechflow-node-file.d.ts +3 -1
  20. package/dst/speechflow-node-file.js +84 -13
  21. package/dst/speechflow-node-format.d.ts +11 -0
  22. package/dst/speechflow-node-format.js +80 -0
  23. package/dst/speechflow-node-gemma.d.ts +3 -1
  24. package/dst/speechflow-node-gemma.js +84 -23
  25. package/dst/speechflow-node-mqtt.d.ts +13 -0
  26. package/dst/speechflow-node-mqtt.js +181 -0
  27. package/dst/speechflow-node-opus.d.ts +12 -0
  28. package/dst/speechflow-node-opus.js +135 -0
  29. package/dst/speechflow-node-subtitle.d.ts +12 -0
  30. package/dst/speechflow-node-subtitle.js +96 -0
  31. package/dst/speechflow-node-t2a-elevenlabs.d.ts +13 -0
  32. package/dst/speechflow-node-t2a-elevenlabs.js +182 -0
  33. package/dst/speechflow-node-t2t-deepl.d.ts +12 -0
  34. package/dst/speechflow-node-t2t-deepl.js +133 -0
  35. package/dst/speechflow-node-t2t-format.d.ts +11 -0
  36. package/dst/speechflow-node-t2t-format.js +80 -0
  37. package/dst/speechflow-node-t2t-gemma.d.ts +13 -0
  38. package/dst/speechflow-node-t2t-gemma.js +213 -0
  39. package/dst/speechflow-node-t2t-opus.d.ts +12 -0
  40. package/dst/speechflow-node-t2t-opus.js +135 -0
  41. package/dst/speechflow-node-t2t-subtitle.d.ts +12 -0
  42. package/dst/speechflow-node-t2t-subtitle.js +96 -0
  43. package/dst/speechflow-node-trace.d.ts +11 -0
  44. package/dst/speechflow-node-trace.js +88 -0
  45. package/dst/speechflow-node-wav.d.ts +11 -0
  46. package/dst/speechflow-node-wav.js +170 -0
  47. package/dst/speechflow-node-websocket.d.ts +3 -1
  48. package/dst/speechflow-node-websocket.js +149 -49
  49. package/dst/speechflow-node-whisper-common.d.ts +34 -0
  50. package/dst/speechflow-node-whisper-common.js +7 -0
  51. package/dst/speechflow-node-whisper-ggml.d.ts +1 -0
  52. package/dst/speechflow-node-whisper-ggml.js +97 -0
  53. package/dst/speechflow-node-whisper-onnx.d.ts +1 -0
  54. package/dst/speechflow-node-whisper-onnx.js +131 -0
  55. package/dst/speechflow-node-whisper-worker-ggml.d.ts +1 -0
  56. package/dst/speechflow-node-whisper-worker-ggml.js +97 -0
  57. package/dst/speechflow-node-whisper-worker-onnx.d.ts +1 -0
  58. package/dst/speechflow-node-whisper-worker-onnx.js +131 -0
  59. package/dst/speechflow-node-whisper-worker.d.ts +1 -0
  60. package/dst/speechflow-node-whisper-worker.js +116 -0
  61. package/dst/speechflow-node-whisper-worker2.d.ts +1 -0
  62. package/dst/speechflow-node-whisper-worker2.js +82 -0
  63. package/dst/speechflow-node-whisper.d.ts +19 -0
  64. package/dst/speechflow-node-whisper.js +604 -0
  65. package/dst/speechflow-node-x2x-trace.d.ts +11 -0
  66. package/dst/speechflow-node-x2x-trace.js +88 -0
  67. package/dst/speechflow-node-xio-device.d.ts +13 -0
  68. package/dst/speechflow-node-xio-device.js +205 -0
  69. package/dst/speechflow-node-xio-file.d.ts +11 -0
  70. package/dst/speechflow-node-xio-file.js +176 -0
  71. package/dst/speechflow-node-xio-mqtt.d.ts +13 -0
  72. package/dst/speechflow-node-xio-mqtt.js +181 -0
  73. package/dst/speechflow-node-xio-websocket.d.ts +13 -0
  74. package/dst/speechflow-node-xio-websocket.js +275 -0
  75. package/dst/speechflow-node.d.ts +25 -7
  76. package/dst/speechflow-node.js +74 -9
  77. package/dst/speechflow-utils.d.ts +23 -0
  78. package/dst/speechflow-utils.js +194 -0
  79. package/dst/speechflow.js +146 -43
  80. package/etc/biome.jsonc +12 -4
  81. package/etc/stx.conf +65 -0
  82. package/package.d/@ericedouard+vad-node-realtime+0.2.0.patch +18 -0
  83. package/package.json +49 -31
  84. package/sample.yaml +61 -23
  85. package/src/lib.d.ts +6 -1
  86. package/src/{speechflow-node-ffmpeg.ts → speechflow-node-a2a-ffmpeg.ts} +10 -4
  87. package/src/speechflow-node-a2a-wav.ts +143 -0
  88. package/src/speechflow-node-a2t-deepgram.ts +199 -0
  89. package/src/speechflow-node-t2a-elevenlabs.ts +160 -0
  90. package/src/{speechflow-node-deepl.ts → speechflow-node-t2t-deepl.ts} +36 -25
  91. package/src/speechflow-node-t2t-format.ts +85 -0
  92. package/src/{speechflow-node-gemma.ts → speechflow-node-t2t-gemma.ts} +89 -25
  93. package/src/speechflow-node-t2t-opus.ts +111 -0
  94. package/src/speechflow-node-t2t-subtitle.ts +101 -0
  95. package/src/speechflow-node-x2x-trace.ts +92 -0
  96. package/src/{speechflow-node-device.ts → speechflow-node-xio-device.ts} +25 -3
  97. package/src/speechflow-node-xio-file.ts +153 -0
  98. package/src/speechflow-node-xio-mqtt.ts +154 -0
  99. package/src/speechflow-node-xio-websocket.ts +248 -0
  100. package/src/speechflow-node.ts +78 -13
  101. package/src/speechflow-utils.ts +212 -0
  102. package/src/speechflow.ts +150 -43
  103. package/etc/nps.yaml +0 -40
  104. package/src/speechflow-node-deepgram.ts +0 -133
  105. package/src/speechflow-node-elevenlabs.ts +0 -116
  106. package/src/speechflow-node-file.ts +0 -108
  107. package/src/speechflow-node-websocket.ts +0 -179
package/CHANGELOG.md ADDED
@@ -0,0 +1,19 @@
+
+ ChangeLog
+ =========
+
+ 0.9.7 (2025-07-12)
+ ------------------
+
+ - IMPROVEMENT: replace "nps" with "stx" for NPM scripting
+
+ 0.9.6 (2025-07-12)
+ ------------------
+
+ - IMPROVEMENT: major refactoring to object-mode streaming for supporting timestamps
+
+ 0.9.5 (2025-04-27)
+ ------------------
+
+ (first rough cut of program)
+
package/README.md CHANGED
@@ -17,14 +17,30 @@ About
  **SpeechFlow** is a command-line interface based tool for establishing a
  directed data flow graph of audio and text processing nodes. This way,
  it allows performing various speech processing tasks in a flexible way.
- Currently, **SpeechFlow** comes with graph nodes for local file I/O, local audio
- device I/O, local/remote WebSocket network I/O, cloud-based [Deepgram](https://deepgram.com)
- speech-to-text conversion, cloud-based [DeepL](https://deepl.com) text-to-text
- translation, local [Gemma/Ollama](https://ollama.com/library/gemma3)
- text-to-text translation, cloud-based [ElevenLabs](https://elevenlabs.io/)
- text-to-speech conversion, and local [FFmpeg](https://ffmpeg.org/)
- speech-to-speech encoding. **SpeechFlow** is written in TypeScript and
- ships as a package for the Node Package Manager (NPM).
+
+ **SpeechFlow** comes with built-in graph nodes for
+ local file I/O,
+ local audio device I/O,
+ remote WebSocket network I/O,
+ remote MQTT network I/O,
+ cloud-based [Deepgram](https://deepgram.com) speech-to-text conversion,
+ cloud-based [ElevenLabs](https://elevenlabs.io/) text-to-speech conversion,
+ cloud-based [DeepL](https://deepl.com) text-to-text translation,
+ local [Gemma/Ollama](https://ollama.com/library/gemma3) text-to-text translation,
+ local [Gemma/Ollama](https://ollama.com/library/gemma3) text-to-text spelling correction,
+ local [OPUS/ONNX](https://github.com/Helsinki-NLP/Opus-MT) text-to-text translation,
+ local [FFmpeg](https://ffmpeg.org/) speech-to-speech encoding,
+ local WAV speech-to-speech encoding,
+ local text-to-text formatting,
+ local text-to-text subtitle generation, and
+ local text or audio tracing.
+
+ Additional **SpeechFlow** graph nodes can be provided externally
+ by NPM packages named `speechflow-node-xxx` which expose a class
+ derived from the exported `SpeechFlowNode` class of the `speechflow` package.
+
+ **SpeechFlow** is written in TypeScript and
+ ships as an installable package for the Node Package Manager (NPM).

  Installation
  ------------
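The external-node extension point described in the added README text above (NPM packages named `speechflow-node-xxx` deriving from the exported `SpeechFlowNode` class) can be illustrated with a minimal sketch. It assumes only the base-class surface visible in the compiled `.d.ts`/`.js` files later in this diff — the `(id, cfg, opts, args)` constructor, the `input`/`output` markers, the `stream` property, and `open()`/`close()` — and uses a hypothetical "upper" node and an assumed chunk shape:

```typescript
import stream from "node:stream"
import SpeechFlowNode from "speechflow"  /* assumed default export of the base class */

/* hypothetical external node: uppercases all text passing through */
export default class SpeechFlowNodeUpper extends SpeechFlowNode {
    static name = "upper"
    constructor (id: string, cfg: { [id: string]: any },
        opts: { [id: string]: any }, args: any[]) {
        super(id, cfg, opts, args)
        /* declare node input/output format */
        this.input  = "text"
        this.output = "text"
    }
    async open () {
        /* object-mode transform; the exact chunk shape is an assumption here */
        this.stream = new stream.Transform({
            objectMode: true,
            transform (chunk: any, _encoding: BufferEncoding, callback: stream.TransformCallback) {
                if (typeof chunk?.payload === "string")
                    chunk.payload = chunk.payload.toUpperCase()
                callback(null, chunk)
            }
        })
    }
    async close () {
        this.stream?.destroy()
        this.stream = null
    }
}
```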
@@ -42,60 +58,94 @@ $ speechflow
  [-V|--version]
  [-v|--verbose <level>]
  [-e|--expression <expression>]
- [-f|--expression-file <expression-file>]
- [-c|--config <key>@<yaml-config-file>]
+ [-f|--file <file>]
+ [-c|--config <id>@<yaml-config-file>]
  [<argument> [...]]
  ```

  Processing Graph Examples
  -------------------------

- - Capture audio from microphone to file:
+ The following are examples of **SpeechFlow** processing graphs.
+ They can also be found in the [sample.yaml](./sample.yaml) file
+ for easy consumption with `speechflow -c <id>@sample.yaml`.
+
+ - **Capturing**: Capture audio from microphone device into WAV audio file:

  ```
  device(device: "wasapi:VoiceMeeter Out B1", mode: "r") |
- file(path: "capture.pcm", mode: "w", type: "audio")
+ wav(mode: "encode") |
+ file(path: "capture.wav", mode: "w", type: "audio")
  ```

- - Generate audio file with narration of text file:
+ - **Pass-Through**: Pass-through audio from microphone device to speaker
+ device and in parallel record it to WAV audio file:
+
+ ```
+ device(device: "wasapi:VoiceMeeter Out B1", mode: "r") | {
+ wav(mode: "encode") |
+ file(path: "capture.wav", mode: "w", type: "audio"),
+ device(device: "wasapi:VoiceMeeter VAIO3 Input", mode: "w")
+ }
+ ```
+
+ - **Narration**: Generate text file with German narration of MP3 audio file:

  ```
  file(path: argv.0, mode: "r", type: "audio") |
- deepgram(language: "en") |
- file(path: argv.1, mode: "w", type: "text")
+ ffmpeg(src: "mp3", dst: "pcm") |
+ deepgram(language: "de", key: env.SPEECHFLOW_KEY_DEEPGRAM) |
+ format(width: 80) |
+ file(path: argv.1, mode: "w", type: "text")
  ```

- - Translate stdin to stdout:
+ - **Subtitling**: Generate text file with German subtitles of MP3 audio file:

  ```
- file(path: "-", mode: "r", type: "text") |
- deepl(src: "de", dst: "en-US") |
- file(path: "-", mode: "w", type: "text")
+ file(path: argv.0, mode: "r", type: "audio") |
+ ffmpeg(src: "mp3", dst: "pcm") |
+ deepgram(language: "de", key: env.SPEECHFLOW_KEY_DEEPGRAM) |
+ subtitle(format: "vtt") |
+ file(path: argv.1, mode: "w", type: "text")
  ```

- - Pass-through audio from microphone to speaker and in parallel record it to file:
+ - **Ad-Hoc Translation**: Ad-hoc text translation from German to English
+ via stdin/stdout:

  ```
- device(device: "wasapi:VoiceMeeter Out B1", mode: "r") | {
- file(path: "capture.pcm", mode: "w", type: "audio"),
- device(device: "wasapi:VoiceMeeter VAIO3 Input", mode: "w")
- }
+ file(path: "-", mode: "r", type: "text") |
+ deepl(src: "de", dst: "en") |
+ file(path: "-", mode: "w", type: "text")
  ```

- - Real-time translation from german to english, including capturing of all inputs and outputs:
+ - **Studio Translation**: Real-time studio translation from German to English,
+ including the capturing of all involved inputs and outputs:

  ```
- device(device: "wasapi:VoiceMeeter Out B1", mode: "r") | {
- file(path: "translation-audio-de.pcm", mode: "w", type: "audio"),
- deepgram(language: "de") |
- file(path: "translation-text-de.txt", mode: "w", type: "text")
- } | {
- deepl(src: "de", dst: "en-US") |
- file(path: "translation-text-en.txt", mode: "w", type: "text")
- } | {
- elevenlabs(language: "en") | {
- file(path: "translation-audio-en.pcm", mode: "w", type: "audio"),
- device(device: "wasapi:VoiceMeeter VAIO3 Input", mode: "w")
+ device(device: "coreaudio:Elgato Wave:3", mode: "r") | {
+ wav(mode: "encode") |
+ file(path: "program-de.wav", mode: "w", type: "audio"),
+ deepgram(key: env.SPEECHFLOW_KEY_DEEPGRAM, language: "de") | {
+ format(width: 80) |
+ file(path: "program-de.txt", mode: "w", type: "text"),
+ deepl(key: env.SPEECHFLOW_KEY_DEEPL, src: "de", dst: "en") | {
+ format(width: 80) |
+ file(path: "program-en.txt", mode: "w", type: "text"),
+ subtitle(format: "vtt") | {
+ file(path: "program-en.vtt", mode: "w", type: "text"),
+ mqtt(url: "mqtt://10.1.0.10:1883",
+ username: env.SPEECHFLOW_MQTT_USER,
+ password: env.SPEECHFLOW_MQTT_PASS,
+ topicWrite: "stream/studio/sender")
+ },
+ subtitle(format: "srt") |
+ file(path: "program-en.srt", mode: "w", type: "text"),
+ elevenlabs(voice: "Mark", speed: 1.05, language: "en") | {
+ wav(mode: "encode") |
+ file(path: "program-en.wav", mode: "w", type: "audio"),
+ device(device: "coreaudio:USBAudio2.0", mode: "w")
+ }
+ }
  }
  }
  ```
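As a usage illustration of the `-c <id>@<yaml-config-file>` form mentioned above: taking the **Narration** example and assuming its entry in sample.yaml carries the hypothetical id `narration`, the trailing CLI arguments are what the expression sees as `argv.0` and `argv.1`:

```
$ speechflow -c narration@sample.yaml talk.mp3 talk.txt
```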
@@ -103,7 +153,30 @@ Processing Graph Examples
  Processing Node Types
  ---------------------

- Currently **SpeechFlow** provides the following processing nodes:
+ First a short overview of the available processing nodes:
+
+ - Input/Output nodes:
+ **file**,
+ **device**,
+ **websocket**,
+ **mqtt**.
+ - Audio-to-Audio nodes:
+ **ffmpeg**,
+ **wav**.
+ - Audio-to-Text nodes:
+ **deepgram**.
+ - Text-to-Text nodes:
+ **deepl**,
+ **gemma**,
+ **opus**,
+ **subtitle**,
+ **format**.
+ - Text-to-Audio nodes:
+ **elevenlabs**.
+ - Any-to-Any nodes:
+ **trace**.
+
+ ### Input/Output Nodes:

  - Node: **file**<br/>
  Purpose: **File and StdIO source/sink**<br/>
@@ -120,9 +193,24 @@ Currently **SpeechFlow** provides the following processing nodes:
  | **mode** | 1 | "r" | `/^(?:r\|w\|rw)$/` |
  | **type** | 2 | "audio" | `/^(?:audio\|text)$/` |

+ - Node: **device**<br/>
+ Purpose: **Microphone/speaker device source/sink**<br/>
+ Example: `device(device: "wasapi:VoiceMeeter Out B1", mode: "r")`
+
+ | Port | Payload |
+ | ------- | ----------- |
+ | input | audio |
+ | output | audio |
+
+ | Parameter | Position | Default | Requirement |
+ | ----------- | --------- | -------- | ------------------ |
+ | **device** | 0 | *none* | `/^(.+?):(.+)$/` |
+ | **mode** | 1 | "rw" | `/^(?:r\|w\|rw)$/` |
+
  - Node: **websocket**<br/>
  Purpose: **WebSocket source/sink**<br/>
- Example: `websocket(connect: "ws://127.0.0.1:12345". type: "text")`
+ Example: `websocket(connect: "ws://127.0.0.1:12345", type: "text")`
+ Notice: this node requires a peer WebSocket service!

  | Port | Payload |
  | ------- | ----------- |
@@ -135,19 +223,24 @@ Currently **SpeechFlow** provides the following processing nodes:
  | **connect** | *none* | *none* | `/^(?:\|ws:\/\/(.+?):(\d+)(?:\/.*)?)$/` |
  | **type** | *none* | "audio" | `/^(?:audio\|text)$/` |

- - Node: **device**<br/>
- Purpose: **Microphone/speaker device source/sink**<br/>
- Example: `device(device: "wasapi:VoiceMeeter Out B1", mode: "r")`
+ - Node: **mqtt**<br/>
+ Purpose: **MQTT sink**<br/>
+ Example: `mqtt(url: "mqtt://127.0.0.1:1883", username: "foo", password: "bar", topic: "quux")`
+ Notice: this node requires a peer MQTT broker!

  | Port | Payload |
  | ------- | ----------- |
- | input | audio |
- | output | audio |
+ | input | text |
+ | output | none |

- | Parameter | Position | Default | Requirement |
- | ----------- | --------- | -------- | ------------------ |
- | **device** | 0 | *none* | `/^(.+?):(.+)$/` |
- | **mode** | 1 | "rw" | `/^(?:r\|w\|rw)$/` |
+ | Parameter | Position | Default | Requirement |
+ | ------------ | --------- | -------- | --------------------- |
+ | **url** | 0 | *none* | `/^(?:\|(?:ws\|mqtt):\/\/(.+?):(\d+))$/` |
+ | **username** | 1 | *none* | `/^.+$/` |
+ | **password** | 2 | *none* | `/^.+$/` |
+ | **topic** | 3 | *none* | `/^.+$/` |
+
+ ### Audio-to-Audio Nodes:

  - Node: **ffmpeg**<br/>
  Purpose: **FFmpeg audio format conversion**<br/>
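Since the **mqtt** node introduced above is a sink only, a peer has to subscribe at the broker to actually consume the text. A minimal consumer sketch, assuming the widely used `mqtt` NPM client package (the package choice, credentials, and topic mirror the node example and are not part of SpeechFlow itself):

```typescript
import mqtt from "mqtt"  /* assumed: the common "mqtt" NPM client */

/* connect with the same credentials the SpeechFlow mqtt() sink uses */
const client = mqtt.connect("mqtt://127.0.0.1:1883", {
    username: "foo",
    password: "bar"
})

/* subscribe to the topic the sink publishes to and print each message */
client.on("connect", () => {
    client.subscribe("quux")
})
client.on("message", (topic, payload) => {
    console.log(`${topic}: ${payload.toString()}`)
})
```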
@@ -163,6 +256,21 @@ Currently **SpeechFlow** provides the following processing nodes:
  | **src** | 0 | "pcm" | `/^(?:pcm\|wav\|mp3\|opus)$/` |
  | **dst** | 1 | "wav" | `/^(?:pcm\|wav\|mp3\|opus)$/` |

+ - Node: **wav**<br/>
+ Purpose: **WAV audio format conversion**<br/>
+ Example: `wav(mode: "encode")`
+
+ | Port | Payload |
+ | ------- | ----------- |
+ | input | audio |
+ | output | audio |
+
+ | Parameter | Position | Default | Requirement |
+ | ----------- | --------- | -------- | ------------------------ |
+ | **mode** | 0 | "encode" | `/^(?:encode\|decode)$/` |
+
+ ### Audio-to-Text Nodes:
+
  - Node: **deepgram**<br/>
  Purpose: **Deepgram Speech-to-Text conversion**<br/>
  Example: `deepgram(language: "de")`<br/>
@@ -176,13 +284,15 @@ Currently **SpeechFlow** provides the following processing nodes:
  | Parameter | Position | Default | Requirement |
  | ------------ | --------- | -------- | ------------------ |
  | **key** | *none* | env.SPEECHFLOW\_KEY\_DEEPGRAM | *none* |
- | **model** | 0 | "nova-2" | *none* |
+ | **model** | 0 | "nova-3" | *none* |
  | **version** | 1 | "latest" | *none* |
- | **language** | 2 | "de" | *none* |
+ | **language** | 2 | "multi" | *none* |
+
+ ### Text-to-Text Nodes:

  - Node: **deepl**<br/>
  Purpose: **DeepL Text-to-Text translation**<br/>
- Example: `deepl(src: "de", dst: "en-US")`<br/>
+ Example: `deepl(src: "de", dst: "en")`<br/>
  Notice: this node requires an API key!

  | Port | Payload |
@@ -193,11 +303,11 @@ Currently **SpeechFlow** provides the following processing nodes:
  | Parameter | Position | Default | Requirement |
  | ------------ | --------- | -------- | ------------------ |
  | **key** | *none* | env.SPEECHFLOW\_KEY\_DEEPL | *none* |
- | **src** | 0 | "de" | `/^(?:de\|en-US)$/` |
- | **dst** | 1 | "en-US" | `/^(?:de\|en-US)$/` |
+ | **src** | 0 | "de" | `/^(?:de\|en)$/` |
+ | **dst** | 1 | "en" | `/^(?:de\|en)$/` |

  - Node: **gemma**<br/>
- Purpose: **Google Gemma Text-to-Text translation**<br/>
+ Purpose: **Google Gemma Text-to-Text translation and spelling correction**<br/>
  Example: `gemma(src: "de", dst: "en")`<br/>
  Notice: this node requires the Ollama API!

@@ -212,6 +322,48 @@ Currently **SpeechFlow** provides the following processing nodes:
  | **src** | 0 | "de" | `/^(?:de\|en)$/` |
  | **dst** | 1 | "en" | `/^(?:de\|en)$/` |

+ - Node: **opus**<br/>
+ Purpose: **OPUS Text-to-Text translation**<br/>
+ Example: `opus(src: "de", dst: "en")`<br/>
+
+ | Port | Payload |
+ | ------- | ----------- |
+ | input | text |
+ | output | text |
+
+ | Parameter | Position | Default | Requirement |
+ | ------------ | --------- | -------- | ---------------- |
+ | **src** | 0 | "de" | `/^(?:de\|en)$/` |
+ | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
+
+ - Node: **subtitle**<br/>
+ Purpose: **SRT/VTT Subtitle Generation**<br/>
+ Example: `subtitle(format: "srt")`<br/>
+
+ | Port | Payload |
+ | ------- | ----------- |
+ | input | text |
+ | output | text |
+
+ | Parameter | Position | Default | Requirement |
+ | ------------ | --------- | -------- | ------------------ |
+ | **format** | *none* | "srt" | `/^(?:srt\|vtt)$/` |
+
+ - Node: **format**<br/>
+ Purpose: **text paragraph formatting**<br/>
+ Example: `format(width: 80)`<br/>
+
+ | Port | Payload |
+ | ------- | ----------- |
+ | input | text |
+ | output | text |
+
+ | Parameter | Position | Default | Requirement |
+ | ------------ | --------- | -------- | --------------------- |
+ | **width** | 0 | 80 | *none* |
+
+ ### Text-to-Audio Nodes:
+
  - Node: **elevenlabs**<br/>
  Purpose: **ElevenLabs Text-to-Speech conversion**<br/>
  Example: `elevenlabs(language: "en")`<br/>
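For orientation on the **subtitle** node's two output formats added above: SRT and WebVTT are standard subtitle formats, differing mainly in the file header and the timestamp decimal separator (`,` vs `.`). Schematically (illustrative cue content, not actual node output), an SRT cue looks like

```
1
00:00:01,000 --> 00:00:03,500
Welcome to the program.
```

while the WebVTT equivalent is

```
WEBVTT

00:00:01.000 --> 00:00:03.500
Welcome to the program.
```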
@@ -228,6 +380,22 @@ Currently **SpeechFlow** provides the following processing nodes:
  | **voice** | 0 | "Brian" | *none* |
  | **language** | 1 | "de" | *none* |

+ ### Any-to-Any Nodes:
+
+ - Node: **trace**<br/>
+ Purpose: **data flow tracing**<br/>
+ Example: `trace(type: "audio")`<br/>
+
+ | Port | Payload |
+ | ------- | ----------- |
+ | input | text, audio |
+ | output | text, audio |
+
+ | Parameter | Position | Default | Requirement |
+ | ------------ | --------- | -------- | --------------------- |
+ | **type** | 0 | "audio" | `/^(?:audio\|text)$/` |
+ | **name** | 1 | *none* | *none* |
+
  Graph Expression Language
  -------------------------

@@ -261,13 +429,18 @@ number-value ::= "0b" /[01]+/
  value ::= "true" | "false" | "null" | "NaN" | "undefined"
  ```

+ **SpeechFlow** makes available to **FlowLink** all **SpeechFlow** nodes as
+ `node`, the CLI arguments under the array variable named `argv`, and all
+ environment variables under the object variable named `env`.
+
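These bindings can be seen together in an ad-hoc `-e` invocation mirroring the **Narration** example from earlier in the README (placeholder key and file names assumed):

```
$ export SPEECHFLOW_KEY_DEEPGRAM="<your-api-key>"
$ speechflow -e 'file(path: argv.0, mode: "r", type: "audio") |
    ffmpeg(src: "mp3", dst: "pcm") |
    deepgram(language: "de", key: env.SPEECHFLOW_KEY_DEEPGRAM) |
    file(path: argv.1, mode: "w", type: "text")' talk.mp3 talk.txt
```

Here `talk.mp3` binds to `argv.0`, `talk.txt` to `argv.1`, and the exported environment variable is reachable as `env.SPEECHFLOW_KEY_DEEPGRAM`.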
  History
  -------

  **Speechflow**, as a technical cut-through, was initially created in
  March 2024 for use in the msg Filmstudio context. It was later refined
  into a more complete toolkit in April 2025 and this way, for the first time,
- could be used in production.
+ could be used in production. It was fully refactored in July 2025 in
+ order to support timestamps in the stream processing.

  Copyright & License
  -------------------
package/dst/speechflow-node-a2a-ffmpeg.d.ts ADDED
@@ -0,0 +1,13 @@
+ import SpeechFlowNode from "./speechflow-node";
+ export default class SpeechFlowNodeFFmpeg extends SpeechFlowNode {
+     static name: string;
+     private ffmpegBinary;
+     private ffmpeg;
+     constructor(id: string, cfg: {
+         [id: string]: any;
+     }, opts: {
+         [id: string]: any;
+     }, args: any[]);
+     open(): Promise<void>;
+     close(): Promise<void>;
+ }
package/dst/speechflow-node-a2a-ffmpeg.js ADDED
@@ -0,0 +1,152 @@
+ "use strict";
+ /*
+ ** SpeechFlow - Speech Processing Flow Graph
+ ** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+ ** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+ */
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+     if (k2 === undefined) k2 = k;
+     var desc = Object.getOwnPropertyDescriptor(m, k);
+     if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+         desc = { enumerable: true, get: function() { return m[k]; } };
+     }
+     Object.defineProperty(o, k2, desc);
+ }) : (function(o, m, k, k2) {
+     if (k2 === undefined) k2 = k;
+     o[k2] = m[k];
+ }));
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+     Object.defineProperty(o, "default", { enumerable: true, value: v });
+ }) : function(o, v) {
+     o["default"] = v;
+ });
+ var __importStar = (this && this.__importStar) || (function () {
+     var ownKeys = function(o) {
+         ownKeys = Object.getOwnPropertyNames || function (o) {
+             var ar = [];
+             for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+             return ar;
+         };
+         return ownKeys(o);
+     };
+     return function (mod) {
+         if (mod && mod.__esModule) return mod;
+         var result = {};
+         if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+         __setModuleDefault(result, mod);
+         return result;
+     };
+ })();
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+     return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ /* standard dependencies */
+ const node_stream_1 = __importDefault(require("node:stream"));
+ /* external dependencies */
+ const ffmpeg_1 = __importDefault(require("@rse/ffmpeg"));
+ const ffmpeg_stream_1 = require("ffmpeg-stream");
+ /* internal dependencies */
+ const speechflow_node_1 = __importDefault(require("./speechflow-node"));
+ const utils = __importStar(require("./speechflow-utils"));
+ /* SpeechFlow node for FFmpeg */
+ class SpeechFlowNodeFFmpeg extends speechflow_node_1.default {
+     /* declare official node name */
+     static name = "ffmpeg";
+     /* internal state */
+     ffmpegBinary = ffmpeg_1.default.supported ? ffmpeg_1.default.binary : "ffmpeg";
+     ffmpeg = null;
+     /* construct node */
+     constructor(id, cfg, opts, args) {
+         super(id, cfg, opts, args);
+         /* declare node configuration parameters */
+         this.configure({
+             src: { type: "string", pos: 0, val: "pcm", match: /^(?:pcm|wav|mp3|opus)$/ },
+             dst: { type: "string", pos: 1, val: "wav", match: /^(?:pcm|wav|mp3|opus)$/ }
+         });
+         /* declare node input/output format */
+         this.input = "audio";
+         this.output = "audio";
+     }
+     /* open node */
+     async open() {
+         /* sanity check situation */
+         if (this.params.src === this.params.dst)
+             throw new Error("source and destination formats should not be the same");
+         /* instantiate FFmpeg sub-process */
+         this.ffmpeg = new ffmpeg_stream_1.Converter(this.ffmpegBinary);
+         const streamInput = this.ffmpeg.createInputStream({
+             /* FFmpeg input options */
+             "fflags": "nobuffer",
+             "flags": "low_delay",
+             "probesize": 32,
+             "analyzeduration": 0,
+             ...(this.params.src === "pcm" ? {
+                 "f": "s16le",
+                 "ar": this.config.audioSampleRate,
+                 "ac": this.config.audioChannels
+             } : {}),
+             ...(this.params.src === "wav" ? {
+                 "f": "wav"
+             } : {}),
+             ...(this.params.src === "mp3" ? {
+                 "f": "mp3"
+             } : {}),
+             ...(this.params.src === "opus" ? {
+                 "f": "opus"
+             } : {})
+         });
+         const streamOutput = this.ffmpeg.createOutputStream({
+             /* FFmpeg output options */
+             "flush_packets": 1,
+             ...(this.params.dst === "pcm" ? {
+                 "c:a": "pcm_s16le",
+                 "ar": this.config.audioSampleRate,
+                 "ac": this.config.audioChannels,
+                 "f": "s16le",
+             } : {}),
+             ...(this.params.dst === "wav" ? {
+                 "f": "wav"
+             } : {}),
+             ...(this.params.dst === "mp3" ? {
+                 "c:a": "libmp3lame",
+                 "b:a": "192k",
+                 "f": "mp3"
+             } : {}),
+             ...(this.params.dst === "opus" ? {
+                 "acodec": "libopus",
+                 "f": "opus"
+             } : {})
+         });
+         this.ffmpeg.run();
+         /* establish a duplex stream and connect it to FFmpeg */
+         this.stream = node_stream_1.default.Duplex.from({
+             writable: streamInput,
+             readable: streamOutput
+         });
+         /* wrap streams with conversions for chunk vs plain audio */
+         const wrapper1 = utils.createTransformStreamForWritableSide();
+         const wrapper2 = utils.createTransformStreamForReadableSide("audio", () => this.timeZero);
+         this.stream = node_stream_1.default.compose(wrapper1, this.stream, wrapper2);
+     }
+     /* close node */
+     async close() {
+         /* close duplex stream */
+         if (this.stream !== null) {
+             await new Promise((resolve) => {
+                 if (this.stream instanceof node_stream_1.default.Duplex)
+                     this.stream.end(() => { resolve(); });
+                 else
+                     resolve();
+             });
+             this.stream.destroy();
+             this.stream = null;
+         }
+         /* shutdown FFmpeg */
+         if (this.ffmpeg !== null) {
+             this.ffmpeg.kill();
+             this.ffmpeg = null;
+         }
+     }
+ }
+ exports.default = SpeechFlowNodeFFmpeg;
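The wiring pattern in `open()` above — `stream.Duplex.from({ writable, readable })` to fuse FFmpeg's input and output streams into one duplex, then `stream.compose()` to sandwich it between chunk-conversion transforms — is plain Node.js stream API (Node ≥ 16.9). A minimal standalone sketch, with trivial transforms standing in for the SpeechFlow wrapper utilities:

```typescript
import stream from "node:stream"

/* inner duplex: a single PassThrough standing in for FFmpeg's stdin/stdout pair */
const inner = new stream.PassThrough()
const duplex = stream.Duplex.from({ writable: inner, readable: inner })

/* stand-ins for createTransformStreamForWritableSide/ReadableSide */
const wrapper1 = new stream.Transform({
    transform (chunk: any, _enc: BufferEncoding, cb: stream.TransformCallback) {
        cb(null, chunk)  /* would convert chunk objects to plain bytes */
    }
})
const wrapper2 = new stream.Transform({
    transform (chunk: any, _enc: BufferEncoding, cb: stream.TransformCallback) {
        cb(null, chunk.toString().toUpperCase())  /* would convert bytes back to chunk objects */
    }
})

/* the composed pipeline behaves like a single duplex, as this.stream does above */
const composed = stream.compose(wrapper1, duplex, wrapper2)
composed.end("ffmpeg-bytes")
composed.on("data", (data) => console.log(data.toString()))  /* "FFMPEG-BYTES" */
```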
package/dst/speechflow-node-a2a-wav.d.ts ADDED
@@ -0,0 +1,11 @@
+ import SpeechFlowNode from "./speechflow-node";
+ export default class SpeechFlowNodeWAV extends SpeechFlowNode {
+     static name: string;
+     constructor(id: string, cfg: {
+         [id: string]: any;
+     }, opts: {
+         [id: string]: any;
+     }, args: any[]);
+     open(): Promise<void>;
+     close(): Promise<void>;
+ }