speechflow 0.9.5 → 0.9.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. package/CHANGELOG.md +24 -0
  2. package/README.md +220 -53
  3. package/dst/speechflow-node-a2a-ffmpeg.d.ts +13 -0
  4. package/dst/speechflow-node-a2a-ffmpeg.js +152 -0
  5. package/dst/speechflow-node-a2a-wav.d.ts +11 -0
  6. package/dst/speechflow-node-a2a-wav.js +170 -0
  7. package/dst/speechflow-node-a2t-deepgram.d.ts +12 -0
  8. package/dst/speechflow-node-a2t-deepgram.js +220 -0
  9. package/dst/speechflow-node-deepgram.d.ts +3 -1
  10. package/dst/speechflow-node-deepgram.js +86 -22
  11. package/dst/speechflow-node-deepl.d.ts +3 -1
  12. package/dst/speechflow-node-deepl.js +25 -20
  13. package/dst/speechflow-node-device.d.ts +3 -1
  14. package/dst/speechflow-node-device.js +53 -2
  15. package/dst/speechflow-node-elevenlabs.d.ts +3 -1
  16. package/dst/speechflow-node-elevenlabs.js +37 -42
  17. package/dst/speechflow-node-ffmpeg.d.ts +3 -1
  18. package/dst/speechflow-node-ffmpeg.js +42 -4
  19. package/dst/speechflow-node-file.d.ts +3 -1
  20. package/dst/speechflow-node-file.js +84 -13
  21. package/dst/speechflow-node-format.d.ts +11 -0
  22. package/dst/speechflow-node-format.js +80 -0
  23. package/dst/speechflow-node-gemma.d.ts +3 -1
  24. package/dst/speechflow-node-gemma.js +84 -23
  25. package/dst/speechflow-node-mqtt.d.ts +13 -0
  26. package/dst/speechflow-node-mqtt.js +181 -0
  27. package/dst/speechflow-node-opus.d.ts +12 -0
  28. package/dst/speechflow-node-opus.js +135 -0
  29. package/dst/speechflow-node-subtitle.d.ts +12 -0
  30. package/dst/speechflow-node-subtitle.js +96 -0
  31. package/dst/speechflow-node-t2a-elevenlabs.d.ts +13 -0
  32. package/dst/speechflow-node-t2a-elevenlabs.js +182 -0
  33. package/dst/speechflow-node-t2t-deepl.d.ts +12 -0
  34. package/dst/speechflow-node-t2t-deepl.js +133 -0
  35. package/dst/speechflow-node-t2t-format.d.ts +11 -0
  36. package/dst/speechflow-node-t2t-format.js +80 -0
  37. package/dst/speechflow-node-t2t-gemma.d.ts +13 -0
  38. package/dst/speechflow-node-t2t-gemma.js +213 -0
  39. package/dst/speechflow-node-t2t-opus.d.ts +12 -0
  40. package/dst/speechflow-node-t2t-opus.js +135 -0
  41. package/dst/speechflow-node-t2t-subtitle.d.ts +12 -0
  42. package/dst/speechflow-node-t2t-subtitle.js +96 -0
  43. package/dst/speechflow-node-trace.d.ts +11 -0
  44. package/dst/speechflow-node-trace.js +88 -0
  45. package/dst/speechflow-node-wav.d.ts +11 -0
  46. package/dst/speechflow-node-wav.js +170 -0
  47. package/dst/speechflow-node-websocket.d.ts +3 -1
  48. package/dst/speechflow-node-websocket.js +149 -49
  49. package/dst/speechflow-node-whisper-common.d.ts +34 -0
  50. package/dst/speechflow-node-whisper-common.js +7 -0
  51. package/dst/speechflow-node-whisper-ggml.d.ts +1 -0
  52. package/dst/speechflow-node-whisper-ggml.js +97 -0
  53. package/dst/speechflow-node-whisper-onnx.d.ts +1 -0
  54. package/dst/speechflow-node-whisper-onnx.js +131 -0
  55. package/dst/speechflow-node-whisper-worker-ggml.d.ts +1 -0
  56. package/dst/speechflow-node-whisper-worker-ggml.js +97 -0
  57. package/dst/speechflow-node-whisper-worker-onnx.d.ts +1 -0
  58. package/dst/speechflow-node-whisper-worker-onnx.js +131 -0
  59. package/dst/speechflow-node-whisper-worker.d.ts +1 -0
  60. package/dst/speechflow-node-whisper-worker.js +116 -0
  61. package/dst/speechflow-node-whisper-worker2.d.ts +1 -0
  62. package/dst/speechflow-node-whisper-worker2.js +82 -0
  63. package/dst/speechflow-node-whisper.d.ts +19 -0
  64. package/dst/speechflow-node-whisper.js +604 -0
  65. package/dst/speechflow-node-x2x-trace.d.ts +11 -0
  66. package/dst/speechflow-node-x2x-trace.js +88 -0
  67. package/dst/speechflow-node-xio-device.d.ts +13 -0
  68. package/dst/speechflow-node-xio-device.js +205 -0
  69. package/dst/speechflow-node-xio-file.d.ts +11 -0
  70. package/dst/speechflow-node-xio-file.js +176 -0
  71. package/dst/speechflow-node-xio-mqtt.d.ts +13 -0
  72. package/dst/speechflow-node-xio-mqtt.js +181 -0
  73. package/dst/speechflow-node-xio-websocket.d.ts +13 -0
  74. package/dst/speechflow-node-xio-websocket.js +275 -0
  75. package/dst/speechflow-node.d.ts +24 -6
  76. package/dst/speechflow-node.js +63 -6
  77. package/dst/speechflow-utils.d.ts +23 -0
  78. package/dst/speechflow-utils.js +194 -0
  79. package/dst/speechflow.js +146 -43
  80. package/etc/biome.jsonc +12 -4
  81. package/etc/speechflow.bat +6 -0
  82. package/etc/speechflow.sh +5 -0
  83. package/etc/speechflow.yaml +71 -0
  84. package/etc/stx.conf +65 -0
  85. package/package.d/@ericedouard+vad-node-realtime+0.2.0.patch +18 -0
  86. package/package.json +49 -31
  87. package/src/lib.d.ts +6 -1
  88. package/src/{speechflow-node-ffmpeg.ts → speechflow-node-a2a-ffmpeg.ts} +10 -4
  89. package/src/speechflow-node-a2a-wav.ts +143 -0
  90. package/src/speechflow-node-a2t-deepgram.ts +199 -0
  91. package/src/{speechflow-node-elevenlabs.ts → speechflow-node-t2a-elevenlabs.ts} +38 -45
  92. package/src/{speechflow-node-deepl.ts → speechflow-node-t2t-deepl.ts} +36 -25
  93. package/src/speechflow-node-t2t-format.ts +85 -0
  94. package/src/{speechflow-node-gemma.ts → speechflow-node-t2t-gemma.ts} +89 -25
  95. package/src/speechflow-node-t2t-opus.ts +111 -0
  96. package/src/speechflow-node-t2t-subtitle.ts +101 -0
  97. package/src/speechflow-node-x2x-trace.ts +92 -0
  98. package/src/{speechflow-node-device.ts → speechflow-node-xio-device.ts} +25 -3
  99. package/src/speechflow-node-xio-file.ts +153 -0
  100. package/src/speechflow-node-xio-mqtt.ts +154 -0
  101. package/src/speechflow-node-xio-websocket.ts +248 -0
  102. package/src/speechflow-node.ts +63 -6
  103. package/src/speechflow-utils.ts +212 -0
  104. package/src/speechflow.ts +150 -43
  105. package/etc/nps.yaml +0 -40
  106. package/sample.yaml +0 -39
  107. package/src/speechflow-node-deepgram.ts +0 -133
  108. package/src/speechflow-node-file.ts +0 -108
  109. package/src/speechflow-node-websocket.ts +0 -179
package/CHANGELOG.md ADDED
@@ -0,0 +1,24 @@
+
+ ChangeLog
+ =========
+
+ 0.9.8 (2025-07-12)
+ ------------------
+
+ - CLEANUP: provide start scripts and move config to sub-directory
+
+ 0.9.7 (2025-07-12)
+ ------------------
+
+ - IMPROVEMENT: replace "nps" with "stx" for NPM scripting
+
+ 0.9.6 (2025-07-12)
+ ------------------
+
+ - IMPROVEMENT: major refactoring to object-mode streaming for supporting timestamps
+
+ 0.9.5 (2025-04-27)
+ ------------------
+
+ (first rough cut of program)
+
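The 0.9.6 entry above is the change driving most of the file churn in this release: node-to-node streams now run in Node's object mode, passing wrapped chunks instead of raw `Buffer`s, so timing metadata can travel alongside the payload. A minimal sketch of that idiom — the chunk field names here are illustrative assumptions only, the actual shape is defined in `speechflow-utils`:

```
import stream from "node:stream"

/* hypothetical chunk shape: payload plus timing metadata
   (illustrative only; the real shape lives in speechflow-utils) */
interface Chunk {
    timestampStart: number          /* ms relative to stream start */
    timestampEnd:   number
    payload:        Buffer | string
}

/* an object-mode pass-through which may transform the payload
   while leaving the timestamps intact for downstream nodes
   (e.g. the subtitle node, which needs them for SRT/VTT cue times) */
const passThrough = new stream.Transform({
    readableObjectMode: true,
    writableObjectMode: true,
    transform (chunk: Chunk, _encoding, callback) {
        callback(null, { ...chunk, payload: chunk.payload })
    }
})
```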
package/README.md CHANGED
@@ -18,13 +18,24 @@ About
  directed data flow graph of audio and text processing nodes. This way,
  it allows to perform various speech processing tasks in a flexible way.
 
- **SpeechFlow** comes with built-in graph nodes for local file I/O, local audio
- device I/O, local/remote WebSocket network I/O, cloud-based [Deepgram](https://deepgram.com)
- speech-to-text conversion, cloud-based [DeepL](https://deepl.com) text-to-text
- translation, local [Gemma/Ollama](https://ollama.com/library/gemma3)
- text-to-text translation, cloud-based [ElevenLabs](https://elevenlabs.io/)
- text-to-speech conversion, and local [FFmpeg](https://ffmpeg.org/)
- speech-to-speech encoding. Additional SpeechFlow graph nodes can be provided externally
+ **SpeechFlow** comes with built-in graph nodes for
+ local file I/O,
+ local audio device I/O,
+ remote WebSocket network I/O,
+ remote MQTT network I/O,
+ cloud-based [Deepgram](https://deepgram.com) speech-to-text conversion,
+ cloud-based [ElevenLabs](https://elevenlabs.io/) text-to-speech conversion,
+ cloud-based [DeepL](https://deepl.com) text-to-text translation,
+ local [Gemma/Ollama](https://ollama.com/library/gemma3) text-to-text translation,
+ local [Gemma/Ollama](https://ollama.com/library/gemma3) text-to-text spelling correction,
+ local [OPUS/ONNX](https://github.com/Helsinki-NLP/Opus-MT) text-to-text translation,
+ local [FFmpeg](https://ffmpeg.org/) speech-to-speech encoding,
+ local WAV speech-to-speech encoding,
+ local text-to-text formatting,
+ local text-to-text subtitle generation, and
+ local text or audio tracing.
+
+ Additional **SpeechFlow** graph nodes can be provided externally
  by NPM packages named `speechflow-node-xxx` which expose a class
  derived from the exported `SpeechFlowNode` class of the `speechflow` package.
 
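As the hunk above states, external nodes are NPM packages named `speechflow-node-xxx` exporting a `SpeechFlowNode` subclass. A minimal sketch of such a package's main module, following the subclass shape visible in the `.d.ts` files later in this diff (`static name`, four-argument constructor, `open()`/`close()`); the node name `upper`, the default-export import form, and the chunk's `payload` field are assumptions for illustration:

```
import stream from "node:stream"
/* assumption: SpeechFlowNode is the default export of "speechflow" */
import SpeechFlowNode from "speechflow"

export default class SpeechFlowNodeUpper extends SpeechFlowNode {
    /* official node name, usable in graph expressions as upper() */
    static name = "upper"

    constructor (id: string, cfg: { [ id: string ]: any },
        opts: { [ id: string ]: any }, args: any[]) {
        super(id, cfg, opts, args)
        this.input  = "text"
        this.output = "text"
    }

    /* open node: provide the object-mode stream the graph wires up */
    async open () {
        this.stream = new stream.Transform({
            readableObjectMode: true,
            writableObjectMode: true,
            transform (chunk: any, _encoding, callback) {
                /* pass the chunk through, uppercasing its text payload */
                chunk.payload = String(chunk.payload).toUpperCase()
                callback(null, chunk)
            }
        })
    }

    /* close node: release the stream again */
    async close () {
        this.stream?.destroy()
        this.stream = null
    }
}
```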
@@ -47,60 +58,93 @@ $ speechflow
  [-V|--version]
  [-v|--verbose <level>]
  [-e|--expression <expression>]
- [-f|--expression-file <expression-file>]
- [-c|--config <key>@<yaml-config-file>]
+ [-f|--file <file>]
+ [-c|--config <id>@<yaml-config-file>]
  [<argument> [...]]
  ```
 
  Processing Graph Examples
  -------------------------
 
- - Capture audio from microphone to file:
+ The following are examples of **SpeechFlow** processing graphs.
+ They can also be found in the sample [speechflow.yaml](./etc/speechflow.yaml) file.
+
+ - **Capturing**: Capture audio from microphone device into WAV audio file:
 
  ```
  device(device: "wasapi:VoiceMeeter Out B1", mode: "r") |
- file(path: "capture.pcm", mode: "w", type: "audio")
+ wav(mode: "encode") |
+ file(path: "capture.wav", mode: "w", type: "audio")
  ```
 
- - Generate audio file with narration of text file:
+ - **Pass-Through**: Pass-through audio from microphone device to speaker
+ device and in parallel record it to WAV audio file:
+
+ ```
+ device(device: "wasapi:VoiceMeeter Out B1", mode: "r") | {
+     wav(mode: "encode") |
+     file(path: "capture.wav", mode: "w", type: "audio"),
+     device(device: "wasapi:VoiceMeeter VAIO3 Input", mode: "w")
+ }
+ ```
+
+ - **Narration**: Generate text file with German narration of MP3 audio file:
 
  ```
  file(path: argv.0, mode: "r", type: "audio") |
- deepgram(language: "en") |
- file(path: argv.1, mode: "w", type: "text")
+ ffmpeg(src: "mp3", dst: "pcm") |
+ deepgram(language: "de", key: env.SPEECHFLOW_KEY_DEEPGRAM) |
+ format(width: 80) |
+ file(path: argv.1, mode: "w", type: "text")
  ```
 
- - Translate stdin to stdout:
+ - **Subtitling**: Generate text file with German subtitles of MP3 audio file:
 
  ```
- file(path: "-", mode: "r", type: "text") |
- deepl(src: "de", dst: "en-US") |
- file(path: "-", mode: "w", type: "text")
+ file(path: argv.0, mode: "r", type: "audio") |
+ ffmpeg(src: "mp3", dst: "pcm") |
+ deepgram(language: "de", key: env.SPEECHFLOW_KEY_DEEPGRAM) |
+ subtitle(format: "vtt") |
+ file(path: argv.1, mode: "w", type: "text")
  ```
 
- - Pass-through audio from microphone to speaker and in parallel record it to file:
+ - **Ad-Hoc Translation**: Ad-Hoc text translation from German to English
+ via stdin/stdout:
 
  ```
- device(device: "wasapi:VoiceMeeter Out B1", mode: "r") | {
-     file(path: "capture.pcm", mode: "w", type: "audio"),
-     device(device: "wasapi:VoiceMeeter VAIO3 Input", mode: "w")
- }
+ file(path: "-", mode: "r", type: "text") |
+ deepl(src: "de", dst: "en") |
+ file(path: "-", mode: "w", type: "text")
  ```
 
- - Real-time translation from german to english, including capturing of all inputs and outputs:
+ - **Studio Translation**: Real-time studio translation from German to English,
+ including the capturing of all involved inputs and outputs:
 
  ```
- device(device: "wasapi:VoiceMeeter Out B1", mode: "r") | {
-     file(path: "translation-audio-de.pcm", mode: "w", type: "audio"),
-     deepgram(language: "de") |
-     file(path: "translation-text-de.txt", mode: "w", type: "text")
- } | {
-     deepl(src: "de", dst: "en-US") |
-     file(path: "translation-text-en.txt", mode: "w", type: "text")
- } | {
-     elevenlabs(language: "en") | {
-         file(path: "translation-audio-en.pcm", mode: "w", type: "audio"),
-         device(device: "wasapi:VoiceMeeter VAIO3 Input", mode: "w")
+ device(device: "coreaudio:Elgato Wave:3", mode: "r") | {
+     wav(mode: "encode") |
+     file(path: "program-de.wav", mode: "w", type: "audio"),
+     deepgram(key: env.SPEECHFLOW_KEY_DEEPGRAM, language: "de") | {
+         format(width: 80) |
+         file(path: "program-de.txt", mode: "w", type: "text"),
+         deepl(key: env.SPEECHFLOW_KEY_DEEPL, src: "de", dst: "en") | {
+             format(width: 80) |
+             file(path: "program-en.txt", mode: "w", type: "text"),
+             subtitle(format: "vtt") | {
+                 file(path: "program-en.vtt", mode: "w", type: "text"),
+                 mqtt(url: "mqtt://10.1.0.10:1883",
+                     username: env.SPEECHFLOW_MQTT_USER,
+                     password: env.SPEECHFLOW_MQTT_PASS,
+                     topicWrite: "stream/studio/sender")
+             },
+             subtitle(format: "srt") |
+             file(path: "program-en.srt", mode: "w", type: "text"),
+             elevenlabs(voice: "Mark", speed: 1.05, language: "en") | {
+                 wav(mode: "encode") |
+                 file(path: "program-en.wav", mode: "w", type: "audio"),
+                 device(device: "coreaudio:USBAudio2.0", mode: "w")
+             }
+         }
  }
  }
  ```
@@ -108,7 +152,30 @@ Processing Graph Examples
  Processing Node Types
  ---------------------
 
- Currently **SpeechFlow** provides the following processing nodes:
+ First a short overview of the available processing nodes:
+
+ - Input/Output nodes:
+   **file**,
+   **device**,
+   **websocket**,
+   **mqtt**.
+ - Audio-to-Audio nodes:
+   **ffmpeg**,
+   **wav**.
+ - Audio-to-Text nodes:
+   **deepgram**.
+ - Text-to-Text nodes:
+   **deepl**,
+   **gemma**,
+   **opus**,
+   **subtitle**,
+   **format**.
+ - Text-to-Audio nodes:
+   **elevenlabs**.
+ - Any-to-Any nodes:
+   **trace**.
+
+ ### Input/Output Nodes:
 
  - Node: **file**<br/>
  Purpose: **File and StdIO source/sink**<br/>
@@ -125,9 +192,24 @@ Currently **SpeechFlow** provides the following processing nodes:
  | **mode** | 1 | "r" | `/^(?:r\|w\|rw)$/` |
  | **type** | 2 | "audio" | `/^(?:audio\|text)$/` |
 
+ - Node: **device**<br/>
+ Purpose: **Microphone/speaker device source/sink**<br/>
+ Example: `device(device: "wasapi:VoiceMeeter Out B1", mode: "r")`
+
+ | Port | Payload |
+ | ------- | ----------- |
+ | input | audio |
+ | output | audio |
+
+ | Parameter | Position | Default | Requirement |
+ | ----------- | --------- | -------- | ------------------ |
+ | **device** | 0 | *none* | `/^(.+?):(.+)$/` |
+ | **mode** | 1 | "rw" | `/^(?:r\|w\|rw)$/` |
+
  - Node: **websocket**<br/>
  Purpose: **WebSocket source/sink**<br/>
- Example: `websocket(connect: "ws://127.0.0.1:12345". type: "text")`
+ Example: `websocket(connect: "ws://127.0.0.1:12345", type: "text")`
+ Notice: this node requires a peer WebSocket service!
 
  | Port | Payload |
  | ------- | ----------- |
@@ -140,19 +222,24 @@ Currently **SpeechFlow** provides the following processing nodes:
  | **connect** | *none* | *none* | `/^(?:\|ws:\/\/(.+?):(\d+)(?:\/.*)?)$/` |
  | **type** | *none* | "audio" | `/^(?:audio\|text)$/` |
 
- - Node: **device**<br/>
- Purpose: **Microphone/speaker device source/sink**<br/>
- Example: `device(device: "wasapi:VoiceMeeter Out B1", mode: "r")`
+ - Node: **mqtt**<br/>
+ Purpose: **MQTT sink**<br/>
+ Example: `mqtt(url: "mqtt://127.0.0.1:1883", username: "foo", password: "bar", topic: "quux")`
+ Notice: this node requires a peer MQTT broker!
 
  | Port | Payload |
  | ------- | ----------- |
- | input | audio |
- | output | audio |
+ | input | text |
+ | output | none |
 
- | Parameter | Position | Default | Requirement |
- | ----------- | --------- | -------- | ------------------ |
- | **device** | 0 | *none* | `/^(.+?):(.+)$/` |
- | **mode** | 1 | "rw" | `/^(?:r\|w\|rw)$/` |
+ | Parameter | Position | Default | Requirement |
+ | ------------ | --------- | -------- | --------------------- |
+ | **url** | 0 | *none* | `/^(?:\|(?:ws\|mqtt):\/\/(.+?):(\d+))$/` |
+ | **username** | 1 | *none* | `/^.+$/` |
+ | **password** | 2 | *none* | `/^.+$/` |
+ | **topic** | 3 | *none* | `/^.+$/` |
+
+ ### Audio-to-Audio Nodes:
 
  - Node: **ffmpeg**<br/>
  Purpose: **FFmpeg audio format conversion**<br/>
@@ -168,6 +255,21 @@ Currently **SpeechFlow** provides the following processing nodes:
  | **src** | 0 | "pcm" | `/^(?:pcm\|wav\|mp3\|opus)$/` |
  | **dst** | 1 | "wav" | `/^(?:pcm\|wav\|mp3\|opus)$/` |
 
+ - Node: **wav**<br/>
+ Purpose: **WAV audio format conversion**<br/>
+ Example: `wav(mode: "encode")`
+
+ | Port | Payload |
+ | ------- | ----------- |
+ | input | audio |
+ | output | audio |
+
+ | Parameter | Position | Default | Requirement |
+ | ----------- | --------- | -------- | ------------------------ |
+ | **mode** | 0 | "encode" | `/^(?:encode\|decode)$/` |
+
+ ### Audio-to-Text Nodes:
+
  - Node: **deepgram**<br/>
  Purpose: **Deepgram Speech-to-Text conversion**<br/>
  Example: `deepgram(language: "de")`<br/>
@@ -181,13 +283,15 @@ Currently **SpeechFlow** provides the following processing nodes:
  | Parameter | Position | Default | Requirement |
  | ------------ | --------- | -------- | ------------------ |
  | **key** | *none* | env.SPEECHFLOW\_KEY\_DEEPGRAM | *none* |
- | **model** | 0 | "nova-2" | *none* |
+ | **model** | 0 | "nova-3" | *none* |
  | **version** | 1 | "latest" | *none* |
- | **language** | 2 | "de" | *none* |
+ | **language** | 2 | "multi" | *none* |
+
+ ### Text-to-Text Nodes:
 
  - Node: **deepl**<br/>
  Purpose: **DeepL Text-to-Text translation**<br/>
- Example: `deepl(src: "de", dst: "en-US")`<br/>
+ Example: `deepl(src: "de", dst: "en")`<br/>
  Notice: this node requires an API key!
 
  | Port | Payload |
@@ -198,11 +302,11 @@ Currently **SpeechFlow** provides the following processing nodes:
  | Parameter | Position | Default | Requirement |
  | ------------ | --------- | -------- | ------------------ |
  | **key** | *none* | env.SPEECHFLOW\_KEY\_DEEPL | *none* |
- | **src** | 0 | "de" | `/^(?:de\|en-US)$/` |
- | **dst** | 1 | "en-US" | `/^(?:de\|en-US)$/` |
+ | **src** | 0 | "de" | `/^(?:de\|en)$/` |
+ | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
 
  - Node: **gemma**<br/>
- Purpose: **Google Gemma Text-to-Text translation**<br/>
+ Purpose: **Google Gemma Text-to-Text translation and spelling correction**<br/>
  Example: `gemma(src: "de", dst: "en")`<br/>
  Notice: this node requires the Ollama API!
 
@@ -217,6 +321,48 @@ Currently **SpeechFlow** provides the following processing nodes:
  | **src** | 0 | "de" | `/^(?:de\|en)$/` |
  | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
 
+ - Node: **opus**<br/>
+ Purpose: **OPUS Text-to-Text translation**<br/>
+ Example: `opus(src: "de", dst: "en")`<br/>
+
+ | Port | Payload |
+ | ------- | ----------- |
+ | input | text |
+ | output | text |
+
+ | Parameter | Position | Default | Requirement |
+ | ------------ | --------- | -------- | ---------------- |
+ | **src** | 0 | "de" | `/^(?:de\|en)$/` |
+ | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
+
+ - Node: **subtitle**<br/>
+ Purpose: **SRT/VTT Subtitle Generation**<br/>
+ Example: `subtitle(format: "srt")`<br/>
+
+ | Port | Payload |
+ | ------- | ----------- |
+ | input | text |
+ | output | text |
+
+ | Parameter | Position | Default | Requirement |
+ | ------------ | --------- | -------- | ------------------ |
+ | **format** | *none* | "srt" | `/^(?:srt\|vtt)$/` |
+
+ - Node: **format**<br/>
+ Purpose: **text paragraph formatting**<br/>
+ Example: `format(width: 80)`<br/>
+
+ | Port | Payload |
+ | ------- | ----------- |
+ | input | text |
+ | output | text |
+
+ | Parameter | Position | Default | Requirement |
+ | ------------ | --------- | -------- | --------------------- |
+ | **width** | 0 | 80 | *none* |
+
+ ### Text-to-Audio Nodes:
+
  - Node: **elevenlabs**<br/>
  Purpose: **ElevenLabs Text-to-Speech conversion**<br/>
  Example: `elevenlabs(language: "en")`<br/>
@@ -233,6 +379,22 @@ Currently **SpeechFlow** provides the following processing nodes:
  | **voice** | 0 | "Brian" | *none* |
  | **language** | 1 | "de" | *none* |
 
+ ### Any-to-Any Nodes:
+
+ - Node: **trace**<br/>
+ Purpose: **data flow tracing**<br/>
+ Example: `trace(type: "audio")`<br/>
+
+ | Port | Payload |
+ | ------- | ----------- |
+ | input | text, audio |
+ | output | text, audio |
+
+ | Parameter | Position | Default | Requirement |
+ | ------------ | --------- | -------- | --------------------- |
+ | **type** | 0 | "audio" | `/^(?:audio\|text)$/` |
+ | **name** | 1 | *none* | *none* |
+
  Graph Expression Language
  -------------------------
 
@@ -266,13 +428,18 @@ number-value ::= "0b" /[01]+/
  value ::= "true" | "false" | "null" | "NaN" | "undefined"
  ```
 
+ **SpeechFlow** makes available to **FlowLink** all **SpeechFlow** nodes as
+ `node`, the CLI arguments under the array `variable` named `argv`, and all
+ environment variables under the object `variable` named `env`.
+
  History
  -------
 
  **Speechflow**, as a technical cut-through, was initially created in
  March 2024 for use in the msg Filmstudio context. It was later refined
  into a more complete toolkit in April 2025 and this way the first time
- could be used in production.
+ could be used in production. It was fully refactored in July 2025 in
+ order to support timestamps in the streams processing.
 
  Copyright & License
  -------------------
package/dst/speechflow-node-a2a-ffmpeg.d.ts ADDED
@@ -0,0 +1,13 @@
+ import SpeechFlowNode from "./speechflow-node";
+ export default class SpeechFlowNodeFFmpeg extends SpeechFlowNode {
+     static name: string;
+     private ffmpegBinary;
+     private ffmpeg;
+     constructor(id: string, cfg: {
+         [id: string]: any;
+     }, opts: {
+         [id: string]: any;
+     }, args: any[]);
+     open(): Promise<void>;
+     close(): Promise<void>;
+ }
package/dst/speechflow-node-a2a-ffmpeg.js ADDED
@@ -0,0 +1,152 @@
+ "use strict";
+ /*
+ ** SpeechFlow - Speech Processing Flow Graph
+ ** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+ ** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+ */
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+     if (k2 === undefined) k2 = k;
+     var desc = Object.getOwnPropertyDescriptor(m, k);
+     if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+         desc = { enumerable: true, get: function() { return m[k]; } };
+     }
+     Object.defineProperty(o, k2, desc);
+ }) : (function(o, m, k, k2) {
+     if (k2 === undefined) k2 = k;
+     o[k2] = m[k];
+ }));
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+     Object.defineProperty(o, "default", { enumerable: true, value: v });
+ }) : function(o, v) {
+     o["default"] = v;
+ });
+ var __importStar = (this && this.__importStar) || (function () {
+     var ownKeys = function(o) {
+         ownKeys = Object.getOwnPropertyNames || function (o) {
+             var ar = [];
+             for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+             return ar;
+         };
+         return ownKeys(o);
+     };
+     return function (mod) {
+         if (mod && mod.__esModule) return mod;
+         var result = {};
+         if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+         __setModuleDefault(result, mod);
+         return result;
+     };
+ })();
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+     return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ /* standard dependencies */
+ const node_stream_1 = __importDefault(require("node:stream"));
+ /* external dependencies */
+ const ffmpeg_1 = __importDefault(require("@rse/ffmpeg"));
+ const ffmpeg_stream_1 = require("ffmpeg-stream");
+ /* internal dependencies */
+ const speechflow_node_1 = __importDefault(require("./speechflow-node"));
+ const utils = __importStar(require("./speechflow-utils"));
+ /* SpeechFlow node for FFmpeg */
+ class SpeechFlowNodeFFmpeg extends speechflow_node_1.default {
+     /* declare official node name */
+     static name = "ffmpeg";
+     /* internal state */
+     ffmpegBinary = ffmpeg_1.default.supported ? ffmpeg_1.default.binary : "ffmpeg";
+     ffmpeg = null;
+     /* construct node */
+     constructor(id, cfg, opts, args) {
+         super(id, cfg, opts, args);
+         /* declare node configuration parameters */
+         this.configure({
+             src: { type: "string", pos: 0, val: "pcm", match: /^(?:pcm|wav|mp3|opus)$/ },
+             dst: { type: "string", pos: 1, val: "wav", match: /^(?:pcm|wav|mp3|opus)$/ }
+         });
+         /* declare node input/output format */
+         this.input = "audio";
+         this.output = "audio";
+     }
+     /* open node */
+     async open() {
+         /* sanity check situation */
+         if (this.params.src === this.params.dst)
+             throw new Error("source and destination formats should not be the same");
+         /* instantiate FFmpeg sub-process */
+         this.ffmpeg = new ffmpeg_stream_1.Converter(this.ffmpegBinary);
+         const streamInput = this.ffmpeg.createInputStream({
+             /* FFmpeg input options */
+             "fflags": "nobuffer",
+             "flags": "low_delay",
+             "probesize": 32,
+             "analyzeduration": 0,
+             ...(this.params.src === "pcm" ? {
+                 "f": "s16le",
+                 "ar": this.config.audioSampleRate,
+                 "ac": this.config.audioChannels
+             } : {}),
+             ...(this.params.src === "wav" ? {
+                 "f": "wav"
+             } : {}),
+             ...(this.params.src === "mp3" ? {
+                 "f": "mp3"
+             } : {}),
+             ...(this.params.src === "opus" ? {
+                 "f": "opus"
+             } : {})
+         });
+         const streamOutput = this.ffmpeg.createOutputStream({
+             /* FFmpeg output options */
+             "flush_packets": 1,
+             ...(this.params.dst === "pcm" ? {
+                 "c:a": "pcm_s16le",
+                 "ar": this.config.audioSampleRate,
+                 "ac": this.config.audioChannels,
+                 "f": "s16le",
+             } : {}),
+             ...(this.params.dst === "wav" ? {
+                 "f": "wav"
+             } : {}),
+             ...(this.params.dst === "mp3" ? {
+                 "c:a": "libmp3lame",
+                 "b:a": "192k",
+                 "f": "mp3"
+             } : {}),
+             ...(this.params.dst === "opus" ? {
+                 "acodec": "libopus",
+                 "f": "opus"
+             } : {})
+         });
+         this.ffmpeg.run();
+         /* establish a duplex stream and connect it to FFmpeg */
+         this.stream = node_stream_1.default.Duplex.from({
+             writable: streamInput,
+             readable: streamOutput
+         });
+         /* wrap streams with conversions for chunk vs plain audio */
+         const wrapper1 = utils.createTransformStreamForWritableSide();
+         const wrapper2 = utils.createTransformStreamForReadableSide("audio", () => this.timeZero);
+         this.stream = node_stream_1.default.compose(wrapper1, this.stream, wrapper2);
+     }
+     /* close node */
+     async close() {
+         /* close duplex stream */
+         if (this.stream !== null) {
+             await new Promise((resolve) => {
+                 if (this.stream instanceof node_stream_1.default.Duplex)
+                     this.stream.end(() => { resolve(); });
+                 else
+                     resolve();
+             });
+             this.stream.destroy();
+             this.stream = null;
+         }
+         /* shutdown FFmpeg */
+         if (this.ffmpeg !== null) {
+             this.ffmpeg.kill();
+             this.ffmpeg = null;
+         }
+     }
+ }
+ exports.default = SpeechFlowNodeFFmpeg;
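The `open()` method above shows the wiring pattern the 0.9.6 refactoring uses throughout: a sub-process's input and output streams are fused into one duplex with `stream.Duplex.from({ writable, readable })`, then framed by object-mode wrappers with `stream.compose()`. A self-contained sketch of that pattern, with `cat` as a hypothetical stand-in for the FFmpeg process and simplified wrappers (the real ones live in `speechflow-utils`):

```
import stream from "node:stream"
import child_process from "node:child_process"

/* spawn the external conversion engine
   ("cat" is a hypothetical stand-in for the ffmpeg binary) */
const proc = child_process.spawn("cat")

/* fuse stdin/stdout into one Duplex: writes feed the process,
   reads deliver its output */
const engine = stream.Duplex.from({
    writable: proc.stdin!,
    readable: proc.stdout!
})

/* unwrap object-mode chunks into plain buffers for the engine... */
const unwrap = new stream.Transform({
    writableObjectMode: true,
    transform (chunk: any, _encoding, callback) {
        callback(null, chunk.payload)
    }
})

/* ...and wrap the engine's plain buffers back into chunks */
const wrap = new stream.Transform({
    readableObjectMode: true,
    transform (payload: Buffer, _encoding, callback) {
        callback(null, { payload })
    }
})

/* compose everything into a single object-mode duplex stream */
const converter = stream.compose(unwrap, engine, wrap)
```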
package/dst/speechflow-node-a2a-wav.d.ts ADDED
@@ -0,0 +1,11 @@
+ import SpeechFlowNode from "./speechflow-node";
+ export default class SpeechFlowNodeWAV extends SpeechFlowNode {
+     static name: string;
+     constructor(id: string, cfg: {
+         [id: string]: any;
+     }, opts: {
+         [id: string]: any;
+     }, args: any[]);
+     open(): Promise<void>;
+     close(): Promise<void>;
+ }