speechflow 1.7.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. package/CHANGELOG.md +23 -0
  2. package/README.md +425 -146
  3. package/etc/claude.md +5 -5
  4. package/etc/speechflow.yaml +2 -2
  5. package/package.json +3 -3
  6. package/speechflow-cli/dst/speechflow-main-api.js +6 -5
  7. package/speechflow-cli/dst/speechflow-main-api.js.map +1 -1
  8. package/speechflow-cli/dst/speechflow-main-graph.d.ts +1 -0
  9. package/speechflow-cli/dst/speechflow-main-graph.js +35 -13
  10. package/speechflow-cli/dst/speechflow-main-graph.js.map +1 -1
  11. package/speechflow-cli/dst/speechflow-main-status.js +3 -7
  12. package/speechflow-cli/dst/speechflow-main-status.js.map +1 -1
  13. package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js +3 -0
  14. package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js.map +1 -1
  15. package/speechflow-cli/dst/speechflow-node-a2a-compressor.js +4 -2
  16. package/speechflow-cli/dst/speechflow-node-a2a-compressor.js.map +1 -1
  17. package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js +1 -1
  18. package/speechflow-cli/dst/speechflow-node-a2a-expander.js +4 -2
  19. package/speechflow-cli/dst/speechflow-node-a2a-expander.js.map +1 -1
  20. package/speechflow-cli/dst/speechflow-node-a2a-gender.js +2 -2
  21. package/speechflow-cli/dst/speechflow-node-a2a-gender.js.map +1 -1
  22. package/speechflow-cli/dst/speechflow-node-a2a-pitch.js +1 -2
  23. package/speechflow-cli/dst/speechflow-node-a2a-pitch.js.map +1 -1
  24. package/speechflow-cli/dst/speechflow-node-a2a-wav.js +32 -5
  25. package/speechflow-cli/dst/speechflow-node-a2a-wav.js.map +1 -1
  26. package/speechflow-cli/dst/speechflow-node-a2t-amazon.d.ts +0 -1
  27. package/speechflow-cli/dst/speechflow-node-a2t-amazon.js +1 -6
  28. package/speechflow-cli/dst/speechflow-node-a2t-amazon.js.map +1 -1
  29. package/speechflow-cli/dst/speechflow-node-a2t-deepgram.d.ts +0 -1
  30. package/speechflow-cli/dst/speechflow-node-a2t-deepgram.js +9 -9
  31. package/speechflow-cli/dst/speechflow-node-a2t-deepgram.js.map +1 -1
  32. package/speechflow-cli/dst/speechflow-node-a2t-google.d.ts +17 -0
  33. package/speechflow-cli/dst/speechflow-node-a2t-google.js +320 -0
  34. package/speechflow-cli/dst/speechflow-node-a2t-google.js.map +1 -0
  35. package/speechflow-cli/dst/speechflow-node-a2t-openai.js +6 -4
  36. package/speechflow-cli/dst/speechflow-node-a2t-openai.js.map +1 -1
  37. package/speechflow-cli/dst/speechflow-node-t2a-amazon.js +6 -11
  38. package/speechflow-cli/dst/speechflow-node-t2a-amazon.js.map +1 -1
  39. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js +6 -5
  40. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js.map +1 -1
  41. package/speechflow-cli/dst/speechflow-node-t2a-google.d.ts +15 -0
  42. package/speechflow-cli/dst/speechflow-node-t2a-google.js +218 -0
  43. package/speechflow-cli/dst/speechflow-node-t2a-google.js.map +1 -0
  44. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.d.ts +2 -0
  45. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js +19 -6
  46. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js.map +1 -1
  47. package/speechflow-cli/dst/speechflow-node-t2a-openai.d.ts +15 -0
  48. package/speechflow-cli/dst/speechflow-node-t2a-openai.js +195 -0
  49. package/speechflow-cli/dst/speechflow-node-t2a-openai.js.map +1 -0
  50. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.d.ts +17 -0
  51. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js +608 -0
  52. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js.map +1 -0
  53. package/speechflow-cli/dst/speechflow-node-t2t-amazon.js.map +1 -1
  54. package/speechflow-cli/dst/{speechflow-node-t2t-transformers.d.ts → speechflow-node-t2t-opus.d.ts} +1 -3
  55. package/speechflow-cli/dst/speechflow-node-t2t-opus.js +159 -0
  56. package/speechflow-cli/dst/speechflow-node-t2t-opus.js.map +1 -0
  57. package/speechflow-cli/dst/speechflow-node-t2t-profanity.d.ts +11 -0
  58. package/speechflow-cli/dst/speechflow-node-t2t-profanity.js +118 -0
  59. package/speechflow-cli/dst/speechflow-node-t2t-profanity.js.map +1 -0
  60. package/speechflow-cli/dst/speechflow-node-t2t-punctuation.d.ts +13 -0
  61. package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js +220 -0
  62. package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js.map +1 -0
  63. package/speechflow-cli/dst/{speechflow-node-t2t-openai.d.ts → speechflow-node-t2t-spellcheck.d.ts} +2 -2
  64. package/speechflow-cli/dst/{speechflow-node-t2t-openai.js → speechflow-node-t2t-spellcheck.js} +47 -99
  65. package/speechflow-cli/dst/speechflow-node-t2t-spellcheck.js.map +1 -0
  66. package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js +3 -6
  67. package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js.map +1 -1
  68. package/speechflow-cli/dst/speechflow-node-t2t-summary.d.ts +16 -0
  69. package/speechflow-cli/dst/speechflow-node-t2t-summary.js +241 -0
  70. package/speechflow-cli/dst/speechflow-node-t2t-summary.js.map +1 -0
  71. package/speechflow-cli/dst/{speechflow-node-t2t-ollama.d.ts → speechflow-node-t2t-translate.d.ts} +2 -2
  72. package/speechflow-cli/dst/{speechflow-node-t2t-transformers.js → speechflow-node-t2t-translate.js} +53 -115
  73. package/speechflow-cli/dst/speechflow-node-t2t-translate.js.map +1 -0
  74. package/speechflow-cli/dst/speechflow-node-x2x-filter.d.ts +1 -0
  75. package/speechflow-cli/dst/speechflow-node-x2x-filter.js +10 -0
  76. package/speechflow-cli/dst/speechflow-node-x2x-filter.js.map +1 -1
  77. package/speechflow-cli/dst/speechflow-node-x2x-trace.js.map +1 -1
  78. package/speechflow-cli/dst/speechflow-node-xio-device.js +3 -3
  79. package/speechflow-cli/dst/speechflow-node-xio-device.js.map +1 -1
  80. package/speechflow-cli/dst/speechflow-node-xio-exec.d.ts +12 -0
  81. package/speechflow-cli/dst/speechflow-node-xio-exec.js +223 -0
  82. package/speechflow-cli/dst/speechflow-node-xio-exec.js.map +1 -0
  83. package/speechflow-cli/dst/speechflow-node-xio-file.d.ts +1 -0
  84. package/speechflow-cli/dst/speechflow-node-xio-file.js +80 -67
  85. package/speechflow-cli/dst/speechflow-node-xio-file.js.map +1 -1
  86. package/speechflow-cli/dst/speechflow-node-xio-mqtt.js +2 -1
  87. package/speechflow-cli/dst/speechflow-node-xio-mqtt.js.map +1 -1
  88. package/speechflow-cli/dst/speechflow-node-xio-vban.d.ts +17 -0
  89. package/speechflow-cli/dst/speechflow-node-xio-vban.js +330 -0
  90. package/speechflow-cli/dst/speechflow-node-xio-vban.js.map +1 -0
  91. package/speechflow-cli/dst/speechflow-node-xio-webrtc.d.ts +39 -0
  92. package/speechflow-cli/dst/speechflow-node-xio-webrtc.js +500 -0
  93. package/speechflow-cli/dst/speechflow-node-xio-webrtc.js.map +1 -0
  94. package/speechflow-cli/dst/speechflow-node-xio-websocket.js +2 -1
  95. package/speechflow-cli/dst/speechflow-node-xio-websocket.js.map +1 -1
  96. package/speechflow-cli/dst/speechflow-util-audio.js +5 -6
  97. package/speechflow-cli/dst/speechflow-util-audio.js.map +1 -1
  98. package/speechflow-cli/dst/speechflow-util-error.d.ts +1 -1
  99. package/speechflow-cli/dst/speechflow-util-error.js +5 -7
  100. package/speechflow-cli/dst/speechflow-util-error.js.map +1 -1
  101. package/speechflow-cli/dst/speechflow-util-llm.d.ts +35 -0
  102. package/speechflow-cli/dst/speechflow-util-llm.js +363 -0
  103. package/speechflow-cli/dst/speechflow-util-llm.js.map +1 -0
  104. package/speechflow-cli/dst/speechflow-util-misc.d.ts +1 -1
  105. package/speechflow-cli/dst/speechflow-util-misc.js +4 -4
  106. package/speechflow-cli/dst/speechflow-util-misc.js.map +1 -1
  107. package/speechflow-cli/dst/speechflow-util-queue.js +3 -3
  108. package/speechflow-cli/dst/speechflow-util-queue.js.map +1 -1
  109. package/speechflow-cli/dst/speechflow-util-stream.js +4 -2
  110. package/speechflow-cli/dst/speechflow-util-stream.js.map +1 -1
  111. package/speechflow-cli/dst/speechflow-util.d.ts +1 -0
  112. package/speechflow-cli/dst/speechflow-util.js +1 -0
  113. package/speechflow-cli/dst/speechflow-util.js.map +1 -1
  114. package/speechflow-cli/etc/oxlint.jsonc +2 -1
  115. package/speechflow-cli/package.json +34 -17
  116. package/speechflow-cli/src/lib.d.ts +5 -0
  117. package/speechflow-cli/src/speechflow-main-api.ts +6 -5
  118. package/speechflow-cli/src/speechflow-main-graph.ts +40 -13
  119. package/speechflow-cli/src/speechflow-main-status.ts +4 -8
  120. package/speechflow-cli/src/speechflow-node-a2a-compressor-wt.ts +4 -0
  121. package/speechflow-cli/src/speechflow-node-a2a-compressor.ts +4 -2
  122. package/speechflow-cli/src/speechflow-node-a2a-expander-wt.ts +1 -1
  123. package/speechflow-cli/src/speechflow-node-a2a-expander.ts +4 -2
  124. package/speechflow-cli/src/speechflow-node-a2a-gender.ts +2 -2
  125. package/speechflow-cli/src/speechflow-node-a2a-pitch.ts +1 -2
  126. package/speechflow-cli/src/speechflow-node-a2a-wav.ts +33 -6
  127. package/speechflow-cli/src/speechflow-node-a2t-amazon.ts +6 -11
  128. package/speechflow-cli/src/speechflow-node-a2t-deepgram.ts +13 -12
  129. package/speechflow-cli/src/speechflow-node-a2t-google.ts +322 -0
  130. package/speechflow-cli/src/speechflow-node-a2t-openai.ts +8 -4
  131. package/speechflow-cli/src/speechflow-node-t2a-amazon.ts +7 -11
  132. package/speechflow-cli/src/speechflow-node-t2a-elevenlabs.ts +6 -5
  133. package/speechflow-cli/src/speechflow-node-t2a-google.ts +206 -0
  134. package/speechflow-cli/src/speechflow-node-t2a-kokoro.ts +22 -6
  135. package/speechflow-cli/src/speechflow-node-t2a-openai.ts +179 -0
  136. package/speechflow-cli/src/speechflow-node-t2a-supertonic.ts +701 -0
  137. package/speechflow-cli/src/speechflow-node-t2t-amazon.ts +2 -1
  138. package/speechflow-cli/src/speechflow-node-t2t-opus.ts +136 -0
  139. package/speechflow-cli/src/speechflow-node-t2t-profanity.ts +93 -0
  140. package/speechflow-cli/src/speechflow-node-t2t-punctuation.ts +201 -0
  141. package/speechflow-cli/src/{speechflow-node-t2t-openai.ts → speechflow-node-t2t-spellcheck.ts} +48 -107
  142. package/speechflow-cli/src/speechflow-node-t2t-subtitle.ts +3 -6
  143. package/speechflow-cli/src/speechflow-node-t2t-summary.ts +229 -0
  144. package/speechflow-cli/src/speechflow-node-t2t-translate.ts +181 -0
  145. package/speechflow-cli/src/speechflow-node-x2x-filter.ts +16 -3
  146. package/speechflow-cli/src/speechflow-node-x2x-trace.ts +3 -3
  147. package/speechflow-cli/src/speechflow-node-xio-device.ts +4 -7
  148. package/speechflow-cli/src/speechflow-node-xio-exec.ts +210 -0
  149. package/speechflow-cli/src/speechflow-node-xio-file.ts +93 -80
  150. package/speechflow-cli/src/speechflow-node-xio-mqtt.ts +3 -2
  151. package/speechflow-cli/src/speechflow-node-xio-vban.ts +325 -0
  152. package/speechflow-cli/src/speechflow-node-xio-webrtc.ts +533 -0
  153. package/speechflow-cli/src/speechflow-node-xio-websocket.ts +2 -1
  154. package/speechflow-cli/src/speechflow-util-audio-wt.ts +4 -4
  155. package/speechflow-cli/src/speechflow-util-audio.ts +10 -10
  156. package/speechflow-cli/src/speechflow-util-error.ts +9 -7
  157. package/speechflow-cli/src/speechflow-util-llm.ts +367 -0
  158. package/speechflow-cli/src/speechflow-util-misc.ts +4 -4
  159. package/speechflow-cli/src/speechflow-util-queue.ts +4 -4
  160. package/speechflow-cli/src/speechflow-util-stream.ts +5 -3
  161. package/speechflow-cli/src/speechflow-util.ts +1 -0
  162. package/speechflow-ui-db/package.json +9 -9
  163. package/speechflow-ui-st/package.json +9 -9
  164. package/speechflow-cli/dst/speechflow-node-t2t-ollama.js +0 -293
  165. package/speechflow-cli/dst/speechflow-node-t2t-ollama.js.map +0 -1
  166. package/speechflow-cli/dst/speechflow-node-t2t-openai.js.map +0 -1
  167. package/speechflow-cli/dst/speechflow-node-t2t-transformers.js.map +0 -1
  168. package/speechflow-cli/src/speechflow-node-t2t-ollama.ts +0 -281
  169. package/speechflow-cli/src/speechflow-node-t2t-transformers.ts +0 -247
package/README.md CHANGED
@@ -26,7 +26,8 @@ speech-to-speech).
26
26
  **SpeechFlow** comes with built-in graph nodes for various functionalities:
27
27
 
28
28
  - file and audio device I/O for local connectivity,
29
- - WebSocket and MQTT network I/O for remote connectivity,
29
+ - WebSocket, MQTT, VBAN, and WebRTC network I/O for remote connectivity,
30
+ - external command execution I/O for process integration,
30
31
  - local Voice Activity Detection (VAD),
31
32
  - local voice gender recognition,
32
33
  - local audio LUFS-S/RMS metering,
@@ -38,20 +39,27 @@ speech-to-speech).
38
39
  - remote-controllable audio muting,
39
40
  - cloud-based speech-to-text conversion with
40
41
  [Amazon Transcribe](https://aws.amazon.com/transcribe/),
41
- [OpenAI GPT-Transcribe](https://platform.openai.com/docs/models/gpt-4o-mini-transcribe), or
42
- [Deepgram](https://deepgram.com).
42
+ [OpenAI GPT-Transcribe](https://platform.openai.com/docs/models/gpt-4o-mini-transcribe),
43
+ [Deepgram](https://deepgram.com), or
44
+ [Google Cloud Speech-to-Text](https://cloud.google.com/speech-to-text).
43
45
  - cloud-based text-to-text translation (or spelling correction) with
44
46
  [DeepL](https://deepl.com),
45
47
  [Amazon Translate](https://aws.amazon.com/translate/),
46
- [Google Cloud Translate](https://cloud.google.com/translate), or
47
- [OpenAI GPT](https://openai.com).
48
+ [Google Cloud Translate](https://cloud.google.com/translate),
49
+ [OpenAI GPT](https://openai.com),
50
+ [Anthropic Claude](https://anthropic.com), or
51
+ [Google Gemini](https://ai.google.dev).
48
52
  - local text-to-text translation (or spelling correction) with
49
- [Ollama/Gemma](https://ollama.com) or
50
- [Transformers/OPUS](https://github.com/Helsinki-NLP/Opus-MT).
53
+ [Ollama](https://ollama.com) or
54
+ [OPUS-MT](https://github.com/Helsinki-NLP/Opus-MT).
51
55
  - cloud-based text-to-speech conversion with
52
- [ElevenLabs](https://elevenlabs.io/) or
53
- [Amazon Polly](https://aws.amazon.com/polly/).
54
- - local text-to-speech conversion with [Kokoro](https://github.com/nazdridoy/kokoro-tts).
56
+ [OpenAI TTS](https://platform.openai.com/docs/guides/text-to-speech),
57
+ [ElevenLabs](https://elevenlabs.io/),
58
+ [Amazon Polly](https://aws.amazon.com/polly/), or
59
+ [Google Cloud Text-to-Speech](https://cloud.google.com/text-to-speech).
60
+ - local text-to-speech conversion with
61
+ [Kokoro](https://github.com/nazdridoy/kokoro-tts) or
62
+ [Supertonic](https://huggingface.co/Supertone/supertonic).
55
63
  - local [FFmpeg](https://ffmpeg.org/)-based speech-to-speech conversion,
56
64
  - local WAV speech-to-speech decoding/encoding,
57
65
  - local text-to-text formatting, regex-based modification,
@@ -221,8 +229,8 @@ They can also be found in the sample [speechflow.yaml](./etc/speechflow.yaml) fi
221
229
 
222
230
  ```
223
231
  xio-device(device: env.SPEECHFLOW_DEVICE_MIC, mode: "r") |
224
- a2a-wav(mode: "encode") |
225
- xio-file(path: "capture.wav", mode: "w", type: "audio")
232
+ a2a-wav(mode: "encode", seekable: true) |
233
+ xio-file(path: "capture.wav", mode: "w", type: "audio", seekable: true)
226
234
  ```
227
235
 
228
236
  - **Pass-Through**: Pass-through audio from microphone device to speaker
@@ -335,7 +343,10 @@ First a short overview of the available processing nodes:
335
343
  **xio-file**,
336
344
  **xio-device**,
337
345
  **xio-websocket**,
338
- **xio-mqtt**.
346
+ **xio-mqtt**,
347
+ **xio-vban**,
348
+ **xio-webrtc**,
349
+ **xio-exec**.
339
350
  - Audio-to-Audio nodes:
340
351
  **a2a-ffmpeg**,
341
352
  **a2a-wav**,
@@ -353,22 +364,29 @@ First a short overview of the available processing nodes:
353
364
  - Audio-to-Text nodes:
354
365
  **a2t-openai**,
355
366
  **a2t-amazon**,
356
- **a2t-deepgram**.
367
+ **a2t-deepgram**,
368
+ **a2t-google**.
357
369
  - Text-to-Text nodes:
358
370
  **t2t-deepl**,
359
371
  **t2t-amazon**,
360
- **t2t-openai**,
361
- **t2t-ollama**,
362
- **t2t-transformers**,
372
+ **t2t-opus**,
363
373
  **t2t-google**,
374
+ **t2t-translate**,
375
+ **t2t-spellcheck**,
376
+ **t2t-punctuation**,
364
377
  **t2t-modify**,
378
+ **t2t-profanity**,
379
+ **t2t-summary**,
365
380
  **t2t-subtitle**,
366
381
  **t2t-format**,
367
382
  **t2t-sentence**.
368
383
  - Text-to-Audio nodes:
384
+ **t2a-openai**,
369
385
  **t2a-amazon**,
370
386
  **t2a-elevenlabs**,
371
- **t2a-kokoro**.
387
+ **t2a-google**,
388
+ **t2a-kokoro**,
389
+ **t2a-supertonic**.
372
390
  - Any-to-Any nodes:
373
391
  **x2x-filter**,
374
392
  **x2x-trace**.
@@ -384,20 +402,24 @@ external files, devices and network services.
384
402
 
385
403
  > This node allows the reading/writing from/to files or from StdIO. It
386
404
  > is intended to be used as source and sink nodes in batch processing,
387
- > and as sing nodes in real-time processing.
405
+ > and as sink nodes in real-time processing. When `seekable` is enabled
406
+ > for write mode, the node uses a file descriptor allowing random access
407
+ > writes to specific file positions via the `chunk:seek` metadata field.
408
+ > Option `seekable` cannot be used on StdIO.
388
409
 
389
410
  | Port | Payload |
390
411
  | ------- | ----------- |
391
412
  | input | text, audio |
392
413
  | output | text, audio |
393
414
 
394
- | Parameter | Position | Default | Requirement |
395
- | ---------- | --------- | -------- | --------------------- |
396
- | **path** | 0 | *none* | *none* |
397
- | **mode** | 1 | "r" | `/^(?:r\|w\|rw)$/` |
398
- | **type** | 2 | "audio" | `/^(?:audio\|text)$/` |
399
- | **chunka** | | 200 | `10 <= n <= 1000` |
400
- | **chunkt** | | 65536 | `1024 <= n <= 131072` |
415
+ | Parameter | Position | Default | Requirement |
416
+ | -------------- | --------- | -------- | --------------------- |
417
+ | **path** | 0 | *none* | *none* |
418
+ | **mode** | 1 | "r" | `/^(?:r\|w)$/` |
419
+ | **type** | 2 | "audio" | `/^(?:audio\|text)$/` |
420
+ | **seekable** | | false | *none* |
421
+ | **chunkAudio** | | 200 | `10 <= n <= 1000` |
422
+ | **chunkText** | | 65536 | `1024 <= n <= 131072` |
401
423
 
402
424
  - Node: **xio-device**<br/>
403
425
  Purpose: **Microphone/speaker device source/sink**<br/>
@@ -437,11 +459,12 @@ external files, devices and network services.
437
459
  | ----------- | --------- | -------- | --------------------- |
438
460
  | **listen** | *none* | *none* | `/^(?:\|ws:\/\/(.+?):(\d+))$/` |
439
461
  | **connect** | *none* | *none* | `/^(?:\|ws:\/\/(.+?):(\d+)(?:\/.*)?)$/` |
440
- | **type** | *none* | "audio" | `/^(?:audio\|text)$/` |
462
+ | **mode** | *none* | "r" | `/^(?:r\|w\|rw)$/` |
463
+ | **type** | *none* | "text" | `/^(?:audio\|text)$/` |
441
464
 
442
465
  - Node: **xio-mqtt**<br/>
443
- Purpose: **MQTT sink**<br/>
444
- Example: `xio-mqtt(url: "mqtt://127.0.0.1:1883", username: "foo", password: "bar", topic: "quux")`
466
+ Purpose: **MQTT source/sink**<br/>
467
+ Example: `xio-mqtt(url: "mqtt://127.0.0.1:1883", username: "foo", password: "bar", topicWrite: "quux")`
445
468
  Notice: this node requires a peer MQTT broker!
446
469
 
447
470
  > This node allows reading/writing from/to MQTT broker topics. It is
@@ -450,15 +473,94 @@ external files, devices and network services.
450
473
 
451
474
  | Port | Payload |
452
475
  | ------- | ----------- |
453
- | input | text |
454
- | output | none |
476
+ | input | text, audio |
477
+ | output | text, audio |
455
478
 
456
- | Parameter | Position | Default | Requirement |
457
- | ------------ | --------- | -------- | --------------------- |
458
- | **url** | 0 | *none* | `/^(?:\|(?:ws\|mqtt):\/\/(.+?):(\d+))$/` |
459
- | **username** | 1 | *none* | `/^.+$/` |
460
- | **password** | 2 | *none* | `/^.+$/` |
461
- | **topic** | 3 | *none* | `/^.+$/` |
479
+ | Parameter | Position | Default | Requirement |
480
+ | -------------- | --------- | -------- | --------------------- |
481
+ | **url** | 0 | *none* | `/^(?:\|(?:ws\|mqtt):\/\/(.+?):(\d+)(?:\/.*)?)$/` |
482
+ | **username** | 1 | *none* | `/^.+$/` |
483
+ | **password** | 2 | *none* | `/^.+$/` |
484
+ | **topicRead** | 3 | *none* | `/^.+$/` |
485
+ | **topicWrite** | 4 | *none* | `/^.+$/` |
486
+ | **mode** | 5 | "w" | `/^(?:r\|w\|rw)$/` |
487
+ | **type** | 6 | "text" | `/^(?:audio\|text)$/` |
488
+
489
+ - Node: **xio-vban**<br/>
490
+ Purpose: **VBAN network audio source/sink**<br/>
491
+ Example: `xio-vban(listen: 6980, stream: "Stream1", mode: "r")`
492
+ Notice: this node requires a peer VBAN-compatible application!
493
+
494
+ > This node allows reading/writing audio from/to VBAN (VB-Audio
495
+ > Network) protocol endpoints. It is intended to be used for
496
+ > real-time audio streaming with applications like VoiceMeeter,
497
+ > VB-Audio Matrix, or other VBAN-compatible software. It supports
498
+ > various audio bit resolutions (8-bit, 16-bit, 24-bit, 32-bit,
499
+ > float32, float64) and automatic channel downmixing to mono.
500
+
501
+ | Port | Payload |
502
+ | ------- | ----------- |
503
+ | input | audio |
504
+ | output | audio |
505
+
506
+ | Parameter | Position | Default | Requirement |
507
+ | ----------- | --------- | --------- | ---------------------------- |
508
+ | **listen** | 0 | "" | `/^(?:\|\d+\|.+?:\d+)$/` |
509
+ | **connect** | 1 | "" | `/^(?:\|.+?:\d+)$/` |
510
+ | **stream** | 2 | "Stream" | `/^.{1,16}$/` |
511
+ | **mode** | 3 | "rw" | `/^(?:r\|w\|rw)$/` |
512
+
513
+ - Node: **xio-webrtc**<br/>
514
+ Purpose: **WebRTC audio streaming source (WHIP) or sink (WHEP)**<br/>
515
+ Example: `xio-webrtc(listen: 8085, path: "/webrtc", mode: "r")`
516
+
517
+ > This node allows real-time audio streaming using WebRTC technology
518
+ > via WebRTC-HTTP Ingestion Protocol (WHIP) or WebRTC-HTTP Egress
519
+ > Protocol (WHEP). It provides an HTTP server for SDP negotiation
520
+ > and uses Opus codec for audio encoding/decoding at 48kHz. The node
521
+ > can operate in WHIP mode (i.e., read mode where publishers POST
522
+ > SDP offers to SpeechFlow and SpeechFlow receives audio stream from
523
+ > them) or WHEP mode (i.e., write mode where viewers POST SDP offers
524
+ > to SpeechFlow and SpeechFlow sends audio stream to them). This node
525
+ > supports multiple simultaneous connections, configurable ICE servers
526
+ > for NAT traversal, and automatic connection lifecycle management.
527
+
528
+ | Port | Payload |
529
+ | ------- | ----------- |
530
+ | input | audio |
531
+ | output | audio |
532
+
533
+ | Parameter | Position | Default | Requirement |
534
+ | -------------- | --------- | --------- | ---------------------------- |
535
+ | **listen** | 0 | "8085" | `/^(?:\d+\|.+?:\d+)$/` |
536
+ | **path** | 1 | "/webrtc" | `/^\/.+$/` |
537
+ | **mode** | 2 | "r" | `/^(?:r\|w)$/` |
538
+ | **iceServers** | 3 | "" | `/^.*$/` |
539
+
540
+ - Node: **xio-exec**<br/>
541
+ Purpose: **External command execution source/sink**<br/>
542
+ Example: `xio-exec(command: "ffmpeg -i - -f s16le -", mode: "rw", type: "audio")`
543
+
544
+ > This node allows reading/writing from/to external commands via stdin/stdout.
545
+ > It executes arbitrary commands and pipes audio or text data through them,
546
+ > enabling integration with external processing tools. The node supports
547
+ > read-only mode (capturing stdout), write-only mode (sending to stdin),
548
+ > and bidirectional mode (both stdin and stdout). This is useful for integrating
549
+ > external audio/text processing tools like FFmpeg, SoX, or custom scripts into
550
+ > the SpeechFlow pipeline.
551
+
552
+ | Port | Payload |
553
+ | ------- | ----------- |
554
+ | input | text, audio |
555
+ | output | text, audio |
556
+
557
+ | Parameter | Position | Default | Requirement |
558
+ | -------------- | --------- | -------- | --------------------- |
559
+ | **command** | 0 | *none* | *required* |
560
+ | **mode** | 1 | "r" | `/^(?:r\|w\|rw)$/` |
561
+ | **type** | 2 | "audio" | `/^(?:audio\|text)$/` |
562
+ | **chunkAudio** | | 200 | `10 <= n <= 1000` |
563
+ | **chunkText** | | 65536 | `1024 <= n <= 131072` |
462
564
 
463
565
  ### Audio-to-Audio Nodes
464
566
 
@@ -477,10 +579,10 @@ The following nodes process audio chunks only.
477
579
  | input | audio |
478
580
  | output | audio |
479
581
 
480
- | Parameter | Position | Default | Requirement |
481
- | ----------- | --------- | -------- | ------------------ |
482
- | **src** | 0 | "pcm" | `/^(?:pcm\|wav\|mp3\|opus)$/` |
483
- | **dst** | 1 | "wav" | `/^(?:pcm\|wav\|mp3\|opus)$/` |
582
+ | Parameter | Position | Default | Requirement |
583
+ | --------- | --------- | -------- | ------------------ |
584
+ | **src** | 0 | "pcm" | `/^(?:pcm\|wav\|mp3\|opus)$/` |
585
+ | **dst** | 1 | "wav" | `/^(?:pcm\|wav\|mp3\|opus)$/` |
484
586
 
485
587
  - Node: **a2a-wav**<br/>
486
588
  Purpose: **WAV audio format conversion**<br/>
@@ -489,15 +591,20 @@ The following nodes process audio chunks only.
489
591
  > This node allows converting between PCM and WAV audio formats. It is
490
592
  > primarily intended to support the reading/writing of external WAV
491
593
  > format files, although SpeechFlow internally uses PCM format only.
594
+ > When `seekable` is enabled in encode mode, the node writes a corrected
595
+ > WAV header at the end of processing with accurate file size information
596
+ > by seeking back to position 0, producing standard-compliant WAV files.
597
+ > Option `seekable` requires a seekable output stream.
492
598
 
493
599
  | Port | Payload |
494
600
  | ------- | ----------- |
495
601
  | input | audio |
496
602
  | output | audio |
497
603
 
498
- | Parameter | Position | Default | Requirement |
499
- | ----------- | --------- | -------- | ------------------------ |
500
- | **mode** | 0 | "encode" | `/^(?:encode\|decode)$/` |
604
+ | Parameter | Position | Default | Requirement |
605
+ | ------------ | --------- | -------- | ------------------------ |
606
+ | **mode** | 0 | "encode" | `/^(?:encode\|decode)$/` |
607
+ | **seekable** | 1 | false | *none* |
501
608
 
502
609
  - Node: **a2a-mute**<br/>
503
610
  Purpose: **volume muting node**<br/>
@@ -512,8 +619,8 @@ The following nodes process audio chunks only.
512
619
  | input | audio |
513
620
  | output | audio |
514
621
 
515
- | Parameter | Position | Default | Requirement |
516
- | ----------- | --------- | -------- | ------------------------ |
622
+ | Parameter | Position | Default | Requirement |
623
+ | --------- | --------- | -------- | ------------------------ |
517
624
 
518
625
  - Node: **a2a-meter**<br/>
519
626
  Purpose: **Loudness metering node**<br/>
@@ -531,7 +638,7 @@ The following nodes process audio chunks only.
531
638
 
532
639
  | Parameter | Position | Default | Requirement |
533
640
  | ------------- | --------- | -------- | ---------------------- |
534
- | **interval** | 0 | 250 | *none* |
641
+ | **interval** | 0 | 100 | *none* |
535
642
  | **mode** | 1 | "filter" | `/^(?:filter\|sink)$/` |
536
643
  | **dashboard** | | *none* | *none* |
537
644
 
@@ -548,8 +655,8 @@ The following nodes process audio chunks only.
548
655
  | input | audio |
549
656
  | output | audio |
550
657
 
551
- | Parameter | Position | Default | Requirement |
552
- | ----------- | --------- | -------- | ------------------------ |
658
+ | Parameter | Position | Default | Requirement |
659
+ | --------- | --------- | -------- | ------------------------ |
553
660
  | **mode** | *none* | "unplugged" | `/^(?:silenced\|unplugged)$/` |
554
661
  | **posSpeechThreshold** | *none* | 0.50 | *none* |
555
662
  | **negSpeechThreshold** | *none* | 0.35 | *none* |
@@ -571,11 +678,12 @@ The following nodes process audio chunks only.
571
678
  | input | audio |
572
679
  | output | audio |
573
680
 
574
- | Parameter | Position | Default | Requirement |
575
- | -------------- | --------- | -------- | ------------------------ |
576
- | **window** | 0 | 500 | *none* |
577
- | **treshold** | 1 | 0.50 | *none* |
578
- | **hysteresis** | 2 | 0.25 | *none* |
681
+ | Parameter | Position | Default | Requirement |
682
+ | ------------------- | --------- | -------- | ------------------------ |
683
+ | **window** | 0 | 500 | *none* |
684
+ | **threshold** | 1 | 0.50 | *none* |
685
+ | **hysteresis** | 2 | 0.25 | *none* |
686
+ | **volumeThreshold** | 3 | -45 | *none* |
579
687
 
580
688
  - Node: **a2a-speex**<br/>
581
689
  Purpose: **Speex Noise Suppression node**<br/>
@@ -590,9 +698,9 @@ The following nodes process audio chunks only.
590
698
  | input | audio |
591
699
  | output | audio |
592
700
 
593
- | Parameter | Position | Default | Requirement |
594
- | ----------- | --------- | -------- | ------------------------ |
595
- | **attentuate** | 0 | -18 | *none* | `-60 <= n <= 0` |
701
+ | Parameter | Position | Default | Requirement |
702
+ | -------------- | --------- | -------- | ------------------ |
703
+ | **attentuate** | 0 | -18 | `-60 <= n <= 0` |
596
704
 
597
705
  - Node: **a2a-rnnoise**<br/>
598
706
  Purpose: **RNNoise Noise Suppression node**<br/>
@@ -606,8 +714,8 @@ The following nodes process audio chunks only.
606
714
  | input | audio |
607
715
  | output | audio |
608
716
 
609
- | Parameter | Position | Default | Requirement |
610
- | ----------- | --------- | -------- | ------------------------ |
717
+ | Parameter | Position | Default | Requirement |
718
+ | --------- | --------- | -------- | ------------------------ |
611
719
 
612
720
  - Node: **a2a-compressor**<br/>
613
721
  Purpose: **audio compressor node**<br/>
@@ -621,14 +729,17 @@ The following nodes process audio chunks only.
621
729
  | input | audio |
622
730
  | output | audio |
623
731
 
624
- | Parameter | Position | Default | Requirement |
625
- | ----------- | --------- | -------- | ------------------------ |
626
- | **thresholdDb** | *none* | -18 | `n <= 0 && n >= -60` |
627
- | **ratio** | *none* | 4 | `n >= 1 && n <= 20` |
628
- | **attackMs** | *none* | 10 | `n >= 0 && n <= 100` |
629
- | **releaseMs** | *none* | 50 | `n >= 0 && n <= 100` |
630
- | **kneeDb** | *none* | 6 | `n >= 0 && n <= 100` |
631
- | **makeupDb** | *none* | 0 | `n >= 0 && n <= 100` |
732
+ | Parameter | Position | Default | Requirement |
733
+ | --------------- | --------- | ------------ | ------------------------ |
734
+ | **type** | *none* | "standalone" | `/^(?:standalone\|sidechain)$/` |
735
+ | **mode** | *none* | "compress" | `/^(?:compress\|measure\|adjust)$/` |
736
+ | **bus** | *none* | "compressor" | `/^.+$/` |
737
+ | **thresholdDb** | *none* | -23 | `n <= 0 && n >= -100`|
738
+ | **ratio** | *none* | 4.0 | `n >= 1 && n <= 20` |
739
+ | **attackMs** | *none* | 10 | `n >= 0 && n <= 1000`|
740
+ | **releaseMs** | *none* | 50 | `n >= 0 && n <= 1000`|
741
+ | **kneeDb** | *none* | 6.0 | `n >= 0 && n <= 40` |
742
+ | **makeupDb** | *none* | 0 | `n >= -24 && n <= 24`|
632
743
 
633
744
  - Node: **a2a-expander**<br/>
634
745
  Purpose: **audio expander node**<br/>
@@ -642,14 +753,15 @@ The following nodes process audio chunks only.
642
753
  | input | audio |
643
754
  | output | audio |
644
755
 
645
- | Parameter | Position | Default | Requirement |
646
- | ----------- | --------- | -------- | ------------------------ |
647
- | **thresholdDb** | *none* | -45 | `n <= 0 && n >= -60` |
648
- | **ratio** | *none* | 4 | `n >= 1 && n <= 20` |
649
- | **attackMs** | *none* | 10 | `n >= 0 && n <= 100` |
650
- | **releaseMs** | *none* | 50 | `n >= 0 && n <= 100` |
651
- | **kneeDb** | *none* | 6 | `n >= 0 && n <= 100` |
652
- | **makeupDb** | *none* | 0 | `n >= 0 && n <= 100` |
756
+ | Parameter | Position | Default | Requirement |
757
+ | --------------- | --------- | -------- | --------------------- |
758
+ | **thresholdDb** | *none* | -45 | `n <= 0 && n >= -100` |
759
+ | **floorDb** | *none* | -64 | `n <= 0 && n >= -100` |
760
+ | **ratio** | *none* | 4.0 | `n >= 1 && n <= 20` |
761
+ | **attackMs** | *none* | 10 | `n >= 0 && n <= 1000` |
762
+ | **releaseMs** | *none* | 50 | `n >= 0 && n <= 1000` |
763
+ | **kneeDb** | *none* | 6.0 | `n >= 0 && n <= 40` |
764
+ | **makeupDb** | *none* | 0 | `n >= -24 && n <= 24` |
653
765
 
654
766
  - Node: **a2a-gain**<br/>
655
767
  Purpose: **audio gain adjustment node**<br/>
@@ -663,9 +775,9 @@ The following nodes process audio chunks only.
663
775
  | input | audio |
664
776
  | output | audio |
665
777
 
666
- | Parameter | Position | Default | Requirement |
667
- | ----------- | --------- | -------- | ------------------------ |
668
- | **db** | *none* | 12 | `n >= -60 && n <= -60` |
778
+ | Parameter | Position | Default | Requirement |
779
+ | --------- | --------- | -------- | --------------------- |
780
+ | **db** | 0 | 0 | `n >= -60 && n <= 60` |
669
781
 
670
782
  - Node: **a2a-pitch**<br/>
671
783
  Purpose: **audio pitch shifting and time stretching**<br/>
@@ -701,8 +813,9 @@ The following nodes process audio chunks only.
701
813
  | input | audio |
702
814
  | output | audio |
703
815
 
704
- | Parameter | Position | Default | Requirement |
705
- | ----------- | --------- | -------- | ------------------------ |
816
+ | Parameter | Position | Default | Requirement |
817
+ | ----------- | --------- | -------- | ---------------------- |
818
+ | **segment** | 0 | 50 | `n >= 10 && n <= 1000` |
706
819
 
707
820
  ### Audio-to-Text Nodes
708
821
 
@@ -719,7 +832,7 @@ The following nodes convert audio to text chunks.
719
832
 
720
833
  | Port | Payload |
721
834
  | ------- | ----------- |
722
- | input | text |
835
+ | input | audio |
723
836
  | output | text |
724
837
 
725
838
  | Parameter | Position | Default | Requirement |
@@ -770,9 +883,32 @@ The following nodes convert audio to text chunks.
770
883
  | ------------ | --------- | -------- | ------------------ |
771
884
  | **key** | *none* | env.SPEECHFLOW\_DEEPGRAM\_KEY | *none* |
772
885
  | **keyAdm** | *none* | env.SPEECHFLOW\_DEEPGRAM\_KEY\_ADM | *none* |
773
- | **model** | 0 | "nova-3" | *none* |
886
+ | **model** | 0 | "nova-2" | *none* |
774
887
  | **version** | 1 | "latest" | *none* |
775
888
  | **language** | 2 | "multi" | *none* |
889
+ | **interim** | 3 | false | *none* |
890
+
891
+ - Node: **a2t-google**<br/>
892
+ Purpose: **Google Cloud Speech-to-Text conversion**<br/>
893
+ Example: `a2t-google(language: "en-US")`<br/>
894
+ Notice: this node requires a Google Cloud API key!
895
+
896
+ > This node uses Google Cloud Speech-to-Text to perform Speech-to-Text (S2T)
897
+ > conversion, i.e., it recognizes speech in the input audio stream and
898
+ > outputs a corresponding text stream. It supports various languages
899
+ > and models, including the `latest_long` model for long-form audio.
900
+
901
+ | Port | Payload |
902
+ | ------- | ----------- |
903
+ | input | audio |
904
+ | output | text |
905
+
906
+ | Parameter | Position | Default | Requirement |
907
+ | ------------ | --------- | ------------- | ------------ |
908
+ | **key** | *none* | env.SPEECHFLOW\_GOOGLE\_KEY | *none* |
909
+ | **model** | 0 | "latest_long" | *none* |
910
+ | **language** | 1 | "en-US" | *none* |
911
+ | **interim** | 2 | false | *none* |
776
912
 
777
913
  ### Text-to-Text Nodes
778
914
 
@@ -783,73 +919,65 @@ The following nodes process text chunks only.
783
919
  Example: `t2t-deepl(src: "de", dst: "en")`<br/>
784
920
  Notice: this node requires an API key!
785
921
 
786
- > This node performs translation between English and German languages.
922
+ > This node performs translation between multiple languages.
787
923
 
788
924
  | Port | Payload |
789
925
  | ------- | ----------- |
790
926
  | input | text |
791
927
  | output | text |
792
928
 
793
- | Parameter | Position | Default | Requirement |
794
- | ------------ | --------- | -------- | ------------------ |
795
- | **key** | *none* | env.SPEECHFLOW\_DEEPL\_KEY | *none* |
796
- | **src** | 0 | "de" | `/^(?:de\|en)$/` |
797
- | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
929
+ | Parameter | Position | Default | Requirement |
930
+ | ------------ | --------- | ---------- | ----------------------------- |
931
+ | **key** | *none* | env.SPEECHFLOW\_DEEPL\_KEY | *none* |
932
+ | **src** | 0 | "de" | `/^(?:de\|en\|fr\|it)$/` |
933
+ | **dst** | 1 | "en" | `/^(?:de\|en\|fr\|it)$/` |
934
+ | **optimize** | 2 | "latency" | `/^(?:latency\|quality)$/` |
798
935
 
799
936
  - Node: **t2t-amazon**<br/>
800
937
  Purpose: **AWS Translate Text-to-Text translation**<br/>
801
938
  Example: `t2t-amazon(src: "de", dst: "en")`<br/>
802
939
  Notice: this node requires an API key!
803
940
 
804
- > This node performs translation between English and German languages.
941
+ > This node performs translation between multiple languages.
805
942
 
806
943
  | Port | Payload |
807
944
  | ------- | ----------- |
808
945
  | input | text |
809
946
  | output | text |
810
947
 
811
- | Parameter | Position | Default | Requirement |
812
- | ------------ | --------- | -------- | ------------------ |
813
- | **key** | *none* | env.SPEECHFLOW\_AMAZON\_KEY | *none* |
948
+ | Parameter | Position | Default | Requirement |
949
+ | ------------ | --------- | -------- | ---------------------------- |
950
+ | **key** | *none* | env.SPEECHFLOW\_AMAZON\_KEY | *none* |
814
951
  | **secKey** | *none* | env.SPEECHFLOW\_AMAZON\_KEY\_SEC | *none* |
815
- | **region** | *none* | "eu-central-1" | *none* |
816
- | **src** | 0 | "de" | `/^(?:de\|en)$/` |
817
- | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
952
+ | **region** | *none* | "eu-central-1" | *none* |
953
+ | **src** | 0 | "de" | `/^(?:de\|en\|fr\|it)$/` |
954
+ | **dst** | 1 | "en" | `/^(?:de\|en\|fr\|it)$/` |
818
955
 
819
- - Node: **t2t-openai**<br/>
820
- Purpose: **OpenAI/GPT Text-to-Text translation and spelling correction**<br/>
821
- Example: `t2t-openai(src: "de", dst: "en")`<br/>
822
- Notice: this node requires an OpenAI API key!
956
+ - Node: **t2t-opus**<br/>
957
+ Purpose: **OPUS-MT Text-to-Text translation**<br/>
958
+ Example: `t2t-opus(src: "de", dst: "en")`<br/>
823
959
 
824
960
  > This node performs translation between English and German languages
825
- > in the text stream or (if the source and destination language is
826
- > the same) spellchecking of English or German languages in the text
827
- > stream. It is based on the remote OpenAI cloud AI service and uses
828
- > the GPT-4o-mini LLM.
961
+ > in the text stream. It is based on the local OPUS-MT translation model.
829
962
 
830
963
  | Port | Payload |
831
964
  | ------- | ----------- |
832
965
  | input | text |
833
966
  | output | text |
834
967
 
835
- | Parameter | Position | Default | Requirement |
836
- | ------------ | --------- | -------- | ------------------ |
837
- | **api** | *none* | "https://api.openai.com" | `/^https?:\/\/.+?:\d+$/` |
968
+ | Parameter | Position | Default | Requirement |
969
+ | ------------ | --------- | -------- | ---------------- |
838
970
  | **src** | 0 | "de" | `/^(?:de\|en)$/` |
839
971
  | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
840
- | **key** | *none* | env.SPEECHFLOW\_OPENAI\_KEY | *none* |
841
- | **model** | *none* | "gpt-5-mini" | *none* |
842
972
 
843
- - Node: **t2t-ollama**<br/>
844
- Purpose: **Ollama/Gemma Text-to-Text translation and spelling correction**<br/>
845
- Example: `t2t-ollama(src: "de", dst: "en")`<br/>
846
- Notice: this node requires Ollama to be installed!
973
+ - Node: **t2t-google**<br/>
974
+ Purpose: **Google Cloud Translate Text-to-Text translation**<br/>
975
+ Example: `t2t-google(src: "de", dst: "en")`<br/>
976
+ Notice: this node requires a Google Cloud API key and project ID!
847
977
 
848
- > This node performs translation between English and German languages
849
- > in the text stream or (if the source and destination language is
850
- > the same) spellchecking of English or German languages in the text
851
- > stream. It is based on the local Ollama AI service and uses the
852
- > Google Gemma 3 LLM.
978
+ > This node performs translation between multiple languages
979
+ > in the text stream using Google Cloud Translate API.
980
+ > It supports German, English, French, and Italian languages.
853
981
 
854
982
  | Port | Payload |
855
983
  | ------- | ----------- |
@@ -858,48 +986,83 @@ The following nodes process text chunks only.
858
986
 
859
987
  | Parameter | Position | Default | Requirement |
860
988
  | ------------ | --------- | -------- | ------------------ |
861
- | **api** | *none* | "http://127.0.0.1:11434" | `/^https?:\/\/.+?:\d+$/` |
862
- | **model** | *none* | "gemma3:4b-it-q4_K_M" | *none* |
863
- | **src** | 0 | "de" | `/^(?:de\|en)$/` |
864
- | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
989
+ | **key** | *none* | env.SPEECHFLOW\_GOOGLE\_KEY | *none* |
990
+ | **src** | 0 | "de" | `/^(?:de\|en\|fr\|it)$/` |
991
+ | **dst** | 1 | "en" | `/^(?:de\|en\|fr\|it)$/` |
865
992
 
866
- - Node: **t2t-transformers**<br/>
867
- Purpose: **Transformers Text-to-Text translation**<br/>
868
- Example: `t2t-transformers(src: "de", dst: "en")`<br/>
993
+ - Node: **t2t-translate**<br/>
994
+ Purpose: **LLM-based Text-to-Text translation**<br/>
995
+ Example: `t2t-translate(src: "de", dst: "en")`<br/>
996
+ Notice: this node requires an LLM provider (Ollama by default, or cloud-based OpenAI/Anthropic/Google, or local HuggingFace Transformers)!
869
997
 
870
998
  > This node performs translation between English and German languages
871
- > in the text stream. It is based on local OPUS or SmolLM3 LLMs.
999
+ > in the text stream using an LLM service. Multiple LLM providers are
1000
+ > supported: local Ollama (default), local HuggingFace Transformers,
1001
+ > or cloud-based OpenAI, Anthropic, or Google.
872
1002
 
873
1003
  | Port | Payload |
874
1004
  | ------- | ----------- |
875
1005
  | input | text |
876
1006
  | output | text |
877
1007
 
878
- | Parameter | Position | Default | Requirement |
879
- | ------------ | --------- | -------- | ---------------- |
880
- | **model** | *none* | "OPUS" | `/^(?:OPUS\|SmolLM3)$/` |
881
- | **src** | 0 | "de" | `/^(?:de\|en)$/` |
882
- | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
1008
+ | Parameter | Position | Default | Requirement |
1009
+ | ------------ | --------- | ------------------------ | ---------------------------------------- |
1010
+ | **src** | 0 | "de" | `/^(?:de\|en)$/` |
1011
+ | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
1012
+ | **provider** | *none* | "ollama" | `/^(?:openai\|anthropic\|google\|ollama\|transformers)$/` |
1013
+ | **api** | *none* | "http://127.0.0.1:11434" | `/^https?:\/\/.+?(:\d+)?$/` |
1014
+ | **model** | *none* | "gemma3:4b-it-q4\_K\_M" | *none* |
1015
+ | **key** | *none* | "" | *none* |
1016
+
1017
+ - Node: **t2t-spellcheck**<br/>
1018
+ Purpose: **LLM-based Text-to-Text spellchecking**<br/>
1019
+ Example: `t2t-spellcheck(lang: "en")`<br/>
1020
+ Notice: this node requires an LLM provider (Ollama by default, or cloud-based OpenAI/Anthropic/Google, or local HuggingFace Transformers)!
1021
+
1022
+ > This node performs spellchecking of English or German text using an
1023
+ > LLM service. It corrects spelling mistakes, adds missing punctuation,
1024
+ > but preserves grammar and word choice. Multiple LLM providers are
1025
+ > supported: local Ollama (default), local HuggingFace Transformers,
1026
+ > or cloud-based OpenAI, Anthropic, or Google.
883
1027
 
884
- - Node: **t2t-google**<br/>
885
- Purpose: **Google Cloud Translate Text-to-Text translation**<br/>
886
- Example: `t2t-google(src: "de", dst: "en")`<br/>
887
- Notice: this node requires a Google Cloud API key and project ID!
1028
+ | Port | Payload |
1029
+ | ------- | ----------- |
1030
+ | input | text |
1031
+ | output | text |
888
1032
 
889
- > This node performs translation between multiple languages
890
- > in the text stream using Google Cloud Translate API.
891
- > It supports German, English, French, and Italian languages.
1033
+ | Parameter | Position | Default | Requirement |
1034
+ | ------------ | --------- | ------------------------ | ---------------------------------------- |
1035
+ | **lang** | 0 | "en" | `/^(?:en\|de)$/` |
1036
+ | **provider** | *none* | "ollama" | `/^(?:openai\|anthropic\|google\|ollama\|transformers)$/` |
1037
+ | **api** | *none* | "http://127.0.0.1:11434" | `/^https?:\/\/.+?(:\d+)?$/` |
1038
+ | **model** | *none* | "gemma3:4b-it-q4\_K\_M" | *none* |
1039
+ | **key** | *none* | "" | *none* |
1040
+
1041
+ - Node: **t2t-punctuation**<br/>
1042
+ Purpose: **LLM-based punctuation restoration**<br/>
1043
+ Example: `t2t-punctuation(lang: "en")`<br/>
1044
+ Notice: this node requires an LLM provider (Ollama by default, or cloud-based OpenAI/Anthropic/Google, or local HuggingFace Transformers)!
1045
+
1046
+ > This node performs punctuation restoration using an LLM service.
1047
+ > It adds missing punctuation marks (periods, commas, question marks,
1048
+ > exclamation marks, colons, semicolons) and capitalizes the first
1049
+ > letters of sentences. It preserves all original words exactly as they
1050
+ > are without spelling corrections or grammar changes. Multiple LLM
1051
+ > providers are supported: local Ollama (default), local HuggingFace
1052
+ > Transformers, or cloud-based OpenAI, Anthropic, or Google.
892
1053
 
893
1054
  | Port | Payload |
894
1055
  | ------- | ----------- |
895
1056
  | input | text |
896
1057
  | output | text |
897
1058
 
898
- | Parameter | Position | Default | Requirement |
899
- | ------------ | --------- | -------- | ------------------ |
900
- | **key** | *none* | env.SPEECHFLOW\_GOOGLE\_KEY | *none* |
901
- | **src** | 0 | "de" | `/^(?:de\|en\|fr\|it)$/` |
902
- | **dst** | 1 | "en" | `/^(?:de\|en\|fr\|it)$/` |
1059
+ | Parameter | Position | Default | Requirement |
1060
+ | ------------ | --------- | ------------------------ | ---------------------------------------- |
1061
+ | **lang** | 0 | "en" | `/^(?:en\|de)$/` |
1062
+ | **provider** | *none* | "ollama" | `/^(?:openai\|anthropic\|google\|ollama\|transformers)$/` |
1063
+ | **api** | *none* | "http://127.0.0.1:11434" | `/^https?:\/\/.+?(:\d+)?$/` |
1064
+ | **model** | *none* | "gemma3:4b-it-q4\_K\_M" | *none* |
1065
+ | **key** | *none* | "" | *none* |
903
1066
 
904
1067
  - Node: **t2t-modify**<br/>
905
1068
  Purpose: **regex-based text modification**<br/>
@@ -919,6 +1082,53 @@ The following nodes process text chunks only.
919
1082
  | **match** | 0 | "" | *required* |
920
1083
  | **replace** | 1 | "" | *required* |
921
1084
 
1085
+ - Node: **t2t-profanity**<br/>
1086
+ Purpose: **profanity filtering**<br/>
1087
+ Example: `t2t-profanity(lang: "en", placeholder: "***")`<br/>
1088
+
1089
+ > This node filters profanity from the text stream by detecting bad words
1090
+ > and replacing them with a placeholder. It supports English and German
1091
+ > languages and can either replace with a fixed placeholder or repeat
1092
+ > the placeholder character for each character of the detected word.
1093
+
1094
+ | Port | Payload |
1095
+ | ------- | ----------- |
1096
+ | input | text |
1097
+ | output | text |
1098
+
1099
+ | Parameter | Position | Default | Requirement |
1100
+ | --------------- | --------- | ---------- | ------------------------ |
1101
+ | **lang** | *none* | "en" | `/^(?:en\|de)$/` |
1102
+ | **placeholder** | *none* | "\*\*\*" | *none* |
1103
+ | **mode** | *none* | "replace" | `/^(?:replace\|repeat)$/`|
1104
+
1105
+ - Node: **t2t-summary**<br/>
1106
+ Purpose: **LLM-based Text-to-Text summarization**<br/>
1107
+ Example: `t2t-summary(lang: "en", size: 4, trigger: 8)`<br/>
1108
+ Notice: this node requires an LLM provider (Ollama by default, or cloud-based OpenAI/Anthropic/Google, or local HuggingFace Transformers)!
1109
+
1110
+ > This node performs text summarization using an LLM service.
1111
+ > It accumulates incoming text sentences and generates a summary after
1112
+ > a configurable number of sentences (trigger). The summary length is
1113
+ > also configurable (size). It supports English and German languages.
1114
+ > Multiple LLM providers are supported: local Ollama (default), local
1115
+ > HuggingFace Transformers, or cloud-based OpenAI, Anthropic, or Google.
1116
+
1117
+ | Port | Payload |
1118
+ | ------- | ----------- |
1119
+ | input | text |
1120
+ | output | text |
1121
+
1122
+ | Parameter | Position | Default | Requirement |
1123
+ | ------------ | --------- | ------------------------ | ---------------------------------------- |
1124
+ | **provider** | *none* | "ollama" | `/^(?:openai\|anthropic\|google\|ollama\|transformers)$/` |
1125
+ | **api** | *none* | "http://127.0.0.1:11434" | `/^https?:\/\/.+?(:\d+)?$/` |
1126
+ | **model** | *none* | "gemma3:4b-it-q4\_K\_M" | *none* |
1127
+ | **key** | *none* | "" | *none* |
1128
+ | **lang** | 0 | "en" | `/^(?:en\|de)$/` |
1129
+ | **size** | 1 | 4 | `1 <= n <= 20` |
1130
+ | **trigger** | 2 | 8 | `1 <= n <= 100` |
1131
+
922
1132
  - Node: **t2t-sentence**<br/>
923
1133
  Purpose: **sentence splitting/merging**<br/>
924
1134
  Example: `t2t-sentence()`<br/>
@@ -977,6 +1187,32 @@ The following nodes process text chunks only.
977
1187
 
978
1188
  The following nodes convert text chunks to audio chunks.
979
1189
 
1190
+ - Node: **t2a-openai**<br/>
1191
+ Purpose: **OpenAI Text-to-Speech conversion**<br/>
1192
+ Example: `t2a-openai(voice: "nova", model: "tts-1-hd")`<br/>
1193
+ Notice: this node requires an OpenAI API key!
1194
+
1195
+ > This node uses OpenAI TTS to perform Text-to-Speech (T2S)
1196
+ > conversion, i.e., it converts the input text stream into an output
1197
+ > audio stream. It supports six built-in voices and two models:
1198
+ > `tts-1` for lower latency and `tts-1-hd` for higher quality.
1199
+ > The language is automatically detected from the input text and
1200
+ > supports many languages including German, English, French, Spanish,
1201
+ > Chinese, Japanese, and more (no language parameter needed).
1202
+
1203
+ | Port | Payload |
1204
+ | ------- | ----------- |
1205
+ | input | text |
1206
+ | output | audio |
1207
+
1208
+ | Parameter | Position | Default | Requirement |
1209
+ | -------------- | --------- | --------- | ------------------ |
1210
+ | **key** | *none* | env.SPEECHFLOW\_OPENAI\_KEY | *none* |
1211
+ | **api** | *none* | "https://api.openai.com/v1" | `/^https?:\/\/.+/` |
1212
+ | **voice** | 0 | "alloy" | `/^(?:alloy\|echo\|fable\|onyx\|nova\|shimmer)$/` |
1213
+ | **model** | 1 | "tts-1" | `/^(?:tts-1\|tts-1-hd)$/` |
1214
+ | **speed** | 2 | 1.0 | `0.25 <= n <= 4.0` |
1215
+
980
1216
  - Node: **t2a-amazon**<br/>
981
1217
  Purpose: **Amazon Polly Text-to-Speech conversion**<br/>
982
1218
  Example: `t2a-amazon(language: "en", voice: "Danielle")`<br/>
@@ -996,7 +1232,7 @@ The following nodes convert text chunks to audio chunks.
996
1232
  | **key** | *none* | env.SPEECHFLOW\_AMAZON\_KEY | *none* |
997
1233
  | **secKey** | *none* | env.SPEECHFLOW\_AMAZON\_KEY\_SEC | *none* |
998
1234
  | **region** | *none* | "eu-central-1" | *none* |
999
- | **voice** | 0 | "Amy" | `^(?:Amy|Danielle|Joanna|Matthew|Ruth|Stephen|Viki|Daniel)$/` |
1235
+ | **voice** | 0 | "Amy" | `/^(?:Amy\|Danielle\|Joanna\|Matthew\|Ruth\|Stephen\|Vicki\|Daniel)$/` |
1000
1236
  | **language** | 1 | "en" | `/^(?:de\|en)$/` |
1001
1237
 
1002
1238
  - Node: **t2a-elevenlabs**<br/>
@@ -1018,11 +1254,34 @@ The following nodes convert text chunks to audio chunks.
1018
1254
  | **key** | *none* | env.SPEECHFLOW\_ELEVENLABS\_KEY | *none* |
1019
1255
  | **voice** | 0 | "Brian" | `/^(?:Brittney\|Cassidy\|Leonie\|Mark\|Brian)$/` |
1020
1256
  | **language** | 1 | "de" | `/^(?:de\|en)$/` |
1021
- | **speed** | 2 | 1.00 | `n >= 0`7 && n <= 1.2` |
1257
+ | **speed** | 2 | 1.00 | `n >= 0.7 && n <= 1.2` |
1022
1258
  | **stability** | 3 | 0.5 | `n >= 0.0 && n <= 1.0` |
1023
1259
  | **similarity** | 4 | 0.75 | `n >= 0.0 && n <= 1.0` |
1024
1260
  | **optimize** | 5 | "latency" | `/^(?:latency\|quality)$/` |
1025
1261
 
1262
+ - Node: **t2a-google**<br/>
1263
+ Purpose: **Google Cloud Text-to-Speech conversion**<br/>
1264
+ Example: `t2a-google(voice: "en-US-Neural2-J", language: "en-US")`<br/>
1265
+ Notice: this node requires a Google Cloud API key!
1266
+
1267
+ > This node uses Google Cloud Text-to-Speech to perform Text-to-Speech (T2S)
1268
+ > conversion, i.e., it converts the input text stream into an output
1269
+ > audio stream. It supports various voices and languages with configurable
1270
+ > speaking rate and pitch adjustment.
1271
+
1272
+ | Port | Payload |
1273
+ | ------- | ----------- |
1274
+ | input | text |
1275
+ | output | audio |
1276
+
1277
+ | Parameter | Position | Default | Requirement |
1278
+ | ------------ | --------- | ------------------ | -------------------- |
1279
+ | **key** | *none* | env.SPEECHFLOW\_GOOGLE\_KEY | *none* |
1280
+ | **voice** | 0 | "en-US-Neural2-J" | *none* |
1281
+ | **language** | 1 | "en-US" | *none* |
1282
+ | **speed** | 2 | 1.0 | `0.25 <= n <= 4.0` |
1283
+ | **pitch** | 3 | 0.0 | `-20.0 <= n <= 20.0` |
1284
+
1026
1285
  - Node: **t2a-kokoro**<br/>
1027
1286
  Purpose: **Kokoro Text-to-Speech conversion**<br/>
1028
1287
  Example: `t2a-kokoro(language: "en")`<br/>
@@ -1043,6 +1302,26 @@ The following nodes convert text chunks to audio chunks.
1043
1302
  | **language** | 1 | "en" | `/^en$/` |
1044
1303
  | **speed** | 2 | 1.25 | 1.0...1.30 |
1045
1304
 
1305
+ - Node: **t2a-supertonic**<br/>
1306
+ Purpose: **Supertonic Text-to-Speech conversion**<br/>
1307
+ Example: `t2a-supertonic(voice: "M1", speed: 1.40)`<br/>
1308
+
1309
+ > This node uses Supertonic to perform Text-to-Speech (T2S) conversion,
1310
+ > i.e., it converts the input text stream into an output audio stream.
1311
+ > It is intended to generate speech. The ONNX models are automatically
1312
+ > downloaded from HuggingFace on first use. It supports the English language only.
1313
+
1314
+ | Port | Payload |
1315
+ | ------- | ----------- |
1316
+ | input | text |
1317
+ | output | audio |
1318
+
1319
+ | Parameter | Position | Default | Requirement |
1320
+ | ------------ | --------- | -------- | ----------- |
1321
+ | **voice** | 0 | "M1" | `/^(?:M1\|M2\|F1\|F2)$/` |
1322
+ | **speed** | 1 | 1.40 | `0.5 <= n <= 2.0` |
1323
+ | **steps** | 2 | 20 | `1 <= n <= 20` |
1324
+
1046
1325
  ### Any-to-Any Nodes
1047
1326
 
1048
1327
  The following nodes process any type of chunk, i.e., both audio and text chunks.
@@ -1064,8 +1343,8 @@ The following nodes process any type of chunk, i.e., both audio and text chunks.
1064
1343
  | Parameter | Position | Default | Requirement |
1065
1344
  | ------------ | --------- | -------- | --------------------- |
1066
1345
  | **type** | 0 | "audio" | `/^(?:audio\|text)$/` |
1067
- | **name** | 1 | "filter" | `/^.+$/` |
1068
- | **var** | 2 | "" | `/^(?:meta:.+\|payload:(?:length\|text)\|time:(?:start\|end))$/` |
1346
+ | **name** | 1 | "filter" | `/^.+?$/` |
1347
+ | **var** | 2 | "" | `/^(?:meta:.+\|payload:(?:length\|text)\|time:(?:start\|end)\|kind\|type)$/` |
1069
1348
  | **op** | 3 | "==" | `/^(?:<\|<=\|==\|!=\|~~\|!~\|>=\|>)$/` |
1070
1349
  | **val** | 4 | "" | `/^.*$/` |
1071
1350