speechflow 1.7.1 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177)
  1. package/CHANGELOG.md +24 -0
  2. package/README.md +388 -120
  3. package/etc/claude.md +5 -5
  4. package/etc/speechflow.yaml +2 -2
  5. package/package.json +3 -3
  6. package/speechflow-cli/dst/speechflow-main-api.js.map +1 -1
  7. package/speechflow-cli/dst/speechflow-main-cli.js +1 -0
  8. package/speechflow-cli/dst/speechflow-main-cli.js.map +1 -1
  9. package/speechflow-cli/dst/speechflow-main-graph.d.ts +1 -0
  10. package/speechflow-cli/dst/speechflow-main-graph.js +30 -9
  11. package/speechflow-cli/dst/speechflow-main-graph.js.map +1 -1
  12. package/speechflow-cli/dst/speechflow-main-nodes.js +1 -0
  13. package/speechflow-cli/dst/speechflow-main-nodes.js.map +1 -1
  14. package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js +1 -0
  15. package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js.map +1 -1
  16. package/speechflow-cli/dst/speechflow-node-a2a-compressor.js +7 -9
  17. package/speechflow-cli/dst/speechflow-node-a2a-compressor.js.map +1 -1
  18. package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js +1 -0
  19. package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js.map +1 -1
  20. package/speechflow-cli/dst/speechflow-node-a2a-expander.js +8 -9
  21. package/speechflow-cli/dst/speechflow-node-a2a-expander.js.map +1 -1
  22. package/speechflow-cli/dst/speechflow-node-a2a-filler.js +2 -0
  23. package/speechflow-cli/dst/speechflow-node-a2a-filler.js.map +1 -1
  24. package/speechflow-cli/dst/speechflow-node-a2a-gender.js +1 -1
  25. package/speechflow-cli/dst/speechflow-node-a2a-gender.js.map +1 -1
  26. package/speechflow-cli/dst/speechflow-node-a2a-meter.js +1 -1
  27. package/speechflow-cli/dst/speechflow-node-a2a-pitch.js +11 -9
  28. package/speechflow-cli/dst/speechflow-node-a2a-pitch.js.map +1 -1
  29. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise-wt.js +1 -0
  30. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise-wt.js.map +1 -1
  31. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise.js.map +1 -1
  32. package/speechflow-cli/dst/speechflow-node-a2a-speex.js +4 -2
  33. package/speechflow-cli/dst/speechflow-node-a2a-speex.js.map +1 -1
  34. package/speechflow-cli/dst/speechflow-node-a2a-vad.js +19 -22
  35. package/speechflow-cli/dst/speechflow-node-a2a-vad.js.map +1 -1
  36. package/speechflow-cli/dst/speechflow-node-a2a-wav.js +31 -4
  37. package/speechflow-cli/dst/speechflow-node-a2a-wav.js.map +1 -1
  38. package/speechflow-cli/dst/speechflow-node-a2t-amazon.d.ts +0 -1
  39. package/speechflow-cli/dst/speechflow-node-a2t-amazon.js +2 -11
  40. package/speechflow-cli/dst/speechflow-node-a2t-amazon.js.map +1 -1
  41. package/speechflow-cli/dst/speechflow-node-a2t-google.d.ts +16 -0
  42. package/speechflow-cli/dst/speechflow-node-a2t-google.js +314 -0
  43. package/speechflow-cli/dst/speechflow-node-a2t-google.js.map +1 -0
  44. package/speechflow-cli/dst/speechflow-node-a2t-openai.js +6 -1
  45. package/speechflow-cli/dst/speechflow-node-a2t-openai.js.map +1 -1
  46. package/speechflow-cli/dst/speechflow-node-t2a-amazon.d.ts +1 -1
  47. package/speechflow-cli/dst/speechflow-node-t2a-amazon.js +27 -7
  48. package/speechflow-cli/dst/speechflow-node-t2a-amazon.js.map +1 -1
  49. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.d.ts +1 -1
  50. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js +5 -3
  51. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js.map +1 -1
  52. package/speechflow-cli/dst/speechflow-node-t2a-google.d.ts +15 -0
  53. package/speechflow-cli/dst/speechflow-node-t2a-google.js +215 -0
  54. package/speechflow-cli/dst/speechflow-node-t2a-google.js.map +1 -0
  55. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.d.ts +1 -1
  56. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js +27 -6
  57. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js.map +1 -1
  58. package/speechflow-cli/dst/speechflow-node-t2a-openai.d.ts +15 -0
  59. package/speechflow-cli/dst/speechflow-node-t2a-openai.js +192 -0
  60. package/speechflow-cli/dst/speechflow-node-t2a-openai.js.map +1 -0
  61. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.d.ts +17 -0
  62. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js +619 -0
  63. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js.map +1 -0
  64. package/speechflow-cli/dst/speechflow-node-t2t-amazon.js +0 -2
  65. package/speechflow-cli/dst/speechflow-node-t2t-amazon.js.map +1 -1
  66. package/speechflow-cli/dst/speechflow-node-t2t-deepl.js.map +1 -1
  67. package/speechflow-cli/dst/speechflow-node-t2t-google.js.map +1 -1
  68. package/speechflow-cli/dst/{speechflow-node-t2t-transformers.d.ts → speechflow-node-t2t-opus.d.ts} +1 -3
  69. package/speechflow-cli/dst/speechflow-node-t2t-opus.js +161 -0
  70. package/speechflow-cli/dst/speechflow-node-t2t-opus.js.map +1 -0
  71. package/speechflow-cli/dst/speechflow-node-t2t-profanity.d.ts +11 -0
  72. package/speechflow-cli/dst/speechflow-node-t2t-profanity.js +118 -0
  73. package/speechflow-cli/dst/speechflow-node-t2t-profanity.js.map +1 -0
  74. package/speechflow-cli/dst/speechflow-node-t2t-punctuation.d.ts +13 -0
  75. package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js +220 -0
  76. package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js.map +1 -0
  77. package/speechflow-cli/dst/{speechflow-node-t2t-openai.d.ts → speechflow-node-t2t-spellcheck.d.ts} +2 -2
  78. package/speechflow-cli/dst/{speechflow-node-t2t-openai.js → speechflow-node-t2t-spellcheck.js} +48 -100
  79. package/speechflow-cli/dst/speechflow-node-t2t-spellcheck.js.map +1 -0
  80. package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js +8 -8
  81. package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js.map +1 -1
  82. package/speechflow-cli/dst/speechflow-node-t2t-summary.d.ts +16 -0
  83. package/speechflow-cli/dst/speechflow-node-t2t-summary.js +241 -0
  84. package/speechflow-cli/dst/speechflow-node-t2t-summary.js.map +1 -0
  85. package/speechflow-cli/dst/{speechflow-node-t2t-ollama.d.ts → speechflow-node-t2t-translate.d.ts} +2 -2
  86. package/speechflow-cli/dst/{speechflow-node-t2t-transformers.js → speechflow-node-t2t-translate.js} +53 -115
  87. package/speechflow-cli/dst/speechflow-node-t2t-translate.js.map +1 -0
  88. package/speechflow-cli/dst/speechflow-node-x2x-filter.js +2 -0
  89. package/speechflow-cli/dst/speechflow-node-x2x-filter.js.map +1 -1
  90. package/speechflow-cli/dst/speechflow-node-xio-exec.d.ts +12 -0
  91. package/speechflow-cli/dst/speechflow-node-xio-exec.js +224 -0
  92. package/speechflow-cli/dst/speechflow-node-xio-exec.js.map +1 -0
  93. package/speechflow-cli/dst/speechflow-node-xio-file.d.ts +1 -0
  94. package/speechflow-cli/dst/speechflow-node-xio-file.js +78 -67
  95. package/speechflow-cli/dst/speechflow-node-xio-file.js.map +1 -1
  96. package/speechflow-cli/dst/speechflow-node-xio-mqtt.js.map +1 -1
  97. package/speechflow-cli/dst/speechflow-node-xio-vban.d.ts +17 -0
  98. package/speechflow-cli/dst/speechflow-node-xio-vban.js +330 -0
  99. package/speechflow-cli/dst/speechflow-node-xio-vban.js.map +1 -0
  100. package/speechflow-cli/dst/speechflow-node-xio-webrtc.d.ts +39 -0
  101. package/speechflow-cli/dst/speechflow-node-xio-webrtc.js +502 -0
  102. package/speechflow-cli/dst/speechflow-node-xio-webrtc.js.map +1 -0
  103. package/speechflow-cli/dst/speechflow-node-xio-websocket.js +9 -9
  104. package/speechflow-cli/dst/speechflow-node-xio-websocket.js.map +1 -1
  105. package/speechflow-cli/dst/speechflow-util-audio.js +8 -5
  106. package/speechflow-cli/dst/speechflow-util-audio.js.map +1 -1
  107. package/speechflow-cli/dst/speechflow-util-error.d.ts +1 -0
  108. package/speechflow-cli/dst/speechflow-util-error.js +5 -0
  109. package/speechflow-cli/dst/speechflow-util-error.js.map +1 -1
  110. package/speechflow-cli/dst/speechflow-util-llm.d.ts +35 -0
  111. package/speechflow-cli/dst/speechflow-util-llm.js +363 -0
  112. package/speechflow-cli/dst/speechflow-util-llm.js.map +1 -0
  113. package/speechflow-cli/dst/speechflow-util-queue.js +2 -1
  114. package/speechflow-cli/dst/speechflow-util-queue.js.map +1 -1
  115. package/speechflow-cli/dst/speechflow-util.d.ts +1 -0
  116. package/speechflow-cli/dst/speechflow-util.js +2 -0
  117. package/speechflow-cli/dst/speechflow-util.js.map +1 -1
  118. package/speechflow-cli/etc/oxlint.jsonc +2 -1
  119. package/speechflow-cli/package.json +35 -18
  120. package/speechflow-cli/src/lib.d.ts +5 -0
  121. package/speechflow-cli/src/speechflow-main-api.ts +16 -16
  122. package/speechflow-cli/src/speechflow-main-cli.ts +1 -0
  123. package/speechflow-cli/src/speechflow-main-graph.ts +38 -14
  124. package/speechflow-cli/src/speechflow-main-nodes.ts +1 -0
  125. package/speechflow-cli/src/speechflow-node-a2a-compressor-wt.ts +1 -0
  126. package/speechflow-cli/src/speechflow-node-a2a-compressor.ts +8 -10
  127. package/speechflow-cli/src/speechflow-node-a2a-expander-wt.ts +1 -0
  128. package/speechflow-cli/src/speechflow-node-a2a-expander.ts +9 -10
  129. package/speechflow-cli/src/speechflow-node-a2a-filler.ts +2 -0
  130. package/speechflow-cli/src/speechflow-node-a2a-gender.ts +3 -3
  131. package/speechflow-cli/src/speechflow-node-a2a-meter.ts +2 -2
  132. package/speechflow-cli/src/speechflow-node-a2a-pitch.ts +11 -9
  133. package/speechflow-cli/src/speechflow-node-a2a-rnnoise-wt.ts +1 -0
  134. package/speechflow-cli/src/speechflow-node-a2a-rnnoise.ts +1 -1
  135. package/speechflow-cli/src/speechflow-node-a2a-speex.ts +5 -3
  136. package/speechflow-cli/src/speechflow-node-a2a-vad.ts +20 -23
  137. package/speechflow-cli/src/speechflow-node-a2a-wav.ts +31 -4
  138. package/speechflow-cli/src/speechflow-node-a2t-amazon.ts +6 -18
  139. package/speechflow-cli/src/speechflow-node-a2t-google.ts +315 -0
  140. package/speechflow-cli/src/speechflow-node-a2t-openai.ts +12 -7
  141. package/speechflow-cli/src/speechflow-node-t2a-amazon.ts +32 -10
  142. package/speechflow-cli/src/speechflow-node-t2a-elevenlabs.ts +6 -4
  143. package/speechflow-cli/src/speechflow-node-t2a-google.ts +203 -0
  144. package/speechflow-cli/src/speechflow-node-t2a-kokoro.ts +33 -10
  145. package/speechflow-cli/src/speechflow-node-t2a-openai.ts +176 -0
  146. package/speechflow-cli/src/speechflow-node-t2a-supertonic.ts +710 -0
  147. package/speechflow-cli/src/speechflow-node-t2t-amazon.ts +3 -4
  148. package/speechflow-cli/src/speechflow-node-t2t-deepl.ts +2 -2
  149. package/speechflow-cli/src/speechflow-node-t2t-google.ts +1 -1
  150. package/speechflow-cli/src/speechflow-node-t2t-opus.ts +137 -0
  151. package/speechflow-cli/src/speechflow-node-t2t-profanity.ts +93 -0
  152. package/speechflow-cli/src/speechflow-node-t2t-punctuation.ts +201 -0
  153. package/speechflow-cli/src/speechflow-node-t2t-spellcheck.ts +188 -0
  154. package/speechflow-cli/src/speechflow-node-t2t-subtitle.ts +8 -8
  155. package/speechflow-cli/src/speechflow-node-t2t-summary.ts +229 -0
  156. package/speechflow-cli/src/speechflow-node-t2t-translate.ts +181 -0
  157. package/speechflow-cli/src/speechflow-node-x2x-filter.ts +2 -0
  158. package/speechflow-cli/src/speechflow-node-xio-exec.ts +211 -0
  159. package/speechflow-cli/src/speechflow-node-xio-file.ts +91 -80
  160. package/speechflow-cli/src/speechflow-node-xio-mqtt.ts +2 -2
  161. package/speechflow-cli/src/speechflow-node-xio-vban.ts +325 -0
  162. package/speechflow-cli/src/speechflow-node-xio-webrtc.ts +535 -0
  163. package/speechflow-cli/src/speechflow-node-xio-websocket.ts +9 -9
  164. package/speechflow-cli/src/speechflow-util-audio.ts +10 -5
  165. package/speechflow-cli/src/speechflow-util-error.ts +9 -0
  166. package/speechflow-cli/src/speechflow-util-llm.ts +367 -0
  167. package/speechflow-cli/src/speechflow-util-queue.ts +3 -3
  168. package/speechflow-cli/src/speechflow-util.ts +2 -0
  169. package/speechflow-ui-db/package.json +9 -9
  170. package/speechflow-ui-st/package.json +9 -9
  171. package/speechflow-cli/dst/speechflow-node-t2t-ollama.js +0 -293
  172. package/speechflow-cli/dst/speechflow-node-t2t-ollama.js.map +0 -1
  173. package/speechflow-cli/dst/speechflow-node-t2t-openai.js.map +0 -1
  174. package/speechflow-cli/dst/speechflow-node-t2t-transformers.js.map +0 -1
  175. package/speechflow-cli/src/speechflow-node-t2t-ollama.ts +0 -281
  176. package/speechflow-cli/src/speechflow-node-t2t-openai.ts +0 -247
  177. package/speechflow-cli/src/speechflow-node-t2t-transformers.ts +0 -247
package/README.md CHANGED
@@ -26,7 +26,8 @@ speech-to-speech).
26
26
  **SpeechFlow** comes with built-in graph nodes for various functionalities:
27
27
 
28
28
  - file and audio device I/O for local connectivity,
29
- - WebSocket and MQTT network I/O for remote connectivity,
29
+ - WebSocket, MQTT, VBAN, and WebRTC network I/O for remote connectivity,
30
+ - external command execution I/O for process integration,
30
31
  - local Voice Activity Detection (VAD),
31
32
  - local voice gender recognition,
32
33
  - local audio LUFS-S/RMS metering,
@@ -38,20 +39,27 @@ speech-to-speech).
38
39
  - remote-controllable audio muting,
39
40
  - cloud-based speech-to-text conversion with
40
41
  [Amazon Transcribe](https://aws.amazon.com/transcribe/),
41
- [OpenAI GPT-Transcribe](https://platform.openai.com/docs/models/gpt-4o-mini-transcribe), or
42
- [Deepgram](https://deepgram.com).
42
+ [OpenAI GPT-Transcribe](https://platform.openai.com/docs/models/gpt-4o-mini-transcribe),
43
+ [Deepgram](https://deepgram.com), or
44
+ [Google Cloud Speech-to-Text](https://cloud.google.com/speech-to-text).
43
45
  - cloud-based text-to-text translation (or spelling correction) with
44
46
  [DeepL](https://deepl.com),
45
47
  [Amazon Translate](https://aws.amazon.com/translate/),
46
- [Google Cloud Translate](https://cloud.google.com/translate), or
47
- [OpenAI GPT](https://openai.com).
48
+ [Google Cloud Translate](https://cloud.google.com/translate),
49
+ [OpenAI GPT](https://openai.com),
50
+ [Anthropic Claude](https://anthropic.com), or
51
+ [Google Gemini](https://ai.google.dev).
48
52
  - local text-to-text translation (or spelling correction) with
49
- [Ollama/Gemma](https://ollama.com) or
50
- [Transformers/OPUS](https://github.com/Helsinki-NLP/Opus-MT).
53
+ [Ollama](https://ollama.com) or
54
+ [OPUS-MT](https://github.com/Helsinki-NLP/Opus-MT).
51
55
  - cloud-based text-to-speech conversion with
52
- [ElevenLabs](https://elevenlabs.io/) or
53
- [Amazon Polly](https://aws.amazon.com/polly/).
54
- - local text-to-speech conversion with [Kokoro](https://github.com/nazdridoy/kokoro-tts).
56
+ [OpenAI TTS](https://platform.openai.com/docs/guides/text-to-speech),
57
+ [ElevenLabs](https://elevenlabs.io/),
58
+ [Amazon Polly](https://aws.amazon.com/polly/), or
59
+ [Google Cloud Text-to-Speech](https://cloud.google.com/text-to-speech).
60
+ - local text-to-speech conversion with
61
+ [Kokoro](https://github.com/nazdridoy/kokoro-tts) or
62
+ [Supertonic](https://huggingface.co/Supertone/supertonic).
55
63
  - local [FFmpeg](https://ffmpeg.org/)-based speech-to-speech conversion,
56
64
  - local WAV speech-to-speech decoding/encoding,
57
65
  - local text-to-text formatting, regex-based modification,
@@ -221,8 +229,8 @@ They can also be found in the sample [speechflow.yaml](./etc/speechflow.yaml) fi
221
229
 
222
230
  ```
223
231
  xio-device(device: env.SPEECHFLOW_DEVICE_MIC, mode: "r") |
224
- a2a-wav(mode: "encode") |
225
- xio-file(path: "capture.wav", mode: "w", type: "audio")
232
+ a2a-wav(mode: "encode", seekable: true) |
233
+ xio-file(path: "capture.wav", mode: "w", type: "audio", seekable: true)
226
234
  ```
227
235
 
228
236
  - **Pass-Through**: Pass-through audio from microphone device to speaker
@@ -335,7 +343,10 @@ First a short overview of the available processing nodes:
335
343
  **xio-file**,
336
344
  **xio-device**,
337
345
  **xio-websocket**,
338
- **xio-mqtt**.
346
+ **xio-mqtt**,
347
+ **xio-vban**,
348
+ **xio-webrtc**,
349
+ **xio-exec**.
339
350
  - Audio-to-Audio nodes:
340
351
  **a2a-ffmpeg**,
341
352
  **a2a-wav**,
@@ -353,22 +364,29 @@ First a short overview of the available processing nodes:
353
364
  - Audio-to-Text nodes:
354
365
  **a2t-openai**,
355
366
  **a2t-amazon**,
356
- **a2t-deepgram**.
367
+ **a2t-deepgram**,
368
+ **a2t-google**.
357
369
  - Text-to-Text nodes:
358
370
  **t2t-deepl**,
359
371
  **t2t-amazon**,
360
- **t2t-openai**,
361
- **t2t-ollama**,
362
- **t2t-transformers**,
372
+ **t2t-opus**,
363
373
  **t2t-google**,
374
+ **t2t-translate**,
375
+ **t2t-spellcheck**,
376
+ **t2t-punctuation**,
364
377
  **t2t-modify**,
378
+ **t2t-profanity**,
379
+ **t2t-summary**,
365
380
  **t2t-subtitle**,
366
381
  **t2t-format**,
367
382
  **t2t-sentence**.
368
383
  - Text-to-Audio nodes:
384
+ **t2a-openai**,
369
385
  **t2a-amazon**,
370
386
  **t2a-elevenlabs**,
371
- **t2a-kokoro**.
387
+ **t2a-google**,
388
+ **t2a-kokoro**,
389
+ **t2a-supertonic**.
372
390
  - Any-to-Any nodes:
373
391
  **x2x-filter**,
374
392
  **x2x-trace**.
@@ -384,20 +402,24 @@ external files, devices and network services.
384
402
 
385
403
  > This node allows the reading/writing from/to files or from StdIO. It
386
404
  > is intended to be used as source and sink nodes in batch processing,
387
- > and as sing nodes in real-time processing.
405
+ > and as sink nodes in real-time processing. When `seekable` is enabled
406
+ > for write mode, the node uses a file descriptor allowing random access
407
+ > writes to specific file positions via the `chunk:seek` metadata field.
408
+ > Option `seekable` cannot be used on StdIO.
388
409
 
389
410
  | Port | Payload |
390
411
  | ------- | ----------- |
391
412
  | input | text, audio |
392
413
  | output | text, audio |
393
414
 
394
- | Parameter | Position | Default | Requirement |
395
- | ---------- | --------- | -------- | --------------------- |
396
- | **path** | 0 | *none* | *none* |
397
- | **mode** | 1 | "r" | `/^(?:r\|w\|rw)$/` |
398
- | **type** | 2 | "audio" | `/^(?:audio\|text)$/` |
399
- | **chunka** | | 200 | `10 <= n <= 1000` |
400
- | **chunkt** | | 65536 | `1024 <= n <= 131072` |
415
+ | Parameter | Position | Default | Requirement |
416
+ | -------------- | --------- | -------- | --------------------- |
417
+ | **path** | 0 | *none* | *none* |
418
+ | **mode** | 1 | "r" | `/^(?:r\|w)$/` |
419
+ | **type** | 2 | "audio" | `/^(?:audio\|text)$/` |
420
+ | **seekable** | | false | *none* |
421
+ | **chunkAudio** | | 200 | `10 <= n <= 1000` |
422
+ | **chunkText** | | 65536 | `1024 <= n <= 131072` |
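To illustrate the renamed `chunkAudio`/`chunkText` parameters, a minimal sketch (file names hypothetical) that reads a text file in 4096-byte chunks, splits it into sentences, and writes the result back out could look like this:

```
xio-file(path: "raw.txt", mode: "r", type: "text", chunkText: 4096) |
t2t-sentence() |
xio-file(path: "sentences.txt", mode: "w", type: "text")
```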
401
423
 
402
424
  - Node: **xio-device**<br/>
403
425
  Purpose: **Microphone/speaker device source/sink**<br/>
@@ -464,6 +486,82 @@ external files, devices and network services.
464
486
  | **mode** | 5 | "w" | `/^(?:r\|w\|rw)$/` |
465
487
  | **type** | 6 | "text" | `/^(?:audio\|text)$/` |
466
488
 
489
+ - Node: **xio-vban**<br/>
490
+ Purpose: **VBAN network audio source/sink**<br/>
491
+ Example: `xio-vban(listen: 6980, stream: "Stream1", mode: "r")`<br/>
492
+ Notice: this node requires a peer VBAN-compatible application!
493
+
494
+ > This node allows reading/writing audio from/to VBAN (VB-Audio
495
+ > Network) protocol endpoints. It is intended to be used for
496
+ > real-time audio streaming with applications like VoiceMeeter,
497
+ > VB-Audio Matrix, or other VBAN-compatible software. It supports
498
+ > various audio bit resolutions (8-bit, 16-bit, 24-bit, 32-bit,
499
+ > float32, float64) and automatic channel downmixing to mono.
500
+
501
+ | Port | Payload |
502
+ | ------- | ----------- |
503
+ | input | audio |
504
+ | output | audio |
505
+
506
+ | Parameter | Position | Default | Requirement |
507
+ | ----------- | --------- | --------- | ---------------------------- |
508
+ | **listen** | 0 | "" | `/^(?:\|\d+\|.+?:\d+)$/` |
509
+ | **connect** | 1 | "" | `/^(?:\|.+?:\d+)$/` |
510
+ | **stream** | 2 | "Stream" | `/^.{1,16}$/` |
511
+ | **mode** | 3 | "rw" | `/^(?:r\|w\|rw)$/` |
512
+
513
+ - Node: **xio-webrtc**<br/>
514
+ Purpose: **WebRTC audio streaming source (WHIP) or sink (WHEP)**<br/>
515
+ Example: `xio-webrtc(listen: 8085, path: "/webrtc", mode: "r")`
516
+
517
+ > This node allows real-time audio streaming using WebRTC technology
518
+ > via WebRTC-HTTP Ingestion Protocol (WHIP) or WebRTC-HTTP Egress
519
+ > Protocol (WHEP). It provides an HTTP server for SDP negotiation
520
+ > and uses Opus codec for audio encoding/decoding at 48kHz. The node
521
+ > can operate in WHIP mode (i.e., read mode where publishers POST
522
+ > SDP offers to SpeechFlow and SpeechFlow receives audio stream from
523
+ > them) or WHEP mode (i.e., write mode where viewers POST SDP offers
524
+ > to SpeechFlow and SpeechFlow sends audio stream to them). This node
525
+ > supports multiple simultaneous connections, configurable ICE servers
526
+ > for NAT traversal, and automatic connection lifecycle management.
527
+
528
+ | Port | Payload |
529
+ | ------- | ----------- |
530
+ | input | audio |
531
+ | output | audio |
532
+
533
+ | Parameter | Position | Default | Requirement |
534
+ | -------------- | --------- | --------- | ---------------------------- |
535
+ | **listen** | 0 | "8085" | `/^(?:\d+\|.+?:\d+)$/` |
536
+ | **path** | 1 | "/webrtc" | `/^\/.+$/` |
537
+ | **mode** | 2 | "r" | `/^(?:r\|w)$/` |
538
+ | **iceServers** | 3 | "" | `/^.*$/` |
539
+
540
+ - Node: **xio-exec**<br/>
541
+ Purpose: **External command execution source/sink**<br/>
542
+ Example: `xio-exec(command: "ffmpeg -i - -f s16le -", mode: "rw", type: "audio")`
543
+
544
+ > This node allows reading/writing from/to external commands via stdin/stdout.
545
+ > It executes arbitrary commands and pipes audio or text data through them,
546
+ > enabling integration with external processing tools. The node supports
547
+ > read-only mode (capturing stdout), write-only mode (sending to stdin),
548
+ > and bidirectional mode (both stdin and stdout). This is useful for integrating
549
+ > external audio/text processing tools like FFmpeg, SoX, or custom scripts into
550
+ > the SpeechFlow pipeline.
551
+
552
+ | Port | Payload |
553
+ | ------- | ----------- |
554
+ | input | text, audio |
555
+ | output | text, audio |
556
+
557
+ | Parameter | Position | Default | Requirement |
558
+ | -------------- | --------- | -------- | --------------------- |
559
+ | **command** | 0 | *none* | *required* |
560
+ | **mode** | 1 | "r" | `/^(?:r\|w\|rw)$/` |
561
+ | **type** | 2 | "audio" | `/^(?:audio\|text)$/` |
562
+ | **chunkAudio** | | 200 | `10 <= n <= 1000` |
563
+ | **chunkText** | | 65536 | `1024 <= n <= 131072` |
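As a usage illustration for these new network and process I/O nodes, a minimal sketch (file name hypothetical, a VBAN peer assumed to be streaming to port 6980) that records an incoming VBAN stream into a seekable WAV file could look like this:

```
xio-vban(listen: 6980, stream: "Stream1", mode: "r") |
a2a-wav(mode: "encode", seekable: true) |
xio-file(path: "vban-capture.wav", mode: "w", type: "audio", seekable: true)
```

An `xio-webrtc(mode: "r")` or `xio-exec(command: "...", mode: "r")` source could presumably be substituted for the first node in the same way.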
564
+
467
565
  ### Audio-to-Audio Nodes
468
566
 
469
567
  The following nodes process audio chunks only.
@@ -481,10 +579,10 @@ The following nodes process audio chunks only.
481
579
  | input | audio |
482
580
  | output | audio |
483
581
 
484
- | Parameter | Position | Default | Requirement |
485
- | ----------- | --------- | -------- | ------------------ |
486
- | **src** | 0 | "pcm" | `/^(?:pcm\|wav\|mp3\|opus)$/` |
487
- | **dst** | 1 | "wav" | `/^(?:pcm\|wav\|mp3\|opus)$/` |
582
+ | Parameter | Position | Default | Requirement |
583
+ | --------- | --------- | -------- | ------------------ |
584
+ | **src** | 0 | "pcm" | `/^(?:pcm\|wav\|mp3\|opus)$/` |
585
+ | **dst** | 1 | "wav" | `/^(?:pcm\|wav\|mp3\|opus)$/` |
488
586
 
489
587
  - Node: **a2a-wav**<br/>
490
588
  Purpose: **WAV audio format conversion**<br/>
@@ -493,15 +591,20 @@ The following nodes process audio chunks only.
493
591
  > This node allows converting between PCM and WAV audio formats. It is
494
592
  > primarily intended to support the reading/writing of external WAV
495
593
  > format files, although SpeechFlow internally uses PCM format only.
594
+ > When `seekable` is enabled in encode mode, the node writes a corrected
595
+ > WAV header at the end of processing with accurate file size information
596
+ > by seeking back to position 0, producing standard-compliant WAV files.
597
+ > Option `seekable` requires a seekable output stream.
496
598
 
497
599
  | Port | Payload |
498
600
  | ------- | ----------- |
499
601
  | input | audio |
500
602
  | output | audio |
501
603
 
502
- | Parameter | Position | Default | Requirement |
503
- | ----------- | --------- | -------- | ------------------------ |
504
- | **mode** | 0 | "encode" | `/^(?:encode\|decode)$/` |
604
+ | Parameter | Position | Default | Requirement |
605
+ | ------------ | --------- | -------- | ------------------------ |
606
+ | **mode** | 0 | "encode" | `/^(?:encode\|decode)$/` |
607
+ | **seekable** | 1 | false | *none* |
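For the decode direction, a minimal sketch that replays a previously captured WAV file could look like this (SPEECHFLOW_DEVICE_SPK is a hypothetical speaker-device environment variable, analogous to the SPEECHFLOW_DEVICE_MIC variable used in the earlier examples):

```
xio-file(path: "capture.wav", mode: "r", type: "audio") |
a2a-wav(mode: "decode") |
xio-device(device: env.SPEECHFLOW_DEVICE_SPK, mode: "w")
```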
505
608
 
506
609
  - Node: **a2a-mute**<br/>
507
610
  Purpose: **volume muting node**<br/>
@@ -516,8 +619,8 @@ The following nodes process audio chunks only.
516
619
  | input | audio |
517
620
  | output | audio |
518
621
 
519
- | Parameter | Position | Default | Requirement |
520
- | ----------- | --------- | -------- | ------------------------ |
622
+ | Parameter | Position | Default | Requirement |
623
+ | --------- | --------- | -------- | ------------------------ |
521
624
 
522
625
  - Node: **a2a-meter**<br/>
523
626
  Purpose: **Loudness metering node**<br/>
@@ -552,8 +655,8 @@ The following nodes process audio chunks only.
552
655
  | input | audio |
553
656
  | output | audio |
554
657
 
555
- | Parameter | Position | Default | Requirement |
556
- | ----------- | --------- | -------- | ------------------------ |
658
+ | Parameter | Position | Default | Requirement |
659
+ | --------- | --------- | -------- | ------------------------ |
557
660
  | **mode** | *none* | "unplugged" | `/^(?:silenced\|unplugged)$/` |
558
661
  | **posSpeechThreshold** | *none* | 0.50 | *none* |
559
662
  | **negSpeechThreshold** | *none* | 0.35 | *none* |
@@ -584,7 +687,7 @@ The following nodes process audio chunks only.
584
687
 
585
688
  - Node: **a2a-speex**<br/>
586
689
  Purpose: **Speex Noise Suppression node**<br/>
587
- Example: `a2a-speex(attentuate: -18)`
690
+ Example: `a2a-speex(attenuate: -18)`
588
691
 
589
692
  > This node uses the Speex DSP pre-processor to perform noise
590
693
  > suppression, i.e., it detects and attenuates (by a certain level of
@@ -595,9 +698,9 @@ The following nodes process audio chunks only.
595
698
  | input | audio |
596
699
  | output | audio |
597
700
 
598
- | Parameter | Position | Default | Requirement |
599
- | ----------- | --------- | -------- | ------------------------ |
600
- | **attentuate** | 0 | -18 | *none* | `-60 <= n <= 0` |
701
+ | Parameter | Position | Default | Requirement |
702
+ | ------------- | --------- | -------- | ------------------ |
703
+ | **attenuate** | 0 | -18 | `-60 <= n <= 0` |
601
704
 
602
705
  - Node: **a2a-rnnoise**<br/>
603
706
  Purpose: **RNNoise Noise Suppression node**<br/>
@@ -611,8 +714,8 @@ The following nodes process audio chunks only.
611
714
  | input | audio |
612
715
  | output | audio |
613
716
 
614
- | Parameter | Position | Default | Requirement |
615
- | ----------- | --------- | -------- | ------------------------ |
717
+ | Parameter | Position | Default | Requirement |
718
+ | --------- | --------- | -------- | ------------------------ |
616
719
 
617
720
  - Node: **a2a-compressor**<br/>
618
721
  Purpose: **audio compressor node**<br/>
@@ -650,15 +753,15 @@ The following nodes process audio chunks only.
650
753
  | input | audio |
651
754
  | output | audio |
652
755
 
653
- | Parameter | Position | Default | Requirement |
654
- | --------------- | --------- | -------- | ------------------------ |
655
- | **thresholdDb** | *none* | -45 | `n <= 0 && n >= -100`|
656
- | **floorDb** | *none* | -64 | `n <= 0 && n >= -100`|
657
- | **ratio** | *none* | 4.0 | `n >= 1 && n <= 20` |
658
- | **attackMs** | *none* | 10 | `n >= 0 && n <= 1000`|
659
- | **releaseMs** | *none* | 50 | `n >= 0 && n <= 1000`|
660
- | **kneeDb** | *none* | 6.0 | `n >= 0 && n <= 40` |
661
- | **makeupDb** | *none* | 0 | `n >= -24 && n <= 24`|
756
+ | Parameter | Position | Default | Requirement |
757
+ | --------------- | --------- | -------- | --------------------- |
758
+ | **thresholdDb** | *none* | -45 | `n <= 0 && n >= -100` |
759
+ | **floorDb** | *none* | -64 | `n <= 0 && n >= -100` |
760
+ | **ratio** | *none* | 4.0 | `n >= 1 && n <= 20` |
761
+ | **attackMs** | *none* | 10 | `n >= 0 && n <= 1000` |
762
+ | **releaseMs** | *none* | 50 | `n >= 0 && n <= 1000` |
763
+ | **kneeDb** | *none* | 6.0 | `n >= 0 && n <= 40` |
764
+ | **makeupDb** | *none* | 0 | `n >= -24 && n <= 24` |
662
765
 
663
766
  - Node: **a2a-gain**<br/>
664
767
  Purpose: **audio gain adjustment node**<br/>
@@ -672,9 +775,9 @@ The following nodes process audio chunks only.
672
775
  | input | audio |
673
776
  | output | audio |
674
777
 
675
- | Parameter | Position | Default | Requirement |
676
- | ----------- | --------- | -------- | ------------------------ |
677
- | **db** | 0 | 0 | `n >= -60 && n <= 60` |
778
+ | Parameter | Position | Default | Requirement |
779
+ | --------- | --------- | -------- | --------------------- |
780
+ | **db** | 0 | 0 | `n >= -60 && n <= 60` |
678
781
 
679
782
  - Node: **a2a-pitch**<br/>
680
783
  Purpose: **audio pitch shifting and time stretching**<br/>
@@ -710,9 +813,9 @@ The following nodes process audio chunks only.
710
813
  | input | audio |
711
814
  | output | audio |
712
815
 
713
- | Parameter | Position | Default | Requirement |
714
- | ----------- | --------- | -------- | ------------------------ |
715
- | **segment** | 0 | 50 | `n >= 10 && n <= 1000` |
816
+ | Parameter | Position | Default | Requirement |
817
+ | ----------- | --------- | -------- | ---------------------- |
818
+ | **segment** | 0 | 50 | `n >= 10 && n <= 1000` |
716
819
 
717
820
  ### Audio-to-Text Nodes
718
821
 
@@ -785,6 +888,28 @@ The following nodes convert audio to text chunks.
785
888
  | **language** | 2 | "multi" | *none* |
786
889
  | **interim** | 3 | false | *none* |
787
890
 
891
+ - Node: **a2t-google**<br/>
892
+ Purpose: **Google Cloud Speech-to-Text conversion**<br/>
893
+ Example: `a2t-google(language: "en-US")`<br/>
894
+ Notice: this node requires a Google Cloud API key!
895
+
896
+ > This node uses Google Cloud Speech-to-Text to perform Speech-to-Text (S2T)
897
+ > conversion, i.e., it recognizes speech in the input audio stream and
898
+ > outputs a corresponding text stream. It supports various languages
899
+ > and models, including the `latest_long` model for long-form audio.
900
+
901
+ | Port | Payload |
902
+ | ------- | ----------- |
903
+ | input | audio |
904
+ | output | text |
905
+
906
+ | Parameter | Position | Default | Requirement |
907
+ | ------------ | --------- | ------------- | ------------ |
908
+ | **key** | *none* | env.SPEECHFLOW\_GOOGLE\_KEY | *none* |
909
+ | **model** | 0 | "latest_long" | *none* |
910
+ | **language** | 1 | "en-US" | *none* |
911
+ | **interim** | 2 | false | *none* |
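A minimal transcription sketch for this node (output file name hypothetical, SPEECHFLOW_GOOGLE_KEY assumed to be set in the environment) could look like this:

```
xio-device(device: env.SPEECHFLOW_DEVICE_MIC, mode: "r") |
a2t-google(language: "en-US") |
xio-file(path: "transcript.txt", mode: "w", type: "text")
```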
912
+
788
913
  ### Text-to-Text Nodes
789
914
 
790
915
  The following nodes process text chunks only.
@@ -794,73 +919,65 @@ The following nodes process text chunks only.
794
919
  Example: `t2t-deepl(src: "de", dst: "en")`<br/>
795
920
  Notice: this node requires an API key!
796
921
 
797
- > This node performs translation between English and German languages.
922
+ > This node performs translation between multiple languages.
798
923
 
799
924
  | Port | Payload |
800
925
  | ------- | ----------- |
801
926
  | input | text |
802
927
  | output | text |
803
928
 
804
- | Parameter | Position | Default | Requirement |
805
- | ------------ | --------- | -------- | ------------------ |
806
- | **key** | *none* | env.SPEECHFLOW\_DEEPL\_KEY | *none* |
807
- | **src** | 0 | "de" | `/^(?:de\|en)$/` |
808
- | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
929
+ | Parameter | Position | Default | Requirement |
930
+ | ------------ | --------- | ---------- | ----------------------------- |
931
+ | **key** | *none* | env.SPEECHFLOW\_DEEPL\_KEY | *none* |
932
+ | **src** | 0 | "de" | `/^(?:de\|en\|fr\|it)$/` |
933
+ | **dst** | 1 | "en" | `/^(?:de\|en\|fr\|it)$/` |
934
+ | **optimize** | 2 | "latency" | `/^(?:latency\|quality)$/` |
809
935
 
810
936
  - Node: **t2t-amazon**<br/>
811
937
  Purpose: **AWS Translate Text-to-Text translation**<br/>
812
938
  Example: `t2t-amazon(src: "de", dst: "en")`<br/>
813
939
  Notice: this node requires an API key!
814
940
 
815
- > This node performs translation between English and German languages.
941
+ > This node performs translation between multiple languages.
816
942
 
817
943
  | Port | Payload |
818
944
  | ------- | ----------- |
819
945
  | input | text |
820
946
  | output | text |
821
947
 
822
- | Parameter | Position | Default | Requirement |
823
- | ------------ | --------- | -------- | ------------------ |
824
- | **key** | *none* | env.SPEECHFLOW\_AMAZON\_KEY | *none* |
948
+ | Parameter | Position | Default | Requirement |
949
+ | ------------ | --------- | -------- | ---------------------------- |
950
+ | **key** | *none* | env.SPEECHFLOW\_AMAZON\_KEY | *none* |
825
951
  | **secKey** | *none* | env.SPEECHFLOW\_AMAZON\_KEY\_SEC | *none* |
826
- | **region** | *none* | "eu-central-1" | *none* |
827
- | **src** | 0 | "de" | `/^(?:de\|en)$/` |
828
- | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
952
+ | **region** | *none* | "eu-central-1" | *none* |
953
+ | **src** | 0 | "de" | `/^(?:de\|en\|fr\|it)$/` |
954
+ | **dst** | 1 | "en" | `/^(?:de\|en\|fr\|it)$/` |
829
955
 
830
- - Node: **t2t-openai**<br/>
831
- Purpose: **OpenAI/GPT Text-to-Text translation and spelling correction**<br/>
832
- Example: `t2t-openai(src: "de", dst: "en")`<br/>
833
- Notice: this node requires an OpenAI API key!
956
+ - Node: **t2t-opus**<br/>
957
+ Purpose: **OPUS-MT Text-to-Text translation**<br/>
958
+ Example: `t2t-opus(src: "de", dst: "en")`<br/>
834
959
 
835
960
  > This node performs translation between English and German languages
836
- > in the text stream or (if the source and destination language is
837
- > the same) spellchecking of English or German languages in the text
838
- > stream. It is based on the remote OpenAI cloud AI service and uses
839
- > the GPT-4o-mini LLM.
961
+ > in the text stream. It is based on the local OPUS-MT translation model.
840
962
 
841
963
  | Port | Payload |
842
964
  | ------- | ----------- |
843
965
  | input | text |
844
966
  | output | text |
845
967
 
846
- | Parameter | Position | Default | Requirement |
847
- | ------------ | --------- | -------- | ------------------ |
848
- | **api** | *none* | "https://api.openai.com" | `/^https?:\/\/.+?:\d+$/` |
968
+ | Parameter | Position | Default | Requirement |
969
+ | ------------ | --------- | -------- | ---------------- |
849
970
  | **src** | 0 | "de" | `/^(?:de\|en)$/` |
850
971
  | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
851
- | **key** | *none* | env.SPEECHFLOW\_OPENAI\_KEY | *none* |
852
- | **model** | *none* | "gpt-5-mini" | *none* |
853
972
 
854
- - Node: **t2t-ollama**<br/>
855
- Purpose: **Ollama/Gemma Text-to-Text translation and spelling correction**<br/>
856
- Example: `t2t-ollama(src: "de", dst: "en")`<br/>
857
- Notice: this node requires Ollama to be installed!
973
+ - Node: **t2t-google**<br/>
974
+ Purpose: **Google Cloud Translate Text-to-Text translation**<br/>
975
+ Example: `t2t-google(src: "de", dst: "en")`<br/>
976
+ Notice: this node requires a Google Cloud API key and project ID!
858
977
 
859
- > This node performs translation between English and German languages
860
- > in the text stream or (if the source and destination language is
861
- > the same) spellchecking of English or German languages in the text
862
- > stream. It is based on the local Ollama AI service and uses the
863
- > Google Gemma 3 LLM.
978
+ > This node performs translation between multiple languages
979
+ > in the text stream using Google Cloud Translate API.
980
+ > It supports German, English, French, and Italian languages.
864
981
 
865
982
  | Port | Payload |
866
983
  | ------- | ----------- |
@@ -869,48 +986,83 @@ The following nodes process text chunks only.
869
986
 
870
987
  | Parameter | Position | Default | Requirement |
871
988
  | ------------ | --------- | -------- | ------------------ |
872
- | **api** | *none* | "http://127.0.0.1:11434" | `/^https?:\/\/.+?:\d+$/` |
873
- | **model** | *none* | "gemma3:4b-it-q4_K_M" | *none* |
874
- | **src** | 0 | "de" | `/^(?:de\|en)$/` |
875
- | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
989
+ | **key** | *none* | env.SPEECHFLOW\_GOOGLE\_KEY | *none* |
990
+ | **src** | 0 | "de" | `/^(?:de\|en\|fr\|it)$/` |
991
+ | **dst** | 1 | "en" | `/^(?:de\|en\|fr\|it)$/` |
876
992
 
877
- - Node: **t2t-transformers**<br/>
878
- Purpose: **Transformers Text-to-Text translation**<br/>
879
- Example: `t2t-transformers(src: "de", dst: "en")`<br/>
993
+ - Node: **t2t-translate**<br/>
994
+ Purpose: **LLM-based Text-to-Text translation**<br/>
995
+ Example: `t2t-translate(src: "de", dst: "en")`<br/>
996
+ Notice: this node requires an LLM provider (Ollama by default, or cloud-based OpenAI/Anthropic/Google, or local HuggingFace Transformers)!
880
997
 
881
998
  > This node performs translation between English and German languages
882
- > in the text stream. It is based on local OPUS or SmolLM3 LLMs.
999
+ > in the text stream using an LLM service. Multiple LLM providers are
1000
+ > supported: local Ollama (default), local HuggingFace Transformers,
1001
+ > or cloud-based OpenAI, Anthropic, or Google.
883
1002
 
884
1003
  | Port | Payload |
885
1004
  | ------- | ----------- |
886
1005
  | input | text |
887
1006
  | output | text |
888
1007
 
889
- | Parameter | Position | Default | Requirement |
890
- | ------------ | --------- | -------- | ---------------- |
891
- | **model** | *none* | "OPUS" | `/^(?:OPUS\|SmolLM3)$/` |
892
- | **src** | 0 | "de" | `/^(?:de\|en)$/` |
893
- | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
1008
+ | Parameter | Position | Default | Requirement |
1009
+ | ------------ | --------- | ------------------------ | ---------------------------------------- |
1010
+ | **src** | 0 | "de" | `/^(?:de\|en)$/` |
1011
+ | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
1012
+ | **provider** | *none* | "ollama" | `/^(?:openai\|anthropic\|google\|ollama\|transformers)$/` |
1013
+ | **api** | *none* | "http://127.0.0.1:11434" | `/^https?:\/\/.+?(:\d+)?$/` |
1014
+ | **model** | *none* | "gemma3:4b-it-q4\_K\_M" | *none* |
1015
+ | **key** | *none* | "" | *none* |
1016
+
1017
+ - Node: **t2t-spellcheck**<br/>
1018
+ Purpose: **LLM-based Text-to-Text spellchecking**<br/>
1019
+ Example: `t2t-spellcheck(lang: "en")`<br/>
1020
+ Notice: this node requires an LLM provider (Ollama by default, or cloud-based OpenAI/Anthropic/Google, or local HuggingFace Transformers)!
1021
+
1022
+ > This node performs spellchecking of English or German text using an
1023
+ > LLM service. It corrects spelling mistakes, adds missing punctuation,
1024
+ > but preserves grammar and word choice. Multiple LLM providers are
1025
+ > supported: local Ollama (default), local HuggingFace Transformers,
1026
+ > or cloud-based OpenAI, Anthropic, or Google.
894
1027
 
895
- - Node: **t2t-google**<br/>
896
- Purpose: **Google Cloud Translate Text-to-Text translation**<br/>
897
- Example: `t2t-google(src: "de", dst: "en")`<br/>
898
- Notice: this node requires a Google Cloud API key and project ID!
1028
+ | Port | Payload |
1029
+ | ------- | ----------- |
1030
+ | input | text |
1031
+ | output | text |
899
1032
 
900
- > This node performs translation between multiple languages
901
- > in the text stream using Google Cloud Translate API.
902
- > It supports German, English, French, and Italian languages.
1033
+ | Parameter | Position | Default | Requirement |
1034
+ | ------------ | --------- | ------------------------ | ---------------------------------------- |
1035
+ | **lang** | 0 | "en" | `/^(?:en\|de)$/` |
1036
+ | **provider** | *none* | "ollama" | `/^(?:openai\|anthropic\|google\|ollama\|transformers)$/` |
1037
+ | **api** | *none* | "http://127.0.0.1:11434" | `/^https?:\/\/.+?(:\d+)?$/` |
1038
+ | **model** | *none* | "gemma3:4b-it-q4\_K\_M" | *none* |
1039
+ | **key** | *none* | "" | *none* |
1040
+
1041
+ - Node: **t2t-punctuation**<br/>
1042
+ Purpose: **LLM-based punctuation restoration**<br/>
1043
+ Example: `t2t-punctuation(lang: "en")`<br/>
1044
+ Notice: this node requires an LLM provider (Ollama by default, or cloud-based OpenAI/Anthropic/Google, or local HuggingFace Transformers)!
1045
+
1046
+ > This node performs punctuation restoration using an LLM service.
1047
+ > It adds missing punctuation marks (periods, commas, question marks,
1048
+ > exclamation marks, colons, semicolons) and capitalizes the first
1049
+ > letters of sentences. It preserves all original words exactly as they
1050
+ > are without spelling corrections or grammar changes. Multiple LLM
1051
+ > providers are supported: local Ollama (default), local HuggingFace
1052
+ > Transformers, or cloud-based OpenAI, Anthropic, or Google.
903
1053
 
904
1054
  | Port | Payload |
905
1055
  | ------- | ----------- |
906
1056
  | input | text |
907
1057
  | output | text |
908
1058
 
909
- | Parameter | Position | Default | Requirement |
910
- | ------------ | --------- | -------- | ------------------ |
911
- | **key** | *none* | env.SPEECHFLOW\_GOOGLE\_KEY | *none* |
912
- | **src** | 0 | "de" | `/^(?:de\|en\|fr\|it)$/` |
913
- | **dst** | 1 | "en" | `/^(?:de\|en\|fr\|it)$/` |
1059
+ | Parameter | Position | Default | Requirement |
1060
+ | ------------ | --------- | ------------------------ | ---------------------------------------- |
1061
+ | **lang** | 0 | "en" | `/^(?:en\|de)$/` |
1062
+ | **provider** | *none* | "ollama" | `/^(?:openai\|anthropic\|google\|ollama\|transformers)$/` |
1063
+ | **api** | *none* | "http://127.0.0.1:11434" | `/^https?:\/\/.+?(:\d+)?$/` |
1064
+ | **model** | *none* | "gemma3:4b-it-q4\_K\_M" | *none* |
1065
+ | **key** | *none* | "" | *none* |
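Since t2t-translate, t2t-spellcheck, and t2t-punctuation share the same provider/api/model/key parameters, a single hedged sketch (file names hypothetical, a local Ollama instance with the default model assumed to be running) shows how they might be chained:

```
xio-file(path: "raw.txt", mode: "r", type: "text") |
t2t-punctuation(lang: "en") |
t2t-spellcheck(lang: "en") |
t2t-translate(src: "en", dst: "de") |
xio-file(path: "translated.txt", mode: "w", type: "text")
```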
914
1066
 
915
1067
  - Node: **t2t-modify**<br/>
916
1068
  Purpose: **regex-based text modification**<br/>
@@ -930,6 +1082,53 @@ The following nodes process text chunks only.
930
1082
  | **match** | 0 | "" | *required* |
931
1083
  | **replace** | 1 | "" | *required* |
932
1084
 
1085
+ - Node: **t2t-profanity**<br/>
1086
+ Purpose: **profanity filtering**<br/>
1087
+ Example: `t2t-profanity(lang: "en", placeholder: "***")`<br/>
1088
+
1089
+ > This node filters profanity from the text stream by detecting bad words
1090
+ > and replacing them with a placeholder. It supports English and German
1091
+ > languages and can either replace with a fixed placeholder or repeat
1092
+ > the placeholder character for each character of the detected word.
1093
+
1094
+ | Port | Payload |
1095
+ | ------- | ----------- |
1096
+ | input | text |
1097
+ | output | text |
1098
+
1099
+ | Parameter | Position | Default | Requirement |
1100
+ | --------------- | --------- | ---------- | ------------------------ |
1101
+ | **lang** | *none* | "en" | `/^(?:en\|de)$/` |
1102
+ | **placeholder** | *none* | "\*\*\*" | *none* |
1103
+ | **mode** | *none* | "replace" | `/^(?:replace\|repeat)$/`|
1104
+
1105
+ - Node: **t2t-summary**<br/>
1106
+ Purpose: **LLM-based Text-to-Text summarization**<br/>
1107
+ Example: `t2t-summary(lang: "en", size: 4, trigger: 8)`<br/>
1108
+ Notice: this node requires an LLM provider (Ollama by default, or cloud-based OpenAI/Anthropic/Google, or local HuggingFace Transformers)!
1109
+
1110
+ > This node performs text summarization using an LLM service.
1111
+ > It accumulates incoming text sentences and generates a summary after
1112
+ > a configurable number of sentences (trigger). The summary length is
1113
+ > also configurable (size). It supports English and German languages.
1114
+ > Multiple LLM providers are supported: local Ollama (default), local
1115
+ > HuggingFace Transformers, or cloud-based OpenAI, Anthropic, or Google.
1116
+
1117
+ | Port | Payload |
1118
+ | ------- | ----------- |
1119
+ | input | text |
1120
+ | output | text |
1121
+
1122
+ | Parameter | Position | Default | Requirement |
1123
+ | ------------ | --------- | ------------------------ | ---------------------------------------- |
1124
+ | **provider** | *none* | "ollama" | `/^(?:openai\|anthropic\|google\|ollama\|transformers)$/` |
1125
+ | **api** | *none* | "http://127.0.0.1:11434" | `/^https?:\/\/.+?(:\d+)?$/` |
1126
+ | **model** | *none* | "gemma3:4b-it-q4\_K\_M" | *none* |
1127
+ | **key** | *none* | "" | *none* |
1128
+ | **lang** | 0 | "en" | `/^(?:en\|de)$/` |
1129
+ | **size** | 1 | 4 | `1 <= n <= 20` |
1130
+ | **trigger** | 2 | 8 | `1 <= n <= 100` |
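A hedged sketch (file names hypothetical, local Ollama assumed as the default provider) that filters profanity and condenses every 8 incoming sentences into a summary of at most 4 sentences could look like this:

```
xio-file(path: "talk.txt", mode: "r", type: "text") |
t2t-profanity(lang: "en", placeholder: "***") |
t2t-summary(lang: "en", size: 4, trigger: 8) |
xio-file(path: "summary.txt", mode: "w", type: "text")
```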
1131
+
933
1132
  - Node: **t2t-sentence**<br/>
934
1133
  Purpose: **sentence splitting/merging**<br/>
935
1134
  Example: `t2t-sentence()`<br/>
@@ -988,6 +1187,32 @@ The following nodes process text chunks only.
988
1187
 
989
1188
  The following nodes convert text chunks to audio chunks.
990
1189
 
1190
+ - Node: **t2a-openai**<br/>
1191
+ Purpose: **OpenAI Text-to-Speech conversion**<br/>
1192
+ Example: `t2a-openai(voice: "nova", model: "tts-1-hd")`<br/>
1193
+ Notice: this node requires an OpenAI API key!
1194
+
1195
+ > This node uses OpenAI TTS to perform Text-to-Speech (T2S)
1196
+ > conversion, i.e., it converts the input text stream into an output
1197
+ > audio stream. It supports six built-in voices and two models:
1198
+ > `tts-1` for lower latency and `tts-1-hd` for higher quality.
1199
+ > The language is automatically detected from the input text and
1200
+ > supports many languages including German, English, French, Spanish,
1201
+ > Chinese, Japanese, and more (no language parameter needed).
1202
+
1203
+ | Port | Payload |
1204
+ | ------- | ----------- |
1205
+ | input | text |
1206
+ | output | audio |
1207
+
1208
+ | Parameter | Position | Default | Requirement |
1209
+ | -------------- | --------- | --------- | ------------------ |
1210
+ | **key** | *none* | env.SPEECHFLOW\_OPENAI\_KEY | *none* |
1211
+ | **api** | *none* | "https://api.openai.com/v1" | `/^https?:\/\/.+/` |
1212
+ | **voice** | 0 | "alloy" | `/^(?:alloy\|echo\|fable\|onyx\|nova\|shimmer)$/` |
1213
+ | **model** | 1 | "tts-1" | `/^(?:tts-1\|tts-1-hd)$/` |
1214
+ | **speed** | 2 | 1.0 | `0.25 <= n <= 4.0` |
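A minimal synthesis sketch for this node (file names hypothetical, SPEECHFLOW_OPENAI_KEY assumed to be set) that renders a text file into a standards-compliant WAV file could look like this:

```
xio-file(path: "announcement.txt", mode: "r", type: "text") |
t2a-openai(voice: "nova", model: "tts-1-hd") |
a2a-wav(mode: "encode", seekable: true) |
xio-file(path: "announcement.wav", mode: "w", type: "audio", seekable: true)
```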
1215
+
991
1216
  - Node: **t2a-amazon**<br/>
992
1217
  Purpose: **Amazon Polly Text-to-Speech conversion**<br/>
993
1218
  Example: `t2a-amazon(language: "en", voice: "Danielle")`<br/>
@@ -1007,7 +1232,7 @@ The following nodes convert text chunks to audio chunks.
1007
1232
  | **key** | *none* | env.SPEECHFLOW\_AMAZON\_KEY | *none* |
1008
1233
  | **secKey** | *none* | env.SPEECHFLOW\_AMAZON\_KEY\_SEC | *none* |
1009
1234
  | **region** | *none* | "eu-central-1" | *none* |
1010
- | **voice** | 0 | "Amy" | `^(?:Amy|Danielle|Joanna|Matthew|Ruth|Stephen|Viki|Daniel)$/` |
1235
+ | **voice** | 0 | "Amy" | `/^(?:Amy\|Danielle\|Joanna\|Matthew\|Ruth\|Stephen\|Vicki\|Daniel)$/` |
1011
1236
  | **language** | 1 | "en" | `/^(?:de\|en)$/` |
1012
1237
 
1013
1238
  - Node: **t2a-elevenlabs**<br/>
@@ -1029,11 +1254,34 @@ The following nodes convert text chunks to audio chunks.
1029
1254
  | **key** | *none* | env.SPEECHFLOW\_ELEVENLABS\_KEY | *none* |
1030
1255
  | **voice** | 0 | "Brian" | `/^(?:Brittney\|Cassidy\|Leonie\|Mark\|Brian)$/` |
1031
1256
  | **language** | 1 | "de" | `/^(?:de\|en)$/` |
1032
- | **speed** | 2 | 1.00 | `n >= 0`7 && n <= 1.2` |
1257
+ | **speed** | 2 | 1.00 | `n >= 0.7 && n <= 1.2` |
1033
1258
  | **stability** | 3 | 0.5 | `n >= 0.0 && n <= 1.0` |
1034
1259
  | **similarity** | 4 | 0.75 | `n >= 0.0 && n <= 1.0` |
1035
1260
  | **optimize** | 5 | "latency" | `/^(?:latency\|quality)$/` |
1036
1261
 
1262
+ - Node: **t2a-google**<br/>
1263
+ Purpose: **Google Cloud Text-to-Speech conversion**<br/>
1264
+ Example: `t2a-google(voice: "en-US-Neural2-J", language: "en-US")`<br/>
1265
+ Notice: this node requires a Google Cloud API key!
1266
+
1267
+ > This node uses Google Cloud Text-to-Speech to perform Text-to-Speech (T2S)
1268
+ > conversion, i.e., it converts the input text stream into an output
1269
+ > audio stream. It supports various voices and languages with configurable
1270
+ > speaking rate and pitch adjustment.
1271
+
1272
+ | Port | Payload |
1273
+ | ------- | ----------- |
1274
+ | input | text |
1275
+ | output | audio |
1276
+
1277
+ | Parameter | Position | Default | Requirement |
1278
+ | ------------ | --------- | ------------------ | -------------------- |
1279
+ | **key** | *none* | env.SPEECHFLOW\_GOOGLE\_KEY | *none* |
1280
+ | **voice** | 0 | "en-US-Neural2-J" | *none* |
1281
+ | **language** | 1 | "en-US" | *none* |
1282
+ | **speed** | 2 | 1.0 | `0.25 <= n <= 4.0` |
1283
+ | **pitch** | 3 | 0.0 | `-20.0 <= n <= 20.0` |
1284
+
1037
1285
  - Node: **t2a-kokoro**<br/>
1038
1286
  Purpose: **Kokoro Text-to-Speech conversion**<br/>
1039
1287
  Example: `t2a-kokoro(language: "en")`<br/>
@@ -1054,6 +1302,26 @@ The following nodes convert text chunks to audio chunks.
1054
1302
  | **language** | 1 | "en" | `/^en$/` |
1055
1303
  | **speed** | 2 | 1.25 | 1.0...1.30 |
1056
1304
 
1305
+ - Node: **t2a-supertonic**<br/>
1306
+ Purpose: **Supertonic Text-to-Speech conversion**<br/>
1307
+ Example: `t2a-supertonic(voice: "M1", speed: 1.40)`<br/>
1308
+
1309
+ > This node uses Supertonic to perform Text-to-Speech (T2S) conversion,
1310
+ > i.e., it converts the input text stream into an output audio stream.
1311
+ > The required ONNX models are automatically downloaded from HuggingFace
1312
+ > on first use. It supports the English language only.
1313
+
1314
+ | Port | Payload |
1315
+ | ------- | ----------- |
1316
+ | input | text |
1317
+ | output | audio |
1318
+
1319
+ | Parameter | Position | Default | Requirement |
1320
+ | ------------ | --------- | -------- | ----------- |
1321
+ | **voice** | 0 | "M1" | `/^(?:M1\|M2\|F1\|F2)$/` |
1322
+ | **speed** | 1 | 1.40 | `0.5 <= n <= 2.0` |
1323
+ | **steps** | 2 | 20 | `1 <= n <= 20` |
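A minimal local-only synthesis sketch for this node (file names hypothetical; the ONNX models are fetched from HuggingFace on first use, as described above) could look like this:

```
xio-file(path: "chapter.txt", mode: "r", type: "text") |
t2a-supertonic(voice: "F1", speed: 1.40) |
a2a-wav(mode: "encode", seekable: true) |
xio-file(path: "chapter.wav", mode: "w", type: "audio", seekable: true)
```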
1324
+
1057
1325
  ### Any-to-Any Nodes
1058
1326
 
1059
1327
  The following nodes process any type of chunk, i.e., both audio and text chunks.