speechflow 1.7.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +23 -0
- package/README.md +425 -146
- package/etc/claude.md +5 -5
- package/etc/speechflow.yaml +2 -2
- package/package.json +3 -3
- package/speechflow-cli/dst/speechflow-main-api.js +6 -5
- package/speechflow-cli/dst/speechflow-main-api.js.map +1 -1
- package/speechflow-cli/dst/speechflow-main-graph.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-main-graph.js +35 -13
- package/speechflow-cli/dst/speechflow-main-graph.js.map +1 -1
- package/speechflow-cli/dst/speechflow-main-status.js +3 -7
- package/speechflow-cli/dst/speechflow-main-status.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js +3 -0
- package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-compressor.js +4 -2
- package/speechflow-cli/dst/speechflow-node-a2a-compressor.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-expander.js +4 -2
- package/speechflow-cli/dst/speechflow-node-a2a-expander.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-gender.js +2 -2
- package/speechflow-cli/dst/speechflow-node-a2a-gender.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-pitch.js +1 -2
- package/speechflow-cli/dst/speechflow-node-a2a-pitch.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-wav.js +32 -5
- package/speechflow-cli/dst/speechflow-node-a2a-wav.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2t-amazon.d.ts +0 -1
- package/speechflow-cli/dst/speechflow-node-a2t-amazon.js +1 -6
- package/speechflow-cli/dst/speechflow-node-a2t-amazon.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2t-deepgram.d.ts +0 -1
- package/speechflow-cli/dst/speechflow-node-a2t-deepgram.js +9 -9
- package/speechflow-cli/dst/speechflow-node-a2t-deepgram.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2t-google.d.ts +17 -0
- package/speechflow-cli/dst/speechflow-node-a2t-google.js +320 -0
- package/speechflow-cli/dst/speechflow-node-a2t-google.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-a2t-openai.js +6 -4
- package/speechflow-cli/dst/speechflow-node-a2t-openai.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-amazon.js +6 -11
- package/speechflow-cli/dst/speechflow-node-t2a-amazon.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js +6 -5
- package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-google.d.ts +15 -0
- package/speechflow-cli/dst/speechflow-node-t2a-google.js +218 -0
- package/speechflow-cli/dst/speechflow-node-t2a-google.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2a-kokoro.d.ts +2 -0
- package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js +19 -6
- package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-openai.d.ts +15 -0
- package/speechflow-cli/dst/speechflow-node-t2a-openai.js +195 -0
- package/speechflow-cli/dst/speechflow-node-t2a-openai.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2a-supertonic.d.ts +17 -0
- package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js +608 -0
- package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2t-amazon.js.map +1 -1
- package/speechflow-cli/dst/{speechflow-node-t2t-transformers.d.ts → speechflow-node-t2t-opus.d.ts} +1 -3
- package/speechflow-cli/dst/speechflow-node-t2t-opus.js +159 -0
- package/speechflow-cli/dst/speechflow-node-t2t-opus.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2t-profanity.d.ts +11 -0
- package/speechflow-cli/dst/speechflow-node-t2t-profanity.js +118 -0
- package/speechflow-cli/dst/speechflow-node-t2t-profanity.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2t-punctuation.d.ts +13 -0
- package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js +220 -0
- package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js.map +1 -0
- package/speechflow-cli/dst/{speechflow-node-t2t-openai.d.ts → speechflow-node-t2t-spellcheck.d.ts} +2 -2
- package/speechflow-cli/dst/{speechflow-node-t2t-openai.js → speechflow-node-t2t-spellcheck.js} +47 -99
- package/speechflow-cli/dst/speechflow-node-t2t-spellcheck.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js +3 -6
- package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-summary.d.ts +16 -0
- package/speechflow-cli/dst/speechflow-node-t2t-summary.js +241 -0
- package/speechflow-cli/dst/speechflow-node-t2t-summary.js.map +1 -0
- package/speechflow-cli/dst/{speechflow-node-t2t-ollama.d.ts → speechflow-node-t2t-translate.d.ts} +2 -2
- package/speechflow-cli/dst/{speechflow-node-t2t-transformers.js → speechflow-node-t2t-translate.js} +53 -115
- package/speechflow-cli/dst/speechflow-node-t2t-translate.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-x2x-filter.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-node-x2x-filter.js +10 -0
- package/speechflow-cli/dst/speechflow-node-x2x-filter.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-x2x-trace.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-device.js +3 -3
- package/speechflow-cli/dst/speechflow-node-xio-device.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-exec.d.ts +12 -0
- package/speechflow-cli/dst/speechflow-node-xio-exec.js +223 -0
- package/speechflow-cli/dst/speechflow-node-xio-exec.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-xio-file.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-node-xio-file.js +80 -67
- package/speechflow-cli/dst/speechflow-node-xio-file.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-mqtt.js +2 -1
- package/speechflow-cli/dst/speechflow-node-xio-mqtt.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-vban.d.ts +17 -0
- package/speechflow-cli/dst/speechflow-node-xio-vban.js +330 -0
- package/speechflow-cli/dst/speechflow-node-xio-vban.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-xio-webrtc.d.ts +39 -0
- package/speechflow-cli/dst/speechflow-node-xio-webrtc.js +500 -0
- package/speechflow-cli/dst/speechflow-node-xio-webrtc.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-xio-websocket.js +2 -1
- package/speechflow-cli/dst/speechflow-node-xio-websocket.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-audio.js +5 -6
- package/speechflow-cli/dst/speechflow-util-audio.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-error.d.ts +1 -1
- package/speechflow-cli/dst/speechflow-util-error.js +5 -7
- package/speechflow-cli/dst/speechflow-util-error.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-llm.d.ts +35 -0
- package/speechflow-cli/dst/speechflow-util-llm.js +363 -0
- package/speechflow-cli/dst/speechflow-util-llm.js.map +1 -0
- package/speechflow-cli/dst/speechflow-util-misc.d.ts +1 -1
- package/speechflow-cli/dst/speechflow-util-misc.js +4 -4
- package/speechflow-cli/dst/speechflow-util-misc.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-queue.js +3 -3
- package/speechflow-cli/dst/speechflow-util-queue.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-stream.js +4 -2
- package/speechflow-cli/dst/speechflow-util-stream.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-util.js +1 -0
- package/speechflow-cli/dst/speechflow-util.js.map +1 -1
- package/speechflow-cli/etc/oxlint.jsonc +2 -1
- package/speechflow-cli/package.json +34 -17
- package/speechflow-cli/src/lib.d.ts +5 -0
- package/speechflow-cli/src/speechflow-main-api.ts +6 -5
- package/speechflow-cli/src/speechflow-main-graph.ts +40 -13
- package/speechflow-cli/src/speechflow-main-status.ts +4 -8
- package/speechflow-cli/src/speechflow-node-a2a-compressor-wt.ts +4 -0
- package/speechflow-cli/src/speechflow-node-a2a-compressor.ts +4 -2
- package/speechflow-cli/src/speechflow-node-a2a-expander-wt.ts +1 -1
- package/speechflow-cli/src/speechflow-node-a2a-expander.ts +4 -2
- package/speechflow-cli/src/speechflow-node-a2a-gender.ts +2 -2
- package/speechflow-cli/src/speechflow-node-a2a-pitch.ts +1 -2
- package/speechflow-cli/src/speechflow-node-a2a-wav.ts +33 -6
- package/speechflow-cli/src/speechflow-node-a2t-amazon.ts +6 -11
- package/speechflow-cli/src/speechflow-node-a2t-deepgram.ts +13 -12
- package/speechflow-cli/src/speechflow-node-a2t-google.ts +322 -0
- package/speechflow-cli/src/speechflow-node-a2t-openai.ts +8 -4
- package/speechflow-cli/src/speechflow-node-t2a-amazon.ts +7 -11
- package/speechflow-cli/src/speechflow-node-t2a-elevenlabs.ts +6 -5
- package/speechflow-cli/src/speechflow-node-t2a-google.ts +206 -0
- package/speechflow-cli/src/speechflow-node-t2a-kokoro.ts +22 -6
- package/speechflow-cli/src/speechflow-node-t2a-openai.ts +179 -0
- package/speechflow-cli/src/speechflow-node-t2a-supertonic.ts +701 -0
- package/speechflow-cli/src/speechflow-node-t2t-amazon.ts +2 -1
- package/speechflow-cli/src/speechflow-node-t2t-opus.ts +136 -0
- package/speechflow-cli/src/speechflow-node-t2t-profanity.ts +93 -0
- package/speechflow-cli/src/speechflow-node-t2t-punctuation.ts +201 -0
- package/speechflow-cli/src/{speechflow-node-t2t-openai.ts → speechflow-node-t2t-spellcheck.ts} +48 -107
- package/speechflow-cli/src/speechflow-node-t2t-subtitle.ts +3 -6
- package/speechflow-cli/src/speechflow-node-t2t-summary.ts +229 -0
- package/speechflow-cli/src/speechflow-node-t2t-translate.ts +181 -0
- package/speechflow-cli/src/speechflow-node-x2x-filter.ts +16 -3
- package/speechflow-cli/src/speechflow-node-x2x-trace.ts +3 -3
- package/speechflow-cli/src/speechflow-node-xio-device.ts +4 -7
- package/speechflow-cli/src/speechflow-node-xio-exec.ts +210 -0
- package/speechflow-cli/src/speechflow-node-xio-file.ts +93 -80
- package/speechflow-cli/src/speechflow-node-xio-mqtt.ts +3 -2
- package/speechflow-cli/src/speechflow-node-xio-vban.ts +325 -0
- package/speechflow-cli/src/speechflow-node-xio-webrtc.ts +533 -0
- package/speechflow-cli/src/speechflow-node-xio-websocket.ts +2 -1
- package/speechflow-cli/src/speechflow-util-audio-wt.ts +4 -4
- package/speechflow-cli/src/speechflow-util-audio.ts +10 -10
- package/speechflow-cli/src/speechflow-util-error.ts +9 -7
- package/speechflow-cli/src/speechflow-util-llm.ts +367 -0
- package/speechflow-cli/src/speechflow-util-misc.ts +4 -4
- package/speechflow-cli/src/speechflow-util-queue.ts +4 -4
- package/speechflow-cli/src/speechflow-util-stream.ts +5 -3
- package/speechflow-cli/src/speechflow-util.ts +1 -0
- package/speechflow-ui-db/package.json +9 -9
- package/speechflow-ui-st/package.json +9 -9
- package/speechflow-cli/dst/speechflow-node-t2t-ollama.js +0 -293
- package/speechflow-cli/dst/speechflow-node-t2t-ollama.js.map +0 -1
- package/speechflow-cli/dst/speechflow-node-t2t-openai.js.map +0 -1
- package/speechflow-cli/dst/speechflow-node-t2t-transformers.js.map +0 -1
- package/speechflow-cli/src/speechflow-node-t2t-ollama.ts +0 -281
- package/speechflow-cli/src/speechflow-node-t2t-transformers.ts +0 -247
package/README.md
CHANGED
|
@@ -26,7 +26,8 @@ speech-to-speech).
|
|
|
26
26
|
**SpeechFlow** comes with built-in graph nodes for various functionalities:
|
|
27
27
|
|
|
28
28
|
- file and audio device I/O for local connectivity,
|
|
29
|
-
- WebSocket and
|
|
29
|
+
- WebSocket, MQTT, VBAN, and WebRTC network I/O for remote connectivity,
|
|
30
|
+
- external command execution I/O for process integration,
|
|
30
31
|
- local Voice Activity Detection (VAD),
|
|
31
32
|
- local voice gender recognition,
|
|
32
33
|
- local audio LUFS-S/RMS metering,
|
|
@@ -38,20 +39,27 @@ speech-to-speech).
|
|
|
38
39
|
- remote-controlable audio muting,
|
|
39
40
|
- cloud-based speech-to-text conversion with
|
|
40
41
|
[Amazon Transcribe](https://aws.amazon.com/transcribe/),
|
|
41
|
-
[OpenAI GPT-Transcribe](https://platform.openai.com/docs/models/gpt-4o-mini-transcribe),
|
|
42
|
-
[Deepgram](https://deepgram.com)
|
|
42
|
+
[OpenAI GPT-Transcribe](https://platform.openai.com/docs/models/gpt-4o-mini-transcribe),
|
|
43
|
+
[Deepgram](https://deepgram.com), or
|
|
44
|
+
[Google Cloud Speech-to-Text](https://cloud.google.com/speech-to-text).
|
|
43
45
|
- cloud-based text-to-text translation (or spelling correction) with
|
|
44
46
|
[DeepL](https://deepl.com),
|
|
45
47
|
[Amazon Translate](https://aws.amazon.com/translate/),
|
|
46
|
-
[Google Cloud Translate](https://cloud.google.com/translate),
|
|
47
|
-
[OpenAI GPT](https://openai.com)
|
|
48
|
+
[Google Cloud Translate](https://cloud.google.com/translate),
|
|
49
|
+
[OpenAI GPT](https://openai.com),
|
|
50
|
+
[Anthropic Claude](https://anthropic.com), or
|
|
51
|
+
[Google Gemini](https://ai.google.dev).
|
|
48
52
|
- local text-to-text translation (or spelling correction) with
|
|
49
|
-
[Ollama
|
|
50
|
-
[
|
|
53
|
+
[Ollama](https://ollama.com) or
|
|
54
|
+
[OPUS-MT](https://github.com/Helsinki-NLP/Opus-MT).
|
|
51
55
|
- cloud-based text-to-speech conversion with
|
|
52
|
-
[
|
|
53
|
-
[
|
|
54
|
-
|
|
56
|
+
[OpenAI TTS](https://platform.openai.com/docs/guides/text-to-speech),
|
|
57
|
+
[ElevenLabs](https://elevenlabs.io/),
|
|
58
|
+
[Amazon Polly](https://aws.amazon.com/polly/), or
|
|
59
|
+
[Google Cloud Text-to-Speech](https://cloud.google.com/text-to-speech).
|
|
60
|
+
- local text-to-speech conversion with
|
|
61
|
+
[Kokoro](https://github.com/nazdridoy/kokoro-tts) or
|
|
62
|
+
[Supertonic](https://huggingface.co/Supertone/supertonic).
|
|
55
63
|
- local [FFmpeg](https://ffmpeg.org/)-based speech-to-speech conversion,
|
|
56
64
|
- local WAV speech-to-speech decoding/encoding,
|
|
57
65
|
- local text-to-text formatting, regex-based modification,
|
|
@@ -221,8 +229,8 @@ They can also be found in the sample [speechflow.yaml](./etc/speechflow.yaml) fi
|
|
|
221
229
|
|
|
222
230
|
```
|
|
223
231
|
xio-device(device: env.SPEECHFLOW_DEVICE_MIC, mode: "r") |
|
|
224
|
-
a2a-wav(mode: "encode") |
|
|
225
|
-
xio-file(path: "capture.wav", mode: "w", type: "audio")
|
|
232
|
+
a2a-wav(mode: "encode", seekable: true) |
|
|
233
|
+
xio-file(path: "capture.wav", mode: "w", type: "audio", seekable: true)
|
|
226
234
|
```
|
|
227
235
|
|
|
228
236
|
- **Pass-Through**: Pass-through audio from microphone device to speaker
|
|
@@ -335,7 +343,10 @@ First a short overview of the available processing nodes:
|
|
|
335
343
|
**xio-file**,
|
|
336
344
|
**xio-device**,
|
|
337
345
|
**xio-websocket**,
|
|
338
|
-
**xio-mqtt
|
|
346
|
+
**xio-mqtt**,
|
|
347
|
+
**xio-vban**,
|
|
348
|
+
**xio-webrtc**,
|
|
349
|
+
**xio-exec**.
|
|
339
350
|
- Audio-to-Audio nodes:
|
|
340
351
|
**a2a-ffmpeg**,
|
|
341
352
|
**a2a-wav**,
|
|
@@ -353,22 +364,29 @@ First a short overview of the available processing nodes:
|
|
|
353
364
|
- Audio-to-Text nodes:
|
|
354
365
|
**a2t-openai**,
|
|
355
366
|
**a2t-amazon**,
|
|
356
|
-
**a2t-deepgram
|
|
367
|
+
**a2t-deepgram**,
|
|
368
|
+
**a2t-google**.
|
|
357
369
|
- Text-to-Text nodes:
|
|
358
370
|
**t2t-deepl**,
|
|
359
371
|
**t2t-amazon**,
|
|
360
|
-
**t2t-
|
|
361
|
-
**t2t-ollama**,
|
|
362
|
-
**t2t-transformers**,
|
|
372
|
+
**t2t-opus**,
|
|
363
373
|
**t2t-google**,
|
|
374
|
+
**t2t-translate**,
|
|
375
|
+
**t2t-spellcheck**,
|
|
376
|
+
**t2t-punctuation**,
|
|
364
377
|
**t2t-modify**,
|
|
378
|
+
**t2t-profanity**,
|
|
379
|
+
**t2t-summary**,
|
|
365
380
|
**t2t-subtitle**,
|
|
366
381
|
**t2t-format**,
|
|
367
382
|
**t2t-sentence**.
|
|
368
383
|
- Text-to-Audio nodes:
|
|
384
|
+
**t2a-openai**,
|
|
369
385
|
**t2a-amazon**,
|
|
370
386
|
**t2a-elevenlabs**,
|
|
371
|
-
**t2a-
|
|
387
|
+
**t2a-google**,
|
|
388
|
+
**t2a-kokoro**,
|
|
389
|
+
**t2a-supertonic**.
|
|
372
390
|
- Any-to-Any nodes:
|
|
373
391
|
**x2x-filter**,
|
|
374
392
|
**x2x-trace**.
|
|
@@ -384,20 +402,24 @@ external files, devices and network services.
|
|
|
384
402
|
|
|
385
403
|
> This node allows the reading/writing from/to files or from StdIO. It
|
|
386
404
|
> is intended to be used as source and sink nodes in batch processing,
|
|
387
|
-
> and as sink nodes in real-time processing.
|
|
405
|
+
> and as sink nodes in real-time processing. When `seekable` is enabled
|
|
406
|
+
> for write mode, the node uses a file descriptor allowing random access
|
|
407
|
+
> writes to specific file positions via the `chunk:seek` metadata field.
|
|
408
|
+
> Option `seekable` cannot be used on StdIO.
|
|
388
409
|
|
|
389
410
|
| Port | Payload |
|
|
390
411
|
| ------- | ----------- |
|
|
391
412
|
| input | text, audio |
|
|
392
413
|
| output | text, audio |
|
|
393
414
|
|
|
394
|
-
| Parameter
|
|
395
|
-
|
|
|
396
|
-
| **path**
|
|
397
|
-
| **mode**
|
|
398
|
-
| **type**
|
|
399
|
-
| **
|
|
400
|
-
| **
|
|
415
|
+
| Parameter | Position | Default | Requirement |
|
|
416
|
+
| -------------- | --------- | -------- | --------------------- |
|
|
417
|
+
| **path** | 0 | *none* | *none* |
|
|
418
|
+
| **mode** | 1 | "r" | `/^(?:r\|w)$/` |
|
|
419
|
+
| **type** | 2 | "audio" | `/^(?:audio\|text)$/` |
|
|
420
|
+
| **seekable** | | false | *none* |
|
|
421
|
+
| **chunkAudio** | | 200 | `10 <= n <= 1000` |
|
|
422
|
+
| **chunkText** | | 65536 | `1024 <= n <= 131072` |
|
|
401
423
|
|
|
402
424
|
- Node: **xio-device**<br/>
|
|
403
425
|
Purpose: **Microphone/speaker device source/sink**<br/>
|
|
@@ -437,11 +459,12 @@ external files, devices and network services.
|
|
|
437
459
|
| ----------- | --------- | -------- | --------------------- |
|
|
438
460
|
| **listen** | *none* | *none* | `/^(?:\|ws:\/\/(.+?):(\d+))$/` |
|
|
439
461
|
| **connect** | *none* | *none* | `/^(?:\|ws:\/\/(.+?):(\d+)(?:\/.*)?)$/` |
|
|
440
|
-
| **
|
|
462
|
+
| **mode** | *none* | "r" | `/^(?:r\|w\|rw)$/` |
|
|
463
|
+
| **type** | *none* | "text" | `/^(?:audio\|text)$/` |
|
|
441
464
|
|
|
442
465
|
- Node: **xio-mqtt**<br/>
|
|
443
|
-
Purpose: **MQTT sink**<br/>
|
|
444
|
-
Example: `xio-mqtt(url: "mqtt://127.0.0.1:1883", username: "foo", password: "bar",
|
|
466
|
+
Purpose: **MQTT source/sink**<br/>
|
|
467
|
+
Example: `xio-mqtt(url: "mqtt://127.0.0.1:1883", username: "foo", password: "bar", topicWrite: "quux")`
|
|
445
468
|
Notice: this node requires a peer MQTT broker!
|
|
446
469
|
|
|
447
470
|
> This node allows reading/writing from/to MQTT broker topics. It is
|
|
@@ -450,15 +473,94 @@ external files, devices and network services.
|
|
|
450
473
|
|
|
451
474
|
| Port | Payload |
|
|
452
475
|
| ------- | ----------- |
|
|
453
|
-
| input | text
|
|
454
|
-
| output |
|
|
476
|
+
| input | text, audio |
|
|
477
|
+
| output | text, audio |
|
|
455
478
|
|
|
456
|
-
| Parameter
|
|
457
|
-
|
|
|
458
|
-
| **url**
|
|
459
|
-
| **username**
|
|
460
|
-
| **password**
|
|
461
|
-
| **
|
|
479
|
+
| Parameter | Position | Default | Requirement |
|
|
480
|
+
| -------------- | --------- | -------- | --------------------- |
|
|
481
|
+
| **url** | 0 | *none* | `/^(?:\|(?:ws\|mqtt):\/\/(.+?):(\d+)(?:\/.*)?)$/` |
|
|
482
|
+
| **username** | 1 | *none* | `/^.+$/` |
|
|
483
|
+
| **password** | 2 | *none* | `/^.+$/` |
|
|
484
|
+
| **topicRead** | 3 | *none* | `/^.+$/` |
|
|
485
|
+
| **topicWrite** | 4 | *none* | `/^.+$/` |
|
|
486
|
+
| **mode** | 5 | "w" | `/^(?:r\|w\|rw)$/` |
|
|
487
|
+
| **type** | 6 | "text" | `/^(?:audio\|text)$/` |
|
|
488
|
+
|
|
489
|
+
- Node: **xio-vban**<br/>
|
|
490
|
+
Purpose: **VBAN network audio source/sink**<br/>
|
|
491
|
+
Example: `xio-vban(listen: 6980, stream: "Stream1", mode: "r")`
|
|
492
|
+
Notice: this node requires a peer VBAN-compatible application!
|
|
493
|
+
|
|
494
|
+
> This node allows reading/writing audio from/to VBAN (VoiceMeeter
|
|
495
|
+
> Audio Network) protocol endpoints. It is intended to be used for
|
|
496
|
+
> real-time audio streaming with applications like VoiceMeeter,
|
|
497
|
+
> VB-Audio Matrix, or other VBAN-compatible software. It supports
|
|
498
|
+
> various audio bit resolutions (8-bit, 16-bit, 24-bit, 32-bit,
|
|
499
|
+
> float32, float64) and automatic channel downmixing to mono.
|
|
500
|
+
|
|
501
|
+
| Port | Payload |
|
|
502
|
+
| ------- | ----------- |
|
|
503
|
+
| input | audio |
|
|
504
|
+
| output | audio |
|
|
505
|
+
|
|
506
|
+
| Parameter | Position | Default | Requirement |
|
|
507
|
+
| ----------- | --------- | --------- | ---------------------------- |
|
|
508
|
+
| **listen** | 0 | "" | `/^(?:\|\d+\|.+?:\d+)$/` |
|
|
509
|
+
| **connect** | 1 | "" | `/^(?:\|.+?:\d+)$/` |
|
|
510
|
+
| **stream** | 2 | "Stream" | `/^.{1,16}$/` |
|
|
511
|
+
| **mode** | 3 | "rw" | `/^(?:r\|w\|rw)$/` |
|
|
512
|
+
|
|
513
|
+
- Node: **xio-webrtc**<br/>
|
|
514
|
+
Purpose: **WebRTC audio streaming source (WHIP) or sink (WHEP)**<br/>
|
|
515
|
+
Example: `xio-webrtc(listen: 8085, path: "/webrtc", mode: "r")`
|
|
516
|
+
|
|
517
|
+
> This node allows real-time audio streaming using WebRTC technology
|
|
518
|
+
> via WebRTC-HTTP Ingestion Protocol (WHIP) or WebRTC-HTTP Egress
|
|
519
|
+
> Protocol (WHEP). It provides an HTTP server for SDP negotiation
|
|
520
|
+
> and uses Opus codec for audio encoding/decoding at 48kHz. The node
|
|
521
|
+
> can operate in WHIP mode (i.e., read mode where publishers POST
|
|
522
|
+
> SDP offers to SpeechFlow and SpeechFlow receives audio stream from
|
|
523
|
+
> them) or WHEP mode (i.e., write mode where viewers POST SDP offers
|
|
524
|
+
> to SpeechFlow and SpeechFlow sends audio stream to them). This node
|
|
525
|
+
> supports multiple simultaneous connections, configurable ICE servers
|
|
526
|
+
> for NAT traversal, and automatic connection lifecycle management.
|
|
527
|
+
|
|
528
|
+
| Port | Payload |
|
|
529
|
+
| ------- | ----------- |
|
|
530
|
+
| input | audio |
|
|
531
|
+
| output | audio |
|
|
532
|
+
|
|
533
|
+
| Parameter | Position | Default | Requirement |
|
|
534
|
+
| -------------- | --------- | --------- | ---------------------------- |
|
|
535
|
+
| **listen** | 0 | "8085" | `/^(?:\d+\|.+?:\d+)$/` |
|
|
536
|
+
| **path** | 1 | "/webrtc" | `/^\/.+$/` |
|
|
537
|
+
| **mode** | 2 | "r" | `/^(?:r\|w)$/` |
|
|
538
|
+
| **iceServers** | 3 | "" | `/^.*$/` |
|
|
539
|
+
|
|
540
|
+
- Node: **xio-exec**<br/>
|
|
541
|
+
Purpose: **External command execution source/sink**<br/>
|
|
542
|
+
Example: `xio-exec(command: "ffmpeg -i - -f s16le -", mode: "rw", type: "audio")`
|
|
543
|
+
|
|
544
|
+
> This node allows reading/writing from/to external commands via stdin/stdout.
|
|
545
|
+
> It executes arbitrary commands and pipes audio or text data through them,
|
|
546
|
+
> enabling integration with external processing tools. The node supports
|
|
547
|
+
> read-only mode (capturing stdout), write-only mode (sending to stdin),
|
|
548
|
+
> and bidirectional mode (both stdin and stdout). This is useful for integrating
|
|
549
|
+
> external audio/text processing tools like FFmpeg, SoX, or custom scripts into
|
|
550
|
+
> the SpeechFlow pipeline.
|
|
551
|
+
|
|
552
|
+
| Port | Payload |
|
|
553
|
+
| ------- | ----------- |
|
|
554
|
+
| input | text, audio |
|
|
555
|
+
| output | text, audio |
|
|
556
|
+
|
|
557
|
+
| Parameter | Position | Default | Requirement |
|
|
558
|
+
| -------------- | --------- | -------- | --------------------- |
|
|
559
|
+
| **command** | 0 | *none* | *required* |
|
|
560
|
+
| **mode** | 1 | "r" | `/^(?:r\|w\|rw)$/` |
|
|
561
|
+
| **type** | 2 | "audio" | `/^(?:audio\|text)$/` |
|
|
562
|
+
| **chunkAudio** | | 200 | `10 <= n <= 1000` |
|
|
563
|
+
| **chunkText** | | 65536 | `1024 <= n <= 131072` |
|
|
462
564
|
|
|
463
565
|
### Audio-to-Audio Nodes
|
|
464
566
|
|
|
@@ -477,10 +579,10 @@ The following nodes process audio chunks only.
|
|
|
477
579
|
| input | audio |
|
|
478
580
|
| output | audio |
|
|
479
581
|
|
|
480
|
-
| Parameter
|
|
481
|
-
|
|
|
482
|
-
| **src**
|
|
483
|
-
| **dst**
|
|
582
|
+
| Parameter | Position | Default | Requirement |
|
|
583
|
+
| --------- | --------- | -------- | ------------------ |
|
|
584
|
+
| **src** | 0 | "pcm" | `/^(?:pcm\|wav\|mp3\|opus)$/` |
|
|
585
|
+
| **dst** | 1 | "wav" | `/^(?:pcm\|wav\|mp3\|opus)$/` |
|
|
484
586
|
|
|
485
587
|
- Node: **a2a-wav**<br/>
|
|
486
588
|
Purpose: **WAV audio format conversion**<br/>
|
|
@@ -489,15 +591,20 @@ The following nodes process audio chunks only.
|
|
|
489
591
|
> This node allows converting between PCM and WAV audio formats. It is
|
|
490
592
|
> primarily intended to support the reading/writing of external WAV
|
|
491
593
|
> format files, although SpeechFlow internally uses PCM format only.
|
|
594
|
+
> When `seekable` is enabled in encode mode, the node writes a corrected
|
|
595
|
+
> WAV header at the end of processing with accurate file size information
|
|
596
|
+
> by seeking back to position 0, producing standard-compliant WAV files.
|
|
597
|
+
> Option `seekable` requires a seekable output stream.
|
|
492
598
|
|
|
493
599
|
| Port | Payload |
|
|
494
600
|
| ------- | ----------- |
|
|
495
601
|
| input | audio |
|
|
496
602
|
| output | audio |
|
|
497
603
|
|
|
498
|
-
| Parameter
|
|
499
|
-
|
|
|
500
|
-
| **mode**
|
|
604
|
+
| Parameter | Position | Default | Requirement |
|
|
605
|
+
| ------------ | --------- | -------- | ------------------------ |
|
|
606
|
+
| **mode** | 0 | "encode" | `/^(?:encode\|decode)$/` |
|
|
607
|
+
| **seekable** | 1 | false | *none* |
|
|
501
608
|
|
|
502
609
|
- Node: **a2a-mute**<br/>
|
|
503
610
|
Purpose: **volume muting node**<br/>
|
|
@@ -512,8 +619,8 @@ The following nodes process audio chunks only.
|
|
|
512
619
|
| input | audio |
|
|
513
620
|
| output | audio |
|
|
514
621
|
|
|
515
|
-
| Parameter
|
|
516
|
-
|
|
|
622
|
+
| Parameter | Position | Default | Requirement |
|
|
623
|
+
| --------- | --------- | -------- | ------------------------ |
|
|
517
624
|
|
|
518
625
|
- Node: **a2a-meter**<br/>
|
|
519
626
|
Purpose: **Loudness metering node**<br/>
|
|
@@ -531,7 +638,7 @@ The following nodes process audio chunks only.
|
|
|
531
638
|
|
|
532
639
|
| Parameter | Position | Default | Requirement |
|
|
533
640
|
| ------------- | --------- | -------- | ---------------------- |
|
|
534
|
-
| **interval** | 0 |
|
|
641
|
+
| **interval** | 0 | 100 | *none* |
|
|
535
642
|
| **mode** | 1 | "filter" | `/^(?:filter\|sink)$/` |
|
|
536
643
|
| **dashboard** | | *none* | *none* |
|
|
537
644
|
|
|
@@ -548,8 +655,8 @@ The following nodes process audio chunks only.
|
|
|
548
655
|
| input | audio |
|
|
549
656
|
| output | audio |
|
|
550
657
|
|
|
551
|
-
| Parameter
|
|
552
|
-
|
|
|
658
|
+
| Parameter | Position | Default | Requirement |
|
|
659
|
+
| --------- | --------- | -------- | ------------------------ |
|
|
553
660
|
| **mode** | *none* | "unplugged" | `/^(?:silenced\|unplugged)$/` |
|
|
554
661
|
| **posSpeechThreshold** | *none* | 0.50 | *none* |
|
|
555
662
|
| **negSpeechThreshold** | *none* | 0.35 | *none* |
|
|
@@ -571,11 +678,12 @@ The following nodes process audio chunks only.
|
|
|
571
678
|
| input | audio |
|
|
572
679
|
| output | audio |
|
|
573
680
|
|
|
574
|
-
| Parameter
|
|
575
|
-
|
|
|
576
|
-
| **window**
|
|
577
|
-
| **
|
|
578
|
-
| **hysteresis**
|
|
681
|
+
| Parameter | Position | Default | Requirement |
|
|
682
|
+
| ------------------- | --------- | -------- | ------------------------ |
|
|
683
|
+
| **window** | 0 | 500 | *none* |
|
|
684
|
+
| **threshold** | 1 | 0.50 | *none* |
|
|
685
|
+
| **hysteresis** | 2 | 0.25 | *none* |
|
|
686
|
+
| **volumeThreshold** | 3 | -45 | *none* |
|
|
579
687
|
|
|
580
688
|
- Node: **a2a-speex**<br/>
|
|
581
689
|
Purpose: **Speex Noise Suppression node**<br/>
|
|
@@ -590,9 +698,9 @@ The following nodes process audio chunks only.
|
|
|
590
698
|
| input | audio |
|
|
591
699
|
| output | audio |
|
|
592
700
|
|
|
593
|
-
| Parameter
|
|
594
|
-
|
|
|
595
|
-
| **attentuate** | 0
|
|
701
|
+
| Parameter | Position | Default | Requirement |
|
|
702
|
+
| -------------- | --------- | -------- | ------------------ |
|
|
703
|
+
| **attentuate** | 0 | -18 | `-60 <= n <= 0` |
|
|
596
704
|
|
|
597
705
|
- Node: **a2a-rnnoise**<br/>
|
|
598
706
|
Purpose: **RNNoise Noise Suppression node**<br/>
|
|
@@ -606,8 +714,8 @@ The following nodes process audio chunks only.
|
|
|
606
714
|
| input | audio |
|
|
607
715
|
| output | audio |
|
|
608
716
|
|
|
609
|
-
| Parameter
|
|
610
|
-
|
|
|
717
|
+
| Parameter | Position | Default | Requirement |
|
|
718
|
+
| --------- | --------- | -------- | ------------------------ |
|
|
611
719
|
|
|
612
720
|
- Node: **a2a-compressor**<br/>
|
|
613
721
|
Purpose: **audio compressor node**<br/>
|
|
@@ -621,14 +729,17 @@ The following nodes process audio chunks only.
|
|
|
621
729
|
| input | audio |
|
|
622
730
|
| output | audio |
|
|
623
731
|
|
|
624
|
-
| Parameter
|
|
625
|
-
|
|
|
626
|
-
| **
|
|
627
|
-
| **
|
|
628
|
-
| **
|
|
629
|
-
| **
|
|
630
|
-
| **
|
|
631
|
-
| **
|
|
732
|
+
| Parameter | Position | Default | Requirement |
|
|
733
|
+
| --------------- | --------- | ------------ | ------------------------ |
|
|
734
|
+
| **type** | *none* | "standalone" | `/^(?:standalone\|sidechain)$/` |
|
|
735
|
+
| **mode** | *none* | "compress" | `/^(?:compress\|measure\|adjust)$/` |
|
|
736
|
+
| **bus** | *none* | "compressor" | `/^.+$/` |
|
|
737
|
+
| **thresholdDb** | *none* | -23 | `n <= 0 && n >= -100`|
|
|
738
|
+
| **ratio** | *none* | 4.0 | `n >= 1 && n <= 20` |
|
|
739
|
+
| **attackMs** | *none* | 10 | `n >= 0 && n <= 1000`|
|
|
740
|
+
| **releaseMs** | *none* | 50 | `n >= 0 && n <= 1000`|
|
|
741
|
+
| **kneeDb** | *none* | 6.0 | `n >= 0 && n <= 40` |
|
|
742
|
+
| **makeupDb** | *none* | 0 | `n >= -24 && n <= 24`|
|
|
632
743
|
|
|
633
744
|
- Node: **a2a-expander**<br/>
|
|
634
745
|
Purpose: **audio expander node**<br/>
|
|
@@ -642,14 +753,15 @@ The following nodes process audio chunks only.
|
|
|
642
753
|
| input | audio |
|
|
643
754
|
| output | audio |
|
|
644
755
|
|
|
645
|
-
| Parameter
|
|
646
|
-
|
|
|
647
|
-
| **thresholdDb** | *none*
|
|
648
|
-
| **
|
|
649
|
-
| **
|
|
650
|
-
| **
|
|
651
|
-
| **
|
|
652
|
-
| **
|
|
756
|
+
| Parameter | Position | Default | Requirement |
|
|
757
|
+
| --------------- | --------- | -------- | --------------------- |
|
|
758
|
+
| **thresholdDb** | *none* | -45 | `n <= 0 && n >= -100` |
|
|
759
|
+
| **floorDb** | *none* | -64 | `n <= 0 && n >= -100` |
|
|
760
|
+
| **ratio** | *none* | 4.0 | `n >= 1 && n <= 20` |
|
|
761
|
+
| **attackMs** | *none* | 10 | `n >= 0 && n <= 1000` |
|
|
762
|
+
| **releaseMs** | *none* | 50 | `n >= 0 && n <= 1000` |
|
|
763
|
+
| **kneeDb** | *none* | 6.0 | `n >= 0 && n <= 40` |
|
|
764
|
+
| **makeupDb** | *none* | 0 | `n >= -24 && n <= 24` |
|
|
653
765
|
|
|
654
766
|
- Node: **a2a-gain**<br/>
|
|
655
767
|
Purpose: **audio gain adjustment node**<br/>
|
|
@@ -663,9 +775,9 @@ The following nodes process audio chunks only.
|
|
|
663
775
|
| input | audio |
|
|
664
776
|
| output | audio |
|
|
665
777
|
|
|
666
|
-
| Parameter
|
|
667
|
-
|
|
|
668
|
-
| **db**
|
|
778
|
+
| Parameter | Position | Default | Requirement |
|
|
779
|
+
| --------- | --------- | -------- | --------------------- |
|
|
780
|
+
| **db** | 0 | 0 | `n >= -60 && n <= 60` |
|
|
669
781
|
|
|
670
782
|
- Node: **a2a-pitch**<br/>
|
|
671
783
|
Purpose: **audio pitch shifting and time stretching**<br/>
|
|
@@ -701,8 +813,9 @@ The following nodes process audio chunks only.
|
|
|
701
813
|
| input | audio |
|
|
702
814
|
| output | audio |
|
|
703
815
|
|
|
704
|
-
| Parameter | Position | Default | Requirement
|
|
705
|
-
| ----------- | --------- | -------- |
|
|
816
|
+
| Parameter | Position | Default | Requirement |
|
|
817
|
+
| ----------- | --------- | -------- | ---------------------- |
|
|
818
|
+
| **segment** | 0 | 50 | `n >= 10 && n <= 1000` |
|
|
706
819
|
|
|
707
820
|
### Audio-to-Text Nodes
|
|
708
821
|
|
|
@@ -719,7 +832,7 @@ The following nodes convert audio to text chunks.
|
|
|
719
832
|
|
|
720
833
|
| Port | Payload |
|
|
721
834
|
| ------- | ----------- |
|
|
722
|
-
| input |
|
|
835
|
+
| input | audio |
|
|
723
836
|
| output | text |
|
|
724
837
|
|
|
725
838
|
| Parameter | Position | Default | Requirement |
|
|
@@ -770,9 +883,32 @@ The following nodes convert audio to text chunks.
|
|
|
770
883
|
| ------------ | --------- | -------- | ------------------ |
|
|
771
884
|
| **key** | *none* | env.SPEECHFLOW\_DEEPGRAM\_KEY | *none* |
|
|
772
885
|
| **keyAdm** | *none* | env.SPEECHFLOW\_DEEPGRAM\_KEY\_ADM | *none* |
|
|
773
|
-
| **model** | 0 | "nova-
|
|
886
|
+
| **model** | 0 | "nova-2" | *none* |
|
|
774
887
|
| **version** | 1 | "latest" | *none* |
|
|
775
888
|
| **language** | 2 | "multi" | *none* |
|
|
889
|
+
| **interim** | 3 | false | *none* |
|
|
890
|
+
|
|
891
|
+
- Node: **a2t-google**<br/>
|
|
892
|
+
Purpose: **Google Cloud Speech-to-Text conversion**<br/>
|
|
893
|
+
Example: `a2t-google(language: "en-US")`<br/>
|
|
894
|
+
Notice: this node requires a Google Cloud API key!
|
|
895
|
+
|
|
896
|
+
> This node uses Google Cloud Speech-to-Text to perform Speech-to-Text (S2T)
|
|
897
|
+
> conversion, i.e., it recognizes speech in the input audio stream and
|
|
898
|
+
> outputs a corresponding text stream. It supports various languages
|
|
899
|
+
> and models, including the `latest_long` model for long-form audio.
|
|
900
|
+
|
|
901
|
+
| Port | Payload |
|
|
902
|
+
| ------- | ----------- |
|
|
903
|
+
| input | audio |
|
|
904
|
+
| output | text |
|
|
905
|
+
|
|
906
|
+
| Parameter | Position | Default | Requirement |
|
|
907
|
+
| ------------ | --------- | ------------- | ------------ |
|
|
908
|
+
| **key** | *none* | env.SPEECHFLOW\_GOOGLE\_KEY | *none* |
|
|
909
|
+
| **model** | 0 | "latest_long" | *none* |
|
|
910
|
+
| **language** | 1 | "en-US" | *none* |
|
|
911
|
+
| **interim** | 2 | false | *none* |
|
|
776
912
|
|
|
777
913
|
### Text-to-Text Nodes
|
|
778
914
|
|
|
@@ -783,73 +919,65 @@ The following nodes process text chunks only.
|
|
|
783
919
|
Example: `t2t-deepl(src: "de", dst: "en")`<br/>
|
|
784
920
|
Notice: this node requires an API key!
|
|
785
921
|
|
|
786
|
-
> This node performs translation between
|
|
922
|
+
> This node performs translation between multiple languages.
|
|
787
923
|
|
|
788
924
|
| Port | Payload |
|
|
789
925
|
| ------- | ----------- |
|
|
790
926
|
| input | text |
|
|
791
927
|
| output | text |
|
|
792
928
|
|
|
793
|
-
| Parameter | Position | Default
|
|
794
|
-
| ------------ | --------- |
|
|
795
|
-
| **key** | *none* | env.SPEECHFLOW\_DEEPL\_KEY | *none*
|
|
796
|
-
| **src** | 0 | "de"
|
|
797
|
-
| **dst** | 1 | "en"
|
|
929
|
+
| Parameter | Position | Default | Requirement |
|
|
930
|
+
| ------------ | --------- | ---------- | ----------------------------- |
|
|
931
|
+
| **key** | *none* | env.SPEECHFLOW\_DEEPL\_KEY | *none* |
|
|
932
|
+
| **src** | 0 | "de" | `/^(?:de\|en\|fr\|it)$/` |
|
|
933
|
+
| **dst** | 1 | "en" | `/^(?:de\|en\|fr\|it)$/` |
|
|
934
|
+
| **optimize** | 2 | "latency" | `/^(?:latency\|quality)$/` |
|
|
798
935
|
|
|
799
936
|
- Node: **t2t-amazon**<br/>
|
|
800
937
|
Purpose: **AWS Translate Text-to-Text translation**<br/>
|
|
801
938
|
Example: `t2t-amazon(src: "de", dst: "en")`<br/>
|
|
802
939
|
Notice: this node requires an API key!
|
|
803
940
|
|
|
804
|
-
> This node performs translation between
|
|
941
|
+
> This node performs translation between multiple languages.
|
|
805
942
|
|
|
806
943
|
| Port | Payload |
|
|
807
944
|
| ------- | ----------- |
|
|
808
945
|
| input | text |
|
|
809
946
|
| output | text |
|
|
810
947
|
|
|
811
|
-
| Parameter | Position | Default | Requirement
|
|
812
|
-
| ------------ | --------- | -------- |
|
|
813
|
-
| **key** | *none* | env.SPEECHFLOW\_AMAZON\_KEY | *none*
|
|
948
|
+
| Parameter | Position | Default | Requirement |
|
|
949
|
+
| ------------ | --------- | -------- | ---------------------------- |
|
|
950
|
+
| **key** | *none* | env.SPEECHFLOW\_AMAZON\_KEY | *none* |
|
|
814
951
|
| **secKey** | *none* | env.SPEECHFLOW\_AMAZON\_KEY\_SEC | *none* |
|
|
815
|
-
| **region** | *none* | "eu-central-1" | *none*
|
|
816
|
-
| **src** | 0 | "de" | `/^(?:de\|en)$/`
|
|
817
|
-
| **dst** | 1 | "en" | `/^(?:de\|en)$/`
|
|
952
|
+
| **region** | *none* | "eu-central-1" | *none* |
|
|
953
|
+
| **src** | 0 | "de" | `/^(?:de\|en\|fr\|it)$/` |
|
|
954
|
+
| **dst** | 1 | "en" | `/^(?:de\|en\|fr\|it)$/` |
|
|
818
955
|
|
|
819
|
-
- Node: **t2t-
|
|
820
|
-
Purpose: **
|
|
821
|
-
Example: `t2t-
|
|
822
|
-
Notice: this node requires an OpenAI API key!
|
|
956
|
+
- Node: **t2t-opus**<br/>
|
|
957
|
+
Purpose: **OPUS-MT Text-to-Text translation**<br/>
|
|
958
|
+
Example: `t2t-opus(src: "de", dst: "en")`<br/>
|
|
823
959
|
|
|
824
960
|
> This node performs translation between English and German languages
|
|
825
|
-
> in the text stream
|
|
826
|
-
> the same) spellchecking of English or German languages in the text
|
|
827
|
-
> stream. It is based on the remote OpenAI cloud AI service and uses
|
|
828
|
-
> the GPT-4o-mini LLM.
|
|
961
|
+
> in the text stream. It is based on the local OPUS-MT translation model.
|
|
829
962
|
|
|
830
963
|
| Port | Payload |
|
|
831
964
|
| ------- | ----------- |
|
|
832
965
|
| input | text |
|
|
833
966
|
| output | text |
|
|
834
967
|
|
|
835
|
-
| Parameter | Position | Default | Requirement
|
|
836
|
-
| ------------ | --------- | -------- |
|
|
837
|
-
| **api** | *none* | "https://api.openai.com" | `/^https?:\/\/.+?:\d+$/` |
|
|
968
|
+
| Parameter | Position | Default | Requirement |
|
|
969
|
+
| ------------ | --------- | -------- | ---------------- |
|
|
838
970
|
| **src** | 0 | "de" | `/^(?:de\|en)$/` |
|
|
839
971
|
| **dst** | 1 | "en" | `/^(?:de\|en)$/` |
|
|
840
|
-
| **key** | *none* | env.SPEECHFLOW\_OPENAI\_KEY | *none* |
|
|
841
|
-
| **model** | *none* | "gpt-5-mini" | *none* |
|
|
842
972
|
|
|
843
|
-
- Node: **t2t-
|
|
844
|
-
Purpose: **
|
|
845
|
-
Example: `t2t-
|
|
846
|
-
Notice: this node requires
|
|
973
|
+
- Node: **t2t-google**<br/>
|
|
974
|
+
Purpose: **Google Cloud Translate Text-to-Text translation**<br/>
|
|
975
|
+
Example: `t2t-google(src: "de", dst: "en")`<br/>
|
|
976
|
+
Notice: this node requires a Google Cloud API key and project ID!
|
|
847
977
|
|
|
848
|
-
> This node performs translation between
|
|
849
|
-
> in the text stream
|
|
850
|
-
>
|
|
851
|
-
> stream. It is based on the local Ollama AI service and uses the
|
|
852
|
-
> Google Gemma 3 LLM.
|
|
978
|
+
> This node performs translation between multiple languages
|
|
979
|
+
> in the text stream using Google Cloud Translate API.
|
|
980
|
+
> It supports German, English, French, and Italian languages.
|
|
853
981
|
|
|
854
982
|
| Port | Payload |
|
|
855
983
|
| ------- | ----------- |
|
|
@@ -858,48 +986,83 @@ The following nodes process text chunks only.
|
|
|
858
986
|
|
|
859
987
|
| Parameter | Position | Default | Requirement |
|
|
860
988
|
| ------------ | --------- | -------- | ------------------ |
|
|
861
|
-
| **
|
|
862
|
-
| **
|
|
863
|
-
| **
|
|
864
|
-
| **dst** | 1 | "en" | `/^(?:de\|en)$/` |
|
|
989
|
+
| **key** | *none* | env.SPEECHFLOW\_GOOGLE\_KEY | *none* |
|
|
990
|
+
| **src** | 0 | "de" | `/^(?:de\|en\|fr\|it)$/` |
|
|
991
|
+
| **dst** | 1 | "en" | `/^(?:de\|en\|fr\|it)$/` |
|
|
865
992
|
|
|
866
|
-
- Node: **t2t-
|
|
867
|
-
Purpose: **
|
|
868
|
-
Example: `t2t-
|
|
993
|
+
- Node: **t2t-translate**<br/>
|
|
994
|
+
Purpose: **LLM-based Text-to-Text translation**<br/>
|
|
995
|
+
Example: `t2t-translate(src: "de", dst: "en")`<br/>
|
|
996
|
+
Notice: this node requires an LLM provider (Ollama by default, or cloud-based OpenAI/Anthropic/Google, or local HuggingFace Transformers)!
|
|
869
997
|
|
|
870
998
|
> This node performs translation between English and German languages
|
|
871
|
-
> in the text stream
|
|
999
|
+
> in the text stream using an LLM service. Multiple LLM providers are
|
|
1000
|
+
> supported: local Ollama (default), local HuggingFace Transformers,
|
|
1001
|
+
> or cloud-based OpenAI, Anthropic, or Google.
|
|
872
1002
|
|
|
873
1003
|
| Port | Payload |
|
|
874
1004
|
| ------- | ----------- |
|
|
875
1005
|
| input | text |
|
|
876
1006
|
| output | text |
|
|
877
1007
|
|
|
878
|
-
| Parameter | Position | Default
|
|
879
|
-
| ------------ | --------- |
|
|
880
|
-
| **
|
|
881
|
-
| **
|
|
882
|
-
| **
|
|
1008
|
+
| Parameter | Position | Default | Requirement |
|
|
1009
|
+
| ------------ | --------- | ------------------------ | ---------------------------------------- |
|
|
1010
|
+
| **src** | 0 | "de" | `/^(?:de\|en)$/` |
|
|
1011
|
+
| **dst** | 1 | "en" | `/^(?:de\|en)$/` |
|
|
1012
|
+
| **provider** | *none* | "ollama" | `/^(?:openai\|anthropic\|google\|ollama\|transformers)$/` |
|
|
1013
|
+
| **api** | *none* | "http://127.0.0.1:11434" | `/^https?:\/\/.+?(:\d+)?$/` |
|
|
1014
|
+
| **model** | *none* | "gemma3:4b-it-q4\_K\_M" | *none* |
|
|
1015
|
+
| **key** | *none* | "" | *none* |
|
|
1016
|
+
|
|
1017
|
+
- Node: **t2t-spellcheck**<br/>
|
|
1018
|
+
Purpose: **LLM-based Text-to-Text spellchecking**<br/>
|
|
1019
|
+
Example: `t2t-spellcheck(lang: "en")`<br/>
|
|
1020
|
+
Notice: this node requires an LLM provider (Ollama by default, or cloud-based OpenAI/Anthropic/Google, or local HuggingFace Transformers)!
|
|
1021
|
+
|
|
1022
|
+
> This node performs spellchecking of English or German text using an
|
|
1023
|
+
> LLM service. It corrects spelling mistakes, adds missing punctuation,
|
|
1024
|
+
> but preserves grammar and word choice. Multiple LLM providers are
|
|
1025
|
+
> supported: local Ollama (default), local HuggingFace Transformers,
|
|
1026
|
+
> or cloud-based OpenAI, Anthropic, or Google.
|
|
883
1027
|
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
1028
|
+
| Port | Payload |
|
|
1029
|
+
| ------- | ----------- |
|
|
1030
|
+
| input | text |
|
|
1031
|
+
| output | text |
|
|
888
1032
|
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
1033
|
+
| Parameter | Position | Default | Requirement |
|
|
1034
|
+
| ------------ | --------- | ------------------------ | ---------------------------------------- |
|
|
1035
|
+
| **lang** | 0 | "en" | `/^(?:en\|de)$/` |
|
|
1036
|
+
| **provider** | *none* | "ollama" | `/^(?:openai\|anthropic\|google\|ollama\|transformers)$/` |
|
|
1037
|
+
| **api** | *none* | "http://127.0.0.1:11434" | `/^https?:\/\/.+?(:\d+)?$/` |
|
|
1038
|
+
| **model** | *none* | "gemma3:4b-it-q4\_K\_M" | *none* |
|
|
1039
|
+
| **key** | *none* | "" | *none* |
|
|
1040
|
+
|
|
1041
|
+
- Node: **t2t-punctuation**<br/>
|
|
1042
|
+
Purpose: **LLM-based punctuation restoration**<br/>
|
|
1043
|
+
Example: `t2t-punctuation(lang: "en")`<br/>
|
|
1044
|
+
Notice: this node requires an LLM provider (Ollama by default, or cloud-based OpenAI/Anthropic/Google, or local HuggingFace Transformers)!
|
|
1045
|
+
|
|
1046
|
+
> This node performs punctuation restoration using an LLM service.
|
|
1047
|
+
> It adds missing punctuation marks (periods, commas, question marks,
|
|
1048
|
+
> exclamation marks, colons, semicolons) and capitalizes the first
|
|
1049
|
+
> letters of sentences. It preserves all original words exactly as they
|
|
1050
|
+
> are without spelling corrections or grammar changes. Multiple LLM
|
|
1051
|
+
> providers are supported: local Ollama (default), local HuggingFace
|
|
1052
|
+
> Transformers, or cloud-based OpenAI, Anthropic, or Google.
|
|
892
1053
|
|
|
893
1054
|
| Port | Payload |
|
|
894
1055
|
| ------- | ----------- |
|
|
895
1056
|
| input | text |
|
|
896
1057
|
| output | text |
|
|
897
1058
|
|
|
898
|
-
| Parameter | Position | Default
|
|
899
|
-
| ------------ | --------- |
|
|
900
|
-
| **
|
|
901
|
-
| **
|
|
902
|
-
| **
|
|
1059
|
+
| Parameter | Position | Default | Requirement |
|
|
1060
|
+
| ------------ | --------- | ------------------------ | ---------------------------------------- |
|
|
1061
|
+
| **lang** | 0 | "en" | `/^(?:en\|de)$/` |
|
|
1062
|
+
| **provider** | *none* | "ollama" | `/^(?:openai\|anthropic\|google\|ollama\|transformers)$/` |
|
|
1063
|
+
| **api** | *none* | "http://127.0.0.1:11434" | `/^https?:\/\/.+?(:\d+)?$/` |
|
|
1064
|
+
| **model** | *none* | "gemma3:4b-it-q4\_K\_M" | *none* |
|
|
1065
|
+
| **key** | *none* | "" | *none* |
|
|
903
1066
|
|
|
904
1067
|
- Node: **t2t-modify**<br/>
|
|
905
1068
|
Purpose: **regex-based text modification**<br/>
|
|
@@ -919,6 +1082,53 @@ The following nodes process text chunks only.
|
|
|
919
1082
|
| **match** | 0 | "" | *required* |
|
|
920
1083
|
| **replace** | 1 | "" | *required* |
|
|
921
1084
|
|
|
1085
|
+
- Node: **t2t-profanity**<br/>
|
|
1086
|
+
Purpose: **profanity filtering**<br/>
|
|
1087
|
+
Example: `t2t-profanity(lang: "en", placeholder: "***")`<br/>
|
|
1088
|
+
|
|
1089
|
+
> This node filters profanity from the text stream by detecting bad words
|
|
1090
|
+
> and replacing them with a placeholder. It supports English and German
|
|
1091
|
+
> languages and can either replace with a fixed placeholder or repeat
|
|
1092
|
+
> the placeholder character for each character of the detected word.
|
|
1093
|
+
|
|
1094
|
+
| Port | Payload |
|
|
1095
|
+
| ------- | ----------- |
|
|
1096
|
+
| input | text |
|
|
1097
|
+
| output | text |
|
|
1098
|
+
|
|
1099
|
+
| Parameter | Position | Default | Requirement |
|
|
1100
|
+
| --------------- | --------- | ---------- | ------------------------ |
|
|
1101
|
+
| **lang** | *none* | "en" | `/^(?:en\|de)$/` |
|
|
1102
|
+
| **placeholder** | *none* | "\*\*\*" | *none* |
|
|
1103
|
+
| **mode** | *none* | "replace" | `/^(?:replace\|repeat)$/`|
|
|
1104
|
+
|
|
1105
|
+
- Node: **t2t-summary**<br/>
|
|
1106
|
+
Purpose: **LLM-based Text-to-Text summarization**<br/>
|
|
1107
|
+
Example: `t2t-summary(lang: "en", size: 4, trigger: 8)`<br/>
|
|
1108
|
+
Notice: this node requires an LLM provider (Ollama by default, or cloud-based OpenAI/Anthropic/Google, or local HuggingFace Transformers)!
|
|
1109
|
+
|
|
1110
|
+
> This node performs text summarization using an LLM service.
|
|
1111
|
+
> It accumulates incoming text sentences and generates a summary after
|
|
1112
|
+
> a configurable number of sentences (trigger). The summary length is
|
|
1113
|
+
> also configurable (size). It supports English and German languages.
|
|
1114
|
+
> Multiple LLM providers are supported: local Ollama (default), local
|
|
1115
|
+
> HuggingFace Transformers, or cloud-based OpenAI, Anthropic, or Google.
|
|
1116
|
+
|
|
1117
|
+
| Port | Payload |
|
|
1118
|
+
| ------- | ----------- |
|
|
1119
|
+
| input | text |
|
|
1120
|
+
| output | text |
|
|
1121
|
+
|
|
1122
|
+
| Parameter | Position | Default | Requirement |
|
|
1123
|
+
| ------------ | --------- | ------------------------ | ---------------------------------------- |
|
|
1124
|
+
| **provider** | *none* | "ollama" | `/^(?:openai\|anthropic\|google\|ollama\|transformers)$/` |
|
|
1125
|
+
| **api** | *none* | "http://127.0.0.1:11434" | `/^https?:\/\/.+?(:\d+)?$/` |
|
|
1126
|
+
| **model** | *none* | "gemma3:4b-it-q4\_K\_M" | *none* |
|
|
1127
|
+
| **key** | *none* | "" | *none* |
|
|
1128
|
+
| **lang** | 0 | "en" | `/^(?:en\|de)$/` |
|
|
1129
|
+
| **size** | 1 | 4 | `1 <= n <= 20` |
|
|
1130
|
+
| **trigger** | 2 | 8 | `1 <= n <= 100` |
|
|
1131
|
+
|
|
922
1132
|
- Node: **t2t-sentence**<br/>
|
|
923
1133
|
Purpose: **sentence splitting/merging**<br/>
|
|
924
1134
|
Example: `t2t-sentence()`<br/>
|
|
@@ -977,6 +1187,32 @@ The following nodes process text chunks only.
|
|
|
977
1187
|
|
|
978
1188
|
The following nodes convert text chunks to audio chunks.
|
|
979
1189
|
|
|
1190
|
+
- Node: **t2a-openai**<br/>
|
|
1191
|
+
Purpose: **OpenAI Text-to-Speech conversion**<br/>
|
|
1192
|
+
Example: `t2a-openai(voice: "nova", model: "tts-1-hd")`<br/>
|
|
1193
|
+
Notice: this node requires an OpenAI API key!
|
|
1194
|
+
|
|
1195
|
+
> This node uses OpenAI TTS to perform Text-to-Speech (T2S)
|
|
1196
|
+
> conversion, i.e., it converts the input text stream into an output
|
|
1197
|
+
> audio stream. It supports six built-in voices and two models:
|
|
1198
|
+
> `tts-1` for lower latency and `tts-1-hd` for higher quality.
|
|
1199
|
+
> The language is automatically detected from the input text and
|
|
1200
|
+
> supports many languages including German, English, French, Spanish,
|
|
1201
|
+
> Chinese, Japanese, and more (no language parameter needed).
|
|
1202
|
+
|
|
1203
|
+
| Port | Payload |
|
|
1204
|
+
| ------- | ----------- |
|
|
1205
|
+
| input | text |
|
|
1206
|
+
| output | audio |
|
|
1207
|
+
|
|
1208
|
+
| Parameter | Position | Default | Requirement |
|
|
1209
|
+
| -------------- | --------- | --------- | ------------------ |
|
|
1210
|
+
| **key** | *none* | env.SPEECHFLOW\_OPENAI\_KEY | *none* |
|
|
1211
|
+
| **api** | *none* | "https://api.openai.com/v1" | `/^https?:\/\/.+/` |
|
|
1212
|
+
| **voice** | 0 | "alloy" | `/^(?:alloy\|echo\|fable\|onyx\|nova\|shimmer)$/` |
|
|
1213
|
+
| **model** | 1 | "tts-1" | `/^(?:tts-1\|tts-1-hd)$/` |
|
|
1214
|
+
| **speed** | 2 | 1.0 | `0.25 <= n <= 4.0` |
|
|
1215
|
+
|
|
980
1216
|
- Node: **t2a-amazon**<br/>
|
|
981
1217
|
Purpose: **Amazon Polly Text-to-Speech conversion**<br/>
|
|
982
1218
|
Example: `t2a-amazon(language: "en", voice: "Danielle)`<br/>
|
|
@@ -996,7 +1232,7 @@ The following nodes convert text chunks to audio chunks.
|
|
|
996
1232
|
| **key** | *none* | env.SPEECHFLOW\_AMAZON\_KEY | *none* |
|
|
997
1233
|
| **secKey** | *none* | env.SPEECHFLOW\_AMAZON\_KEY\_SEC | *none* |
|
|
998
1234
|
| **region** | *none* | "eu-central-1" | *none* |
|
|
999
|
-
| **voice** | 0 | "Amy" |
|
|
1235
|
+
| **voice** | 0 | "Amy" | `/^(?:Amy\|Danielle\|Joanna\|Matthew\|Ruth\|Stephen\|Vicki\|Daniel)$/` |
|
|
1000
1236
|
| **language** | 1 | "en" | `/^(?:de\|en)$/` |
|
|
1001
1237
|
|
|
1002
1238
|
- Node: **t2a-elevenlabs**<br/>
|
|
@@ -1018,11 +1254,34 @@ The following nodes convert text chunks to audio chunks.
|
|
|
1018
1254
|
| **key** | *none* | env.SPEECHFLOW\_ELEVENLABS\_KEY | *none* |
|
|
1019
1255
|
| **voice** | 0 | "Brian" | `/^(?:Brittney\|Cassidy\|Leonie\|Mark\|Brian)$/` |
|
|
1020
1256
|
| **language** | 1 | "de" | `/^(?:de\|en)$/` |
|
|
1021
|
-
| **speed** | 2 | 1.00 | `n >= 0
|
|
1257
|
+
| **speed** | 2 | 1.00 | `n >= 0.7 && n <= 1.2` |
|
|
1022
1258
|
| **stability** | 3 | 0.5 | `n >= 0.0 && n <= 1.0` |
|
|
1023
1259
|
| **similarity** | 4 | 0.75 | `n >= 0.0 && n <= 1.0` |
|
|
1024
1260
|
| **optimize** | 5 | "latency" | `/^(?:latency\|quality)$/` |
|
|
1025
1261
|
|
|
1262
|
+
- Node: **t2a-google**<br/>
|
|
1263
|
+
Purpose: **Google Cloud Text-to-Speech conversion**<br/>
|
|
1264
|
+
Example: `t2a-google(voice: "en-US-Neural2-J", language: "en-US")`<br/>
|
|
1265
|
+
Notice: this node requires a Google Cloud API key!
|
|
1266
|
+
|
|
1267
|
+
> This node uses Google Cloud Text-to-Speech to perform Text-to-Speech (T2S)
|
|
1268
|
+
> conversion, i.e., it converts the input text stream into an output
|
|
1269
|
+
> audio stream. It supports various voices and languages with configurable
|
|
1270
|
+
> speaking rate and pitch adjustment.
|
|
1271
|
+
|
|
1272
|
+
| Port | Payload |
|
|
1273
|
+
| ------- | ----------- |
|
|
1274
|
+
| input | text |
|
|
1275
|
+
| output | audio |
|
|
1276
|
+
|
|
1277
|
+
| Parameter | Position | Default | Requirement |
|
|
1278
|
+
| ------------ | --------- | ------------------ | -------------------- |
|
|
1279
|
+
| **key** | *none* | env.SPEECHFLOW\_GOOGLE\_KEY | *none* |
|
|
1280
|
+
| **voice** | 0 | "en-US-Neural2-J" | *none* |
|
|
1281
|
+
| **language** | 1 | "en-US" | *none* |
|
|
1282
|
+
| **speed** | 2 | 1.0 | `0.25 <= n <= 4.0` |
|
|
1283
|
+
| **pitch** | 3 | 0.0 | `-20.0 <= n <= 20.0` |
|
|
1284
|
+
|
|
1026
1285
|
- Node: **t2a-kokoro**<br/>
|
|
1027
1286
|
Purpose: **Kokoro Text-to-Speech conversion**<br/>
|
|
1028
1287
|
Example: `t2a-kokoro(language: "en")`<br/>
|
|
@@ -1043,6 +1302,26 @@ The following nodes convert text chunks to audio chunks.
|
|
|
1043
1302
|
| **language** | 1 | "en" | `/^en$/` |
|
|
1044
1303
|
| **speed** | 2 | 1.25 | 1.0...1.30 |
|
|
1045
1304
|
|
|
1305
|
+
- Node: **t2a-supertonic**<br/>
|
|
1306
|
+
Purpose: **Supertonic Text-to-Speech conversion**<br/>
|
|
1307
|
+
Example: `t2a-supertonic(voice: "M1", speed: 1.40)`<br/>
|
|
1308
|
+
|
|
1309
|
+
> This node uses Supertonic to perform Text-to-Speech (T2S) conversion,
|
|
1310
|
+
> i.e., it converts the input text stream into an output audio stream.
|
|
1311
|
+
> It is intended to generate speech. The ONNX models are automatically
|
|
1312
|
+
> downloaded from HuggingFace on first use. It supports English language only.
|
|
1313
|
+
|
|
1314
|
+
| Port | Payload |
|
|
1315
|
+
| ------- | ----------- |
|
|
1316
|
+
| input | text |
|
|
1317
|
+
| output | audio |
|
|
1318
|
+
|
|
1319
|
+
| Parameter | Position | Default | Requirement |
|
|
1320
|
+
| ------------ | --------- | -------- | ----------- |
|
|
1321
|
+
| **voice** | 0 | "M1" | `/^(?:M1\|M2\|F1\|F2)$/` |
|
|
1322
|
+
| **speed** | 1 | 1.40 | `0.5 <= n <= 2.0` |
|
|
1323
|
+
| **steps** | 2 | 20 | `1 <= n <= 20` |
|
|
1324
|
+
|
|
1046
1325
|
### Any-to-Any Nodes
|
|
1047
1326
|
|
|
1048
1327
|
The following nodes process any type of chunk, i.e., both audio and text chunks.
|
|
@@ -1064,8 +1343,8 @@ The following nodes process any type of chunk, i.e., both audio and text chunks.
|
|
|
1064
1343
|
| Parameter | Position | Default | Requirement |
|
|
1065
1344
|
| ------------ | --------- | -------- | --------------------- |
|
|
1066
1345
|
| **type** | 0 | "audio" | `/^(?:audio\|text)$/` |
|
|
1067
|
-
| **name** | 1 | "filter" |
|
|
1068
|
-
| **var** | 2 | "" | `/^(?:meta:.+\|payload:(?:length\|text)\|time:(?:start\|end))$/` |
|
|
1346
|
+
| **name** | 1 | "filter" | `/^.+?$/` |
|
|
1347
|
+
| **var** | 2 | "" | `/^(?:meta:.+\|payload:(?:length\|text)\|time:(?:start\|end)\|kind\|type)$/` |
|
|
1069
1348
|
| **op** | 3 | "==" | `/^(?:<\|<=\|==\|!=\|~~\|!~\|>=\|>)$/` |
|
|
1070
1349
|
| **val** | 4 | "" | `/^.*$/` |
|
|
1071
1350
|
|