speechflow 1.4.4 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168)
  1. package/CHANGELOG.md +37 -0
  2. package/README.md +273 -7
  3. package/etc/claude.md +70 -0
  4. package/etc/speechflow.png +0 -0
  5. package/etc/speechflow.yaml +29 -11
  6. package/etc/stx.conf +7 -0
  7. package/package.json +7 -6
  8. package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.d.ts +1 -0
  9. package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js +155 -0
  10. package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js.map +1 -0
  11. package/speechflow-cli/dst/speechflow-node-a2a-compressor.d.ts +15 -0
  12. package/speechflow-cli/dst/speechflow-node-a2a-compressor.js +287 -0
  13. package/speechflow-cli/dst/speechflow-node-a2a-compressor.js.map +1 -0
  14. package/speechflow-cli/dst/speechflow-node-a2a-dynamics-wt.d.ts +1 -0
  15. package/speechflow-cli/dst/speechflow-node-a2a-dynamics-wt.js +208 -0
  16. package/speechflow-cli/dst/speechflow-node-a2a-dynamics-wt.js.map +1 -0
  17. package/speechflow-cli/dst/speechflow-node-a2a-dynamics.d.ts +15 -0
  18. package/speechflow-cli/dst/speechflow-node-a2a-dynamics.js +312 -0
  19. package/speechflow-cli/dst/speechflow-node-a2a-dynamics.js.map +1 -0
  20. package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.d.ts +1 -0
  21. package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js +161 -0
  22. package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js.map +1 -0
  23. package/speechflow-cli/dst/speechflow-node-a2a-expander.d.ts +13 -0
  24. package/speechflow-cli/dst/speechflow-node-a2a-expander.js +208 -0
  25. package/speechflow-cli/dst/speechflow-node-a2a-expander.js.map +1 -0
  26. package/speechflow-cli/dst/speechflow-node-a2a-ffmpeg.js +13 -3
  27. package/speechflow-cli/dst/speechflow-node-a2a-ffmpeg.js.map +1 -1
  28. package/speechflow-cli/dst/speechflow-node-a2a-filler.d.ts +14 -0
  29. package/speechflow-cli/dst/speechflow-node-a2a-filler.js +233 -0
  30. package/speechflow-cli/dst/speechflow-node-a2a-filler.js.map +1 -0
  31. package/speechflow-cli/dst/speechflow-node-a2a-gain.d.ts +12 -0
  32. package/speechflow-cli/dst/speechflow-node-a2a-gain.js +125 -0
  33. package/speechflow-cli/dst/speechflow-node-a2a-gain.js.map +1 -0
  34. package/speechflow-cli/dst/speechflow-node-a2a-gender.d.ts +0 -1
  35. package/speechflow-cli/dst/speechflow-node-a2a-gender.js +28 -12
  36. package/speechflow-cli/dst/speechflow-node-a2a-gender.js.map +1 -1
  37. package/speechflow-cli/dst/speechflow-node-a2a-meter.d.ts +1 -1
  38. package/speechflow-cli/dst/speechflow-node-a2a-meter.js +35 -53
  39. package/speechflow-cli/dst/speechflow-node-a2a-meter.js.map +1 -1
  40. package/speechflow-cli/dst/speechflow-node-a2a-mute.js +2 -1
  41. package/speechflow-cli/dst/speechflow-node-a2a-mute.js.map +1 -1
  42. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise-wt.d.ts +1 -0
  43. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise-wt.js +55 -0
  44. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise-wt.js.map +1 -0
  45. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise.d.ts +14 -0
  46. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise.js +184 -0
  47. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise.js.map +1 -0
  48. package/speechflow-cli/dst/speechflow-node-a2a-speex.d.ts +14 -0
  49. package/speechflow-cli/dst/speechflow-node-a2a-speex.js +156 -0
  50. package/speechflow-cli/dst/speechflow-node-a2a-speex.js.map +1 -0
  51. package/speechflow-cli/dst/speechflow-node-a2a-vad.js +3 -3
  52. package/speechflow-cli/dst/speechflow-node-a2a-vad.js.map +1 -1
  53. package/speechflow-cli/dst/speechflow-node-a2a-wav.js +22 -17
  54. package/speechflow-cli/dst/speechflow-node-a2a-wav.js.map +1 -1
  55. package/speechflow-cli/dst/speechflow-node-a2t-awstranscribe.d.ts +18 -0
  56. package/speechflow-cli/dst/speechflow-node-a2t-awstranscribe.js +317 -0
  57. package/speechflow-cli/dst/speechflow-node-a2t-awstranscribe.js.map +1 -0
  58. package/speechflow-cli/dst/speechflow-node-a2t-deepgram.js +16 -33
  59. package/speechflow-cli/dst/speechflow-node-a2t-deepgram.js.map +1 -1
  60. package/speechflow-cli/dst/speechflow-node-a2t-openaitranscribe.d.ts +19 -0
  61. package/speechflow-cli/dst/speechflow-node-a2t-openaitranscribe.js +351 -0
  62. package/speechflow-cli/dst/speechflow-node-a2t-openaitranscribe.js.map +1 -0
  63. package/speechflow-cli/dst/speechflow-node-t2a-awspolly.d.ts +16 -0
  64. package/speechflow-cli/dst/speechflow-node-t2a-awspolly.js +171 -0
  65. package/speechflow-cli/dst/speechflow-node-t2a-awspolly.js.map +1 -0
  66. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js +19 -14
  67. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js.map +1 -1
  68. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js +11 -6
  69. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js.map +1 -1
  70. package/speechflow-cli/dst/speechflow-node-t2t-awstranslate.d.ts +13 -0
  71. package/speechflow-cli/dst/speechflow-node-t2t-awstranslate.js +141 -0
  72. package/speechflow-cli/dst/speechflow-node-t2t-awstranslate.js.map +1 -0
  73. package/speechflow-cli/dst/speechflow-node-t2t-deepl.js +13 -15
  74. package/speechflow-cli/dst/speechflow-node-t2t-deepl.js.map +1 -1
  75. package/speechflow-cli/dst/speechflow-node-t2t-format.js +10 -15
  76. package/speechflow-cli/dst/speechflow-node-t2t-format.js.map +1 -1
  77. package/speechflow-cli/dst/speechflow-node-t2t-ollama.js +44 -31
  78. package/speechflow-cli/dst/speechflow-node-t2t-ollama.js.map +1 -1
  79. package/speechflow-cli/dst/speechflow-node-t2t-openai.js +44 -45
  80. package/speechflow-cli/dst/speechflow-node-t2t-openai.js.map +1 -1
  81. package/speechflow-cli/dst/speechflow-node-t2t-sentence.js +8 -8
  82. package/speechflow-cli/dst/speechflow-node-t2t-sentence.js.map +1 -1
  83. package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js +10 -12
  84. package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js.map +1 -1
  85. package/speechflow-cli/dst/speechflow-node-t2t-transformers.js +22 -27
  86. package/speechflow-cli/dst/speechflow-node-t2t-transformers.js.map +1 -1
  87. package/speechflow-cli/dst/speechflow-node-x2x-filter.d.ts +1 -0
  88. package/speechflow-cli/dst/speechflow-node-x2x-filter.js +50 -15
  89. package/speechflow-cli/dst/speechflow-node-x2x-filter.js.map +1 -1
  90. package/speechflow-cli/dst/speechflow-node-x2x-trace.js +17 -18
  91. package/speechflow-cli/dst/speechflow-node-x2x-trace.js.map +1 -1
  92. package/speechflow-cli/dst/speechflow-node-xio-device.js +13 -21
  93. package/speechflow-cli/dst/speechflow-node-xio-device.js.map +1 -1
  94. package/speechflow-cli/dst/speechflow-node-xio-mqtt.d.ts +1 -0
  95. package/speechflow-cli/dst/speechflow-node-xio-mqtt.js +22 -16
  96. package/speechflow-cli/dst/speechflow-node-xio-mqtt.js.map +1 -1
  97. package/speechflow-cli/dst/speechflow-node-xio-websocket.js +19 -19
  98. package/speechflow-cli/dst/speechflow-node-xio-websocket.js.map +1 -1
  99. package/speechflow-cli/dst/speechflow-node.d.ts +6 -3
  100. package/speechflow-cli/dst/speechflow-node.js +13 -2
  101. package/speechflow-cli/dst/speechflow-node.js.map +1 -1
  102. package/speechflow-cli/dst/speechflow-utils-audio-wt.d.ts +1 -0
  103. package/speechflow-cli/dst/speechflow-utils-audio-wt.js +124 -0
  104. package/speechflow-cli/dst/speechflow-utils-audio-wt.js.map +1 -0
  105. package/speechflow-cli/dst/speechflow-utils-audio.d.ts +13 -0
  106. package/speechflow-cli/dst/speechflow-utils-audio.js +137 -0
  107. package/speechflow-cli/dst/speechflow-utils-audio.js.map +1 -0
  108. package/speechflow-cli/dst/speechflow-utils.d.ts +18 -0
  109. package/speechflow-cli/dst/speechflow-utils.js +123 -35
  110. package/speechflow-cli/dst/speechflow-utils.js.map +1 -1
  111. package/speechflow-cli/dst/speechflow.js +114 -27
  112. package/speechflow-cli/dst/speechflow.js.map +1 -1
  113. package/speechflow-cli/etc/oxlint.jsonc +112 -11
  114. package/speechflow-cli/etc/stx.conf +2 -2
  115. package/speechflow-cli/etc/tsconfig.json +1 -1
  116. package/speechflow-cli/package.d/@shiguredo+rnnoise-wasm+2025.1.5.patch +25 -0
  117. package/speechflow-cli/package.json +102 -94
  118. package/speechflow-cli/src/lib.d.ts +24 -0
  119. package/speechflow-cli/src/speechflow-node-a2a-compressor-wt.ts +151 -0
  120. package/speechflow-cli/src/speechflow-node-a2a-compressor.ts +303 -0
  121. package/speechflow-cli/src/speechflow-node-a2a-expander-wt.ts +158 -0
  122. package/speechflow-cli/src/speechflow-node-a2a-expander.ts +212 -0
  123. package/speechflow-cli/src/speechflow-node-a2a-ffmpeg.ts +13 -3
  124. package/speechflow-cli/src/speechflow-node-a2a-filler.ts +223 -0
  125. package/speechflow-cli/src/speechflow-node-a2a-gain.ts +98 -0
  126. package/speechflow-cli/src/speechflow-node-a2a-gender.ts +31 -17
  127. package/speechflow-cli/src/speechflow-node-a2a-meter.ts +37 -56
  128. package/speechflow-cli/src/speechflow-node-a2a-mute.ts +3 -2
  129. package/speechflow-cli/src/speechflow-node-a2a-rnnoise-wt.ts +62 -0
  130. package/speechflow-cli/src/speechflow-node-a2a-rnnoise.ts +164 -0
  131. package/speechflow-cli/src/speechflow-node-a2a-speex.ts +137 -0
  132. package/speechflow-cli/src/speechflow-node-a2a-vad.ts +3 -3
  133. package/speechflow-cli/src/speechflow-node-a2a-wav.ts +20 -13
  134. package/speechflow-cli/src/speechflow-node-a2t-awstranscribe.ts +308 -0
  135. package/speechflow-cli/src/speechflow-node-a2t-deepgram.ts +16 -33
  136. package/speechflow-cli/src/speechflow-node-a2t-openaitranscribe.ts +337 -0
  137. package/speechflow-cli/src/speechflow-node-t2a-awspolly.ts +187 -0
  138. package/speechflow-cli/src/speechflow-node-t2a-elevenlabs.ts +19 -14
  139. package/speechflow-cli/src/speechflow-node-t2a-kokoro.ts +12 -7
  140. package/speechflow-cli/src/speechflow-node-t2t-awstranslate.ts +152 -0
  141. package/speechflow-cli/src/speechflow-node-t2t-deepl.ts +13 -15
  142. package/speechflow-cli/src/speechflow-node-t2t-format.ts +10 -15
  143. package/speechflow-cli/src/speechflow-node-t2t-ollama.ts +55 -42
  144. package/speechflow-cli/src/speechflow-node-t2t-openai.ts +58 -58
  145. package/speechflow-cli/src/speechflow-node-t2t-sentence.ts +10 -10
  146. package/speechflow-cli/src/speechflow-node-t2t-subtitle.ts +15 -16
  147. package/speechflow-cli/src/speechflow-node-t2t-transformers.ts +27 -32
  148. package/speechflow-cli/src/speechflow-node-x2x-filter.ts +20 -16
  149. package/speechflow-cli/src/speechflow-node-x2x-trace.ts +20 -19
  150. package/speechflow-cli/src/speechflow-node-xio-device.ts +15 -23
  151. package/speechflow-cli/src/speechflow-node-xio-mqtt.ts +23 -16
  152. package/speechflow-cli/src/speechflow-node-xio-websocket.ts +19 -19
  153. package/speechflow-cli/src/speechflow-node.ts +21 -8
  154. package/speechflow-cli/src/speechflow-utils-audio-wt.ts +172 -0
  155. package/speechflow-cli/src/speechflow-utils-audio.ts +147 -0
  156. package/speechflow-cli/src/speechflow-utils.ts +125 -32
  157. package/speechflow-cli/src/speechflow.ts +118 -30
  158. package/speechflow-ui-db/dst/index.css +1 -1
  159. package/speechflow-ui-db/dst/index.js +31 -31
  160. package/speechflow-ui-db/etc/eslint.mjs +0 -1
  161. package/speechflow-ui-db/etc/tsc-client.json +3 -3
  162. package/speechflow-ui-db/package.json +11 -10
  163. package/speechflow-ui-db/src/app.vue +96 -78
  164. package/speechflow-ui-st/dst/index.js +26 -26
  165. package/speechflow-ui-st/etc/eslint.mjs +0 -1
  166. package/speechflow-ui-st/etc/tsc-client.json +3 -3
  167. package/speechflow-ui-st/package.json +11 -10
  168. package/speechflow-ui-st/src/app.vue +5 -12
package/CHANGELOG.md CHANGED
@@ -2,6 +2,43 @@
2
2
  ChangeLog
3
3
  =========
4
4
 
5
+ 1.5.0 (2025-08-31)
6
+ ------------------
7
+
8
+ - IMPROVEMENT: add improved dashboard infrastructure and allow nodes to publish dashboard info
9
+ - IMPROVEMENT: add CLI option for exporting dashboard info via OSC
10
+ - IMPROVEMENT: add new audio processing nodes (compressor with sidechain, expander, gain, filler)
11
+ - IMPROVEMENT: add AWS integration nodes (Polly, Translate, Transcribe)
12
+ - IMPROVEMENT: add OpenAI Transcribe node for speech-to-text
13
+ - IMPROVEMENT: add noise suppression nodes (rnnoise, speex)
14
+ - IMPROVEMENT: provide audio helper utilities and access bus functionality
15
+ - IMPROVEMENT: improve types and error handling
16
+ - IMPROVEMENT: switch to GPT-5 with improved error handling and timeout support
17
+ - IMPROVEMENT: switch from native compressor to custom implementation
18
+ - BUGFIX: fix usage of AudioIO quit and abort methods
19
+ - BUGFIX: fix operator order in audio processing
20
+ - BUGFIX: reset envelope array when channels change
21
+ - BUGFIX: fix parameter configuration in audio nodes
22
+ - BUGFIX: fix private field access and remove unnecessary casts
23
+ - UPDATE: upgrade NPM dependencies
24
+ - UPDATE: update OxLint rules and configuration
25
+ - CLEANUP: cleanup and simplify code throughout project
26
+ - CLEANUP: cleanup expander node implementation and remove stereoLink feature
27
+ - CLEANUP: cleanup gender, ffmpeg, filler, and AWS nodes
28
+ - CLEANUP: reduce code depth in multiple components
29
+ - CLEANUP: align identifiers with remaining code
30
+ - CLEANUP: make code compliant with updated linter rules
31
+ - CLEANUP: fix indentation and remove duplicate entries
32
+
33
+ 1.4.5 (2025-08-07)
34
+ ------------------
35
+
36
+ - IMPROVEMENT: better CLI option handling
37
+ - IMPROVEMENT: better visual appearance of the dashboard
38
+ - BUGFIX: do not complain if no .env file is found
39
+ - BUGFIX: avoid read-timeouts in "deepgram" node
40
+ - CLEANUP: output stack traces only for "info" and "debug" verbosity levels
41
+
5
42
  1.4.4 (2025-08-07)
6
43
  ------------------
7
44
 
package/README.md CHANGED
@@ -31,10 +31,20 @@ remote MQTT network I/O,
31
31
  local Voice Activity Detection (VAD),
32
32
  local voice gender recognition,
33
33
  local audio LUFS-S/RMS metering,
34
+ local audio Speex noise suppression,
35
+ local audio RNNoise noise suppression,
36
+ local audio compressor processing,
37
+ local audio expander processing,
38
+ local audio gain processing,
39
+ local audio filler processing,
34
40
  remote-controllable local audio muting,
41
+ cloud-based [Amazon Transcribe](https://aws.amazon.com/transcribe/) speech-to-text conversion,
42
+ cloud-based [OpenAI GPT Transcribe](https://platform.openai.com/docs/models/gpt-4o-mini-transcribe) speech-to-text conversion,
35
43
  cloud-based [Deepgram](https://deepgram.com) speech-to-text conversion,
36
44
  cloud-based [ElevenLabs](https://elevenlabs.io/) text-to-speech conversion,
45
+ cloud-based [Amazon Polly](https://aws.amazon.com/polly/) text-to-speech conversion,
37
46
  cloud-based [DeepL](https://deepl.com) text-to-text translation,
47
+ cloud-based [Amazon Translate](https://aws.amazon.com/translate/) text-to-text translation,
38
48
  cloud-based [OpenAI/GPT](https://openai.com) text-to-text translation (or spelling correction),
39
49
  local [Ollama/Gemma](https://ollama.com) text-to-text translation (or spelling correction),
40
50
  local [OPUS/ONNX](https://github.com/Helsinki-NLP/Opus-MT) text-to-text translation,
@@ -53,6 +63,59 @@ derived from the exported `SpeechFlowNode` class of the `speechflow` package.
53
63
  **SpeechFlow** is written in TypeScript and
54
64
  ships as an installable package for the Node Package Manager (NPM).
55
65
 
66
+ Impression
67
+ ----------
68
+
69
+ **SpeechFlow** is a command-line interface (CLI) based tool, so there
70
+ is no exciting screenshot to show of its CLI appearance, of course.
71
+ Instead, here is a sample from a fictitious training session held in German
72
+ and translated to English in real time.
73
+
74
+ First, the configuration used was a straight linear pipeline in the file `sample.conf`:
75
+
76
+ ```txt
77
+ device(device: "coreaudio:Elgato Wave:3", mode: "r") |
78
+ meter(interval: 50, dashboard: "meter1") |
79
+ deepgram(language: "de", model: "nova-2", interim: true) |
80
+ trace(type: "text", dashboard: "text1") |
81
+ filter(name: "final", type: "text", var: "kind", op: "==", val: "final") |
82
+ sentence() |
83
+ trace(type: "text", dashboard: "text2") |
84
+ deepl(src: "de", dst: "en") |
85
+ trace(type: "text", dashboard: "text3") |
86
+ elevenlabs(voice: "Mark", optimize: "latency", speed: 1.05, language: "en") |
87
+ meter(interval: 50, dashboard: "meter2") |
88
+ device(device: "coreaudio:USBAudio2.0", mode: "w")
89
+ ```
90
+
91
+ Second, the corresponding **SpeechFlow** command was:
92
+
93
+ ```sh
94
+ $ speechflow -v info -c sample.conf \
95
+ -d audio:meter1:DE,text:text1:DE-Interim,text:text2:DE-Final,text:text3:EN,audio:meter2:EN
96
+ ```
97
+
98
+ Finally, the resulting dashboard at the URL `http://127.0.0.1:8484/` was:
99
+
100
+ ![dashboard](etc/speechflow.png)
101
+
102
+ On the left you can see the volume meter of the microphone (`device`),
103
+ followed by the German result of the speech-to-text conversion
104
+ (`deepgram`), followed by the still-German results of the text-to-text
105
+ sentence splitting/aggregation (`sentence`), followed by the English
106
+ results of the text-to-text translation (`deepl`), and finally on
107
+ the right you can see the volume meter of the text-to-speech conversion
108
+ (`elevenlabs`).
109
+
110
+ The entire **SpeechFlow** processing pipeline runs in real time and
111
+ the latency between input and output audio is about 2-3 seconds, very
112
+ similar to the latency that human live interpreters typically cause. The
113
+ latency primarily comes from the speech-to-text part in the pipeline,
114
+ as the ends of sentences have to be awaited -- especially in the German
115
+ language where the verb can come very late in a sentence. So, the
116
+ latency is caused not primarily by technical aspects, but by the
117
+ nature of live translation.
118
+
56
119
  Installation
57
120
  ------------
58
121
 
@@ -235,18 +298,29 @@ First a short overview of the available processing nodes:
235
298
  **mute**,
236
299
  **meter**,
237
300
  **vad**,
238
- **gender**.
301
+ **gender**,
302
+ **speex**,
303
+ **rnnoise**,
304
+ **compressor**,
305
+ **expander**,
306
+ **gain**,
307
+ **filler**.
239
308
  - Audio-to-Text nodes:
309
+ **openaitranscribe**,
310
+ **awstranscribe**,
240
311
  **deepgram**.
241
312
  - Text-to-Text nodes:
242
313
  **deepl**,
314
+ **awstranslate**,
243
315
  **openai**,
244
316
  **ollama**,
245
317
  **transformers**,
246
318
  **subtitle**,
247
319
  **format**.
248
320
  - Text-to-Audio nodes:
321
+ **awspolly**,
249
322
  **elevenlabs**,
323
+ **kokoro**.
250
324
  - Any-to-Any nodes:
251
325
  **filter**,
252
326
  **trace**.
@@ -450,10 +524,160 @@ The following nodes process audio chunks only.
450
524
  | ----------- | --------- | -------- | ------------------------ |
451
525
  | **window** | 0 | 500 | *none* |
452
526
 
527
+ - Node: **speex**<br/>
528
+ Purpose: **Speex Noise Suppression node**<br/>
529
+ Example: `speex(attentuate: -18)`
530
+
531
+ > This node uses the Speex DSP pre-processor to perform noise
532
+ > suppression, i.e., it detects and attenuates (by a certain level of
533
+ > dB) the noise in the audio stream.
534
+
535
+ | Port | Payload |
536
+ | ------- | ----------- |
537
+ | input | audio |
538
+ | output | audio |
539
+
540
+ | Parameter | Position | Default | Requirement |
541
+ | ----------- | --------- | -------- | ------------------------ |
542
+ | **attentuate** | 0 | -18 | `n >= -60 && n <= 0` |
543
+
544
+ - Node: **rnnoise**<br/>
545
+ Purpose: **RNNoise Noise Suppression node**<br/>
546
+ Example: `rnnoise()`
547
+
548
+ > This node uses RNNoise to perform noise suppression, i.e., it
549
+ > detects and attenuates the noise in the audio stream.
550
+
551
+ | Port | Payload |
552
+ | ------- | ----------- |
553
+ | input | audio |
554
+ | output | audio |
555
+
556
+ | Parameter | Position | Default | Requirement |
557
+ | ----------- | --------- | -------- | ------------------------ |
558
+
559
+ - Node: **compressor**<br/>
560
+ Purpose: **audio compressor node**<br/>
561
+ Example: `compressor(thresholdDb: -18)`
562
+
563
+ > This node applies a dynamics compressor, i.e., it attenuates the
564
+ > volume by a certain ratio whenever the volume is above the threshold.
565
+
566
+ | Port | Payload |
567
+ | ------- | ----------- |
568
+ | input | audio |
569
+ | output | audio |
570
+
571
+ | Parameter | Position | Default | Requirement |
572
+ | ----------- | --------- | -------- | ------------------------ |
573
+ | **thresholdDb** | *none* | -18 | `n <= 0 && n >= -60` |
574
+ | **ratio** | *none* | 4 | `n >= 1 && n <= 20` |
575
+ | **attackMs** | *none* | 10 | `n >= 0 && n <= 100` |
576
+ | **releaseMs** | *none* | 50 | `n >= 0 && n <= 100` |
577
+ | **kneeDb** | *none* | 6 | `n >= 0 && n <= 100` |
578
+ | **makeupDb** | *none* | 0 | `n >= 0 && n <= 100` |
579
+
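
The following TypeScript sketch illustrates the static gain curve such a soft-knee compressor applies. It is not taken from the package sources; only the parameter names (`thresholdDb`, `ratio`, `kneeDb`, `makeupDb`) follow the table above, and attack/release envelope smoothing is omitted for brevity.

```ts
// Static gain curve of a downward compressor with a soft knee.
// All levels are in dBFS; the returned value is the gain (in dB) to apply,
// including makeup gain.
function compressorGainDb(inputDb: number, thresholdDb = -18, ratio = 4,
                          kneeDb = 6, makeupDb = 0): number {
    let outputDb: number
    if (inputDb < thresholdDb - kneeDb / 2)
        outputDb = inputDb                                        /* below the knee: unchanged  */
    else if (kneeDb > 0 && inputDb <= thresholdDb + kneeDb / 2) {
        const x = inputDb - thresholdDb + kneeDb / 2              /* inside the knee: blend     */
        outputDb = inputDb + (1 / ratio - 1) * (x * x) / (2 * kneeDb)
    }
    else
        outputDb = thresholdDb + (inputDb - thresholdDb) / ratio  /* above the knee: full ratio */
    return outputDb - inputDb + makeupDb
}
```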
580
+ - Node: **expander**<br/>
581
+ Purpose: **audio expander node**<br/>
582
+ Example: `expander(thresholdDb: -46)`
583
+
584
+ > This node applies a dynamics expander, i.e., it attenuates the
585
+ > volume by a certain ratio whenever the volume is below the threshold.
586
+
587
+ | Port | Payload |
588
+ | ------- | ----------- |
589
+ | input | audio |
590
+ | output | audio |
591
+
592
+ | Parameter | Position | Default | Requirement |
593
+ | ----------- | --------- | -------- | ------------------------ |
594
+ | **thresholdDb** | *none* | -45 | `n <= 0 && n >= -60` |
595
+ | **ratio** | *none* | 4 | `n >= 1 && n <= 20` |
596
+ | **attackMs** | *none* | 10 | `n >= 0 && n <= 100` |
597
+ | **releaseMs** | *none* | 50 | `n >= 0 && n <= 100` |
598
+ | **kneeDb** | *none* | 6 | `n >= 0 && n <= 100` |
599
+ | **makeupDb** | *none* | 0 | `n >= 0 && n <= 100` |
600
+
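
For comparison, a downward expander mirrors this curve below the threshold. The following hard-knee sketch is again illustrative only and not the package's implementation:

```ts
// Hard-knee static gain curve of a downward expander: signals below the
// threshold are pushed further down by the ratio, signals above pass through.
function expanderGainDb(inputDb: number, thresholdDb = -45, ratio = 4, makeupDb = 0): number {
    const outputDb = inputDb >= thresholdDb
        ? inputDb
        : thresholdDb + (inputDb - thresholdDb) * ratio
    return outputDb - inputDb + makeupDb
}
```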
601
+ - Node: **gain**<br/>
602
+ Purpose: **audio gain adjustment node**<br/>
603
+ Example: `gain(db: 12)`
604
+
605
+ > This node applies a gain adjustment to audio, i.e., it increases or
606
+ > decreases the volume by a certain number of decibels.
607
+
608
+ | Port | Payload |
609
+ | ------- | ----------- |
610
+ | input | audio |
611
+ | output | audio |
612
+
613
+ | Parameter | Position | Default | Requirement |
614
+ | ----------- | --------- | -------- | ------------------------ |
615
+ | **db** | *none* | 12 | `n >= -60 && n <= 60` |
616
+
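
As a sketch of the underlying arithmetic (not the node's actual code), a gain of `db` decibels corresponds to a linear factor of 10^(db/20) applied to each sample:

```ts
// Apply a decibel gain to normalized PCM samples, clamping to [-1, 1].
// +12 dB corresponds to a linear factor of roughly 3.98.
function applyGainDb(samples: Float32Array, db: number): Float32Array {
    const factor = Math.pow(10, db / 20)
    return samples.map((s) => Math.max(-1, Math.min(1, s * factor)))
}
```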
617
+ - Node: **filler**<br/>
618
+ Purpose: **audio filler node**<br/>
619
+ Example: `filler()`
620
+
621
+ > This node adds missing audio frames of silence in order to fill
622
+ > the chronological gaps between generated audio frames (from
623
+ > text-to-speech).
624
+
625
+ | Port | Payload |
626
+ | ------- | ----------- |
627
+ | input | audio |
628
+ | output | audio |
629
+
630
+ | Parameter | Position | Default | Requirement |
631
+ | ----------- | --------- | -------- | ------------------------ |
632
+
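
The following sketch shows the idea behind gap filling. The chunk shape (`timestampMs` plus samples) is an assumption for illustration, not the package's actual stream format:

```ts
// Given two consecutive timestamped audio chunks, produce a chunk of silence
// covering the temporal gap between them, or null if they are contiguous.
interface AudioChunk { timestampMs: number; samples: Float32Array }

function silenceForGap(prev: AudioChunk, next: AudioChunk, sampleRate: number): AudioChunk | null {
    const prevEndMs = prev.timestampMs + (prev.samples.length / sampleRate) * 1000
    const gapMs = next.timestampMs - prevEndMs
    if (gapMs <= 0)
        return null
    const samples = new Float32Array(Math.round((gapMs / 1000) * sampleRate))
    return { timestampMs: prevEndMs, samples }
}
```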
453
633
  ### Audio-to-Text Nodes
454
634
 
455
635
  The following nodes convert audio to text chunks.
456
636
 
637
+ - Node: **openaitranscribe**<br/>
638
+ Purpose: **OpenAI/GPT Speech-to-Text conversion**<br/>
639
+ Example: `openaitranscribe(language: "de")`<br/>
640
+ Notice: this node requires an OpenAI API key!
641
+
642
+ > This node uses OpenAI GPT to perform Speech-to-Text (S2T)
643
+ > conversion, i.e., it recognizes speech in the input audio stream and
644
+ > outputs a corresponding text stream.
645
+
646
+ | Port | Payload |
647
+ | ------- | ----------- |
648
+ | input | audio |
649
+ | output | text |
650
+
651
+ | Parameter | Position | Default | Requirement |
652
+ | ------------ | --------- | -------- | ------------------ |
653
+ | **key** | *none* | env.SPEECHFLOW\_OPENAI\_KEY | *none* |
654
+ | **api** | *none* | "https://api.openai.com" | `/^https?:\/\/.+?:\d+$/` |
655
+ | **model** | *none* | "gpt-4o-mini-transcribe" | *none* |
656
+ | **language** | *none* | "en" | `/^(?:de\|en)$/` |
657
+ | **interim** | *none* | false | *none* |
658
+
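
For orientation, a one-shot request against OpenAI's audio transcription endpoint could look like the sketch below. The node itself streams audio and produces interim results differently; the helper name and control flow are illustrative only:

```ts
// Transcribe a finished WAV recording via the OpenAI transcription endpoint.
// Uses the global fetch/FormData/Blob available in Node.js 22.
async function transcribeOnce(wav: Blob, key: string, language = "en"): Promise<string> {
    const form = new FormData()
    form.append("file", wav, "audio.wav")
    form.append("model", "gpt-4o-mini-transcribe")
    form.append("language", language)
    const res = await fetch("https://api.openai.com/v1/audio/transcriptions", {
        method:  "POST",
        headers: { Authorization: `Bearer ${key}` },
        body:    form
    })
    if (!res.ok)
        throw new Error(`transcription failed: ${res.status}`)
    const data = await res.json() as { text: string }
    return data.text
}
```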
659
+ - Node: **awstranscribe**<br/>
660
+ Purpose: **Amazon Transcribe Speech-to-Text conversion**<br/>
661
+ Example: `awstranscribe(language: "de")`<br/>
662
+ Notice: this node requires an API key!
663
+
664
+ > This node uses Amazon Transcribe to perform Speech-to-Text (S2T)
665
+ > conversion, i.e., it recognizes speech in the input audio stream and
666
+ > outputs a corresponding text stream.
667
+
668
+ | Port | Payload |
669
+ | ------- | ----------- |
670
+ | input | audio |
671
+ | output | text |
672
+
673
+ | Parameter | Position | Default | Requirement |
674
+ | ------------ | --------- | -------- | ------------------ |
675
+ | **key** | *none* | env.SPEECHFLOW\_AMAZON\_KEY | *none* |
676
+ | **secKey** | *none* | env.SPEECHFLOW\_AMAZON\_KEY\_SEC | *none* |
677
+ | **region** | *none* | "eu-central-1" | *none* |
678
+ | **language** | *none* | "en" | `/^(?:en\|de)$/` |
679
+ | **interim** | *none* | false | *none* |
680
+
457
681
  - Node: **deepgram**<br/>
458
682
  Purpose: **Deepgram Speech-to-Text conversion**<br/>
459
683
  Example: `deepgram(language: "de")`<br/>
@@ -498,6 +722,26 @@ The following nodes process text chunks only.
498
722
  | **src** | 0 | "de" | `/^(?:de\|en)$/` |
499
723
  | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
500
724
 
725
+ - Node: **awstranslate**<br/>
726
+ Purpose: **AWS Translate Text-to-Text translation**<br/>
727
+ Example: `awstranslate(src: "de", dst: "en")`<br/>
728
+ Notice: this node requires an API key!
729
+
730
+ > This node uses Amazon Translate to perform Text-to-Text (T2T)
+ > translation between the English and German languages.
731
+
732
+ | Port | Payload |
733
+ | ------- | ----------- |
734
+ | input | text |
735
+ | output | text |
736
+
737
+ | Parameter | Position | Default | Requirement |
738
+ | ------------ | --------- | -------- | ------------------ |
739
+ | **key** | *none* | env.SPEECHFLOW\_AMAZON\_KEY | *none* |
740
+ | **secKey** | *none* | env.SPEECHFLOW\_AMAZON\_KEY\_SEC | *none* |
741
+ | **region** | *none* | "eu-central-1" | *none* |
742
+ | **src** | 0 | "de" | `/^(?:de\|en)$/` |
743
+ | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
744
+
501
745
  - Node: **openai**<br/>
502
746
  Purpose: **OpenAI/GPT Text-to-Text translation and spelling correction**<br/>
503
747
  Example: `openai(src: "de", dst: "en")`<br/>
@@ -618,14 +862,36 @@ The following nodes process text chunks only.
618
862
 
619
863
  The following nodes convert text chunks to audio chunks.
620
864
 
865
+ - Node: **awspolly**<br/>
866
+ Purpose: **Amazon Polly Text-to-Speech conversion**<br/>
867
+ Example: `awspolly(language: "en", voice: "Danielle")`<br/>
868
+ Notice: this node requires an Amazon API key!
869
+
870
+ > This node uses Amazon Polly to perform Text-to-Speech (T2S)
871
+ > conversion, i.e., it converts the input text stream into an output
872
+ > audio stream. It is intended to generate speech.
873
+
874
+ | Port | Payload |
875
+ | ------- | ----------- |
876
+ | input | text |
877
+ | output | audio |
878
+
879
+ | Parameter | Position | Default | Requirement |
880
+ | -------------- | --------- | --------- | ------------------ |
881
+ | **key** | *none* | env.SPEECHFLOW\_AMAZON\_KEY | *none* |
882
+ | **secKey** | *none* | env.SPEECHFLOW\_AMAZON\_KEY\_SEC | *none* |
883
+ | **region** | *none* | "eu-central-1" | *none* |
884
+ | **voice** | 0 | "Amy" | `/^(?:Amy\|Danielle\|Joanna\|Matthew\|Ruth\|Stephen\|Viki\|Daniel)$/` |
885
+ | **language** | 1 | "en" | `/^(?:de\|en)$/` |
886
+
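
A minimal sketch of a single synthesis call with the AWS SDK v3 Polly client follows. The helper name, the fixed voice, and the PCM output settings are illustrative assumptions, not the node's actual code:

```ts
import { PollyClient, SynthesizeSpeechCommand } from "@aws-sdk/client-polly"

// Synthesize one text chunk with Amazon Polly and return the raw PCM bytes.
async function synthesizeOnce(text: string, region = "eu-central-1"): Promise<Uint8Array> {
    const client = new PollyClient({ region })
    const result = await client.send(new SynthesizeSpeechCommand({
        Text:         text,
        VoiceId:      "Danielle",
        OutputFormat: "pcm",
        SampleRate:   "16000",
        Engine:       "neural"
    }))
    return result.AudioStream
        ? await result.AudioStream.transformToByteArray()
        : new Uint8Array(0)
}
```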
621
887
  - Node: **elevenlabs**<br/>
622
888
  Purpose: **ElevenLabs Text-to-Speech conversion**<br/>
623
889
  Example: `elevenlabs(language: "en")`<br/>
624
890
  Notice: this node requires an ElevenLabs API key!
625
891
 
626
- > This node perform Text-to-Speech (T2S) conversion, i.e., it converts
627
- > the input text stream into an output audio stream. It is intended to
628
- > generate speech.
892
+ > This node uses ElevenLabs to perform Text-to-Speech (T2S)
893
+ > conversion, i.e., it converts the input text stream into an output
894
+ > audio stream. It is intended to generate speech.
629
895
 
630
896
  | Port | Payload |
631
897
  | ------- | ----------- |
@@ -647,9 +913,9 @@ The following nodes convert text chunks to audio chunks.
647
913
  Example: `kokoro(language: "en")`<br/>
648
914
  Notice: this currently supports the English language only!
649
915
 
650
- > This node perform Text-to-Speech (T2S) conversion, i.e., it converts
651
- > the input text stream into an output audio stream. It is intended to
652
- > generate speech.
916
+ > This node uses Kokoro to perform Text-to-Speech (T2S) conversion,
917
+ > i.e., it converts the input text stream into an output audio stream.
918
+ > It is intended to generate speech.
653
919
 
654
920
  | Port | Payload |
655
921
  | ------- | ----------- |
package/etc/claude.md ADDED
@@ -0,0 +1,70 @@
1
+
2
+ # CLAUDE.md
3
+
4
+ This file provides guidance to Claude Code (claude.ai/code) when working
5
+ with code in this repository.
6
+
7
+ ## Project Overview
8
+
9
+ SpeechFlow is a command-line interface tool for establishing directed
10
+ data flow graphs of audio and text processing nodes. It enables flexible
11
+ speech processing tasks including capturing audio, text-to-speech,
12
+ speech-to-text, and speech-to-speech translation.
13
+
14
+ ## Architecture
15
+
16
+ SpeechFlow uses a modular node-based architecture:
17
+
18
+ - **Core Engine**: TypeScript-based CLI tool that orchestrates processing flows
19
+ - **Processing Nodes**: Modular components for different speech processing tasks (see `src/speechflow-node-*.ts`)
20
+ - **Flow Expression Language**: Based on FlowLink for defining processing graphs
21
+ - **Web Interfaces**: Two Vue.js applications for dashboard and subtitle display
22
+ - **REST/WebSocket API**: External control interface for nodes
23
+
24
+ ### Key Components
25
+
26
+ - **Main CLI**:
27
+ `src/speechflow.ts` - Entry point and CLI parsing
28
+ - **Nodes**:
29
+ - Input/Output: `file`, `device`, `websocket`, `mqtt`
30
+ - Audio-to-Audio: `ffmpeg`, `wav`, `mute`, `meter`, `vad`, `gender`
31
+ - Audio-to-Text: `deepgram`
32
+ - Text-to-Text: `deepl`, `openai`, `ollama`, `transformers`, `subtitle`, `format`, `sentence`
33
+ - Text-to-Audio: `elevenlabs`, `kokoro`
34
+ - Any-to-Any: `filter`, `trace`
35
+
36
+ ## Development Commands
37
+
38
+ The project uses STX (Simple Task eXecutor) for build automation. Main commands:
39
+
40
+ ### Core Project
41
+
42
+ ```bash
43
+ npm start lint # Static code analysis (TypeScript, ESLint, Biome, Oxlint)
44
+ npm start build # Compile TypeScript to JavaScript in dst/
45
+ npm start dev # Multi-pane development dashboard with linting, building, and server
46
+ npm start server # Run the main speechflow program
47
+ npm start clean # Remove generated files
48
+ ```
49
+
50
+ ## Project Structure
51
+
52
+ - `src/` - Main TypeScript source files
53
+ - `dst/` - Compiled JavaScript output
54
+ - `etc/` - Configuration files (TypeScript, ESLint, Biome, etc.)
55
+ - `package.d/` - NPM package patches
56
+
57
+ ## Development Notes
58
+
59
+ - Node.js 22+ required
60
+ - Uses object-mode streaming with timestamps for audio/text processing
61
+ - External services integration: Deepgram, ElevenLabs, DeepL, OpenAI, Ollama
62
+ - Supports local processing: FFmpeg, WAV, Voice Activity Detection, Gender Detection
63
+ - REST/WebSocket API on port 8484 for external control
64
+
65
+ ## Configuration
66
+
67
+ Main configuration in `etc/speechflow.yaml` with example
68
+ processing graphs. Environment variables used for API keys (e.g.,
69
+ `SPEECHFLOW_DEEPGRAM_KEY`, `SPEECHFLOW_ELEVENLABS_KEY`).
70
+
package/etc/speechflow.png CHANGED
Binary file
package/etc/speechflow.yaml CHANGED
@@ -68,8 +68,10 @@ studio-transcription: |
68
68
  subtitle(format: "vtt") |
69
69
  file(path: argv.2, mode: "w", type: "text"),
70
70
  subtitle(format: "srt") |
71
- file(path: argv.3, mode: "w", type: "text")
72
- elevenlabs(voice: "Mark", optimize: "quality", speed: 1.05, language: "en")
71
+ file(path: argv.3, mode: "w", type: "text"),
72
+ elevenlabs(voice: "Mark", optimize: "quality", speed: 1.05, language: "en") |
73
+ wav(mode: "encode") |
74
+ file(path: argv.4, mode: "w", type: "audio")
73
75
  }
74
76
  }
75
77
  }
@@ -79,31 +81,30 @@ studio-transcription: |
79
81
  studio-translation: |
80
82
  device(device: "coreaudio:Elgato Wave:3", mode: "r") | {
81
83
  gender() | {
82
- meter(interval: 250) |
84
+ meter(interval: 250, dashboard: "meter1") |
83
85
  wav(mode: "encode") |
84
86
  file(path: "program-de.wav", mode: "w", type: "audio"),
85
- deepgram(language: "de", key: env.SPEECHFLOW_DEEPGRAM_KEY) | {
87
+ deepgram(language: "de", key: env.SPEECHFLOW_DEEPGRAM_KEY, interim: true) | {
88
+ trace(name: "trace1", type: "text", dashboard: "text1")
86
89
  subtitle(format: "vtt", words: true) |
87
90
  file(path: "program-de.vtt", mode: "w", type: "text"),
88
91
  sentence() | {
89
- format(width: 80) |
90
- file(path: "program-de.txt", mode: "w", type: "text"),
92
+ trace(name: "trace2", type: "text", notify: true, dashboard: "text2") |
93
+ format(width: 80) |
94
+ file(path: "program-de.txt", mode: "w", type: "text"),
91
95
  deepl(src: "de", dst: "en", key: env.SPEECHFLOW_DEEPL_KEY) | {
92
- trace(name: "text", type: "text") | {
96
+ trace(name: "trace3", type: "text", dashboard: "text3") | {
93
97
  format(width: 80) |
94
98
  file(path: "program-en.txt", mode: "w", type: "text"),
95
99
  subtitle(format: "vtt", words: false) |
96
100
  file(path: "program-en.vtt", mode: "w", type: "text"),
97
- mqtt(url: "mqtt://10.1.0.10:1883",
98
- username: env.SPEECHFLOW_MQTT_USER,
99
- password: env.SPEECHFLOW_MQTT_PASS,
100
- topicWrite: "stream/studio/sender"),
101
101
  {
102
102
  filter(name: "S2T-male", type: "text", var: "meta:gender", op: "==", val: "male") |
103
103
  elevenlabs(voice: "Mark", optimize: "latency", speed: 1.05, language: "en"),
104
104
  filter(name: "S2T-female", type: "text", var: "meta:gender", op: "==", val: "female") |
105
105
  elevenlabs(voice: "Brittney", optimize: "latency", speed: 1.05, language: "en")
106
106
  } | {
107
+ meter(interval: 250, dashboard: "meter2"),
107
108
  wav(mode: "encode") |
108
109
  file(path: "program-en.wav", mode: "w", type: "audio"),
109
110
  device(device: "coreaudio:USBAudio2.0", mode: "w")
@@ -115,3 +116,20 @@ studio-translation: |
115
116
  }
116
117
  }
117
118
 
119
+ # Test-drive for development
120
+ test: |
121
+ device(device: "coreaudio:Elgato Wave:3", mode: "r") |
122
+ meter(interval: 50, dashboard: "meter1") |
123
+ deepgram(language: "de", model: "nova-2", key: env.SPEECHFLOW_DEEPGRAM_KEY, interim: true) |
124
+ trace(type: "text", dashboard: "text1") | {
125
+ subtitle(mode: "render", addr: "127.0.0.1", port: 8585),
126
+ filter(name: "final", type: "text", var: "kind", op: "==", val: "final") |
127
+ sentence() |
128
+ trace(type: "text", dashboard: "text2") |
129
+ deepl(src: "de", dst: "en", key: env.SPEECHFLOW_DEEPL_KEY) |
130
+ trace(type: "text", dashboard: "text3") |
131
+ elevenlabs(voice: "Mark", optimize: "latency", speed: 1.05, language: "en") |
132
+ meter(interval: 50, dashboard: "meter2") |
133
+ device(device: "coreaudio:USBAudio2.0", mode: "w")
134
+ }
135
+
package/etc/stx.conf CHANGED
@@ -17,6 +17,13 @@ upd
17
17
  (cd speechflow-ui-db && npx -y upd) && \
18
18
  (cd speechflow-ui-st && npx -y upd)
19
19
 
20
+ # [top-level] provide statistics about code base
21
+ cloc
22
+ cloc etc \
23
+ speechflow-cli/etc speechflow-cli/src \
24
+ speechflow-ui-db/etc speechflow-ui-db/src \
25
+ speechflow-ui-st/etc speechflow-ui-st/src
26
+
20
27
  # [top-level] lint components for development
21
28
  lint
22
29
  npm --prefix speechflow-cli start lint && \
package/package.json CHANGED
@@ -1,10 +1,11 @@
1
1
  {
2
2
  "name": "speechflow",
3
- "version": "1.4.4",
4
- "x-stdver": "1.4.4-GA",
5
- "x-release": "2025-08-07",
3
+ "version": "1.5.0",
4
+ "x-stdver": "1.5.0-GA",
5
+ "x-release": "2025-08-31",
6
6
  "homepage": "https://github.com/rse/speechflow",
7
7
  "description": "Speech Processing Flow Graph",
8
+ "keywords": [ "speech", "audio", "flow", "graph" ],
8
9
  "license": "GPL-3.0-only",
9
10
  "author": {
10
11
  "name": "Dr. Ralf S. Engelschall",
@@ -16,17 +17,17 @@
16
17
  "url": "git+https://github.com/rse/speechflow.git"
17
18
  },
18
19
  "dependencies": {
19
- "@rse/stx": "1.0.7"
20
+ "@rse/stx": "1.0.9"
20
21
  },
21
22
  "devDependencies": {
22
23
  "nodemon": "3.1.10",
23
24
  "watch": "1.0.2",
24
- "concurrently": "9.2.0",
25
+ "concurrently": "9.2.1",
25
26
  "wait-on": "8.0.4",
26
27
  "cross-env": "10.0.0",
27
28
  "shx": "0.4.0"
28
29
  },
29
- "engines" : {
30
+ "engines": {
30
31
  "npm": ">=10.0.0",
31
32
  "node": ">=22.0.0"
32
33
  },