speechflow 1.3.0 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +15 -0
- package/README.md +165 -22
- package/dst/speechflow-node-a2a-gender.d.ts +2 -0
- package/dst/speechflow-node-a2a-gender.js +137 -59
- package/dst/speechflow-node-a2a-gender.js.map +1 -1
- package/dst/speechflow-node-a2a-meter.d.ts +3 -1
- package/dst/speechflow-node-a2a-meter.js +79 -35
- package/dst/speechflow-node-a2a-meter.js.map +1 -1
- package/dst/speechflow-node-a2a-mute.d.ts +1 -0
- package/dst/speechflow-node-a2a-mute.js +37 -11
- package/dst/speechflow-node-a2a-mute.js.map +1 -1
- package/dst/speechflow-node-a2a-vad.d.ts +3 -0
- package/dst/speechflow-node-a2a-vad.js +194 -96
- package/dst/speechflow-node-a2a-vad.js.map +1 -1
- package/dst/speechflow-node-a2a-wav.js +27 -11
- package/dst/speechflow-node-a2a-wav.js.map +1 -1
- package/dst/speechflow-node-a2t-deepgram.d.ts +4 -0
- package/dst/speechflow-node-a2t-deepgram.js +141 -43
- package/dst/speechflow-node-a2t-deepgram.js.map +1 -1
- package/dst/speechflow-node-t2a-elevenlabs.d.ts +2 -0
- package/dst/speechflow-node-t2a-elevenlabs.js +61 -12
- package/dst/speechflow-node-t2a-elevenlabs.js.map +1 -1
- package/dst/speechflow-node-t2a-kokoro.d.ts +1 -0
- package/dst/speechflow-node-t2a-kokoro.js +10 -4
- package/dst/speechflow-node-t2a-kokoro.js.map +1 -1
- package/dst/speechflow-node-t2t-deepl.js +8 -4
- package/dst/speechflow-node-t2t-deepl.js.map +1 -1
- package/dst/speechflow-node-t2t-format.js +2 -2
- package/dst/speechflow-node-t2t-format.js.map +1 -1
- package/dst/speechflow-node-t2t-ollama.js +1 -1
- package/dst/speechflow-node-t2t-ollama.js.map +1 -1
- package/dst/speechflow-node-t2t-openai.js +1 -1
- package/dst/speechflow-node-t2t-openai.js.map +1 -1
- package/dst/speechflow-node-t2t-sentence.d.ts +1 -1
- package/dst/speechflow-node-t2t-sentence.js +35 -24
- package/dst/speechflow-node-t2t-sentence.js.map +1 -1
- package/dst/speechflow-node-t2t-subtitle.js +85 -17
- package/dst/speechflow-node-t2t-subtitle.js.map +1 -1
- package/dst/speechflow-node-t2t-transformers.js +2 -2
- package/dst/speechflow-node-t2t-transformers.js.map +1 -1
- package/dst/speechflow-node-x2x-filter.js +4 -4
- package/dst/speechflow-node-x2x-trace.js +1 -1
- package/dst/speechflow-node-x2x-trace.js.map +1 -1
- package/dst/speechflow-node-xio-device.js +12 -8
- package/dst/speechflow-node-xio-device.js.map +1 -1
- package/dst/speechflow-node-xio-file.js +9 -3
- package/dst/speechflow-node-xio-file.js.map +1 -1
- package/dst/speechflow-node-xio-mqtt.js +5 -2
- package/dst/speechflow-node-xio-mqtt.js.map +1 -1
- package/dst/speechflow-node-xio-websocket.js +11 -11
- package/dst/speechflow-node-xio-websocket.js.map +1 -1
- package/dst/speechflow-utils.d.ts +5 -0
- package/dst/speechflow-utils.js +77 -44
- package/dst/speechflow-utils.js.map +1 -1
- package/dst/speechflow.js +104 -34
- package/dst/speechflow.js.map +1 -1
- package/etc/eslint.mjs +1 -2
- package/etc/speechflow.yaml +18 -7
- package/etc/stx.conf +3 -3
- package/package.json +14 -13
- package/src/speechflow-node-a2a-gender.ts +148 -64
- package/src/speechflow-node-a2a-meter.ts +87 -40
- package/src/speechflow-node-a2a-mute.ts +39 -11
- package/src/speechflow-node-a2a-vad.ts +206 -100
- package/src/speechflow-node-a2a-wav.ts +27 -11
- package/src/speechflow-node-a2t-deepgram.ts +148 -45
- package/src/speechflow-node-t2a-elevenlabs.ts +65 -12
- package/src/speechflow-node-t2a-kokoro.ts +11 -4
- package/src/speechflow-node-t2t-deepl.ts +9 -4
- package/src/speechflow-node-t2t-format.ts +2 -2
- package/src/speechflow-node-t2t-ollama.ts +1 -1
- package/src/speechflow-node-t2t-openai.ts +1 -1
- package/src/speechflow-node-t2t-sentence.ts +38 -27
- package/src/speechflow-node-t2t-subtitle.ts +62 -15
- package/src/speechflow-node-t2t-transformers.ts +4 -3
- package/src/speechflow-node-x2x-filter.ts +4 -4
- package/src/speechflow-node-x2x-trace.ts +1 -1
- package/src/speechflow-node-xio-device.ts +12 -8
- package/src/speechflow-node-xio-file.ts +9 -3
- package/src/speechflow-node-xio-mqtt.ts +5 -2
- package/src/speechflow-node-xio-websocket.ts +12 -12
- package/src/speechflow-utils.ts +78 -44
- package/src/speechflow.ts +117 -36
package/CHANGELOG.md
CHANGED

@@ -2,6 +2,21 @@
 ChangeLog
 =========
 
+1.3.2 (2025-08-04)
+------------------
+
+- BUGFIX: many timeout handling fixes in many nodes
+- CLEANUP: many code cleanups
+
+1.3.1 (2025-07-31)
+------------------
+
+- BUGFIX: wait a longer time for "deepgram" node to open
+- IMPROVEMENT: keep word information as meta information in "deepgram" node
+- IMPROVEMENT: support words in subtitle generation in "subtitle" node
+- BUGFIX: fix WebVTT format generation in "subtitle" node
+- UPGRADE: upgrade NPM dependencies
+
 1.3.0 (2025-07-26)
 ------------------
 
package/README.md
CHANGED

@@ -56,14 +56,14 @@ ships as an installable package for the Node Package Manager (NPM).
 Installation
 ------------
 
-```
+```sh
 $ npm install -g speechflow
 ```
 
 Usage
 -----
 
-```
+```sh
 $ speechflow
     [-h|--help]
     [-V|--version]
@@ -251,12 +251,19 @@ First a short overview of the available processing nodes:
 **filter**,
 **trace**.
 
-### Input/Output Nodes
+### Input/Output Nodes
+
+The following nodes are for external I/O, i.e., to read/write from
+external files, devices and network services.
 
 - Node: **file**<br/>
 Purpose: **File and StdIO source/sink**<br/>
 Example: `file(path: "capture.pcm", mode: "w", type: "audio")`
 
+> This node allows reading/writing from/to files or StdIO. It is
+> intended to be used as a source and sink node in batch processing,
+> and as a sink node in real-time processing.
+
 | Port    | Payload     |
 | ------- | ----------- |
 | input   | text, audio |
@@ -274,6 +281,10 @@ First a short overview of the available processing nodes:
 Purpose: **Microphone/speaker device source/sink**<br/>
 Example: `device(device: "wasapi:VoiceMeeter Out B1", mode: "r")`
 
+> This node allows reading/writing from/to audio devices. It is
+> intended to be used as a source node for microphone devices and as
+> a sink node for speaker devices.
+
 | Port    | Payload     |
 | ------- | ----------- |
 | input   | audio       |
@@ -290,6 +301,11 @@ First a short overview of the available processing nodes:
 Example: `websocket(connect: "ws://127.0.0.1:12345", type: "text")`
 Notice: this node requires a peer WebSocket service!
 
+> This node allows reading/writing from/to WebSocket network services.
+> It is primarily intended to be used for sending out the text of
+> subtitles, but it can also be used for receiving the text to be
+> processed.
+
 | Port    | Payload     |
 | ------- | ----------- |
 | input   | text, audio |
@@ -306,6 +322,10 @@ First a short overview of the available processing nodes:
 Example: `mqtt(url: "mqtt://127.0.0.1:1883", username: "foo", password: "bar", topic: "quux")`
 Notice: this node requires a peer MQTT broker!
 
+> This node allows reading/writing from/to MQTT broker topics. It is
+> primarily intended to be used for sending out the text of subtitles,
+> but it can also be used for receiving the text to be processed.
+
 | Port    | Payload     |
 | ------- | ----------- |
 | input   | text        |
@@ -313,17 +333,23 @@ First a short overview of the available processing nodes:
 
 | Parameter    | Position  | Default  | Requirement           |
 | ------------ | --------- | -------- | --------------------- |
-| **url**      | 0         | *none*   | `/^(?:\|(?:ws
+| **url**      | 0         | *none*   | `/^(?:\|(?:ws\|mqtt):\/\/(.+?):(\d+))$/` |
 | **username** | 1         | *none*   | `/^.+$/` |
 | **password** | 2         | *none*   | `/^.+$/` |
 | **topic**    | 3         | *none*   | `/^.+$/` |
 
-### Audio-to-Audio Nodes
+### Audio-to-Audio Nodes
+
+The following nodes process audio chunks only.
 
 - Node: **ffmpeg**<br/>
 Purpose: **FFmpeg audio format conversion**<br/>
 Example: `ffmpeg(src: "pcm", dst: "mp3")`
 
+> This node allows converting between audio formats. It is primarily
+> intended to support the reading/writing of external MP3 and Opus
+> format files, although SpeechFlow internally uses PCM format only.
+
 | Port    | Payload     |
 | ------- | ----------- |
 | input   | audio       |
@@ -338,6 +364,10 @@ First a short overview of the available processing nodes:
 Purpose: **WAV audio format conversion**<br/>
 Example: `wav(mode: "encode")`
 
+> This node allows converting between PCM and WAV audio formats. It is
+> primarily intended to support the reading/writing of external WAV
+> format files, although SpeechFlow internally uses PCM format only.
+
 | Port    | Payload     |
 | ------- | ----------- |
 | input   | audio       |
@@ -352,6 +382,9 @@ First a short overview of the available processing nodes:
 Example: `mute()`
 Notice: this node has to be externally controlled via REST/WebSockets!
 
+> This node allows muting the audio stream by either silencing or even
+> unplugging it. It has to be externally controlled via REST/WebSocket (see below).
+
 | Port    | Payload     |
 | ------- | ----------- |
 | input   | audio       |
@@ -364,6 +397,10 @@ First a short overview of the available processing nodes:
 Purpose: **Loudness metering node**<br/>
 Example: `meter(250)`
 
+> This node allows measuring the loudness of the audio stream. The
+> results are emitted to both the logfile of **SpeechFlow** and the
+> WebSockets API (see below).
+
 | Port    | Payload     |
 | ------- | ----------- |
 | input   | audio       |
@@ -377,6 +414,10 @@ First a short overview of the available processing nodes:
 Purpose: **Voice Audio Detection (VAD) node**<br/>
 Example: `vad()`
 
+> This node performs Voice Audio Detection (VAD), i.e., it detects
+> voice in the audio stream and, if no voice is detected, either
+> silences or unplugs the audio stream.
+
 | Port    | Payload     |
 | ------- | ----------- |
 | input   | audio       |
@@ -384,7 +425,7 @@ First a short overview of the available processing nodes:
 
 | Parameter   | Position  | Default     | Requirement              |
 | ----------- | --------- | ----------- | ------------------------ |
-| **mode**    | *none*    | "unplugged" | `/^(?:silenced
+| **mode**    | *none*    | "unplugged" | `/^(?:silenced\|unplugged)$/` |
 | **posSpeechThreshold** | *none* | 0.50 | *none* |
 | **negSpeechThreshold** | *none* | 0.35 | *none* |
 | **minSpeechFrames**    | *none* | 2    | *none* |
@@ -396,6 +437,10 @@ First a short overview of the available processing nodes:
 Purpose: **Gender Detection node**<br/>
 Example: `gender()`
 
+> This node performs gender detection on the audio stream. It
+> annotates the audio chunks with `gender=male` or `gender=female`
+> meta information. Use this meta information with the "filter" node.
+
 | Port    | Payload     |
 | ------- | ----------- |
 | input   | audio       |
@@ -405,13 +450,19 @@ First a short overview of the available processing nodes:
 | ----------- | --------- | -------- | ------------------------ |
 | **window**  | 0         | 500      | *none* |
 
-### Audio-to-Text Nodes
+### Audio-to-Text Nodes
+
+The following nodes convert audio to text chunks.
 
 - Node: **deepgram**<br/>
 Purpose: **Deepgram Speech-to-Text conversion**<br/>
 Example: `deepgram(language: "de")`<br/>
 Notice: this node requires an API key!
 
+> This node performs Speech-to-Text (S2T) conversion, i.e., it
+> recognizes speech in the input audio stream and outputs a
+> corresponding text stream.
+
 | Port    | Payload     |
 | ------- | ----------- |
 | input   | audio       |
@@ -425,13 +476,17 @@ First a short overview of the available processing nodes:
 | **version**  | 1 | "latest" | *none* |
 | **language** | 2 | "multi"  | *none* |
 
-### Text-to-Text Nodes
+### Text-to-Text Nodes
+
+The following nodes process text chunks only.
 
 - Node: **deepl**<br/>
 Purpose: **DeepL Text-to-Text translation**<br/>
 Example: `deepl(src: "de", dst: "en")`<br/>
 Notice: this node requires an API key!
 
+> This node performs translation between the English and German languages.
+
 | Port    | Payload     |
 | ------- | ----------- |
 | input   | text        |
@@ -448,6 +503,12 @@ First a short overview of the available processing nodes:
 Example: `openai(src: "de", dst: "en")`<br/>
 Notice: this node requires an OpenAI API key!
 
+> This node performs translation between the English and German
+> languages in the text stream or (if the source and destination
+> languages are the same) spellchecking of English or German text in
+> the text stream. It is based on the remote OpenAI cloud AI service
+> and uses the GPT-4o-mini LLM.
+
 | Port    | Payload     |
 | ------- | ----------- |
 | input   | text        |
@@ -464,7 +525,13 @@ First a short overview of the available processing nodes:
 - Node: **ollama**<br/>
 Purpose: **Ollama/Gemma Text-to-Text translation and spelling correction**<br/>
 Example: `ollama(src: "de", dst: "en")`<br/>
-Notice: this node requires
+Notice: this node requires Ollama to be installed!
+
+> This node performs translation between the English and German
+> languages in the text stream or (if the source and destination
+> languages are the same) spellchecking of English or German text in
+> the text stream. It is based on the local Ollama AI service and uses
+> the Google Gemma 3 LLM.
 
 | Port    | Payload     |
 | ------- | ----------- |
@@ -482,6 +549,9 @@ First a short overview of the available processing nodes:
 Purpose: **Transformers Text-to-Text translation**<br/>
 Example: `transformers(src: "de", dst: "en")`<br/>
 
+> This node performs translation between the English and German
+> languages in the text stream. It is based on local OPUS or SmolLM3 LLMs.
+
 | Port    | Payload     |
 | ------- | ----------- |
 | input   | text        |
@@ -489,7 +559,7 @@ First a short overview of the available processing nodes:
 
 | Parameter    | Position  | Default  | Requirement      |
 | ------------ | --------- | -------- | ---------------- |
-| **model**    | *none*    | "OPUS"   | `/^(?:OPUS
+| **model**    | *none*    | "OPUS"   | `/^(?:OPUS\|SmolLM3)$/` |
 | **src**      | 0         | "de"     | `/^(?:de\|en)$/` |
 | **dst**      | 1         | "en"     | `/^(?:de\|en)$/` |
 
@@ -497,6 +567,11 @@ First a short overview of the available processing nodes:
 Purpose: **sentence splitting/merging**<br/>
 Example: `sentence()`<br/>
 
+> This node allows you to ensure that a text stream is split or merged
+> into complete sentences. It is primarily intended to be used after
+> the "deepgram" node and before the "deepl" or "elevenlabs" nodes in
+> order to improve overall quality.
+
 | Port    | Payload     |
 | ------- | ----------- |
 | input   | text        |
@@ -509,6 +584,9 @@ First a short overview of the available processing nodes:
 Purpose: **SRT/VTT Subtitle Generation**<br/>
 Example: `subtitle(format: "srt")`<br/>
 
+> This node generates subtitles from the text stream (and its embedded
+> timestamps) in the formats SRT (SubRip) or VTT (WebVTT).
+
 | Port    | Payload     |
 | ------- | ----------- |
 | input   | text        |
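To make the two subtitle formats concrete, a hypothetical cue (illustrative content, not taken from the package) would look as follows in SRT:

```
1
00:00:01,000 --> 00:00:03,500
Hello, world.
```

and in WebVTT, which uses `.` instead of `,` in timestamps and requires a `WEBVTT` header:

```
WEBVTT

00:00:01.000 --> 00:00:03.500
Hello, world.
```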
@@ -517,11 +595,16 @@ First a short overview of the available processing nodes:
 | Parameter    | Position  | Default  | Requirement        |
 | ------------ | --------- | -------- | ------------------ |
 | **format**   | *none*    | "srt"    | /^(?:srt\|vtt)$/   |
+| **words**    | *none*    | false    | *none*             |
 
 - Node: **format**<br/>
 Purpose: **text paragraph formatting**<br/>
 Example: `format(width: 80)`<br/>
 
+> This node formats the text stream into lines no longer than a
+> certain width. It is primarily intended for use before writing text
+> chunks to files.
+
 | Port    | Payload     |
 | ------- | ----------- |
 | input   | text        |
@@ -531,29 +614,43 @@ First a short overview of the available processing nodes:
 | ------------ | --------- | -------- | --------------------- |
 | **width**    | 0         | 80       | *none* |
 
-### Text-to-Audio Nodes
+### Text-to-Audio Nodes
+
+The following nodes convert text chunks to audio chunks.
 
 - Node: **elevenlabs**<br/>
 Purpose: **ElevenLabs Text-to-Speech conversion**<br/>
 Example: `elevenlabs(language: "en")`<br/>
-Notice: this node requires an API key!
+Notice: this node requires an ElevenLabs API key!
+
+> This node performs Text-to-Speech (T2S) conversion, i.e., it converts
+> the input text stream into an output audio stream. It is intended to
+> generate speech.
 
 | Port    | Payload     |
 | ------- | ----------- |
 | input   | text        |
 | output  | audio       |
 
-| Parameter
-|
-| **key**
-| **voice**
-| **language**
+| Parameter      | Position  | Default   | Requirement        |
+| -------------- | --------- | --------- | ------------------ |
+| **key**        | *none*    | env.SPEECHFLOW\_ELEVENLABS\_KEY | *none* |
+| **voice**      | 0         | "Brian"   | `/^(?:Brittney\|Cassidy\|Leonie\|Mark\|Brian)$/` |
+| **language**   | 1         | "de"      | `/^(?:de\|en)$/` |
+| **speed**      | 2         | 1.00      | `n >= 0.7 && n <= 1.2` |
+| **stability**  | 3         | 0.5       | `n >= 0.0 && n <= 1.0` |
+| **similarity** | 4         | 0.75      | `n >= 0.0 && n <= 1.0` |
+| **optimize**   | 5         | "latency" | `/^(?:latency\|quality)$/` |
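Putting the parameters above together, a pipeline fragment (hypothetical values, kept within the documented ranges) might configure the node as:

```
elevenlabs(voice: "Brian", language: "en", speed: 1.0,
           stability: 0.5, similarity: 0.75, optimize: "quality")
```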
 
 - Node: **kokoro**<br/>
 Purpose: **Kokoro Text-to-Speech conversion**<br/>
 Example: `kokoro(language: "en")`<br/>
 Notice: this node currently supports the English language only!
 
+> This node performs Text-to-Speech (T2S) conversion, i.e., it converts
+> the input text stream into an output audio stream. It is intended to
+> generate speech.
+
 | Port    | Payload     |
 | ------- | ----------- |
 | input   | text        |
@@ -561,16 +658,23 @@ First a short overview of the available processing nodes:
 
 | Parameter    | Position  | Default  | Requirement |
 | ------------ | --------- | -------- | ----------- |
-| **voice**    | 0         | "Aoede"  | `/^(?:Aoede
+| **voice**    | 0         | "Aoede"  | `/^(?:Aoede\|Heart\|Puck\|Fenrir)$/` |
 | **language** | 1         | "en"     | `/^en$/` |
 | **speed**    | 2         | 1.25     | 1.0...1.30 |
 
-### Any-to-Any Nodes
+### Any-to-Any Nodes
+
+The following nodes process any type of chunk, i.e., both audio and text chunks.
 
 - Node: **filter**<br/>
 Purpose: **meta information based filter**<br/>
 Example: `filter(type: "audio", var: "meta:gender", op: "==", val: "male")`<br/>
 
+> This node allows you to filter chunks based on certain criteria. It
+> is primarily intended to be used in conjunction with the "gender"
+> node and in front of the `elevenlabs` or `kokoro` nodes in order to
+> translate with a corresponding voice.
+
 | Port    | Payload     |
 | ------- | ----------- |
 | input   | text, audio |
@@ -580,14 +684,18 @@ First a short overview of the available processing nodes:
 | ------------ | --------- | -------- | --------------------- |
 | **type**     | 0         | "audio"  | `/^(?:audio\|text)$/` |
 | **name**     | 1         | "filter" | `/^.+$/` |
-| **var**      | 2         | ""       | `/^(?:meta
-| **op**       | 3         | "=="     | `/^(
+| **var**      | 2         | ""       | `/^(?:meta:.+\|payload:(?:length\|text)\|time:(?:start\|end))$/` |
+| **op**       | 3         | "=="     | `/^(?:<\|<=\|==\|!=\|~~\|!~\|>=\|>)$/` |
 | **val**      | 4         | ""       | `/^.*$/` |
 
 - Node: **trace**<br/>
 Purpose: **data flow tracing**<br/>
 Example: `trace(type: "audio")`<br/>
 
+> This node allows you to trace the audio and text chunk flow through
+> the **SpeechFlow** graph. It just passes through its chunks, but
+> sends information about the chunks to the log.
+
 | Port    | Payload     |
 | ------- | ----------- |
 | input   | text, audio |
@@ -598,10 +706,45 @@ First a short overview of the available processing nodes:
 | **type**     | 0         | "audio"  | `/^(?:audio\|text)$/` |
 | **name**     | 1         | *none*   | *none* |
 
+REST/WebSocket API
+------------------
+
+**SpeechFlow** has an externally exposed REST/WebSockets API which can
+be used to control the nodes and to receive information from nodes.
+For controlling a node you have three possibilities (illustrated by
+controlling the mode of the "mute" node):
+
+```sh
+# use HTTP/REST/GET:
+$ curl http://127.0.0.1:8484/api/COMMAND/mute/mode/silenced
+```
+
+```sh
+# use HTTP/REST/POST:
+$ curl -H "Content-type: application/json" \
+    --data '{ "request": "COMMAND", "node": "mute", "args": [ "mode", "silenced" ] }' \
+    http://127.0.0.1:8484/api
+```
+
+```sh
+# use WebSockets:
+$ wscat -c ws://127.0.0.1:8484/api \
+> { "request": "COMMAND", "node": "mute", "args": [ "mode", "silenced" ] }
+```
+
+For receiving emitted information from nodes, you have to use the
+WebSockets API (illustrated by the emitted information of the "meter"
+node):
+
+```sh
+# use WebSockets:
+$ wscat -c ws://127.0.0.1:8484/api \
+< { "response": "NOTIFY", "node": "meter", "args": [ "meter", "LUFS-S", -35.75127410888672 ] }
+```
+
|
|
602
745
|
-------
|
|
603
746
|
|
|
604
|
-
**
|
|
747
|
+
**SpeechFlow**, as a technical cut-through, was initially created in
|
|
605
748
|
March 2024 for use in the msg Filmstudio context. It was later refined
|
|
606
749
|
into a more complete toolkit in April 2025 and this way the first time
|
|
607
750
|
could be used in production. It was fully refactored in July 2025 in
|