speechflow 0.9.7 → 0.9.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -14,9 +14,13 @@ SpeechFlow
  About
  -----

- **SpeechFlow** is a command-line interface based tool for establishing a
- directed data flow graph of audio and text processing nodes. This way,
- it allows to perform various speech processing tasks in a flexible way.
+ **SpeechFlow** is a command-line interface based tool for establishing
+ a directed data flow graph of audio and text processing nodes. This
+ way, it allows one to perform various speech processing tasks in a very
+ flexible and configurable way. The typical supported tasks are capturing
+ audio, generating narrations of text (aka text-to-speech), generating
+ transcriptions or subtitles for audio (aka speech-to-text), and generating
+ translations for audio (aka speech-to-speech).

  **SpeechFlow** comes with built-in graph nodes for
  local file I/O,
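The graph expressions in the examples further below chain nodes with `|`, and the compiled node code later in this diff shows each node exposing a Node.js Duplex stream, so an edge in such a graph behaves essentially like a stream pipe. A minimal sketch of that idea using plain Node.js streams only (the names are purely illustrative and none of SpeechFlow's actual node classes are involved):

```
import { Transform, pipeline } from "node:stream"

/* illustrative "node": upper-cases incoming text chunks */
const upperCase = new Transform({
    transform(chunk, _encoding, callback) {
        callback(null, chunk.toString().toUpperCase())
    }
})

/* wire stdin -> upperCase -> stdout, roughly what a graph "a | b | c" expresses */
pipeline(process.stdin, upperCase, process.stdout, (err) => {
    if (err)
        console.error("pipeline failed:", err)
})
```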
@@ -26,8 +30,8 @@ remote MQTT network I/O,
  cloud-based [Deepgram](https://deepgram.com) speech-to-text conversion,
  cloud-based [ElevenLabs](https://elevenlabs.io/) text-to-speech conversion,
  cloud-based [DeepL](https://deepl.com) text-to-text translation,
- local [Gemma/Ollama](https://ollama.com/library/gemma3) text-to-text translation,
- local [Gemma/Ollama](https://ollama.com/library/gemma3) text-to-text spelling correction,
+ cloud-based [OpenAI/GPT](https://openai.com) text-to-text translation (or spelling correction),
+ local [Ollama/Gemma](https://ollama.com) text-to-text translation (or spelling correction),
  local [OPUS/ONNX](https://github.com/Helsinki-NLP/Opus-MT) text-to-text translation,
  local [FFmpeg](https://ffmpeg.org/) speech-to-speech encoding,
  local WAV speech-to-speech encoding,
@@ -67,8 +71,7 @@ Processing Graph Examples
  -------------------------

  The following are examples of **SpeechFlow** processing graphs.
- They can also be found in the [sample.yaml](./sample.yaml) file
- for easy consumption with `speechflow -c <id>@sample.yaml>`.
+ They can also be found in the sample [speechflow.yaml](./etc/speechflow.yaml) file.

  - **Capturing**: Capture audio from microphone device into WAV audio file:

@@ -89,7 +92,7 @@ for easy consumption with `speechflow -c <id>@sample.yaml>`.
  }
  ```

- - **Narration**: Generate text file with German narration of MP3 audio file:
+ - **Transcription**: Generate text file with German transcription of MP3 audio file:

  ```
  file(path: argv.0, mode: "r", type: "audio") |
@@ -109,6 +112,15 @@ for easy consumption with `speechflow -c <id>@sample.yaml>`.
  file(path: argv.1, mode: "w", type: "text")
  ```

+ - **Speaking**: Generate audio file with English voice for a text file:
+
+ ```
+ file(path: argv.0, mode: "r", type: "text") |
+ kokoro(language: "en") |
+ wav(mode: "encode") |
+ file(path: argv.1, mode: "w", type: "audio")
+ ```
+
  - **Ad-Hoc Translation**: Ad-Hoc text translation from German to English
  via stdin/stdout:
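The Speaking example above pipes the synthesized audio through `wav(mode: "encode")` before writing it to a file. SpeechFlow's actual wav node is not part of this diff; the following is only a rough sketch of the kind of float-to-PCM conversion such an encode step performs, using the `wavefile` dependency that appears in the compiled code further below (the mono channel and 24 kHz sample rate are assumptions, not the node's real defaults):

```
import * as wavefile from "wavefile"

/* sketch: wrap Float32 samples (as a TTS engine emits them) into a 16-bit PCM WAV buffer */
function float32ToWav(samples: Float32Array, sampleRate = 24000): Buffer {
    const wav = new wavefile.WaveFile()
    wav.fromScratch(1, sampleRate, "32f", samples)  /* 1 channel, 32-bit float input  */
    wav.toBitDepth("16")                            /* convert to 16-bit integer PCM  */
    return Buffer.from(wav.toBuffer())              /* complete RIFF/WAV file image   */
}
```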
 
@@ -167,8 +179,9 @@ First a short overview of the available processing nodes:
  **deepgram**.
  - Text-to-Text nodes:
  **deepl**,
- **gemma**,
- **opus**,
+ **openai**,
+ **ollama**,
+ **transformers**,
  **subtitle**,
  **format**.
  - Text-to-Audio nodes:
@@ -306,10 +319,10 @@ First a short overview of the available processing nodes:
  | **src** | 0 | "de" | `/^(?:de\|en)$/` |
  | **dst** | 1 | "en" | `/^(?:de\|en)$/` |

- - Node: **gemma**<br/>
- Purpose: **Google Gemma Text-to-Text translation and spelling correction**<br/>
- Example: `gemma(src: "de", dst: "en")`<br/>
- Notice; this node requires the Ollama API!
+ - Node: **openai**<br/>
+ Purpose: **OpenAI/GPT Text-to-Text translation and spelling correction**<br/>
+ Example: `openai(src: "de", dst: "en")`<br/>
+ Notice: this node requires an OpenAI API key!

  | Port | Payload |
  | ------- | ----------- |
@@ -318,13 +331,32 @@ First a short overview of the available processing nodes:

  | Parameter | Position | Default | Requirement |
  | ------------ | --------- | -------- | ------------------ |
- | **url** | *none* | "http://127.0.0.1:11434" | `/^https?:\/\/.+?:\d+$/` |
+ | **api** | *none* | "https://api.openai.com" | `/^https?:\/\/.+?:\d+$/` |
  | **src** | 0 | "de" | `/^(?:de\|en)$/` |
  | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
+ | **key** | *none* | env.SPEECHFLOW\_KEY\_OPENAI | *none* |
+ | **model** | *none* | "gpt-4o-mini" | *none* |

- - Node: **opus**<br/>
- Purpose: **OPUS Text-to-Text translation**<br/>
- Example: `deepl(src: "de", dst: "en")`<br/>
+ - Node: **ollama**<br/>
+ Purpose: **Ollama/Gemma Text-to-Text translation and spelling correction**<br/>
+ Example: `ollama(src: "de", dst: "en")`<br/>
+ Notice: this node requires the Ollama API!
+
+ | Port | Payload |
+ | ------- | ----------- |
+ | input | text |
+ | output | text |
+
+ | Parameter | Position | Default | Requirement |
+ | ------------ | --------- | -------- | ------------------ |
+ | **api** | *none* | "http://127.0.0.1:11434" | `/^https?:\/\/.+?:\d+$/` |
+ | **model** | *none* | "gemma3:4b-it-q4_K_M" | *none* |
+ | **src** | 0 | "de" | `/^(?:de\|en)$/` |
+ | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
+
+ - Node: **transformers**<br/>
+ Purpose: **Transformers Text-to-Text translation**<br/>
+ Example: `transformers(src: "de", dst: "en")`<br/>

  | Port | Payload |
  | ------- | ----------- |
@@ -333,6 +365,7 @@ First a short overview of the available processing nodes:

  | Parameter | Position | Default | Requirement |
  | ------------ | --------- | -------- | ---------------- |
+ | **model** | *none* | "OPUS" | `/^(?:OPUS|SmolLM3)$/` |
  | **src** | 0 | "de" | `/^(?:de\|en)$/` |
  | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
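The openai and ollama nodes documented above are configured purely through an API endpoint, a model name, and source/target languages. As a rough illustration of what the ollama defaults (`api: "http://127.0.0.1:11434"`, `model: "gemma3:4b-it-q4_K_M"`) imply, a translation request against the public Ollama `/api/generate` endpoint could look roughly like this; the prompt wording is an assumption, not SpeechFlow's actual implementation:

```
async function translateViaOllama(text: string, src = "de", dst = "en"): Promise<string> {
    /* call the standard Ollama REST API on its default port */
    const response = await fetch("http://127.0.0.1:11434/api/generate", {
        method:  "POST",
        headers: { "Content-Type": "application/json" },
        body:    JSON.stringify({
            model:  "gemma3:4b-it-q4_K_M",
            prompt: `Translate the following text from ${src} to ${dst}:\n\n${text}`,
            stream: false
        })
    })
    const result = await response.json() as { response: string }
    return result.response.trim()
}
```

The openai node presumably issues the analogous request against `https://api.openai.com`, authenticating with the key taken from `env.SPEECHFLOW_KEY_OPENAI`, as its parameter table indicates.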
 
@@ -380,6 +413,22 @@ First a short overview of the available processing nodes:
  | **voice** | 0 | "Brian" | *none* |
  | **language** | 1 | "de" | *none* |

+ - Node: **kokoro**<br/>
+ Purpose: **Kokoro Text-to-Speech conversion**<br/>
+ Example: `kokoro(language: "en")`<br/>
+ Notice: this currently supports the English language only!
+
+ | Port | Payload |
+ | ------- | ----------- |
+ | input | text |
+ | output | audio |
+
+ | Parameter | Position | Default | Requirement |
+ | ------------ | --------- | -------- | ----------- |
+ | **voice** | 0 | "Aoede" | `/^(?:Aoede|Heart|Puck|Fenrir)$/` |
+ | **language** | 1 | "en" | `/^en$/` |
+ | **speed** | 2 | 1.25 | 1.0...1.30 |
+
  ### Any-to-Any Nodes:

  - Node: **trace**<br/>
@@ -0,0 +1,16 @@
+ import SpeechFlowNode from "./speechflow-node";
+ export default class SpeechFlowNodeVAD extends SpeechFlowNode {
+     static name: string;
+     private vad;
+     private queue;
+     private queueRecv;
+     private queueVAD;
+     private queueSend;
+     constructor(id: string, cfg: {
+         [id: string]: any;
+     }, opts: {
+         [id: string]: any;
+     }, args: any[]);
+     open(): Promise<void>;
+     close(): Promise<void>;
+ }
@@ -0,0 +1,431 @@
+ "use strict";
+ /*
+ ** SpeechFlow - Speech Processing Flow Graph
+ ** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+ ** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+ */
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+     if (k2 === undefined) k2 = k;
+     var desc = Object.getOwnPropertyDescriptor(m, k);
+     if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+         desc = { enumerable: true, get: function() { return m[k]; } };
+     }
+     Object.defineProperty(o, k2, desc);
+ }) : (function(o, m, k, k2) {
+     if (k2 === undefined) k2 = k;
+     o[k2] = m[k];
+ }));
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+     Object.defineProperty(o, "default", { enumerable: true, value: v });
+ }) : function(o, v) {
+     o["default"] = v;
+ });
+ var __importStar = (this && this.__importStar) || (function () {
+     var ownKeys = function(o) {
+         ownKeys = Object.getOwnPropertyNames || function (o) {
+             var ar = [];
+             for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+             return ar;
+         };
+         return ownKeys(o);
+     };
+     return function (mod) {
+         if (mod && mod.__esModule) return mod;
+         var result = {};
+         if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+         __setModuleDefault(result, mod);
+         return result;
+     };
+ })();
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+     return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ /* standard dependencies */
+ const node_events_1 = require("node:events");
+ const node_stream_1 = __importDefault(require("node:stream"));
+ /* external dependencies */
+ const wavefile = __importStar(require("wavefile"));
+ const vad_node_realtime_1 = require("@ericedouard/vad-node-realtime");
+ /* internal dependencies */
+ const speechflow_node_1 = __importDefault(require("./speechflow-node"));
+ /* audio stream queue pointer */
+ class AudioQueuePointer extends node_events_1.EventEmitter {
+     name;
+     queue;
+     /* internal state */
+     index = 0;
+     /* construction */
+     constructor(name, queue) {
+         super();
+         this.name = name;
+         this.queue = queue;
+     }
+     /* positioning operations */
+     maxPosition() {
+         return this.queue.elements.length;
+     }
+     position(index) {
+         if (index !== undefined) {
+             this.index = index;
+             if (this.index < 0)
+                 this.index = 0;
+             else if (this.index >= this.queue.elements.length)
+                 this.index = this.queue.elements.length;
+             this.emit("position", this.index);
+         }
+         return this.index;
+     }
+     walk(num) {
+         if (num > 0) {
+             for (let i = 0; i < num && this.index < this.queue.elements.length; i++)
+                 this.index++;
+             this.emit("position", { start: this.index });
+         }
+         else if (num < 0) {
+             for (let i = 0; i < Math.abs(num) && this.index > 0; i++)
+                 this.index--;
+             this.emit("position", { start: this.index });
+         }
+     }
+     walkForwardUntil(type) {
+         while (this.index < this.queue.elements.length
+             && this.queue.elements[this.index].type !== type)
+             this.index++;
+         this.emit("position", { start: this.index });
+     }
+     walkBackwardUntil(type) {
+         while (this.index > 0
+             && this.queue.elements[this.index].type !== type)
+             this.index--;
+         this.emit("position", { start: this.index });
+     }
+     /* search operations */
+     searchForward(type) {
+         let position = this.index;
+         while (position < this.queue.elements.length
+             && this.queue.elements[position].type !== type)
+             position++;
+         this.emit("search", { start: this.index, end: position });
+         return position;
+     }
+     searchBackward(type) {
+         let position = this.index;
+         while (position > 0
+             && this.queue.elements[position].type !== type)
+             position--;
+         this.emit("search", { start: position, end: this.index });
+     }
+     /* reading operations */
+     peek(position) {
+         if (position === undefined)
+             position = this.index;
+         else {
+             if (position < 0)
+                 position = 0;
+             else if (position >= this.queue.elements.length)
+                 position = this.queue.elements.length;
+         }
+         const element = this.queue.elements[position];
+         this.queue.emit("read", { start: position, end: position });
+         return element;
+     }
+     read() {
+         const element = this.queue.elements[this.index];
+         if (this.index < this.queue.elements.length)
+             this.index++;
+         this.queue.emit("read", { start: this.index - 1, end: this.index - 1 });
+         return element;
+     }
+     slice(size) {
+         let slice;
+         const start = this.index;
+         if (size !== undefined) {
+             slice = this.queue.elements.slice(this.index, size);
+             this.index += size;
+         }
+         else {
+             slice = this.queue.elements.slice(this.index);
+             this.index = this.queue.elements.length;
+         }
+         this.queue.emit("read", { start, end: this.index });
+         return slice;
+     }
+     /* writing operations */
+     append(element) {
+         this.queue.elements.push(element);
+         this.index = this.queue.elements.length;
+         this.queue.emit("write", { start: this.index - 1, end: this.index - 1 });
+     }
+     insert(element) {
+         this.queue.elements.splice(this.index++, 0, element);
+         this.queue.emit("write", { start: this.index - 1, end: this.index });
+     }
+     delete() {
+         if (this.index >= this.queue.elements.length)
+             throw new Error("cannot delete after last element");
+         this.queue.elements.splice(this.index, 1);
+         this.queue.emit("write", { start: this.index, end: this.index });
+     }
+ }
+ /* audio stream queue */
+ class AudioQueue extends node_events_1.EventEmitter {
+     elements = [];
+     pointers = new Map();
+     pointerUse(name) {
+         if (!this.pointers.has(name))
+             this.pointers.set(name, new AudioQueuePointer(name, this));
+         return this.pointers.get(name);
+     }
+     pointerDelete(name) {
+         if (!this.pointers.has(name))
+             throw new Error("pointer not exists");
+         this.pointers.delete(name);
+     }
+     trim() {
+         /* determine minimum pointer position */
+         let min = this.elements.length;
+         for (const pointer of this.pointers.values())
+             if (min > pointer.position())
+                 min = pointer.position();
+         /* trim the maximum amount of first elements */
+         this.elements.splice(0, min);
+         /* shift all pointers */
+         for (const pointer of this.pointers.values())
+             pointer.position(pointer.position() - min);
+     }
+ }
+ /* SpeechFlow node for VAD speech-to-speech processing */
+ class SpeechFlowNodeVAD extends speechflow_node_1.default {
+     /* declare official node name */
+     static name = "vad";
+     /* internal state */
+     vad = null;
+     queue = new AudioQueue();
+     queueRecv = this.queue.pointerUse("recv");
+     queueVAD = this.queue.pointerUse("vad");
+     queueSend = this.queue.pointerUse("send");
+     /* construct node */
+     constructor(id, cfg, opts, args) {
+         super(id, cfg, opts, args);
+         /* declare node configuration parameters */
+         this.configure({});
+         /* declare node input/output format */
+         this.input = "audio";
+         this.output = "audio";
+     }
+     /* open node */
+     async open() {
+         /* sanity check situation */
+         if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
+             throw new Error("VAD node currently supports PCM-S16LE audio only");
+         /* pass-through logging */
+         const log = (level, msg) => { this.log(level, msg); };
+         /* internal processing constants */
+         const sampleRateTarget = 16000;
+         const samplesPerVADFrame = 512; /* required for VAD v5 */
+         const minFramesPerSecond = Math.trunc(sampleRateTarget / samplesPerVADFrame) + 1;
+         /* track audio queue element changes */
+         let speechActive = false;
+         let speechStart = -1;
+         let speechEnd = -1;
+         let speechMinSeconds = 2;
+         this.queue.on("write", () => {
+             if (!speechActive) {
+                 const position = this.queueSend.searchForward("speech-start");
+                 const element = this.queueSend.peek(position);
+                 if (element !== undefined && element.type === "speech-start") {
+                     this.queueSend.position(position + 1);
+                     speechActive = true;
+                     speechStart = this.queueSend.position();
+                     speechEnd = speechStart;
+                     speechMinSeconds = 2;
+                 }
+             }
+             else {
+                 speechEnd = this.queueSend.searchForward("speech-end");
+                 /* determine number of speech and fill frames */
+                 let framesSpeech = 0;
+                 for (let f = speechStart; f < speechEnd; f++) {
+                     const element = this.queueSend.peek(f);
+                     if (element.type === "audio-frame")
+                         framesSpeech++;
+                 }
+                 let framesFilled = minFramesPerSecond - framesSpeech;
+                 if (framesFilled < 0)
+                     framesFilled = 0;
+                 /* assemble all speech and fill frames */
+                 /*
+                 const assembleFrames = () => {
+                     const speech = new Float32Array((framesSpeech + framesFilled) * samplesPerVADFrame)
+                     let i = 0
+                     for (let f = speechStart; f < speechEnd; f++) {
+                         const element = this.queueSend.peek(f)
+                         if (element.type === "audio-frame")
+                             speech.set(element.data, samplesPerVADFrame * i++)
+                     }
+                     if (framesFilled > 0)
+                         speech.fill(0.0, i * samplesPerVADFrame, (i + framesFilled) * samplesPerVADFrame)
+                     return speech
+                 }
+                 */
+                 if (speechEnd === this.queueSend.maxPosition()) {
+                     /* intermediate transcription */
+                     const duration = ((framesSpeech + framesFilled) * samplesPerVADFrame) / sampleRateTarget;
+                     if (duration >= speechMinSeconds) {
+                         /* intermediate transcription of at least the next required minimum seconds */
+                         // const samples = assembleFrames()
+                         this.log("info", `trigger intermediate transcription (duration: ${duration.toFixed(1)}s)`);
+                         // this.tqueue!.enqueue({ id: speechStart, type: "intermediate", audio: samples, language: this.params.language })
+                         speechMinSeconds++;
+                     }
+                 }
+                 else {
+                     /* final transcription */
+                     const duration = ((framesSpeech + framesFilled) * samplesPerVADFrame) / sampleRateTarget;
+                     if (duration >= 1.0) {
+                         // const samples = assembleFrames()
+                         this.log("info", `trigger final transcription (duration: ${duration.toFixed(1)}s)`);
+                         // this.tqueue!.enqueue({ id: speechStart, type: "final", audio: samples, language: this.params.language })
+                         this.queueSend.position(speechEnd + 1);
+                     }
+                     else
+                         this.log("info", `skipping final transcription -- too short (duration: ${duration.toFixed(1)}s)`);
+                     speechActive = false;
+                 }
+             }
+         });
+         /* Voice Activity Detection (VAD) */
+         this.vad = await vad_node_realtime_1.RealTimeVAD.new({
+             onSpeechStart: () => {
+                 this.log("info", "VAD: speech start");
+                 this.queueVAD.insert({ type: "speech-start" });
+             },
+             onSpeechEnd: (audio) => {
+                 this.log("info", `VAD: speech end (samples: ${audio.length})`);
+                 this.queueVAD.insert({ type: "speech-end", short: false });
+             },
+             onVADMisfire: () => {
+                 this.log("info", "VAD: speech end (segment too short)");
+                 this.queueVAD.insert({ type: "speech-end", short: true });
+             },
+             onFrameProcessed: () => {
+                 this.queueVAD.walk(+1);
+             },
+             sampleRate: 16000,
+             model: "v5",
+             frameSamples: samplesPerVADFrame, /* (= 32ms: 512 frameSamples / 16000 sampleSize) */
+             positiveSpeechThreshold: 0.50,
+             negativeSpeechThreshold: 0.35,
+             minSpeechFrames: 4, /* (= 128ms: 4 x 512 frameSamples) */
+             redemptionFrames: 8, /* (= 256ms: 8 x 512 frameSamples) */
+             preSpeechPadFrames: 1, /* (= 32ms: 1 x 512 frameSamples) */
+         });
+         this.vad.start();
+         /* provide Duplex stream and internally attach to VAD */
+         const vad = this.vad;
+         const cfg = this.config;
+         const queueRecv = this.queueRecv;
+         const queueSend = this.queueSend;
+         let carrySamples = new Float32Array();
+         let endOfStream = false;
+         this.stream = new node_stream_1.default.Duplex({
+             writableObjectMode: true,
+             readableObjectMode: true,
+             decodeStrings: false,
+             /* receive audio samples */
+             write(chunk, encoding, callback) {
+                 if (!Buffer.isBuffer(chunk.payload))
+                     callback(new Error("expected audio input as Buffer chunks"));
+                 else if (chunk.payload.byteLength === 0)
+                     callback();
+                 else {
+                     /* convert audio samples from PCM/I16/48KHz to PCM/F32/16KHz */
+                     const bufferToInt16Array = (buf) => {
+                         const dataView = new DataView(buf.buffer);
+                         const result = new Int16Array(buf.length / 2);
+                         for (let i = 0; i < result.length; i++)
+                             result[i] = dataView.getInt16(i * 2, cfg.audioLittleEndian);
+                         return result;
+                     };
+                     const wav = new wavefile.WaveFile();
+                     wav.fromScratch(cfg.audioChannels, cfg.audioSampleRate, String(cfg.audioBitDepth), bufferToInt16Array(chunk.payload));
+                     wav.toBitDepth("32f");
+                     wav.toSampleRate(16000, { method: "cubic" });
+                     let data = wav.getSamples(false, Float32Array);
+                     /* merge previous carry samples */
+                     if (carrySamples.length > 0) {
+                         const merged = new Float32Array(carrySamples.length + data.length);
+                         merged.set(carrySamples);
+                         merged.set(data, carrySamples.length);
+                         data = merged;
+                         carrySamples = new Float32Array();
+                     }
+                     /* queue audio samples as individual VAD-sized frames
+                        and in parallel send it into the Voice Activity Detection (VAD) */
+                     const chunks = Math.trunc(data.length / samplesPerVADFrame);
+                     for (let i = 0; i < chunks; i++) {
+                         const frame = data.slice(i * samplesPerVADFrame, (i + 1) * samplesPerVADFrame);
+                         queueRecv.append({ type: "audio-frame", data: frame });
+                         vad.processAudio(frame);
+                     }
+                     /* remember new carry samples */
+                     const bulkLen = chunks * samplesPerVADFrame;
+                     carrySamples = data.slice(bulkLen);
+                     callback();
+                 }
+             },
+             /* send transcription texts */
+             read(size) {
+                 if (endOfStream)
+                     this.push(null);
+                 else {
+                     queueSend.once("write", (text) => {
+                         log("info", `VAD: receive data (${text.length} bytes)`);
+                         this.push(text, cfg.textEncoding);
+                     });
+                 }
+             },
+             /* react on end of input */
+             final(callback) {
+                 if (carrySamples.length > 0) {
+                     /* flush pending audio samples */
+                     if (carrySamples.length < samplesPerVADFrame) {
+                         const merged = new Float32Array(samplesPerVADFrame);
+                         merged.set(carrySamples);
+                         merged.fill(0.0, carrySamples.length, samplesPerVADFrame);
+                         carrySamples = merged;
+                     }
+                     queueRecv.append({ type: "audio-frame", data: carrySamples });
+                     vad.processAudio(carrySamples);
+                     /* give the processing a chance to still process the remaining samples */
+                     setTimeout(() => {
+                         endOfStream = true;
+                         this.push(null);
+                         callback();
+                     }, 2000);
+                 }
+                 else {
+                     endOfStream = true;
+                     this.push(null);
+                     callback();
+                 }
+             }
+         });
+     }
+     /* close node */
+     async close() {
+         /* close stream */
+         if (this.stream !== null) {
+             this.stream.destroy();
+             this.stream = null;
+         }
+         /* close VAD */
+         if (this.vad !== null) {
+             await this.vad.flush();
+             this.vad.destroy();
+             this.vad = null;
+         }
+     }
+ }
+ exports.default = SpeechFlowNodeVAD;
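The inline comments in the compiled VAD node above state the frame timings in milliseconds; they follow directly from the 512-sample frame size at 16 kHz, as this small check shows:

```
const sampleRate   = 16000
const frameSamples = 512
const frameMs      = (frameSamples / sampleRate) * 1000   /* 32 ms per VAD frame */

console.log(frameMs)       /* 32  ms (frameSamples: 512)      */
console.log(4 * frameMs)   /* 128 ms (minSpeechFrames: 4)     */
console.log(8 * frameMs)   /* 256 ms (redemptionFrames: 8)    */
console.log(1 * frameMs)   /* 32  ms (preSpeechPadFrames: 1)  */

/* minFramesPerSecond = trunc(16000 / 512) + 1 = 32 frames, i.e. about 1.02 s of audio,
   which is why short speech segments are padded with fill frames up to roughly one second */
console.log((Math.trunc(sampleRate / frameSamples) + 1) * frameMs)   /* 1024 ms */
```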
@@ -0,0 +1,13 @@
+ import SpeechFlowNode from "./speechflow-node";
+ export default class SpeechFlowNodeKokoro extends SpeechFlowNode {
+     static name: string;
+     private kokoro;
+     private static speexInitialized;
+     constructor(id: string, cfg: {
+         [id: string]: any;
+     }, opts: {
+         [id: string]: any;
+     }, args: any[]);
+     open(): Promise<void>;
+     close(): Promise<void>;
+ }
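The declaration above only exposes the Kokoro node's shape; its implementation is not included in this diff. Based on the compiled VAD node shown earlier, a SpeechFlow node subclass appears to follow a common pattern: declare parameters via `configure()`, declare the `input`/`output` payload types, assign `this.stream` in `open()`, and tear it down in `close()`. The following is a hypothetical skeleton of that pattern only, not the actual Kokoro code:

```
import Stream from "node:stream"
import SpeechFlowNode from "./speechflow-node"

/* hypothetical node skeleton following the pattern of the compiled VAD node above */
export default class SpeechFlowNodeExample extends SpeechFlowNode {
    constructor(id: string, cfg: { [id: string]: any }, opts: { [id: string]: any }, args: any[]) {
        super(id, cfg, opts, args)
        this.configure({})      /* no configuration parameters in this sketch */
        this.input  = "text"    /* consumes text chunks ...                   */
        this.output = "audio"   /* ... and produces audio chunks              */
    }
    async open(): Promise<void> {
        /* a real node would wrap its engine (e.g. a TTS model) in a Duplex stream here */
        this.stream = new Stream.PassThrough({ objectMode: true })
    }
    async close(): Promise<void> {
        if (this.stream !== null) {
            this.stream.destroy()
            this.stream = null
        }
    }
}
```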