speechflow 0.9.8 → 0.9.9

package/README.md CHANGED
@@ -14,9 +14,13 @@ SpeechFlow
 About
 -----
 
- **SpeechFlow** is a command-line interface based tool for establishing a
- directed data flow graph of audio and text processing nodes. This way,
- it allows to perform various speech processing tasks in a flexible way.
+ **SpeechFlow** is a command-line interface based tool for establishing
+ a directed data flow graph of audio and text processing nodes. This
+ way, it allows one to perform various speech processing tasks in a very
+ flexible and configurable way. Typical supported tasks are capturing
+ audio, generating narrations of text (aka text-to-speech), generating
+ transcriptions or subtitles for audio (aka speech-to-text), and generating
+ translations for audio (aka speech-to-speech).
 
 **SpeechFlow** comes with built-in graph nodes for
 local file I/O,
@@ -26,8 +30,8 @@ remote MQTT network I/O,
 cloud-based [Deepgram](https://deepgram.com) speech-to-text conversion,
 cloud-based [ElevenLabs](https://elevenlabs.io/) text-to-speech conversion,
 cloud-based [DeepL](https://deepl.com) text-to-text translation,
- local [Gemma/Ollama](https://ollama.com/library/gemma3) text-to-text translation,
- local [Gemma/Ollama](https://ollama.com/library/gemma3) text-to-text spelling correction,
+ cloud-based [OpenAI/GPT](https://openai.com) text-to-text translation (or spelling correction),
+ local [Ollama/Gemma](https://ollama.com) text-to-text translation (or spelling correction),
 local [OPUS/ONNX](https://github.com/Helsinki-NLP/Opus-MT) text-to-text translation,
 local [FFmpeg](https://ffmpeg.org/) speech-to-speech encoding,
 local WAV speech-to-speech encoding,
@@ -88,7 +92,7 @@ They can also be found in the sample [speechflow.yaml](./etc/speechflow.yaml) fi
 }
 ```
 
- - **Narration**: Generate text file with German narration of MP3 audio file:
+ - **Transcription**: Generate text file with German transcription of MP3 audio file:
 
 ```
 file(path: argv.0, mode: "r", type: "audio") |
@@ -108,6 +112,15 @@ They can also be found in the sample [speechflow.yaml](./etc/speechflow.yaml) fi
 file(path: argv.1, mode: "w", type: "text")
 ```
 
+ - **Speaking**: Generate audio file with English voice for a text file:
+
+ ```
+ file(path: argv.0, mode: "r", type: "text") |
+ kokoro(language: "en") |
+ wav(mode: "encode") |
+ file(path: argv.1, mode: "w", type: "audio")
+ ```
+
 - **Ad-Hoc Translation**: Ad-Hoc text translation from German to English
 via stdin/stdout:
 
@@ -166,8 +179,9 @@ First a short overview of the available processing nodes:
 **deepgram**.
 - Text-to-Text nodes:
 **deepl**,
- **gemma**,
- **opus**,
+ **openai**,
+ **ollama**,
+ **transformers**,
 **subtitle**,
 **format**.
 - Text-to-Audio nodes:
@@ -305,10 +319,10 @@ First a short overview of the available processing nodes:
 | **src** | 0 | "de" | `/^(?:de\|en)$/` |
 | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
 
- - Node: **gemma**<br/>
-   Purpose: **Google Gemma Text-to-Text translation and spelling correction**<br/>
-   Example: `gemma(src: "de", dst: "en")`<br/>
-   Notice; this node requires the Ollama API!
+ - Node: **openai**<br/>
+   Purpose: **OpenAI/GPT Text-to-Text translation and spelling correction**<br/>
+   Example: `openai(src: "de", dst: "en")`<br/>
+   Notice: this node requires an OpenAI API key!
 
 | Port | Payload |
 | ------- | ----------- |
@@ -317,13 +331,32 @@ First a short overview of the available processing nodes:
 
 | Parameter | Position | Default | Requirement |
 | ------------ | --------- | -------- | ------------------ |
- | **url** | *none* | "http://127.0.0.1:11434" | `/^https?:\/\/.+?:\d+$/` |
+ | **api** | *none* | "https://api.openai.com" | `/^https?:\/\/.+?:\d+$/` |
 | **src** | 0 | "de" | `/^(?:de\|en)$/` |
 | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
+ | **key** | *none* | env.SPEECHFLOW\_KEY\_OPENAI | *none* |
+ | **model** | *none* | "gpt-4o-mini" | *none* |
 
- - Node: **opus**<br/>
-   Purpose: **OPUS Text-to-Text translation**<br/>
-   Example: `deepl(src: "de", dst: "en")`<br/>
+ - Node: **ollama**<br/>
+   Purpose: **Ollama/Gemma Text-to-Text translation and spelling correction**<br/>
+   Example: `ollama(src: "de", dst: "en")`<br/>
+   Notice: this node requires the Ollama API!
+
+ | Port | Payload |
+ | ------- | ----------- |
+ | input | text |
+ | output | text |
+
+ | Parameter | Position | Default | Requirement |
+ | ------------ | --------- | -------- | ------------------ |
+ | **api** | *none* | "http://127.0.0.1:11434" | `/^https?:\/\/.+?:\d+$/` |
+ | **model** | *none* | "gemma3:4b-it-q4_K_M" | *none* |
+ | **src** | 0 | "de" | `/^(?:de\|en)$/` |
+ | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
+
+ - Node: **transformers**<br/>
+   Purpose: **Transformers Text-to-Text translation**<br/>
+   Example: `transformers(src: "de", dst: "en")`<br/>
 
 | Port | Payload |
 | ------- | ----------- |
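For orientation, here is a minimal sketch of how the new **ollama** node could be wired into a flow, assuming the same `file(...) | node(...) | file(...)` composition pattern as the README examples above (the `argv` file paths are purely illustrative):

```
file(path: argv.0, mode: "r", type: "text") |
ollama(src: "de", dst: "en") |
file(path: argv.1, mode: "w", type: "text")
```

The **openai** node takes the same `src`/`dst` parameters; per its table above it reads the API key from `env.SPEECHFLOW_KEY_OPENAI` unless `key` is passed explicitly.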
@@ -332,6 +365,7 @@ First a short overview of the available processing nodes:
 
 | Parameter | Position | Default | Requirement |
 | ------------ | --------- | -------- | ---------------- |
+ | **model** | *none* | "OPUS" | `/^(?:OPUS\|SmolLM3)$/` |
 | **src** | 0 | "de" | `/^(?:de\|en)$/` |
 | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
 
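The new **model** parameter above belongs to the local **transformers** node and selects between the OPUS and SmolLM3 models. A hedged sketch, assuming named parameters combine as in the other README examples:

```
file(path: argv.0, mode: "r", type: "text") |
transformers(src: "de", dst: "en", model: "SmolLM3") |
file(path: argv.1, mode: "w", type: "text")
```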
@@ -379,6 +413,22 @@ First a short overview of the available processing nodes:
 | **voice** | 0 | "Brian" | *none* |
 | **language** | 1 | "de" | *none* |
 
+ - Node: **kokoro**<br/>
+   Purpose: **Kokoro Text-to-Speech conversion**<br/>
+   Example: `kokoro(language: "en")`<br/>
+   Notice: this node currently supports the English language only!
+
+ | Port | Payload |
+ | ------- | ----------- |
+ | input | text |
+ | output | audio |
+
+ | Parameter | Position | Default | Requirement |
+ | ------------ | --------- | -------- | ----------- |
+ | **voice** | 0 | "Aoede" | `/^(?:Aoede\|Heart\|Puck\|Fenrir)$/` |
+ | **language** | 1 | "en" | `/^en$/` |
+ | **speed** | 2 | 1.25 | 1.0...1.30 |
+
 ### Any-to-Any Nodes:
 
 - Node: **trace**<br/>
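A hedged sketch of the new **kokoro** node with all of its parameters spelled out, using only values from the table above (a voice from the allowed set and a speed within the documented 1.0...1.30 range):

```
file(path: argv.0, mode: "r", type: "text") |
kokoro(voice: "Heart", language: "en", speed: 1.0) |
wav(mode: "encode") |
file(path: argv.1, mode: "w", type: "audio")
```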
@@ -0,0 +1,16 @@
+ import SpeechFlowNode from "./speechflow-node";
+ export default class SpeechFlowNodeVAD extends SpeechFlowNode {
+     static name: string;
+     private vad;
+     private queue;
+     private queueRecv;
+     private queueVAD;
+     private queueSend;
+     constructor(id: string, cfg: {
+         [id: string]: any;
+     }, opts: {
+         [id: string]: any;
+     }, args: any[]);
+     open(): Promise<void>;
+     close(): Promise<void>;
+ }
@@ -0,0 +1,431 @@
+ "use strict";
+ /*
+ ** SpeechFlow - Speech Processing Flow Graph
+ ** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+ ** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+ */
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+     if (k2 === undefined) k2 = k;
+     var desc = Object.getOwnPropertyDescriptor(m, k);
+     if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+         desc = { enumerable: true, get: function() { return m[k]; } };
+     }
+     Object.defineProperty(o, k2, desc);
+ }) : (function(o, m, k, k2) {
+     if (k2 === undefined) k2 = k;
+     o[k2] = m[k];
+ }));
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+     Object.defineProperty(o, "default", { enumerable: true, value: v });
+ }) : function(o, v) {
+     o["default"] = v;
+ });
+ var __importStar = (this && this.__importStar) || (function () {
+     var ownKeys = function(o) {
+         ownKeys = Object.getOwnPropertyNames || function (o) {
+             var ar = [];
+             for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+             return ar;
+         };
+         return ownKeys(o);
+     };
+     return function (mod) {
+         if (mod && mod.__esModule) return mod;
+         var result = {};
+         if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+         __setModuleDefault(result, mod);
+         return result;
+     };
+ })();
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+     return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ /* standard dependencies */
+ const node_events_1 = require("node:events");
+ const node_stream_1 = __importDefault(require("node:stream"));
+ /* external dependencies */
+ const wavefile = __importStar(require("wavefile"));
+ const vad_node_realtime_1 = require("@ericedouard/vad-node-realtime");
+ /* internal dependencies */
+ const speechflow_node_1 = __importDefault(require("./speechflow-node"));
+ /* audio stream queue pointer */
+ class AudioQueuePointer extends node_events_1.EventEmitter {
+     name;
+     queue;
+     /* internal state */
+     index = 0;
+     /* construction */
+     constructor(name, queue) {
+         super();
+         this.name = name;
+         this.queue = queue;
+     }
+     /* positioning operations */
+     maxPosition() {
+         return this.queue.elements.length;
+     }
+     position(index) {
+         if (index !== undefined) {
+             this.index = index;
+             if (this.index < 0)
+                 this.index = 0;
+             else if (this.index >= this.queue.elements.length)
+                 this.index = this.queue.elements.length;
+             this.emit("position", this.index);
+         }
+         return this.index;
+     }
+     walk(num) {
+         if (num > 0) {
+             for (let i = 0; i < num && this.index < this.queue.elements.length; i++)
+                 this.index++;
+             this.emit("position", { start: this.index });
+         }
+         else if (num < 0) {
+             for (let i = 0; i < Math.abs(num) && this.index > 0; i++)
+                 this.index--;
+             this.emit("position", { start: this.index });
+         }
+     }
+     walkForwardUntil(type) {
+         while (this.index < this.queue.elements.length
+             && this.queue.elements[this.index].type !== type)
+             this.index++;
+         this.emit("position", { start: this.index });
+     }
+     walkBackwardUntil(type) {
+         while (this.index > 0
+             && this.queue.elements[this.index].type !== type)
+             this.index--;
+         this.emit("position", { start: this.index });
+     }
+     /* search operations */
+     searchForward(type) {
+         let position = this.index;
+         while (position < this.queue.elements.length
+             && this.queue.elements[position].type !== type)
+             position++;
+         this.emit("search", { start: this.index, end: position });
+         return position;
+     }
+     searchBackward(type) {
+         let position = this.index;
+         while (position > 0
+             && this.queue.elements[position].type !== type)
+             position--;
+         this.emit("search", { start: position, end: this.index });
+     }
+     /* reading operations */
+     peek(position) {
+         if (position === undefined)
+             position = this.index;
+         else {
+             if (position < 0)
+                 position = 0;
+             else if (position >= this.queue.elements.length)
+                 position = this.queue.elements.length;
+         }
+         const element = this.queue.elements[position];
+         this.queue.emit("read", { start: position, end: position });
+         return element;
+     }
+     read() {
+         const element = this.queue.elements[this.index];
+         if (this.index < this.queue.elements.length)
+             this.index++;
+         this.queue.emit("read", { start: this.index - 1, end: this.index - 1 });
+         return element;
+     }
+     slice(size) {
+         let slice;
+         const start = this.index;
+         if (size !== undefined) {
+             slice = this.queue.elements.slice(this.index, this.index + size);
+             this.index += size;
+         }
+         else {
+             slice = this.queue.elements.slice(this.index);
+             this.index = this.queue.elements.length;
+         }
+         this.queue.emit("read", { start, end: this.index });
+         return slice;
+     }
+     /* writing operations */
+     append(element) {
+         this.queue.elements.push(element);
+         this.index = this.queue.elements.length;
+         this.queue.emit("write", { start: this.index - 1, end: this.index - 1 });
+     }
+     insert(element) {
+         this.queue.elements.splice(this.index++, 0, element);
+         this.queue.emit("write", { start: this.index - 1, end: this.index });
+     }
+     delete() {
+         if (this.index >= this.queue.elements.length)
+             throw new Error("cannot delete after last element");
+         this.queue.elements.splice(this.index, 1);
+         this.queue.emit("write", { start: this.index, end: this.index });
+     }
+ }
+ /* audio stream queue */
+ class AudioQueue extends node_events_1.EventEmitter {
+     elements = [];
+     pointers = new Map();
+     pointerUse(name) {
+         if (!this.pointers.has(name))
+             this.pointers.set(name, new AudioQueuePointer(name, this));
+         return this.pointers.get(name);
+     }
+     pointerDelete(name) {
+         if (!this.pointers.has(name))
+             throw new Error("pointer not exists");
+         this.pointers.delete(name);
+     }
+     trim() {
+         /* determine minimum pointer position */
+         let min = this.elements.length;
+         for (const pointer of this.pointers.values())
+             if (min > pointer.position())
+                 min = pointer.position();
+         /* trim the maximum amount of first elements */
+         this.elements.splice(0, min);
+         /* shift all pointers */
+         for (const pointer of this.pointers.values())
+             pointer.position(pointer.position() - min);
+     }
+ }
+ /* SpeechFlow node for VAD speech-to-speech processing */
+ class SpeechFlowNodeVAD extends speechflow_node_1.default {
+     /* declare official node name */
+     static name = "vad";
+     /* internal state */
+     vad = null;
+     queue = new AudioQueue();
+     queueRecv = this.queue.pointerUse("recv");
+     queueVAD = this.queue.pointerUse("vad");
+     queueSend = this.queue.pointerUse("send");
+     /* construct node */
+     constructor(id, cfg, opts, args) {
+         super(id, cfg, opts, args);
+         /* declare node configuration parameters */
+         this.configure({});
+         /* declare node input/output format */
+         this.input = "audio";
+         this.output = "audio";
+     }
+     /* open node */
+     async open() {
+         /* sanity check situation */
+         if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
+             throw new Error("VAD node currently supports PCM-S16LE audio only");
+         /* pass-through logging */
+         const log = (level, msg) => { this.log(level, msg); };
+         /* internal processing constants */
+         const sampleRateTarget = 16000;
+         const samplesPerVADFrame = 512; /* required for VAD v5 */
+         const minFramesPerSecond = Math.trunc(sampleRateTarget / samplesPerVADFrame) + 1;
+         /* track audio queue element changes */
+         let speechActive = false;
+         let speechStart = -1;
+         let speechEnd = -1;
+         let speechMinSeconds = 2;
+         this.queue.on("write", () => {
+             if (!speechActive) {
+                 const position = this.queueSend.searchForward("speech-start");
+                 const element = this.queueSend.peek(position);
+                 if (element !== undefined && element.type === "speech-start") {
+                     this.queueSend.position(position + 1);
+                     speechActive = true;
+                     speechStart = this.queueSend.position();
+                     speechEnd = speechStart;
+                     speechMinSeconds = 2;
+                 }
+             }
+             else {
+                 speechEnd = this.queueSend.searchForward("speech-end");
+                 /* determine number of speech and fill frames */
+                 let framesSpeech = 0;
+                 for (let f = speechStart; f < speechEnd; f++) {
+                     const element = this.queueSend.peek(f);
+                     if (element.type === "audio-frame")
+                         framesSpeech++;
+                 }
+                 let framesFilled = minFramesPerSecond - framesSpeech;
+                 if (framesFilled < 0)
+                     framesFilled = 0;
+                 /* assemble all speech and fill frames */
+                 /*
+                 const assembleFrames = () => {
+                     const speech = new Float32Array((framesSpeech + framesFilled) * samplesPerVADFrame)
+                     let i = 0
+                     for (let f = speechStart; f < speechEnd; f++) {
+                         const element = this.queueSend.peek(f)
+                         if (element.type === "audio-frame")
+                             speech.set(element.data, samplesPerVADFrame * i++)
+                     }
+                     if (framesFilled > 0)
+                         speech.fill(0.0, i * samplesPerVADFrame, (i + framesFilled) * samplesPerVADFrame)
+                     return speech
+                 }
+                 */
+                 if (speechEnd === this.queueSend.maxPosition()) {
+                     /* intermediate transcription */
+                     const duration = ((framesSpeech + framesFilled) * samplesPerVADFrame) / sampleRateTarget;
+                     if (duration >= speechMinSeconds) {
+                         /* intermediate transcription of at least the next required minimum seconds */
+                         // const samples = assembleFrames()
+                         this.log("info", `trigger intermediate transcription (duration: ${duration.toFixed(1)}s)`);
+                         // this.tqueue!.enqueue({ id: speechStart, type: "intermediate", audio: samples, language: this.params.language })
+                         speechMinSeconds++;
+                     }
+                 }
+                 else {
+                     /* final transcription */
+                     const duration = ((framesSpeech + framesFilled) * samplesPerVADFrame) / sampleRateTarget;
+                     if (duration >= 1.0) {
+                         // const samples = assembleFrames()
+                         this.log("info", `trigger final transcription (duration: ${duration.toFixed(1)}s)`);
+                         // this.tqueue!.enqueue({ id: speechStart, type: "final", audio: samples, language: this.params.language })
+                         this.queueSend.position(speechEnd + 1);
+                     }
+                     else
+                         this.log("info", `skipping final transcription -- too short (duration: ${duration.toFixed(1)}s)`);
+                     speechActive = false;
+                 }
+             }
+         });
+         /* Voice Activity Detection (VAD) */
+         this.vad = await vad_node_realtime_1.RealTimeVAD.new({
+             onSpeechStart: () => {
+                 this.log("info", "VAD: speech start");
+                 this.queueVAD.insert({ type: "speech-start" });
+             },
+             onSpeechEnd: (audio) => {
+                 this.log("info", `VAD: speech end (samples: ${audio.length})`);
+                 this.queueVAD.insert({ type: "speech-end", short: false });
+             },
+             onVADMisfire: () => {
+                 this.log("info", "VAD: speech end (segment too short)");
+                 this.queueVAD.insert({ type: "speech-end", short: true });
+             },
+             onFrameProcessed: () => {
+                 this.queueVAD.walk(+1);
+             },
+             sampleRate: 16000,
+             model: "v5",
+             frameSamples: samplesPerVADFrame, /* (= 32ms: 512 frameSamples / 16000 sampleSize) */
+             positiveSpeechThreshold: 0.50,
+             negativeSpeechThreshold: 0.35,
+             minSpeechFrames: 4, /* (= 128ms: 4 x 512 frameSamples) */
+             redemptionFrames: 8, /* (= 256ms: 8 x 512 frameSamples) */
+             preSpeechPadFrames: 1, /* (= 32ms: 1 x 512 frameSamples) */
+         });
+         this.vad.start();
+         /* provide Duplex stream and internally attach to VAD */
+         const vad = this.vad;
+         const cfg = this.config;
+         const queueRecv = this.queueRecv;
+         const queueSend = this.queueSend;
+         let carrySamples = new Float32Array();
+         let endOfStream = false;
+         this.stream = new node_stream_1.default.Duplex({
+             writableObjectMode: true,
+             readableObjectMode: true,
+             decodeStrings: false,
+             /* receive audio samples */
+             write(chunk, encoding, callback) {
+                 if (!Buffer.isBuffer(chunk.payload))
+                     callback(new Error("expected audio input as Buffer chunks"));
+                 else if (chunk.payload.byteLength === 0)
+                     callback();
+                 else {
+                     /* convert audio samples from PCM/I16/48KHz to PCM/F32/16KHz */
+                     const bufferToInt16Array = (buf) => {
+                         const dataView = new DataView(buf.buffer);
+                         const result = new Int16Array(buf.length / 2);
+                         for (let i = 0; i < result.length; i++)
+                             result[i] = dataView.getInt16(i * 2, cfg.audioLittleEndian);
+                         return result;
+                     };
+                     const wav = new wavefile.WaveFile();
+                     wav.fromScratch(cfg.audioChannels, cfg.audioSampleRate, String(cfg.audioBitDepth), bufferToInt16Array(chunk.payload));
+                     wav.toBitDepth("32f");
+                     wav.toSampleRate(16000, { method: "cubic" });
+                     let data = wav.getSamples(false, Float32Array);
+                     /* merge previous carry samples */
+                     if (carrySamples.length > 0) {
+                         const merged = new Float32Array(carrySamples.length + data.length);
+                         merged.set(carrySamples);
+                         merged.set(data, carrySamples.length);
+                         data = merged;
+                         carrySamples = new Float32Array();
+                     }
+                     /* queue audio samples as individual VAD-sized frames
+                        and in parallel send it into the Voice Activity Detection (VAD) */
+                     const chunks = Math.trunc(data.length / samplesPerVADFrame);
+                     for (let i = 0; i < chunks; i++) {
+                         const frame = data.slice(i * samplesPerVADFrame, (i + 1) * samplesPerVADFrame);
+                         queueRecv.append({ type: "audio-frame", data: frame });
+                         vad.processAudio(frame);
+                     }
+                     /* remember new carry samples */
+                     const bulkLen = chunks * samplesPerVADFrame;
+                     carrySamples = data.slice(bulkLen);
+                     callback();
+                 }
+             },
+             /* send transcription texts */
+             read(size) {
+                 if (endOfStream)
+                     this.push(null);
+                 else {
+                     queueSend.once("write", (text) => {
+                         log("info", `VAD: receive data (${text.length} bytes)`);
+                         this.push(text, cfg.textEncoding);
+                     });
+                 }
+             },
+             /* react on end of input */
+             final(callback) {
+                 if (carrySamples.length > 0) {
+                     /* flush pending audio samples */
+                     if (carrySamples.length < samplesPerVADFrame) {
+                         const merged = new Float32Array(samplesPerVADFrame);
+                         merged.set(carrySamples);
+                         merged.fill(0.0, carrySamples.length, samplesPerVADFrame);
+                         carrySamples = merged;
+                     }
+                     queueRecv.append({ type: "audio-frame", data: carrySamples });
+                     vad.processAudio(carrySamples);
+                     /* give the processing a chance to still process the remaining samples */
+                     setTimeout(() => {
+                         endOfStream = true;
+                         this.push(null);
+                         callback();
+                     }, 2000);
+                 }
+                 else {
+                     endOfStream = true;
+                     this.push(null);
+                     callback();
+                 }
+             }
+         });
+     }
+     /* close node */
+     async close() {
+         /* close stream */
+         if (this.stream !== null) {
+             this.stream.destroy();
+             this.stream = null;
+         }
+         /* close VAD */
+         if (this.vad !== null) {
+             await this.vad.flush();
+             this.vad.destroy();
+             this.vad = null;
+         }
+     }
+ }
+ exports.default = SpeechFlowNodeVAD;
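The compiled module above registers a new **vad** node (`static name = "vad"`): it declares no configuration parameters (`this.configure({})`) and both consumes and produces audio. A speculative sketch of where such a node might sit in a flow, assuming it simply gates the audio stream before further processing (the surrounding nodes are taken from the README examples):

```
file(path: argv.0, mode: "r", type: "audio") |
vad() |
wav(mode: "encode") |
file(path: argv.1, mode: "w", type: "audio")
```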
@@ -0,0 +1,13 @@
+ import SpeechFlowNode from "./speechflow-node";
+ export default class SpeechFlowNodeKokoro extends SpeechFlowNode {
+     static name: string;
+     private kokoro;
+     private static speexInitialized;
+     constructor(id: string, cfg: {
+         [id: string]: any;
+     }, opts: {
+         [id: string]: any;
+     }, args: any[]);
+     open(): Promise<void>;
+     close(): Promise<void>;
+ }