speechflow 0.9.8 → 0.9.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +10 -0
- package/LICENSE.txt +674 -0
- package/README.md +66 -16
- package/dst/speechflow-node-a2a-vad.d.ts +16 -0
- package/dst/speechflow-node-a2a-vad.js +431 -0
- package/dst/speechflow-node-t2a-kokoro.d.ts +13 -0
- package/dst/speechflow-node-t2a-kokoro.js +147 -0
- package/dst/speechflow-node-t2t-gemma.js +23 -3
- package/dst/speechflow-node-t2t-ollama.d.ts +13 -0
- package/dst/speechflow-node-t2t-ollama.js +245 -0
- package/dst/speechflow-node-t2t-openai.d.ts +13 -0
- package/dst/speechflow-node-t2t-openai.js +225 -0
- package/dst/speechflow-node-t2t-opus.js +1 -1
- package/dst/speechflow-node-t2t-transformers.d.ts +14 -0
- package/dst/speechflow-node-t2t-transformers.js +260 -0
- package/dst/speechflow-node-x2x-trace.js +2 -2
- package/dst/speechflow.js +86 -40
- package/etc/speechflow.yaml +9 -2
- package/etc/stx.conf +1 -1
- package/package.json +7 -6
- package/src/speechflow-node-t2a-kokoro.ts +160 -0
- package/src/{speechflow-node-t2t-gemma.ts → speechflow-node-t2t-ollama.ts} +44 -10
- package/src/speechflow-node-t2t-openai.ts +246 -0
- package/src/speechflow-node-t2t-transformers.ts +244 -0
- package/src/speechflow-node-x2x-trace.ts +2 -2
- package/src/speechflow.ts +86 -40
- package/src/speechflow-node-t2t-opus.ts +0 -111
package/README.md
CHANGED
@@ -14,9 +14,13 @@ SpeechFlow
 About
 -----
 
-**SpeechFlow** is a command-line interface based tool for establishing
-directed data flow graph of audio and text processing nodes. This
-it allows to perform various speech processing tasks in a
+**SpeechFlow** is a command-line interface based tool for establishing
+a directed data flow graph of audio and text processing nodes. This
+way, it allows to perform various speech processing tasks in a very
+flexible and configurable way. The usual supported tasks are capturing
+audio, generate narrations of text (aka text-to-speech), generate
+transcriptions or subtitles for audio (aka speech-to-text), and generate
+translations for audio (aka speech-to-speech).
 
 **SpeechFlow** comes with built-in graph nodes for
 local file I/O,
@@ -26,8 +30,8 @@ remote MQTT network I/O,
 cloud-based [Deepgram](https://deepgram.com) speech-to-text conversion,
 cloud-based [ElevenLabs](https://elevenlabs.io/) text-to-speech conversion,
 cloud-based [DeepL](https://deepl.com) text-to-text translation,
-
-local [Gemma
+cloud-based [OpenAI/GPT](https://openai.com) text-to-text translation (or spelling correction),
+local [Ollama/Gemma](https://ollama.com) text-to-text translation (or spelling correction),
 local [OPUS/ONNX](https://github.com/Helsinki-NLP/Opus-MT) text-to-text translation,
 local [FFmpeg](https://ffmpeg.org/) speech-to-speech encoding,
 local WAV speech-to-speech encoding,
@@ -88,7 +92,7 @@ They can also be found in the sample [speechflow.yaml](./etc/speechflow.yaml) fi
 }
 ```
 
-- **
+- **Transcription**: Generate text file with German transcription of MP3 audio file:
 
 ```
 file(path: argv.0, mode: "r", type: "audio") |
@@ -108,6 +112,15 @@ They can also be found in the sample [speechflow.yaml](./etc/speechflow.yaml) fi
 file(path: argv.1, mode: "w", type: "text")
 ```
 
+- **Speaking**: Generate audio file with English voice for a text file:
+
+```
+file(path: argv.0, mode: "r", type: "text") |
+kokoro(language: "en") |
+wav(mode: "encode") |
+file(path: argv.1, mode: "w", type: "audio")
+```
+
 - **Ad-Hoc Translation**: Ad-Hoc text translation from German to English
 via stdin/stdout:
 
@@ -166,8 +179,9 @@ First a short overview of the available processing nodes:
 **deepgram**.
 - Text-to-Text nodes:
 **deepl**,
-**
-**
+**openai**,
+**ollama**,
+**transformers**,
 **subtitle**,
 **format**.
 - Text-to-Audio nodes:
@@ -305,10 +319,10 @@ First a short overview of the available processing nodes:
 | **src** | 0 | "de" | `/^(?:de\|en)$/` |
 | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
 
-- Node: **
-Purpose: **
-Example: `
-Notice
+- Node: **openai**<br/>
+Purpose: **OpenAI/GPT Text-to-Text translation and spelling correction**<br/>
+Example: `openai(src: "de", dst: "en")`<br/>
+Notice: this node requires an OpenAI API key!
 
 | Port | Payload |
 | ------- | ----------- |
@@ -317,13 +331,32 @@ First a short overview of the available processing nodes:
 
 | Parameter | Position | Default | Requirement |
 | ------------ | --------- | -------- | ------------------ |
-| **
+| **api** | *none* | "https://api.openai.com" | `/^https?:\/\/.+?:\d+$/` |
 | **src** | 0 | "de" | `/^(?:de\|en)$/` |
 | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
+| **key** | *none* | env.SPEECHFLOW\_KEY\_OPENAI | *none* |
+| **model** | *none* | "gpt-4o-mini" | *none* |
 
-- Node: **
-Purpose: **
-Example: `
+- Node: **ollama**<br/>
+Purpose: **Ollama/Gemma Text-to-Text translation and spelling correction**<br/>
+Example: `ollama(src: "de", dst: "en")`<br/>
+Notice: this node requires the Ollama API!
+
+| Port | Payload |
+| ------- | ----------- |
+| input | text |
+| output | text |
+
+| Parameter | Position | Default | Requirement |
+| ------------ | --------- | -------- | ------------------ |
+| **api** | *none* | "http://127.0.0.1:11434" | `/^https?:\/\/.+?:\d+$/` |
+| **model** | *none* | "gemma3:4b-it-q4_K_M" | *none* |
+| **src** | 0 | "de" | `/^(?:de\|en)$/` |
+| **dst** | 1 | "en" | `/^(?:de\|en)$/` |
+
+- Node: **transformers**<br/>
+Purpose: **Transformers Text-to-Text translation**<br/>
+Example: `transformers(src: "de", dst: "en")`<br/>
 
 | Port | Payload |
 | ------- | ----------- |
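Editor's note: the new **ollama** node defaults above point at a local Ollama endpoint ("http://127.0.0.1:11434") and a Gemma model. Purely as an illustration of what such a text-to-text step amounts to, here is a minimal TypeScript sketch that sends one text chunk to Ollama's public `/api/generate` REST endpoint for a de-to-en translation. This is not the package's actual `speechflow-node-t2t-ollama.ts` implementation; the prompt wording and function name are assumptions, only the defaults are taken from the table above.

```
/* Minimal sketch (not the package's code): translate one text chunk
   via a local Ollama server, using the README's documented defaults. */
async function translateViaOllama (text: string, src = "de", dst = "en"): Promise<string> {
    const api   = "http://127.0.0.1:11434"   /* README default "api" parameter   */
    const model = "gemma3:4b-it-q4_K_M"      /* README default "model" parameter */
    const response = await fetch(`${api}/api/generate`, {
        method:  "POST",
        headers: { "Content-Type": "application/json" },
        body:    JSON.stringify({
            model,
            prompt: `Translate the following text from ${src} to ${dst} ` +
                    `and output only the translation:\n\n${text}`,
            stream: false                    /* single JSON response instead of a stream */
        })
    })
    const result = await response.json() as { response: string }
    return result.response.trim()
}

/* usage: translateViaOllama("Guten Morgen!").then((t) => console.log(t)) */
```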
@@ -332,6 +365,7 @@ First a short overview of the available processing nodes:
 
 | Parameter | Position | Default | Requirement |
 | ------------ | --------- | -------- | ---------------- |
+| **model** | *none* | "OPUS" | `/^(?:OPUS|SmolLM3)$/` |
 | **src** | 0 | "de" | `/^(?:de\|en)$/` |
 | **dst** | 1 | "en" | `/^(?:de\|en)$/` |
 
@@ -379,6 +413,22 @@ First a short overview of the available processing nodes:
 | **voice** | 0 | "Brian" | *none* |
 | **language** | 1 | "de" | *none* |
 
+- Node: **kokoro**<br/>
+Purpose: **Kokoro Text-to-Speech conversion**<br/>
+Example: `kokoro(language: "en")`<br/>
+Notice: this currently support English language only!
+
+| Port | Payload |
+| ------- | ----------- |
+| input | text |
+| output | audio |
+
+| Parameter | Position | Default | Requirement |
+| ------------ | --------- | -------- | ----------- |
+| **voice** | 0 | "Aoede" | `/^(?:Aoede|Heart|Puck|Fenrir)$/` |
+| **language** | 1 | "en" | `/^en$/` |
+| **speed** | 2 | 1.25 | 1.0...1.30 |
+
 ### Any-to-Any Nodes:
 
 - Node: **trace**<br/>
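Editor's note: the **openai** node added earlier in this README diff defaults to model "gpt-4o-mini" and reads its key from env.SPEECHFLOW_KEY_OPENAI. A hedged TypeScript sketch of what one such chat-completion call could look like with the official openai npm client follows; it is not the package's actual `speechflow-node-t2t-openai.ts` code, and the prompt text is an assumption.

```
import OpenAI from "openai"

/* Minimal sketch (not the package's code): translate and spell-correct one
   text chunk, using the defaults documented in the README table above. */
async function translateViaOpenAI (text: string, src = "de", dst = "en"): Promise<string> {
    const client = new OpenAI({ apiKey: process.env.SPEECHFLOW_KEY_OPENAI })  /* README default "key" */
    const completion = await client.chat.completions.create({
        model: "gpt-4o-mini",                                                 /* README default "model" */
        messages: [
            { role: "system", content: `Translate from ${src} to ${dst} and fix obvious spelling errors. Output only the result.` },
            { role: "user",   content: text }
        ]
    })
    return completion.choices[0].message.content ?? ""
}
```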
package/dst/speechflow-node-a2a-vad.d.ts
ADDED
@@ -0,0 +1,16 @@
+import SpeechFlowNode from "./speechflow-node";
+export default class SpeechFlowNodeVAD extends SpeechFlowNode {
+    static name: string;
+    private vad;
+    private queue;
+    private queueRecv;
+    private queueVAD;
+    private queueSend;
+    constructor(id: string, cfg: {
+        [id: string]: any;
+    }, opts: {
+        [id: string]: any;
+    }, args: any[]);
+    open(): Promise<void>;
+    close(): Promise<void>;
+}
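Editor's note: the private `queue`, `queueRecv`, `queueVAD`, and `queueSend` fields in this declaration correspond to the multi-pointer audio queue implemented in the added `speechflow-node-a2a-vad.js` below: one shared list of frame/marker elements, independent named pointers for the receiving, VAD, and sending sides, and a trim step that discards only elements every pointer has already passed. A condensed TypeScript sketch of just that idea (simplified types and names, not the shipped code):

```
/* Condensed sketch of the VAD node's multi-pointer queue idea. */
type Element = { type: "audio-frame" | "speech-start" | "speech-end", data?: Float32Array }

class MultiPointerQueue {
    elements: Element[] = []
    private pointers = new Map<string, number>()
    use      (name: string)          { if (!this.pointers.has(name)) this.pointers.set(name, 0) }
    position (name: string)          { return this.pointers.get(name) ?? 0 }
    advance  (name: string, n = 1)   { this.pointers.set(name, Math.min(this.position(name) + n, this.elements.length)) }
    append   (element: Element)      { this.elements.push(element) }
    /* drop only what every pointer has already consumed, then shift all pointers back */
    trim () {
        const min = Math.min(this.elements.length, ...this.pointers.values())
        this.elements.splice(0, min)
        for (const [ name, pos ] of this.pointers)
            this.pointers.set(name, pos - min)
    }
}
```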
package/dst/speechflow-node-a2a-vad.js
ADDED
@@ -0,0 +1,431 @@
+"use strict";
+/*
+** SpeechFlow - Speech Processing Flow Graph
+** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+*/
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+        desc = { enumerable: true, get: function() { return m[k]; } };
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+    Object.defineProperty(o, "default", { enumerable: true, value: v });
+}) : function(o, v) {
+    o["default"] = v;
+});
+var __importStar = (this && this.__importStar) || (function () {
+    var ownKeys = function(o) {
+        ownKeys = Object.getOwnPropertyNames || function (o) {
+            var ar = [];
+            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+            return ar;
+        };
+        return ownKeys(o);
+    };
+    return function (mod) {
+        if (mod && mod.__esModule) return mod;
+        var result = {};
+        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+        __setModuleDefault(result, mod);
+        return result;
+    };
+})();
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+/* standard dependencies */
+const node_events_1 = require("node:events");
+const node_stream_1 = __importDefault(require("node:stream"));
+/* external dependencies */
+const wavefile = __importStar(require("wavefile"));
+const vad_node_realtime_1 = require("@ericedouard/vad-node-realtime");
+/* internal dependencies */
+const speechflow_node_1 = __importDefault(require("./speechflow-node"));
+/* audio stream queue pointer */
+class AudioQueuePointer extends node_events_1.EventEmitter {
+    name;
+    queue;
+    /* internal state */
+    index = 0;
+    /* construction */
+    constructor(name, queue) {
+        super();
+        this.name = name;
+        this.queue = queue;
+    }
+    /* positioning operations */
+    maxPosition() {
+        return this.queue.elements.length;
+    }
+    position(index) {
+        if (index !== undefined) {
+            this.index = index;
+            if (this.index < 0)
+                this.index = 0;
+            else if (this.index >= this.queue.elements.length)
+                this.index = this.queue.elements.length;
+            this.emit("position", this.index);
+        }
+        return this.index;
+    }
+    walk(num) {
+        if (num > 0) {
+            for (let i = 0; i < num && this.index < this.queue.elements.length; i++)
+                this.index++;
+            this.emit("position", { start: this.index });
+        }
+        else if (num < 0) {
+            for (let i = 0; i < Math.abs(num) && this.index > 0; i++)
+                this.index--;
+            this.emit("position", { start: this.index });
+        }
+    }
+    walkForwardUntil(type) {
+        while (this.index < this.queue.elements.length
+            && this.queue.elements[this.index].type !== type)
+            this.index++;
+        this.emit("position", { start: this.index });
+    }
+    walkBackwardUntil(type) {
+        while (this.index > 0
+            && this.queue.elements[this.index].type !== type)
+            this.index--;
+        this.emit("position", { start: this.index });
+    }
+    /* search operations */
+    searchForward(type) {
+        let position = this.index;
+        while (position < this.queue.elements.length
+            && this.queue.elements[position].type !== type)
+            position++;
+        this.emit("search", { start: this.index, end: position });
+        return position;
+    }
+    searchBackward(type) {
+        let position = this.index;
+        while (position > 0
+            && this.queue.elements[position].type !== type)
+            position--;
+        this.emit("search", { start: position, end: this.index });
+    }
+    /* reading operations */
+    peek(position) {
+        if (position === undefined)
+            position = this.index;
+        else {
+            if (position < 0)
+                position = 0;
+            else if (position >= this.queue.elements.length)
+                position = this.queue.elements.length;
+        }
+        const element = this.queue.elements[position];
+        this.queue.emit("read", { start: position, end: position });
+        return element;
+    }
+    read() {
+        const element = this.queue.elements[this.index];
+        if (this.index < this.queue.elements.length)
+            this.index++;
+        this.queue.emit("read", { start: this.index - 1, end: this.index - 1 });
+        return element;
+    }
+    slice(size) {
+        let slice;
+        const start = this.index;
+        if (size !== undefined) {
+            slice = this.queue.elements.slice(this.index, size);
+            this.index += size;
+        }
+        else {
+            slice = this.queue.elements.slice(this.index);
+            this.index = this.queue.elements.length;
+        }
+        this.queue.emit("read", { start, end: this.index });
+        return slice;
+    }
+    /* writing operations */
+    append(element) {
+        this.queue.elements.push(element);
+        this.index = this.queue.elements.length;
+        this.queue.emit("write", { start: this.index - 1, end: this.index - 1 });
+    }
+    insert(element) {
+        this.queue.elements.splice(this.index++, 0, element);
+        this.queue.emit("write", { start: this.index - 1, end: this.index });
+    }
+    delete() {
+        if (this.index >= this.queue.elements.length)
+            throw new Error("cannot delete after last element");
+        this.queue.elements.splice(this.index, 1);
+        this.queue.emit("write", { start: this.index, end: this.index });
+    }
+}
+/* audio stream queue */
+class AudioQueue extends node_events_1.EventEmitter {
+    elements = [];
+    pointers = new Map();
+    pointerUse(name) {
+        if (!this.pointers.has(name))
+            this.pointers.set(name, new AudioQueuePointer(name, this));
+        return this.pointers.get(name);
+    }
+    pointerDelete(name) {
+        if (!this.pointers.has(name))
+            throw new Error("pointer not exists");
+        this.pointers.delete(name);
+    }
+    trim() {
+        /* determine minimum pointer position */
+        let min = this.elements.length;
+        for (const pointer of this.pointers.values())
+            if (min > pointer.position())
+                min = pointer.position();
+        /* trim the maximum amount of first elements */
+        this.elements.splice(0, min);
+        /* shift all pointers */
+        for (const pointer of this.pointers.values())
+            pointer.position(pointer.position() - min);
+    }
+}
+/* SpeechFlow node for VAD speech-to-speech processing */
+class SpeechFlowNodeVAD extends speechflow_node_1.default {
+    /* declare official node name */
+    static name = "vad";
+    /* internal state */
+    vad = null;
+    queue = new AudioQueue();
+    queueRecv = this.queue.pointerUse("recv");
+    queueVAD = this.queue.pointerUse("vad");
+    queueSend = this.queue.pointerUse("send");
+    /* construct node */
+    constructor(id, cfg, opts, args) {
+        super(id, cfg, opts, args);
+        /* declare node configuration parameters */
+        this.configure({});
+        /* declare node input/output format */
+        this.input = "audio";
+        this.output = "audio";
+    }
+    /* open node */
+    async open() {
+        /* sanity check situation */
+        if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
+            throw new Error("VAD node currently supports PCM-S16LE audio only");
+        /* pass-through logging */
+        const log = (level, msg) => { this.log(level, msg); };
+        /* internal processing constants */
+        const sampleRateTarget = 16000;
+        const samplesPerVADFrame = 512; /* required for VAD v5 */
+        const minFramesPerSecond = Math.trunc(sampleRateTarget / samplesPerVADFrame) + 1;
+        /* track audio queue element changes */
+        let speechActive = false;
+        let speechStart = -1;
+        let speechEnd = -1;
+        let speechMinSeconds = 2;
+        this.queue.on("write", () => {
+            if (!speechActive) {
+                const position = this.queueSend.searchForward("speech-start");
+                const element = this.queueSend.peek(position);
+                if (element !== undefined && element.type === "speech-start") {
+                    this.queueSend.position(position + 1);
+                    speechActive = true;
+                    speechStart = this.queueSend.position();
+                    speechEnd = speechStart;
+                    speechMinSeconds = 2;
+                }
+            }
+            else {
+                speechEnd = this.queueSend.searchForward("speech-end");
+                /* determine number of speech and fill frames */
+                let framesSpeech = 0;
+                for (let f = speechStart; f < speechEnd; f++) {
+                    const element = this.queueSend.peek(f);
+                    if (element.type === "audio-frame")
+                        framesSpeech++;
+                }
+                let framesFilled = minFramesPerSecond - framesSpeech;
+                if (framesFilled < 0)
+                    framesFilled = 0;
+                /* assemble all speech and fill frames */
+                /*
+                const assembleFrames = () => {
+                    const speech = new Float32Array((framesSpeech + framesFilled) * samplesPerVADFrame)
+                    let i = 0
+                    for (let f = speechStart; f < speechEnd; f++) {
+                        const element = this.queueSend.peek(f)
+                        if (element.type === "audio-frame")
+                            speech.set(element.data, samplesPerVADFrame * i++)
+                    }
+                    if (framesFilled > 0)
+                        speech.fill(0.0, i * samplesPerVADFrame, (i + framesFilled) * samplesPerVADFrame)
+                    return speech
+                }
+                */
+                if (speechEnd === this.queueSend.maxPosition()) {
+                    /* intermediate transcription */
+                    const duration = ((framesSpeech + framesFilled) * samplesPerVADFrame) / sampleRateTarget;
+                    if (duration >= speechMinSeconds) {
+                        /* intermediate transcription of at least the next required minimum seconds */
+                        // const samples = assembleFrames()
+                        this.log("info", `trigger intermediate transcription (duration: ${duration.toFixed(1)}s)`);
+                        // this.tqueue!.enqueue({ id: speechStart, type: "intermediate", audio: samples, language: this.params.language })
+                        speechMinSeconds++;
+                    }
+                }
+                else {
+                    /* final transcription */
+                    const duration = ((framesSpeech + framesFilled) * samplesPerVADFrame) / sampleRateTarget;
+                    if (duration >= 1.0) {
+                        // const samples = assembleFrames()
+                        this.log("info", `trigger final transcription (duration: ${duration.toFixed(1)}s)`);
+                        // this.tqueue!.enqueue({ id: speechStart, type: "final", audio: samples, language: this.params.language })
+                        this.queueSend.position(speechEnd + 1);
+                    }
+                    else
+                        this.log("info", `skipping final transcription -- too short (duration: ${duration.toFixed(1)}s)`);
+                    speechActive = false;
+                }
+            }
+        });
+        /* Voice Activity Detection (VAD) */
+        this.vad = await vad_node_realtime_1.RealTimeVAD.new({
+            onSpeechStart: () => {
+                this.log("info", "VAD: speech start");
+                this.queueVAD.insert({ type: "speech-start" });
+            },
+            onSpeechEnd: (audio) => {
+                this.log("info", `VAD: speech end (samples: ${audio.length})`);
+                this.queueVAD.insert({ type: "speech-end", short: false });
+            },
+            onVADMisfire: () => {
+                this.log("info", "VAD: speech end (segment too short)");
+                this.queueVAD.insert({ type: "speech-end", short: true });
+            },
+            onFrameProcessed: () => {
+                this.queueVAD.walk(+1);
+            },
+            sampleRate: 16000,
+            model: "v5",
+            frameSamples: samplesPerVADFrame, /* (= 32ms: 512 frameSamples / 16000 sampleSize) */
+            positiveSpeechThreshold: 0.50,
+            negativeSpeechThreshold: 0.35,
+            minSpeechFrames: 4, /* (= 128ms: 4 x 512 frameSamples) */
+            redemptionFrames: 8, /* (= 256ms: 8 x 512 frameSamples) */
+            preSpeechPadFrames: 1, /* (= 32ms: 1 x 512 frameSamples) */
+        });
+        this.vad.start();
+        /* provide Duplex stream and internally attach to VAD */
+        const vad = this.vad;
+        const cfg = this.config;
+        const queueRecv = this.queueRecv;
+        const queueSend = this.queueSend;
+        let carrySamples = new Float32Array();
+        let endOfStream = false;
+        this.stream = new node_stream_1.default.Duplex({
+            writableObjectMode: true,
+            readableObjectMode: true,
+            decodeStrings: false,
+            /* receive audio samples */
+            write(chunk, encoding, callback) {
+                if (!Buffer.isBuffer(chunk.payload))
+                    callback(new Error("expected audio input as Buffer chunks"));
+                else if (chunk.payload.byteLength === 0)
+                    callback();
+                else {
+                    /* convert audio samples from PCM/I16/48KHz to PCM/F32/16KHz */
+                    const bufferToInt16Array = (buf) => {
+                        const dataView = new DataView(buf.buffer);
+                        const result = new Int16Array(buf.length / 2);
+                        for (let i = 0; i < result.length; i++)
+                            result[i] = dataView.getInt16(i * 2, cfg.audioLittleEndian);
+                        return result;
+                    };
+                    const wav = new wavefile.WaveFile();
+                    wav.fromScratch(cfg.audioChannels, cfg.audioSampleRate, String(cfg.audioBitDepth), bufferToInt16Array(chunk.payload));
+                    wav.toBitDepth("32f");
+                    wav.toSampleRate(16000, { method: "cubic" });
+                    let data = wav.getSamples(false, Float32Array);
+                    /* merge previous carry samples */
+                    if (carrySamples.length > 0) {
+                        const merged = new Float32Array(carrySamples.length + data.length);
+                        merged.set(carrySamples);
+                        merged.set(data, carrySamples.length);
+                        data = merged;
+                        carrySamples = new Float32Array();
+                    }
+                    /* queue audio samples as individual VAD-sized frames
+                       and in parallel send it into the Voice Activity Detection (VAD) */
+                    const chunks = Math.trunc(data.length / samplesPerVADFrame);
+                    for (let i = 0; i < chunks; i++) {
+                        const frame = data.slice(i * samplesPerVADFrame, (i + 1) * samplesPerVADFrame);
+                        queueRecv.append({ type: "audio-frame", data: frame });
+                        vad.processAudio(frame);
+                    }
+                    /* remember new carry samples */
+                    const bulkLen = chunks * samplesPerVADFrame;
+                    carrySamples = data.slice(bulkLen);
+                    callback();
+                }
+            },
+            /* send transcription texts */
+            read(size) {
+                if (endOfStream)
+                    this.push(null);
+                else {
+                    queueSend.once("write", (text) => {
+                        log("info", `VAD: receive data (${text.length} bytes)`);
+                        this.push(text, cfg.textEncoding);
+                    });
+                }
+            },
+            /* react on end of input */
+            final(callback) {
+                if (carrySamples.length > 0) {
+                    /* flush pending audio samples */
+                    if (carrySamples.length < samplesPerVADFrame) {
+                        const merged = new Float32Array(samplesPerVADFrame);
+                        merged.set(carrySamples);
+                        merged.fill(0.0, carrySamples.length, samplesPerVADFrame);
+                        carrySamples = merged;
+                    }
+                    queueRecv.append({ type: "audio-frame", data: carrySamples });
+                    vad.processAudio(carrySamples);
+                    /* give the processing a chance to still process the remaining samples */
+                    setTimeout(() => {
+                        endOfStream = true;
+                        this.push(null);
+                        callback();
+                    }, 2000);
+                }
+                else {
+                    endOfStream = true;
+                    this.push(null);
+                    callback();
+                }
+            }
+        });
+    }
+    /* close node */
+    async close() {
+        /* close stream */
+        if (this.stream !== null) {
+            this.stream.destroy();
+            this.stream = null;
+        }
+        /* close VAD */
+        if (this.vad !== null) {
+            await this.vad.flush();
+            this.vad.destroy();
+            this.vad = null;
+        }
+    }
+}
+exports.default = SpeechFlowNodeVAD;
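Editor's note: one detail of the `write()` handler above is worth isolating. Incoming PCM-S16LE buffers rarely resample to an exact multiple of the 512-sample VAD frame size, so the node carries the remainder over to the next chunk. A stripped-down TypeScript sketch of that framing step (names simplified, not the shipped code):

```
/* Stripped-down sketch of the 512-sample framing with carry-over. */
const SAMPLES_PER_FRAME = 512

let carry = new Float32Array(0)

function emitFrames (incoming: Float32Array, emit: (frame: Float32Array) => void): void {
    /* prepend whatever was left over from the previous chunk */
    const data = new Float32Array(carry.length + incoming.length)
    data.set(carry)
    data.set(incoming, carry.length)

    /* emit as many full VAD-sized frames as possible */
    const full = Math.trunc(data.length / SAMPLES_PER_FRAME)
    for (let i = 0; i < full; i++)
        emit(data.slice(i * SAMPLES_PER_FRAME, (i + 1) * SAMPLES_PER_FRAME))

    /* remember the incomplete tail for the next chunk */
    carry = data.slice(full * SAMPLES_PER_FRAME)
}
```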
package/dst/speechflow-node-t2a-kokoro.d.ts
ADDED
@@ -0,0 +1,13 @@
+import SpeechFlowNode from "./speechflow-node";
+export default class SpeechFlowNodeKokoro extends SpeechFlowNode {
+    static name: string;
+    private kokoro;
+    private static speexInitialized;
+    constructor(id: string, cfg: {
+        [id: string]: any;
+    }, opts: {
+        [id: string]: any;
+    }, args: any[]);
+    open(): Promise<void>;
+    close(): Promise<void>;
+}
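Editor's note: the kokoro node's voice names (Aoede, Heart, Puck, Fenrir) and English-only notice match the voices of the Kokoro-82M ONNX model, so the new `speechflow-node-t2a-kokoro.ts` presumably drives a local Kokoro model. Purely as an assumption-laden sketch of what local Kokoro synthesis looks like with the kokoro-js package (the package name, model id, and voice id here are guesses and are not taken from this diff):

```
import { KokoroTTS } from "kokoro-js"   /* assumption: kokoro-js, not necessarily what SpeechFlow uses */

/* Hedged sketch: synthesize one English sentence with a local Kokoro ONNX model. */
async function speak (text: string): Promise<void> {
    const tts   = await KokoroTTS.from_pretrained("onnx-community/Kokoro-82M-v1.0-ONNX", { dtype: "q8" })
    const audio = await tts.generate(text, { voice: "af_aoede" })   /* "Aoede" in the README's voice list */
    await audio.save("narration.wav")
}
```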