speechflow 0.9.4 → 0.9.7

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. package/CHANGELOG.md +19 -0
  2. package/README.md +227 -54
  3. package/dst/speechflow-node-a2a-ffmpeg.d.ts +13 -0
  4. package/dst/speechflow-node-a2a-ffmpeg.js +152 -0
  5. package/dst/speechflow-node-a2a-wav.d.ts +11 -0
  6. package/dst/speechflow-node-a2a-wav.js +170 -0
  7. package/dst/speechflow-node-a2t-deepgram.d.ts +12 -0
  8. package/dst/speechflow-node-a2t-deepgram.js +220 -0
  9. package/dst/speechflow-node-deepgram.d.ts +3 -1
  10. package/dst/speechflow-node-deepgram.js +86 -22
  11. package/dst/speechflow-node-deepl.d.ts +3 -1
  12. package/dst/speechflow-node-deepl.js +25 -20
  13. package/dst/speechflow-node-device.d.ts +3 -1
  14. package/dst/speechflow-node-device.js +53 -2
  15. package/dst/speechflow-node-elevenlabs.d.ts +4 -1
  16. package/dst/speechflow-node-elevenlabs.js +88 -49
  17. package/dst/speechflow-node-ffmpeg.d.ts +3 -1
  18. package/dst/speechflow-node-ffmpeg.js +42 -4
  19. package/dst/speechflow-node-file.d.ts +3 -1
  20. package/dst/speechflow-node-file.js +84 -13
  21. package/dst/speechflow-node-format.d.ts +11 -0
  22. package/dst/speechflow-node-format.js +80 -0
  23. package/dst/speechflow-node-gemma.d.ts +3 -1
  24. package/dst/speechflow-node-gemma.js +84 -23
  25. package/dst/speechflow-node-mqtt.d.ts +13 -0
  26. package/dst/speechflow-node-mqtt.js +181 -0
  27. package/dst/speechflow-node-opus.d.ts +12 -0
  28. package/dst/speechflow-node-opus.js +135 -0
  29. package/dst/speechflow-node-subtitle.d.ts +12 -0
  30. package/dst/speechflow-node-subtitle.js +96 -0
  31. package/dst/speechflow-node-t2a-elevenlabs.d.ts +13 -0
  32. package/dst/speechflow-node-t2a-elevenlabs.js +182 -0
  33. package/dst/speechflow-node-t2t-deepl.d.ts +12 -0
  34. package/dst/speechflow-node-t2t-deepl.js +133 -0
  35. package/dst/speechflow-node-t2t-format.d.ts +11 -0
  36. package/dst/speechflow-node-t2t-format.js +80 -0
  37. package/dst/speechflow-node-t2t-gemma.d.ts +13 -0
  38. package/dst/speechflow-node-t2t-gemma.js +213 -0
  39. package/dst/speechflow-node-t2t-opus.d.ts +12 -0
  40. package/dst/speechflow-node-t2t-opus.js +135 -0
  41. package/dst/speechflow-node-t2t-subtitle.d.ts +12 -0
  42. package/dst/speechflow-node-t2t-subtitle.js +96 -0
  43. package/dst/speechflow-node-trace.d.ts +11 -0
  44. package/dst/speechflow-node-trace.js +88 -0
  45. package/dst/speechflow-node-wav.d.ts +11 -0
  46. package/dst/speechflow-node-wav.js +170 -0
  47. package/dst/speechflow-node-websocket.d.ts +3 -1
  48. package/dst/speechflow-node-websocket.js +149 -49
  49. package/dst/speechflow-node-whisper-common.d.ts +34 -0
  50. package/dst/speechflow-node-whisper-common.js +7 -0
  51. package/dst/speechflow-node-whisper-ggml.d.ts +1 -0
  52. package/dst/speechflow-node-whisper-ggml.js +97 -0
  53. package/dst/speechflow-node-whisper-onnx.d.ts +1 -0
  54. package/dst/speechflow-node-whisper-onnx.js +131 -0
  55. package/dst/speechflow-node-whisper-worker-ggml.d.ts +1 -0
  56. package/dst/speechflow-node-whisper-worker-ggml.js +97 -0
  57. package/dst/speechflow-node-whisper-worker-onnx.d.ts +1 -0
  58. package/dst/speechflow-node-whisper-worker-onnx.js +131 -0
  59. package/dst/speechflow-node-whisper-worker.d.ts +1 -0
  60. package/dst/speechflow-node-whisper-worker.js +116 -0
  61. package/dst/speechflow-node-whisper-worker2.d.ts +1 -0
  62. package/dst/speechflow-node-whisper-worker2.js +82 -0
  63. package/dst/speechflow-node-whisper.d.ts +19 -0
  64. package/dst/speechflow-node-whisper.js +604 -0
  65. package/dst/speechflow-node-x2x-trace.d.ts +11 -0
  66. package/dst/speechflow-node-x2x-trace.js +88 -0
  67. package/dst/speechflow-node-xio-device.d.ts +13 -0
  68. package/dst/speechflow-node-xio-device.js +205 -0
  69. package/dst/speechflow-node-xio-file.d.ts +11 -0
  70. package/dst/speechflow-node-xio-file.js +176 -0
  71. package/dst/speechflow-node-xio-mqtt.d.ts +13 -0
  72. package/dst/speechflow-node-xio-mqtt.js +181 -0
  73. package/dst/speechflow-node-xio-websocket.d.ts +13 -0
  74. package/dst/speechflow-node-xio-websocket.js +275 -0
  75. package/dst/speechflow-node.d.ts +25 -7
  76. package/dst/speechflow-node.js +74 -9
  77. package/dst/speechflow-utils.d.ts +23 -0
  78. package/dst/speechflow-utils.js +194 -0
  79. package/dst/speechflow.js +146 -43
  80. package/etc/biome.jsonc +12 -4
  81. package/etc/stx.conf +65 -0
  82. package/package.d/@ericedouard+vad-node-realtime+0.2.0.patch +18 -0
  83. package/package.json +49 -31
  84. package/sample.yaml +61 -23
  85. package/src/lib.d.ts +6 -1
  86. package/src/{speechflow-node-ffmpeg.ts → speechflow-node-a2a-ffmpeg.ts} +10 -4
  87. package/src/speechflow-node-a2a-wav.ts +143 -0
  88. package/src/speechflow-node-a2t-deepgram.ts +199 -0
  89. package/src/speechflow-node-t2a-elevenlabs.ts +160 -0
  90. package/src/{speechflow-node-deepl.ts → speechflow-node-t2t-deepl.ts} +36 -25
  91. package/src/speechflow-node-t2t-format.ts +85 -0
  92. package/src/{speechflow-node-gemma.ts → speechflow-node-t2t-gemma.ts} +89 -25
  93. package/src/speechflow-node-t2t-opus.ts +111 -0
  94. package/src/speechflow-node-t2t-subtitle.ts +101 -0
  95. package/src/speechflow-node-x2x-trace.ts +92 -0
  96. package/src/{speechflow-node-device.ts → speechflow-node-xio-device.ts} +25 -3
  97. package/src/speechflow-node-xio-file.ts +153 -0
  98. package/src/speechflow-node-xio-mqtt.ts +154 -0
  99. package/src/speechflow-node-xio-websocket.ts +248 -0
  100. package/src/speechflow-node.ts +78 -13
  101. package/src/speechflow-utils.ts +212 -0
  102. package/src/speechflow.ts +150 -43
  103. package/etc/nps.yaml +0 -40
  104. package/src/speechflow-node-deepgram.ts +0 -133
  105. package/src/speechflow-node-elevenlabs.ts +0 -116
  106. package/src/speechflow-node-file.ts +0 -108
  107. package/src/speechflow-node-websocket.ts +0 -179
@@ -0,0 +1,604 @@
+ "use strict";
+ /*
+ ** SpeechFlow - Speech Processing Flow Graph
+ ** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+ ** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+ */
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+ if (k2 === undefined) k2 = k;
+ var desc = Object.getOwnPropertyDescriptor(m, k);
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+ desc = { enumerable: true, get: function() { return m[k]; } };
+ }
+ Object.defineProperty(o, k2, desc);
+ }) : (function(o, m, k, k2) {
+ if (k2 === undefined) k2 = k;
+ o[k2] = m[k];
+ }));
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
+ }) : function(o, v) {
+ o["default"] = v;
+ });
+ var __importStar = (this && this.__importStar) || (function () {
+ var ownKeys = function(o) {
+ ownKeys = Object.getOwnPropertyNames || function (o) {
+ var ar = [];
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+ return ar;
+ };
+ return ownKeys(o);
+ };
+ return function (mod) {
+ if (mod && mod.__esModule) return mod;
+ var result = {};
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+ __setModuleDefault(result, mod);
+ return result;
+ };
+ })();
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ /* standard dependencies */
+ const node_os_1 = __importDefault(require("node:os"));
+ const node_path_1 = __importDefault(require("node:path"));
+ const node_events_1 = require("node:events");
+ const node_stream_1 = __importDefault(require("node:stream"));
+ const node_worker_threads_1 = require("node:worker_threads");
+ const wavefile = __importStar(require("wavefile"));
+ const vad_node_realtime_1 = require("@ericedouard/vad-node-realtime");
+ /* internal dependencies */
+ const speechflow_node_1 = __importDefault(require("./speechflow-node"));
+ /* audio stream queue pointer */
+ class AudioQueuePointer extends node_events_1.EventEmitter {
+ name;
+ queue;
+ /* internal state */
+ index = 0;
+ /* construction */
+ constructor(name, queue) {
+ super();
+ this.name = name;
+ this.queue = queue;
+ }
+ /* positioning operations */
+ maxPosition() {
+ return this.queue.elements.length;
+ }
+ position(index) {
+ if (index !== undefined) {
+ this.index = index;
+ if (this.index < 0)
+ this.index = 0;
+ else if (this.index >= this.queue.elements.length)
+ this.index = this.queue.elements.length;
+ this.emit("position", this.index);
+ }
+ return this.index;
+ }
+ walk(num) {
+ if (num > 0) {
+ for (let i = 0; i < num && this.index < this.queue.elements.length; i++)
+ this.index++;
+ this.emit("position", { start: this.index });
+ }
+ else if (num < 0) {
+ for (let i = 0; i < Math.abs(num) && this.index > 0; i++)
+ this.index--;
+ this.emit("position", { start: this.index });
+ }
+ }
+ walkForwardUntil(type) {
+ while (this.index < this.queue.elements.length
+ && this.queue.elements[this.index].type !== type)
+ this.index++;
+ this.emit("position", { start: this.index });
+ }
+ walkBackwardUntil(type) {
+ while (this.index > 0
+ && this.queue.elements[this.index].type !== type)
+ this.index--;
+ this.emit("position", { start: this.index });
+ }
+ /* search operations */
+ searchForward(type) {
+ let position = this.index;
+ while (position < this.queue.elements.length
+ && this.queue.elements[position].type !== type)
+ position++;
+ this.emit("search", { start: this.index, end: position });
+ return position;
+ }
+ searchBackward(type) {
+ let position = this.index;
+ while (position > 0
+ && this.queue.elements[position].type !== type)
+ position--;
+ this.emit("search", { start: position, end: this.index });
+ }
+ /* reading operations */
+ peek(position) {
+ if (position === undefined)
+ position = this.index;
+ else {
+ if (position < 0)
+ position = 0;
+ else if (position >= this.queue.elements.length)
+ position = this.queue.elements.length;
+ }
+ const element = this.queue.elements[position];
+ this.queue.emit("read", { start: position, end: position });
+ return element;
+ }
+ read() {
+ const element = this.queue.elements[this.index];
+ if (this.index < this.queue.elements.length)
+ this.index++;
+ this.queue.emit("read", { start: this.index - 1, end: this.index - 1 });
+ return element;
+ }
+ slice(size) {
+ let slice;
+ const start = this.index;
+ if (size !== undefined) {
+ slice = this.queue.elements.slice(this.index, size);
+ this.index += size;
+ }
+ else {
+ slice = this.queue.elements.slice(this.index);
+ this.index = this.queue.elements.length;
+ }
+ this.queue.emit("read", { start, end: this.index });
+ return slice;
+ }
+ /* writing operations */
+ append(element) {
+ this.queue.elements.push(element);
+ this.index = this.queue.elements.length;
+ this.queue.emit("write", { start: this.index - 1, end: this.index - 1 });
+ }
+ insert(element) {
+ this.queue.elements.splice(this.index++, 0, element);
+ this.queue.emit("write", { start: this.index - 1, end: this.index });
+ }
+ delete() {
+ if (this.index >= this.queue.elements.length)
+ throw new Error("cannot delete after last element");
+ this.queue.elements.splice(this.index, 1);
+ this.queue.emit("write", { start: this.index, end: this.index });
+ }
+ }
+ /* audio stream queue */
+ class AudioQueue extends node_events_1.EventEmitter {
+ elements = [];
+ pointers = new Map();
+ pointerUse(name) {
+ if (!this.pointers.has(name))
+ this.pointers.set(name, new AudioQueuePointer(name, this));
+ return this.pointers.get(name);
+ }
+ pointerDelete(name) {
+ if (!this.pointers.has(name))
+ throw new Error("pointer not exists");
+ this.pointers.delete(name);
+ }
+ trim() {
+ /* determine minimum pointer position */
+ let min = this.elements.length;
+ for (const pointer of this.pointers.values())
+ if (min > pointer.position())
+ min = pointer.position();
+ /* trim the maximum amount of first elements */
+ this.elements.splice(0, min);
+ /* shift all pointers */
+ for (const pointer of this.pointers.values())
+ pointer.position(pointer.position() - min);
+ }
+ }
+ /* transcription queue */
+ class TranscriptionQueue extends node_events_1.EventEmitter {
+ cacheDir;
+ model;
+ runtime;
+ log;
+ tasks = [];
+ timer = null;
+ busy = false;
+ worker = null;
+ constructor(cacheDir, model, runtime, log) {
+ super();
+ this.cacheDir = cacheDir;
+ this.model = model;
+ this.runtime = runtime;
+ this.log = log;
+ if (this.runtime === "auto") {
+ const platform = node_os_1.default.platform();
+ if (platform === "win32")
+ this.runtime = "onnx";
+ else if (platform === "darwin")
+ this.runtime = "ggml";
+ else
+ this.runtime = "onnx";
+ }
+ }
+ enqueue(task) {
+ /* destroy previous tasks of same id */
+ while (this.tasks.length > 0
+ && this.tasks[this.tasks.length - 1].id === task.id) {
+ this.log(`dropping existing queued request for ${task.type} transcription task #${task.id}`);
+ this.tasks.splice(this.tasks.length - 1, 1);
+ }
+ /* add task */
+ this.log(`enqueue request for ${task.type} transcription task #${task.id}`);
+ this.tasks.push(task);
+ this.dequeue();
+ }
+ dequeue() {
+ if (this.tasks.length === 0)
+ return;
+ if (!this.busy && this.worker !== null) {
+ this.busy = true;
+ const task = this.tasks.shift();
+ if (task !== undefined) {
+ this.log(`dequeue and send request for ${task.type} transcription task #${task.id}`);
+ this.worker.postMessage({ type: "task-request", task });
+ }
+ }
+ }
+ async start() {
+ this.log("start transcription service worker: BEGIN");
+ if (this.runtime === "ggml") {
+ const basedir = node_path_1.default.dirname(await require.resolve("smart-whisper/package.json"));
+ process.env.GGML_METAL_PATH_RESOURCES = node_path_1.default.resolve(basedir, "whisper.cpp/ggml/src");
+ console.log(process.env.GGML_METAL_PATH_RESOURCES);
+ }
+ const script = node_path_1.default.resolve(__dirname, `speechflow-node-whisper-${this.runtime}.js`);
+ this.worker = new node_worker_threads_1.Worker(script, { env: { ...process.env } });
+ this.worker.postMessage({
+ type: "open",
+ cacheDir: this.cacheDir,
+ model: this.model
+ });
+ this.worker.on("message", (response) => {
+ if (response.type === "log")
+ this.log(response.message);
+ });
+ await new Promise((resolve, reject) => {
+ let cb = null;
+ const cleanResolve = () => {
+ this.worker.off("message", cb);
+ resolve();
+ };
+ const cleanReject = (error) => {
+ this.worker.off("message", cb);
+ reject(error);
+ };
+ cb = (response) => {
+ if (response.type === "ok")
+ cleanResolve();
+ else if (response.type === "error")
+ cleanReject(new Error(response.message));
+ };
+ this.worker.on("message", cb);
+ });
+ this.worker.on("message", (response) => {
+ this.busy = false;
+ if (response.type === "error")
+ this.emit("error", response.message);
+ else if (response.type === "task-response") {
+ console.log(`receive response for task #${response.task.id}`);
+ this.emit("task", response.task);
+ }
+ this.dequeue();
+ });
+ if (this.timer !== null)
+ clearTimeout(this.timer);
+ this.timer = setInterval(() => {
+ this.dequeue();
+ }, 10);
+ this.log("start transcription service worker: END");
+ }
+ async stop() {
+ this.log("stop transcription service worker: BEGIN");
+ if (this.timer !== null) {
+ clearTimeout(this.timer);
+ this.timer = null;
+ }
+ if (this.worker !== null) {
+ this.worker.postMessage({ type: "close" });
+ await this.worker.terminate();
+ this.worker = null;
+ }
+ this.busy = false;
+ this.log("stop transcription service worker: END");
+ }
+ }
+ /* SpeechFlow node for Whisper speech-to-text conversion */
+ class SpeechFlowNodeWhisper extends speechflow_node_1.default {
+ /* declare official node name */
+ static name = "whisper";
+ /* OpenAI Whisper https://github.com/openai/whisper/ */
+ models = {
+ "v1-tiny": { version: "v1", released: "2022-09", paramsM: 39, vramGB: 1, speed: 10 },
+ "v1-base": { version: "v1", released: "2022-09", paramsM: 74, vramGB: 1, speed: 7 },
+ "v1-small": { version: "v1", released: "2022-09", paramsM: 244, vramGB: 2, speed: 4 },
+ "v1-medium": { version: "v1", released: "2022-09", paramsM: 769, vramGB: 5, speed: 2 },
+ "v2-large": { version: "v2", released: "2022-12", paramsM: 1550, vramGB: 10, speed: 1 },
+ "v3-large": { version: "v3", released: "2023-11", paramsM: 1550, vramGB: 10, speed: 1 },
+ "v3-large-turbo": { version: "v3", released: "2024-09", paramsM: 798, vramGB: 6, speed: 8 }
+ };
+ /* internal state */
+ transcriber = null;
+ vad = null;
+ queue = new AudioQueue();
+ queueRecv = this.queue.pointerUse("recv");
+ queueVAD = this.queue.pointerUse("vad");
+ queueSTT = this.queue.pointerUse("stt");
+ tqueue = null;
+ /* construct node */
+ constructor(id, cfg, opts, args) {
+ super(id, cfg, opts, args);
+ /* declare node configuration parameters */
+ const validModels = new RegExp(`^(?:${Object.keys(this.models).join("|")})$`);
+ this.configure({
+ language: { type: "string", val: "en", pos: 0, match: /^(?:en|de)$/ },
+ model: { type: "string", val: "v3-large-turbo", pos: 1, match: validModels },
+ runtime: { type: "string", val: "auto", pos: 2, match: /^(?:auto|onnx|ggml)$/ }
+ });
+ /* sanity check model */
+ if (this.models[this.params.model] === undefined)
+ throw new Error(`invalid OpenAI Whisper model "${this.params.model}`);
+ /* declare node input/output format */
+ this.input = "audio";
+ this.output = "text";
+ }
+ /* open node */
+ async open() {
+ /* sanity check situation */
+ if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
+ throw new Error("Whisper node currently supports PCM-S16LE audio only");
+ /* pass-through logging */
+ const log = (level, msg) => {
+ this.log(level, msg);
+ };
+ /* create queue for results */
+ const queueOutput = new node_events_1.EventEmitter();
+ /* internal processing constants */
+ const sampleRateTarget = 16000;
+ const samplesPerVADFrame = 512; /* required for VAD v5 */
+ const minFramesPerSecond = Math.trunc(sampleRateTarget / samplesPerVADFrame) + 1;
+ /* initialize the transcription pipeline */
+ const model = this.models[this.params.model];
+ this.log("info", `loading OpenAI Whisper ${this.params.model} ` +
+ `(version: ${model.version}, released: ${model.released}, parameters: ${model.paramsM})`);
+ /* transcribe a chunk of audio */
+ this.tqueue = new TranscriptionQueue(this.config.cacheDir, this.params.model, this.params.runtime, (msg) => { this.log("info", msg); });
+ await this.tqueue.start();
+ this.tqueue.on("task", async (task) => {
+ // if (task.type === "intermediate")
+ // return
+ this.log("info", `received ${task.type} transcription #${task.id}: "${task.text}"`);
+ // DEBUG
+ // const wav = new wavefile.WaveFile()
+ // wav.fromScratch(1, sampleRateTarget, "32f", task.audio)
+ // const data = wav.toBuffer()
+ // fs.writeFileSync(`chunk-out-${n++}.wav`, data)
+ });
+ /* track audio queue element changes */
+ let speechActive = false;
+ let speechStart = -1;
+ let speechEnd = -1;
+ let speechMinSeconds = 2;
+ this.queue.on("write", () => {
+ if (!speechActive) {
+ const position = this.queueSTT.searchForward("speech-start");
+ const element = this.queueSTT.peek(position);
+ if (element !== undefined && element.type === "speech-start") {
+ this.queueSTT.position(position + 1);
+ speechActive = true;
+ speechStart = this.queueSTT.position();
+ speechEnd = speechStart;
+ speechMinSeconds = 2;
+ }
+ }
+ else {
+ speechEnd = this.queueSTT.searchForward("speech-end");
+ /* determine number of speech and fill frames */
+ let framesSpeech = 0;
+ for (let f = speechStart; f < speechEnd; f++) {
+ const element = this.queueSTT.peek(f);
+ if (element.type === "audio-frame")
+ framesSpeech++;
+ }
+ let framesFilled = minFramesPerSecond - framesSpeech;
+ if (framesFilled < 0)
+ framesFilled = 0;
+ /* assemble all speech and fill frames */
+ const assembleFrames = () => {
+ const speech = new Float32Array((framesSpeech + framesFilled) * samplesPerVADFrame);
+ let i = 0;
+ for (let f = speechStart; f < speechEnd; f++) {
+ const element = this.queueSTT.peek(f);
+ if (element.type === "audio-frame")
+ speech.set(element.data, samplesPerVADFrame * i++);
+ }
+ if (framesFilled > 0)
+ speech.fill(0.0, i * samplesPerVADFrame, (i + framesFilled) * samplesPerVADFrame);
+ // DEBUG
+ // const wav = new wavefile.WaveFile()
+ // wav.fromScratch(1, sampleRateTarget, "32f", speech)
+ // const data = wav.toBuffer()
+ // fs.writeFileSync(`chunk-speech-${m++}.wav`, data)
+ return speech;
+ };
+ if (speechEnd === this.queueSTT.maxPosition()) {
+ /* intermediate transcription */
+ const duration = ((framesSpeech + framesFilled) * samplesPerVADFrame) / sampleRateTarget;
+ if (duration >= speechMinSeconds) {
+ /* intermediate transcription of at least the next required minimum seconds */
+ const samples = assembleFrames();
+ this.log("info", `trigger intermediate transcription (duration: ${duration.toFixed(1)}s)`);
+ this.tqueue.enqueue({ id: speechStart, type: "intermediate", audio: samples, language: this.params.language });
+ speechMinSeconds++;
+ }
+ }
+ else {
+ /* final transcription */
+ const duration = ((framesSpeech + framesFilled) * samplesPerVADFrame) / sampleRateTarget;
+ if (duration >= 1.0) {
+ const samples = assembleFrames();
+ this.log("info", `trigger final transcription (duration: ${duration.toFixed(1)}s)`);
+ this.tqueue.enqueue({ id: speechStart, type: "final", audio: samples, language: this.params.language });
+ this.queueSTT.position(speechEnd + 1);
+ }
+ else
+ this.log("info", `skipping final transcription -- too short (duration: ${duration.toFixed(1)}s)`);
+ speechActive = false;
+ }
+ }
+ });
+ /* Voice Activity Detection (VAD) */
+ this.vad = await vad_node_realtime_1.RealTimeVAD.new({
+ onSpeechStart: () => {
+ this.log("info", "VAD: speech start");
+ this.queueVAD.insert({ type: "speech-start" });
+ },
+ onSpeechEnd: (audio) => {
+ this.log("info", `VAD: speech end (samples: ${audio.length})`);
+ this.queueVAD.insert({ type: "speech-end", short: false });
+ },
+ onVADMisfire: () => {
+ this.log("info", "VAD: speech end (segment too short)");
+ this.queueVAD.insert({ type: "speech-end", short: true });
+ },
+ onFrameProcessed: () => {
+ this.queueVAD.walk(+1);
+ },
+ sampleRate: 16000,
+ model: "v5",
+ frameSamples: samplesPerVADFrame, /* (= 32ms: 512 frameSamples / 16000 sampleSize) */
+ positiveSpeechThreshold: 0.50,
+ negativeSpeechThreshold: 0.35,
+ minSpeechFrames: 4, /* (= 128ms: 4 x 512 frameSamples) */
+ redemptionFrames: 8, /* (= 256ms: 8 x 512 frameSamples) */
+ preSpeechPadFrames: 1, /* (= 32ms: 1 x 512 frameSamples) */
+ });
+ this.vad.start();
+ /* provide Duplex stream and internally attach to Ollama API */
+ const vad = this.vad;
+ const cfg = this.config;
+ const queueRecv = this.queueRecv;
+ let carrySamples = new Float32Array();
+ let endOfStream = false;
+ this.stream = new node_stream_1.default.Duplex({
+ writableObjectMode: true,
+ readableObjectMode: true,
+ decodeStrings: false,
+ /* receive audio samples */
+ write(chunk, encoding, callback) {
+ if (!Buffer.isBuffer(chunk))
+ callback(new Error("expected audio input as Buffer chunks"));
+ else if (chunk.byteLength === 0)
+ callback();
+ else {
+ /* convert audio samples from PCM/I16/48KHz to PCM/F32/16KHz */
+ const bufferToInt16Array = (buf) => {
+ const dataView = new DataView(buf.buffer);
+ const result = new Int16Array(buf.length / 2);
+ for (let i = 0; i < result.length; i++)
+ result[i] = dataView.getInt16(i * 2, cfg.audioLittleEndian);
+ return result;
+ };
+ const wav = new wavefile.WaveFile();
+ wav.fromScratch(cfg.audioChannels, cfg.audioSampleRate, String(cfg.audioBitDepth), bufferToInt16Array(chunk));
+ wav.toBitDepth("32f");
+ wav.toSampleRate(16000, { method: "cubic" });
+ let data = wav.getSamples(false, Float32Array);
+ /* merge previous carry samples */
+ if (carrySamples.length > 0) {
+ const merged = new Float32Array(carrySamples.length + data.length);
+ merged.set(carrySamples);
+ merged.set(data, carrySamples.length);
+ data = merged;
+ carrySamples = new Float32Array();
+ }
+ /* DEBUG */
+ // const wav2 = new wavefile.WaveFile()
+ // wav2.fromScratch(1, sampleRateTarget, "32f", data)
+ // const data2 = wav.toBuffer()
+ // fs.writeFileSync(`chunk-in-${k++}.wav`, data2)
+ /* queue audio samples as individual VAD-sized frames
+ and in parallel send it into the Voice Activity Detection (VAD) */
+ const chunks = Math.trunc(data.length / samplesPerVADFrame);
+ for (let i = 0; i < chunks; i++) {
+ const frame = data.slice(i * samplesPerVADFrame, (i + 1) * samplesPerVADFrame);
+ queueRecv.append({ type: "audio-frame", data: frame });
+ vad.processAudio(frame);
+ }
+ /* remember new carry samples */
+ const bulkLen = chunks * samplesPerVADFrame;
+ carrySamples = data.slice(bulkLen);
+ callback();
+ }
+ },
+ /* send transcription texts */
+ read(size) {
+ if (endOfStream)
+ this.push(null);
+ else {
+ queueOutput.once("text", (text) => {
+ log("info", `Whisper: receive data (${text.length} bytes)`);
+ this.push(text, cfg.textEncoding);
+ });
+ }
+ },
+ /* react on end of input */
+ final(callback) {
+ if (carrySamples.length > 0) {
+ /* flush pending audio samples */
+ if (carrySamples.length < samplesPerVADFrame) {
+ const merged = new Float32Array(samplesPerVADFrame);
+ merged.set(carrySamples);
+ merged.fill(0.0, carrySamples.length, samplesPerVADFrame);
+ carrySamples = merged;
+ }
+ queueRecv.append({ type: "audio-frame", data: carrySamples });
+ vad.processAudio(carrySamples);
+ /* give the processing a chance to still process the remaining samples */
+ setTimeout(() => {
+ endOfStream = true;
+ this.push(null);
+ callback();
+ }, 2000);
+ }
+ else {
+ endOfStream = true;
+ this.push(null);
+ callback();
+ }
+ }
+ });
+ }
+ /* close node */
+ async close() {
+ /* close stream */
+ if (this.stream !== null) {
+ this.stream.destroy();
+ this.stream = null;
+ }
+ /* close VAD */
+ if (this.vad !== null) {
+ await this.vad.flush();
+ this.vad.destroy();
+ this.vad = null;
+ }
+ /* close transcription queue */
+ if (this.tqueue !== null) {
+ await this.tqueue.stop();
+ this.tqueue = null;
+ }
+ }
+ }
+ exports.default = SpeechFlowNodeWhisper;
@@ -0,0 +1,11 @@
+ import SpeechFlowNode from "./speechflow-node";
+ export default class SpeechFlowNodeTrace extends SpeechFlowNode {
+ static name: string;
+ constructor(id: string, cfg: {
+ [id: string]: any;
+ }, opts: {
+ [id: string]: any;
+ }, args: any[]);
+ open(): Promise<void>;
+ close(): Promise<void>;
+ }