speechflow 0.9.7 → 0.9.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +15 -0
- package/LICENSE.txt +674 -0
- package/README.md +67 -18
- package/dst/speechflow-node-a2a-vad.d.ts +16 -0
- package/dst/speechflow-node-a2a-vad.js +431 -0
- package/dst/speechflow-node-t2a-kokoro.d.ts +13 -0
- package/dst/speechflow-node-t2a-kokoro.js +147 -0
- package/dst/speechflow-node-t2t-gemma.js +23 -3
- package/dst/speechflow-node-t2t-ollama.d.ts +13 -0
- package/dst/speechflow-node-t2t-ollama.js +245 -0
- package/dst/speechflow-node-t2t-openai.d.ts +13 -0
- package/dst/speechflow-node-t2t-openai.js +225 -0
- package/dst/speechflow-node-t2t-opus.js +1 -1
- package/dst/speechflow-node-t2t-transformers.d.ts +14 -0
- package/dst/speechflow-node-t2t-transformers.js +260 -0
- package/dst/speechflow-node-x2x-trace.js +2 -2
- package/dst/speechflow.js +86 -40
- package/etc/speechflow.bat +6 -0
- package/etc/speechflow.sh +5 -0
- package/{sample.yaml → etc/speechflow.yaml} +9 -2
- package/etc/stx.conf +1 -1
- package/package.json +7 -6
- package/src/speechflow-node-t2a-kokoro.ts +160 -0
- package/src/{speechflow-node-t2t-gemma.ts → speechflow-node-t2t-ollama.ts} +44 -10
- package/src/speechflow-node-t2t-openai.ts +246 -0
- package/src/speechflow-node-t2t-transformers.ts +244 -0
- package/src/speechflow-node-x2x-trace.ts +2 -2
- package/src/speechflow.ts +86 -40
- package/src/speechflow-node-t2t-opus.ts +0 -111
package/dst/speechflow.js
CHANGED
|
@@ -17,6 +17,7 @@ const node_events_1 = require("node:events");
|
|
|
17
17
|
const luxon_1 = require("luxon");
|
|
18
18
|
const cli_io_1 = __importDefault(require("cli-io"));
|
|
19
19
|
const yargs_1 = __importDefault(require("yargs"));
|
|
20
|
+
const helpers_1 = require("yargs/helpers");
|
|
20
21
|
const js_yaml_1 = __importDefault(require("js-yaml"));
|
|
21
22
|
const flowlink_1 = __importDefault(require("flowlink"));
|
|
22
23
|
const object_path_1 = __importDefault(require("object-path"));
|
|
@@ -33,6 +34,7 @@ let cli = null;
|
|
|
33
34
|
dataDirAutoCreate: true
|
|
34
35
|
});
|
|
35
36
|
/* parse command-line arguments */
|
|
37
|
+
const coerce = (arg) => Array.isArray(arg) ? arg[arg.length - 1] : arg;
|
|
36
38
|
const args = await (0, yargs_1.default)()
|
|
37
39
|
/* eslint @stylistic/indent: off */
|
|
38
40
|
.usage("Usage: $0 " +
|
|
@@ -44,27 +46,68 @@ let cli = null;
|
|
|
44
46
|
"[-f|--file <file>] " +
|
|
45
47
|
"[-c|--config <id>@<yaml-config-file>] " +
|
|
46
48
|
"[<argument> [...]]")
|
|
47
|
-
.
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
.
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
49
|
+
.option("V", {
|
|
50
|
+
alias: "version",
|
|
51
|
+
type: "boolean",
|
|
52
|
+
array: false,
|
|
53
|
+
coerce,
|
|
54
|
+
default: false,
|
|
55
|
+
describe: "show program version information"
|
|
56
|
+
})
|
|
57
|
+
.option("v", {
|
|
58
|
+
alias: "log-level",
|
|
59
|
+
type: "string",
|
|
60
|
+
array: false,
|
|
61
|
+
coerce,
|
|
62
|
+
nargs: 1,
|
|
63
|
+
default: "warning",
|
|
64
|
+
describe: "level for verbose logging ('none', 'error', 'warning', 'info', 'debug')"
|
|
65
|
+
})
|
|
66
|
+
.option("C", {
|
|
67
|
+
alias: "cache",
|
|
68
|
+
type: "string",
|
|
69
|
+
array: false,
|
|
70
|
+
coerce,
|
|
71
|
+
nargs: 1,
|
|
72
|
+
default: node_path_1.default.join(dataDir, "cache"),
|
|
73
|
+
describe: "directory for cached files (primarily AI model files)"
|
|
74
|
+
})
|
|
75
|
+
.option("e", {
|
|
76
|
+
alias: "expression",
|
|
77
|
+
type: "string",
|
|
78
|
+
array: false,
|
|
79
|
+
coerce,
|
|
80
|
+
nargs: 1,
|
|
81
|
+
default: "",
|
|
82
|
+
describe: "FlowLink expression string"
|
|
83
|
+
})
|
|
84
|
+
.option("f", {
|
|
85
|
+
alias: "file",
|
|
86
|
+
type: "string",
|
|
87
|
+
array: false,
|
|
88
|
+
coerce,
|
|
89
|
+
nargs: 1,
|
|
90
|
+
default: "",
|
|
91
|
+
describe: "FlowLink expression file"
|
|
92
|
+
})
|
|
93
|
+
.option("c", {
|
|
94
|
+
alias: "config",
|
|
95
|
+
type: "string",
|
|
96
|
+
array: false,
|
|
97
|
+
coerce,
|
|
98
|
+
nargs: 1,
|
|
99
|
+
default: "",
|
|
100
|
+
describe: "FlowLink expression reference into YAML file (in format <id>@<file>)"
|
|
101
|
+
})
|
|
102
|
+
.help("h", "show usage help")
|
|
103
|
+
.alias("h", "help")
|
|
104
|
+
.showHelpOnFail(true)
|
|
61
105
|
.version(false)
|
|
62
106
|
.strict()
|
|
63
|
-
.showHelpOnFail(true)
|
|
64
107
|
.demand(0)
|
|
65
|
-
.parse(process.argv
|
|
108
|
+
.parse((0, helpers_1.hideBin)(process.argv));
|
|
66
109
|
/* short-circuit version request */
|
|
67
|
-
if (args.
|
|
110
|
+
if (args.V) {
|
|
68
111
|
process.stderr.write(`SpeechFlow ${package_json_1.default["x-stdver"]} (${package_json_1.default["x-release"]}) <${package_json_1.default.homepage}>\n`);
|
|
69
112
|
process.stderr.write(`${package_json_1.default.description}\n`);
|
|
70
113
|
process.stderr.write(`Copyright (c) 2024-2025 ${package_json_1.default.author.name} <${package_json_1.default.author.url}>\n`);
|
|
@@ -74,7 +117,7 @@ let cli = null;
|
|
|
74
117
|
/* establish CLI environment */
|
|
75
118
|
cli = new cli_io_1.default({
|
|
76
119
|
encoding: "utf8",
|
|
77
|
-
logLevel: args.
|
|
120
|
+
logLevel: args.v,
|
|
78
121
|
logTime: true,
|
|
79
122
|
logPrefix: package_json_1.default.name
|
|
80
123
|
});
|
|
@@ -100,30 +143,30 @@ let cli = null;
|
|
|
100
143
|
});
|
|
101
144
|
/* sanity check usage */
|
|
102
145
|
let n = 0;
|
|
103
|
-
if (typeof args.
|
|
146
|
+
if (typeof args.e === "string" && args.e !== "")
|
|
104
147
|
n++;
|
|
105
|
-
if (typeof args.
|
|
148
|
+
if (typeof args.f === "string" && args.f !== "")
|
|
106
149
|
n++;
|
|
107
|
-
if (typeof args.
|
|
150
|
+
if (typeof args.c === "string" && args.c !== "")
|
|
108
151
|
n++;
|
|
109
152
|
if (n !== 1)
|
|
110
153
|
throw new Error("cannot use more than one FlowLink specification source (either option -e, -f or -c)");
|
|
111
154
|
/* read configuration */
|
|
112
155
|
let config = "";
|
|
113
|
-
if (typeof args.
|
|
114
|
-
config = args.
|
|
115
|
-
else if (typeof args.
|
|
116
|
-
config = await cli.input(args.
|
|
117
|
-
else if (typeof args.
|
|
118
|
-
const m = args.
|
|
156
|
+
if (typeof args.e === "string" && args.e !== "")
|
|
157
|
+
config = args.e;
|
|
158
|
+
else if (typeof args.f === "string" && args.f !== "")
|
|
159
|
+
config = await cli.input(args.f, { encoding: "utf8" });
|
|
160
|
+
else if (typeof args.c === "string" && args.c !== "") {
|
|
161
|
+
const m = args.c.match(/^(.+?)@(.+)$/);
|
|
119
162
|
if (m === null)
|
|
120
|
-
throw new Error("invalid configuration file specification (expected \"<
|
|
121
|
-
const [,
|
|
163
|
+
throw new Error("invalid configuration file specification (expected \"<id>@<yaml-config-file>\")");
|
|
164
|
+
const [, id, file] = m;
|
|
122
165
|
const yaml = await cli.input(file, { encoding: "utf8" });
|
|
123
166
|
const obj = js_yaml_1.default.load(yaml);
|
|
124
|
-
if (obj[
|
|
125
|
-
throw new Error(`no such
|
|
126
|
-
config = obj[
|
|
167
|
+
if (obj[id] === undefined)
|
|
168
|
+
throw new Error(`no such id "${id}" found in configuration file`);
|
|
169
|
+
config = obj[id];
|
|
127
170
|
}
|
|
128
171
|
/* track the available SpeechFlow nodes */
|
|
129
172
|
const nodes = {};
|
|
@@ -133,11 +176,14 @@ let cli = null;
|
|
|
133
176
|
"./speechflow-node-a2a-wav.js",
|
|
134
177
|
"./speechflow-node-a2t-deepgram.js",
|
|
135
178
|
"./speechflow-node-t2a-elevenlabs.js",
|
|
179
|
+
"./speechflow-node-t2a-kokoro.js",
|
|
136
180
|
"./speechflow-node-t2t-deepl.js",
|
|
137
|
-
"./speechflow-node-t2t-
|
|
138
|
-
"./speechflow-node-t2t-
|
|
181
|
+
"./speechflow-node-t2t-openai.js",
|
|
182
|
+
"./speechflow-node-t2t-ollama.js",
|
|
183
|
+
"./speechflow-node-t2t-transformers.js",
|
|
139
184
|
"./speechflow-node-t2t-opus.js",
|
|
140
185
|
"./speechflow-node-t2t-subtitle.js",
|
|
186
|
+
"./speechflow-node-t2t-format.js",
|
|
141
187
|
"./speechflow-node-x2x-trace.js",
|
|
142
188
|
"./speechflow-node-xio-device.js",
|
|
143
189
|
"./speechflow-node-xio-file.js",
|
|
@@ -186,7 +232,7 @@ let cli = null;
|
|
|
186
232
|
audioLittleEndian: true,
|
|
187
233
|
audioSampleRate: 48000,
|
|
188
234
|
textEncoding: "utf8",
|
|
189
|
-
cacheDir: args.
|
|
235
|
+
cacheDir: args.C
|
|
190
236
|
};
|
|
191
237
|
let ast;
|
|
192
238
|
try {
|
|
@@ -240,9 +286,9 @@ let cli = null;
|
|
|
240
286
|
}
|
|
241
287
|
catch (err) {
|
|
242
288
|
if (err instanceof Error && err.name === "FlowLinkError")
|
|
243
|
-
cli.log("error", `failed to materialize SpeechFlow configuration: ${err.toString()}
|
|
289
|
+
cli.log("error", `failed to materialize SpeechFlow configuration: ${err.toString()}`);
|
|
244
290
|
else if (err instanceof Error)
|
|
245
|
-
cli.log("error", `failed to materialize SpeechFlow configuration: ${err.message}
|
|
291
|
+
cli.log("error", `failed to materialize SpeechFlow configuration: ${err.message}`);
|
|
246
292
|
else
|
|
247
293
|
cli.log("error", "failed to materialize SpeechFlow configuration: internal error");
|
|
248
294
|
process.exit(1);
|
|
@@ -332,7 +378,7 @@ let cli = null;
|
|
|
332
378
|
});
|
|
333
379
|
}
|
|
334
380
|
/* start of internal stream processing */
|
|
335
|
-
cli.log("info", "everything established -- stream processing in SpeechFlow graph starts");
|
|
381
|
+
cli.log("info", "**** everything established -- stream processing in SpeechFlow graph starts ****");
|
|
336
382
|
/* gracefully shutdown process */
|
|
337
383
|
let shuttingDown = false;
|
|
338
384
|
const shutdown = async (signal) => {
|
|
@@ -340,9 +386,9 @@ let cli = null;
|
|
|
340
386
|
return;
|
|
341
387
|
shuttingDown = true;
|
|
342
388
|
if (signal === "finished")
|
|
343
|
-
cli.log("info", "streams of all nodes finished -- shutting down service");
|
|
389
|
+
cli.log("info", "**** streams of all nodes finished -- shutting down service ****");
|
|
344
390
|
else
|
|
345
|
-
cli.log("warning",
|
|
391
|
+
cli.log("warning", `**** received signal ${signal} -- shutting down service ****`);
|
|
346
392
|
/* graph processing: PASS 1: disconnect node streams */
|
|
347
393
|
for (const node of graphNodes) {
|
|
348
394
|
if (node.stream === null) {
|
|
@@ -17,8 +17,8 @@ pass-through: |
|
|
|
17
17
|
device(device: "wasapi:VoiceMeeter VAIO3 Input", mode: "w")
|
|
18
18
|
}
|
|
19
19
|
|
|
20
|
-
# Generate text file with German
|
|
21
|
-
|
|
20
|
+
# Generate text file with German transcription of MP3 audio file
|
|
21
|
+
transcription: |
|
|
22
22
|
file(path: argv.0, mode: "r", type: "audio") |
|
|
23
23
|
ffmpeg(src: "mp3", dst: "pcm") |
|
|
24
24
|
deepgram(language: "de", key: env.SPEECHFLOW_KEY_DEEPGRAM) |
|
|
@@ -39,6 +39,13 @@ translation: |
|
|
|
39
39
|
deepl(src: "de", dst: "en") |
|
|
40
40
|
file(path: "-", mode: "w", type: "text")
|
|
41
41
|
|
|
42
|
+
# Generate audio file with English voice for a text file
|
|
43
|
+
speaking: |
|
|
44
|
+
file(path: argv.0, mode: "r", type: "text") |
|
|
45
|
+
kokoro(language: "en") |
|
|
46
|
+
wav(mode: "encode") |
|
|
47
|
+
file(path: argv.1, mode: "w", type: "audio")
|
|
48
|
+
|
|
42
49
|
# Real-time studio translation from German to English,
|
|
43
50
|
# including the capturing of all involved inputs and outputs:
|
|
44
51
|
studio: |
|
package/etc/stx.conf
CHANGED
package/package.json
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "speechflow",
|
|
3
|
-
"version": "0.9.
|
|
4
|
-
"x-stdver": "0.9.
|
|
5
|
-
"x-release": "2025-07-
|
|
3
|
+
"version": "0.9.9",
|
|
4
|
+
"x-stdver": "0.9.9-EA",
|
|
5
|
+
"x-release": "2025-07-13",
|
|
6
6
|
"homepage": "https://github.com/rse/speechflow",
|
|
7
7
|
"description": "Speech Processing Flow Graph",
|
|
8
8
|
"license": "GPL-3.0-only",
|
|
@@ -35,6 +35,7 @@
|
|
|
35
35
|
"utf-8-validate": "6.0.5",
|
|
36
36
|
"@opensumi/reconnecting-websocket": "4.4.0",
|
|
37
37
|
"ollama": "0.5.16",
|
|
38
|
+
"openai": "5.9.0",
|
|
38
39
|
"@rse/ffmpeg": "1.4.2",
|
|
39
40
|
"ffmpeg-stream": "1.0.1",
|
|
40
41
|
"installed-packages": "1.0.13",
|
|
@@ -45,10 +46,10 @@
|
|
|
45
46
|
"pure-uuid": "1.8.1",
|
|
46
47
|
"wavefile": "11.0.0",
|
|
47
48
|
"@huggingface/transformers": "3.6.3",
|
|
49
|
+
"kokoro-js": "1.2.1",
|
|
48
50
|
"@ericedouard/vad-node-realtime": "0.2.0",
|
|
49
51
|
"luxon": "3.7.1",
|
|
50
|
-
"wrap-text": "1.0.10"
|
|
51
|
-
"smart-whisper": "0.8.1"
|
|
52
|
+
"wrap-text": "1.0.10"
|
|
52
53
|
},
|
|
53
54
|
"devDependencies": {
|
|
54
55
|
"eslint": "9.31.0",
|
|
@@ -84,7 +85,7 @@
|
|
|
84
85
|
"cross-env": "7.0.3"
|
|
85
86
|
},
|
|
86
87
|
"overrides": {
|
|
87
|
-
"onnxruntime-node":
|
|
88
|
+
"@huggingface/transformers": { "onnxruntime-node": "1.23.0-dev.20250703-7fc6235861" }
|
|
88
89
|
},
|
|
89
90
|
"upd": [ "!@biomejs/biome" ],
|
|
90
91
|
"engines": {
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
/*
|
|
2
|
+
** SpeechFlow - Speech Processing Flow Graph
|
|
3
|
+
** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
|
|
4
|
+
** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
/* standard dependencies */
|
|
8
|
+
import Stream from "node:stream"
|
|
9
|
+
|
|
10
|
+
/* external dependencies */
|
|
11
|
+
import { KokoroTTS } from "kokoro-js"
|
|
12
|
+
import SpeexResampler from "speex-resampler"
|
|
13
|
+
|
|
14
|
+
/* internal dependencies */
|
|
15
|
+
import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
|
|
16
|
+
|
|
17
|
+
/* SpeechFlow node for Kokoro text-to-speech conversion */
|
|
18
|
+
export default class SpeechFlowNodeKokoro extends SpeechFlowNode {
    /*  declare official node name  */
    public static name = "kokoro"

    /*  internal state  */
    private kokoro: KokoroTTS | null = null
    private static speexInitialized = false

    /*  construct node and declare its parameters (voice, language, speed)
        and its input/output formats (text in, audio out)  */
    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
        super(id, cfg, opts, args)

        /*  declare node configuration parameters  */
        this.configure({
            voice:    { type: "string", val: "Aoede", pos: 0, match: /^(?:Aoede|Heart|Puck|Fenrir)$/ },
            language: { type: "string", val: "en",    pos: 1, match: /^(?:en)$/ },
            speed:    { type: "number", val: 1.25,    pos: 2, match: (n: number) => n >= 1.0 && n <= 1.30 },
        })

        /*  declare node input/output format  */
        this.input  = "text"
        this.output = "audio"
    }

    /*  open node: load the Kokoro model (reporting download progress once
        per second), prepare the resampler and create the transform stream
        which converts text chunks into audio chunks  */
    async open () {
        /*  establish Kokoro  */
        const model = "onnx-community/Kokoro-82M-v1.0-ONNX"
        const progressState = new Map<string, number>()
        const progressCallback = (progress: any) => {
            let artifact = model
            if (typeof progress.file === "string")
                artifact += `:${progress.file}`
            let percent = 0
            if (typeof progress.loaded === "number" && typeof progress.total === "number" && progress.total > 0)
                percent = (progress.loaded / progress.total) * 100
            else if (typeof progress.progress === "number")
                percent = progress.progress /* NOTE(review): presumably already on a 0..100 scale -- confirm */
            if (percent > 0)
                progressState.set(artifact, percent)
        }
        const interval = setInterval(() => {
            for (const [ artifact, percent ] of progressState) {
                this.log("info", `downloaded ${percent.toFixed(2)}% of artifact "${artifact}"`)

                /*  percentages are tracked on a 0..100 scale, so an artifact
                    is only finished (and no longer reported) at 100  */
                if (percent >= 100)
                    progressState.delete(artifact)
            }
        }, 1000)
        try {
            this.kokoro = await KokoroTTS.from_pretrained(model, {
                dtype: "q4f16",
                progress_callback: progressCallback
            })
        }
        finally {
            /*  always stop the progress reporter, even if loading failed,
                to not leave a pending timer behind  */
            clearInterval(interval)
        }
        if (this.kokoro === null)
            throw new Error("failed to instantiate Kokoro")

        /*  establish resampler from Kokoro's maximum 24KHz
            output to our standard audio sample rate (48KHz)  */
        if (!SpeechFlowNodeKokoro.speexInitialized) {
            /*  at least once initialize resampler  */
            await SpeexResampler.initPromise
            SpeechFlowNodeKokoro.speexInitialized = true
        }
        const resampler = new SpeexResampler(1, 24000, this.config.audioSampleRate, 7)

        /*  determine voice for text-to-speech operation  */
        const voices = {
            "Aoede":  "af_aoede",
            "Heart":  "af_heart",
            "Puck":   "am_puck",
            "Fenrir": "am_fenrir"
        }
        const voice = ((voices as any)[this.params.voice]) as string | undefined
        if (voice === undefined)
            throw new Error(`invalid Kokoro voice "${this.params.voice}"`)

        /*  perform text-to-speech operation with Kokoro API  */
        const text2speech = async (text: string) => {
            this.log("info", `Kokoro: input: "${text}"`)
            const audio = await this.kokoro!.generate(text, {
                speed: this.params.speed,
                voice: voice as any
            })
            if (audio.sampling_rate !== 24000)
                throw new Error("expected 24KHz sampling rate in Kokoro output")

            /*  convert audio samples from PCM/F32/24KHz to PCM/I16/24KHz
                (samples are clamped to [-1, 1] before scaling)  */
            const samples = audio.audio
            const buffer1 = Buffer.alloc(samples.length * 2)
            for (let i = 0; i < samples.length; i++) {
                const sample = Math.max(-1, Math.min(1, samples[i]))
                buffer1.writeInt16LE(sample * 0x7FFF, i * 2)
            }

            /*  resample audio samples from PCM/I16/24KHz to PCM/I16/48KHz  */
            const buffer2 = resampler.processChunk(buffer1)

            return buffer2
        }

        /*  create transform stream and connect it to the Kokoro API  */
        const log = (level: string, msg: string) => { this.log(level, msg) }
        this.stream = new Stream.Transform({
            writableObjectMode: true,
            readableObjectMode: true,
            decodeStrings:      false,
            transform (chunk: SpeechFlowChunk, encoding, callback) {
                /*  only text payloads can be spoken  */
                if (Buffer.isBuffer(chunk.payload))
                    callback(new Error("invalid chunk payload type"))
                else {
                    text2speech(chunk.payload).then((buffer) => {
                        log("info", `Kokoro: received audio (buffer length: ${buffer.byteLength})`)

                        /*  pass on a copy of the chunk, now carrying the audio  */
                        const chunkNew = chunk.clone()
                        chunkNew.type = "audio"
                        chunkNew.payload = buffer
                        this.push(chunkNew)
                        callback()
                    }).catch((err) => {
                        callback(err)
                    })
                }
            },
            final (callback) {
                this.push(null)
                callback()
            }
        })
    }

    /*  close node: destroy the stream and drop the Kokoro instance  */
    async close () {
        /*  destroy stream  */
        if (this.stream !== null) {
            this.stream.destroy()
            this.stream = null
        }

        /*  destroy Kokoro API  */
        if (this.kokoro !== null)
            this.kokoro = null
    }
}
|
|
160
|
+
|
|
@@ -17,10 +17,10 @@ import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
|
|
|
17
17
|
type ConfigEntry = { systemPrompt: string, chat: Array<{ role: string, content: string }> }
|
|
18
18
|
type Config = { [ key: string ]: ConfigEntry }
|
|
19
19
|
|
|
20
|
-
/* SpeechFlow node for
|
|
21
|
-
export default class
|
|
20
|
+
/* SpeechFlow node for Ollama text-to-text translation */
|
|
21
|
+
export default class SpeechFlowNodeOllama extends SpeechFlowNode {
|
|
22
22
|
/* declare official node name */
|
|
23
|
-
public static name = "
|
|
23
|
+
public static name = "ollama"
|
|
24
24
|
|
|
25
25
|
/* internal state */
|
|
26
26
|
private ollama: Ollama | null = null
|
|
@@ -103,7 +103,8 @@ export default class SpeechFlowNodeGemma extends SpeechFlowNode {
|
|
|
103
103
|
"Do not show any prolog.\n" +
|
|
104
104
|
"Do not show any epilog.\n" +
|
|
105
105
|
"Get to the point.\n" +
|
|
106
|
-
"
|
|
106
|
+
"Preserve the original meaning, tone, and nuance.\n" +
|
|
107
|
+
"Directly translate text from English (EN) to fluent and natural German (DE) language.\n",
|
|
107
108
|
chat: [
|
|
108
109
|
{ role: "user", content: "I love my wife." },
|
|
109
110
|
{ role: "system", content: "Ich liebe meine Frau." },
|
|
@@ -121,13 +122,14 @@ export default class SpeechFlowNodeGemma extends SpeechFlowNode {
|
|
|
121
122
|
"Output only the requested text.\n" +
|
|
122
123
|
"Do not use markdown.\n" +
|
|
123
124
|
"Do not chat.\n" +
|
|
124
|
-
"Do not show any explanations
|
|
125
|
+
"Do not show any explanations.\n" +
|
|
125
126
|
"Do not show any introduction.\n" +
|
|
126
127
|
"Do not show any preamble. \n" +
|
|
127
128
|
"Do not show any prolog. \n" +
|
|
128
129
|
"Do not show any epilog. \n" +
|
|
129
130
|
"Get to the point.\n" +
|
|
130
|
-
"
|
|
131
|
+
"Preserve the original meaning, tone, and nuance.\n" +
|
|
132
|
+
"Directly translate text from German (DE) to fluent and natural English (EN) language.\n",
|
|
131
133
|
chat: [
|
|
132
134
|
{ role: "user", content: "Ich liebe meine Frau." },
|
|
133
135
|
{ role: "system", content: "I love my wife." },
|
|
@@ -145,11 +147,19 @@ export default class SpeechFlowNodeGemma extends SpeechFlowNode {
|
|
|
145
147
|
|
|
146
148
|
/* declare node configuration parameters */
|
|
147
149
|
this.configure({
|
|
148
|
-
api:
|
|
149
|
-
|
|
150
|
-
|
|
150
|
+
api: { type: "string", val: "http://127.0.0.1:11434", match: /^https?:\/\/.+?:\d+$/ },
|
|
151
|
+
model: { type: "string", val: "gemma3:4b-it-q4_K_M", match: /^.+$/ },
|
|
152
|
+
src: { type: "string", pos: 0, val: "de", match: /^(?:de|en)$/ },
|
|
153
|
+
dst: { type: "string", pos: 1, val: "en", match: /^(?:de|en)$/ }
|
|
151
154
|
})
|
|
152
155
|
|
|
156
|
+
/* tell effective mode */
|
|
157
|
+
if (this.params.src === this.params.dst)
|
|
158
|
+
this.log("info", `Ollama: operation mode: spellchecking for language "${this.params.src}"`)
|
|
159
|
+
else
|
|
160
|
+
this.log("info", `Ollama: operation mode: translation from language "${this.params.src}"` +
|
|
161
|
+
` to language "${this.params.dst}"`)
|
|
162
|
+
|
|
153
163
|
/* declare node input/output format */
|
|
154
164
|
this.input = "text"
|
|
155
165
|
this.output = "text"
|
|
@@ -160,12 +170,36 @@ export default class SpeechFlowNodeGemma extends SpeechFlowNode {
|
|
|
160
170
|
/* instantiate Ollama API */
|
|
161
171
|
this.ollama = new Ollama({ host: this.params.api })
|
|
162
172
|
|
|
173
|
+
/* ensure the model is available */
|
|
174
|
+
const model = this.params.model
|
|
175
|
+
const models = await this.ollama.list()
|
|
176
|
+
const exists = models.models.some((m) => m.name === model)
|
|
177
|
+
if (!exists) {
|
|
178
|
+
this.log("info", `Ollama: model "${model}" still not present in Ollama -- ` +
|
|
179
|
+
"automatically downloading model")
|
|
180
|
+
let artifact = ""
|
|
181
|
+
let percent = 0
|
|
182
|
+
const interval = setInterval(() => {
|
|
183
|
+
this.log("info", `downloaded ${percent.toFixed(2)}% of artifact "${artifact}"`)
|
|
184
|
+
}, 1000)
|
|
185
|
+
const progress = await this.ollama.pull({ model, stream: true })
|
|
186
|
+
for await (const event of progress) {
|
|
187
|
+
if (event.digest)
|
|
188
|
+
artifact = event.digest
|
|
189
|
+
if (event.completed && event.total)
|
|
190
|
+
percent = (event.completed / event.total) * 100
|
|
191
|
+
}
|
|
192
|
+
clearInterval(interval)
|
|
193
|
+
}
|
|
194
|
+
else
|
|
195
|
+
this.log("info", `Ollama: model "${model}" already present in Ollama`)
|
|
196
|
+
|
|
163
197
|
/* provide text-to-text translation */
|
|
164
198
|
const translate = async (text: string) => {
|
|
165
199
|
const key = `${this.params.src}-${this.params.dst}`
|
|
166
200
|
const cfg = this.setup[key]
|
|
167
201
|
const response = await this.ollama!.chat({
|
|
168
|
-
model
|
|
202
|
+
model,
|
|
169
203
|
messages: [
|
|
170
204
|
{ role: "system", content: cfg.systemPrompt },
|
|
171
205
|
...cfg.chat,
|