speechflow 1.7.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +23 -0
- package/README.md +425 -146
- package/etc/claude.md +5 -5
- package/etc/speechflow.yaml +2 -2
- package/package.json +3 -3
- package/speechflow-cli/dst/speechflow-main-api.js +6 -5
- package/speechflow-cli/dst/speechflow-main-api.js.map +1 -1
- package/speechflow-cli/dst/speechflow-main-graph.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-main-graph.js +35 -13
- package/speechflow-cli/dst/speechflow-main-graph.js.map +1 -1
- package/speechflow-cli/dst/speechflow-main-status.js +3 -7
- package/speechflow-cli/dst/speechflow-main-status.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js +3 -0
- package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-compressor.js +4 -2
- package/speechflow-cli/dst/speechflow-node-a2a-compressor.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-expander.js +4 -2
- package/speechflow-cli/dst/speechflow-node-a2a-expander.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-gender.js +2 -2
- package/speechflow-cli/dst/speechflow-node-a2a-gender.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-pitch.js +1 -2
- package/speechflow-cli/dst/speechflow-node-a2a-pitch.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-wav.js +32 -5
- package/speechflow-cli/dst/speechflow-node-a2a-wav.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2t-amazon.d.ts +0 -1
- package/speechflow-cli/dst/speechflow-node-a2t-amazon.js +1 -6
- package/speechflow-cli/dst/speechflow-node-a2t-amazon.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2t-deepgram.d.ts +0 -1
- package/speechflow-cli/dst/speechflow-node-a2t-deepgram.js +9 -9
- package/speechflow-cli/dst/speechflow-node-a2t-deepgram.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2t-google.d.ts +17 -0
- package/speechflow-cli/dst/speechflow-node-a2t-google.js +320 -0
- package/speechflow-cli/dst/speechflow-node-a2t-google.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-a2t-openai.js +6 -4
- package/speechflow-cli/dst/speechflow-node-a2t-openai.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-amazon.js +6 -11
- package/speechflow-cli/dst/speechflow-node-t2a-amazon.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js +6 -5
- package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-google.d.ts +15 -0
- package/speechflow-cli/dst/speechflow-node-t2a-google.js +218 -0
- package/speechflow-cli/dst/speechflow-node-t2a-google.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2a-kokoro.d.ts +2 -0
- package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js +19 -6
- package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-openai.d.ts +15 -0
- package/speechflow-cli/dst/speechflow-node-t2a-openai.js +195 -0
- package/speechflow-cli/dst/speechflow-node-t2a-openai.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2a-supertonic.d.ts +17 -0
- package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js +608 -0
- package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2t-amazon.js.map +1 -1
- package/speechflow-cli/dst/{speechflow-node-t2t-transformers.d.ts → speechflow-node-t2t-opus.d.ts} +1 -3
- package/speechflow-cli/dst/speechflow-node-t2t-opus.js +159 -0
- package/speechflow-cli/dst/speechflow-node-t2t-opus.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2t-profanity.d.ts +11 -0
- package/speechflow-cli/dst/speechflow-node-t2t-profanity.js +118 -0
- package/speechflow-cli/dst/speechflow-node-t2t-profanity.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2t-punctuation.d.ts +13 -0
- package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js +220 -0
- package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js.map +1 -0
- package/speechflow-cli/dst/{speechflow-node-t2t-openai.d.ts → speechflow-node-t2t-spellcheck.d.ts} +2 -2
- package/speechflow-cli/dst/{speechflow-node-t2t-openai.js → speechflow-node-t2t-spellcheck.js} +47 -99
- package/speechflow-cli/dst/speechflow-node-t2t-spellcheck.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js +3 -6
- package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-summary.d.ts +16 -0
- package/speechflow-cli/dst/speechflow-node-t2t-summary.js +241 -0
- package/speechflow-cli/dst/speechflow-node-t2t-summary.js.map +1 -0
- package/speechflow-cli/dst/{speechflow-node-t2t-ollama.d.ts → speechflow-node-t2t-translate.d.ts} +2 -2
- package/speechflow-cli/dst/{speechflow-node-t2t-transformers.js → speechflow-node-t2t-translate.js} +53 -115
- package/speechflow-cli/dst/speechflow-node-t2t-translate.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-x2x-filter.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-node-x2x-filter.js +10 -0
- package/speechflow-cli/dst/speechflow-node-x2x-filter.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-x2x-trace.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-device.js +3 -3
- package/speechflow-cli/dst/speechflow-node-xio-device.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-exec.d.ts +12 -0
- package/speechflow-cli/dst/speechflow-node-xio-exec.js +223 -0
- package/speechflow-cli/dst/speechflow-node-xio-exec.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-xio-file.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-node-xio-file.js +80 -67
- package/speechflow-cli/dst/speechflow-node-xio-file.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-mqtt.js +2 -1
- package/speechflow-cli/dst/speechflow-node-xio-mqtt.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-vban.d.ts +17 -0
- package/speechflow-cli/dst/speechflow-node-xio-vban.js +330 -0
- package/speechflow-cli/dst/speechflow-node-xio-vban.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-xio-webrtc.d.ts +39 -0
- package/speechflow-cli/dst/speechflow-node-xio-webrtc.js +500 -0
- package/speechflow-cli/dst/speechflow-node-xio-webrtc.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-xio-websocket.js +2 -1
- package/speechflow-cli/dst/speechflow-node-xio-websocket.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-audio.js +5 -6
- package/speechflow-cli/dst/speechflow-util-audio.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-error.d.ts +1 -1
- package/speechflow-cli/dst/speechflow-util-error.js +5 -7
- package/speechflow-cli/dst/speechflow-util-error.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-llm.d.ts +35 -0
- package/speechflow-cli/dst/speechflow-util-llm.js +363 -0
- package/speechflow-cli/dst/speechflow-util-llm.js.map +1 -0
- package/speechflow-cli/dst/speechflow-util-misc.d.ts +1 -1
- package/speechflow-cli/dst/speechflow-util-misc.js +4 -4
- package/speechflow-cli/dst/speechflow-util-misc.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-queue.js +3 -3
- package/speechflow-cli/dst/speechflow-util-queue.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-stream.js +4 -2
- package/speechflow-cli/dst/speechflow-util-stream.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-util.js +1 -0
- package/speechflow-cli/dst/speechflow-util.js.map +1 -1
- package/speechflow-cli/etc/oxlint.jsonc +2 -1
- package/speechflow-cli/package.json +34 -17
- package/speechflow-cli/src/lib.d.ts +5 -0
- package/speechflow-cli/src/speechflow-main-api.ts +6 -5
- package/speechflow-cli/src/speechflow-main-graph.ts +40 -13
- package/speechflow-cli/src/speechflow-main-status.ts +4 -8
- package/speechflow-cli/src/speechflow-node-a2a-compressor-wt.ts +4 -0
- package/speechflow-cli/src/speechflow-node-a2a-compressor.ts +4 -2
- package/speechflow-cli/src/speechflow-node-a2a-expander-wt.ts +1 -1
- package/speechflow-cli/src/speechflow-node-a2a-expander.ts +4 -2
- package/speechflow-cli/src/speechflow-node-a2a-gender.ts +2 -2
- package/speechflow-cli/src/speechflow-node-a2a-pitch.ts +1 -2
- package/speechflow-cli/src/speechflow-node-a2a-wav.ts +33 -6
- package/speechflow-cli/src/speechflow-node-a2t-amazon.ts +6 -11
- package/speechflow-cli/src/speechflow-node-a2t-deepgram.ts +13 -12
- package/speechflow-cli/src/speechflow-node-a2t-google.ts +322 -0
- package/speechflow-cli/src/speechflow-node-a2t-openai.ts +8 -4
- package/speechflow-cli/src/speechflow-node-t2a-amazon.ts +7 -11
- package/speechflow-cli/src/speechflow-node-t2a-elevenlabs.ts +6 -5
- package/speechflow-cli/src/speechflow-node-t2a-google.ts +206 -0
- package/speechflow-cli/src/speechflow-node-t2a-kokoro.ts +22 -6
- package/speechflow-cli/src/speechflow-node-t2a-openai.ts +179 -0
- package/speechflow-cli/src/speechflow-node-t2a-supertonic.ts +701 -0
- package/speechflow-cli/src/speechflow-node-t2t-amazon.ts +2 -1
- package/speechflow-cli/src/speechflow-node-t2t-opus.ts +136 -0
- package/speechflow-cli/src/speechflow-node-t2t-profanity.ts +93 -0
- package/speechflow-cli/src/speechflow-node-t2t-punctuation.ts +201 -0
- package/speechflow-cli/src/{speechflow-node-t2t-openai.ts → speechflow-node-t2t-spellcheck.ts} +48 -107
- package/speechflow-cli/src/speechflow-node-t2t-subtitle.ts +3 -6
- package/speechflow-cli/src/speechflow-node-t2t-summary.ts +229 -0
- package/speechflow-cli/src/speechflow-node-t2t-translate.ts +181 -0
- package/speechflow-cli/src/speechflow-node-x2x-filter.ts +16 -3
- package/speechflow-cli/src/speechflow-node-x2x-trace.ts +3 -3
- package/speechflow-cli/src/speechflow-node-xio-device.ts +4 -7
- package/speechflow-cli/src/speechflow-node-xio-exec.ts +210 -0
- package/speechflow-cli/src/speechflow-node-xio-file.ts +93 -80
- package/speechflow-cli/src/speechflow-node-xio-mqtt.ts +3 -2
- package/speechflow-cli/src/speechflow-node-xio-vban.ts +325 -0
- package/speechflow-cli/src/speechflow-node-xio-webrtc.ts +533 -0
- package/speechflow-cli/src/speechflow-node-xio-websocket.ts +2 -1
- package/speechflow-cli/src/speechflow-util-audio-wt.ts +4 -4
- package/speechflow-cli/src/speechflow-util-audio.ts +10 -10
- package/speechflow-cli/src/speechflow-util-error.ts +9 -7
- package/speechflow-cli/src/speechflow-util-llm.ts +367 -0
- package/speechflow-cli/src/speechflow-util-misc.ts +4 -4
- package/speechflow-cli/src/speechflow-util-queue.ts +4 -4
- package/speechflow-cli/src/speechflow-util-stream.ts +5 -3
- package/speechflow-cli/src/speechflow-util.ts +1 -0
- package/speechflow-ui-db/package.json +9 -9
- package/speechflow-ui-st/package.json +9 -9
- package/speechflow-cli/dst/speechflow-node-t2t-ollama.js +0 -293
- package/speechflow-cli/dst/speechflow-node-t2t-ollama.js.map +0 -1
- package/speechflow-cli/dst/speechflow-node-t2t-openai.js.map +0 -1
- package/speechflow-cli/dst/speechflow-node-t2t-transformers.js.map +0 -1
- package/speechflow-cli/src/speechflow-node-t2t-ollama.ts +0 -281
- package/speechflow-cli/src/speechflow-node-t2t-transformers.ts +0 -247
package/speechflow-cli/src/speechflow-node-a2t-amazon.ts

```diff
@@ -68,11 +68,10 @@ export default class SpeechFlowNodeA2TAmazon extends SpeechFlowNode {
     public static name = "a2t-amazon"
 
     /* internal state */
-    private client: TranscribeStreamingClient
-    private clientStream: AsyncIterable<TranscriptResultStream>
-    private closing
-    private initTimeout: ReturnType<typeof setTimeout> | null = null
-    private connectionTimeout: ReturnType<typeof setTimeout> | null = null
+    private client: TranscribeStreamingClient | null = null
+    private clientStream: AsyncIterable<TranscriptResultStream> | null = null
+    private closing = false
+    private connectionTimeout: ReturnType<typeof setTimeout> | null = null
     private queue: util.SingleQueue<SpeechFlowChunk | null> | null = null
 
     /* construct node */
```
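The 2.0 nodes consistently declare externally-connected resources as nullable fields initialized to `null` and give `closing` an explicit `= false` initializer. A minimal sketch of the open/close lifecycle this supports (class and client names below are hypothetical illustration names, not from the package):

```ts
/* minimal lifecycle sketch, assuming a node that owns one external client
   (ExampleNode/ExampleClient are hypothetical names) */
class ExampleClient { destroy () { /* tear down the connection */ } }
class ExampleNode {
    private client: ExampleClient | null = null  /* nothing allocated until open() */
    private closing = false                      /* explicit initializer, inferred boolean */
    async open () {
        this.closing = false
        this.client = new ExampleClient()
    }
    async close () {
        this.closing = true        /* stop async callbacks first */
        this.client?.destroy()
        this.client = null         /* release, so a later open() starts clean */
    }
}
```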
package/speechflow-cli/src/speechflow-node-a2t-amazon.ts

```diff
@@ -194,8 +193,8 @@ export default class SpeechFlowNodeA2TAmazon extends SpeechFlowNode {
                     this.queue?.write(chunk)
                 }
             }
-        })().catch((err: any) => {
-            this.log("warning", `failed to establish connectivity to Amazon Transcribe: ${err}`)
+        })().catch((err: unknown) => {
+            this.log("warning", `failed to establish connectivity to Amazon Transcribe: ${util.ensureError(err).message}`)
         })
     }
 
```
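Catch handlers now take `err: unknown` and normalize through `util.ensureError(...)` before logging. The helper itself is not shown in this diff (its source lives in `speechflow-util-error.ts`), but a plausible sketch of such a normalizer, including the optional message prefix the new Google node passes, looks like:

```ts
/* plausible sketch of an ensureError-style normalizer; the package's actual
   implementation in speechflow-util-error.ts may differ */
function ensureError (value: unknown, prefix?: string): Error {
    const error = value instanceof Error ? value : new Error(String(value))
    if (prefix !== undefined)
        error.message = `${prefix}: ${error.message}`
    return error
}

/* usage mirroring the hunk above */
Promise.reject("boom").catch((err: unknown) => {
    console.warn(`failed to establish connectivity: ${ensureError(err).message}`)
})
```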
package/speechflow-cli/src/speechflow-node-a2t-amazon.ts

```diff
@@ -281,10 +280,6 @@ export default class SpeechFlowNodeA2TAmazon extends SpeechFlowNode {
         this.closing = true
 
         /* cleanup all timers */
-        if (this.initTimeout !== null) {
-            clearTimeout(this.initTimeout)
-            this.initTimeout = null
-        }
         if (this.connectionTimeout !== null) {
             clearTimeout(this.connectionTimeout)
             this.connectionTimeout = null
```
package/speechflow-cli/src/speechflow-node-a2t-deepgram.ts

```diff
@@ -21,10 +21,9 @@ export default class SpeechFlowNodeA2TDeepgram extends SpeechFlowNode {
     public static name = "a2t-deepgram"
 
     /* internal state */
-    private dg: Deepgram.LiveClient
-    private closing
-    private initTimeout: ReturnType<typeof setTimeout> | null = null
-    private connectionTimeout: ReturnType<typeof setTimeout> | null = null
+    private dg: Deepgram.LiveClient | null = null
+    private closing = false
+    private connectionTimeout: ReturnType<typeof setTimeout> | null = null
     private queue: util.SingleQueue<SpeechFlowChunk | null> | null = null
 
     /* construct node */
@@ -41,6 +40,10 @@ export default class SpeechFlowNodeA2TDeepgram extends SpeechFlowNode {
             interim: { type: "boolean", val: false, pos: 3 }
         })
 
+        /* sanity check parameters */
+        if (!this.params.key)
+            throw new Error("Deepgram API key not configured")
+
         /* declare node input/output format */
         this.input = "audio"
         this.output = "text"
@@ -126,7 +129,7 @@ export default class SpeechFlowNodeA2TDeepgram extends SpeechFlowNode {
             this.log("info", `text received (start: ${data.start}s, ` +
                 `duration: ${data.duration.toFixed(2)}s, ` +
                 `kind: ${isFinal ? "final" : "intermediate"}): ` +
-
+                `"${text}"`)
             const start = Duration.fromMillis(data.start * 1000).plus(this.timeZeroOffset)
             const end = start.plus({ seconds: data.duration })
             const metas = metastore.fetch(start, end)
@@ -163,14 +166,16 @@ export default class SpeechFlowNodeA2TDeepgram extends SpeechFlowNode {
             this.log("error", `error: ${error.message}`)
             if (!this.closing && this.queue !== null)
                 this.queue.write(null)
-            this.emit("error")
+            this.emit("error", error)
         })
 
         /* wait for Deepgram API to be available */
         await new Promise((resolve, reject) => {
             this.connectionTimeout = setTimeout(() => {
-                this.connectionTimeout = null
-                reject(new Error("Deepgram: timeout waiting for connection open"))
+                if (this.connectionTimeout !== null) {
+                    this.connectionTimeout = null
+                    reject(new Error("Deepgram: timeout waiting for connection open"))
+                }
             }, 8000)
             this.dg!.once(Deepgram.LiveTranscriptionEvents.Open, () => {
                 this.log("info", "connection open")
```
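The reworked wait wraps the rejection in a null check so a timer that has already been cleared by `close()` cannot settle the promise a second time. Stand-alone, the pattern looks roughly like this (the emitter and event name are hypothetical; the node itself listens for `Deepgram.LiveTranscriptionEvents.Open`):

```ts
/* stand-alone sketch of the timeout-guarded "wait for open" pattern */
import { EventEmitter } from "node:events"

function waitForOpen (emitter: EventEmitter, ms: number): Promise<void> {
    return new Promise((resolve, reject) => {
        let timer: ReturnType<typeof setTimeout> | null = setTimeout(() => {
            if (timer !== null) {   /* guard: only settle if not already cleared */
                timer = null
                reject(new Error(`timeout waiting for connection open (${ms}ms)`))
            }
        }, ms)
        emitter.once("open", () => {
            if (timer !== null) {
                clearTimeout(timer)
                timer = null
            }
            resolve()
        })
    })
}
```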
package/speechflow-cli/src/speechflow-node-a2t-deepgram.ts

```diff
@@ -271,10 +276,6 @@ export default class SpeechFlowNodeA2TDeepgram extends SpeechFlowNode {
         this.closing = true
 
         /* cleanup all timers */
-        if (this.initTimeout !== null) {
-            clearTimeout(this.initTimeout)
-            this.initTimeout = null
-        }
         if (this.connectionTimeout !== null) {
             clearTimeout(this.connectionTimeout)
             this.connectionTimeout = null
```
package/speechflow-cli/src/speechflow-node-a2t-google.ts (new file, @@ -0,0 +1,322 @@)

```ts
/*
** SpeechFlow - Speech Processing Flow Graph
** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
*/

/* standard dependencies */
import Stream from "node:stream"

/* external dependencies */
import * as GoogleSpeech from "@google-cloud/speech"
import { DateTime, Duration } from "luxon"
import * as arktype from "arktype"

/* internal dependencies */
import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
import * as util from "./speechflow-util"

/* SpeechFlow node for Google Cloud speech-to-text conversion */
export default class SpeechFlowNodeA2TGoogle extends SpeechFlowNode {
    /* declare official node name */
    public static name = "a2t-google"

    /* internal state */
    private client: GoogleSpeech.SpeechClient | null = null
    private recognizeStream: ReturnType<GoogleSpeech.SpeechClient["streamingRecognize"]> | null = null
    private connectionTimeout: ReturnType<typeof setTimeout> | null = null
    private queue: util.SingleQueue<SpeechFlowChunk | null> | null = null
    private closing = false

    /* construct node */
    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
        super(id, cfg, opts, args)

        /* declare node configuration parameters */
        this.configure({
            key: { type: "string", val: process.env.SPEECHFLOW_GOOGLE_KEY ?? "" },
            model: { type: "string", pos: 0, val: "latest_long" },
            language: { type: "string", pos: 1, val: "en-US" },
            interim: { type: "boolean", pos: 2, val: false }
        })

        /* validate API key */
        if (this.params.key === "")
            throw new Error("Google Cloud API credentials JSON key is required")

        /* declare node input/output format */
        this.input = "audio"
        this.output = "text"
    }

    /* one-time status of node */
    async status () {
        return {}
    }

    /* open node */
    async open () {
        /* sanity check situation */
        if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
            throw new Error("Google Speech node currently supports PCM-S16LE audio only")

        /* clear destruction flag */
        this.closing = false

        /* create queue for results */
        this.queue = new util.SingleQueue<SpeechFlowChunk | null>()

        /* create a store for the meta information */
        const metastore = new util.TimeStore<Map<string, any>>()

        /* instantiate Google Speech client */
        const data = util.run("Google Cloud API credentials key", () =>
            JSON.parse(this.params.key))
        const credentials = util.importObject("Google Cloud API credentials key",
            data,
            arktype.type({
                project_id: "string",
                private_key: "string",
                client_email: "string"
            })
        )
        this.client = new GoogleSpeech.SpeechClient({
            credentials: {
                private_key: credentials.private_key,
                client_email: credentials.client_email
            },
            projectId: credentials.project_id
        })

        /* create streaming recognition request */
        this.recognizeStream = this.client.streamingRecognize({
            config: {
                encoding: "LINEAR16",
                sampleRateHertz: this.config.audioSampleRate,
                languageCode: this.params.language,
                model: this.params.model,
                enableAutomaticPunctuation: true,
                enableWordTimeOffsets: true
            },
            interimResults: this.params.interim
        })

        /* hook onto Google Speech API events */
        this.recognizeStream.on("data", (data: GoogleSpeech.protos.google.cloud.speech.v1.IStreamingRecognizeResponse) => {
            if (this.closing || this.queue === null)
                return
            if (!data.results || data.results.length === 0)
                return
            for (const result of data.results) {
                if (!result.alternatives || result.alternatives.length === 0)
                    continue
                const alternative = result.alternatives[0]
                const text = alternative.transcript ?? ""
                if (text === "")
                    continue
                const isFinal = result.isFinal ?? false
                if (!isFinal && !this.params.interim)
                    continue

                /* calculate timestamps */
                let tsStart = Duration.fromMillis(0)
                let tsEnd = Duration.fromMillis(0)

                /* extract word timing information if available */
                const words: { word: string, start: Duration, end: Duration }[] = []
                if (alternative.words && alternative.words.length > 0) {
                    for (const wordInfo of alternative.words) {
                        const wordStart = wordInfo.startTime
                            ? Duration.fromMillis(
                                (Number(wordInfo.startTime.seconds ?? 0) * 1000) +
                                (Number(wordInfo.startTime.nanos ?? 0) / 1000000)
                            ).plus(this.timeZeroOffset)
                            : Duration.fromMillis(0)
                        const wordEnd = wordInfo.endTime
                            ? Duration.fromMillis(
                                (Number(wordInfo.endTime.seconds ?? 0) * 1000) +
                                (Number(wordInfo.endTime.nanos ?? 0) / 1000000)
                            ).plus(this.timeZeroOffset)
                            : Duration.fromMillis(0)
                        words.push({
                            word: wordInfo.word ?? "",
                            start: wordStart,
                            end: wordEnd
                        })
                    }
                    if (words.length > 0) {
                        tsStart = words[0].start
                        tsEnd = words[words.length - 1].end
                    }
                }
                else {
                    /* fallback: use result timing */
                    const resultEnd = result.resultEndTime
                    if (resultEnd) {
                        tsEnd = Duration.fromMillis(
                            (Number(resultEnd.seconds ?? 0) * 1000) +
                            (Number(resultEnd.nanos ?? 0) / 1000000)
                        ).plus(this.timeZeroOffset)
                    }
                }
                this.log("info", `text received (start: ${tsStart.toMillis()}ms, ` +
                    `end: ${tsEnd.toMillis()}ms, ` +
                    `kind: ${isFinal ? "final" : "intermediate"}): ` +
                    `"${text}"`)

                /* fetch and merge meta information */
                const metas = metastore.fetch(tsStart, tsEnd)
                const meta = metas.toReversed().reduce((prev: Map<string, any>, curr: Map<string, any>) => {
                    curr.forEach((val, key) => { prev.set(key, val) })
                    return prev
                }, new Map<string, any>())
                metastore.prune(tsStart)

                /* add word timing to meta */
                if (words.length > 0)
                    meta.set("words", words)

                /* create and enqueue chunk */
                const chunk = new SpeechFlowChunk(tsStart, tsEnd,
                    isFinal ? "final" : "intermediate", "text", text, meta)
                this.queue.write(chunk)
            }
        })
        this.recognizeStream.on("error", (error: Error) => {
            this.log("error", `error: ${error.message}`)
            if (!this.closing && this.queue !== null)
                this.queue.write(null)
            this.emit("error", error)
        })
        this.recognizeStream.on("end", () => {
            this.log("info", "stream ended")
            if (!this.closing && this.queue !== null)
                this.queue.write(null)
        })

        /* remember opening time to receive time zero offset */
        this.timeOpen = DateTime.now()

        /* provide Duplex stream and internally attach to Google Speech API */
        const self = this
        const reads = new util.PromiseSet<void>()
        this.stream = new Stream.Duplex({
            writableObjectMode: true,
            readableObjectMode: true,
            decodeStrings: false,
            highWaterMark: 1,
            write (chunk: SpeechFlowChunk, encoding, callback) {
                if (self.closing || self.recognizeStream === null) {
                    callback(new Error("stream already destroyed"))
                    return
                }
                if (chunk.type !== "audio")
                    callback(new Error("expected audio input chunk"))
                else if (!Buffer.isBuffer(chunk.payload))
                    callback(new Error("expected Buffer input chunk"))
                else {
                    if (chunk.payload.byteLength > 0) {
                        self.log("debug", `send data (${chunk.payload.byteLength} bytes)`)
                        if (chunk.meta.size > 0)
                            metastore.store(chunk.timestampStart, chunk.timestampEnd, chunk.meta)
                        try {
                            self.recognizeStream.write(chunk.payload)
                        }
                        catch (error) {
                            callback(util.ensureError(error, "failed to send to Google Speech"))
                            return
                        }
                    }
                    callback()
                }
            },
            async final (callback) {
                /* short-circuiting in case of own closing */
                if (self.closing || self.recognizeStream === null) {
                    callback()
                    return
                }

                /* close Google Speech stream */
                try {
                    self.recognizeStream.end()
                }
                catch (error) {
                    self.log("warning", `error closing Google Speech stream: ${error}`)
                }

                /* await all read operations */
                await reads.awaitAll()
                callback()
            },
            read (size) {
                if (self.closing || self.queue === null) {
                    this.push(null)
                    return
                }
                reads.add(self.queue.read().then((chunk) => {
                    if (self.closing || self.queue === null) {
                        this.push(null)
                        return
                    }
                    if (chunk === null) {
                        self.log("info", "received EOF signal")
                        this.push(null)
                    }
                    else {
                        self.log("debug", `received data (${chunk.payload.length} bytes)`)
                        this.push(chunk)
                    }
                }).catch((error: unknown) => {
                    if (!self.closing && self.queue !== null)
                        self.log("error", `queue read error: ${util.ensureError(error).message}`)
                }))
            }
        })
    }

    /* close node */
    async close () {
        /* indicate closing first to stop all async operations */
        this.closing = true

        /* cleanup all timers */
        if (this.connectionTimeout !== null) {
            clearTimeout(this.connectionTimeout)
            this.connectionTimeout = null
        }

        /* shutdown stream */
        if (this.stream !== null) {
            await util.destroyStream(this.stream)
            this.stream = null
        }

        /* close Google Speech stream and client */
        if (this.recognizeStream !== null) {
            try {
                this.recognizeStream.removeAllListeners()
                this.recognizeStream.destroy()
            }
            catch (error) {
                this.log("warning", `error during Google Speech stream cleanup: ${error}`)
            }
            this.recognizeStream = null
        }
        if (this.client !== null) {
            try {
                await this.client.close()
            }
            catch (error) {
                this.log("warning", `error closing Google Speech client: ${error}`)
            }
            this.client = null
        }

        /* signal EOF to any pending read operations */
        if (this.queue !== null) {
            this.queue.write(null)
            this.queue = null
        }
    }
}
```
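The word-timing math in the new node converts protobuf timestamps (whole `seconds` plus fractional `nanos`) into luxon `Duration`s. Factored out of the node into a stand-alone helper for illustration (the helper name is ours, not the package's):

```ts
/* the seconds/nanos-to-Duration conversion used throughout the new node */
import { Duration } from "luxon"

type ProtoTimestamp = { seconds?: number | string | null, nanos?: number | null }

function toDuration (ts: ProtoTimestamp | null | undefined): Duration {
    if (!ts)
        return Duration.fromMillis(0)
    return Duration.fromMillis(
        (Number(ts.seconds ?? 0) * 1000) +   /* whole seconds -> ms */
        (Number(ts.nanos ?? 0) / 1000000))   /* nanoseconds   -> ms */
}

/* e.g. { seconds: 2, nanos: 500000000 } -> 2500 ms */
console.log(toDuration({ seconds: 2, nanos: 500000000 }).toMillis())
```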
package/speechflow-cli/src/speechflow-node-a2t-openai.ts

```diff
@@ -43,6 +43,10 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
             interim: { type: "boolean", val: false }
         })
 
+        /* sanity check parameters */
+        if (!this.params.key)
+            throw new Error("OpenAI API key not configured")
+
         /* declare node input/output format */
         this.input = "audio"
         this.output = "text"
@@ -349,10 +353,10 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
             this.ws.close()
             this.ws = null
         }
-
-
-
-
+        this.openai = null
+
+        /* close resampler */
+        this.resampler = null
 
         /* shutdown stream */
         if (this.stream !== null) {
```
package/speechflow-cli/src/speechflow-node-t2a-amazon.ts

```diff
@@ -83,7 +83,7 @@ export default class SpeechFlowNodeT2AAmazon extends SpeechFlowNode {
             "Ruth": { language: "en", languageCode: "en-US", engine: "generative" },
             "Stephen": { language: "en", languageCode: "en-US", engine: "generative" },
             "Vicki": { language: "de", languageCode: "de-DE", engine: "generative" },
-            "Daniel": { language: "de", languageCode: "de-DE", engine: "generative" }
+            "Daniel": { language: "de", languageCode: "de-DE", engine: "generative" }
         }
         const voiceConfig = voices[this.params.voice as keyof typeof voices]
         if (voiceConfig === undefined)
@@ -147,11 +147,6 @@ export default class SpeechFlowNodeT2AAmazon extends SpeechFlowNode {
                 callback()
             },
             final (callback) {
-                if (self.closing) {
-                    callback()
-                    return
-                }
-                this.push(null)
                 callback()
             }
         })
@@ -162,6 +157,12 @@ export default class SpeechFlowNodeT2AAmazon extends SpeechFlowNode {
         /* indicate closing */
         this.closing = true
 
+        /* shutdown stream */
+        if (this.stream !== null) {
+            await util.destroyStream(this.stream)
+            this.stream = null
+        }
+
         /* destroy resampler */
         if (this.resampler !== null)
             this.resampler = null
@@ -171,11 +172,6 @@ export default class SpeechFlowNodeT2AAmazon extends SpeechFlowNode {
             this.client.destroy()
             this.client = null
         }
-        /* shutdown stream */
-        if (this.stream !== null) {
-            await util.destroyStream(this.stream)
-            this.stream = null
-        }
     }
 }
 
```
package/speechflow-cli/src/speechflow-node-t2a-elevenlabs.ts

```diff
@@ -103,14 +103,15 @@ export default class SpeechFlowNodeT2AElevenlabs extends SpeechFlowNode {
             throw new Error(`invalid ElevenLabs voice "${this.params.voice}"`)
         }
         const labels = voice.labels ?? {}
-        const info = Object.keys(labels).length > 0
-        ", " + Object.entries(labels).map(([ key, val ]) => `${key}: "${val}"`).join(", ")
+        const info = Object.keys(labels).length > 0
+            ? ", " + Object.entries(labels).map(([ key, val ]) => `${key}: "${val}"`).join(", ")
+            : ""
         this.log("info", `selected voice: name: "${voice.name}"${info}`)
 
         /* perform text-to-speech operation with Elevenlabs API */
-        const model = this.params.optimize === "quality"
-        "eleven_turbo_v2_5"
-        "eleven_flash_v2_5"
+        const model = this.params.optimize === "quality"
+            ? "eleven_turbo_v2_5"
+            : "eleven_flash_v2_5"
         const speechStream = (text: string) => {
             this.log("info", `ElevenLabs: send text "${text}"`)
             return this.elevenlabs!.textToSpeech.convert(voice.voiceId, {
```