speechflow 1.7.1 → 2.0.0
This diff shows the content of publicly available package versions as released to a supported public registry, and is provided for informational purposes only.
- package/CHANGELOG.md +18 -0
- package/README.md +387 -119
- package/etc/claude.md +5 -5
- package/etc/speechflow.yaml +2 -2
- package/package.json +3 -3
- package/speechflow-cli/dst/speechflow-main-graph.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-main-graph.js +28 -5
- package/speechflow-cli/dst/speechflow-main-graph.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-wav.js +24 -4
- package/speechflow-cli/dst/speechflow-node-a2a-wav.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2t-google.d.ts +17 -0
- package/speechflow-cli/dst/speechflow-node-a2t-google.js +320 -0
- package/speechflow-cli/dst/speechflow-node-a2t-google.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2a-google.d.ts +15 -0
- package/speechflow-cli/dst/speechflow-node-t2a-google.js +218 -0
- package/speechflow-cli/dst/speechflow-node-t2a-google.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2a-openai.d.ts +15 -0
- package/speechflow-cli/dst/speechflow-node-t2a-openai.js +195 -0
- package/speechflow-cli/dst/speechflow-node-t2a-openai.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2a-supertonic.d.ts +17 -0
- package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js +608 -0
- package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2t-amazon.js.map +1 -1
- package/speechflow-cli/dst/{speechflow-node-t2t-transformers.d.ts → speechflow-node-t2t-opus.d.ts} +1 -3
- package/speechflow-cli/dst/speechflow-node-t2t-opus.js +159 -0
- package/speechflow-cli/dst/speechflow-node-t2t-opus.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2t-profanity.d.ts +11 -0
- package/speechflow-cli/dst/speechflow-node-t2t-profanity.js +118 -0
- package/speechflow-cli/dst/speechflow-node-t2t-profanity.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2t-punctuation.d.ts +13 -0
- package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js +220 -0
- package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js.map +1 -0
- package/speechflow-cli/dst/{speechflow-node-t2t-openai.d.ts → speechflow-node-t2t-spellcheck.d.ts} +2 -2
- package/speechflow-cli/dst/{speechflow-node-t2t-openai.js → speechflow-node-t2t-spellcheck.js} +47 -99
- package/speechflow-cli/dst/speechflow-node-t2t-spellcheck.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js +3 -6
- package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-summary.d.ts +16 -0
- package/speechflow-cli/dst/speechflow-node-t2t-summary.js +241 -0
- package/speechflow-cli/dst/speechflow-node-t2t-summary.js.map +1 -0
- package/speechflow-cli/dst/{speechflow-node-t2t-ollama.d.ts → speechflow-node-t2t-translate.d.ts} +2 -2
- package/speechflow-cli/dst/{speechflow-node-t2t-transformers.js → speechflow-node-t2t-translate.js} +53 -115
- package/speechflow-cli/dst/speechflow-node-t2t-translate.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-xio-exec.d.ts +12 -0
- package/speechflow-cli/dst/speechflow-node-xio-exec.js +223 -0
- package/speechflow-cli/dst/speechflow-node-xio-exec.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-xio-file.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-node-xio-file.js +79 -66
- package/speechflow-cli/dst/speechflow-node-xio-file.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-vban.d.ts +17 -0
- package/speechflow-cli/dst/speechflow-node-xio-vban.js +330 -0
- package/speechflow-cli/dst/speechflow-node-xio-vban.js.map +1 -0
- package/speechflow-cli/dst/speechflow-node-xio-webrtc.d.ts +39 -0
- package/speechflow-cli/dst/speechflow-node-xio-webrtc.js +500 -0
- package/speechflow-cli/dst/speechflow-node-xio-webrtc.js.map +1 -0
- package/speechflow-cli/dst/speechflow-util-audio.js +4 -5
- package/speechflow-cli/dst/speechflow-util-audio.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-error.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-util-error.js +5 -0
- package/speechflow-cli/dst/speechflow-util-error.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-llm.d.ts +35 -0
- package/speechflow-cli/dst/speechflow-util-llm.js +363 -0
- package/speechflow-cli/dst/speechflow-util-llm.js.map +1 -0
- package/speechflow-cli/dst/speechflow-util.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-util.js +1 -0
- package/speechflow-cli/dst/speechflow-util.js.map +1 -1
- package/speechflow-cli/etc/oxlint.jsonc +2 -1
- package/speechflow-cli/package.json +34 -17
- package/speechflow-cli/src/lib.d.ts +5 -0
- package/speechflow-cli/src/speechflow-main-graph.ts +31 -5
- package/speechflow-cli/src/speechflow-node-a2a-wav.ts +24 -4
- package/speechflow-cli/src/speechflow-node-a2t-google.ts +322 -0
- package/speechflow-cli/src/speechflow-node-t2a-google.ts +206 -0
- package/speechflow-cli/src/speechflow-node-t2a-openai.ts +179 -0
- package/speechflow-cli/src/speechflow-node-t2a-supertonic.ts +701 -0
- package/speechflow-cli/src/speechflow-node-t2t-amazon.ts +2 -1
- package/speechflow-cli/src/speechflow-node-t2t-opus.ts +136 -0
- package/speechflow-cli/src/speechflow-node-t2t-profanity.ts +93 -0
- package/speechflow-cli/src/speechflow-node-t2t-punctuation.ts +201 -0
- package/speechflow-cli/src/{speechflow-node-t2t-openai.ts → speechflow-node-t2t-spellcheck.ts} +48 -107
- package/speechflow-cli/src/speechflow-node-t2t-subtitle.ts +3 -6
- package/speechflow-cli/src/speechflow-node-t2t-summary.ts +229 -0
- package/speechflow-cli/src/speechflow-node-t2t-translate.ts +181 -0
- package/speechflow-cli/src/speechflow-node-xio-exec.ts +210 -0
- package/speechflow-cli/src/speechflow-node-xio-file.ts +92 -79
- package/speechflow-cli/src/speechflow-node-xio-vban.ts +325 -0
- package/speechflow-cli/src/speechflow-node-xio-webrtc.ts +533 -0
- package/speechflow-cli/src/speechflow-util-audio.ts +5 -5
- package/speechflow-cli/src/speechflow-util-error.ts +9 -0
- package/speechflow-cli/src/speechflow-util-llm.ts +367 -0
- package/speechflow-cli/src/speechflow-util.ts +1 -0
- package/speechflow-ui-db/package.json +9 -9
- package/speechflow-ui-st/package.json +9 -9
- package/speechflow-cli/dst/speechflow-node-t2t-ollama.js +0 -293
- package/speechflow-cli/dst/speechflow-node-t2t-ollama.js.map +0 -1
- package/speechflow-cli/dst/speechflow-node-t2t-openai.js.map +0 -1
- package/speechflow-cli/dst/speechflow-node-t2t-transformers.js.map +0 -1
- package/speechflow-cli/src/speechflow-node-t2t-ollama.ts +0 -281
- package/speechflow-cli/src/speechflow-node-t2t-transformers.ts +0 -247
package/speechflow-cli/src/speechflow-node-a2a-wav.ts:

```diff
@@ -94,7 +94,8 @@ export default class SpeechFlowNodeA2AWAV extends SpeechFlowNode
 
         /* declare node configuration parameters */
         this.configure({
-            mode:
+            mode: { type: "string", pos: 0, val: "encode", match: /^(?:encode|decode)$/ },
+            seekable: { type: "boolean", pos: 1, val: false }
         })
 
         /* declare node input/output format */
@@ -106,7 +107,9 @@ export default class SpeechFlowNodeA2AWAV extends SpeechFlowNode
     async open () {
         /* establish a transform stream */
         const self = this
-        let
+        let isFirstChunk = true
+        let headerChunkSent: SpeechFlowChunk | null = null
+        let totalSize = 0
         this.stream = new Stream.Transform({
             readableObjectMode: true,
             writableObjectMode: true,
@@ -115,7 +118,7 @@ export default class SpeechFlowNodeA2AWAV extends SpeechFlowNode
             transform (chunk: SpeechFlowChunk, encoding, callback) {
                 if (!Buffer.isBuffer(chunk.payload))
                     callback(new Error("invalid chunk payload type"))
-                else if (
+                else if (isFirstChunk) {
                     if (self.params.mode === "encode") {
                         /* convert raw/PCM to WAV/PCM
                            (NOTICE: as this is a continuous stream, the
@@ -132,7 +135,9 @@ export default class SpeechFlowNodeA2AWAV extends SpeechFlowNode
                         const headerChunk = chunk.clone()
                         headerChunk.payload = headerBuffer
                         this.push(headerChunk)
+                        headerChunkSent = headerChunk
                         this.push(chunk)
+                        totalSize += chunk.payload.byteLength
                         callback()
                     }
                     else if (self.params.mode === "decode") {
@@ -173,21 +178,36 @@ export default class SpeechFlowNodeA2AWAV extends SpeechFlowNode
                         }
                         chunk.payload = chunk.payload.subarray(44)
                         this.push(chunk)
+                        totalSize += chunk.payload.byteLength
                         callback()
                     }
                     else {
                         callback(new Error(`invalid operation mode "${self.params.mode}"`))
                         return
                     }
-
+                    isFirstChunk = false
                 }
                 else {
                     /* pass-through original chunk */
                     this.push(chunk)
+                    totalSize += chunk.payload.byteLength
                     callback()
                 }
             },
             final (callback) {
+                if (self.params.seekable && headerChunkSent !== null) {
+                    self.log("info", "sending updated WAV header")
+                    const headerBuffer = writeWavHeader(totalSize, {
+                        audioFormat: 0x0001 /* PCM */,
+                        channels: self.config.audioChannels,
+                        sampleRate: self.config.audioSampleRate,
+                        bitDepth: self.config.audioBitDepth
+                    })
+                    const headerChunk = headerChunkSent?.clone()
+                    headerChunk.payload = headerBuffer
+                    headerChunk.meta.set("chunk:seek", 0)
+                    this.push(headerChunk)
+                }
                 callback()
             }
         })
```
package/speechflow-cli/src/speechflow-node-a2t-google.ts (new file):

```diff
@@ -0,0 +1,322 @@
+/*
+** SpeechFlow - Speech Processing Flow Graph
+** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+*/
+
+/* standard dependencies */
+import Stream from "node:stream"
+
+/* external dependencies */
+import * as GoogleSpeech from "@google-cloud/speech"
+import { DateTime, Duration } from "luxon"
+import * as arktype from "arktype"
+
+/* internal dependencies */
+import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
+import * as util from "./speechflow-util"
+
+/* SpeechFlow node for Google Cloud speech-to-text conversion */
+export default class SpeechFlowNodeA2TGoogle extends SpeechFlowNode {
+    /* declare official node name */
+    public static name = "a2t-google"
+
+    /* internal state */
+    private client: GoogleSpeech.SpeechClient | null = null
+    private recognizeStream: ReturnType<GoogleSpeech.SpeechClient["streamingRecognize"]> | null = null
+    private connectionTimeout: ReturnType<typeof setTimeout> | null = null
+    private queue: util.SingleQueue<SpeechFlowChunk | null> | null = null
+    private closing = false
+
+    /* construct node */
+    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
+        super(id, cfg, opts, args)
+
+        /* declare node configuration parameters */
+        this.configure({
+            key: { type: "string", val: process.env.SPEECHFLOW_GOOGLE_KEY ?? "" },
+            model: { type: "string", pos: 0, val: "latest_long" },
+            language: { type: "string", pos: 1, val: "en-US" },
+            interim: { type: "boolean", pos: 2, val: false }
+        })
+
+        /* validate API key */
+        if (this.params.key === "")
+            throw new Error("Google Cloud API credentials JSON key is required")
+
+        /* declare node input/output format */
+        this.input = "audio"
+        this.output = "text"
+    }
+
+    /* one-time status of node */
+    async status () {
+        return {}
+    }
+
+    /* open node */
+    async open () {
+        /* sanity check situation */
+        if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
+            throw new Error("Google Speech node currently supports PCM-S16LE audio only")
+
+        /* clear destruction flag */
+        this.closing = false
+
+        /* create queue for results */
+        this.queue = new util.SingleQueue<SpeechFlowChunk | null>()
+
+        /* create a store for the meta information */
+        const metastore = new util.TimeStore<Map<string, any>>()
+
+        /* instantiate Google Speech client */
+        const data = util.run("Google Cloud API credentials key", () =>
+            JSON.parse(this.params.key))
+        const credentials = util.importObject("Google Cloud API credentials key",
+            data,
+            arktype.type({
+                project_id: "string",
+                private_key: "string",
+                client_email: "string"
+            })
+        )
+        this.client = new GoogleSpeech.SpeechClient({
+            credentials: {
+                private_key: credentials.private_key,
+                client_email: credentials.client_email
+            },
+            projectId: credentials.project_id
+        })
+
+        /* create streaming recognition request */
+        this.recognizeStream = this.client.streamingRecognize({
+            config: {
+                encoding: "LINEAR16",
+                sampleRateHertz: this.config.audioSampleRate,
+                languageCode: this.params.language,
+                model: this.params.model,
+                enableAutomaticPunctuation: true,
+                enableWordTimeOffsets: true
+            },
+            interimResults: this.params.interim
+        })
+
+        /* hook onto Google Speech API events */
+        this.recognizeStream.on("data", (data: GoogleSpeech.protos.google.cloud.speech.v1.IStreamingRecognizeResponse) => {
+            if (this.closing || this.queue === null)
+                return
+            if (!data.results || data.results.length === 0)
+                return
+            for (const result of data.results) {
+                if (!result.alternatives || result.alternatives.length === 0)
+                    continue
+                const alternative = result.alternatives[0]
+                const text = alternative.transcript ?? ""
+                if (text === "")
+                    continue
+                const isFinal = result.isFinal ?? false
+                if (!isFinal && !this.params.interim)
+                    continue
+
+                /* calculate timestamps */
+                let tsStart = Duration.fromMillis(0)
+                let tsEnd = Duration.fromMillis(0)
+
+                /* extract word timing information if available */
+                const words: { word: string, start: Duration, end: Duration }[] = []
+                if (alternative.words && alternative.words.length > 0) {
+                    for (const wordInfo of alternative.words) {
+                        const wordStart = wordInfo.startTime
+                            ? Duration.fromMillis(
+                                (Number(wordInfo.startTime.seconds ?? 0) * 1000) +
+                                (Number(wordInfo.startTime.nanos ?? 0) / 1000000)
+                            ).plus(this.timeZeroOffset)
+                            : Duration.fromMillis(0)
+                        const wordEnd = wordInfo.endTime
+                            ? Duration.fromMillis(
+                                (Number(wordInfo.endTime.seconds ?? 0) * 1000) +
+                                (Number(wordInfo.endTime.nanos ?? 0) / 1000000)
+                            ).plus(this.timeZeroOffset)
+                            : Duration.fromMillis(0)
+                        words.push({
+                            word: wordInfo.word ?? "",
+                            start: wordStart,
+                            end: wordEnd
+                        })
+                    }
+                    if (words.length > 0) {
+                        tsStart = words[0].start
+                        tsEnd = words[words.length - 1].end
+                    }
+                }
+                else {
+                    /* fallback: use result timing */
+                    const resultEnd = result.resultEndTime
+                    if (resultEnd) {
+                        tsEnd = Duration.fromMillis(
+                            (Number(resultEnd.seconds ?? 0) * 1000) +
+                            (Number(resultEnd.nanos ?? 0) / 1000000)
+                        ).plus(this.timeZeroOffset)
+                    }
+                }
+                this.log("info", `text received (start: ${tsStart.toMillis()}ms, ` +
+                    `end: ${tsEnd.toMillis()}ms, ` +
+                    `kind: ${isFinal ? "final" : "intermediate"}): ` +
+                    `"${text}"`)
+
+                /* fetch and merge meta information */
+                const metas = metastore.fetch(tsStart, tsEnd)
+                const meta = metas.toReversed().reduce((prev: Map<string, any>, curr: Map<string, any>) => {
+                    curr.forEach((val, key) => { prev.set(key, val) })
+                    return prev
+                }, new Map<string, any>())
+                metastore.prune(tsStart)
+
+                /* add word timing to meta */
+                if (words.length > 0)
+                    meta.set("words", words)
+
+                /* create and enqueue chunk */
+                const chunk = new SpeechFlowChunk(tsStart, tsEnd,
+                    isFinal ? "final" : "intermediate", "text", text, meta)
+                this.queue.write(chunk)
+            }
+        })
+        this.recognizeStream.on("error", (error: Error) => {
+            this.log("error", `error: ${error.message}`)
+            if (!this.closing && this.queue !== null)
+                this.queue.write(null)
+            this.emit("error", error)
+        })
+        this.recognizeStream.on("end", () => {
+            this.log("info", "stream ended")
+            if (!this.closing && this.queue !== null)
+                this.queue.write(null)
+        })
+
+        /* remember opening time to receive time zero offset */
+        this.timeOpen = DateTime.now()
+
+        /* provide Duplex stream and internally attach to Google Speech API */
+        const self = this
+        const reads = new util.PromiseSet<void>()
+        this.stream = new Stream.Duplex({
+            writableObjectMode: true,
+            readableObjectMode: true,
+            decodeStrings: false,
+            highWaterMark: 1,
+            write (chunk: SpeechFlowChunk, encoding, callback) {
+                if (self.closing || self.recognizeStream === null) {
+                    callback(new Error("stream already destroyed"))
+                    return
+                }
+                if (chunk.type !== "audio")
+                    callback(new Error("expected audio input chunk"))
+                else if (!Buffer.isBuffer(chunk.payload))
+                    callback(new Error("expected Buffer input chunk"))
+                else {
+                    if (chunk.payload.byteLength > 0) {
+                        self.log("debug", `send data (${chunk.payload.byteLength} bytes)`)
+                        if (chunk.meta.size > 0)
+                            metastore.store(chunk.timestampStart, chunk.timestampEnd, chunk.meta)
+                        try {
+                            self.recognizeStream.write(chunk.payload)
+                        }
+                        catch (error) {
+                            callback(util.ensureError(error, "failed to send to Google Speech"))
+                            return
+                        }
+                    }
+                    callback()
+                }
+            },
+            async final (callback) {
+                /* short-circuiting in case of own closing */
+                if (self.closing || self.recognizeStream === null) {
+                    callback()
+                    return
+                }
+
+                /* close Google Speech stream */
+                try {
+                    self.recognizeStream.end()
+                }
+                catch (error) {
+                    self.log("warning", `error closing Google Speech stream: ${error}`)
+                }
+
+                /* await all read operations */
+                await reads.awaitAll()
+                callback()
+            },
+            read (size) {
+                if (self.closing || self.queue === null) {
+                    this.push(null)
+                    return
+                }
+                reads.add(self.queue.read().then((chunk) => {
+                    if (self.closing || self.queue === null) {
+                        this.push(null)
+                        return
+                    }
+                    if (chunk === null) {
+                        self.log("info", "received EOF signal")
+                        this.push(null)
+                    }
+                    else {
+                        self.log("debug", `received data (${chunk.payload.length} bytes)`)
+                        this.push(chunk)
+                    }
+                }).catch((error: unknown) => {
+                    if (!self.closing && self.queue !== null)
+                        self.log("error", `queue read error: ${util.ensureError(error).message}`)
+                }))
+            }
+        })
+    }
+
+    /* close node */
+    async close () {
+        /* indicate closing first to stop all async operations */
+        this.closing = true
+
+        /* cleanup all timers */
+        if (this.connectionTimeout !== null) {
+            clearTimeout(this.connectionTimeout)
+            this.connectionTimeout = null
+        }
+
+        /* shutdown stream */
+        if (this.stream !== null) {
+            await util.destroyStream(this.stream)
+            this.stream = null
+        }
+
+        /* close Google Speech stream and client */
+        if (this.recognizeStream !== null) {
+            try {
+                this.recognizeStream.removeAllListeners()
+                this.recognizeStream.destroy()
+            }
+            catch (error) {
+                this.log("warning", `error during Google Speech stream cleanup: ${error}`)
+            }
+            this.recognizeStream = null
+        }
+        if (this.client !== null) {
+            try {
+                await this.client.close()
+            }
+            catch (error) {
+                this.log("warning", `error closing Google Speech client: ${error}`)
+            }
+            this.client = null
+        }
+
+        /* signal EOF to any pending read operations */
+        if (this.queue !== null) {
+            this.queue.write(null)
+            this.queue = null
+        }
+    }
+}
```
package/speechflow-cli/src/speechflow-node-t2a-google.ts (new file):

```diff
@@ -0,0 +1,206 @@
+/*
+** SpeechFlow - Speech Processing Flow Graph
+** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+*/
+
+/* standard dependencies */
+import Stream from "node:stream"
+
+/* external dependencies */
+import * as GoogleTTS from "@google-cloud/text-to-speech"
+import { Duration } from "luxon"
+import SpeexResampler from "speex-resampler"
+import * as arktype from "arktype"
+
+/* internal dependencies */
+import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
+import * as util from "./speechflow-util"
+
+/* SpeechFlow node for Google Cloud text-to-speech conversion */
+export default class SpeechFlowNodeT2AGoogle extends SpeechFlowNode {
+    /* declare official node name */
+    public static name = "t2a-google"
+
+    /* internal state */
+    private client: GoogleTTS.TextToSpeechClient | null = null
+    private resampler: SpeexResampler | null = null
+    private closing = false
+
+    /* construct node */
+    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
+        super(id, cfg, opts, args)
+
+        /* declare node configuration parameters */
+        this.configure({
+            key: { type: "string", val: process.env.SPEECHFLOW_GOOGLE_KEY ?? "" },
+            voice: { type: "string", pos: 0, val: "en-US-Neural2-J" },
+            language: { type: "string", pos: 1, val: "en-US" },
+            speed: { type: "number", pos: 2, val: 1.0, match: (n: number) => n >= 0.25 && n <= 4.0 },
+            pitch: { type: "number", pos: 3, val: 0.0, match: (n: number) => n >= -20.0 && n <= 20.0 }
+        })
+
+        /* validate API key */
+        if (this.params.key === "")
+            throw new Error("Google Cloud API credentials JSON key is required")
+
+        /* declare node input/output format */
+        this.input = "text"
+        this.output = "audio"
+    }
+
+    /* one-time status of node */
+    async status () {
+        return {}
+    }
+
+    /* open node */
+    async open () {
+        /* clear destruction flag */
+        this.closing = false
+
+        /* instantiate Google TTS client */
+        const data = util.run("Google Cloud API credentials key", () =>
+            JSON.parse(this.params.key))
+        const credentials = util.importObject("Google Cloud API credentials key",
+            data,
+            arktype.type({
+                project_id: "string",
+                private_key: "string",
+                client_email: "string"
+            })
+        )
+        this.client = new GoogleTTS.TextToSpeechClient({
+            credentials: {
+                private_key: credentials.private_key,
+                client_email: credentials.client_email
+            },
+            projectId: credentials.project_id
+        })
+
+        /* establish resampler from Google TTS's output sample rate
+           to our standard audio sample rate (48KHz) */
+        const googleSampleRate = 24000 /* Google TTS outputs 24kHz for LINEAR16 */
+        this.resampler = new SpeexResampler(1, googleSampleRate, this.config.audioSampleRate, 7)
+
+        /* perform text-to-speech operation with Google Cloud TTS API */
+        const textToSpeech = async (text: string) => {
+            this.log("info", `Google TTS: send text "${text}"`)
+            const [ response ] = await this.client!.synthesizeSpeech({
+                input: { text },
+                voice: {
+                    languageCode: this.params.language,
+                    name: this.params.voice
+                },
+                audioConfig: {
+                    audioEncoding: "LINEAR16",
+                    sampleRateHertz: googleSampleRate,
+                    speakingRate: this.params.speed,
+                    pitch: this.params.pitch
+                }
+            })
+            if (!response.audioContent)
+                throw new Error("no audio content returned from Google TTS")
+
+            /* convert response to buffer */
+            const buffer = Buffer.isBuffer(response.audioContent)
+                ? response.audioContent
+                : Buffer.from(response.audioContent)
+            this.log("info", `Google TTS: received audio (buffer length: ${buffer.byteLength})`)
+
+            /* resample from Google's sample rate to our standard rate */
+            const bufferResampled = this.resampler!.processChunk(buffer)
+            this.log("info", `Google TTS: forwarding resampled audio (buffer length: ${bufferResampled.byteLength})`)
+            return bufferResampled
+        }
+
+        /* create transform stream and connect it to the Google TTS API */
+        const self = this
+        this.stream = new Stream.Transform({
+            writableObjectMode: true,
+            readableObjectMode: true,
+            decodeStrings: false,
+            highWaterMark: 1,
+            async transform (chunk: SpeechFlowChunk, encoding, callback) {
+                if (self.closing)
+                    callback(new Error("stream already destroyed"))
+                else if (Buffer.isBuffer(chunk.payload))
+                    callback(new Error("invalid chunk payload type"))
+                else if (chunk.payload === "") {
+                    /* pass through empty chunks */
+                    this.push(chunk)
+                    callback()
+                }
+                else {
+                    let processTimeout: ReturnType<typeof setTimeout> | null = setTimeout(() => {
+                        processTimeout = null
+                        callback(new Error("Google TTS API timeout"))
+                    }, 60 * 1000)
+                    const clearProcessTimeout = () => {
+                        if (processTimeout !== null) {
+                            clearTimeout(processTimeout)
+                            processTimeout = null
+                        }
+                    }
+                    try {
+                        if (self.closing) {
+                            clearProcessTimeout()
+                            callback(new Error("stream destroyed during processing"))
+                            return
+                        }
+                        const buffer = await textToSpeech(chunk.payload as string)
+                        if (self.closing) {
+                            clearProcessTimeout()
+                            callback(new Error("stream destroyed during processing"))
+                            return
+                        }
+
+                        /* calculate actual audio duration from PCM buffer size */
+                        const durationMs = util.audioBufferDuration(buffer,
+                            self.config.audioSampleRate, self.config.audioBitDepth) * 1000
+
+                        /* create new chunk with recalculated timestamps */
+                        const chunkNew = chunk.clone()
+                        chunkNew.type = "audio"
+                        chunkNew.payload = buffer
+                        chunkNew.timestampEnd = Duration.fromMillis(chunkNew.timestampStart.toMillis() + durationMs)
+                        clearProcessTimeout()
+                        this.push(chunkNew)
+                        callback()
+                    }
+                    catch (error) {
+                        clearProcessTimeout()
+                        callback(util.ensureError(error, "Google TTS processing failed"))
+                    }
+                }
+            },
+            final (callback) {
+                callback()
+            }
+        })
+    }
+
+    /* close node */
+    async close () {
+        /* indicate closing */
+        this.closing = true
+
+        /* shutdown stream */
+        if (this.stream !== null) {
+            await util.destroyStream(this.stream)
+            this.stream = null
+        }
+
+        /* destroy resampler */
+        if (this.resampler !== null)
+            this.resampler = null
+
+        /* destroy Google TTS client */
+        if (this.client !== null) {
+            await this.client.close().catch((error) => {
+                this.log("warning", `error closing Google TTS client: ${error}`)
+            })
+            this.client = null
+        }
+    }
+}
```