speechflow 1.7.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. package/CHANGELOG.md +18 -0
  2. package/README.md +387 -119
  3. package/etc/claude.md +5 -5
  4. package/etc/speechflow.yaml +2 -2
  5. package/package.json +3 -3
  6. package/speechflow-cli/dst/speechflow-main-graph.d.ts +1 -0
  7. package/speechflow-cli/dst/speechflow-main-graph.js +28 -5
  8. package/speechflow-cli/dst/speechflow-main-graph.js.map +1 -1
  9. package/speechflow-cli/dst/speechflow-node-a2a-wav.js +24 -4
  10. package/speechflow-cli/dst/speechflow-node-a2a-wav.js.map +1 -1
  11. package/speechflow-cli/dst/speechflow-node-a2t-google.d.ts +17 -0
  12. package/speechflow-cli/dst/speechflow-node-a2t-google.js +320 -0
  13. package/speechflow-cli/dst/speechflow-node-a2t-google.js.map +1 -0
  14. package/speechflow-cli/dst/speechflow-node-t2a-google.d.ts +15 -0
  15. package/speechflow-cli/dst/speechflow-node-t2a-google.js +218 -0
  16. package/speechflow-cli/dst/speechflow-node-t2a-google.js.map +1 -0
  17. package/speechflow-cli/dst/speechflow-node-t2a-openai.d.ts +15 -0
  18. package/speechflow-cli/dst/speechflow-node-t2a-openai.js +195 -0
  19. package/speechflow-cli/dst/speechflow-node-t2a-openai.js.map +1 -0
  20. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.d.ts +17 -0
  21. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js +608 -0
  22. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js.map +1 -0
  23. package/speechflow-cli/dst/speechflow-node-t2t-amazon.js.map +1 -1
  24. package/speechflow-cli/dst/{speechflow-node-t2t-transformers.d.ts → speechflow-node-t2t-opus.d.ts} +1 -3
  25. package/speechflow-cli/dst/speechflow-node-t2t-opus.js +159 -0
  26. package/speechflow-cli/dst/speechflow-node-t2t-opus.js.map +1 -0
  27. package/speechflow-cli/dst/speechflow-node-t2t-profanity.d.ts +11 -0
  28. package/speechflow-cli/dst/speechflow-node-t2t-profanity.js +118 -0
  29. package/speechflow-cli/dst/speechflow-node-t2t-profanity.js.map +1 -0
  30. package/speechflow-cli/dst/speechflow-node-t2t-punctuation.d.ts +13 -0
  31. package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js +220 -0
  32. package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js.map +1 -0
  33. package/speechflow-cli/dst/{speechflow-node-t2t-openai.d.ts → speechflow-node-t2t-spellcheck.d.ts} +2 -2
  34. package/speechflow-cli/dst/{speechflow-node-t2t-openai.js → speechflow-node-t2t-spellcheck.js} +47 -99
  35. package/speechflow-cli/dst/speechflow-node-t2t-spellcheck.js.map +1 -0
  36. package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js +3 -6
  37. package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js.map +1 -1
  38. package/speechflow-cli/dst/speechflow-node-t2t-summary.d.ts +16 -0
  39. package/speechflow-cli/dst/speechflow-node-t2t-summary.js +241 -0
  40. package/speechflow-cli/dst/speechflow-node-t2t-summary.js.map +1 -0
  41. package/speechflow-cli/dst/{speechflow-node-t2t-ollama.d.ts → speechflow-node-t2t-translate.d.ts} +2 -2
  42. package/speechflow-cli/dst/{speechflow-node-t2t-transformers.js → speechflow-node-t2t-translate.js} +53 -115
  43. package/speechflow-cli/dst/speechflow-node-t2t-translate.js.map +1 -0
  44. package/speechflow-cli/dst/speechflow-node-xio-exec.d.ts +12 -0
  45. package/speechflow-cli/dst/speechflow-node-xio-exec.js +223 -0
  46. package/speechflow-cli/dst/speechflow-node-xio-exec.js.map +1 -0
  47. package/speechflow-cli/dst/speechflow-node-xio-file.d.ts +1 -0
  48. package/speechflow-cli/dst/speechflow-node-xio-file.js +79 -66
  49. package/speechflow-cli/dst/speechflow-node-xio-file.js.map +1 -1
  50. package/speechflow-cli/dst/speechflow-node-xio-vban.d.ts +17 -0
  51. package/speechflow-cli/dst/speechflow-node-xio-vban.js +330 -0
  52. package/speechflow-cli/dst/speechflow-node-xio-vban.js.map +1 -0
  53. package/speechflow-cli/dst/speechflow-node-xio-webrtc.d.ts +39 -0
  54. package/speechflow-cli/dst/speechflow-node-xio-webrtc.js +500 -0
  55. package/speechflow-cli/dst/speechflow-node-xio-webrtc.js.map +1 -0
  56. package/speechflow-cli/dst/speechflow-util-audio.js +4 -5
  57. package/speechflow-cli/dst/speechflow-util-audio.js.map +1 -1
  58. package/speechflow-cli/dst/speechflow-util-error.d.ts +1 -0
  59. package/speechflow-cli/dst/speechflow-util-error.js +5 -0
  60. package/speechflow-cli/dst/speechflow-util-error.js.map +1 -1
  61. package/speechflow-cli/dst/speechflow-util-llm.d.ts +35 -0
  62. package/speechflow-cli/dst/speechflow-util-llm.js +363 -0
  63. package/speechflow-cli/dst/speechflow-util-llm.js.map +1 -0
  64. package/speechflow-cli/dst/speechflow-util.d.ts +1 -0
  65. package/speechflow-cli/dst/speechflow-util.js +1 -0
  66. package/speechflow-cli/dst/speechflow-util.js.map +1 -1
  67. package/speechflow-cli/etc/oxlint.jsonc +2 -1
  68. package/speechflow-cli/package.json +34 -17
  69. package/speechflow-cli/src/lib.d.ts +5 -0
  70. package/speechflow-cli/src/speechflow-main-graph.ts +31 -5
  71. package/speechflow-cli/src/speechflow-node-a2a-wav.ts +24 -4
  72. package/speechflow-cli/src/speechflow-node-a2t-google.ts +322 -0
  73. package/speechflow-cli/src/speechflow-node-t2a-google.ts +206 -0
  74. package/speechflow-cli/src/speechflow-node-t2a-openai.ts +179 -0
  75. package/speechflow-cli/src/speechflow-node-t2a-supertonic.ts +701 -0
  76. package/speechflow-cli/src/speechflow-node-t2t-amazon.ts +2 -1
  77. package/speechflow-cli/src/speechflow-node-t2t-opus.ts +136 -0
  78. package/speechflow-cli/src/speechflow-node-t2t-profanity.ts +93 -0
  79. package/speechflow-cli/src/speechflow-node-t2t-punctuation.ts +201 -0
  80. package/speechflow-cli/src/{speechflow-node-t2t-openai.ts → speechflow-node-t2t-spellcheck.ts} +48 -107
  81. package/speechflow-cli/src/speechflow-node-t2t-subtitle.ts +3 -6
  82. package/speechflow-cli/src/speechflow-node-t2t-summary.ts +229 -0
  83. package/speechflow-cli/src/speechflow-node-t2t-translate.ts +181 -0
  84. package/speechflow-cli/src/speechflow-node-xio-exec.ts +210 -0
  85. package/speechflow-cli/src/speechflow-node-xio-file.ts +92 -79
  86. package/speechflow-cli/src/speechflow-node-xio-vban.ts +325 -0
  87. package/speechflow-cli/src/speechflow-node-xio-webrtc.ts +533 -0
  88. package/speechflow-cli/src/speechflow-util-audio.ts +5 -5
  89. package/speechflow-cli/src/speechflow-util-error.ts +9 -0
  90. package/speechflow-cli/src/speechflow-util-llm.ts +367 -0
  91. package/speechflow-cli/src/speechflow-util.ts +1 -0
  92. package/speechflow-ui-db/package.json +9 -9
  93. package/speechflow-ui-st/package.json +9 -9
  94. package/speechflow-cli/dst/speechflow-node-t2t-ollama.js +0 -293
  95. package/speechflow-cli/dst/speechflow-node-t2t-ollama.js.map +0 -1
  96. package/speechflow-cli/dst/speechflow-node-t2t-openai.js.map +0 -1
  97. package/speechflow-cli/dst/speechflow-node-t2t-transformers.js.map +0 -1
  98. package/speechflow-cli/src/speechflow-node-t2t-ollama.ts +0 -281
  99. package/speechflow-cli/src/speechflow-node-t2t-transformers.ts +0 -247
@@ -94,7 +94,8 @@ export default class SpeechFlowNodeA2AWAV extends SpeechFlowNode {
 
         /* declare node configuration parameters */
         this.configure({
-            mode: { type: "string", pos: 1, val: "encode", match: /^(?:encode|decode)$/ }
+            mode: { type: "string", pos: 0, val: "encode", match: /^(?:encode|decode)$/ },
+            seekable: { type: "boolean", pos: 1, val: false }
         })
 
         /* declare node input/output format */
@@ -106,7 +107,9 @@ export default class SpeechFlowNodeA2AWAV extends SpeechFlowNode {
     async open () {
         /* establish a transform stream */
         const self = this
-        let firstChunk = true
+        let isFirstChunk = true
+        let headerChunkSent: SpeechFlowChunk | null = null
+        let totalSize = 0
         this.stream = new Stream.Transform({
             readableObjectMode: true,
             writableObjectMode: true,
@@ -115,7 +118,7 @@ export default class SpeechFlowNodeA2AWAV extends SpeechFlowNode {
             transform (chunk: SpeechFlowChunk, encoding, callback) {
                 if (!Buffer.isBuffer(chunk.payload))
                     callback(new Error("invalid chunk payload type"))
-                else if (firstChunk) {
+                else if (isFirstChunk) {
                     if (self.params.mode === "encode") {
                         /* convert raw/PCM to WAV/PCM
                            (NOTICE: as this is a continuous stream, the
@@ -132,7 +135,9 @@ export default class SpeechFlowNodeA2AWAV extends SpeechFlowNode {
                         const headerChunk = chunk.clone()
                         headerChunk.payload = headerBuffer
                         this.push(headerChunk)
+                        headerChunkSent = headerChunk
                         this.push(chunk)
+                        totalSize += chunk.payload.byteLength
                         callback()
                     }
                     else if (self.params.mode === "decode") {
@@ -173,21 +178,36 @@ export default class SpeechFlowNodeA2AWAV extends SpeechFlowNode {
                         }
                         chunk.payload = chunk.payload.subarray(44)
                         this.push(chunk)
+                        totalSize += chunk.payload.byteLength
                         callback()
                     }
                     else {
                         callback(new Error(`invalid operation mode "${self.params.mode}"`))
                         return
                     }
-                    isFirstChunk = false
+                    isFirstChunk = false
                 }
                 else {
                     /* pass-through original chunk */
                     this.push(chunk)
+                    totalSize += chunk.payload.byteLength
                     callback()
                 }
             },
             final (callback) {
+                if (self.params.seekable && headerChunkSent !== null) {
+                    self.log("info", "sending updated WAV header")
+                    const headerBuffer = writeWavHeader(totalSize, {
+                        audioFormat: 0x0001 /* PCM */,
+                        channels: self.config.audioChannels,
+                        sampleRate: self.config.audioSampleRate,
+                        bitDepth: self.config.audioBitDepth
+                    })
+                    const headerChunk = headerChunkSent?.clone()
+                    headerChunk.payload = headerBuffer
+                    headerChunk.meta.set("chunk:seek", 0)
+                    this.push(headerChunk)
+                }
                 callback()
             }
         })
@@ -0,0 +1,322 @@
+/*
+** SpeechFlow - Speech Processing Flow Graph
+** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+*/
+
+/* standard dependencies */
+import Stream from "node:stream"
+
+/* external dependencies */
+import * as GoogleSpeech from "@google-cloud/speech"
+import { DateTime, Duration } from "luxon"
+import * as arktype from "arktype"
+
+/* internal dependencies */
+import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
+import * as util from "./speechflow-util"
+
+/* SpeechFlow node for Google Cloud speech-to-text conversion */
+export default class SpeechFlowNodeA2TGoogle extends SpeechFlowNode {
+    /* declare official node name */
+    public static name = "a2t-google"
+
+    /* internal state */
+    private client: GoogleSpeech.SpeechClient | null = null
+    private recognizeStream: ReturnType<GoogleSpeech.SpeechClient["streamingRecognize"]> | null = null
+    private connectionTimeout: ReturnType<typeof setTimeout> | null = null
+    private queue: util.SingleQueue<SpeechFlowChunk | null> | null = null
+    private closing = false
+
+    /* construct node */
+    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
+        super(id, cfg, opts, args)
+
+        /* declare node configuration parameters */
+        this.configure({
+            key: { type: "string", val: process.env.SPEECHFLOW_GOOGLE_KEY ?? "" },
+            model: { type: "string", pos: 0, val: "latest_long" },
+            language: { type: "string", pos: 1, val: "en-US" },
+            interim: { type: "boolean", pos: 2, val: false }
+        })
+
+        /* validate API key */
+        if (this.params.key === "")
+            throw new Error("Google Cloud API credentials JSON key is required")
+
+        /* declare node input/output format */
+        this.input = "audio"
+        this.output = "text"
+    }
+
+    /* one-time status of node */
+    async status () {
+        return {}
+    }
+
+    /* open node */
+    async open () {
+        /* sanity check situation */
+        if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
+            throw new Error("Google Speech node currently supports PCM-S16LE audio only")
+
+        /* clear destruction flag */
+        this.closing = false
+
+        /* create queue for results */
+        this.queue = new util.SingleQueue<SpeechFlowChunk | null>()
+
+        /* create a store for the meta information */
+        const metastore = new util.TimeStore<Map<string, any>>()
+
+        /* instantiate Google Speech client */
+        const data = util.run("Google Cloud API credentials key", () =>
+            JSON.parse(this.params.key))
+        const credentials = util.importObject("Google Cloud API credentials key",
+            data,
+            arktype.type({
+                project_id: "string",
+                private_key: "string",
+                client_email: "string"
+            })
+        )
+        this.client = new GoogleSpeech.SpeechClient({
+            credentials: {
+                private_key: credentials.private_key,
+                client_email: credentials.client_email
+            },
+            projectId: credentials.project_id
+        })
+
+        /* create streaming recognition request */
+        this.recognizeStream = this.client.streamingRecognize({
+            config: {
+                encoding: "LINEAR16",
+                sampleRateHertz: this.config.audioSampleRate,
+                languageCode: this.params.language,
+                model: this.params.model,
+                enableAutomaticPunctuation: true,
+                enableWordTimeOffsets: true
+            },
+            interimResults: this.params.interim
+        })
+
+        /* hook onto Google Speech API events */
+        this.recognizeStream.on("data", (data: GoogleSpeech.protos.google.cloud.speech.v1.IStreamingRecognizeResponse) => {
+            if (this.closing || this.queue === null)
+                return
+            if (!data.results || data.results.length === 0)
+                return
+            for (const result of data.results) {
+                if (!result.alternatives || result.alternatives.length === 0)
+                    continue
+                const alternative = result.alternatives[0]
+                const text = alternative.transcript ?? ""
+                if (text === "")
+                    continue
+                const isFinal = result.isFinal ?? false
+                if (!isFinal && !this.params.interim)
+                    continue
+
+                /* calculate timestamps */
+                let tsStart = Duration.fromMillis(0)
+                let tsEnd = Duration.fromMillis(0)
+
+                /* extract word timing information if available */
+                const words: { word: string, start: Duration, end: Duration }[] = []
+                if (alternative.words && alternative.words.length > 0) {
+                    for (const wordInfo of alternative.words) {
+                        const wordStart = wordInfo.startTime
+                            ? Duration.fromMillis(
+                                (Number(wordInfo.startTime.seconds ?? 0) * 1000) +
+                                (Number(wordInfo.startTime.nanos ?? 0) / 1000000)
+                            ).plus(this.timeZeroOffset)
+                            : Duration.fromMillis(0)
+                        const wordEnd = wordInfo.endTime
+                            ? Duration.fromMillis(
+                                (Number(wordInfo.endTime.seconds ?? 0) * 1000) +
+                                (Number(wordInfo.endTime.nanos ?? 0) / 1000000)
+                            ).plus(this.timeZeroOffset)
+                            : Duration.fromMillis(0)
+                        words.push({
+                            word: wordInfo.word ?? "",
+                            start: wordStart,
+                            end: wordEnd
+                        })
+                    }
+                    if (words.length > 0) {
+                        tsStart = words[0].start
+                        tsEnd = words[words.length - 1].end
+                    }
+                }
+                else {
+                    /* fallback: use result timing */
+                    const resultEnd = result.resultEndTime
+                    if (resultEnd) {
+                        tsEnd = Duration.fromMillis(
+                            (Number(resultEnd.seconds ?? 0) * 1000) +
+                            (Number(resultEnd.nanos ?? 0) / 1000000)
+                        ).plus(this.timeZeroOffset)
+                    }
+                }
+                this.log("info", `text received (start: ${tsStart.toMillis()}ms, ` +
+                    `end: ${tsEnd.toMillis()}ms, ` +
+                    `kind: ${isFinal ? "final" : "intermediate"}): ` +
+                    `"${text}"`)
+
+                /* fetch and merge meta information */
+                const metas = metastore.fetch(tsStart, tsEnd)
+                const meta = metas.toReversed().reduce((prev: Map<string, any>, curr: Map<string, any>) => {
+                    curr.forEach((val, key) => { prev.set(key, val) })
+                    return prev
+                }, new Map<string, any>())
+                metastore.prune(tsStart)
+
+                /* add word timing to meta */
+                if (words.length > 0)
+                    meta.set("words", words)
+
+                /* create and enqueue chunk */
+                const chunk = new SpeechFlowChunk(tsStart, tsEnd,
+                    isFinal ? "final" : "intermediate", "text", text, meta)
+                this.queue.write(chunk)
+            }
+        })
+        this.recognizeStream.on("error", (error: Error) => {
+            this.log("error", `error: ${error.message}`)
+            if (!this.closing && this.queue !== null)
+                this.queue.write(null)
+            this.emit("error", error)
+        })
+        this.recognizeStream.on("end", () => {
+            this.log("info", "stream ended")
+            if (!this.closing && this.queue !== null)
+                this.queue.write(null)
+        })
+
+        /* remember opening time to receive time zero offset */
+        this.timeOpen = DateTime.now()
+
+        /* provide Duplex stream and internally attach to Google Speech API */
+        const self = this
+        const reads = new util.PromiseSet<void>()
+        this.stream = new Stream.Duplex({
+            writableObjectMode: true,
+            readableObjectMode: true,
+            decodeStrings: false,
+            highWaterMark: 1,
+            write (chunk: SpeechFlowChunk, encoding, callback) {
+                if (self.closing || self.recognizeStream === null) {
+                    callback(new Error("stream already destroyed"))
+                    return
+                }
+                if (chunk.type !== "audio")
+                    callback(new Error("expected audio input chunk"))
+                else if (!Buffer.isBuffer(chunk.payload))
+                    callback(new Error("expected Buffer input chunk"))
+                else {
+                    if (chunk.payload.byteLength > 0) {
+                        self.log("debug", `send data (${chunk.payload.byteLength} bytes)`)
+                        if (chunk.meta.size > 0)
+                            metastore.store(chunk.timestampStart, chunk.timestampEnd, chunk.meta)
+                        try {
+                            self.recognizeStream.write(chunk.payload)
+                        }
+                        catch (error) {
+                            callback(util.ensureError(error, "failed to send to Google Speech"))
+                            return
+                        }
+                    }
+                    callback()
+                }
+            },
+            async final (callback) {
+                /* short-circuiting in case of own closing */
+                if (self.closing || self.recognizeStream === null) {
+                    callback()
+                    return
+                }
+
+                /* close Google Speech stream */
+                try {
+                    self.recognizeStream.end()
+                }
+                catch (error) {
+                    self.log("warning", `error closing Google Speech stream: ${error}`)
+                }
+
+                /* await all read operations */
+                await reads.awaitAll()
+                callback()
+            },
+            read (size) {
+                if (self.closing || self.queue === null) {
+                    this.push(null)
+                    return
+                }
+                reads.add(self.queue.read().then((chunk) => {
+                    if (self.closing || self.queue === null) {
+                        this.push(null)
+                        return
+                    }
+                    if (chunk === null) {
+                        self.log("info", "received EOF signal")
+                        this.push(null)
+                    }
+                    else {
+                        self.log("debug", `received data (${chunk.payload.length} bytes)`)
+                        this.push(chunk)
+                    }
+                }).catch((error: unknown) => {
+                    if (!self.closing && self.queue !== null)
+                        self.log("error", `queue read error: ${util.ensureError(error).message}`)
+                }))
+            }
+        })
+    }
+
+    /* close node */
+    async close () {
+        /* indicate closing first to stop all async operations */
+        this.closing = true
+
+        /* cleanup all timers */
+        if (this.connectionTimeout !== null) {
+            clearTimeout(this.connectionTimeout)
+            this.connectionTimeout = null
+        }
+
+        /* shutdown stream */
+        if (this.stream !== null) {
+            await util.destroyStream(this.stream)
+            this.stream = null
+        }
+
+        /* close Google Speech stream and client */
+        if (this.recognizeStream !== null) {
+            try {
+                this.recognizeStream.removeAllListeners()
+                this.recognizeStream.destroy()
+            }
+            catch (error) {
+                this.log("warning", `error during Google Speech stream cleanup: ${error}`)
+            }
+            this.recognizeStream = null
+        }
+        if (this.client !== null) {
+            try {
+                await this.client.close()
+            }
+            catch (error) {
+                this.log("warning", `error closing Google Speech client: ${error}`)
+            }
+            this.client = null
+        }
+
+        /* signal EOF to any pending read operations */
+        if (this.queue !== null) {
+            this.queue.write(null)
+            this.queue = null
+        }
+    }
+}
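One conversion recurs throughout the a2t-google node above, for word start times, word end times, and the result end time: Google returns protobuf-style durations as { seconds, nanos } pairs, where seconds may arrive as a number, Long, or string, and the node maps them to luxon Durations shifted by its stream-relative time zero offset. A condensed sketch of that inlined logic, with hypothetical names:

    import { Duration } from "luxon"

    /* hypothetical helper condensing the conversion the node inlines:
       protobuf duration ({ seconds, nanos }) -> luxon Duration */
    type PbDuration = { seconds?: number | string | null, nanos?: number | null }
    function pbToDuration (d: PbDuration | null | undefined, zeroOffset: Duration): Duration {
        if (!d)
            return Duration.fromMillis(0)
        const ms = (Number(d.seconds ?? 0) * 1000) +  /* whole seconds */
                   (Number(d.nanos ?? 0) / 1e6)       /* nanos to ms   */
        return Duration.fromMillis(ms).plus(zeroOffset)
    }

The Number() wrapping absorbs the Long and string representations gRPC may deliver for 64-bit fields, which is why the node applies it to every seconds/nanos access.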
@@ -0,0 +1,206 @@
+/*
+** SpeechFlow - Speech Processing Flow Graph
+** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+*/
+
+/* standard dependencies */
+import Stream from "node:stream"
+
+/* external dependencies */
+import * as GoogleTTS from "@google-cloud/text-to-speech"
+import { Duration } from "luxon"
+import SpeexResampler from "speex-resampler"
+import * as arktype from "arktype"
+
+/* internal dependencies */
+import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
+import * as util from "./speechflow-util"
+
+/* SpeechFlow node for Google Cloud text-to-speech conversion */
+export default class SpeechFlowNodeT2AGoogle extends SpeechFlowNode {
+    /* declare official node name */
+    public static name = "t2a-google"
+
+    /* internal state */
+    private client: GoogleTTS.TextToSpeechClient | null = null
+    private resampler: SpeexResampler | null = null
+    private closing = false
+
+    /* construct node */
+    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
+        super(id, cfg, opts, args)
+
+        /* declare node configuration parameters */
+        this.configure({
+            key: { type: "string", val: process.env.SPEECHFLOW_GOOGLE_KEY ?? "" },
+            voice: { type: "string", pos: 0, val: "en-US-Neural2-J" },
+            language: { type: "string", pos: 1, val: "en-US" },
+            speed: { type: "number", pos: 2, val: 1.0, match: (n: number) => n >= 0.25 && n <= 4.0 },
+            pitch: { type: "number", pos: 3, val: 0.0, match: (n: number) => n >= -20.0 && n <= 20.0 }
+        })
+
+        /* validate API key */
+        if (this.params.key === "")
+            throw new Error("Google Cloud API credentials JSON key is required")
+
+        /* declare node input/output format */
+        this.input = "text"
+        this.output = "audio"
+    }
+
+    /* one-time status of node */
+    async status () {
+        return {}
+    }
+
+    /* open node */
+    async open () {
+        /* clear destruction flag */
+        this.closing = false
+
+        /* instantiate Google TTS client */
+        const data = util.run("Google Cloud API credentials key", () =>
+            JSON.parse(this.params.key))
+        const credentials = util.importObject("Google Cloud API credentials key",
+            data,
+            arktype.type({
+                project_id: "string",
+                private_key: "string",
+                client_email: "string"
+            })
+        )
+        this.client = new GoogleTTS.TextToSpeechClient({
+            credentials: {
+                private_key: credentials.private_key,
+                client_email: credentials.client_email
+            },
+            projectId: credentials.project_id
+        })
+
+        /* establish resampler from Google TTS's output sample rate
+           to our standard audio sample rate (48KHz) */
+        const googleSampleRate = 24000 /* Google TTS outputs 24kHz for LINEAR16 */
+        this.resampler = new SpeexResampler(1, googleSampleRate, this.config.audioSampleRate, 7)
+
+        /* perform text-to-speech operation with Google Cloud TTS API */
+        const textToSpeech = async (text: string) => {
+            this.log("info", `Google TTS: send text "${text}"`)
+            const [ response ] = await this.client!.synthesizeSpeech({
+                input: { text },
+                voice: {
+                    languageCode: this.params.language,
+                    name: this.params.voice
+                },
+                audioConfig: {
+                    audioEncoding: "LINEAR16",
+                    sampleRateHertz: googleSampleRate,
+                    speakingRate: this.params.speed,
+                    pitch: this.params.pitch
+                }
+            })
+            if (!response.audioContent)
+                throw new Error("no audio content returned from Google TTS")
+
+            /* convert response to buffer */
+            const buffer = Buffer.isBuffer(response.audioContent)
+                ? response.audioContent
+                : Buffer.from(response.audioContent)
+            this.log("info", `Google TTS: received audio (buffer length: ${buffer.byteLength})`)
+
+            /* resample from Google's sample rate to our standard rate */
+            const bufferResampled = this.resampler!.processChunk(buffer)
+            this.log("info", `Google TTS: forwarding resampled audio (buffer length: ${bufferResampled.byteLength})`)
+            return bufferResampled
+        }
+
+        /* create transform stream and connect it to the Google TTS API */
+        const self = this
+        this.stream = new Stream.Transform({
+            writableObjectMode: true,
+            readableObjectMode: true,
+            decodeStrings: false,
+            highWaterMark: 1,
+            async transform (chunk: SpeechFlowChunk, encoding, callback) {
+                if (self.closing)
+                    callback(new Error("stream already destroyed"))
+                else if (Buffer.isBuffer(chunk.payload))
+                    callback(new Error("invalid chunk payload type"))
+                else if (chunk.payload === "") {
+                    /* pass through empty chunks */
+                    this.push(chunk)
+                    callback()
+                }
+                else {
+                    let processTimeout: ReturnType<typeof setTimeout> | null = setTimeout(() => {
+                        processTimeout = null
+                        callback(new Error("Google TTS API timeout"))
+                    }, 60 * 1000)
+                    const clearProcessTimeout = () => {
+                        if (processTimeout !== null) {
+                            clearTimeout(processTimeout)
+                            processTimeout = null
+                        }
+                    }
+                    try {
+                        if (self.closing) {
+                            clearProcessTimeout()
+                            callback(new Error("stream destroyed during processing"))
+                            return
+                        }
+                        const buffer = await textToSpeech(chunk.payload as string)
+                        if (self.closing) {
+                            clearProcessTimeout()
+                            callback(new Error("stream destroyed during processing"))
+                            return
+                        }
+
+                        /* calculate actual audio duration from PCM buffer size */
+                        const durationMs = util.audioBufferDuration(buffer,
+                            self.config.audioSampleRate, self.config.audioBitDepth) * 1000
+
+                        /* create new chunk with recalculated timestamps */
+                        const chunkNew = chunk.clone()
+                        chunkNew.type = "audio"
+                        chunkNew.payload = buffer
+                        chunkNew.timestampEnd = Duration.fromMillis(chunkNew.timestampStart.toMillis() + durationMs)
+                        clearProcessTimeout()
+                        this.push(chunkNew)
+                        callback()
+                    }
+                    catch (error) {
+                        clearProcessTimeout()
+                        callback(util.ensureError(error, "Google TTS processing failed"))
+                    }
+                }
+            },
+            final (callback) {
+                callback()
+            }
+        })
+    }
+
+    /* close node */
+    async close () {
+        /* indicate closing */
+        this.closing = true
+
+        /* shutdown stream */
+        if (this.stream !== null) {
+            await util.destroyStream(this.stream)
+            this.stream = null
+        }
+
+        /* destroy resampler */
+        if (this.resampler !== null)
+            this.resampler = null
+
+        /* destroy Google TTS client */
+        if (this.client !== null) {
+            await this.client.close().catch((error) => {
+                this.log("warning", `error closing Google TTS client: ${error}`)
+            })
+            this.client = null
+        }
+    }
+}
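The t2a-google node derives each chunk's end timestamp from the size of the synthesized PCM buffer instead of trusting any duration reported by the API. The util.audioBufferDuration utility is not shown in this diff; a minimal sketch of the math it plausibly performs, assuming mono audio (matching the single-channel resampler above):

    /* hypothetical sketch of the duration math: for mono PCM,
       seconds = bytes / (sampleRate * bytesPerSample) */
    function audioBufferDuration (buffer: Buffer, sampleRate: number, bitDepth: number): number {
        const bytesPerSample = bitDepth / 8
        return buffer.byteLength / (sampleRate * bytesPerSample)
    }

    /* e.g. 96000 bytes of S16LE mono at 48000 Hz:
       96000 / (48000 * 2) = 1.0 second */

Multiplying the result by 1000, as transform() above does, yields the millisecond offset added to timestampStart to form timestampEnd.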