speechflow 1.4.5 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. package/CHANGELOG.md +28 -0
  2. package/README.md +220 -7
  3. package/etc/claude.md +70 -0
  4. package/etc/speechflow.yaml +5 -3
  5. package/etc/stx.conf +7 -0
  6. package/package.json +7 -6
  7. package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.d.ts +1 -0
  8. package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js +155 -0
  9. package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js.map +1 -0
  10. package/speechflow-cli/dst/speechflow-node-a2a-compressor.d.ts +15 -0
  11. package/speechflow-cli/dst/speechflow-node-a2a-compressor.js +287 -0
  12. package/speechflow-cli/dst/speechflow-node-a2a-compressor.js.map +1 -0
  13. package/speechflow-cli/dst/speechflow-node-a2a-dynamics-wt.d.ts +1 -0
  14. package/speechflow-cli/dst/speechflow-node-a2a-dynamics-wt.js +208 -0
  15. package/speechflow-cli/dst/speechflow-node-a2a-dynamics-wt.js.map +1 -0
  16. package/speechflow-cli/dst/speechflow-node-a2a-dynamics.d.ts +15 -0
  17. package/speechflow-cli/dst/speechflow-node-a2a-dynamics.js +312 -0
  18. package/speechflow-cli/dst/speechflow-node-a2a-dynamics.js.map +1 -0
  19. package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.d.ts +1 -0
  20. package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js +161 -0
  21. package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js.map +1 -0
  22. package/speechflow-cli/dst/speechflow-node-a2a-expander.d.ts +13 -0
  23. package/speechflow-cli/dst/speechflow-node-a2a-expander.js +208 -0
  24. package/speechflow-cli/dst/speechflow-node-a2a-expander.js.map +1 -0
  25. package/speechflow-cli/dst/speechflow-node-a2a-ffmpeg.js +13 -3
  26. package/speechflow-cli/dst/speechflow-node-a2a-ffmpeg.js.map +1 -1
  27. package/speechflow-cli/dst/speechflow-node-a2a-filler.d.ts +14 -0
  28. package/speechflow-cli/dst/speechflow-node-a2a-filler.js +233 -0
  29. package/speechflow-cli/dst/speechflow-node-a2a-filler.js.map +1 -0
  30. package/speechflow-cli/dst/speechflow-node-a2a-gain.d.ts +12 -0
  31. package/speechflow-cli/dst/speechflow-node-a2a-gain.js +125 -0
  32. package/speechflow-cli/dst/speechflow-node-a2a-gain.js.map +1 -0
  33. package/speechflow-cli/dst/speechflow-node-a2a-gender.d.ts +0 -1
  34. package/speechflow-cli/dst/speechflow-node-a2a-gender.js +28 -12
  35. package/speechflow-cli/dst/speechflow-node-a2a-gender.js.map +1 -1
  36. package/speechflow-cli/dst/speechflow-node-a2a-meter.d.ts +1 -0
  37. package/speechflow-cli/dst/speechflow-node-a2a-meter.js +12 -8
  38. package/speechflow-cli/dst/speechflow-node-a2a-meter.js.map +1 -1
  39. package/speechflow-cli/dst/speechflow-node-a2a-mute.js +2 -1
  40. package/speechflow-cli/dst/speechflow-node-a2a-mute.js.map +1 -1
  41. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise-wt.d.ts +1 -0
  42. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise-wt.js +55 -0
  43. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise-wt.js.map +1 -0
  44. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise.d.ts +14 -0
  45. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise.js +184 -0
  46. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise.js.map +1 -0
  47. package/speechflow-cli/dst/speechflow-node-a2a-speex.d.ts +14 -0
  48. package/speechflow-cli/dst/speechflow-node-a2a-speex.js +156 -0
  49. package/speechflow-cli/dst/speechflow-node-a2a-speex.js.map +1 -0
  50. package/speechflow-cli/dst/speechflow-node-a2a-vad.js +3 -3
  51. package/speechflow-cli/dst/speechflow-node-a2a-vad.js.map +1 -1
  52. package/speechflow-cli/dst/speechflow-node-a2a-wav.js +22 -17
  53. package/speechflow-cli/dst/speechflow-node-a2a-wav.js.map +1 -1
  54. package/speechflow-cli/dst/speechflow-node-a2t-awstranscribe.d.ts +18 -0
  55. package/speechflow-cli/dst/speechflow-node-a2t-awstranscribe.js +317 -0
  56. package/speechflow-cli/dst/speechflow-node-a2t-awstranscribe.js.map +1 -0
  57. package/speechflow-cli/dst/speechflow-node-a2t-deepgram.js +15 -13
  58. package/speechflow-cli/dst/speechflow-node-a2t-deepgram.js.map +1 -1
  59. package/speechflow-cli/dst/speechflow-node-a2t-openaitranscribe.d.ts +19 -0
  60. package/speechflow-cli/dst/speechflow-node-a2t-openaitranscribe.js +351 -0
  61. package/speechflow-cli/dst/speechflow-node-a2t-openaitranscribe.js.map +1 -0
  62. package/speechflow-cli/dst/speechflow-node-t2a-awspolly.d.ts +16 -0
  63. package/speechflow-cli/dst/speechflow-node-t2a-awspolly.js +171 -0
  64. package/speechflow-cli/dst/speechflow-node-t2a-awspolly.js.map +1 -0
  65. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js +19 -14
  66. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js.map +1 -1
  67. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js +11 -6
  68. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js.map +1 -1
  69. package/speechflow-cli/dst/speechflow-node-t2t-awstranslate.d.ts +13 -0
  70. package/speechflow-cli/dst/speechflow-node-t2t-awstranslate.js +141 -0
  71. package/speechflow-cli/dst/speechflow-node-t2t-awstranslate.js.map +1 -0
  72. package/speechflow-cli/dst/speechflow-node-t2t-deepl.js +13 -15
  73. package/speechflow-cli/dst/speechflow-node-t2t-deepl.js.map +1 -1
  74. package/speechflow-cli/dst/speechflow-node-t2t-format.js +10 -15
  75. package/speechflow-cli/dst/speechflow-node-t2t-format.js.map +1 -1
  76. package/speechflow-cli/dst/speechflow-node-t2t-ollama.js +44 -31
  77. package/speechflow-cli/dst/speechflow-node-t2t-ollama.js.map +1 -1
  78. package/speechflow-cli/dst/speechflow-node-t2t-openai.js +44 -45
  79. package/speechflow-cli/dst/speechflow-node-t2t-openai.js.map +1 -1
  80. package/speechflow-cli/dst/speechflow-node-t2t-sentence.js +8 -8
  81. package/speechflow-cli/dst/speechflow-node-t2t-sentence.js.map +1 -1
  82. package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js +10 -12
  83. package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js.map +1 -1
  84. package/speechflow-cli/dst/speechflow-node-t2t-transformers.js +22 -27
  85. package/speechflow-cli/dst/speechflow-node-t2t-transformers.js.map +1 -1
  86. package/speechflow-cli/dst/speechflow-node-x2x-filter.d.ts +1 -0
  87. package/speechflow-cli/dst/speechflow-node-x2x-filter.js +50 -15
  88. package/speechflow-cli/dst/speechflow-node-x2x-filter.js.map +1 -1
  89. package/speechflow-cli/dst/speechflow-node-x2x-trace.js +17 -18
  90. package/speechflow-cli/dst/speechflow-node-x2x-trace.js.map +1 -1
  91. package/speechflow-cli/dst/speechflow-node-xio-device.js +13 -21
  92. package/speechflow-cli/dst/speechflow-node-xio-device.js.map +1 -1
  93. package/speechflow-cli/dst/speechflow-node-xio-mqtt.d.ts +1 -0
  94. package/speechflow-cli/dst/speechflow-node-xio-mqtt.js +22 -16
  95. package/speechflow-cli/dst/speechflow-node-xio-mqtt.js.map +1 -1
  96. package/speechflow-cli/dst/speechflow-node-xio-websocket.js +19 -19
  97. package/speechflow-cli/dst/speechflow-node-xio-websocket.js.map +1 -1
  98. package/speechflow-cli/dst/speechflow-node.d.ts +6 -3
  99. package/speechflow-cli/dst/speechflow-node.js +13 -2
  100. package/speechflow-cli/dst/speechflow-node.js.map +1 -1
  101. package/speechflow-cli/dst/speechflow-utils-audio-wt.d.ts +1 -0
  102. package/speechflow-cli/dst/speechflow-utils-audio-wt.js +124 -0
  103. package/speechflow-cli/dst/speechflow-utils-audio-wt.js.map +1 -0
  104. package/speechflow-cli/dst/speechflow-utils-audio.d.ts +13 -0
  105. package/speechflow-cli/dst/speechflow-utils-audio.js +137 -0
  106. package/speechflow-cli/dst/speechflow-utils-audio.js.map +1 -0
  107. package/speechflow-cli/dst/speechflow-utils.d.ts +18 -0
  108. package/speechflow-cli/dst/speechflow-utils.js +123 -35
  109. package/speechflow-cli/dst/speechflow-utils.js.map +1 -1
  110. package/speechflow-cli/dst/speechflow.js +69 -14
  111. package/speechflow-cli/dst/speechflow.js.map +1 -1
  112. package/speechflow-cli/etc/oxlint.jsonc +112 -11
  113. package/speechflow-cli/etc/stx.conf +2 -2
  114. package/speechflow-cli/etc/tsconfig.json +1 -1
  115. package/speechflow-cli/package.d/@shiguredo+rnnoise-wasm+2025.1.5.patch +25 -0
  116. package/speechflow-cli/package.json +102 -94
  117. package/speechflow-cli/src/lib.d.ts +24 -0
  118. package/speechflow-cli/src/speechflow-node-a2a-compressor-wt.ts +151 -0
  119. package/speechflow-cli/src/speechflow-node-a2a-compressor.ts +303 -0
  120. package/speechflow-cli/src/speechflow-node-a2a-expander-wt.ts +158 -0
  121. package/speechflow-cli/src/speechflow-node-a2a-expander.ts +212 -0
  122. package/speechflow-cli/src/speechflow-node-a2a-ffmpeg.ts +13 -3
  123. package/speechflow-cli/src/speechflow-node-a2a-filler.ts +223 -0
  124. package/speechflow-cli/src/speechflow-node-a2a-gain.ts +98 -0
  125. package/speechflow-cli/src/speechflow-node-a2a-gender.ts +31 -17
  126. package/speechflow-cli/src/speechflow-node-a2a-meter.ts +13 -9
  127. package/speechflow-cli/src/speechflow-node-a2a-mute.ts +3 -2
  128. package/speechflow-cli/src/speechflow-node-a2a-rnnoise-wt.ts +62 -0
  129. package/speechflow-cli/src/speechflow-node-a2a-rnnoise.ts +164 -0
  130. package/speechflow-cli/src/speechflow-node-a2a-speex.ts +137 -0
  131. package/speechflow-cli/src/speechflow-node-a2a-vad.ts +3 -3
  132. package/speechflow-cli/src/speechflow-node-a2a-wav.ts +20 -13
  133. package/speechflow-cli/src/speechflow-node-a2t-awstranscribe.ts +308 -0
  134. package/speechflow-cli/src/speechflow-node-a2t-deepgram.ts +15 -13
  135. package/speechflow-cli/src/speechflow-node-a2t-openaitranscribe.ts +337 -0
  136. package/speechflow-cli/src/speechflow-node-t2a-awspolly.ts +187 -0
  137. package/speechflow-cli/src/speechflow-node-t2a-elevenlabs.ts +19 -14
  138. package/speechflow-cli/src/speechflow-node-t2a-kokoro.ts +12 -7
  139. package/speechflow-cli/src/speechflow-node-t2t-awstranslate.ts +152 -0
  140. package/speechflow-cli/src/speechflow-node-t2t-deepl.ts +13 -15
  141. package/speechflow-cli/src/speechflow-node-t2t-format.ts +10 -15
  142. package/speechflow-cli/src/speechflow-node-t2t-ollama.ts +55 -42
  143. package/speechflow-cli/src/speechflow-node-t2t-openai.ts +58 -58
  144. package/speechflow-cli/src/speechflow-node-t2t-sentence.ts +10 -10
  145. package/speechflow-cli/src/speechflow-node-t2t-subtitle.ts +15 -16
  146. package/speechflow-cli/src/speechflow-node-t2t-transformers.ts +27 -32
  147. package/speechflow-cli/src/speechflow-node-x2x-filter.ts +20 -16
  148. package/speechflow-cli/src/speechflow-node-x2x-trace.ts +20 -19
  149. package/speechflow-cli/src/speechflow-node-xio-device.ts +15 -23
  150. package/speechflow-cli/src/speechflow-node-xio-mqtt.ts +23 -16
  151. package/speechflow-cli/src/speechflow-node-xio-websocket.ts +19 -19
  152. package/speechflow-cli/src/speechflow-node.ts +21 -8
  153. package/speechflow-cli/src/speechflow-utils-audio-wt.ts +172 -0
  154. package/speechflow-cli/src/speechflow-utils-audio.ts +147 -0
  155. package/speechflow-cli/src/speechflow-utils.ts +125 -32
  156. package/speechflow-cli/src/speechflow.ts +74 -17
  157. package/speechflow-ui-db/dst/index.js +31 -31
  158. package/speechflow-ui-db/etc/eslint.mjs +0 -1
  159. package/speechflow-ui-db/etc/tsc-client.json +3 -3
  160. package/speechflow-ui-db/package.json +11 -10
  161. package/speechflow-ui-db/src/app.vue +20 -6
  162. package/speechflow-ui-st/dst/index.js +26 -26
  163. package/speechflow-ui-st/etc/eslint.mjs +0 -1
  164. package/speechflow-ui-st/etc/tsc-client.json +3 -3
  165. package/speechflow-ui-st/package.json +11 -10
  166. package/speechflow-ui-st/src/app.vue +5 -12
@@ -0,0 +1,337 @@
1
+ /*
2
+ ** SpeechFlow - Speech Processing Flow Graph
3
+ ** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
4
+ ** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
5
+ */
6
+
7
+ /* standard dependencies */
8
+ import Stream from "node:stream"
9
+
10
+ /* external dependencies */
11
+ import OpenAI from "openai"
12
+ import { DateTime } from "luxon"
13
+ import SpeexResampler from "speex-resampler"
14
+ import ws from "ws"
15
+
16
+ /* internal dependencies */
17
+ import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
18
+ import * as utils from "./speechflow-utils"
19
+
20
+ /* SpeechFlow node for OpenAI Transcribe speech-to-text conversion */
21
+ export default class SpeechFlowNodeOpenAITranscribe extends SpeechFlowNode {
22
+ /* declare official node name */
23
+ public static name = "openaitranscribe"
24
+
25
+ /* internal state */
26
+ private static speexInitialized = false
27
+ private openai: OpenAI | null = null
28
+ private ws: ws.WebSocket | null = null
29
+ private queue: utils.SingleQueue<SpeechFlowChunk | null> | null = null
30
+ private resampler: SpeexResampler | null = null
31
+ private destroyed = false
32
+ private connectionTimeout: ReturnType<typeof setTimeout> | null = null
33
+
34
+ /* construct node */
35
+ constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
36
+ super(id, cfg, opts, args)
37
+
38
+ /* declare node configuration parameters */
39
+ this.configure({
40
+ key: { type: "string", val: process.env.SPEECHFLOW_OPENAI_KEY },
41
+ api: { type: "string", val: "https://api.openai.com/v1", match: /^https?:\/\/.+/ },
42
+ model: { type: "string", val: "gpt-4o-mini-transcribe" },
43
+ language: { type: "string", val: "de", match: /^(?:en|de)$/ },
44
+ interim: { type: "boolean", val: false }
45
+ })
46
+
47
+ /* declare node input/output format */
48
+ this.input = "audio"
49
+ this.output = "text"
50
+ }
51
+
52
+ /* one-time status of node */
53
+ async status () {
54
+ return {}
55
+ }
56
+
57
+ /* open node */
58
+ async open () {
59
+ /* sanity check situation */
60
+ if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
61
+ throw new Error("OpenAI transcribe node currently supports PCM-S16LE audio only")
62
+
63
+ /* clear destruction flag */
64
+ this.destroyed = false
65
+
66
+ /* create queue for results */
67
+ this.queue = new utils.SingleQueue<SpeechFlowChunk | null>()
68
+
69
+ /* create a store for the meta information */
70
+ const metastore = new utils.TimeStore<Map<string, any>>()
71
+
72
+ /* establish resampler from our standard audio sample rate (48Khz)
73
+ to OpenAI's maximum 24Khz input sample rate */
74
+ if (!SpeechFlowNodeOpenAITranscribe.speexInitialized) {
75
+ /* at least once initialize resampler */
76
+ await SpeexResampler.initPromise
77
+ SpeechFlowNodeOpenAITranscribe.speexInitialized = true
78
+ }
79
+ this.resampler = new SpeexResampler(1, this.config.audioSampleRate, 24000, 7)
80
+
81
+ /* instantiate OpenAI API */
82
+ this.openai = new OpenAI({
83
+ baseURL: this.params.api,
84
+ apiKey: this.params.key,
85
+ dangerouslyAllowBrowser: true
86
+ })
87
+
88
+ /* open the WebSocket connection for streaming */
89
+ const url = `${this.params.api.replace(/^http/, "ws")}/realtime?intent=transcription`
90
+ this.ws = new ws.WebSocket(url, {
91
+ headers: {
92
+ Authorization: `Bearer ${this.params.key}`,
93
+ "OpenAI-Beta": "realtime=v1"
94
+ }
95
+ })
96
+ const sendMessage = (obj: any) => {
97
+ this.ws?.send(JSON.stringify(obj))
98
+ }
99
+
100
+ /* wait for OpenAI API to be available */
101
+ await new Promise((resolve, reject) => {
102
+ this.connectionTimeout = setTimeout(() => {
103
+ if (this.connectionTimeout !== null) {
104
+ this.connectionTimeout = null
105
+ reject(new Error("OpenAI: timeout waiting for connection open"))
106
+ }
107
+ }, 8000)
108
+ this.ws!.once("open", () => {
109
+ this.log("info", "connection open")
110
+ if (this.connectionTimeout !== null) {
111
+ clearTimeout(this.connectionTimeout)
112
+ this.connectionTimeout = null
113
+ }
114
+ resolve(true)
115
+ })
116
+ this.ws!.once("error", (err) => {
117
+ if (this.connectionTimeout !== null) {
118
+ clearTimeout(this.connectionTimeout)
119
+ this.connectionTimeout = null
120
+ }
121
+ reject(err)
122
+ })
123
+ })
124
+
125
+ /* configure session */
126
+ sendMessage({
127
+ type: "transcription_session.update",
128
+ session: {
129
+ input_audio_format: "pcm16",
130
+ input_audio_transcription: {
131
+ model: this.params.model,
132
+ language: this.params.language
133
+ },
134
+ turn_detection: {
135
+ type: "server_vad",
136
+ threshold: 0.5,
137
+ prefix_padding_ms: 300,
138
+ silence_duration_ms: 500
139
+ }
140
+ }
141
+ })
142
+
143
+ /* hook onto session events */
144
+ this.ws.on("open", () => {
145
+ this.log("info", "WebSocket connection opened")
146
+ sendMessage({ type: "transcription.create" })
147
+ })
148
+ this.ws.on("close", () => {
149
+ this.log("info", "WebSocket connection closed")
150
+ this.queue!.write(null)
151
+ })
152
+ this.ws.on("error", (err) => {
153
+ this.log("error", `WebSocket connection error: ${err}`)
154
+ })
155
+ let text = ""
156
+ this.ws.on("message", (data) => {
157
+ let ev: any
158
+ try {
159
+ ev = JSON.parse(data.toString())
160
+ }
161
+ catch (err) {
162
+ this.log("warning", `failed to parse WebSocket message: ${err}`)
163
+ return
164
+ }
165
+ if (!(typeof ev === "object" && ev !== null)) {
166
+ this.log("warning", "received invalid WebSocket message")
167
+ return
168
+ }
169
+ switch (ev.type) {
170
+ case "transcription_session.created":
171
+ break
172
+ case "conversation.item.created":
173
+ text = ""
174
+ break
175
+ case "conversation.item.input_audio_transcription.delta": {
176
+ text += ev.delta as string
177
+ if (this.params.interim) {
178
+ const start = DateTime.now().diff(this.timeOpen!) // FIXME: OpenAI does not provide timestamps
179
+ const end = start // FIXME: OpenAI does not provide timestamps
180
+ const metas = metastore.fetch(start, end)
181
+ const meta = metas.reduce((prev: Map<string, any>, curr: Map<string, any>) => {
182
+ curr.forEach((val, key) => { prev.set(key, val) })
183
+ return prev
184
+ }, new Map<string, any>())
185
+ const chunk = new SpeechFlowChunk(start, end, "intermediate", "text", text)
186
+ chunk.meta = meta
187
+ this.queue!.write(chunk)
188
+ }
189
+ break
190
+ }
191
+ case "conversation.item.input_audio_transcription.completed": {
192
+ text = ev.transcript as string
193
+ const start = DateTime.now().diff(this.timeOpen!) // FIXME: OpenAI does not provide timestamps
194
+ const end = start // FIXME: OpenAI does not provide timestamps
195
+ const metas = metastore.fetch(start, end)
196
+ const meta = metas.reduce((prev: Map<string, any>, curr: Map<string, any>) => {
197
+ curr.forEach((val, key) => { prev.set(key, val) })
198
+ return prev
199
+ }, new Map<string, any>())
200
+ metastore.prune(start)
201
+ const chunk = new SpeechFlowChunk(start, end, "final", "text", text)
202
+ chunk.meta = meta
203
+ this.queue!.write(chunk)
204
+ text = ""
205
+ break
206
+ }
207
+ case "input_audio_buffer.speech_started":
208
+ this.log("info", "VAD: speech started")
209
+ break
210
+ case "input_audio_buffer.speech_stopped":
211
+ this.log("info", "VAD: speech stopped")
212
+ break
213
+ case "input_audio_buffer.committed":
214
+ this.log("info", "input buffer committed")
215
+ break
216
+ case "error":
217
+ this.log("error", `error: ${ev.error?.message}`)
218
+ break
219
+ default:
220
+ break
221
+ }
222
+ })
223
+
224
+ /* remember opening time to receive time zero offset */
225
+ this.timeOpen = DateTime.now()
226
+
227
+ /* provide Duplex stream and internally attach to OpenAI API */
228
+ const self = this
229
+ this.stream = new Stream.Duplex({
230
+ writableObjectMode: true,
231
+ readableObjectMode: true,
232
+ decodeStrings: false,
233
+ highWaterMark: 1,
234
+ write (chunk: SpeechFlowChunk, encoding, callback) {
235
+ if (self.destroyed || self.ws === null) {
236
+ callback(new Error("stream already destroyed"))
237
+ return
238
+ }
239
+ if (chunk.type !== "audio")
240
+ callback(new Error("expected audio input chunk"))
241
+ else if (!Buffer.isBuffer(chunk.payload))
242
+ callback(new Error("expected Buffer input chunk"))
243
+ else {
244
+ if (chunk.payload.byteLength > 0) {
245
+ self.log("debug", `send data (${chunk.payload.byteLength} bytes)`)
246
+ if (chunk.meta.size > 0)
247
+ metastore.store(chunk.timestampStart, chunk.timestampEnd, chunk.meta)
248
+ try {
249
+ const payload = self.resampler!.processChunk(chunk.payload)
250
+ const audioB64 = payload.toString("base64")
251
+ sendMessage({
252
+ type: "input_audio_buffer.append",
253
+ audio: audioB64 /* intentionally discard all time information */
254
+ })
255
+ }
256
+ catch (error) {
257
+ callback(error instanceof Error ? error : new Error("failed to send to OpenAI transcribe"))
258
+ return
259
+ }
260
+ }
261
+ callback()
262
+ }
263
+ },
264
+ read (size) {
265
+ if (self.destroyed || self.queue === null) {
266
+ this.push(null)
267
+ return
268
+ }
269
+ self.queue.read().then((chunk) => {
270
+ if (self.destroyed) {
271
+ this.push(null)
272
+ return
273
+ }
274
+ if (chunk === null) {
275
+ self.log("info", "received EOF signal")
276
+ this.push(null)
277
+ }
278
+ else {
279
+ self.log("debug", `received data (${chunk.payload.length} bytes)`)
280
+ this.push(chunk)
281
+ }
282
+ }).catch((error) => {
283
+ if (!self.destroyed)
284
+ self.log("error", `queue read error: ${error.message}`)
285
+ })
286
+ },
287
+ final (callback) {
288
+ if (self.destroyed || self.ws === null) {
289
+ callback()
290
+ return
291
+ }
292
+ try {
293
+ sendMessage({ type: "input_audio_buffer.commit" })
294
+ self.ws.close()
295
+ /* NOTICE: do not push null here -- let the OpenAI close event handle it */
296
+ callback()
297
+ }
298
+ catch (error) {
299
+ self.log("warning", `error closing OpenAI connection: ${error}`)
300
+ callback(error instanceof Error ? error : new Error("failed to close OpenAI connection"))
301
+ }
302
+ }
303
+ })
304
+ }
305
+
306
+ /* close node */
307
+ async close () {
308
+ /* indicate destruction first to stop all async operations */
309
+ this.destroyed = true
310
+
311
+ /* clear connection timeout */
312
+ if (this.connectionTimeout !== null) {
313
+ clearTimeout(this.connectionTimeout)
314
+ this.connectionTimeout = null
315
+ }
316
+
317
+ /* signal EOF to any pending read operations */
318
+ if (this.queue !== null) {
319
+ this.queue.write(null)
320
+ this.queue = null
321
+ }
322
+
323
+ /* close OpenAI connection */
324
+ if (this.ws !== null) {
325
+ this.ws.close()
326
+ this.ws = null
327
+ }
328
+ if (this.openai !== null)
329
+ this.openai = null
330
+
331
+ /* close stream */
332
+ if (this.stream !== null) {
333
+ this.stream.destroy()
334
+ this.stream = null
335
+ }
336
+ }
337
+ }
@@ -0,0 +1,187 @@
1
+ /*
2
+ ** SpeechFlow - Speech Processing Flow Graph
3
+ ** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
4
+ ** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
5
+ */
6
+
7
+ /* standard dependencies */
8
+ import Stream from "node:stream"
9
+
10
+ /* external dependencies */
11
+ import { getStreamAsBuffer } from "get-stream"
12
+ import SpeexResampler from "speex-resampler"
13
+ import {
14
+ PollyClient, SynthesizeSpeechCommand,
15
+ Engine, VoiceId, LanguageCode, TextType
16
+ } from "@aws-sdk/client-polly"
17
+
18
+ /* internal dependencies */
19
+ import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
20
+
21
+ /* SpeechFlow node for AWS Polly text-to-speech conversion */
22
+ export default class SpeechFlowNodeAWSPolly extends SpeechFlowNode {
23
+ /* declare official node name */
24
+ public static name = "awspolly"
25
+
26
+ /* internal state */
27
+ private client: PollyClient | null = null
28
+ private static speexInitialized = false
29
+ private destroyed = false
30
+ private resampler: SpeexResampler | null = null
31
+
32
+ /* construct node */
33
+ constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
34
+ super(id, cfg, opts, args)
35
+
36
+ /* declare node configuration parameters */
37
+ this.configure({
38
+ key: { type: "string", val: process.env.SPEECHFLOW_AMAZON_KEY },
39
+ secKey: { type: "string", val: process.env.SPEECHFLOW_AMAZON_KEY_SEC },
40
+ region: { type: "string", val: "eu-central-1" },
41
+ voice: { type: "string", val: "Amy", pos: 0, match: /^(?:Amy|Danielle|Joanna|Matthew|Ruth|Stephen|Vicki|Daniel)$/ },
42
+ language: { type: "string", val: "en", pos: 1, match: /^(?:de|en)$/ }
43
+ })
44
+
45
+ /* sanity check parameters */
46
+ if (!this.params.key)
47
+ throw new Error("AWS Access Key not configured")
48
+ if (!this.params.secKey)
49
+ throw new Error("AWS Secret Access Key not configured")
50
+
51
+ /* declare node input/output format */
52
+ this.input = "text"
53
+ this.output = "audio"
54
+ }
55
+
56
+ /* one-time status of node */
57
+ async status () {
58
+ return {}
59
+ }
60
+
61
+ /* open node */
62
+ async open () {
63
+ /* clear destruction flag */
64
+ this.destroyed = false
65
+
66
+ /* establish AWS Polly connection */
67
+ this.client = new PollyClient({
68
+ region: this.params.region,
69
+ credentials: {
70
+ accessKeyId: this.params.key,
71
+ secretAccessKey: this.params.secKey
72
+ }
73
+ })
74
+ if (this.client === null)
75
+ throw new Error("failed to establish AWS Polly client")
76
+
77
+ /* list of voices */
78
+ const voices = {
79
+ "Amy": { language: "en", languageCode: "en-GB", engine: "generative" },
80
+ "Danielle": { language: "en", languageCode: "en-US", engine: "generative" },
81
+ "Joanna": { language: "en", languageCode: "en-US", engine: "generative" },
82
+ "Matthew": { language: "en", languageCode: "en-US", engine: "generative" },
83
+ "Ruth": { language: "en", languageCode: "en-US", engine: "generative" },
84
+ "Stephen": { language: "en", languageCode: "en-US", engine: "generative" },
85
+ "Vicki": { language: "de", languageCode: "de-DE", engine: "generative" },
86
+ "Daniel": { language: "de", languageCode: "de-DE", engine: "generative" },
87
+ }
88
+ const voiceConfig = voices[this.params.voice as keyof typeof voices]
89
+ if (voiceConfig === undefined)
90
+ throw new Error("unsupported voice")
91
+ if (voiceConfig.language !== this.params.language)
92
+ throw new Error(`voice does only support language "${voiceConfig.language}"`)
93
+
94
+ /* perform text-to-speech operation with AWS Polly API */
95
+ const textToSpeech = async (text: string) => {
96
+ const cmd = new SynthesizeSpeechCommand({
97
+ LanguageCode: voiceConfig.languageCode as LanguageCode,
98
+ Engine: voiceConfig.engine as Engine,
99
+ VoiceId: this.params.voice as VoiceId,
100
+ OutputFormat: "pcm",
101
+ SampleRate: "16000", /* maximum supported for PCM output */
102
+ TextType: "text" as TextType,
103
+ Text: text
104
+ })
105
+ const res = await this.client!.send(cmd)
106
+ const stream = res.AudioStream as AsyncIterable<Uint8Array> | null
107
+ if (stream === null)
108
+ throw new Error("stream not returned")
109
+ const buffer = await getStreamAsBuffer(stream)
110
+ const bufferResampled = this.resampler!.processChunk(buffer)
111
+ return bufferResampled
112
+ }
113
+
114
+ /* establish resampler from AWS Polly's maximum 16Khz output
115
+ (for PCM output) to our standard audio sample rate (48KHz) */
116
+ if (!SpeechFlowNodeAWSPolly.speexInitialized) {
117
+ /* at least once initialize resampler */
118
+ await SpeexResampler.initPromise
119
+ SpeechFlowNodeAWSPolly.speexInitialized = true
120
+ }
121
+ this.resampler = new SpeexResampler(1, 16000, this.config.audioSampleRate, 7)
122
+
123
+ /* create transform stream and connect it to the AWS Polly API */
124
+ const self = this
125
+ this.stream = new Stream.Transform({
126
+ writableObjectMode: true,
127
+ readableObjectMode: true,
128
+ decodeStrings: false,
129
+ highWaterMark: 1,
130
+ transform (chunk: SpeechFlowChunk, encoding, callback) {
131
+ if (self.destroyed) {
132
+ callback(new Error("stream already destroyed"))
133
+ return
134
+ }
135
+ if (Buffer.isBuffer(chunk.payload))
136
+ callback(new Error("invalid chunk payload type"))
137
+ else if (chunk.payload.length > 0) {
138
+ self.log("debug", `send data (${chunk.payload.length} bytes): "${chunk.payload}"`)
139
+ textToSpeech(chunk.payload as string).then((buffer) => {
140
+ if (self.destroyed)
141
+ throw new Error("stream destroyed during processing")
142
+ const chunkNew = chunk.clone()
143
+ chunkNew.type = "audio"
144
+ chunkNew.payload = buffer
145
+ this.push(chunkNew)
146
+ callback()
147
+ }).catch((error) => {
148
+ callback(error instanceof Error ?
149
+ error : new Error(`failed to send to AWS Polly: ${String(error)}`))
150
+ })
151
+ }
152
+ else
153
+ callback()
154
+ },
155
+ final (callback) {
156
+ if (self.destroyed) {
157
+ callback()
158
+ return
159
+ }
160
+ this.push(null)
161
+ callback()
162
+ }
163
+ })
164
+ }
165
+
166
+ /* close node */
167
+ async close () {
168
+ /* indicate destruction */
169
+ this.destroyed = true
170
+
171
+ /* destroy resampler */
172
+ if (this.resampler !== null)
173
+ this.resampler = null
174
+
175
+ /* destroy AWS Polly API */
176
+ if (this.client !== null) {
177
+ this.client.destroy()
178
+ this.client = null
179
+ }
180
+ /* destroy stream */
181
+ if (this.stream !== null) {
182
+ this.stream.destroy()
183
+ this.stream = null
184
+ }
185
+ }
186
+ }
187
+
@@ -52,10 +52,17 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode {
52
52
 
53
53
  /* one-time status of node */
54
54
  async status () {
55
- const elevenlabs = new ElevenLabs.ElevenLabsClient({ apiKey: this.params.key })
56
- const subscription = await elevenlabs.user.subscription.get()
57
- const percent = subscription.characterCount / subscription.characterLimit
58
- return { usage: `${percent.toFixed(2)}%` }
55
+ try {
56
+ const elevenlabs = new ElevenLabs.ElevenLabsClient({ apiKey: this.params.key })
57
+ const subscription = await elevenlabs.user.subscription.get()
58
+ const percent = subscription.characterLimit > 0
59
+ ? subscription.characterCount / subscription.characterLimit
60
+ : 0
61
+ return { usage: `${percent.toFixed(2)}%` }
62
+ }
63
+ catch (_error) {
64
+ return { usage: "unknown" }
65
+ }
59
66
  }
60
67
 
61
68
  /* open node */
@@ -88,15 +95,15 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode {
88
95
  /* determine voice for text-to-speech operation
89
96
  (for details see https://elevenlabs.io/text-to-speech) */
90
97
  const voices = await this.elevenlabs.voices.getAll()
91
- let voice = voices.voices.find((voice) => voice.name === this.params.voice)
98
+ let voice = voices.voices.find((v) => v.name === this.params.voice)
92
99
  if (voice === undefined) {
93
- voice = voices.voices.find((voice) => voice.name!.startsWith(this.params.voice))
100
+ voice = voices.voices.find((v) => (v.name ?? "").startsWith(this.params.voice))
94
101
  if (voice === undefined)
95
102
  throw new Error(`invalid ElevenLabs voice "${this.params.voice}"`)
96
103
  }
97
- const info = Object.keys(voice.labels ?? {}).length > 0 ?
98
- (", " + Object.entries(voice.labels!)
99
- .map(([ key, val ]) => `${key}: "${val}"`).join(", ")) : ""
104
+ const labels = voice.labels ?? {}
105
+ const info = Object.keys(labels).length > 0 ?
106
+ ", " + Object.entries(labels).map(([ key, val ]) => `${key}: "${val}"`).join(", ") : ""
100
107
  this.log("info", `selected voice: name: "${voice.name}"${info}`)
101
108
 
102
109
  /* perform text-to-speech operation with Elevenlabs API */
@@ -139,11 +146,9 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode {
139
146
  decodeStrings: false,
140
147
  highWaterMark: 1,
141
148
  transform (chunk: SpeechFlowChunk, encoding, callback) {
142
- if (self.destroyed) {
149
+ if (self.destroyed)
143
150
  callback(new Error("stream already destroyed"))
144
- return
145
- }
146
- if (Buffer.isBuffer(chunk.payload))
151
+ else if (Buffer.isBuffer(chunk.payload))
147
152
  callback(new Error("invalid chunk payload type"))
148
153
  else {
149
154
  (async () => {
@@ -158,12 +163,12 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode {
158
163
  }
159
164
  }
160
165
  try {
161
- const stream = await speechStream(chunk.payload as string)
162
166
  if (self.destroyed) {
163
167
  clearProcessTimeout()
164
168
  callback(new Error("stream destroyed during processing"))
165
169
  return
166
170
  }
171
+ const stream = await speechStream(chunk.payload as string)
167
172
  const buffer = await getStreamAsBuffer(stream)
168
173
  if (self.destroyed) {
169
174
  clearProcessTimeout()
@@ -51,26 +51,31 @@ export default class SpeechFlowNodeKokoro extends SpeechFlowNode {
51
51
  artifact += `:${progress.file}`
52
52
  let percent = 0
53
53
  if (typeof progress.loaded === "number" && typeof progress.total === "number")
54
- percent = (progress.loaded as number / progress.total as number) * 100
54
+ percent = (progress.loaded / progress.total) * 100
55
55
  else if (typeof progress.progress === "number")
56
56
  percent = progress.progress
57
57
  if (percent > 0)
58
58
  progressState.set(artifact, percent)
59
59
  }
60
- const interval = setInterval(() => {
60
+ let interval: ReturnType<typeof setInterval> | null = setInterval(() => {
61
61
  for (const [ artifact, percent ] of progressState) {
62
62
  this.log("info", `downloaded ${percent.toFixed(2)}% of artifact "${artifact}"`)
63
63
  if (percent >= 100.0)
64
64
  progressState.delete(artifact)
65
65
  }
66
- if (progressState.size === 0)
66
+ if (progressState.size === 0 && interval !== null) {
67
67
  clearInterval(interval)
68
+ interval = null
69
+ }
68
70
  }, 1000)
69
71
  this.kokoro = await KokoroTTS.from_pretrained(model, {
70
72
  dtype: "q4f16",
71
73
  progress_callback: progressCallback
72
74
  })
73
- clearInterval(interval)
75
+ if (interval !== null) {
76
+ clearInterval(interval)
77
+ interval = null
78
+ }
74
79
  if (this.kokoro === null)
75
80
  throw new Error("failed to instantiate Kokoro")
76
81
 
@@ -78,19 +83,19 @@ export default class SpeechFlowNodeKokoro extends SpeechFlowNode {
78
83
  output to our standard audio sample rate (48KHz) */
79
84
  if (!SpeechFlowNodeKokoro.speexInitialized) {
80
85
  /* at least once initialize resampler */
81
- await SpeexResampler.initPromise
82
86
  SpeechFlowNodeKokoro.speexInitialized = true
87
+ await SpeexResampler.initPromise
83
88
  }
84
89
  this.resampler = new SpeexResampler(1, 24000, this.config.audioSampleRate, 7)
85
90
 
86
91
  /* determine voice for text-to-speech operation */
87
- const voices = {
92
+ const voices: Record<string, string> = {
88
93
  "Aoede": "af_aoede",
89
94
  "Heart": "af_heart",
90
95
  "Puck": "am_puck",
91
96
  "Fenrir": "am_fenrir"
92
97
  }
93
- const voice = ((voices as any)[this.params.voice]) as string | undefined
98
+ const voice = voices[this.params.voice]
94
99
  if (voice === undefined)
95
100
  throw new Error(`invalid Kokoro voice "${this.params.voice}"`)
96
101