speechflow 1.7.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (169) hide show
  1. package/CHANGELOG.md +23 -0
  2. package/README.md +425 -146
  3. package/etc/claude.md +5 -5
  4. package/etc/speechflow.yaml +2 -2
  5. package/package.json +3 -3
  6. package/speechflow-cli/dst/speechflow-main-api.js +6 -5
  7. package/speechflow-cli/dst/speechflow-main-api.js.map +1 -1
  8. package/speechflow-cli/dst/speechflow-main-graph.d.ts +1 -0
  9. package/speechflow-cli/dst/speechflow-main-graph.js +35 -13
  10. package/speechflow-cli/dst/speechflow-main-graph.js.map +1 -1
  11. package/speechflow-cli/dst/speechflow-main-status.js +3 -7
  12. package/speechflow-cli/dst/speechflow-main-status.js.map +1 -1
  13. package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js +3 -0
  14. package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js.map +1 -1
  15. package/speechflow-cli/dst/speechflow-node-a2a-compressor.js +4 -2
  16. package/speechflow-cli/dst/speechflow-node-a2a-compressor.js.map +1 -1
  17. package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js +1 -1
  18. package/speechflow-cli/dst/speechflow-node-a2a-expander.js +4 -2
  19. package/speechflow-cli/dst/speechflow-node-a2a-expander.js.map +1 -1
  20. package/speechflow-cli/dst/speechflow-node-a2a-gender.js +2 -2
  21. package/speechflow-cli/dst/speechflow-node-a2a-gender.js.map +1 -1
  22. package/speechflow-cli/dst/speechflow-node-a2a-pitch.js +1 -2
  23. package/speechflow-cli/dst/speechflow-node-a2a-pitch.js.map +1 -1
  24. package/speechflow-cli/dst/speechflow-node-a2a-wav.js +32 -5
  25. package/speechflow-cli/dst/speechflow-node-a2a-wav.js.map +1 -1
  26. package/speechflow-cli/dst/speechflow-node-a2t-amazon.d.ts +0 -1
  27. package/speechflow-cli/dst/speechflow-node-a2t-amazon.js +1 -6
  28. package/speechflow-cli/dst/speechflow-node-a2t-amazon.js.map +1 -1
  29. package/speechflow-cli/dst/speechflow-node-a2t-deepgram.d.ts +0 -1
  30. package/speechflow-cli/dst/speechflow-node-a2t-deepgram.js +9 -9
  31. package/speechflow-cli/dst/speechflow-node-a2t-deepgram.js.map +1 -1
  32. package/speechflow-cli/dst/speechflow-node-a2t-google.d.ts +17 -0
  33. package/speechflow-cli/dst/speechflow-node-a2t-google.js +320 -0
  34. package/speechflow-cli/dst/speechflow-node-a2t-google.js.map +1 -0
  35. package/speechflow-cli/dst/speechflow-node-a2t-openai.js +6 -4
  36. package/speechflow-cli/dst/speechflow-node-a2t-openai.js.map +1 -1
  37. package/speechflow-cli/dst/speechflow-node-t2a-amazon.js +6 -11
  38. package/speechflow-cli/dst/speechflow-node-t2a-amazon.js.map +1 -1
  39. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js +6 -5
  40. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js.map +1 -1
  41. package/speechflow-cli/dst/speechflow-node-t2a-google.d.ts +15 -0
  42. package/speechflow-cli/dst/speechflow-node-t2a-google.js +218 -0
  43. package/speechflow-cli/dst/speechflow-node-t2a-google.js.map +1 -0
  44. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.d.ts +2 -0
  45. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js +19 -6
  46. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js.map +1 -1
  47. package/speechflow-cli/dst/speechflow-node-t2a-openai.d.ts +15 -0
  48. package/speechflow-cli/dst/speechflow-node-t2a-openai.js +195 -0
  49. package/speechflow-cli/dst/speechflow-node-t2a-openai.js.map +1 -0
  50. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.d.ts +17 -0
  51. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js +608 -0
  52. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js.map +1 -0
  53. package/speechflow-cli/dst/speechflow-node-t2t-amazon.js.map +1 -1
  54. package/speechflow-cli/dst/{speechflow-node-t2t-transformers.d.ts → speechflow-node-t2t-opus.d.ts} +1 -3
  55. package/speechflow-cli/dst/speechflow-node-t2t-opus.js +159 -0
  56. package/speechflow-cli/dst/speechflow-node-t2t-opus.js.map +1 -0
  57. package/speechflow-cli/dst/speechflow-node-t2t-profanity.d.ts +11 -0
  58. package/speechflow-cli/dst/speechflow-node-t2t-profanity.js +118 -0
  59. package/speechflow-cli/dst/speechflow-node-t2t-profanity.js.map +1 -0
  60. package/speechflow-cli/dst/speechflow-node-t2t-punctuation.d.ts +13 -0
  61. package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js +220 -0
  62. package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js.map +1 -0
  63. package/speechflow-cli/dst/{speechflow-node-t2t-openai.d.ts → speechflow-node-t2t-spellcheck.d.ts} +2 -2
  64. package/speechflow-cli/dst/{speechflow-node-t2t-openai.js → speechflow-node-t2t-spellcheck.js} +47 -99
  65. package/speechflow-cli/dst/speechflow-node-t2t-spellcheck.js.map +1 -0
  66. package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js +3 -6
  67. package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js.map +1 -1
  68. package/speechflow-cli/dst/speechflow-node-t2t-summary.d.ts +16 -0
  69. package/speechflow-cli/dst/speechflow-node-t2t-summary.js +241 -0
  70. package/speechflow-cli/dst/speechflow-node-t2t-summary.js.map +1 -0
  71. package/speechflow-cli/dst/{speechflow-node-t2t-ollama.d.ts → speechflow-node-t2t-translate.d.ts} +2 -2
  72. package/speechflow-cli/dst/{speechflow-node-t2t-transformers.js → speechflow-node-t2t-translate.js} +53 -115
  73. package/speechflow-cli/dst/speechflow-node-t2t-translate.js.map +1 -0
  74. package/speechflow-cli/dst/speechflow-node-x2x-filter.d.ts +1 -0
  75. package/speechflow-cli/dst/speechflow-node-x2x-filter.js +10 -0
  76. package/speechflow-cli/dst/speechflow-node-x2x-filter.js.map +1 -1
  77. package/speechflow-cli/dst/speechflow-node-x2x-trace.js.map +1 -1
  78. package/speechflow-cli/dst/speechflow-node-xio-device.js +3 -3
  79. package/speechflow-cli/dst/speechflow-node-xio-device.js.map +1 -1
  80. package/speechflow-cli/dst/speechflow-node-xio-exec.d.ts +12 -0
  81. package/speechflow-cli/dst/speechflow-node-xio-exec.js +223 -0
  82. package/speechflow-cli/dst/speechflow-node-xio-exec.js.map +1 -0
  83. package/speechflow-cli/dst/speechflow-node-xio-file.d.ts +1 -0
  84. package/speechflow-cli/dst/speechflow-node-xio-file.js +80 -67
  85. package/speechflow-cli/dst/speechflow-node-xio-file.js.map +1 -1
  86. package/speechflow-cli/dst/speechflow-node-xio-mqtt.js +2 -1
  87. package/speechflow-cli/dst/speechflow-node-xio-mqtt.js.map +1 -1
  88. package/speechflow-cli/dst/speechflow-node-xio-vban.d.ts +17 -0
  89. package/speechflow-cli/dst/speechflow-node-xio-vban.js +330 -0
  90. package/speechflow-cli/dst/speechflow-node-xio-vban.js.map +1 -0
  91. package/speechflow-cli/dst/speechflow-node-xio-webrtc.d.ts +39 -0
  92. package/speechflow-cli/dst/speechflow-node-xio-webrtc.js +500 -0
  93. package/speechflow-cli/dst/speechflow-node-xio-webrtc.js.map +1 -0
  94. package/speechflow-cli/dst/speechflow-node-xio-websocket.js +2 -1
  95. package/speechflow-cli/dst/speechflow-node-xio-websocket.js.map +1 -1
  96. package/speechflow-cli/dst/speechflow-util-audio.js +5 -6
  97. package/speechflow-cli/dst/speechflow-util-audio.js.map +1 -1
  98. package/speechflow-cli/dst/speechflow-util-error.d.ts +1 -1
  99. package/speechflow-cli/dst/speechflow-util-error.js +5 -7
  100. package/speechflow-cli/dst/speechflow-util-error.js.map +1 -1
  101. package/speechflow-cli/dst/speechflow-util-llm.d.ts +35 -0
  102. package/speechflow-cli/dst/speechflow-util-llm.js +363 -0
  103. package/speechflow-cli/dst/speechflow-util-llm.js.map +1 -0
  104. package/speechflow-cli/dst/speechflow-util-misc.d.ts +1 -1
  105. package/speechflow-cli/dst/speechflow-util-misc.js +4 -4
  106. package/speechflow-cli/dst/speechflow-util-misc.js.map +1 -1
  107. package/speechflow-cli/dst/speechflow-util-queue.js +3 -3
  108. package/speechflow-cli/dst/speechflow-util-queue.js.map +1 -1
  109. package/speechflow-cli/dst/speechflow-util-stream.js +4 -2
  110. package/speechflow-cli/dst/speechflow-util-stream.js.map +1 -1
  111. package/speechflow-cli/dst/speechflow-util.d.ts +1 -0
  112. package/speechflow-cli/dst/speechflow-util.js +1 -0
  113. package/speechflow-cli/dst/speechflow-util.js.map +1 -1
  114. package/speechflow-cli/etc/oxlint.jsonc +2 -1
  115. package/speechflow-cli/package.json +34 -17
  116. package/speechflow-cli/src/lib.d.ts +5 -0
  117. package/speechflow-cli/src/speechflow-main-api.ts +6 -5
  118. package/speechflow-cli/src/speechflow-main-graph.ts +40 -13
  119. package/speechflow-cli/src/speechflow-main-status.ts +4 -8
  120. package/speechflow-cli/src/speechflow-node-a2a-compressor-wt.ts +4 -0
  121. package/speechflow-cli/src/speechflow-node-a2a-compressor.ts +4 -2
  122. package/speechflow-cli/src/speechflow-node-a2a-expander-wt.ts +1 -1
  123. package/speechflow-cli/src/speechflow-node-a2a-expander.ts +4 -2
  124. package/speechflow-cli/src/speechflow-node-a2a-gender.ts +2 -2
  125. package/speechflow-cli/src/speechflow-node-a2a-pitch.ts +1 -2
  126. package/speechflow-cli/src/speechflow-node-a2a-wav.ts +33 -6
  127. package/speechflow-cli/src/speechflow-node-a2t-amazon.ts +6 -11
  128. package/speechflow-cli/src/speechflow-node-a2t-deepgram.ts +13 -12
  129. package/speechflow-cli/src/speechflow-node-a2t-google.ts +322 -0
  130. package/speechflow-cli/src/speechflow-node-a2t-openai.ts +8 -4
  131. package/speechflow-cli/src/speechflow-node-t2a-amazon.ts +7 -11
  132. package/speechflow-cli/src/speechflow-node-t2a-elevenlabs.ts +6 -5
  133. package/speechflow-cli/src/speechflow-node-t2a-google.ts +206 -0
  134. package/speechflow-cli/src/speechflow-node-t2a-kokoro.ts +22 -6
  135. package/speechflow-cli/src/speechflow-node-t2a-openai.ts +179 -0
  136. package/speechflow-cli/src/speechflow-node-t2a-supertonic.ts +701 -0
  137. package/speechflow-cli/src/speechflow-node-t2t-amazon.ts +2 -1
  138. package/speechflow-cli/src/speechflow-node-t2t-opus.ts +136 -0
  139. package/speechflow-cli/src/speechflow-node-t2t-profanity.ts +93 -0
  140. package/speechflow-cli/src/speechflow-node-t2t-punctuation.ts +201 -0
  141. package/speechflow-cli/src/{speechflow-node-t2t-openai.ts → speechflow-node-t2t-spellcheck.ts} +48 -107
  142. package/speechflow-cli/src/speechflow-node-t2t-subtitle.ts +3 -6
  143. package/speechflow-cli/src/speechflow-node-t2t-summary.ts +229 -0
  144. package/speechflow-cli/src/speechflow-node-t2t-translate.ts +181 -0
  145. package/speechflow-cli/src/speechflow-node-x2x-filter.ts +16 -3
  146. package/speechflow-cli/src/speechflow-node-x2x-trace.ts +3 -3
  147. package/speechflow-cli/src/speechflow-node-xio-device.ts +4 -7
  148. package/speechflow-cli/src/speechflow-node-xio-exec.ts +210 -0
  149. package/speechflow-cli/src/speechflow-node-xio-file.ts +93 -80
  150. package/speechflow-cli/src/speechflow-node-xio-mqtt.ts +3 -2
  151. package/speechflow-cli/src/speechflow-node-xio-vban.ts +325 -0
  152. package/speechflow-cli/src/speechflow-node-xio-webrtc.ts +533 -0
  153. package/speechflow-cli/src/speechflow-node-xio-websocket.ts +2 -1
  154. package/speechflow-cli/src/speechflow-util-audio-wt.ts +4 -4
  155. package/speechflow-cli/src/speechflow-util-audio.ts +10 -10
  156. package/speechflow-cli/src/speechflow-util-error.ts +9 -7
  157. package/speechflow-cli/src/speechflow-util-llm.ts +367 -0
  158. package/speechflow-cli/src/speechflow-util-misc.ts +4 -4
  159. package/speechflow-cli/src/speechflow-util-queue.ts +4 -4
  160. package/speechflow-cli/src/speechflow-util-stream.ts +5 -3
  161. package/speechflow-cli/src/speechflow-util.ts +1 -0
  162. package/speechflow-ui-db/package.json +9 -9
  163. package/speechflow-ui-st/package.json +9 -9
  164. package/speechflow-cli/dst/speechflow-node-t2t-ollama.js +0 -293
  165. package/speechflow-cli/dst/speechflow-node-t2t-ollama.js.map +0 -1
  166. package/speechflow-cli/dst/speechflow-node-t2t-openai.js.map +0 -1
  167. package/speechflow-cli/dst/speechflow-node-t2t-transformers.js.map +0 -1
  168. package/speechflow-cli/src/speechflow-node-t2t-ollama.ts +0 -281
  169. package/speechflow-cli/src/speechflow-node-t2t-transformers.ts +0 -247
@@ -0,0 +1,206 @@
1
+ /*
2
+ ** SpeechFlow - Speech Processing Flow Graph
3
+ ** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
4
+ ** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
5
+ */
6
+
7
+ /* standard dependencies */
8
+ import Stream from "node:stream"
9
+
10
+ /* external dependencies */
11
+ import * as GoogleTTS from "@google-cloud/text-to-speech"
12
+ import { Duration } from "luxon"
13
+ import SpeexResampler from "speex-resampler"
14
+ import * as arktype from "arktype"
15
+
16
+ /* internal dependencies */
17
+ import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
18
+ import * as util from "./speechflow-util"
19
+
20
+ /* SpeechFlow node for Google Cloud text-to-speech conversion */
21
export default class SpeechFlowNodeT2AGoogle extends SpeechFlowNode {
    /* declare official node name */
    public static name = "t2a-google"

    /* internal state */
    private client: GoogleTTS.TextToSpeechClient | null = null
    private resampler: SpeexResampler | null = null
    private closing = false

    /* construct node
       (throws if no Google Cloud credentials key is configured) */
    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
        super(id, cfg, opts, args)

        /* declare node configuration parameters */
        this.configure({
            key:      { type: "string",         val: process.env.SPEECHFLOW_GOOGLE_KEY ?? "" },
            voice:    { type: "string", pos: 0, val: "en-US-Neural2-J" },
            language: { type: "string", pos: 1, val: "en-US" },
            speed:    { type: "number", pos: 2, val: 1.0, match: (n: number) => n >= 0.25  && n <= 4.0 },
            pitch:    { type: "number", pos: 3, val: 0.0, match: (n: number) => n >= -20.0 && n <= 20.0 }
        })

        /* validate API key */
        if (this.params.key === "")
            throw new Error("Google Cloud API credentials JSON key is required")

        /* declare node input/output format */
        this.input  = "text"
        this.output = "audio"
    }

    /* one-time status of node */
    async status () {
        return {}
    }

    /* strip an optional RIFF/WAVE container header from an audio buffer
       and return just the raw PCM payload of its "data" chunk.
       (Google Cloud TTS documents that LINEAR16 responses carry a WAV
       header; feeding that header into the resampler as if it were
       samples would yield an audible click and a slightly wrong duration.
       Buffers without the RIFF/WAVE magic are returned unchanged.) */
    private static stripWavHeader (buffer: Buffer): Buffer {
        if (buffer.byteLength < 12
            || buffer.toString("latin1", 0, 4)  !== "RIFF"
            || buffer.toString("latin1", 8, 12) !== "WAVE")
            return buffer

        /* walk the RIFF sub-chunks until the "data" chunk is found */
        let offset = 12
        while (offset + 8 <= buffer.byteLength) {
            const id   = buffer.toString("latin1", offset, offset + 4)
            const size = buffer.readUInt32LE(offset + 4)
            const body = offset + 8
            if (id === "data") {
                /* a zero or oversized length (as written by streaming
                   encoders) means "everything up to the end of the buffer" */
                const end = size > 0 ? Math.min(body + size, buffer.byteLength) : buffer.byteLength
                return buffer.subarray(body, end)
            }

            /* RIFF sub-chunks are padded to an even number of bytes */
            offset = body + size + (size % 2)
        }

        /* no "data" chunk found: leave the buffer untouched */
        return buffer
    }

    /* open node: instantiate the Google TTS client and the resampler,
       and create the text-to-audio transform stream */
    async open () {
        /* clear destruction flag */
        this.closing = false

        /* parse and validate the Google Cloud credentials key */
        const data = util.run("Google Cloud API credentials key", () =>
            JSON.parse(this.params.key))
        const credentials = util.importObject("Google Cloud API credentials key",
            data,
            arktype.type({
                project_id:   "string",
                private_key:  "string",
                client_email: "string"
            })
        )

        /* instantiate Google TTS client */
        const client = new GoogleTTS.TextToSpeechClient({
            credentials: {
                private_key:  credentials.private_key,
                client_email: credentials.client_email
            },
            projectId: credentials.project_id
        })
        this.client = client

        /* establish resampler from Google TTS's output sample rate
           to our standard audio sample rate (48kHz) */
        const googleSampleRate = 24000 /* requested below for LINEAR16 */
        const resampler = new SpeexResampler(1, googleSampleRate, this.config.audioSampleRate, 7)
        this.resampler = resampler

        /* perform text-to-speech operation with Google Cloud TTS API */
        const textToSpeech = async (text: string) => {
            this.log("info", `Google TTS: send text "${text}"`)
            const [ response ] = await client.synthesizeSpeech({
                input: { text },
                voice: {
                    languageCode: this.params.language,
                    name:         this.params.voice
                },
                audioConfig: {
                    audioEncoding:   "LINEAR16",
                    sampleRateHertz: googleSampleRate,
                    speakingRate:    this.params.speed,
                    pitch:           this.params.pitch
                }
            })
            if (!response.audioContent)
                throw new Error("no audio content returned from Google TTS")

            /* convert response to buffer */
            const buffer = Buffer.isBuffer(response.audioContent)
                ? response.audioContent
                : Buffer.from(response.audioContent)
            this.log("info", `Google TTS: received audio (buffer length: ${buffer.byteLength})`)

            /* drop the WAV container header, keeping only the PCM samples */
            const pcm = SpeechFlowNodeT2AGoogle.stripWavHeader(buffer)

            /* resample from Google's sample rate to our standard rate */
            const bufferResampled = resampler.processChunk(pcm)
            this.log("info", `Google TTS: forwarding resampled audio (buffer length: ${bufferResampled.byteLength})`)
            return bufferResampled
        }

        /* create transform stream and connect it to the Google TTS API */
        const self = this
        this.stream = new Stream.Transform({
            writableObjectMode: true,
            readableObjectMode: true,
            decodeStrings:      false,
            highWaterMark:      1,
            async transform (chunk: SpeechFlowChunk, encoding, callback) {
                if (self.closing)
                    callback(new Error("stream already destroyed"))
                else if (Buffer.isBuffer(chunk.payload))
                    callback(new Error("invalid chunk payload type"))
                else if (chunk.payload === "") {
                    /* pass through empty chunks */
                    this.push(chunk)
                    callback()
                }
                else {
                    /* complete the transform exactly once: the watchdog timer
                       and the API request race against each other, and the
                       stream callback must never be invoked a second time */
                    let settled = false
                    let timer: ReturnType<typeof setTimeout> | null = null
                    const finish = (error?: Error) => {
                        if (settled)
                            return
                        settled = true
                        if (timer !== null) {
                            clearTimeout(timer)
                            timer = null
                        }
                        if (error !== undefined)
                            callback(error)
                        else
                            callback()
                    }
                    timer = setTimeout(() => {
                        finish(new Error("Google TTS API timeout"))
                    }, 60 * 1000)

                    try {
                        const buffer = await textToSpeech(chunk.payload as string)

                        /* the watchdog already failed this chunk: drop the late result */
                        if (settled)
                            return
                        if (self.closing) {
                            finish(new Error("stream destroyed during processing"))
                            return
                        }

                        /* calculate actual audio duration from PCM buffer size */
                        const durationMs = util.audioBufferDuration(buffer,
                            self.config.audioSampleRate, self.config.audioBitDepth) * 1000

                        /* create new chunk with recalculated timestamps */
                        const chunkNew = chunk.clone()
                        chunkNew.type         = "audio"
                        chunkNew.payload      = buffer
                        chunkNew.timestampEnd = Duration.fromMillis(chunkNew.timestampStart.toMillis() + durationMs)
                        this.push(chunkNew)
                        finish()
                    }
                    catch (error) {
                        finish(util.ensureError(error, "Google TTS processing failed"))
                    }
                }
            },
            final (callback) {
                callback()
            }
        })
    }

    /* close node: tear down stream, resampler and Google TTS client */
    async close () {
        /* indicate closing */
        this.closing = true

        /* shutdown stream */
        if (this.stream !== null) {
            await util.destroyStream(this.stream)
            this.stream = null
        }

        /* destroy resampler */
        this.resampler = null

        /* destroy Google TTS client (best effort: a failing close is only logged) */
        if (this.client !== null) {
            await this.client.close().catch((error: unknown) => {
                this.log("warning", `error closing Google TTS client: ${error}`)
            })
            this.client = null
        }
    }
}
@@ -22,6 +22,7 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
22
22
 
23
23
  /* internal state */
24
24
  private kokoro: KokoroTTS | null = null
25
+ private closing = false
25
26
  private resampler: SpeexResampler | null = null
26
27
 
27
28
  /* construct node */
@@ -32,7 +33,7 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
32
33
  this.configure({
33
34
  voice: { type: "string", val: "Aoede", pos: 0, match: /^(?:Aoede|Heart|Puck|Fenrir)$/ },
34
35
  language: { type: "string", val: "en", pos: 1, match: /^(?:en)$/ },
35
- speed: { type: "number", val: 1.25, pos: 2, match: (n: number) => n >= 1.0 && n <= 1.30 },
36
+ speed: { type: "number", val: 1.25, pos: 2, match: (n: number) => n >= 1.0 && n <= 1.30 }
36
37
  })
37
38
 
38
39
  /* declare node input/output format */
@@ -40,8 +41,16 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
40
41
  this.output = "audio"
41
42
  }
42
43
 
44
+ /* one-time status of node */
45
+ async status () {
46
+ return {}
47
+ }
48
+
43
49
  /* open node */
44
50
  async open () {
51
+ /* clear destruction flag */
52
+ this.closing = false
53
+
45
54
  /* establish Kokoro */
46
55
  const model = "onnx-community/Kokoro-82M-v1.0-ONNX"
47
56
  const progressState = new Map<string, number>()
@@ -126,15 +135,19 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
126
135
  decodeStrings: false,
127
136
  highWaterMark: 1,
128
137
  transform (chunk: SpeechFlowChunk, encoding, callback) {
129
- if (Buffer.isBuffer(chunk.payload))
138
+ if (self.closing)
139
+ callback(new Error("stream already destroyed"))
140
+ else if (Buffer.isBuffer(chunk.payload))
130
141
  callback(new Error("invalid chunk payload type"))
131
142
  else {
132
143
  text2speech(chunk.payload).then((buffer) => {
144
+ if (self.closing)
145
+ throw new Error("stream destroyed during processing")
133
146
  self.log("info", `Kokoro: received audio (buffer length: ${buffer.byteLength})`)
134
- chunk = chunk.clone()
135
- chunk.type = "audio"
136
- chunk.payload = buffer
137
- this.push(chunk)
147
+ const chunkNew = chunk.clone()
148
+ chunkNew.type = "audio"
149
+ chunkNew.payload = buffer
150
+ this.push(chunkNew)
138
151
  callback()
139
152
  }).catch((error: unknown) => {
140
153
  callback(util.ensureError(error))
@@ -149,6 +162,9 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
149
162
 
150
163
  /* close node */
151
164
  async close () {
165
+ /* indicate closing */
166
+ this.closing = true
167
+
152
168
  /* shutdown stream */
153
169
  if (this.stream !== null) {
154
170
  await util.destroyStream(this.stream)
@@ -0,0 +1,179 @@
1
+ /*
2
+ ** SpeechFlow - Speech Processing Flow Graph
3
+ ** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
4
+ ** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
5
+ */
6
+
7
+ /* standard dependencies */
8
+ import Stream from "node:stream"
9
+
10
+ /* external dependencies */
11
+ import OpenAI from "openai"
12
+ import { Duration } from "luxon"
13
+ import SpeexResampler from "speex-resampler"
14
+
15
+ /* internal dependencies */
16
+ import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
17
+ import * as util from "./speechflow-util"
18
+
19
+ /* SpeechFlow node for OpenAI text-to-speech conversion */
20
export default class SpeechFlowNodeT2AOpenAI extends SpeechFlowNode {
    /* declare official node name */
    public static name = "t2a-openai"

    /* internal state */
    private openai: OpenAI | null = null
    private resampler: SpeexResampler | null = null
    private closing = false

    /* construct node
       (throws if no OpenAI API key is configured) */
    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
        super(id, cfg, opts, args)

        /* declare node configuration parameters */
        this.configure({
            key:   { type: "string", val: process.env.SPEECHFLOW_OPENAI_KEY },
            api:   { type: "string", val: "https://api.openai.com/v1", match: /^https?:\/\/.+/ },
            voice: { type: "string", val: "alloy", pos: 0, match: /^(?:alloy|echo|fable|onyx|nova|shimmer)$/ },
            model: { type: "string", val: "tts-1", pos: 1, match: /^(?:tts-1|tts-1-hd)$/ },
            speed: { type: "number", val: 1.0,     pos: 2, match: (n: number) => n >= 0.25 && n <= 4.0 }
        })

        /* sanity check parameters */
        if (!this.params.key)
            throw new Error("OpenAI API key not configured")

        /* declare node input/output format */
        this.input  = "text"
        this.output = "audio"
    }

    /* one-time status of node */
    async status () {
        return {}
    }

    /* open node: establish the OpenAI client and the resampler,
       and create the text-to-audio transform stream */
    async open () {
        /* clear destruction flag */
        this.closing = false

        /* establish OpenAI API connection */
        const openai = new OpenAI({
            baseURL: this.params.api,
            apiKey:  this.params.key,
            timeout: 60000
        })
        this.openai = openai

        /* establish resampler from OpenAI's 24kHz PCM output
           to our standard audio sample rate (48kHz) */
        const resampler = new SpeexResampler(1, 24000, this.config.audioSampleRate, 7)
        this.resampler = resampler

        /* perform text-to-speech operation with OpenAI API */
        const textToSpeech = async (text: string) => {
            this.log("info", `OpenAI TTS: send text "${text}"`)
            const response = await openai.audio.speech.create({
                model:           this.params.model,
                voice:           this.params.voice,
                input:           text,
                response_format: "pcm",
                speed:           this.params.speed
            })

            /* convert response to buffer (PCM 24kHz, 16-bit, little-endian) */
            const arrayBuffer = await response.arrayBuffer()
            const buffer = Buffer.from(arrayBuffer)
            this.log("info", `OpenAI TTS: received audio (buffer length: ${buffer.byteLength})`)

            /* resample from 24kHz to our standard rate */
            const bufferResampled = resampler.processChunk(buffer)
            this.log("info", `OpenAI TTS: forwarding resampled audio (buffer length: ${bufferResampled.byteLength})`)
            return bufferResampled
        }

        /* create transform stream and connect it to the OpenAI API */
        const self = this
        this.stream = new Stream.Transform({
            writableObjectMode: true,
            readableObjectMode: true,
            decodeStrings:      false,
            highWaterMark:      1,
            async transform (chunk: SpeechFlowChunk, encoding, callback) {
                if (self.closing)
                    callback(new Error("stream already destroyed"))
                else if (Buffer.isBuffer(chunk.payload))
                    callback(new Error("invalid chunk payload type"))
                else if (chunk.payload === "") {
                    /* pass through empty chunks */
                    this.push(chunk)
                    callback()
                }
                else {
                    /* complete the transform exactly once: the watchdog timer
                       and the API request race against each other, and the
                       stream callback must never be invoked a second time */
                    let settled = false
                    let timer: ReturnType<typeof setTimeout> | null = null
                    const finish = (error?: Error) => {
                        if (settled)
                            return
                        settled = true
                        if (timer !== null) {
                            clearTimeout(timer)
                            timer = null
                        }
                        if (error !== undefined)
                            callback(error)
                        else
                            callback()
                    }
                    timer = setTimeout(() => {
                        finish(new Error("OpenAI TTS API timeout"))
                    }, 60 * 1000)

                    try {
                        const buffer = await textToSpeech(chunk.payload as string)

                        /* the watchdog already failed this chunk: drop the late result */
                        if (settled)
                            return
                        if (self.closing) {
                            finish(new Error("stream destroyed during processing"))
                            return
                        }

                        /* calculate actual audio duration from PCM buffer size */
                        const durationMs = util.audioBufferDuration(buffer,
                            self.config.audioSampleRate, self.config.audioBitDepth) * 1000

                        /* create new chunk with recalculated timestamps */
                        const chunkNew = chunk.clone()
                        chunkNew.type         = "audio"
                        chunkNew.payload      = buffer
                        chunkNew.timestampEnd = Duration.fromMillis(chunkNew.timestampStart.toMillis() + durationMs)
                        this.push(chunkNew)
                        finish()
                    }
                    catch (error) {
                        finish(util.ensureError(error, "OpenAI TTS processing failed"))
                    }
                }
            },
            final (callback) {
                callback()
            }
        })
    }

    /* close node: tear down stream, resampler and OpenAI client */
    async close () {
        /* indicate closing */
        this.closing = true

        /* shutdown stream */
        if (this.stream !== null) {
            await util.destroyStream(this.stream)
            this.stream = null
        }

        /* destroy resampler */
        this.resampler = null

        /* destroy OpenAI API */
        this.openai = null
    }
}