speechflow 1.7.1 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177)
  1. package/CHANGELOG.md +24 -0
  2. package/README.md +388 -120
  3. package/etc/claude.md +5 -5
  4. package/etc/speechflow.yaml +2 -2
  5. package/package.json +3 -3
  6. package/speechflow-cli/dst/speechflow-main-api.js.map +1 -1
  7. package/speechflow-cli/dst/speechflow-main-cli.js +1 -0
  8. package/speechflow-cli/dst/speechflow-main-cli.js.map +1 -1
  9. package/speechflow-cli/dst/speechflow-main-graph.d.ts +1 -0
  10. package/speechflow-cli/dst/speechflow-main-graph.js +30 -9
  11. package/speechflow-cli/dst/speechflow-main-graph.js.map +1 -1
  12. package/speechflow-cli/dst/speechflow-main-nodes.js +1 -0
  13. package/speechflow-cli/dst/speechflow-main-nodes.js.map +1 -1
  14. package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js +1 -0
  15. package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js.map +1 -1
  16. package/speechflow-cli/dst/speechflow-node-a2a-compressor.js +7 -9
  17. package/speechflow-cli/dst/speechflow-node-a2a-compressor.js.map +1 -1
  18. package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js +1 -0
  19. package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js.map +1 -1
  20. package/speechflow-cli/dst/speechflow-node-a2a-expander.js +8 -9
  21. package/speechflow-cli/dst/speechflow-node-a2a-expander.js.map +1 -1
  22. package/speechflow-cli/dst/speechflow-node-a2a-filler.js +2 -0
  23. package/speechflow-cli/dst/speechflow-node-a2a-filler.js.map +1 -1
  24. package/speechflow-cli/dst/speechflow-node-a2a-gender.js +1 -1
  25. package/speechflow-cli/dst/speechflow-node-a2a-gender.js.map +1 -1
  26. package/speechflow-cli/dst/speechflow-node-a2a-meter.js +1 -1
  27. package/speechflow-cli/dst/speechflow-node-a2a-pitch.js +11 -9
  28. package/speechflow-cli/dst/speechflow-node-a2a-pitch.js.map +1 -1
  29. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise-wt.js +1 -0
  30. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise-wt.js.map +1 -1
  31. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise.js.map +1 -1
  32. package/speechflow-cli/dst/speechflow-node-a2a-speex.js +4 -2
  33. package/speechflow-cli/dst/speechflow-node-a2a-speex.js.map +1 -1
  34. package/speechflow-cli/dst/speechflow-node-a2a-vad.js +19 -22
  35. package/speechflow-cli/dst/speechflow-node-a2a-vad.js.map +1 -1
  36. package/speechflow-cli/dst/speechflow-node-a2a-wav.js +31 -4
  37. package/speechflow-cli/dst/speechflow-node-a2a-wav.js.map +1 -1
  38. package/speechflow-cli/dst/speechflow-node-a2t-amazon.d.ts +0 -1
  39. package/speechflow-cli/dst/speechflow-node-a2t-amazon.js +2 -11
  40. package/speechflow-cli/dst/speechflow-node-a2t-amazon.js.map +1 -1
  41. package/speechflow-cli/dst/speechflow-node-a2t-google.d.ts +16 -0
  42. package/speechflow-cli/dst/speechflow-node-a2t-google.js +314 -0
  43. package/speechflow-cli/dst/speechflow-node-a2t-google.js.map +1 -0
  44. package/speechflow-cli/dst/speechflow-node-a2t-openai.js +6 -1
  45. package/speechflow-cli/dst/speechflow-node-a2t-openai.js.map +1 -1
  46. package/speechflow-cli/dst/speechflow-node-t2a-amazon.d.ts +1 -1
  47. package/speechflow-cli/dst/speechflow-node-t2a-amazon.js +27 -7
  48. package/speechflow-cli/dst/speechflow-node-t2a-amazon.js.map +1 -1
  49. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.d.ts +1 -1
  50. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js +5 -3
  51. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js.map +1 -1
  52. package/speechflow-cli/dst/speechflow-node-t2a-google.d.ts +15 -0
  53. package/speechflow-cli/dst/speechflow-node-t2a-google.js +215 -0
  54. package/speechflow-cli/dst/speechflow-node-t2a-google.js.map +1 -0
  55. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.d.ts +1 -1
  56. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js +27 -6
  57. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js.map +1 -1
  58. package/speechflow-cli/dst/speechflow-node-t2a-openai.d.ts +15 -0
  59. package/speechflow-cli/dst/speechflow-node-t2a-openai.js +192 -0
  60. package/speechflow-cli/dst/speechflow-node-t2a-openai.js.map +1 -0
  61. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.d.ts +17 -0
  62. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js +619 -0
  63. package/speechflow-cli/dst/speechflow-node-t2a-supertonic.js.map +1 -0
  64. package/speechflow-cli/dst/speechflow-node-t2t-amazon.js +0 -2
  65. package/speechflow-cli/dst/speechflow-node-t2t-amazon.js.map +1 -1
  66. package/speechflow-cli/dst/speechflow-node-t2t-deepl.js.map +1 -1
  67. package/speechflow-cli/dst/speechflow-node-t2t-google.js.map +1 -1
  68. package/speechflow-cli/dst/{speechflow-node-t2t-transformers.d.ts → speechflow-node-t2t-opus.d.ts} +1 -3
  69. package/speechflow-cli/dst/speechflow-node-t2t-opus.js +161 -0
  70. package/speechflow-cli/dst/speechflow-node-t2t-opus.js.map +1 -0
  71. package/speechflow-cli/dst/speechflow-node-t2t-profanity.d.ts +11 -0
  72. package/speechflow-cli/dst/speechflow-node-t2t-profanity.js +118 -0
  73. package/speechflow-cli/dst/speechflow-node-t2t-profanity.js.map +1 -0
  74. package/speechflow-cli/dst/speechflow-node-t2t-punctuation.d.ts +13 -0
  75. package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js +220 -0
  76. package/speechflow-cli/dst/speechflow-node-t2t-punctuation.js.map +1 -0
  77. package/speechflow-cli/dst/{speechflow-node-t2t-openai.d.ts → speechflow-node-t2t-spellcheck.d.ts} +2 -2
  78. package/speechflow-cli/dst/{speechflow-node-t2t-openai.js → speechflow-node-t2t-spellcheck.js} +48 -100
  79. package/speechflow-cli/dst/speechflow-node-t2t-spellcheck.js.map +1 -0
  80. package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js +8 -8
  81. package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js.map +1 -1
  82. package/speechflow-cli/dst/speechflow-node-t2t-summary.d.ts +16 -0
  83. package/speechflow-cli/dst/speechflow-node-t2t-summary.js +241 -0
  84. package/speechflow-cli/dst/speechflow-node-t2t-summary.js.map +1 -0
  85. package/speechflow-cli/dst/{speechflow-node-t2t-ollama.d.ts → speechflow-node-t2t-translate.d.ts} +2 -2
  86. package/speechflow-cli/dst/{speechflow-node-t2t-transformers.js → speechflow-node-t2t-translate.js} +53 -115
  87. package/speechflow-cli/dst/speechflow-node-t2t-translate.js.map +1 -0
  88. package/speechflow-cli/dst/speechflow-node-x2x-filter.js +2 -0
  89. package/speechflow-cli/dst/speechflow-node-x2x-filter.js.map +1 -1
  90. package/speechflow-cli/dst/speechflow-node-xio-exec.d.ts +12 -0
  91. package/speechflow-cli/dst/speechflow-node-xio-exec.js +224 -0
  92. package/speechflow-cli/dst/speechflow-node-xio-exec.js.map +1 -0
  93. package/speechflow-cli/dst/speechflow-node-xio-file.d.ts +1 -0
  94. package/speechflow-cli/dst/speechflow-node-xio-file.js +78 -67
  95. package/speechflow-cli/dst/speechflow-node-xio-file.js.map +1 -1
  96. package/speechflow-cli/dst/speechflow-node-xio-mqtt.js.map +1 -1
  97. package/speechflow-cli/dst/speechflow-node-xio-vban.d.ts +17 -0
  98. package/speechflow-cli/dst/speechflow-node-xio-vban.js +330 -0
  99. package/speechflow-cli/dst/speechflow-node-xio-vban.js.map +1 -0
  100. package/speechflow-cli/dst/speechflow-node-xio-webrtc.d.ts +39 -0
  101. package/speechflow-cli/dst/speechflow-node-xio-webrtc.js +502 -0
  102. package/speechflow-cli/dst/speechflow-node-xio-webrtc.js.map +1 -0
  103. package/speechflow-cli/dst/speechflow-node-xio-websocket.js +9 -9
  104. package/speechflow-cli/dst/speechflow-node-xio-websocket.js.map +1 -1
  105. package/speechflow-cli/dst/speechflow-util-audio.js +8 -5
  106. package/speechflow-cli/dst/speechflow-util-audio.js.map +1 -1
  107. package/speechflow-cli/dst/speechflow-util-error.d.ts +1 -0
  108. package/speechflow-cli/dst/speechflow-util-error.js +5 -0
  109. package/speechflow-cli/dst/speechflow-util-error.js.map +1 -1
  110. package/speechflow-cli/dst/speechflow-util-llm.d.ts +35 -0
  111. package/speechflow-cli/dst/speechflow-util-llm.js +363 -0
  112. package/speechflow-cli/dst/speechflow-util-llm.js.map +1 -0
  113. package/speechflow-cli/dst/speechflow-util-queue.js +2 -1
  114. package/speechflow-cli/dst/speechflow-util-queue.js.map +1 -1
  115. package/speechflow-cli/dst/speechflow-util.d.ts +1 -0
  116. package/speechflow-cli/dst/speechflow-util.js +2 -0
  117. package/speechflow-cli/dst/speechflow-util.js.map +1 -1
  118. package/speechflow-cli/etc/oxlint.jsonc +2 -1
  119. package/speechflow-cli/package.json +35 -18
  120. package/speechflow-cli/src/lib.d.ts +5 -0
  121. package/speechflow-cli/src/speechflow-main-api.ts +16 -16
  122. package/speechflow-cli/src/speechflow-main-cli.ts +1 -0
  123. package/speechflow-cli/src/speechflow-main-graph.ts +38 -14
  124. package/speechflow-cli/src/speechflow-main-nodes.ts +1 -0
  125. package/speechflow-cli/src/speechflow-node-a2a-compressor-wt.ts +1 -0
  126. package/speechflow-cli/src/speechflow-node-a2a-compressor.ts +8 -10
  127. package/speechflow-cli/src/speechflow-node-a2a-expander-wt.ts +1 -0
  128. package/speechflow-cli/src/speechflow-node-a2a-expander.ts +9 -10
  129. package/speechflow-cli/src/speechflow-node-a2a-filler.ts +2 -0
  130. package/speechflow-cli/src/speechflow-node-a2a-gender.ts +3 -3
  131. package/speechflow-cli/src/speechflow-node-a2a-meter.ts +2 -2
  132. package/speechflow-cli/src/speechflow-node-a2a-pitch.ts +11 -9
  133. package/speechflow-cli/src/speechflow-node-a2a-rnnoise-wt.ts +1 -0
  134. package/speechflow-cli/src/speechflow-node-a2a-rnnoise.ts +1 -1
  135. package/speechflow-cli/src/speechflow-node-a2a-speex.ts +5 -3
  136. package/speechflow-cli/src/speechflow-node-a2a-vad.ts +20 -23
  137. package/speechflow-cli/src/speechflow-node-a2a-wav.ts +31 -4
  138. package/speechflow-cli/src/speechflow-node-a2t-amazon.ts +6 -18
  139. package/speechflow-cli/src/speechflow-node-a2t-google.ts +315 -0
  140. package/speechflow-cli/src/speechflow-node-a2t-openai.ts +12 -7
  141. package/speechflow-cli/src/speechflow-node-t2a-amazon.ts +32 -10
  142. package/speechflow-cli/src/speechflow-node-t2a-elevenlabs.ts +6 -4
  143. package/speechflow-cli/src/speechflow-node-t2a-google.ts +203 -0
  144. package/speechflow-cli/src/speechflow-node-t2a-kokoro.ts +33 -10
  145. package/speechflow-cli/src/speechflow-node-t2a-openai.ts +176 -0
  146. package/speechflow-cli/src/speechflow-node-t2a-supertonic.ts +710 -0
  147. package/speechflow-cli/src/speechflow-node-t2t-amazon.ts +3 -4
  148. package/speechflow-cli/src/speechflow-node-t2t-deepl.ts +2 -2
  149. package/speechflow-cli/src/speechflow-node-t2t-google.ts +1 -1
  150. package/speechflow-cli/src/speechflow-node-t2t-opus.ts +137 -0
  151. package/speechflow-cli/src/speechflow-node-t2t-profanity.ts +93 -0
  152. package/speechflow-cli/src/speechflow-node-t2t-punctuation.ts +201 -0
  153. package/speechflow-cli/src/speechflow-node-t2t-spellcheck.ts +188 -0
  154. package/speechflow-cli/src/speechflow-node-t2t-subtitle.ts +8 -8
  155. package/speechflow-cli/src/speechflow-node-t2t-summary.ts +229 -0
  156. package/speechflow-cli/src/speechflow-node-t2t-translate.ts +181 -0
  157. package/speechflow-cli/src/speechflow-node-x2x-filter.ts +2 -0
  158. package/speechflow-cli/src/speechflow-node-xio-exec.ts +211 -0
  159. package/speechflow-cli/src/speechflow-node-xio-file.ts +91 -80
  160. package/speechflow-cli/src/speechflow-node-xio-mqtt.ts +2 -2
  161. package/speechflow-cli/src/speechflow-node-xio-vban.ts +325 -0
  162. package/speechflow-cli/src/speechflow-node-xio-webrtc.ts +535 -0
  163. package/speechflow-cli/src/speechflow-node-xio-websocket.ts +9 -9
  164. package/speechflow-cli/src/speechflow-util-audio.ts +10 -5
  165. package/speechflow-cli/src/speechflow-util-error.ts +9 -0
  166. package/speechflow-cli/src/speechflow-util-llm.ts +367 -0
  167. package/speechflow-cli/src/speechflow-util-queue.ts +3 -3
  168. package/speechflow-cli/src/speechflow-util.ts +2 -0
  169. package/speechflow-ui-db/package.json +9 -9
  170. package/speechflow-ui-st/package.json +9 -9
  171. package/speechflow-cli/dst/speechflow-node-t2t-ollama.js +0 -293
  172. package/speechflow-cli/dst/speechflow-node-t2t-ollama.js.map +0 -1
  173. package/speechflow-cli/dst/speechflow-node-t2t-openai.js.map +0 -1
  174. package/speechflow-cli/dst/speechflow-node-t2t-transformers.js.map +0 -1
  175. package/speechflow-cli/src/speechflow-node-t2t-ollama.ts +0 -281
  176. package/speechflow-cli/src/speechflow-node-t2t-openai.ts +0 -247
  177. package/speechflow-cli/src/speechflow-node-t2t-transformers.ts +0 -247
@@ -85,6 +85,18 @@ export default class SpeechFlowNodeA2AVAD extends SpeechFlowNode {
85
85
  }
86
86
  }
87
87
 
88
+ /* helper function for tail timer handling */
89
+ const startTailTimer = () => {
90
+ tail = true
91
+ clearTailTimer()
92
+ this.tailTimer = setTimeout(() => {
93
+ if (this.closing || this.tailTimer === null)
94
+ return
95
+ tail = false
96
+ this.tailTimer = null
97
+ }, this.params.postSpeechTail)
98
+ }
99
+
88
100
  /* establish Voice Activity Detection (VAD) facility */
89
101
  let tail = false
90
102
  try {
@@ -111,31 +123,15 @@ export default class SpeechFlowNodeA2AVAD extends SpeechFlowNode {
111
123
  return
112
124
  const duration = util.audioArrayDuration(audio, vadSampleRateTarget)
113
125
  this.log("info", `VAD: speech end (duration: ${duration.toFixed(2)}s)`)
114
- if (this.params.mode === "unplugged") {
115
- tail = true
116
- clearTailTimer()
117
- this.tailTimer = setTimeout(() => {
118
- if (this.closing || this.tailTimer === null)
119
- return
120
- tail = false
121
- this.tailTimer = null
122
- }, this.params.postSpeechTail)
123
- }
126
+ if (this.params.mode === "unplugged")
127
+ startTailTimer()
124
128
  },
125
129
  onVADMisfire: () => {
126
130
  if (this.closing)
127
131
  return
128
132
  this.log("info", "VAD: speech end (segment too short)")
129
- if (this.params.mode === "unplugged") {
130
- tail = true
131
- clearTailTimer()
132
- this.tailTimer = setTimeout(() => {
133
- if (this.closing || this.tailTimer === null)
134
- return
135
- tail = false
136
- this.tailTimer = null
137
- }, this.params.postSpeechTail)
138
- }
133
+ if (this.params.mode === "unplugged")
134
+ startTailTimer()
139
135
  },
140
136
  onFrameProcessed: (audio) => {
141
137
  if (this.closing)
@@ -144,7 +140,7 @@ export default class SpeechFlowNodeA2AVAD extends SpeechFlowNode {
144
140
  /* annotate the current audio segment */
145
141
  const element = this.queueVAD.peek()
146
142
  if (element === undefined || element.type !== "audio-frame")
147
- throw new Error("internal error which cannot happen: no more queued element")
143
+ throw new Error("internal error that cannot happen: no more queued element")
148
144
  if (element.segmentIdx >= element.segmentData.length)
149
145
  throw new Error("segment index out of bounds")
150
146
  const segment = element.segmentData[element.segmentIdx++]
@@ -227,6 +223,7 @@ export default class SpeechFlowNodeA2AVAD extends SpeechFlowNode {
227
223
  }
228
224
  }
229
225
 
226
+ /* signal completion */
230
227
  callback()
231
228
  }
232
229
  catch (error) {
@@ -322,6 +319,7 @@ export default class SpeechFlowNodeA2AVAD extends SpeechFlowNode {
322
319
  }
323
320
  }
324
321
 
322
+ /* peek at send queue element */
325
323
  const element = self.queueSend.peek()
326
324
  if (element !== undefined && element.type === "audio-eof")
327
325
  this.push(null)
@@ -371,8 +369,7 @@ export default class SpeechFlowNodeA2AVAD extends SpeechFlowNode {
371
369
  if (this.vad !== null) {
372
370
  try {
373
371
  const flushPromise = this.vad.flush()
374
- const timeoutPromise = new Promise((resolve) =>
375
- setTimeout(resolve, 5000))
372
+ const timeoutPromise = new Promise((resolve) => { setTimeout(resolve, 5000) })
376
373
  await Promise.race([ flushPromise, timeoutPromise ])
377
374
  }
378
375
  catch (error) {
@@ -21,15 +21,18 @@ const writeWavHeader = (
21
21
  const sampleRate = options?.sampleRate ?? 44100 /* 44KHz */
22
22
  const bitDepth = options?.bitDepth ?? 16 /* 16-Bit */
23
23
 
24
+ /* determine header dimensions */
24
25
  const headerLength = 44
25
26
  const maxDataSize = Math.pow(2, 32) - 100 /* safe maximum for 32-bit WAV files */
26
27
  const dataLength = length ?? maxDataSize
27
28
  const fileSize = dataLength + headerLength
28
29
  const header = Buffer.alloc(headerLength)
29
30
 
31
+ /* calculate byte rate and block alignment */
30
32
  const byteRate = (sampleRate * channels * bitDepth) / 8
31
33
  const blockAlign = (channels * bitDepth) / 8
32
34
 
35
+ /* write header fields */
33
36
  let offset = 0
34
37
  header.write("RIFF", offset); offset += 4
35
38
  header.writeUInt32LE(fileSize - 8, offset); offset += 4
@@ -45,6 +48,7 @@ const writeWavHeader = (
45
48
  header.write("data", offset); offset += 4
46
49
  header.writeUInt32LE(dataLength, offset); offset += 4
47
50
 
51
+ /* return completed header */
48
52
  return header
49
53
  }
50
54
 
@@ -53,6 +57,7 @@ const readWavHeader = (buffer: Buffer) => {
53
57
  if (buffer.length < 44)
54
58
  throw new Error("WAV header too short, expected at least 44 bytes")
55
59
 
60
+ /* read header fields */
56
61
  let offset = 0
57
62
  const riffHead = buffer.subarray(offset, offset + 4).toString(); offset += 4
58
63
  const fileSize = buffer.readUInt32LE(offset); offset += 4
@@ -68,6 +73,7 @@ const readWavHeader = (buffer: Buffer) => {
68
73
  const data = buffer.subarray(offset, offset + 4).toString(); offset += 4
69
74
  const dataLength = buffer.readUInt32LE(offset); offset += 4
70
75
 
76
+ /* validate RIFF header */
71
77
  if (riffHead !== "RIFF")
72
78
  throw new Error(`Invalid WAV file: expected RIFF header, got "${riffHead}"`)
73
79
  if (waveHead !== "WAVE")
@@ -77,6 +83,7 @@ const readWavHeader = (buffer: Buffer) => {
77
83
  if (data !== "data")
78
84
  throw new Error(`Invalid WAV file: expected "data" header, got "${data}"`)
79
85
 
86
+ /* return parsed header data */
80
87
  return {
81
88
  riffHead, fileSize, waveHead, fmtHead, formatLength, audioFormat,
82
89
  channels, sampleRate, byteRate, blockAlign, bitDepth, data, dataLength
@@ -94,7 +101,8 @@ export default class SpeechFlowNodeA2AWAV extends SpeechFlowNode {
94
101
 
95
102
  /* declare node configuration parameters */
96
103
  this.configure({
97
- mode: { type: "string", pos: 1, val: "encode", match: /^(?:encode|decode)$/ }
104
+ mode: { type: "string", pos: 0, val: "encode", match: /^(?:encode|decode)$/ },
105
+ seekable: { type: "boolean", pos: 1, val: false }
98
106
  })
99
107
 
100
108
  /* declare node input/output format */
@@ -106,7 +114,9 @@ export default class SpeechFlowNodeA2AWAV extends SpeechFlowNode {
106
114
  async open () {
107
115
  /* establish a transform stream */
108
116
  const self = this
109
- let firstChunk = true
117
+ let isFirstChunk = true
118
+ let headerChunkSent: SpeechFlowChunk | null = null
119
+ let totalSize = 0
110
120
  this.stream = new Stream.Transform({
111
121
  readableObjectMode: true,
112
122
  writableObjectMode: true,
@@ -115,7 +125,7 @@ export default class SpeechFlowNodeA2AWAV extends SpeechFlowNode {
115
125
  transform (chunk: SpeechFlowChunk, encoding, callback) {
116
126
  if (!Buffer.isBuffer(chunk.payload))
117
127
  callback(new Error("invalid chunk payload type"))
118
- else if (firstChunk) {
128
+ else if (isFirstChunk) {
119
129
  if (self.params.mode === "encode") {
120
130
  /* convert raw/PCM to WAV/PCM
121
131
  (NOTICE: as this is a continuous stream, the
@@ -132,7 +142,9 @@ export default class SpeechFlowNodeA2AWAV extends SpeechFlowNode {
132
142
  const headerChunk = chunk.clone()
133
143
  headerChunk.payload = headerBuffer
134
144
  this.push(headerChunk)
145
+ headerChunkSent = headerChunk
135
146
  this.push(chunk)
147
+ totalSize += chunk.payload.byteLength
136
148
  callback()
137
149
  }
138
150
  else if (self.params.mode === "decode") {
@@ -173,21 +185,36 @@ export default class SpeechFlowNodeA2AWAV extends SpeechFlowNode {
173
185
  }
174
186
  chunk.payload = chunk.payload.subarray(44)
175
187
  this.push(chunk)
188
+ totalSize += chunk.payload.byteLength
176
189
  callback()
177
190
  }
178
191
  else {
179
192
  callback(new Error(`invalid operation mode "${self.params.mode}"`))
180
193
  return
181
194
  }
182
- firstChunk = false
195
+ isFirstChunk = false
183
196
  }
184
197
  else {
185
198
  /* pass-through original chunk */
186
199
  this.push(chunk)
200
+ totalSize += chunk.payload.byteLength
187
201
  callback()
188
202
  }
189
203
  },
190
204
  final (callback) {
205
+ if (self.params.seekable && headerChunkSent !== null) {
206
+ self.log("info", "sending updated WAV header")
207
+ const headerBuffer = writeWavHeader(totalSize, {
208
+ audioFormat: 0x0001 /* PCM */,
209
+ channels: self.config.audioChannels,
210
+ sampleRate: self.config.audioSampleRate,
211
+ bitDepth: self.config.audioBitDepth
212
+ })
213
+ const headerChunk = headerChunkSent?.clone()
214
+ headerChunk.payload = headerBuffer
215
+ headerChunk.meta.set("chunk:seek", 0)
216
+ this.push(headerChunk)
217
+ }
191
218
  callback()
192
219
  }
193
220
  })
@@ -53,7 +53,7 @@ class AsyncQueue<T> {
53
53
  continue
54
54
  }
55
55
  else {
56
- const it = await new Promise<IteratorResult<T>>((resolve) => this.resolvers.push(resolve))
56
+ const it = await new Promise<IteratorResult<T>>((resolve) => { this.resolvers.push(resolve) })
57
57
  if (it.done)
58
58
  return
59
59
  yield it.value
@@ -68,11 +68,10 @@ export default class SpeechFlowNodeA2TAmazon extends SpeechFlowNode {
68
68
  public static name = "a2t-amazon"
69
69
 
70
70
  /* internal state */
71
- private client: TranscribeStreamingClient | null = null
72
- private clientStream: AsyncIterable<TranscriptResultStream> | null = null
73
- private closing = false
74
- private connectionTimeout: ReturnType<typeof setTimeout> | null = null
75
- private queue: util.SingleQueue<SpeechFlowChunk | null> | null = null
71
+ private client: TranscribeStreamingClient | null = null
72
+ private clientStream: AsyncIterable<TranscriptResultStream> | null = null
73
+ private closing = false
74
+ private queue: util.SingleQueue<SpeechFlowChunk | null> | null = null
76
75
 
77
76
  /* construct node */
78
77
  constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
@@ -126,8 +125,6 @@ export default class SpeechFlowNodeA2TAmazon extends SpeechFlowNode {
126
125
  secretAccessKey: this.params.secKey
127
126
  }
128
127
  })
129
- if (this.client === null)
130
- throw new Error("failed to establish Amazon Transcribe client")
131
128
 
132
129
  /* create an AudioStream for Amazon Transcribe */
133
130
  const audioQueue = new AsyncQueue<Uint8Array>()
@@ -236,11 +233,8 @@ export default class SpeechFlowNodeA2TAmazon extends SpeechFlowNode {
236
233
  callback()
237
234
  return
238
235
  }
239
-
240
- /* await all read operations */
241
236
  await reads.awaitAll()
242
-
243
- util.run(
237
+ util.run("closing Amazon Transcribe connection",
244
238
  () => self.client!.destroy(),
245
239
  (error: Error) => self.log("warning", `error closing Amazon Transcribe connection: ${error}`)
246
240
  )
@@ -279,12 +273,6 @@ export default class SpeechFlowNodeA2TAmazon extends SpeechFlowNode {
279
273
  /* indicate closing first to stop all async operations */
280
274
  this.closing = true
281
275
 
282
- /* cleanup all timers */
283
- if (this.connectionTimeout !== null) {
284
- clearTimeout(this.connectionTimeout)
285
- this.connectionTimeout = null
286
- }
287
-
288
276
  /* close queue */
289
277
  if (this.queue !== null) {
290
278
  this.queue.write(null)
@@ -0,0 +1,315 @@
1
+ /*
2
+ ** SpeechFlow - Speech Processing Flow Graph
3
+ ** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
4
+ ** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
5
+ */
6
+
7
+ /* standard dependencies */
8
+ import Stream from "node:stream"
9
+
10
+ /* external dependencies */
11
+ import * as GoogleSpeech from "@google-cloud/speech"
12
+ import { DateTime, Duration } from "luxon"
13
+ import * as arktype from "arktype"
14
+
15
+ /* internal dependencies */
16
+ import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
17
+ import * as util from "./speechflow-util"
18
+
19
+ /* SpeechFlow node for Google Cloud speech-to-text conversion */
20
+ export default class SpeechFlowNodeA2TGoogle extends SpeechFlowNode {
21
+ /* declare official node name */
22
+ public static name = "a2t-google"
23
+
24
+ /* internal state */
25
+ private client: GoogleSpeech.SpeechClient | null = null
26
+ private recognizeStream: ReturnType<GoogleSpeech.SpeechClient["streamingRecognize"]> | null = null
27
+ private queue: util.SingleQueue<SpeechFlowChunk | null> | null = null
28
+ private closing = false
29
+
30
+ /* construct node */
31
+ constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
32
+ super(id, cfg, opts, args)
33
+
34
+ /* declare node configuration parameters */
35
+ this.configure({
36
+ key: { type: "string", val: process.env.SPEECHFLOW_GOOGLE_KEY ?? "" },
37
+ model: { type: "string", pos: 0, val: "latest_long" },
38
+ language: { type: "string", pos: 1, val: "en-US" },
39
+ interim: { type: "boolean", pos: 2, val: false }
40
+ })
41
+
42
+ /* validate API key */
43
+ if (this.params.key === "")
44
+ throw new Error("Google Cloud API credentials JSON key is required")
45
+
46
+ /* declare node input/output format */
47
+ this.input = "audio"
48
+ this.output = "text"
49
+ }
50
+
51
+ /* one-time status of node */
52
+ async status () {
53
+ return {}
54
+ }
55
+
56
+ /* open node */
57
+ async open () {
58
+ /* sanity check situation */
59
+ if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
60
+ throw new Error("Google Speech node currently supports PCM-S16LE audio only")
61
+
62
+ /* clear destruction flag */
63
+ this.closing = false
64
+
65
+ /* create queue for results */
66
+ this.queue = new util.SingleQueue<SpeechFlowChunk | null>()
67
+
68
+ /* create a store for the meta information */
69
+ const metastore = new util.TimeStore<Map<string, any>>()
70
+
71
+ /* instantiate Google Speech client */
72
+ const data = util.run("Google Cloud API credentials key", () =>
73
+ JSON.parse(this.params.key))
74
+ const credentials = util.importObject("Google Cloud API credentials key",
75
+ data,
76
+ arktype.type({
77
+ project_id: "string",
78
+ private_key: "string",
79
+ client_email: "string"
80
+ })
81
+ )
82
+ this.client = new GoogleSpeech.SpeechClient({
83
+ credentials: {
84
+ private_key: credentials.private_key,
85
+ client_email: credentials.client_email
86
+ },
87
+ projectId: credentials.project_id
88
+ })
89
+
90
+ /* create streaming recognition request */
91
+ this.recognizeStream = this.client.streamingRecognize({
92
+ config: {
93
+ encoding: "LINEAR16",
94
+ sampleRateHertz: this.config.audioSampleRate,
95
+ languageCode: this.params.language,
96
+ model: this.params.model,
97
+ enableAutomaticPunctuation: true,
98
+ enableWordTimeOffsets: true
99
+ },
100
+ interimResults: this.params.interim
101
+ })
102
+
103
+ /* hook onto Google Speech API events */
104
+ this.recognizeStream.on("data", (data: GoogleSpeech.protos.google.cloud.speech.v1.IStreamingRecognizeResponse) => {
105
+ if (this.closing || this.queue === null)
106
+ return
107
+ if (!data.results || data.results.length === 0)
108
+ return
109
+ for (const result of data.results) {
110
+ if (!result.alternatives || result.alternatives.length === 0)
111
+ continue
112
+ const alternative = result.alternatives[0]
113
+ const text = alternative.transcript ?? ""
114
+ if (text === "")
115
+ continue
116
+ const isFinal = result.isFinal ?? false
117
+ if (!isFinal && !this.params.interim)
118
+ continue
119
+
120
+ /* calculate timestamps */
121
+ let tsStart = Duration.fromMillis(0)
122
+ let tsEnd = Duration.fromMillis(0)
123
+
124
+ /* extract word timing information if available */
125
+ const words: { word: string, start: Duration, end: Duration }[] = []
126
+ if (alternative.words && alternative.words.length > 0) {
127
+ for (const wordInfo of alternative.words) {
128
+ const wordStart = wordInfo.startTime
129
+ ? Duration.fromMillis(
130
+ (Number(wordInfo.startTime.seconds ?? 0) * 1000) +
131
+ (Number(wordInfo.startTime.nanos ?? 0) / 1000000)
132
+ ).plus(this.timeZeroOffset)
133
+ : Duration.fromMillis(0)
134
+ const wordEnd = wordInfo.endTime
135
+ ? Duration.fromMillis(
136
+ (Number(wordInfo.endTime.seconds ?? 0) * 1000) +
137
+ (Number(wordInfo.endTime.nanos ?? 0) / 1000000)
138
+ ).plus(this.timeZeroOffset)
139
+ : Duration.fromMillis(0)
140
+ words.push({
141
+ word: wordInfo.word ?? "",
142
+ start: wordStart,
143
+ end: wordEnd
144
+ })
145
+ }
146
+ if (words.length > 0) {
147
+ tsStart = words[0].start
148
+ tsEnd = words[words.length - 1].end
149
+ }
150
+ }
151
+ else {
152
+ /* fallback: use result timing */
153
+ const resultEnd = result.resultEndTime
154
+ if (resultEnd) {
155
+ tsEnd = Duration.fromMillis(
156
+ (Number(resultEnd.seconds ?? 0) * 1000) +
157
+ (Number(resultEnd.nanos ?? 0) / 1000000)
158
+ ).plus(this.timeZeroOffset)
159
+ }
160
+ }
161
+ this.log("info", `text received (start: ${tsStart.toMillis()}ms, ` +
162
+ `end: ${tsEnd.toMillis()}ms, ` +
163
+ `kind: ${isFinal ? "final" : "intermediate"}): ` +
164
+ `"${text}"`)
165
+
166
+ /* fetch and merge meta information */
167
+ const metas = metastore.fetch(tsStart, tsEnd)
168
+ const meta = metas.toReversed().reduce((prev: Map<string, any>, curr: Map<string, any>) => {
169
+ curr.forEach((val, key) => { prev.set(key, val) })
170
+ return prev
171
+ }, new Map<string, any>())
172
+ metastore.prune(tsStart)
173
+
174
+ /* add word timing to meta */
175
+ if (words.length > 0)
176
+ meta.set("words", words)
177
+
178
+ /* create and enqueue chunk */
179
+ const chunk = new SpeechFlowChunk(tsStart, tsEnd,
180
+ isFinal ? "final" : "intermediate", "text", text, meta)
181
+ this.queue.write(chunk)
182
+ }
183
+ })
184
+ this.recognizeStream.on("error", (error: Error) => {
185
+ this.log("error", `error: ${error.message}`)
186
+ if (!this.closing && this.queue !== null)
187
+ this.queue.write(null)
188
+ this.emit("error", error)
189
+ })
190
+ this.recognizeStream.on("end", () => {
191
+ this.log("info", "stream ended")
192
+ if (!this.closing && this.queue !== null)
193
+ this.queue.write(null)
194
+ })
195
+
196
+ /* remember opening time to receive time zero offset */
197
+ this.timeOpen = DateTime.now()
198
+
199
+ /* provide Duplex stream and internally attach to Google Speech API */
200
+ const self = this
201
+ const reads = new util.PromiseSet<void>()
202
+ this.stream = new Stream.Duplex({
203
+ writableObjectMode: true,
204
+ readableObjectMode: true,
205
+ decodeStrings: false,
206
+ highWaterMark: 1,
207
+ write (chunk: SpeechFlowChunk, encoding, callback) {
208
+ if (self.closing || self.recognizeStream === null) {
209
+ callback(new Error("stream already destroyed"))
210
+ return
211
+ }
212
+ if (chunk.type !== "audio")
213
+ callback(new Error("expected audio input chunk"))
214
+ else if (!Buffer.isBuffer(chunk.payload))
215
+ callback(new Error("expected Buffer input chunk"))
216
+ else {
217
+ if (chunk.payload.byteLength > 0) {
218
+ self.log("debug", `send data (${chunk.payload.byteLength} bytes)`)
219
+ if (chunk.meta.size > 0)
220
+ metastore.store(chunk.timestampStart, chunk.timestampEnd, chunk.meta)
221
+ try {
222
+ self.recognizeStream.write(chunk.payload)
223
+ }
224
+ catch (error) {
225
+ callback(util.ensureError(error, "failed to send to Google Speech"))
226
+ return
227
+ }
228
+ }
229
+ callback()
230
+ }
231
+ },
232
+ async final (callback) {
233
+ /* short-circuiting in case of own closing */
234
+ if (self.closing || self.recognizeStream === null) {
235
+ callback()
236
+ return
237
+ }
238
+
239
+ /* close Google Speech stream */
240
+ try {
241
+ self.recognizeStream.end()
242
+ }
243
+ catch (error) {
244
+ self.log("warning", `error closing Google Speech stream: ${error}`)
245
+ }
246
+
247
+ /* await all read operations */
248
+ await reads.awaitAll()
249
+ callback()
250
+ },
251
+ read (size) {
252
+ if (self.closing || self.queue === null) {
253
+ this.push(null)
254
+ return
255
+ }
256
+ reads.add(self.queue.read().then((chunk) => {
257
+ if (self.closing || self.queue === null) {
258
+ this.push(null)
259
+ return
260
+ }
261
+ if (chunk === null) {
262
+ self.log("info", "received EOF signal")
263
+ this.push(null)
264
+ }
265
+ else {
266
+ self.log("debug", `received data (${chunk.payload.length} bytes)`)
267
+ this.push(chunk)
268
+ }
269
+ }).catch((error: unknown) => {
270
+ if (!self.closing && self.queue !== null)
271
+ self.log("error", `queue read error: ${util.ensureError(error).message}`)
272
+ }))
273
+ }
274
+ })
275
+ }
276
+
277
+ /* close node */
278
+ async close () {
279
+ /* indicate closing first to stop all async operations */
280
+ this.closing = true
281
+
282
+ /* shutdown stream */
283
+ if (this.stream !== null) {
284
+ await util.destroyStream(this.stream)
285
+ this.stream = null
286
+ }
287
+
288
+ /* close Google Speech stream and client */
289
+ if (this.recognizeStream !== null) {
290
+ try {
291
+ this.recognizeStream.removeAllListeners()
292
+ this.recognizeStream.destroy()
293
+ }
294
+ catch (error) {
295
+ this.log("warning", `error during Google Speech stream cleanup: ${error}`)
296
+ }
297
+ this.recognizeStream = null
298
+ }
299
+ if (this.client !== null) {
300
+ try {
301
+ await this.client.close()
302
+ }
303
+ catch (error) {
304
+ this.log("warning", `error closing Google Speech client: ${error}`)
305
+ }
306
+ this.client = null
307
+ }
308
+
309
+ /* signal EOF to any pending read operations */
310
+ if (this.queue !== null) {
311
+ this.queue.write(null)
312
+ this.queue = null
313
+ }
314
+ }
315
+ }
@@ -23,12 +23,12 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
23
23
  public static name = "a2t-openai"
24
24
 
25
25
  /* internal state */
26
- private openai: OpenAI | null = null
27
- private ws: ws.WebSocket | null = null
28
- private queue: util.SingleQueue<SpeechFlowChunk | null> | null = null
29
- private resampler: SpeexResampler | null = null
30
- private closing = false
31
- private connectionTimeout: ReturnType<typeof setTimeout> | null = null
26
+ private openai: OpenAI | null = null
27
+ private ws: ws.WebSocket | null = null
28
+ private queue: util.SingleQueue<SpeechFlowChunk | null> | null = null
29
+ private resampler: SpeexResampler | null = null
30
+ private closing = false
31
+ private connectionTimeout: ReturnType<typeof setTimeout> | null = null
32
32
 
33
33
  /* construct node */
34
34
  constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
@@ -150,6 +150,9 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
150
150
  })
151
151
  this.ws.on("error", (err) => {
152
152
  this.log("error", `WebSocket connection error: ${err}`)
153
+ if (!this.closing && this.queue !== null)
154
+ this.queue.write(null)
155
+ this.emit("error", err)
153
156
  })
154
157
 
155
158
  /* track speech timing by item_id (OpenAI provides timestamps via VAD events) */
@@ -164,6 +167,7 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
164
167
  }, new Map<string, any>())
165
168
  }
166
169
 
170
+ /* track transcription text */
167
171
  let text = ""
168
172
  this.ws.on("message", (data) => {
169
173
  let ev: any
@@ -353,7 +357,8 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
353
357
  this.ws.close()
354
358
  this.ws = null
355
359
  }
356
- this.openai = null
360
+ if (this.openai !== null)
361
+ this.openai = null
357
362
 
358
363
  /* close resampler */
359
364
  this.resampler = null