npm - speechflow - Versions diffs - 1.6.6 → 1.7.0 - Mend

speechflow 1.6.6 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (146) hide show

package/speechflow-cli/src/speechflow-node-a2t-deepgram.ts CHANGED Viewed

@@ -187,6 +187,7 @@ export default class SpeechFlowNodeA2TDeepgram extends SpeechFlowNode {
         /*  provide Duplex stream and internally attach to Deepgram API  */
         const self = this
+        const reads = new util.PromiseSet<void>()
         this.stream = new Stream.Duplex({
             writableObjectMode: true,
             readableObjectMode: true,
@@ -217,12 +218,33 @@ export default class SpeechFlowNodeA2TDeepgram extends SpeechFlowNode {
                     callback()
                 }
             },
+            async final (callback) {
+                /*  short-circuiting in case of own closing  */
+                if (self.closing || self.dg === null) {
+                    callback()
+                    return
+                }
+                /*  close Deepgram API  */
+                try {
+                    self.dg.requestClose()
+                }
+                catch (error) {
+                    self.log("warning", `error closing Deepgram connection: ${error}`)
+                }
+                /*  await all read operations  */
+                await reads.awaitAll()
+                /*  NOTICE: do not push null here -- let the Deepgram close event handle it  */
+                callback()
+            },
             read (size) {
                 if (self.closing || self.queue === null) {
                     this.push(null)
                     return
                 }
-                self.queue.read().then((chunk) => {
+                reads.add(self.queue.read().then((chunk) => {
                     if (self.closing || self.queue === null) {
                         this.push(null)
                         return
@@ -238,21 +260,7 @@ export default class SpeechFlowNodeA2TDeepgram extends SpeechFlowNode {
                 }).catch((error: unknown) => {
                     if (!self.closing && self.queue !== null)
                         self.log("error", `queue read error: ${util.ensureError(error).message}`)
-                })
-            },
-            final (callback) {
-                if (self.closing || self.dg === null) {
-                    callback()
-                    return
-                }
-                try {
-                    self.dg.requestClose()
-                }
-                catch (error) {
-                    self.log("warning", `error closing Deepgram connection: ${error}`)
-                }
-                /*  NOTICE: do not push null here -- let the Deepgram close event handle it  */
-                callback()
+                }))
             }
         })
     }

package/speechflow-cli/src/speechflow-node-a2t-openai.ts CHANGED Viewed

@@ -9,7 +9,7 @@ import Stream from "node:stream"
 /*  external dependencies  */
 import OpenAI                 from "openai"
-import { DateTime }           from "luxon"
+import { DateTime, Duration } from "luxon"
 import SpeexResampler         from "speex-resampler"
 import ws                     from "ws"
@@ -23,11 +23,11 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
     public static name = "a2t-openai"
     /*  internal state  */
-    private openai:     OpenAI | null = null
-    private ws:         ws.WebSocket | null = null
-    private queue:      util.SingleQueue<SpeechFlowChunk | null> | null = null
-    private resampler:  SpeexResampler | null = null
-    private closing   = false
+    private openai:            OpenAI | null = null
+    private ws:                ws.WebSocket | null = null
+    private queue:             util.SingleQueue<SpeechFlowChunk | null> | null = null
+    private resampler:         SpeexResampler | null = null
+    private closing            = false
     private connectionTimeout: ReturnType<typeof setTimeout> | null = null
     /*  construct node  */
@@ -141,11 +141,25 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
         })
         this.ws.on("close", () => {
             this.log("info", "WebSocket connection closed")
-            this.queue!.write(null)
+            if (!this.closing && this.queue !== null)
+                this.queue.write(null)
         })
         this.ws.on("error", (err) => {
             this.log("error", `WebSocket connection error: ${err}`)
         })
+        /*  track speech timing by item_id (OpenAI provides timestamps via VAD events)  */
+        const speechTiming = new Map<string, { startMs: number, endMs: number }>()
+        /*  helper function for aggregating meta information  */
+        const aggregateMeta = (start: Duration, end: Duration): Map<string, any> => {
+            const metas = metastore.fetch(start, end)
+            return metas.toReversed().reduce((prev: Map<string, any>, curr: Map<string, any>) => {
+                curr.forEach((val, key) => { prev.set(key, val) })
+                return prev
+            }, new Map<string, any>())
+        }
         let text = ""
         this.ws.on("message", (data) => {
             let ev: any
@@ -163,53 +177,63 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
             switch (ev.type) {
                 case "transcription_session.created":
                     break
-                case "conversation.item.created":
+                case "conversation.item.created": {
                     text = ""
                     break
+                }
                 case "conversation.item.input_audio_transcription.delta": {
                     text += ev.delta as string
-                    if (this.params.interim) {
-                        const start = DateTime.now().diff(this.timeOpen!) // FIXME: OpenAI does not provide timestamps
-                        const end   = start                               // FIXME: OpenAI does not provide timestamps
-                        const metas = metastore.fetch(start, end)
-                        const meta = metas.toReversed().reduce((prev: Map<string, any>, curr: Map<string, any>) => {
-                            curr.forEach((val, key) => { prev.set(key, val) })
-                            return prev
-                        }, new Map<string, any>())
-                        const chunk = new SpeechFlowChunk(start, end, "intermediate", "text", text)
-                        chunk.meta = meta
-                        this.queue!.write(chunk)
+                    if (this.params.interim && !this.closing && this.queue !== null) {
+                        const itemId = ev.item_id as string
+                        const timing = speechTiming.get(itemId)
+                        const start  = timing ? Duration.fromMillis(timing.startMs) : DateTime.now().diff(this.timeOpen!)
+                        const end    = timing ? Duration.fromMillis(timing.endMs)   : start
+                        const chunk  = new SpeechFlowChunk(start, end, "intermediate", "text", text)
+                        chunk.meta = aggregateMeta(start, end)
+                        this.queue.write(chunk)
                     }
                     break
                 }
                 case "conversation.item.input_audio_transcription.completed": {
-                    text = ev.transcript as string
-                    const start = DateTime.now().diff(this.timeOpen!) // FIXME: OpenAI does not provide timestamps
-                    const end   = start                               // FIXME: OpenAI does not provide timestamps
-                    const metas = metastore.fetch(start, end)
-                    const meta = metas.toReversed().reduce((prev: Map<string, any>, curr: Map<string, any>) => {
-                        curr.forEach((val, key) => { prev.set(key, val) })
-                        return prev
-                    }, new Map<string, any>())
-                    metastore.prune(start)
-                    const chunk = new SpeechFlowChunk(start, end, "final", "text", text)
-                    chunk.meta = meta
-                    this.queue!.write(chunk)
-                    text = ""
+                    if (!this.closing && this.queue !== null) {
+                        text = ev.transcript as string
+                        const itemId = ev.item_id as string
+                        const timing = speechTiming.get(itemId)
+                        const start  = timing ? Duration.fromMillis(timing.startMs) : DateTime.now().diff(this.timeOpen!)
+                        const end    = timing ? Duration.fromMillis(timing.endMs)   : start
+                        const chunk  = new SpeechFlowChunk(start, end, "final", "text", text)
+                        chunk.meta = aggregateMeta(start, end)
+                        metastore.prune(start)
+                        speechTiming.delete(itemId)
+                        this.queue.write(chunk)
+                        text = ""
+                    }
                     break
                 }
-                case "input_audio_buffer.speech_started":
+                case "input_audio_buffer.speech_started": {
                     this.log("info", "VAD: speech started")
+                    const itemId = ev.item_id as string
+                    const audioStartMs = ev.audio_start_ms as number
+                    speechTiming.set(itemId, { startMs: audioStartMs, endMs: audioStartMs })
                     break
-                case "input_audio_buffer.speech_stopped":
+                }
+                case "input_audio_buffer.speech_stopped": {
                     this.log("info", "VAD: speech stopped")
+                    const itemId = ev.item_id as string
+                    const audioEndMs = ev.audio_end_ms as number
+                    const timing = speechTiming.get(itemId)
+                    if (timing)
+                        timing.endMs = audioEndMs
                     break
-                case "input_audio_buffer.committed":
+                }
+                case "input_audio_buffer.committed": {
                     this.log("info", "input buffer committed")
                     break
-                case "error":
+                }
+                case "error": {
                     this.log("error", `error: ${ev.error?.message}`)
                     break
+                }
                 default:
                     break
             }
@@ -220,6 +244,7 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
         /*  provide Duplex stream and internally attach to OpenAI API  */
         const self = this
+        const reads = new util.PromiseSet<void>()
         this.stream = new Stream.Duplex({
             writableObjectMode: true,
             readableObjectMode: true,
@@ -255,12 +280,32 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
                     callback()
                 }
             },
+            async final (callback) {
+                if (self.closing || self.ws === null) {
+                    callback()
+                    return
+                }
+                try {
+                    sendMessage({ type: "input_audio_buffer.commit" })
+                    self.ws.close()
+                    await util.sleep(50)
+                }
+                catch (error) {
+                    self.log("warning", `error closing OpenAI connection: ${error}`)
+                }
+                await reads.awaitAll()
+                const chunks: Array<SpeechFlowChunk | null> = self.queue?.drain() ?? []
+                for (const chunk of chunks)
+                    this.push(chunk)
+                this.push(null)
+                callback()
+            },
             read (size) {
                 if (self.closing || self.queue === null) {
                     this.push(null)
                     return
                 }
-                self.queue.read().then((chunk) => {
+                reads.add(self.queue.read().then((chunk) => {
                     if (self.closing || self.queue === null) {
                         this.push(null)
                         return
@@ -276,23 +321,7 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
                 }).catch((error: unknown) => {
                     if (!self.closing && self.queue !== null)
                         self.log("error", `queue read error: ${util.ensureError(error).message}`)
-                })
-            },
-            final (callback) {
-                if (self.closing || self.ws === null) {
-                    callback()
-                    return
-                }
-                try {
-                    sendMessage({ type: "input_audio_buffer.commit" })
-                    self.ws.close()
-                    /*  NOTICE: do not push null here -- let the OpenAI close event handle it  */
-                    callback()
-                }
-                catch (error) {
-                    self.log("warning", `error closing OpenAI connection: ${error}`)
-                    callback(util.ensureError(error, "failed to close OpenAI connection"))
-                }
+                }))
             }
         })
     }
@@ -316,11 +345,14 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
         /*  close OpenAI connection  */
         if (this.ws !== null) {
+            this.ws.removeAllListeners()
             this.ws.close()
             this.ws = null
         }
         if (this.openai !== null)
             this.openai = null
+        if (this.resampler !== null)
+            this.resampler = null
         /*  shutdown stream  */
         if (this.stream !== null) {

package/speechflow-cli/src/speechflow-node-t2a-elevenlabs.ts CHANGED Viewed

@@ -10,6 +10,7 @@ import Stream from "node:stream"
 /*  external dependencies  */
 import * as ElevenLabs       from "@elevenlabs/elevenlabs-js"
 import { getStreamAsBuffer } from "get-stream"
+import { Duration }          from "luxon"
 import SpeexResampler        from "speex-resampler"
 /*  internal dependencies  */
@@ -140,58 +141,60 @@ export default class SpeechFlowNodeT2AElevenlabs extends SpeechFlowNode {
             readableObjectMode: true,
             decodeStrings:      false,
             highWaterMark:      1,
-            transform (chunk: SpeechFlowChunk, encoding, callback) {
+            async transform (chunk: SpeechFlowChunk, encoding, callback) {
                 if (self.closing)
                     callback(new Error("stream already destroyed"))
                 else if (Buffer.isBuffer(chunk.payload))
                     callback(new Error("invalid chunk payload type"))
                 else {
-                    (async () => {
-                        let processTimeout: ReturnType<typeof setTimeout> | null = setTimeout(() => {
+                    let processTimeout: ReturnType<typeof setTimeout> | null = setTimeout(() => {
+                        processTimeout = null
+                        callback(new Error("ElevenLabs API timeout"))
+                    }, 60 * 1000)
+                    const clearProcessTimeout = () => {
+                        if (processTimeout !== null) {
+                            clearTimeout(processTimeout)
                             processTimeout = null
-                            callback(new Error("ElevenLabs API timeout"))
-                        }, 60 * 1000)
-                        const clearProcessTimeout = () => {
-                            if (processTimeout !== null) {
-                                clearTimeout(processTimeout)
-                                processTimeout = null
-                            }
                         }
-                        try {
-                            if (self.closing) {
-                                clearProcessTimeout()
-                                callback(new Error("stream destroyed during processing"))
-                                return
-                            }
-                            const stream = await speechStream(chunk.payload as string)
-                            const buffer = await getStreamAsBuffer(stream)
-                            if (self.closing) {
-                                clearProcessTimeout()
-                                callback(new Error("stream destroyed during processing"))
-                                return
-                            }
-                            const bufferResampled = self.resampler!.processChunk(buffer)
-                            self.log("info", `ElevenLabs: received audio (buffer length: ${buffer.byteLength})`)
-                            const chunkNew = chunk.clone()
-                            chunkNew.type = "audio"
-                            chunkNew.payload = bufferResampled
+                    }
+                    try {
+                        if (self.closing) {
                             clearProcessTimeout()
-                            this.push(chunkNew)
-                            callback()
+                            callback(new Error("stream destroyed during processing"))
+                            return
                         }
-                        catch (error) {
+                        const stream = await speechStream(chunk.payload as string)
+                        const buffer = await getStreamAsBuffer(stream)
+                        if (self.closing) {
                             clearProcessTimeout()
-                            callback(util.ensureError(error, "ElevenLabs processing failed"))
+                            callback(new Error("stream destroyed during processing"))
+                            return
                         }
-                    })()
+                        self.log("info", `ElevenLabs: received audio (buffer length: ${buffer.byteLength})`)
+                        const bufferResampled = self.resampler!.processChunk(buffer)
+                        self.log("info", "ElevenLabs: forwarding resampled audio " +
+                            `(buffer length: ${bufferResampled.byteLength})`)
+                        /*  calculate actual audio duration from PCM buffer size  */
+                        const durationMs = util.audioBufferDuration(bufferResampled,
+                            self.config.audioSampleRate, self.config.audioBitDepth) * 1000
+                        /*  create new chunk with recalculated timestamps  */
+                        const chunkNew = chunk.clone()
+                        chunkNew.type         = "audio"
+                        chunkNew.payload      = bufferResampled
+                        chunkNew.timestampEnd = Duration.fromMillis(chunkNew.timestampStart.toMillis() + durationMs)
+                        clearProcessTimeout()
+                        this.push(chunkNew)
+                        callback()
+                    }
+                    catch (error) {
+                        clearProcessTimeout()
+                        callback(util.ensureError(error, "ElevenLabs processing failed"))
+                    }
                 }
             },
             final (callback) {
-                if (self.closing) {
-                    callback()
-                    return
-                }
-                this.push(null)
                 callback()
             }
         })

package/speechflow-cli/src/speechflow-node-t2a-kokoro.ts CHANGED Viewed

@@ -142,7 +142,6 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
                 }
             },
             final (callback) {
-                this.push(null)
                 callback()
             }
         })

package/speechflow-cli/src/speechflow-node-t2t-amazon.ts CHANGED Viewed

@@ -98,7 +98,7 @@ export default class SpeechFlowNodeT2TAmazon extends SpeechFlowNode {
                     if (!retriable || attempt >= maxRetries)
                         break
                     const delayMs = Math.min(1000 * Math.pow(2, attempt - 1), 5000)
-                    await new Promise((resolve) => setTimeout(resolve, delayMs))
+                    await util.sleep(delayMs)
                 }
             }
             throw util.ensureError(lastError)
@@ -129,7 +129,6 @@ export default class SpeechFlowNodeT2TAmazon extends SpeechFlowNode {
                 }
             },
             final (callback) {
-                this.push(null)
                 callback()
             }
         })

package/speechflow-cli/src/speechflow-node-t2t-deepl.ts CHANGED Viewed

@@ -100,7 +100,6 @@ export default class SpeechFlowNodeT2TDeepL extends SpeechFlowNode {
                 }
             },
             final (callback) {
-                this.push(null)
                 callback()
             }
         })

package/speechflow-cli/src/speechflow-node-t2t-format.ts CHANGED Viewed

@@ -64,7 +64,6 @@ export default class SpeechFlowNodeT2TFormat extends SpeechFlowNode {
                 }
             },
             final (callback) {
-                this.push(null)
                 callback()
             }
         })

package/speechflow-cli/src/speechflow-node-t2t-google.ts CHANGED Viewed

@@ -110,7 +110,6 @@ export default class SpeechFlowNodeT2TGoogle extends SpeechFlowNode {
                 }
             },
             final (callback) {
-                this.push(null)
                 callback()
             }
         })

package/speechflow-cli/src/speechflow-node-t2t-modify.ts CHANGED Viewed

@@ -67,7 +67,6 @@ export default class SpeechFlowNodeT2TModify extends SpeechFlowNode {
                 }
             },
             final (callback) {
-                this.push(null)
                 callback()
             }
         })

package/speechflow-cli/src/speechflow-node-t2t-ollama.ts CHANGED Viewed

@@ -258,7 +258,6 @@ export default class SpeechFlowNodeT2TOllama extends SpeechFlowNode {
                 }
             },
             final (callback) {
-                this.push(null)
                 callback()
             }
         })

package/speechflow-cli/src/speechflow-node-t2t-openai.ts CHANGED Viewed

@@ -226,7 +226,6 @@ export default class SpeechFlowNodeT2TOpenAI extends SpeechFlowNode {
                 }
             },
             final (callback) {
-                this.push(null)
                 callback()
             }
         })