speechflow 1.6.7 → 1.7.1
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +15 -0
- package/README.md +77 -52
- package/etc/secretlint.json +7 -0
- package/etc/speechflow.yaml +13 -4
- package/etc/stx.conf +3 -2
- package/package.json +8 -6
- package/speechflow-cli/dst/speechflow-main-api.js +9 -8
- package/speechflow-cli/dst/speechflow-main-api.js.map +1 -1
- package/speechflow-cli/dst/speechflow-main-graph.js +13 -14
- package/speechflow-cli/dst/speechflow-main-graph.js.map +1 -1
- package/speechflow-cli/dst/speechflow-main-status.js +38 -8
- package/speechflow-cli/dst/speechflow-main-status.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js +3 -0
- package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-compressor.js +4 -2
- package/speechflow-cli/dst/speechflow-node-a2a-compressor.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-expander.js +4 -2
- package/speechflow-cli/dst/speechflow-node-a2a-expander.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-ffmpeg.js +2 -2
- package/speechflow-cli/dst/speechflow-node-a2a-ffmpeg.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-filler.js +46 -17
- package/speechflow-cli/dst/speechflow-node-a2a-filler.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-gain.js +0 -5
- package/speechflow-cli/dst/speechflow-node-a2a-gain.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-gender.js +3 -4
- package/speechflow-cli/dst/speechflow-node-a2a-gender.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-mute.js +0 -5
- package/speechflow-cli/dst/speechflow-node-a2a-mute.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-pitch.js +1 -2
- package/speechflow-cli/dst/speechflow-node-a2a-pitch.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-rnnoise.js +0 -5
- package/speechflow-cli/dst/speechflow-node-a2a-rnnoise.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-speex.js +0 -5
- package/speechflow-cli/dst/speechflow-node-a2a-speex.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2a-wav.js +8 -2
- package/speechflow-cli/dst/speechflow-node-a2a-wav.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2t-amazon.d.ts +0 -1
- package/speechflow-cli/dst/speechflow-node-a2t-amazon.js +17 -19
- package/speechflow-cli/dst/speechflow-node-a2t-amazon.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2t-deepgram.d.ts +0 -1
- package/speechflow-cli/dst/speechflow-node-a2t-deepgram.js +30 -25
- package/speechflow-cli/dst/speechflow-node-a2t-deepgram.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-a2t-openai.js +79 -48
- package/speechflow-cli/dst/speechflow-node-a2t-openai.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-amazon.js +6 -11
- package/speechflow-cli/dst/speechflow-node-t2a-amazon.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js +45 -44
- package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2a-kokoro.d.ts +2 -0
- package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js +19 -7
- package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-amazon.js +1 -2
- package/speechflow-cli/dst/speechflow-node-t2t-amazon.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-deepl.js +0 -1
- package/speechflow-cli/dst/speechflow-node-t2t-deepl.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-format.js +0 -1
- package/speechflow-cli/dst/speechflow-node-t2t-format.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-google.js +0 -1
- package/speechflow-cli/dst/speechflow-node-t2t-google.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-modify.js +0 -1
- package/speechflow-cli/dst/speechflow-node-t2t-modify.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-ollama.js +0 -1
- package/speechflow-cli/dst/speechflow-node-t2t-ollama.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-openai.js +0 -1
- package/speechflow-cli/dst/speechflow-node-t2t-openai.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js +173 -29
- package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-x2x-filter.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-node-x2x-filter.js +10 -1
- package/speechflow-cli/dst/speechflow-node-x2x-filter.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-x2x-trace.js +0 -5
- package/speechflow-cli/dst/speechflow-node-x2x-trace.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-device.js +5 -5
- package/speechflow-cli/dst/speechflow-node-xio-device.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-file.js +4 -4
- package/speechflow-cli/dst/speechflow-node-xio-file.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-mqtt.js +9 -3
- package/speechflow-cli/dst/speechflow-node-xio-mqtt.js.map +1 -1
- package/speechflow-cli/dst/speechflow-node-xio-websocket.js +16 -5
- package/speechflow-cli/dst/speechflow-node-xio-websocket.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-audio.js +3 -3
- package/speechflow-cli/dst/speechflow-util-audio.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-error.d.ts +0 -1
- package/speechflow-cli/dst/speechflow-util-error.js +0 -7
- package/speechflow-cli/dst/speechflow-util-error.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-misc.d.ts +2 -0
- package/speechflow-cli/dst/speechflow-util-misc.js +26 -0
- package/speechflow-cli/dst/speechflow-util-misc.js.map +1 -0
- package/speechflow-cli/dst/speechflow-util-queue.d.ts +9 -2
- package/speechflow-cli/dst/speechflow-util-queue.js +36 -15
- package/speechflow-cli/dst/speechflow-util-queue.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util-stream.d.ts +2 -2
- package/speechflow-cli/dst/speechflow-util-stream.js +17 -19
- package/speechflow-cli/dst/speechflow-util-stream.js.map +1 -1
- package/speechflow-cli/dst/speechflow-util.d.ts +1 -0
- package/speechflow-cli/dst/speechflow-util.js +1 -0
- package/speechflow-cli/dst/speechflow-util.js.map +1 -1
- package/speechflow-cli/etc/oxlint.jsonc +6 -1
- package/speechflow-cli/etc/stx.conf +1 -0
- package/speechflow-cli/package.json +28 -27
- package/speechflow-cli/src/speechflow-main-api.ts +9 -11
- package/speechflow-cli/src/speechflow-main-graph.ts +15 -16
- package/speechflow-cli/src/speechflow-main-status.ts +6 -10
- package/speechflow-cli/src/speechflow-node-a2a-compressor-wt.ts +4 -0
- package/speechflow-cli/src/speechflow-node-a2a-compressor.ts +4 -2
- package/speechflow-cli/src/speechflow-node-a2a-expander-wt.ts +1 -1
- package/speechflow-cli/src/speechflow-node-a2a-expander.ts +4 -2
- package/speechflow-cli/src/speechflow-node-a2a-ffmpeg.ts +4 -2
- package/speechflow-cli/src/speechflow-node-a2a-filler.ts +57 -20
- package/speechflow-cli/src/speechflow-node-a2a-gain.ts +0 -5
- package/speechflow-cli/src/speechflow-node-a2a-gender.ts +3 -4
- package/speechflow-cli/src/speechflow-node-a2a-mute.ts +0 -5
- package/speechflow-cli/src/speechflow-node-a2a-pitch.ts +1 -2
- package/speechflow-cli/src/speechflow-node-a2a-rnnoise.ts +0 -5
- package/speechflow-cli/src/speechflow-node-a2a-speex.ts +0 -5
- package/speechflow-cli/src/speechflow-node-a2a-wav.ts +9 -3
- package/speechflow-cli/src/speechflow-node-a2t-amazon.ts +27 -27
- package/speechflow-cli/src/speechflow-node-a2t-deepgram.ts +37 -28
- package/speechflow-cli/src/speechflow-node-a2t-openai.ts +92 -56
- package/speechflow-cli/src/speechflow-node-t2a-amazon.ts +7 -11
- package/speechflow-cli/src/speechflow-node-t2a-elevenlabs.ts +47 -43
- package/speechflow-cli/src/speechflow-node-t2a-kokoro.ts +22 -7
- package/speechflow-cli/src/speechflow-node-t2t-amazon.ts +1 -2
- package/speechflow-cli/src/speechflow-node-t2t-deepl.ts +0 -1
- package/speechflow-cli/src/speechflow-node-t2t-format.ts +0 -1
- package/speechflow-cli/src/speechflow-node-t2t-google.ts +0 -1
- package/speechflow-cli/src/speechflow-node-t2t-modify.ts +0 -1
- package/speechflow-cli/src/speechflow-node-t2t-ollama.ts +0 -1
- package/speechflow-cli/src/speechflow-node-t2t-openai.ts +0 -1
- package/speechflow-cli/src/speechflow-node-t2t-subtitle.ts +205 -33
- package/speechflow-cli/src/speechflow-node-x2x-filter.ts +16 -4
- package/speechflow-cli/src/speechflow-node-x2x-trace.ts +3 -8
- package/speechflow-cli/src/speechflow-node-xio-device.ts +6 -9
- package/speechflow-cli/src/speechflow-node-xio-file.ts +4 -4
- package/speechflow-cli/src/speechflow-node-xio-mqtt.ts +10 -4
- package/speechflow-cli/src/speechflow-node-xio-websocket.ts +16 -5
- package/speechflow-cli/src/speechflow-util-audio-wt.ts +4 -4
- package/speechflow-cli/src/speechflow-util-audio.ts +7 -7
- package/speechflow-cli/src/speechflow-util-error.ts +0 -7
- package/speechflow-cli/src/speechflow-util-misc.ts +23 -0
- package/speechflow-cli/src/speechflow-util-queue.ts +40 -20
- package/speechflow-cli/src/speechflow-util-stream.ts +29 -24
- package/speechflow-cli/src/speechflow-util.ts +1 -0
- package/speechflow-ui-db/dst/index.css +1 -5
- package/speechflow-ui-db/dst/index.js +14 -58
- package/speechflow-ui-db/etc/stx.conf +5 -16
- package/speechflow-ui-db/package.json +16 -15
- package/speechflow-ui-st/dst/index.css +1 -5
- package/speechflow-ui-st/dst/index.js +31 -160
- package/speechflow-ui-st/etc/stx.conf +5 -16
- package/speechflow-ui-st/package.json +17 -16
package/speechflow-cli/src/speechflow-node-a2t-openai.ts

@@ -9,7 +9,7 @@ import Stream from "node:stream"
 
 /* external dependencies */
 import OpenAI from "openai"
-import { DateTime } from "luxon"
+import { DateTime, Duration } from "luxon"
 import SpeexResampler from "speex-resampler"
 import ws from "ws"
 
@@ -23,11 +23,11 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
     public static name = "a2t-openai"
 
     /* internal state */
-    private openai:
-    private ws:
-    private queue:
-    private resampler:
-    private closing
+    private openai: OpenAI | null = null
+    private ws: ws.WebSocket | null = null
+    private queue: util.SingleQueue<SpeechFlowChunk | null> | null = null
+    private resampler: SpeexResampler | null = null
+    private closing = false
     private connectionTimeout: ReturnType<typeof setTimeout> | null = null
 
     /* construct node */
@@ -43,6 +43,10 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
             interim: { type: "boolean", val: false }
         })
 
+        /* sanity check parameters */
+        if (!this.params.key)
+            throw new Error("OpenAI API key not configured")
+
         /* declare node input/output format */
         this.input = "audio"
         this.output = "text"
@@ -141,11 +145,25 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
         })
         this.ws.on("close", () => {
             this.log("info", "WebSocket connection closed")
-            this.queue
+            if (!this.closing && this.queue !== null)
+                this.queue.write(null)
         })
         this.ws.on("error", (err) => {
             this.log("error", `WebSocket connection error: ${err}`)
         })
+
+        /* track speech timing by item_id (OpenAI provides timestamps via VAD events) */
+        const speechTiming = new Map<string, { startMs: number, endMs: number }>()
+
+        /* helper function for aggregating meta information */
+        const aggregateMeta = (start: Duration, end: Duration): Map<string, any> => {
+            const metas = metastore.fetch(start, end)
+            return metas.toReversed().reduce((prev: Map<string, any>, curr: Map<string, any>) => {
+                curr.forEach((val, key) => { prev.set(key, val) })
+                return prev
+            }, new Map<string, any>())
+        }
+
         let text = ""
         this.ws.on("message", (data) => {
             let ev: any
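A note on the aggregateMeta helper added above: it flattens all meta maps that metastore returns for the chunk's time range into a single map, and after toReversed() whichever map comes later in the iteration overwrites keys set earlier. A minimal TypeScript sketch of that fold, with hypothetical sample maps (metastore itself is a SpeechFlow internal not shown in this diff):

    const metas: Array<Map<string, any>> = [
        new Map([ [ "speaker", "A" ], [ "lang", "en" ] ]),
        new Map([ [ "speaker", "B" ] ])
    ]
    const merged = metas.toReversed().reduce((prev, curr) => {
        curr.forEach((val, key) => { prev.set(key, val) })
        return prev
    }, new Map<string, any>())
    /* toReversed() iterates [ { speaker: "B" }, { speaker: "A", lang: "en" } ],
       so merged ends up as { speaker: "A", lang: "en" }: the map listed first
       by metastore.fetch() wins every conflicting key */

Note that Array.prototype.toReversed() is ES2023, so this code path requires Node.js 20 or later.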
@@ -163,53 +181,63 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
             switch (ev.type) {
                 case "transcription_session.created":
                     break
-                case "conversation.item.created":
+                case "conversation.item.created": {
                     text = ""
                     break
+                }
                 case "conversation.item.input_audio_transcription.delta": {
                     text += ev.delta as string
-                    if (this.params.interim) {
-                        const
-                        const
-                        const
-                        const
-
-
-
-                        const chunk = new SpeechFlowChunk(start, end, "intermediate", "text", text)
-                        chunk.meta = meta
-                        this.queue!.write(chunk)
+                    if (this.params.interim && !this.closing && this.queue !== null) {
+                        const itemId = ev.item_id as string
+                        const timing = speechTiming.get(itemId)
+                        const start = timing ? Duration.fromMillis(timing.startMs) : DateTime.now().diff(this.timeOpen!)
+                        const end = timing ? Duration.fromMillis(timing.endMs) : start
+                        const chunk = new SpeechFlowChunk(start, end, "intermediate", "text", text)
+                        chunk.meta = aggregateMeta(start, end)
+                        this.queue.write(chunk)
                     }
                     break
                 }
                 case "conversation.item.input_audio_transcription.completed": {
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    if (!this.closing && this.queue !== null) {
+                        text = ev.transcript as string
+                        const itemId = ev.item_id as string
+                        const timing = speechTiming.get(itemId)
+                        const start = timing ? Duration.fromMillis(timing.startMs) : DateTime.now().diff(this.timeOpen!)
+                        const end = timing ? Duration.fromMillis(timing.endMs) : start
+                        const chunk = new SpeechFlowChunk(start, end, "final", "text", text)
+                        chunk.meta = aggregateMeta(start, end)
+                        metastore.prune(start)
+                        speechTiming.delete(itemId)
+                        this.queue.write(chunk)
+                        text = ""
+                    }
                     break
                 }
-                case "input_audio_buffer.speech_started":
+                case "input_audio_buffer.speech_started": {
                     this.log("info", "VAD: speech started")
+                    const itemId = ev.item_id as string
+                    const audioStartMs = ev.audio_start_ms as number
+                    speechTiming.set(itemId, { startMs: audioStartMs, endMs: audioStartMs })
                     break
-                case "input_audio_buffer.speech_stopped":
+                }
+                case "input_audio_buffer.speech_stopped": {
                     this.log("info", "VAD: speech stopped")
+                    const itemId = ev.item_id as string
+                    const audioEndMs = ev.audio_end_ms as number
+                    const timing = speechTiming.get(itemId)
+                    if (timing)
+                        timing.endMs = audioEndMs
                     break
-                case "input_audio_buffer.committed":
+                }
+                case "input_audio_buffer.committed": {
                     this.log("info", "input buffer committed")
                     break
-                case "error":
+                }
+                case "error": {
                     this.log("error", `error: ${ev.error?.message}`)
                     break
+                }
                 default:
                     break
             }
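The new speech-timing bookkeeping above works off the Realtime API's VAD events: speech_started records the item's audio offset in milliseconds, speech_stopped patches in the end offset, and the transcription handlers convert both to luxon Durations and drop the entry once the final transcript is emitted. A sketch of one item's life cycle, with invented field values:

    import { Duration } from "luxon"

    const speechTiming = new Map<string, { startMs: number, endMs: number }>()

    /* on "input_audio_buffer.speech_started" (ev.item_id, ev.audio_start_ms) */
    speechTiming.set("item_42", { startMs: 1200, endMs: 1200 })

    /* on "input_audio_buffer.speech_stopped" (ev.audio_end_ms) */
    speechTiming.get("item_42")!.endMs = 3450

    /* on "...input_audio_transcription.completed": stamp the chunk, then forget */
    const timing = speechTiming.get("item_42")!
    const start = Duration.fromMillis(timing.startMs)   /* 1.2 s into the session */
    const end   = Duration.fromMillis(timing.endMs)     /* 3.45 s into the session */
    speechTiming.delete("item_42")

When no timing entry exists yet (e.g. a delta arrives before the VAD event), the code falls back to DateTime.now().diff(this.timeOpen!), i.e. wall-clock time since the node opened.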
@@ -220,6 +248,7 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
 
         /* provide Duplex stream and internally attach to OpenAI API */
         const self = this
+        const reads = new util.PromiseSet<void>()
         this.stream = new Stream.Duplex({
             writableObjectMode: true,
             readableObjectMode: true,
@@ -255,12 +284,32 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
                     callback()
                 }
             },
+            async final (callback) {
+                if (self.closing || self.ws === null) {
+                    callback()
+                    return
+                }
+                try {
+                    sendMessage({ type: "input_audio_buffer.commit" })
+                    self.ws.close()
+                    await util.sleep(50)
+                }
+                catch (error) {
+                    self.log("warning", `error closing OpenAI connection: ${error}`)
+                }
+                await reads.awaitAll()
+                const chunks: Array<SpeechFlowChunk | null> = self.queue?.drain() ?? []
+                for (const chunk of chunks)
+                    this.push(chunk)
+                this.push(null)
+                callback()
+            },
             read (size) {
                 if (self.closing || self.queue === null) {
                     this.push(null)
                     return
                 }
-                self.queue.read().then((chunk) => {
+                reads.add(self.queue.read().then((chunk) => {
                     if (self.closing || self.queue === null) {
                         this.push(null)
                         return
@@ -276,23 +325,7 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
                 }).catch((error: unknown) => {
                     if (!self.closing && self.queue !== null)
                         self.log("error", `queue read error: ${util.ensureError(error).message}`)
-                })
-            },
-            final (callback) {
-                if (self.closing || self.ws === null) {
-                    callback()
-                    return
-                }
-                try {
-                    sendMessage({ type: "input_audio_buffer.commit" })
-                    self.ws.close()
-                    /* NOTICE: do not push null here -- let the OpenAI close event handle it */
-                    callback()
-                }
-                catch (error) {
-                    self.log("warning", `error closing OpenAI connection: ${error}`)
-                    callback(util.ensureError(error, "failed to close OpenAI connection"))
-                }
+                }))
             }
         })
     }
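The restructured final() above replaces the old "let the close event push null" approach: it commits the audio buffer, closes the socket, waits for every still-pending read() promise tracked in the reads PromiseSet, then drains the queue and pushes EOF itself. util.PromiseSet is a SpeechFlow utility whose implementation is not shown here (speechflow-util-misc grew by roughly 23 lines in this release, presumably housing it); a minimal sketch with the semantics the diff implies:

    /* illustration only -- not the actual speechflow-util-misc code */
    class PromiseSet<T> {
        private pending = new Set<Promise<T>>()
        add (p: Promise<T>): Promise<T> {
            this.pending.add(p)
            p.catch(() => {}).finally(() => { this.pending.delete(p) })
            return p
        }
        async awaitAll (): Promise<void> {
            while (this.pending.size > 0)
                await Promise.allSettled([ ...this.pending ])
        }
    }

Draining only after awaitAll() guarantees that chunks resolved by late WebSocket messages still reach the readable side before the final null.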
@@ -316,11 +349,14 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
 
         /* close OpenAI connection */
         if (this.ws !== null) {
+            this.ws.removeAllListeners()
             this.ws.close()
             this.ws = null
         }
-
-
+        this.openai = null
+
+        /* close resampler */
+        this.resampler = null
 
         /* shutdown stream */
         if (this.stream !== null) {
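Detaching listeners before close() is deliberate here: ws emits "close" asynchronously, so without removeAllListeners() the handler installed in open() would still fire mid-teardown and write a null EOF marker into a queue nobody reads anymore. A self-contained sketch of the hazard (socket and queue are hypothetical stand-ins):

    import ws from "ws"
    declare const socket: ws.WebSocket
    declare const queue: { write (chunk: unknown): void }

    socket.on("close", () => {
        queue.write(null)        /* stale EOF written during teardown */
    })
    socket.removeAllListeners()  /* detach first ... */
    socket.close()               /* ... so the callback above never runs */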
package/speechflow-cli/src/speechflow-node-t2a-amazon.ts

@@ -83,7 +83,7 @@ export default class SpeechFlowNodeT2AAmazon extends SpeechFlowNode {
         "Ruth": { language: "en", languageCode: "en-US", engine: "generative" },
         "Stephen": { language: "en", languageCode: "en-US", engine: "generative" },
         "Vicki": { language: "de", languageCode: "de-DE", engine: "generative" },
-        "Daniel": { language: "de", languageCode: "de-DE", engine: "generative" }
+        "Daniel": { language: "de", languageCode: "de-DE", engine: "generative" }
     }
     const voiceConfig = voices[this.params.voice as keyof typeof voices]
     if (voiceConfig === undefined)
@@ -147,11 +147,6 @@ export default class SpeechFlowNodeT2AAmazon extends SpeechFlowNode {
                 callback()
             },
             final (callback) {
-                if (self.closing) {
-                    callback()
-                    return
-                }
-                this.push(null)
                 callback()
             }
         })
@@ -162,6 +157,12 @@ export default class SpeechFlowNodeT2AAmazon extends SpeechFlowNode {
         /* indicate closing */
         this.closing = true
 
+        /* shutdown stream */
+        if (this.stream !== null) {
+            await util.destroyStream(this.stream)
+            this.stream = null
+        }
+
         /* destroy resampler */
         if (this.resampler !== null)
             this.resampler = null
@@ -171,11 +172,6 @@ export default class SpeechFlowNodeT2AAmazon extends SpeechFlowNode {
             this.client.destroy()
             this.client = null
         }
-        /* shutdown stream */
-        if (this.stream !== null) {
-            await util.destroyStream(this.stream)
-            this.stream = null
-        }
     }
 }
 
package/speechflow-cli/src/speechflow-node-t2a-elevenlabs.ts

@@ -10,6 +10,7 @@ import Stream from "node:stream"
 /* external dependencies */
 import * as ElevenLabs from "@elevenlabs/elevenlabs-js"
 import { getStreamAsBuffer } from "get-stream"
+import { Duration } from "luxon"
 import SpeexResampler from "speex-resampler"
 
 /* internal dependencies */
@@ -102,14 +103,15 @@ export default class SpeechFlowNodeT2AElevenlabs extends SpeechFlowNode {
             throw new Error(`invalid ElevenLabs voice "${this.params.voice}"`)
         }
         const labels = voice.labels ?? {}
-        const info = Object.keys(labels).length > 0
-            ", " + Object.entries(labels).map(([ key, val ]) => `${key}: "${val}"`).join(", ")
+        const info = Object.keys(labels).length > 0
+            ? ", " + Object.entries(labels).map(([ key, val ]) => `${key}: "${val}"`).join(", ")
+            : ""
         this.log("info", `selected voice: name: "${voice.name}"${info}`)
 
         /* perform text-to-speech operation with Elevenlabs API */
-        const model = this.params.optimize === "quality"
-            "eleven_turbo_v2_5"
-            "eleven_flash_v2_5"
+        const model = this.params.optimize === "quality"
+            ? "eleven_turbo_v2_5"
+            : "eleven_flash_v2_5"
         const speechStream = (text: string) => {
             this.log("info", `ElevenLabs: send text "${text}"`)
             return this.elevenlabs!.textToSpeech.convert(voice.voiceId, {
@@ -140,58 +142,60 @@ export default class SpeechFlowNodeT2AElevenlabs extends SpeechFlowNode {
             readableObjectMode: true,
             decodeStrings: false,
             highWaterMark: 1,
-            transform (chunk: SpeechFlowChunk, encoding, callback) {
+            async transform (chunk: SpeechFlowChunk, encoding, callback) {
                 if (self.closing)
                     callback(new Error("stream already destroyed"))
                 else if (Buffer.isBuffer(chunk.payload))
                     callback(new Error("invalid chunk payload type"))
                 else {
-
-
+                    let processTimeout: ReturnType<typeof setTimeout> | null = setTimeout(() => {
+                        processTimeout = null
+                        callback(new Error("ElevenLabs API timeout"))
+                    }, 60 * 1000)
+                    const clearProcessTimeout = () => {
+                        if (processTimeout !== null) {
+                            clearTimeout(processTimeout)
                             processTimeout = null
-                        callback(new Error("ElevenLabs API timeout"))
-                    }, 60 * 1000)
-                    const clearProcessTimeout = () => {
-                        if (processTimeout !== null) {
-                            clearTimeout(processTimeout)
-                            processTimeout = null
-                        }
                         }
-
-
-
-                        callback(new Error("stream destroyed during processing"))
-                        return
-                    }
-                    const stream = await speechStream(chunk.payload as string)
-                    const buffer = await getStreamAsBuffer(stream)
-                    if (self.closing) {
-                        clearProcessTimeout()
-                        callback(new Error("stream destroyed during processing"))
-                        return
-                    }
-                    const bufferResampled = self.resampler!.processChunk(buffer)
-                    self.log("info", `ElevenLabs: received audio (buffer length: ${buffer.byteLength})`)
-                    const chunkNew = chunk.clone()
-                    chunkNew.type = "audio"
-                    chunkNew.payload = bufferResampled
+                    }
+                    try {
+                        if (self.closing) {
                             clearProcessTimeout()
-
-
+                            callback(new Error("stream destroyed during processing"))
+                            return
                         }
-
+                        const stream = await speechStream(chunk.payload as string)
+                        const buffer = await getStreamAsBuffer(stream)
+                        if (self.closing) {
                             clearProcessTimeout()
-                        callback(
+                            callback(new Error("stream destroyed during processing"))
+                            return
                         }
-
+                        self.log("info", `ElevenLabs: received audio (buffer length: ${buffer.byteLength})`)
+                        const bufferResampled = self.resampler!.processChunk(buffer)
+                        self.log("info", "ElevenLabs: forwarding resampled audio " +
+                            `(buffer length: ${bufferResampled.byteLength})`)
+
+                        /* calculate actual audio duration from PCM buffer size */
+                        const durationMs = util.audioBufferDuration(bufferResampled,
+                            self.config.audioSampleRate, self.config.audioBitDepth) * 1000
+
+                        /* create new chunk with recalculated timestamps */
+                        const chunkNew = chunk.clone()
+                        chunkNew.type = "audio"
+                        chunkNew.payload = bufferResampled
+                        chunkNew.timestampEnd = Duration.fromMillis(chunkNew.timestampStart.toMillis() + durationMs)
+                        clearProcessTimeout()
+                        this.push(chunkNew)
+                        callback()
+                    }
+                    catch (error) {
+                        clearProcessTimeout()
+                        callback(util.ensureError(error, "ElevenLabs processing failed"))
+                    }
                 }
             },
             final (callback) {
-                if (self.closing) {
-                    callback()
-                    return
-                }
-                this.push(null)
                 callback()
             }
         })
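The recalculated timestampEnd above derives the chunk's true duration from the resampled PCM payload rather than carrying over the incoming text chunk's time range. For raw PCM, duration in seconds is bytes / (sampleRate * bytesPerSample * channels). util.audioBufferDuration() is SpeechFlow's own helper; a plausible mono equivalent for illustration:

    /* sketch assuming mono PCM -- the real util.audioBufferDuration() may differ */
    function audioBufferDuration (buf: Buffer, sampleRate: number, bitDepth: number): number {
        const bytesPerSample = bitDepth / 8
        return buf.byteLength / (sampleRate * bytesPerSample)   /* seconds */
    }

    /* e.g. 96000 bytes at 48000 Hz, 16 bit: 96000 / (48000 * 2) = 1.0 s,
       which the caller multiplies by 1000 to obtain durationMs */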
package/speechflow-cli/src/speechflow-node-t2a-kokoro.ts

@@ -22,6 +22,7 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
 
     /* internal state */
     private kokoro: KokoroTTS | null = null
+    private closing = false
     private resampler: SpeexResampler | null = null
 
     /* construct node */
@@ -32,7 +33,7 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
         this.configure({
             voice: { type: "string", val: "Aoede", pos: 0, match: /^(?:Aoede|Heart|Puck|Fenrir)$/ },
             language: { type: "string", val: "en", pos: 1, match: /^(?:en)$/ },
-            speed: { type: "number", val: 1.25, pos: 2, match: (n: number) => n >= 1.0 && n <= 1.30 }
+            speed: { type: "number", val: 1.25, pos: 2, match: (n: number) => n >= 1.0 && n <= 1.30 }
         })
 
         /* declare node input/output format */
@@ -40,8 +41,16 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
         this.output = "audio"
     }
 
+    /* one-time status of node */
+    async status () {
+        return {}
+    }
+
     /* open node */
     async open () {
+        /* clear destruction flag */
+        this.closing = false
+
         /* establish Kokoro */
         const model = "onnx-community/Kokoro-82M-v1.0-ONNX"
         const progressState = new Map<string, number>()
@@ -126,15 +135,19 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
             decodeStrings: false,
             highWaterMark: 1,
             transform (chunk: SpeechFlowChunk, encoding, callback) {
-                if (
+                if (self.closing)
+                    callback(new Error("stream already destroyed"))
+                else if (Buffer.isBuffer(chunk.payload))
                     callback(new Error("invalid chunk payload type"))
                 else {
                     text2speech(chunk.payload).then((buffer) => {
+                        if (self.closing)
+                            throw new Error("stream destroyed during processing")
                         self.log("info", `Kokoro: received audio (buffer length: ${buffer.byteLength})`)
-
-
-
-                        this.push(
+                        const chunkNew = chunk.clone()
+                        chunkNew.type = "audio"
+                        chunkNew.payload = buffer
+                        this.push(chunkNew)
                         callback()
                     }).catch((error: unknown) => {
                         callback(util.ensureError(error))
@@ -142,7 +155,6 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
                 }
             },
             final (callback) {
-                this.push(null)
                 callback()
             }
         })
@@ -150,6 +162,9 @@ export default class SpeechFlowNodeT2AKokoro extends SpeechFlowNode {
 
     /* close node */
     async close () {
+        /* indicate closing */
+        this.closing = true
+
         /* shutdown stream */
         if (this.stream !== null) {
             await util.destroyStream(this.stream)
package/speechflow-cli/src/speechflow-node-t2t-amazon.ts

@@ -98,7 +98,7 @@ export default class SpeechFlowNodeT2TAmazon extends SpeechFlowNode {
                 if (!retriable || attempt >= maxRetries)
                     break
                 const delayMs = Math.min(1000 * Math.pow(2, attempt - 1), 5000)
-                await
+                await util.sleep(delayMs)
             }
         }
         throw util.ensureError(lastError)
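The retry delay above is a standard capped exponential backoff: 1000 * 2^(attempt - 1) milliseconds, clamped to 5000 ms, so successive retries wait 1 s, 2 s, 4 s, and then 5 s from the fourth attempt on. A self-contained equivalent of the schedule:

    const sleep = (ms: number) => new Promise<void>((resolve) => setTimeout(resolve, ms))
    const backoffMs = (attempt: number) => Math.min(1000 * Math.pow(2, attempt - 1), 5000)

    /* attempt 1 -> 1000 ms, 2 -> 2000 ms, 3 -> 4000 ms, 4+ -> 5000 ms (capped) */
    for (let attempt = 1; attempt <= 5; attempt++)
        await sleep(backoffMs(attempt))

(The top-level await assumes an ES-module context.)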
@@ -129,7 +129,6 @@ export default class SpeechFlowNodeT2TAmazon extends SpeechFlowNode {
                 }
             },
             final (callback) {
-                this.push(null)
                 callback()
             }
         })