npm - speechflow - Versions diffs - 1.7.1 → 2.0.1 - Mend

speechflow 1.7.1 → 2.0.1

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (177) hide show

package/speechflow-cli/src/speechflow-node-a2a-vad.ts CHANGED Viewed

@@ -85,6 +85,18 @@ export default class SpeechFlowNodeA2AVAD extends SpeechFlowNode {
             }
         }
+        /*  helper function for tail timer handling  */
+        const startTailTimer = () => {
+            tail = true
+            clearTailTimer()
+            this.tailTimer = setTimeout(() => {
+                if (this.closing || this.tailTimer === null)
+                    return
+                tail = false
+                this.tailTimer = null
+            }, this.params.postSpeechTail)
+        }
         /*  establish Voice Activity Detection (VAD) facility  */
         let tail = false
         try {
@@ -111,31 +123,15 @@ export default class SpeechFlowNodeA2AVAD extends SpeechFlowNode {
                         return
                     const duration = util.audioArrayDuration(audio, vadSampleRateTarget)
                     this.log("info", `VAD: speech end (duration: ${duration.toFixed(2)}s)`)
-                    if (this.params.mode === "unplugged") {
-                        tail = true
-                        clearTailTimer()
-                        this.tailTimer = setTimeout(() => {
-                            if (this.closing || this.tailTimer === null)
-                                return
-                            tail = false
-                            this.tailTimer = null
-                        }, this.params.postSpeechTail)
-                    }
+                    if (this.params.mode === "unplugged")
+                        startTailTimer()
                 },
                 onVADMisfire: () => {
                     if (this.closing)
                         return
                     this.log("info", "VAD: speech end (segment too short)")
-                    if (this.params.mode === "unplugged") {
-                        tail = true
-                        clearTailTimer()
-                        this.tailTimer = setTimeout(() => {
-                            if (this.closing || this.tailTimer === null)
-                                return
-                            tail = false
-                            this.tailTimer = null
-                        }, this.params.postSpeechTail)
-                    }
+                    if (this.params.mode === "unplugged")
+                        startTailTimer()
                 },
                 onFrameProcessed: (audio) => {
                     if (this.closing)
@@ -144,7 +140,7 @@ export default class SpeechFlowNodeA2AVAD extends SpeechFlowNode {
                         /*  annotate the current audio segment  */
                         const element = this.queueVAD.peek()
                         if (element === undefined || element.type !== "audio-frame")
-                            throw new Error("internal error which cannot happen: no more queued element")
+                            throw new Error("internal error that cannot happen: no more queued element")
                         if (element.segmentIdx >= element.segmentData.length)
                             throw new Error("segment index out of bounds")
                         const segment = element.segmentData[element.segmentIdx++]
@@ -227,6 +223,7 @@ export default class SpeechFlowNodeA2AVAD extends SpeechFlowNode {
                             }
                         }
+                        /*  signal completion  */
                         callback()
                     }
                     catch (error) {
@@ -322,6 +319,7 @@ export default class SpeechFlowNodeA2AVAD extends SpeechFlowNode {
                         }
                     }
+                    /*  peek at send queue element  */
                     const element = self.queueSend.peek()
                     if (element !== undefined && element.type === "audio-eof")
                         this.push(null)
@@ -371,8 +369,7 @@ export default class SpeechFlowNodeA2AVAD extends SpeechFlowNode {
         if (this.vad !== null) {
             try {
                 const flushPromise = this.vad.flush()
-                const timeoutPromise = new Promise((resolve) =>
-                    setTimeout(resolve, 5000))
+                const timeoutPromise = new Promise((resolve) => { setTimeout(resolve, 5000) })
                 await Promise.race([ flushPromise, timeoutPromise ])
             }
             catch (error) {

package/speechflow-cli/src/speechflow-node-a2a-wav.ts CHANGED Viewed

@@ -21,15 +21,18 @@ const writeWavHeader = (
     const sampleRate   = options?.sampleRate  ?? 44100 /* 44KHz */
     const bitDepth     = options?.bitDepth    ?? 16    /* 16-Bit */
+    /*  determine header dimensions  */
     const headerLength = 44
     const maxDataSize  = Math.pow(2, 32) - 100 /* safe maximum for 32-bit WAV files */
     const dataLength   = length ?? maxDataSize
     const fileSize     = dataLength + headerLength
     const header       = Buffer.alloc(headerLength)
+    /*  calculate byte rate and block alignment  */
     const byteRate     = (sampleRate * channels * bitDepth) / 8
     const blockAlign   = (channels * bitDepth) / 8
+    /*  write header fields  */
     let offset = 0
     header.write("RIFF", offset);               offset += 4
     header.writeUInt32LE(fileSize - 8, offset); offset += 4
@@ -45,6 +48,7 @@ const writeWavHeader = (
     header.write("data", offset);               offset += 4
     header.writeUInt32LE(dataLength, offset);   offset += 4
+    /*  return completed header  */
     return header
 }
@@ -53,6 +57,7 @@ const readWavHeader = (buffer: Buffer) => {
     if (buffer.length < 44)
         throw new Error("WAV header too short, expected at least 44 bytes")
+    /*  read header fields  */
     let offset = 0
     const riffHead     = buffer.subarray(offset, offset + 4).toString(); offset += 4
     const fileSize     = buffer.readUInt32LE(offset);                    offset += 4
@@ -68,6 +73,7 @@ const readWavHeader = (buffer: Buffer) => {
     const data         = buffer.subarray(offset, offset + 4).toString(); offset += 4
     const dataLength   = buffer.readUInt32LE(offset);                    offset += 4
+    /*  validate RIFF header  */
     if (riffHead !== "RIFF")
         throw new Error(`Invalid WAV file: expected RIFF header, got "${riffHead}"`)
     if (waveHead !== "WAVE")
@@ -77,6 +83,7 @@ const readWavHeader = (buffer: Buffer) => {
     if (data !== "data")
         throw new Error(`Invalid WAV file: expected "data" header, got "${data}"`)
+    /*  return parsed header data  */
     return {
         riffHead, fileSize, waveHead, fmtHead, formatLength, audioFormat,
         channels, sampleRate, byteRate, blockAlign, bitDepth, data, dataLength
@@ -94,7 +101,8 @@ export default class SpeechFlowNodeA2AWAV extends SpeechFlowNode {
         /*  declare node configuration parameters  */
         this.configure({
-            mode: { type: "string", pos: 1, val: "encode", match: /^(?:encode|decode)$/ }
+            mode:     { type: "string",  pos: 0, val: "encode", match: /^(?:encode|decode)$/ },
+            seekable: { type: "boolean", pos: 1, val: false }
         })
         /*  declare node input/output format  */
@@ -106,7 +114,9 @@ export default class SpeechFlowNodeA2AWAV extends SpeechFlowNode {
     async open () {
         /*  establish a transform stream  */
         const self = this
-        let firstChunk = true
+        let isFirstChunk = true
+        let headerChunkSent: SpeechFlowChunk | null = null
+        let totalSize = 0
         this.stream = new Stream.Transform({
             readableObjectMode: true,
             writableObjectMode: true,
@@ -115,7 +125,7 @@ export default class SpeechFlowNodeA2AWAV extends SpeechFlowNode {
             transform (chunk: SpeechFlowChunk, encoding, callback) {
                 if (!Buffer.isBuffer(chunk.payload))
                     callback(new Error("invalid chunk payload type"))
-                else if (firstChunk) {
+                else if (isFirstChunk) {
                     if (self.params.mode === "encode") {
                         /*  convert raw/PCM to WAV/PCM
                             (NOTICE: as this is a continuous stream, the
@@ -132,7 +142,9 @@ export default class SpeechFlowNodeA2AWAV extends SpeechFlowNode {
                         const headerChunk = chunk.clone()
                         headerChunk.payload = headerBuffer
                         this.push(headerChunk)
+                        headerChunkSent = headerChunk
                         this.push(chunk)
+                        totalSize += chunk.payload.byteLength
                         callback()
                     }
                     else if (self.params.mode === "decode") {
@@ -173,21 +185,36 @@ export default class SpeechFlowNodeA2AWAV extends SpeechFlowNode {
                         }
                         chunk.payload = chunk.payload.subarray(44)
                         this.push(chunk)
+                        totalSize += chunk.payload.byteLength
                         callback()
                     }
                     else {
                         callback(new Error(`invalid operation mode "${self.params.mode}"`))
                         return
                     }
-                    firstChunk = false
+                    isFirstChunk = false
                 }
                 else {
                     /*  pass-through original chunk  */
                     this.push(chunk)
+                    totalSize += chunk.payload.byteLength
                     callback()
                 }
             },
             final (callback) {
+                if (self.params.seekable && headerChunkSent !== null) {
+                    self.log("info", "sending updated WAV header")
+                    const headerBuffer = writeWavHeader(totalSize, {
+                        audioFormat: 0x0001 /* PCM */,
+                        channels:    self.config.audioChannels,
+                        sampleRate:  self.config.audioSampleRate,
+                        bitDepth:    self.config.audioBitDepth
+                    })
+                    const headerChunk = headerChunkSent?.clone()
+                    headerChunk.payload = headerBuffer
+                    headerChunk.meta.set("chunk:seek", 0)
+                    this.push(headerChunk)
+                }
                 callback()
             }
         })

package/speechflow-cli/src/speechflow-node-a2t-amazon.ts CHANGED Viewed

@@ -53,7 +53,7 @@ class AsyncQueue<T> {
                 continue
             }
             else {
-                const it = await new Promise<IteratorResult<T>>((resolve) => this.resolvers.push(resolve))
+                const it = await new Promise<IteratorResult<T>>((resolve) => { this.resolvers.push(resolve) })
                 if (it.done)
                     return
                 yield it.value
@@ -68,11 +68,10 @@ export default class SpeechFlowNodeA2TAmazon extends SpeechFlowNode {
     public static name = "a2t-amazon"
     /*  internal state  */
-    private client:            TranscribeStreamingClient                | null = null
-    private clientStream:      AsyncIterable<TranscriptResultStream>    | null = null
-    private closing                                                            = false
-    private connectionTimeout: ReturnType<typeof setTimeout>            | null = null
-    private queue:             util.SingleQueue<SpeechFlowChunk | null> | null = null
+    private client:       TranscribeStreamingClient                | null = null
+    private clientStream: AsyncIterable<TranscriptResultStream>    | null = null
+    private closing                                                       = false
+    private queue:        util.SingleQueue<SpeechFlowChunk | null> | null = null
     /*  construct node  */
     constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
@@ -126,8 +125,6 @@ export default class SpeechFlowNodeA2TAmazon extends SpeechFlowNode {
                 secretAccessKey: this.params.secKey
             }
         })
-        if (this.client === null)
-            throw new Error("failed to establish Amazon Transcribe client")
         /*  create an AudioStream for Amazon Transcribe  */
         const audioQueue = new AsyncQueue<Uint8Array>()
@@ -236,11 +233,8 @@ export default class SpeechFlowNodeA2TAmazon extends SpeechFlowNode {
                     callback()
                     return
                 }
-                /*  await all read operations  */
                 await reads.awaitAll()
-                util.run(
+                util.run("closing Amazon Transcribe connection",
                     () => self.client!.destroy(),
                     (error: Error) => self.log("warning", `error closing Amazon Transcribe connection: ${error}`)
                 )
@@ -279,12 +273,6 @@ export default class SpeechFlowNodeA2TAmazon extends SpeechFlowNode {
         /*  indicate closing first to stop all async operations  */
         this.closing = true
-        /*  cleanup all timers  */
-        if (this.connectionTimeout !== null) {
-            clearTimeout(this.connectionTimeout)
-            this.connectionTimeout = null
-        }
         /*  close queue  */
         if (this.queue !== null) {
             this.queue.write(null)

package/speechflow-cli/src/speechflow-node-a2t-google.ts ADDED Viewed

@@ -0,0 +1,315 @@
+/*
+**  SpeechFlow - Speech Processing Flow Graph
+**  Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
+**  Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
+*/
+/*  standard dependencies  */
+import Stream from "node:stream"
+/*  external dependencies  */
+import * as GoogleSpeech      from "@google-cloud/speech"
+import { DateTime, Duration } from "luxon"
+import * as arktype           from "arktype"
+/*  internal dependencies  */
+import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
+import * as util                           from "./speechflow-util"
+/*  SpeechFlow node for Google Cloud speech-to-text conversion  */
+export default class SpeechFlowNodeA2TGoogle extends SpeechFlowNode {
+    /*  declare official node name  */
+    public static name = "a2t-google"
+    /*  internal state  */
+    private client:          GoogleSpeech.SpeechClient                                   | null = null
+    private recognizeStream: ReturnType<GoogleSpeech.SpeechClient["streamingRecognize"]> | null = null
+    private queue:           util.SingleQueue<SpeechFlowChunk | null>                    | null = null
+    private closing                                                                             = false
+    /*  construct node  */
+    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
+        super(id, cfg, opts, args)
+        /*  declare node configuration parameters  */
+        this.configure({
+            key:      { type: "string",          val: process.env.SPEECHFLOW_GOOGLE_KEY ?? "" },
+            model:    { type: "string",  pos: 0, val: "latest_long" },
+            language: { type: "string",  pos: 1, val: "en-US" },
+            interim:  { type: "boolean", pos: 2, val: false }
+        })
+        /*  validate API key  */
+        if (this.params.key === "")
+            throw new Error("Google Cloud API credentials JSON key is required")
+        /*  declare node input/output format  */
+        this.input  = "audio"
+        this.output = "text"
+    }
+    /*  one-time status of node  */
+    async status () {
+        return {}
+    }
+    /*  open node  */
+    async open () {
+        /*  sanity check situation  */
+        if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
+            throw new Error("Google Speech node currently supports PCM-S16LE audio only")
+        /*  clear destruction flag  */
+        this.closing = false
+        /*  create queue for results  */
+        this.queue = new util.SingleQueue<SpeechFlowChunk | null>()
+        /*  create a store for the meta information  */
+        const metastore = new util.TimeStore<Map<string, any>>()
+        /*  instantiate Google Speech client  */
+        const data = util.run("Google Cloud API credentials key", () =>
+            JSON.parse(this.params.key))
+        const credentials = util.importObject("Google Cloud API credentials key",
+            data,
+            arktype.type({
+                project_id:   "string",
+                private_key:  "string",
+                client_email: "string"
+            })
+        )
+        this.client = new GoogleSpeech.SpeechClient({
+            credentials: {
+                private_key:  credentials.private_key,
+                client_email: credentials.client_email
+            },
+            projectId: credentials.project_id
+        })
+        /*  create streaming recognition request  */
+        this.recognizeStream = this.client.streamingRecognize({
+            config: {
+                encoding:                   "LINEAR16",
+                sampleRateHertz:            this.config.audioSampleRate,
+                languageCode:               this.params.language,
+                model:                      this.params.model,
+                enableAutomaticPunctuation: true,
+                enableWordTimeOffsets:      true
+            },
+            interimResults: this.params.interim
+        })
+        /*  hook onto Google Speech API events  */
+        this.recognizeStream.on("data", (data: GoogleSpeech.protos.google.cloud.speech.v1.IStreamingRecognizeResponse) => {
+            if (this.closing || this.queue === null)
+                return
+            if (!data.results || data.results.length === 0)
+                return
+            for (const result of data.results) {
+                if (!result.alternatives || result.alternatives.length === 0)
+                    continue
+                const alternative = result.alternatives[0]
+                const text = alternative.transcript ?? ""
+                if (text === "")
+                    continue
+                const isFinal = result.isFinal ?? false
+                if (!isFinal && !this.params.interim)
+                    continue
+                /*  calculate timestamps  */
+                let tsStart = Duration.fromMillis(0)
+                let tsEnd   = Duration.fromMillis(0)
+                /*  extract word timing information if available  */
+                const words: { word: string, start: Duration, end: Duration }[] = []
+                if (alternative.words && alternative.words.length > 0) {
+                    for (const wordInfo of alternative.words) {
+                        const wordStart = wordInfo.startTime
+                            ? Duration.fromMillis(
+                                (Number(wordInfo.startTime.seconds ?? 0) * 1000) +
+                                (Number(wordInfo.startTime.nanos ?? 0) / 1000000)
+                            ).plus(this.timeZeroOffset)
+                            : Duration.fromMillis(0)
+                        const wordEnd = wordInfo.endTime
+                            ? Duration.fromMillis(
+                                (Number(wordInfo.endTime.seconds ?? 0) * 1000) +
+                                (Number(wordInfo.endTime.nanos ?? 0) / 1000000)
+                            ).plus(this.timeZeroOffset)
+                            : Duration.fromMillis(0)
+                        words.push({
+                            word:  wordInfo.word ?? "",
+                            start: wordStart,
+                            end:   wordEnd
+                        })
+                    }
+                    if (words.length > 0) {
+                        tsStart = words[0].start
+                        tsEnd   = words[words.length - 1].end
+                    }
+                }
+                else {
+                    /*  fallback: use result timing  */
+                    const resultEnd = result.resultEndTime
+                    if (resultEnd) {
+                        tsEnd = Duration.fromMillis(
+                            (Number(resultEnd.seconds ?? 0) * 1000) +
+                            (Number(resultEnd.nanos ?? 0) / 1000000)
+                        ).plus(this.timeZeroOffset)
+                    }
+                }
+                this.log("info", `text received (start: ${tsStart.toMillis()}ms, ` +
+                    `end: ${tsEnd.toMillis()}ms, ` +
+                    `kind: ${isFinal ? "final" : "intermediate"}): ` +
+                    `"${text}"`)
+                /*  fetch and merge meta information  */
+                const metas = metastore.fetch(tsStart, tsEnd)
+                const meta = metas.toReversed().reduce((prev: Map<string, any>, curr: Map<string, any>) => {
+                    curr.forEach((val, key) => { prev.set(key, val) })
+                    return prev
+                }, new Map<string, any>())
+                metastore.prune(tsStart)
+                /*  add word timing to meta  */
+                if (words.length > 0)
+                    meta.set("words", words)
+                /*  create and enqueue chunk  */
+                const chunk = new SpeechFlowChunk(tsStart, tsEnd,
+                    isFinal ? "final" : "intermediate", "text", text, meta)
+                this.queue.write(chunk)
+            }
+        })
+        this.recognizeStream.on("error", (error: Error) => {
+            this.log("error", `error: ${error.message}`)
+            if (!this.closing && this.queue !== null)
+                this.queue.write(null)
+            this.emit("error", error)
+        })
+        this.recognizeStream.on("end", () => {
+            this.log("info", "stream ended")
+            if (!this.closing && this.queue !== null)
+                this.queue.write(null)
+        })
+        /*  remember opening time to receive time zero offset  */
+        this.timeOpen = DateTime.now()
+        /*  provide Duplex stream and internally attach to Google Speech API  */
+        const self = this
+        const reads = new util.PromiseSet<void>()
+        this.stream = new Stream.Duplex({
+            writableObjectMode: true,
+            readableObjectMode: true,
+            decodeStrings:      false,
+            highWaterMark:      1,
+            write (chunk: SpeechFlowChunk, encoding, callback) {
+                if (self.closing || self.recognizeStream === null) {
+                    callback(new Error("stream already destroyed"))
+                    return
+                }
+                if (chunk.type !== "audio")
+                    callback(new Error("expected audio input chunk"))
+                else if (!Buffer.isBuffer(chunk.payload))
+                    callback(new Error("expected Buffer input chunk"))
+                else {
+                    if (chunk.payload.byteLength > 0) {
+                        self.log("debug", `send data (${chunk.payload.byteLength} bytes)`)
+                        if (chunk.meta.size > 0)
+                            metastore.store(chunk.timestampStart, chunk.timestampEnd, chunk.meta)
+                        try {
+                            self.recognizeStream.write(chunk.payload)
+                        }
+                        catch (error) {
+                            callback(util.ensureError(error, "failed to send to Google Speech"))
+                            return
+                        }
+                    }
+                    callback()
+                }
+            },
+            async final (callback) {
+                /*  short-circuiting in case of own closing  */
+                if (self.closing || self.recognizeStream === null) {
+                    callback()
+                    return
+                }
+                /*  close Google Speech stream  */
+                try {
+                    self.recognizeStream.end()
+                }
+                catch (error) {
+                    self.log("warning", `error closing Google Speech stream: ${error}`)
+                }
+                /*  await all read operations  */
+                await reads.awaitAll()
+                callback()
+            },
+            read (size) {
+                if (self.closing || self.queue === null) {
+                    this.push(null)
+                    return
+                }
+                reads.add(self.queue.read().then((chunk) => {
+                    if (self.closing || self.queue === null) {
+                        this.push(null)
+                        return
+                    }
+                    if (chunk === null) {
+                        self.log("info", "received EOF signal")
+                        this.push(null)
+                    }
+                    else {
+                        self.log("debug", `received data (${chunk.payload.length} bytes)`)
+                        this.push(chunk)
+                    }
+                }).catch((error: unknown) => {
+                    if (!self.closing && self.queue !== null)
+                        self.log("error", `queue read error: ${util.ensureError(error).message}`)
+                }))
+            }
+        })
+    }
+    /*  close node  */
+    async close () {
+        /*  indicate closing first to stop all async operations  */
+        this.closing = true
+        /*  shutdown stream  */
+        if (this.stream !== null) {
+            await util.destroyStream(this.stream)
+            this.stream = null
+        }
+        /*  close Google Speech stream and client  */
+        if (this.recognizeStream !== null) {
+            try {
+                this.recognizeStream.removeAllListeners()
+                this.recognizeStream.destroy()
+            }
+            catch (error) {
+                this.log("warning", `error during Google Speech stream cleanup: ${error}`)
+            }
+            this.recognizeStream = null
+        }
+        if (this.client !== null) {
+            try {
+                await this.client.close()
+            }
+            catch (error) {
+                this.log("warning", `error closing Google Speech client: ${error}`)
+            }
+            this.client = null
+        }
+        /*  signal EOF to any pending read operations  */
+        if (this.queue !== null) {
+            this.queue.write(null)
+            this.queue = null
+        }
+    }
+}

package/speechflow-cli/src/speechflow-node-a2t-openai.ts CHANGED Viewed

@@ -23,12 +23,12 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
     public static name = "a2t-openai"
     /*  internal state  */
-    private openai:            OpenAI | null = null
-    private ws:                ws.WebSocket | null = null
-    private queue:             util.SingleQueue<SpeechFlowChunk | null> | null = null
-    private resampler:         SpeexResampler | null = null
-    private closing            = false
-    private connectionTimeout: ReturnType<typeof setTimeout> | null = null
+    private openai:            OpenAI                                    | null = null
+    private ws:                ws.WebSocket                              | null = null
+    private queue:             util.SingleQueue<SpeechFlowChunk | null>  | null = null
+    private resampler:         SpeexResampler                            | null = null
+    private closing                                                             = false
+    private connectionTimeout: ReturnType<typeof setTimeout>             | null = null
     /*  construct node  */
     constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
@@ -150,6 +150,9 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
         })
         this.ws.on("error", (err) => {
             this.log("error", `WebSocket connection error: ${err}`)
+            if (!this.closing && this.queue !== null)
+                this.queue.write(null)
+            this.emit("error", err)
         })
         /*  track speech timing by item_id (OpenAI provides timestamps via VAD events)  */
@@ -164,6 +167,7 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
             }, new Map<string, any>())
         }
+        /*  track transcription text  */
         let text = ""
         this.ws.on("message", (data) => {
             let ev: any
@@ -353,7 +357,8 @@ export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
             this.ws.close()
             this.ws = null
         }
-        this.openai = null
+        if (this.openai !== null)
+            this.openai = null
         /*  close resampler  */
         this.resampler = null