speechflow 1.4.5 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. package/CHANGELOG.md +28 -0
  2. package/README.md +220 -7
  3. package/etc/claude.md +70 -0
  4. package/etc/speechflow.yaml +5 -3
  5. package/etc/stx.conf +7 -0
  6. package/package.json +7 -6
  7. package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.d.ts +1 -0
  8. package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js +155 -0
  9. package/speechflow-cli/dst/speechflow-node-a2a-compressor-wt.js.map +1 -0
  10. package/speechflow-cli/dst/speechflow-node-a2a-compressor.d.ts +15 -0
  11. package/speechflow-cli/dst/speechflow-node-a2a-compressor.js +287 -0
  12. package/speechflow-cli/dst/speechflow-node-a2a-compressor.js.map +1 -0
  13. package/speechflow-cli/dst/speechflow-node-a2a-dynamics-wt.d.ts +1 -0
  14. package/speechflow-cli/dst/speechflow-node-a2a-dynamics-wt.js +208 -0
  15. package/speechflow-cli/dst/speechflow-node-a2a-dynamics-wt.js.map +1 -0
  16. package/speechflow-cli/dst/speechflow-node-a2a-dynamics.d.ts +15 -0
  17. package/speechflow-cli/dst/speechflow-node-a2a-dynamics.js +312 -0
  18. package/speechflow-cli/dst/speechflow-node-a2a-dynamics.js.map +1 -0
  19. package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.d.ts +1 -0
  20. package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js +161 -0
  21. package/speechflow-cli/dst/speechflow-node-a2a-expander-wt.js.map +1 -0
  22. package/speechflow-cli/dst/speechflow-node-a2a-expander.d.ts +13 -0
  23. package/speechflow-cli/dst/speechflow-node-a2a-expander.js +208 -0
  24. package/speechflow-cli/dst/speechflow-node-a2a-expander.js.map +1 -0
  25. package/speechflow-cli/dst/speechflow-node-a2a-ffmpeg.js +13 -3
  26. package/speechflow-cli/dst/speechflow-node-a2a-ffmpeg.js.map +1 -1
  27. package/speechflow-cli/dst/speechflow-node-a2a-filler.d.ts +14 -0
  28. package/speechflow-cli/dst/speechflow-node-a2a-filler.js +233 -0
  29. package/speechflow-cli/dst/speechflow-node-a2a-filler.js.map +1 -0
  30. package/speechflow-cli/dst/speechflow-node-a2a-gain.d.ts +12 -0
  31. package/speechflow-cli/dst/speechflow-node-a2a-gain.js +125 -0
  32. package/speechflow-cli/dst/speechflow-node-a2a-gain.js.map +1 -0
  33. package/speechflow-cli/dst/speechflow-node-a2a-gender.d.ts +0 -1
  34. package/speechflow-cli/dst/speechflow-node-a2a-gender.js +28 -12
  35. package/speechflow-cli/dst/speechflow-node-a2a-gender.js.map +1 -1
  36. package/speechflow-cli/dst/speechflow-node-a2a-meter.d.ts +1 -0
  37. package/speechflow-cli/dst/speechflow-node-a2a-meter.js +12 -8
  38. package/speechflow-cli/dst/speechflow-node-a2a-meter.js.map +1 -1
  39. package/speechflow-cli/dst/speechflow-node-a2a-mute.js +2 -1
  40. package/speechflow-cli/dst/speechflow-node-a2a-mute.js.map +1 -1
  41. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise-wt.d.ts +1 -0
  42. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise-wt.js +55 -0
  43. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise-wt.js.map +1 -0
  44. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise.d.ts +14 -0
  45. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise.js +184 -0
  46. package/speechflow-cli/dst/speechflow-node-a2a-rnnoise.js.map +1 -0
  47. package/speechflow-cli/dst/speechflow-node-a2a-speex.d.ts +14 -0
  48. package/speechflow-cli/dst/speechflow-node-a2a-speex.js +156 -0
  49. package/speechflow-cli/dst/speechflow-node-a2a-speex.js.map +1 -0
  50. package/speechflow-cli/dst/speechflow-node-a2a-vad.js +3 -3
  51. package/speechflow-cli/dst/speechflow-node-a2a-vad.js.map +1 -1
  52. package/speechflow-cli/dst/speechflow-node-a2a-wav.js +22 -17
  53. package/speechflow-cli/dst/speechflow-node-a2a-wav.js.map +1 -1
  54. package/speechflow-cli/dst/speechflow-node-a2t-awstranscribe.d.ts +18 -0
  55. package/speechflow-cli/dst/speechflow-node-a2t-awstranscribe.js +317 -0
  56. package/speechflow-cli/dst/speechflow-node-a2t-awstranscribe.js.map +1 -0
  57. package/speechflow-cli/dst/speechflow-node-a2t-deepgram.js +15 -13
  58. package/speechflow-cli/dst/speechflow-node-a2t-deepgram.js.map +1 -1
  59. package/speechflow-cli/dst/speechflow-node-a2t-openaitranscribe.d.ts +19 -0
  60. package/speechflow-cli/dst/speechflow-node-a2t-openaitranscribe.js +351 -0
  61. package/speechflow-cli/dst/speechflow-node-a2t-openaitranscribe.js.map +1 -0
  62. package/speechflow-cli/dst/speechflow-node-t2a-awspolly.d.ts +16 -0
  63. package/speechflow-cli/dst/speechflow-node-t2a-awspolly.js +171 -0
  64. package/speechflow-cli/dst/speechflow-node-t2a-awspolly.js.map +1 -0
  65. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js +19 -14
  66. package/speechflow-cli/dst/speechflow-node-t2a-elevenlabs.js.map +1 -1
  67. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js +11 -6
  68. package/speechflow-cli/dst/speechflow-node-t2a-kokoro.js.map +1 -1
  69. package/speechflow-cli/dst/speechflow-node-t2t-awstranslate.d.ts +13 -0
  70. package/speechflow-cli/dst/speechflow-node-t2t-awstranslate.js +141 -0
  71. package/speechflow-cli/dst/speechflow-node-t2t-awstranslate.js.map +1 -0
  72. package/speechflow-cli/dst/speechflow-node-t2t-deepl.js +13 -15
  73. package/speechflow-cli/dst/speechflow-node-t2t-deepl.js.map +1 -1
  74. package/speechflow-cli/dst/speechflow-node-t2t-format.js +10 -15
  75. package/speechflow-cli/dst/speechflow-node-t2t-format.js.map +1 -1
  76. package/speechflow-cli/dst/speechflow-node-t2t-ollama.js +44 -31
  77. package/speechflow-cli/dst/speechflow-node-t2t-ollama.js.map +1 -1
  78. package/speechflow-cli/dst/speechflow-node-t2t-openai.js +44 -45
  79. package/speechflow-cli/dst/speechflow-node-t2t-openai.js.map +1 -1
  80. package/speechflow-cli/dst/speechflow-node-t2t-sentence.js +8 -8
  81. package/speechflow-cli/dst/speechflow-node-t2t-sentence.js.map +1 -1
  82. package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js +10 -12
  83. package/speechflow-cli/dst/speechflow-node-t2t-subtitle.js.map +1 -1
  84. package/speechflow-cli/dst/speechflow-node-t2t-transformers.js +22 -27
  85. package/speechflow-cli/dst/speechflow-node-t2t-transformers.js.map +1 -1
  86. package/speechflow-cli/dst/speechflow-node-x2x-filter.d.ts +1 -0
  87. package/speechflow-cli/dst/speechflow-node-x2x-filter.js +50 -15
  88. package/speechflow-cli/dst/speechflow-node-x2x-filter.js.map +1 -1
  89. package/speechflow-cli/dst/speechflow-node-x2x-trace.js +17 -18
  90. package/speechflow-cli/dst/speechflow-node-x2x-trace.js.map +1 -1
  91. package/speechflow-cli/dst/speechflow-node-xio-device.js +13 -21
  92. package/speechflow-cli/dst/speechflow-node-xio-device.js.map +1 -1
  93. package/speechflow-cli/dst/speechflow-node-xio-mqtt.d.ts +1 -0
  94. package/speechflow-cli/dst/speechflow-node-xio-mqtt.js +22 -16
  95. package/speechflow-cli/dst/speechflow-node-xio-mqtt.js.map +1 -1
  96. package/speechflow-cli/dst/speechflow-node-xio-websocket.js +19 -19
  97. package/speechflow-cli/dst/speechflow-node-xio-websocket.js.map +1 -1
  98. package/speechflow-cli/dst/speechflow-node.d.ts +6 -3
  99. package/speechflow-cli/dst/speechflow-node.js +13 -2
  100. package/speechflow-cli/dst/speechflow-node.js.map +1 -1
  101. package/speechflow-cli/dst/speechflow-utils-audio-wt.d.ts +1 -0
  102. package/speechflow-cli/dst/speechflow-utils-audio-wt.js +124 -0
  103. package/speechflow-cli/dst/speechflow-utils-audio-wt.js.map +1 -0
  104. package/speechflow-cli/dst/speechflow-utils-audio.d.ts +13 -0
  105. package/speechflow-cli/dst/speechflow-utils-audio.js +137 -0
  106. package/speechflow-cli/dst/speechflow-utils-audio.js.map +1 -0
  107. package/speechflow-cli/dst/speechflow-utils.d.ts +18 -0
  108. package/speechflow-cli/dst/speechflow-utils.js +123 -35
  109. package/speechflow-cli/dst/speechflow-utils.js.map +1 -1
  110. package/speechflow-cli/dst/speechflow.js +69 -14
  111. package/speechflow-cli/dst/speechflow.js.map +1 -1
  112. package/speechflow-cli/etc/oxlint.jsonc +112 -11
  113. package/speechflow-cli/etc/stx.conf +2 -2
  114. package/speechflow-cli/etc/tsconfig.json +1 -1
  115. package/speechflow-cli/package.d/@shiguredo+rnnoise-wasm+2025.1.5.patch +25 -0
  116. package/speechflow-cli/package.json +102 -94
  117. package/speechflow-cli/src/lib.d.ts +24 -0
  118. package/speechflow-cli/src/speechflow-node-a2a-compressor-wt.ts +151 -0
  119. package/speechflow-cli/src/speechflow-node-a2a-compressor.ts +303 -0
  120. package/speechflow-cli/src/speechflow-node-a2a-expander-wt.ts +158 -0
  121. package/speechflow-cli/src/speechflow-node-a2a-expander.ts +212 -0
  122. package/speechflow-cli/src/speechflow-node-a2a-ffmpeg.ts +13 -3
  123. package/speechflow-cli/src/speechflow-node-a2a-filler.ts +223 -0
  124. package/speechflow-cli/src/speechflow-node-a2a-gain.ts +98 -0
  125. package/speechflow-cli/src/speechflow-node-a2a-gender.ts +31 -17
  126. package/speechflow-cli/src/speechflow-node-a2a-meter.ts +13 -9
  127. package/speechflow-cli/src/speechflow-node-a2a-mute.ts +3 -2
  128. package/speechflow-cli/src/speechflow-node-a2a-rnnoise-wt.ts +62 -0
  129. package/speechflow-cli/src/speechflow-node-a2a-rnnoise.ts +164 -0
  130. package/speechflow-cli/src/speechflow-node-a2a-speex.ts +137 -0
  131. package/speechflow-cli/src/speechflow-node-a2a-vad.ts +3 -3
  132. package/speechflow-cli/src/speechflow-node-a2a-wav.ts +20 -13
  133. package/speechflow-cli/src/speechflow-node-a2t-awstranscribe.ts +308 -0
  134. package/speechflow-cli/src/speechflow-node-a2t-deepgram.ts +15 -13
  135. package/speechflow-cli/src/speechflow-node-a2t-openaitranscribe.ts +337 -0
  136. package/speechflow-cli/src/speechflow-node-t2a-awspolly.ts +187 -0
  137. package/speechflow-cli/src/speechflow-node-t2a-elevenlabs.ts +19 -14
  138. package/speechflow-cli/src/speechflow-node-t2a-kokoro.ts +12 -7
  139. package/speechflow-cli/src/speechflow-node-t2t-awstranslate.ts +152 -0
  140. package/speechflow-cli/src/speechflow-node-t2t-deepl.ts +13 -15
  141. package/speechflow-cli/src/speechflow-node-t2t-format.ts +10 -15
  142. package/speechflow-cli/src/speechflow-node-t2t-ollama.ts +55 -42
  143. package/speechflow-cli/src/speechflow-node-t2t-openai.ts +58 -58
  144. package/speechflow-cli/src/speechflow-node-t2t-sentence.ts +10 -10
  145. package/speechflow-cli/src/speechflow-node-t2t-subtitle.ts +15 -16
  146. package/speechflow-cli/src/speechflow-node-t2t-transformers.ts +27 -32
  147. package/speechflow-cli/src/speechflow-node-x2x-filter.ts +20 -16
  148. package/speechflow-cli/src/speechflow-node-x2x-trace.ts +20 -19
  149. package/speechflow-cli/src/speechflow-node-xio-device.ts +15 -23
  150. package/speechflow-cli/src/speechflow-node-xio-mqtt.ts +23 -16
  151. package/speechflow-cli/src/speechflow-node-xio-websocket.ts +19 -19
  152. package/speechflow-cli/src/speechflow-node.ts +21 -8
  153. package/speechflow-cli/src/speechflow-utils-audio-wt.ts +172 -0
  154. package/speechflow-cli/src/speechflow-utils-audio.ts +147 -0
  155. package/speechflow-cli/src/speechflow-utils.ts +125 -32
  156. package/speechflow-cli/src/speechflow.ts +74 -17
  157. package/speechflow-ui-db/dst/index.js +31 -31
  158. package/speechflow-ui-db/etc/eslint.mjs +0 -1
  159. package/speechflow-ui-db/etc/tsc-client.json +3 -3
  160. package/speechflow-ui-db/package.json +11 -10
  161. package/speechflow-ui-db/src/app.vue +20 -6
  162. package/speechflow-ui-st/dst/index.js +26 -26
  163. package/speechflow-ui-st/etc/eslint.mjs +0 -1
  164. package/speechflow-ui-st/etc/tsc-client.json +3 -3
  165. package/speechflow-ui-st/package.json +11 -10
  166. package/speechflow-ui-st/src/app.vue +5 -12
@@ -0,0 +1,337 @@
1
+ /*
2
+ ** SpeechFlow - Speech Processing Flow Graph
3
+ ** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
4
+ ** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
5
+ */
6
+
7
+ /* standard dependencies */
8
+ import Stream from "node:stream"
9
+
10
+ /* external dependencies */
11
+ import OpenAI from "openai"
12
+ import { DateTime } from "luxon"
13
+ import SpeexResampler from "speex-resampler"
14
+ import ws from "ws"
15
+
16
+ /* internal dependencies */
17
+ import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
18
+ import * as utils from "./speechflow-utils"
19
+
20
+ /* SpeechFlow node for OpenAI Transcribe speech-to-text conversion */
21
+ export default class SpeechFlowNodeOpenAITranscribe extends SpeechFlowNode {
22
+ /* declare official node name */
23
+ public static name = "openaitranscribe"
24
+
25
+ /* internal state */
26
+ private static speexInitialized = false
27
+ private openai: OpenAI | null = null
28
+ private ws: ws.WebSocket | null = null
29
+ private queue: utils.SingleQueue<SpeechFlowChunk | null> | null = null
30
+ private resampler: SpeexResampler | null = null
31
+ private destroyed = false
32
+ private connectionTimeout: ReturnType<typeof setTimeout> | null = null
33
+
34
+ /* construct node */
35
+ constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
36
+ super(id, cfg, opts, args)
37
+
38
+ /* declare node configuration parameters */
39
+ this.configure({
40
+ key: { type: "string", val: process.env.SPEECHFLOW_OPENAI_KEY },
41
+ api: { type: "string", val: "https://api.openai.com/v1", match: /^https?:\/\/.+/ },
42
+ model: { type: "string", val: "gpt-4o-mini-transcribe" },
43
+ language: { type: "string", val: "de", match: /^(?:en|de)$/ },
44
+ interim: { type: "boolean", val: false }
45
+ })
46
+
47
+ /* declare node input/output format */
48
+ this.input = "audio"
49
+ this.output = "text"
50
+ }
51
+
52
+ /* one-time status of node */
53
+ async status () {
54
+ return {}
55
+ }
56
+
57
+ /* open node */
58
+ async open () {
59
+ /* sanity check situation */
60
+ if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
61
+ throw new Error("OpenAI transcribe node currently supports PCM-S16LE audio only")
62
+
63
+ /* clear destruction flag */
64
+ this.destroyed = false
65
+
66
+ /* create queue for results */
67
+ this.queue = new utils.SingleQueue<SpeechFlowChunk | null>()
68
+
69
+ /* create a store for the meta information */
70
+ const metastore = new utils.TimeStore<Map<string, any>>()
71
+
72
+ /* establish resampler from our standard audio sample rate (48Khz)
73
+ to OpenAI's maximum 24Khz input sample rate */
74
+ if (!SpeechFlowNodeOpenAITranscribe.speexInitialized) {
75
+ /* at least once initialize resampler */
76
+ await SpeexResampler.initPromise
77
+ SpeechFlowNodeOpenAITranscribe.speexInitialized = true
78
+ }
79
+ this.resampler = new SpeexResampler(1, this.config.audioSampleRate, 24000, 7)
80
+
81
+ /* instantiate OpenAI API */
82
+ this.openai = new OpenAI({
83
+ baseURL: this.params.api,
84
+ apiKey: this.params.key,
85
+ dangerouslyAllowBrowser: true
86
+ })
87
+
88
+ /* open the WebSocket connection for streaming */
89
+ const url = `${this.params.api.replace(/^http/, "ws")}/realtime?intent=transcription`
90
+ this.ws = new ws.WebSocket(url, {
91
+ headers: {
92
+ Authorization: `Bearer ${this.params.key}`,
93
+ "OpenAI-Beta": "realtime=v1"
94
+ }
95
+ })
96
+ const sendMessage = (obj: any) => {
97
+ this.ws?.send(JSON.stringify(obj))
98
+ }
99
+
100
+ /* wait for OpenAI API to be available */
101
+ await new Promise((resolve, reject) => {
102
+ this.connectionTimeout = setTimeout(() => {
103
+ if (this.connectionTimeout !== null) {
104
+ this.connectionTimeout = null
105
+ reject(new Error("OpenAI: timeout waiting for connection open"))
106
+ }
107
+ }, 8000)
108
+ this.ws!.once("open", () => {
109
+ this.log("info", "connection open")
110
+ if (this.connectionTimeout !== null) {
111
+ clearTimeout(this.connectionTimeout)
112
+ this.connectionTimeout = null
113
+ }
114
+ resolve(true)
115
+ })
116
+ this.ws!.once("error", (err) => {
117
+ if (this.connectionTimeout !== null) {
118
+ clearTimeout(this.connectionTimeout)
119
+ this.connectionTimeout = null
120
+ }
121
+ reject(err)
122
+ })
123
+ })
124
+
125
+ /* configure session */
126
+ sendMessage({
127
+ type: "transcription_session.update",
128
+ session: {
129
+ input_audio_format: "pcm16",
130
+ input_audio_transcription: {
131
+ model: this.params.model,
132
+ language: this.params.language
133
+ },
134
+ turn_detection: {
135
+ type: "server_vad",
136
+ threshold: 0.5,
137
+ prefix_padding_ms: 300,
138
+ silence_duration_ms: 500
139
+ }
140
+ }
141
+ })
142
+
143
+ /* hook onto session events */
144
+ this.ws.on("open", () => {
145
+ this.log("info", "WebSocket connection opened")
146
+ sendMessage({ type: "transcription.create" })
147
+ })
148
+ this.ws.on("close", () => {
149
+ this.log("info", "WebSocket connection closed")
150
+ this.queue!.write(null)
151
+ })
152
+ this.ws.on("error", (err) => {
153
+ this.log("error", `WebSocket connection error: ${err}`)
154
+ })
155
+ let text = ""
156
+ this.ws.on("message", (data) => {
157
+ let ev: any
158
+ try {
159
+ ev = JSON.parse(data.toString())
160
+ }
161
+ catch (err) {
162
+ this.log("warning", `failed to parse WebSocket message: ${err}`)
163
+ return
164
+ }
165
+ if (!(typeof ev === "object" && ev !== null)) {
166
+ this.log("warning", "received invalid WebSocket message")
167
+ return
168
+ }
169
+ switch (ev.type) {
170
+ case "transcription_session.created":
171
+ break
172
+ case "conversation.item.created":
173
+ text = ""
174
+ break
175
+ case "conversation.item.input_audio_transcription.delta": {
176
+ text += ev.delta as string
177
+ if (this.params.interim) {
178
+ const start = DateTime.now().diff(this.timeOpen!) // FIXME: OpenAI does not provide timestamps
179
+ const end = start // FIXME: OpenAI does not provide timestamps
180
+ const metas = metastore.fetch(start, end)
181
+ const meta = metas.reduce((prev: Map<string, any>, curr: Map<string, any>) => {
182
+ curr.forEach((val, key) => { prev.set(key, val) })
183
+ return prev
184
+ }, new Map<string, any>())
185
+ const chunk = new SpeechFlowChunk(start, end, "intermediate", "text", text)
186
+ chunk.meta = meta
187
+ this.queue!.write(chunk)
188
+ }
189
+ break
190
+ }
191
+ case "conversation.item.input_audio_transcription.completed": {
192
+ text = ev.transcript as string
193
+ const start = DateTime.now().diff(this.timeOpen!) // FIXME: OpenAI does not provide timestamps
194
+ const end = start // FIXME: OpenAI does not provide timestamps
195
+ const metas = metastore.fetch(start, end)
196
+ const meta = metas.reduce((prev: Map<string, any>, curr: Map<string, any>) => {
197
+ curr.forEach((val, key) => { prev.set(key, val) })
198
+ return prev
199
+ }, new Map<string, any>())
200
+ metastore.prune(start)
201
+ const chunk = new SpeechFlowChunk(start, end, "final", "text", text)
202
+ chunk.meta = meta
203
+ this.queue!.write(chunk)
204
+ text = ""
205
+ break
206
+ }
207
+ case "input_audio_buffer.speech_started":
208
+ this.log("info", "VAD: speech started")
209
+ break
210
+ case "input_audio_buffer.speech_stopped":
211
+ this.log("info", "VAD: speech stopped")
212
+ break
213
+ case "input_audio_buffer.committed":
214
+ this.log("info", "input buffer committed")
215
+ break
216
+ case "error":
217
+ this.log("error", `error: ${ev.error?.message}`)
218
+ break
219
+ default:
220
+ break
221
+ }
222
+ })
223
+
224
+ /* remember opening time to receive time zero offset */
225
+ this.timeOpen = DateTime.now()
226
+
227
+ /* provide Duplex stream and internally attach to OpenAI API */
228
+ const self = this
229
+ this.stream = new Stream.Duplex({
230
+ writableObjectMode: true,
231
+ readableObjectMode: true,
232
+ decodeStrings: false,
233
+ highWaterMark: 1,
234
+ write (chunk: SpeechFlowChunk, encoding, callback) {
235
+ if (self.destroyed || self.ws === null) {
236
+ callback(new Error("stream already destroyed"))
237
+ return
238
+ }
239
+ if (chunk.type !== "audio")
240
+ callback(new Error("expected audio input chunk"))
241
+ else if (!Buffer.isBuffer(chunk.payload))
242
+ callback(new Error("expected Buffer input chunk"))
243
+ else {
244
+ if (chunk.payload.byteLength > 0) {
245
+ self.log("debug", `send data (${chunk.payload.byteLength} bytes)`)
246
+ if (chunk.meta.size > 0)
247
+ metastore.store(chunk.timestampStart, chunk.timestampEnd, chunk.meta)
248
+ try {
249
+ const payload = self.resampler!.processChunk(chunk.payload)
250
+ const audioB64 = payload.toString("base64")
251
+ sendMessage({
252
+ type: "input_audio_buffer.append",
253
+ audio: audioB64 /* intentionally discard all time information */
254
+ })
255
+ }
256
+ catch (error) {
257
+ callback(error instanceof Error ? error : new Error("failed to send to OpenAI transcribe"))
258
+ return
259
+ }
260
+ }
261
+ callback()
262
+ }
263
+ },
264
+ read (size) {
265
+ if (self.destroyed || self.queue === null) {
266
+ this.push(null)
267
+ return
268
+ }
269
+ self.queue.read().then((chunk) => {
270
+ if (self.destroyed) {
271
+ this.push(null)
272
+ return
273
+ }
274
+ if (chunk === null) {
275
+ self.log("info", "received EOF signal")
276
+ this.push(null)
277
+ }
278
+ else {
279
+ self.log("debug", `received data (${chunk.payload.length} bytes)`)
280
+ this.push(chunk)
281
+ }
282
+ }).catch((error) => {
283
+ if (!self.destroyed)
284
+ self.log("error", `queue read error: ${error.message}`)
285
+ })
286
+ },
287
+ final (callback) {
288
+ if (self.destroyed || self.ws === null) {
289
+ callback()
290
+ return
291
+ }
292
+ try {
293
+ sendMessage({ type: "input_audio_buffer.commit" })
294
+ self.ws.close()
295
+ /* NOTICE: do not push null here -- let the OpenAI close event handle it */
296
+ callback()
297
+ }
298
+ catch (error) {
299
+ self.log("warning", `error closing OpenAI connection: ${error}`)
300
+ callback(error instanceof Error ? error : new Error("failed to close OpenAI connection"))
301
+ }
302
+ }
303
+ })
304
+ }
305
+
306
+ /* close node */
307
+ async close () {
308
+ /* indicate destruction first to stop all async operations */
309
+ this.destroyed = true
310
+
311
+ /* clear connection timeout */
312
+ if (this.connectionTimeout !== null) {
313
+ clearTimeout(this.connectionTimeout)
314
+ this.connectionTimeout = null
315
+ }
316
+
317
+ /* signal EOF to any pending read operations */
318
+ if (this.queue !== null) {
319
+ this.queue.write(null)
320
+ this.queue = null
321
+ }
322
+
323
+ /* close OpenAI connection */
324
+ if (this.ws !== null) {
325
+ this.ws.close()
326
+ this.ws = null
327
+ }
328
+ if (this.openai !== null)
329
+ this.openai = null
330
+
331
+ /* close stream */
332
+ if (this.stream !== null) {
333
+ this.stream.destroy()
334
+ this.stream = null
335
+ }
336
+ }
337
+ }
@@ -0,0 +1,187 @@
1
+ /*
2
+ ** SpeechFlow - Speech Processing Flow Graph
3
+ ** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
4
+ ** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
5
+ */
6
+
7
+ /* standard dependencies */
8
+ import Stream from "node:stream"
9
+
10
+ /* external dependencies */
11
+ import { getStreamAsBuffer } from "get-stream"
12
+ import SpeexResampler from "speex-resampler"
13
+ import {
14
+ PollyClient, SynthesizeSpeechCommand,
15
+ Engine, VoiceId, LanguageCode, TextType
16
+ } from "@aws-sdk/client-polly"
17
+
18
+ /* internal dependencies */
19
+ import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
20
+
21
+ /* SpeechFlow node for AWS Polly text-to-speech conversion */
22
+ export default class SpeechFlowNodeAWSPolly extends SpeechFlowNode {
23
+ /* declare official node name */
24
+ public static name = "awspolly"
25
+
26
+ /* internal state */
27
+ private client: PollyClient | null = null
28
+ private static speexInitialized = false
29
+ private destroyed = false
30
+ private resampler: SpeexResampler | null = null
31
+
32
+ /* construct node */
33
+ constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
34
+ super(id, cfg, opts, args)
35
+
36
+ /* declare node configuration parameters */
37
+ this.configure({
38
+ key: { type: "string", val: process.env.SPEECHFLOW_AMAZON_KEY },
39
+ secKey: { type: "string", val: process.env.SPEECHFLOW_AMAZON_KEY_SEC },
40
+ region: { type: "string", val: "eu-central-1" },
41
+ voice: { type: "string", val: "Amy", pos: 0, match: /^(?:Amy|Danielle|Joanna|Matthew|Ruth|Stephen|Vicki|Daniel)$/ },
42
+ language: { type: "string", val: "en", pos: 1, match: /^(?:de|en)$/ }
43
+ })
44
+
45
+ /* sanity check parameters */
46
+ if (!this.params.key)
47
+ throw new Error("AWS Access Key not configured")
48
+ if (!this.params.secKey)
49
+ throw new Error("AWS Secret Access Key not configured")
50
+
51
+ /* declare node input/output format */
52
+ this.input = "text"
53
+ this.output = "audio"
54
+ }
55
+
56
+ /* one-time status of node */
57
+ async status () {
58
+ return {}
59
+ }
60
+
61
+ /* open node */
62
+ async open () {
63
+ /* clear destruction flag */
64
+ this.destroyed = false
65
+
66
+ /* establish AWS Polly connection */
67
+ this.client = new PollyClient({
68
+ region: this.params.region,
69
+ credentials: {
70
+ accessKeyId: this.params.key,
71
+ secretAccessKey: this.params.secKey
72
+ }
73
+ })
74
+ if (this.client === null)
75
+ throw new Error("failed to establish AWS Polly client")
76
+
77
+ /* list of voices */
78
+ const voices = {
79
+ "Amy": { language: "en", languageCode: "en-GB", engine: "generative" },
80
+ "Danielle": { language: "en", languageCode: "en-US", engine: "generative" },
81
+ "Joanna": { language: "en", languageCode: "en-US", engine: "generative" },
82
+ "Matthew": { language: "en", languageCode: "en-US", engine: "generative" },
83
+ "Ruth": { language: "en", languageCode: "en-US", engine: "generative" },
84
+ "Stephen": { language: "en", languageCode: "en-US", engine: "generative" },
85
+ "Vicki": { language: "de", languageCode: "de-DE", engine: "generative" },
86
+ "Daniel": { language: "de", languageCode: "de-DE", engine: "generative" },
87
+ }
88
+ const voiceConfig = voices[this.params.voice as keyof typeof voices]
89
+ if (voiceConfig === undefined)
90
+ throw new Error("unsupported voice")
91
+ if (voiceConfig.language !== this.params.language)
92
+ throw new Error(`voice does only support language "${voiceConfig.language}"`)
93
+
94
+ /* perform text-to-speech operation with AWS Polly API */
95
+ const textToSpeech = async (text: string) => {
96
+ const cmd = new SynthesizeSpeechCommand({
97
+ LanguageCode: voiceConfig.languageCode as LanguageCode,
98
+ Engine: voiceConfig.engine as Engine,
99
+ VoiceId: this.params.voice as VoiceId,
100
+ OutputFormat: "pcm",
101
+ SampleRate: "16000", /* maximum supported for PCM output */
102
+ TextType: "text" as TextType,
103
+ Text: text
104
+ })
105
+ const res = await this.client!.send(cmd)
106
+ const stream = res.AudioStream as AsyncIterable<Uint8Array> | null
107
+ if (stream === null)
108
+ throw new Error("stream not returned")
109
+ const buffer = await getStreamAsBuffer(stream)
110
+ const bufferResampled = this.resampler!.processChunk(buffer)
111
+ return bufferResampled
112
+ }
113
+
114
+ /* establish resampler from AWS Polly's maximum 16Khz output
115
+ (for PCM output) to our standard audio sample rate (48KHz) */
116
+ if (!SpeechFlowNodeAWSPolly.speexInitialized) {
117
+ /* at least once initialize resampler */
118
+ await SpeexResampler.initPromise
119
+ SpeechFlowNodeAWSPolly.speexInitialized = true
120
+ }
121
+ this.resampler = new SpeexResampler(1, 16000, this.config.audioSampleRate, 7)
122
+
123
+ /* create transform stream and connect it to the AWS Polly API */
124
+ const self = this
125
+ this.stream = new Stream.Transform({
126
+ writableObjectMode: true,
127
+ readableObjectMode: true,
128
+ decodeStrings: false,
129
+ highWaterMark: 1,
130
+ transform (chunk: SpeechFlowChunk, encoding, callback) {
131
+ if (self.destroyed) {
132
+ callback(new Error("stream already destroyed"))
133
+ return
134
+ }
135
+ if (Buffer.isBuffer(chunk.payload))
136
+ callback(new Error("invalid chunk payload type"))
137
+ else if (chunk.payload.length > 0) {
138
+ self.log("debug", `send data (${chunk.payload.length} bytes): "${chunk.payload}"`)
139
+ textToSpeech(chunk.payload as string).then((buffer) => {
140
+ if (self.destroyed)
141
+ throw new Error("stream destroyed during processing")
142
+ const chunkNew = chunk.clone()
143
+ chunkNew.type = "audio"
144
+ chunkNew.payload = buffer
145
+ this.push(chunkNew)
146
+ callback()
147
+ }).catch((error) => {
148
+ callback(error instanceof Error ?
149
+ error : new Error(`failed to send to AWS Polly: ${String(error)}`))
150
+ })
151
+ }
152
+ else
153
+ callback()
154
+ },
155
+ final (callback) {
156
+ if (self.destroyed) {
157
+ callback()
158
+ return
159
+ }
160
+ this.push(null)
161
+ callback()
162
+ }
163
+ })
164
+ }
165
+
166
+ /* close node */
167
+ async close () {
168
+ /* indicate destruction */
169
+ this.destroyed = true
170
+
171
+ /* destroy resampler */
172
+ if (this.resampler !== null)
173
+ this.resampler = null
174
+
175
+ /* destroy AWS Polly API */
176
+ if (this.client !== null) {
177
+ this.client.destroy()
178
+ this.client = null
179
+ }
180
+ /* destroy stream */
181
+ if (this.stream !== null) {
182
+ this.stream.destroy()
183
+ this.stream = null
184
+ }
185
+ }
186
+ }
187
+
@@ -52,10 +52,17 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode {
52
52
 
53
53
  /* one-time status of node */
54
54
  async status () {
55
- const elevenlabs = new ElevenLabs.ElevenLabsClient({ apiKey: this.params.key })
56
- const subscription = await elevenlabs.user.subscription.get()
57
- const percent = subscription.characterCount / subscription.characterLimit
58
- return { usage: `${percent.toFixed(2)}%` }
55
+ try {
56
+ const elevenlabs = new ElevenLabs.ElevenLabsClient({ apiKey: this.params.key })
57
+ const subscription = await elevenlabs.user.subscription.get()
58
+ const percent = subscription.characterLimit > 0
59
+ ? subscription.characterCount / subscription.characterLimit
60
+ : 0
61
+ return { usage: `${percent.toFixed(2)}%` }
62
+ }
63
+ catch (_error) {
64
+ return { usage: "unknown" }
65
+ }
59
66
  }
60
67
 
61
68
  /* open node */
@@ -88,15 +95,15 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode {
88
95
  /* determine voice for text-to-speech operation
89
96
  (for details see https://elevenlabs.io/text-to-speech) */
90
97
  const voices = await this.elevenlabs.voices.getAll()
91
- let voice = voices.voices.find((voice) => voice.name === this.params.voice)
98
+ let voice = voices.voices.find((v) => v.name === this.params.voice)
92
99
  if (voice === undefined) {
93
- voice = voices.voices.find((voice) => voice.name!.startsWith(this.params.voice))
100
+ voice = voices.voices.find((v) => (v.name ?? "").startsWith(this.params.voice))
94
101
  if (voice === undefined)
95
102
  throw new Error(`invalid ElevenLabs voice "${this.params.voice}"`)
96
103
  }
97
- const info = Object.keys(voice.labels ?? {}).length > 0 ?
98
- (", " + Object.entries(voice.labels!)
99
- .map(([ key, val ]) => `${key}: "${val}"`).join(", ")) : ""
104
+ const labels = voice.labels ?? {}
105
+ const info = Object.keys(labels).length > 0 ?
106
+ ", " + Object.entries(labels).map(([ key, val ]) => `${key}: "${val}"`).join(", ") : ""
100
107
  this.log("info", `selected voice: name: "${voice.name}"${info}`)
101
108
 
102
109
  /* perform text-to-speech operation with Elevenlabs API */
@@ -139,11 +146,9 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode {
139
146
  decodeStrings: false,
140
147
  highWaterMark: 1,
141
148
  transform (chunk: SpeechFlowChunk, encoding, callback) {
142
- if (self.destroyed) {
149
+ if (self.destroyed)
143
150
  callback(new Error("stream already destroyed"))
144
- return
145
- }
146
- if (Buffer.isBuffer(chunk.payload))
151
+ else if (Buffer.isBuffer(chunk.payload))
147
152
  callback(new Error("invalid chunk payload type"))
148
153
  else {
149
154
  (async () => {
@@ -158,12 +163,12 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode {
158
163
  }
159
164
  }
160
165
  try {
161
- const stream = await speechStream(chunk.payload as string)
162
166
  if (self.destroyed) {
163
167
  clearProcessTimeout()
164
168
  callback(new Error("stream destroyed during processing"))
165
169
  return
166
170
  }
171
+ const stream = await speechStream(chunk.payload as string)
167
172
  const buffer = await getStreamAsBuffer(stream)
168
173
  if (self.destroyed) {
169
174
  clearProcessTimeout()
@@ -51,26 +51,31 @@ export default class SpeechFlowNodeKokoro extends SpeechFlowNode {
51
51
  artifact += `:${progress.file}`
52
52
  let percent = 0
53
53
  if (typeof progress.loaded === "number" && typeof progress.total === "number")
54
- percent = (progress.loaded as number / progress.total as number) * 100
54
+ percent = (progress.loaded / progress.total) * 100
55
55
  else if (typeof progress.progress === "number")
56
56
  percent = progress.progress
57
57
  if (percent > 0)
58
58
  progressState.set(artifact, percent)
59
59
  }
60
- const interval = setInterval(() => {
60
+ let interval: ReturnType<typeof setInterval> | null = setInterval(() => {
61
61
  for (const [ artifact, percent ] of progressState) {
62
62
  this.log("info", `downloaded ${percent.toFixed(2)}% of artifact "${artifact}"`)
63
63
  if (percent >= 100.0)
64
64
  progressState.delete(artifact)
65
65
  }
66
- if (progressState.size === 0)
66
+ if (progressState.size === 0 && interval !== null) {
67
67
  clearInterval(interval)
68
+ interval = null
69
+ }
68
70
  }, 1000)
69
71
  this.kokoro = await KokoroTTS.from_pretrained(model, {
70
72
  dtype: "q4f16",
71
73
  progress_callback: progressCallback
72
74
  })
73
- clearInterval(interval)
75
+ if (interval !== null) {
76
+ clearInterval(interval)
77
+ interval = null
78
+ }
74
79
  if (this.kokoro === null)
75
80
  throw new Error("failed to instantiate Kokoro")
76
81
 
@@ -78,19 +83,19 @@ export default class SpeechFlowNodeKokoro extends SpeechFlowNode {
78
83
  output to our standard audio sample rate (48KHz) */
79
84
  if (!SpeechFlowNodeKokoro.speexInitialized) {
80
85
  /* at least once initialize resampler */
81
- await SpeexResampler.initPromise
82
86
  SpeechFlowNodeKokoro.speexInitialized = true
87
+ await SpeexResampler.initPromise
83
88
  }
84
89
  this.resampler = new SpeexResampler(1, 24000, this.config.audioSampleRate, 7)
85
90
 
86
91
  /* determine voice for text-to-speech operation */
87
- const voices = {
92
+ const voices: Record<string, string> = {
88
93
  "Aoede": "af_aoede",
89
94
  "Heart": "af_heart",
90
95
  "Puck": "am_puck",
91
96
  "Fenrir": "am_fenrir"
92
97
  }
93
- const voice = ((voices as any)[this.params.voice]) as string | undefined
98
+ const voice = voices[this.params.voice]
94
99
  if (voice === undefined)
95
100
  throw new Error(`invalid Kokoro voice "${this.params.voice}"`)
96
101