npm - speechflow - Versions diffs - 0.9.4 → 0.9.5 - Mend

speechflow 0.9.4 → 0.9.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/README.md +8 -3
package/dst/speechflow-node-elevenlabs.d.ts +1 -0
package/dst/speechflow-node-elevenlabs.js +59 -15
package/dst/speechflow-node.d.ts +1 -1
package/dst/speechflow-node.js +11 -3
package/package.json +2 -2
package/sample.yaml +20 -14
package/src/speechflow-node-elevenlabs.ts +69 -18
package/src/speechflow-node.ts +15 -7

package/README.md CHANGED Viewed

@@ -17,14 +17,19 @@ About
 **SpeechFlow** is a command-line interface based tool for establishing a
 directed data flow graph of audio and text processing nodes. This way,
 it allows one to perform various speech processing tasks in a flexible way.
-Currently, **SpeechFlow** comes with graph nodes for local file I/O, local audio
+**SpeechFlow** comes with built-in graph nodes for local file I/O, local audio
 device I/O, local/remote WebSocket network I/O, cloud-based [Deepgram](https://deepgram.com)
 speech-to-text conversion, cloud-based [DeepL](https://deepl.com) text-to-text
 translation, local [Gemma/Ollama](https://ollama.com/library/gemma3)
 text-to-text translation, cloud-based [ElevenLabs](https://elevenlabs.io/)
 text-to-speech conversion, and local [FFmpeg](https://ffmpeg.org/)
-speech-to-speech encoding. **SpeechFlow** is written in TypeScript and
-ships as a package for the Node Package Manager (NPM).
+speech-to-speech encoding. Additional SpeechFlow graph nodes can be provided externally
+by NPM packages named `speechflow-node-xxx` which expose a class
+derived from the exported `SpeechFlowNode` class of the `speechflow` package.
+**SpeechFlow** is written in TypeScript and
+ships as an installable package for the Node Package Manager (NPM).
 Installation
 ------------

package/dst/speechflow-node-elevenlabs.d.ts CHANGED Viewed

@@ -2,6 +2,7 @@ import SpeechFlowNode from "./speechflow-node";
 export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode {
     static name: string;
     private elevenlabs;
+    private static speexInitialized;
     constructor(id: string, opts: {
         [id: string]: any;
     }, args: any[]);

package/dst/speechflow-node-elevenlabs.js CHANGED Viewed

@@ -47,6 +47,7 @@ const node_events_1 = require("node:events");
 /*  external dependencies  */
 const ElevenLabs = __importStar(require("elevenlabs"));
 const get_stream_1 = require("get-stream");
+const speex_resampler_1 = __importDefault(require("speex-resampler"));
 /*  internal dependencies  */
 const speechflow_node_1 = __importDefault(require("./speechflow-node"));
 /*
@@ -68,14 +69,17 @@ class SpeechFlowNodeElevenlabs extends speechflow_node_1.default {
     static name = "elevenlabs";
     /*  internal state  */
     elevenlabs = null;
+    static speexInitialized = false;
     /*  construct node  */
     constructor(id, opts, args) {
         super(id, opts, args);
         /*  declare node configuration parameters  */
         this.configure({
             key: { type: "string", val: process.env.SPEECHFLOW_KEY_ELEVENLABS },
-            voice: { type: "string", val: "Brian", pos: 0 },
-            language: { type: "string", val: "de", pos: 1 }
+            voice: { type: "string", val: "Brian", pos: 0, match: /^(?:.+)$/ },
+            language: { type: "string", val: "en", pos: 1, match: /^(?:de|en)$/ },
+            speed: { type: "number", val: 1.05, pos: 2, match: (n) => n >= 0.7 && n <= 1.2 },
+            optimize: { type: "string", val: "latency", pos: 3, match: /^(?:latency|quality)$/ }
         });
         /*  declare node input/output format  */
         this.input = "text";
@@ -83,39 +87,76 @@ class SpeechFlowNodeElevenlabs extends speechflow_node_1.default {
     }
     /*  open node  */
     async open() {
+        /*  establish ElevenLabs API connection  */
         this.elevenlabs = new ElevenLabs.ElevenLabsClient({
             apiKey: this.params.key
         });
+        /*  determine maximum sample rate of ElevenLabs tier  */
+        const maxSampleRates = {
+            "free": 16000,
+            "starter": 22050,
+            "creator": 24000,
+            "independent_publisher": 44100,
+            "growing_business": 44100,
+            "enterprise": 44100
+        };
+        const sub = await this.elevenlabs.user.getSubscription();
+        const tier = (sub.tier ?? "free");
+        this.log("info", `determined ElevenLabs tier: "${tier}"`);
+        let maxSampleRate = 16000;
+        if (maxSampleRates[tier] !== undefined)
+            maxSampleRate = maxSampleRates[tier];
+        this.log("info", `determined maximum audio sample rate: ${maxSampleRate}`);
+        /*  determine voice for text-to-speech operation
+            (for details see https://elevenlabs.io/text-to-speech)  */
         const voices = await this.elevenlabs.voices.getAll();
-        const voice = voices.voices.find((voice) => voice.name === this.params.voice);
-        if (voice === undefined)
-            throw new Error(`invalid ElevenLabs voice "${this.params.voice}"`);
+        let voice = voices.voices.find((voice) => voice.name === this.params.voice);
+        if (voice === undefined) {
+            voice = voices.voices.find((voice) => voice.name.startsWith(this.params.voice));
+            if (voice === undefined)
+                throw new Error(`invalid ElevenLabs voice "${this.params.voice}"`);
+        }
+        const info = Object.keys(voice.labels ?? {}).length > 0 ?
+            (", " + Object.entries(voice.labels)
+                .map(([key, val]) => `${key}: "${val}"`).join(", ")) : "";
+        this.log("info", `selected voice: name: "${voice.name}"${info}`);
+        /*  perform text-to-speech operation with Elevenlabs API  */
+        const model = this.params.optimize === "quality" ?
+            "eleven_multilingual_v2" :
+            "eleven_flash_v2_5";
         const speechStream = (text) => {
             return this.elevenlabs.textToSpeech.convert(voice.voice_id, {
                 text,
-                optimize_streaming_latency: 2,
-                output_format: "pcm_16000", // S16LE
-                model_id: "eleven_flash_v2_5",
-                /*
+                model_id: model,
+                language_code: this.params.language,
+                output_format: `pcm_${maxSampleRate}`,
+                seed: 815, /* arbitrary, but fixated by us */
                 voice_settings: {
-                    stability: 0,
-                    similarity_boost: 0
+                    speed: this.params.speed
                 }
-                */
             }, {
                 timeoutInSeconds: 30,
                 maxRetries: 10
             });
         };
+        /*  internal queue of results  */
         const queue = new node_events_1.EventEmitter();
+        /*  establish resampler from the tier-dependent ElevenLabs output
+            sample rate to our standard audio sample rate (48kHz)  */
+        if (!SpeechFlowNodeElevenlabs.speexInitialized) {
+            /*  at least once initialize resampler  */
+            await speex_resampler_1.default.initPromise;
+            SpeechFlowNodeElevenlabs.speexInitialized = true;
+        }
+        const resampler = new speex_resampler_1.default(1, maxSampleRate, this.config.audioSampleRate, 7);
+        /*  create duplex stream and connect it to the ElevenLabs API  */
         this.stream = new node_stream_1.default.Duplex({
             write(chunk, encoding, callback) {
-                if (encoding !== "utf8" && encoding !== "utf-8")
-                    callback(new Error("only text input supported by Elevenlabs node"));
                 const data = chunk.toString();
                 speechStream(data).then((stream) => {
                     (0, get_stream_1.getStreamAsBuffer)(stream).then((buffer) => {
-                        queue.emit("audio", buffer);
+                        const bufferResampled = resampler.processChunk(buffer);
+                        queue.emit("audio", bufferResampled);
                         callback();
                     }).catch((error) => {
                         callback(error);
@@ -138,6 +179,9 @@ class SpeechFlowNodeElevenlabs extends speechflow_node_1.default {
             this.stream.destroy();
             this.stream = null;
         }
+        /*  destroy ElevenLabs API  */
+        if (this.elevenlabs !== null)
+            this.elevenlabs = null;
     }
 }
 exports.default = SpeechFlowNodeElevenlabs;

package/dst/speechflow-node.d.ts CHANGED Viewed

@@ -27,7 +27,7 @@ export default class SpeechFlowNode extends Events.EventEmitter {
             type: string;
             pos?: number;
             val?: any;
-            match?: RegExp;
+            match?: RegExp | ((x: any) => boolean);
         };
     }): void;
     connect(other: SpeechFlowNode): void;

package/dst/speechflow-node.js CHANGED Viewed

@@ -46,9 +46,11 @@ class SpeechFlowNode extends node_events_1.default.EventEmitter {
                     throw new Error(`invalid type of named parameter "${name}" ` +
                         `(has to be ${spec[name].type})`);
                 if ("match" in spec[name]
-                    && this.opts[name].match(spec[name].match) === null)
-                    throw new Error(`invalid value of named parameter "${name}" ` +
-                        `(has to match ${spec[name].match})`);
+                    && ((spec[name].match instanceof RegExp
+                        && this.opts[name].match(spec[name].match) === null)
+                        || (typeof spec[name].match === "function"
+                            && !spec[name].match(this.opts[name]))))
+                    throw new Error(`invalid value "${this.opts[name]}" of named parameter "${name}"`);
                 this.params[name] = this.opts[name];
             }
             else if (this.opts[name] === undefined
@@ -63,6 +65,12 @@ class SpeechFlowNode extends node_events_1.default.EventEmitter {
                     && this.args[spec[name].pos].match(spec[name].match) === null)
                     throw new Error(`invalid value of positional parameter "${name}" ` +
                         `(has to match ${spec[name].match})`);
+                if ("match" in spec[name]
+                    && ((spec[name].match instanceof RegExp
+                        && this.args[spec[name].pos].match(spec[name].match) === null)
+                        || (typeof spec[name].match === "function"
+                            && !spec[name].match(this.args[spec[name].pos]))))
+                    throw new Error(`invalid value "${this.opts[name]}" of positional parameter "${name}"`);
                 this.params[name] = this.args[spec[name].pos];
             }
             else if ("val" in spec[name] && spec[name].val !== undefined)

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
     "name":                                 "speechflow",
-    "version":                              "0.9.4",
-    "x-stdver":                             "0.9.4-EA",
+    "version":                              "0.9.5",
+    "x-stdver":                             "0.9.5-EA",
     "x-release":                            "2025-04-27",
     "homepage":                             "https://github.com/rse/speechflow",
     "description":                          "Speech Processing Flow Graph",

package/sample.yaml CHANGED Viewed

@@ -4,30 +4,36 @@
 #  capture audio from microphone to file
 capture-microphone: |
-     device(device: "wasapi:VoiceMeeter Output", mode: "r") |
-     file(path: "capture.pcm", mode: "w", type: "audio")
+    device(device: "wasapi:VoiceMeeter Output", mode: "r") |
+    file(path: "capture.pcm", mode: "w", type: "audio")
 #  generate audio file with narration of text file
 generate-narration: |
-     file(path: argv.0, mode: "r", type: "audio") |
-     deepgram(key: env.SPEECHFLOW_KEY_DEEPGRAM)   |
-     file(path: argv.1, mode: "w", type: "text")
+    file(path: argv.0, mode: "r", type: "audio") |
+    deepgram(key: env.SPEECHFLOW_KEY_DEEPGRAM)   |
+    file(path: argv.1, mode: "w", type: "text")
 #  pass-through audio from microphone to speaker and in parallel record it to file
 microphone-to-speaker: |
-     device(device: "wasapi:VoiceMeeter Output", mode: "r") | {
-         file(path: "capture.pcm", mode: "w", type: "audio"),
-         device(device: "wasapi:VoiceMeeter VAIO3 Input", mode: "w")
-     }
+    device(device: "wasapi:VoiceMeeter Output", mode: "r") | {
+        file(path: "capture.pcm", mode: "w", type: "audio"),
+        device(device: "wasapi:VoiceMeeter VAIO3 Input", mode: "w")
+    }
 #  translate stdin to stdout
 translation: |
-     file(path: "-", mode: "r", type: "text") |
-     deepl(key: env.SPEECHFLOW_KEY_DEEPL, src: "de", dst: "en-US") |
-     file(path: "-", mode: "w", type: "text")
+    file(path: "-", mode: "r", type: "text") |
+    deepl(key: env.SPEECHFLOW_KEY_DEEPL, src: "de", dst: "en-US") |
+    file(path: "-", mode: "w", type: "text")
 #  sample for development
 sample: |
-     device(device: "coreaudio:Elgato Wave:3", mode: "r") |
-     file(path: "capture.pcm", mode: "w", type: "audio")
+    file(path: "sample.txt", mode: "r", type: "text") |
+    elevenlabs(voice: "Mark", speed: 1.05) |
+    ffmpeg(dst: "wav") |
+    file(path: "sample.wav", mode: "w", type: "audio")
+sample2: |
+    device(device: "coreaudio:Elgato Wave:3", mode: "r") |
+    ffmpeg(dst: "wav") |
+    file(path: "sample.wav", mode: "w", type: "audio")

package/src/speechflow-node-elevenlabs.ts CHANGED Viewed

@@ -11,6 +11,7 @@ import { EventEmitter }      from "node:events"
 /*  external dependencies  */
 import * as ElevenLabs       from "elevenlabs"
 import { getStreamAsBuffer } from "get-stream"
+import SpeexResampler        from "speex-resampler"
 /*  internal dependencies  */
 import SpeechFlowNode        from "./speechflow-node"
@@ -36,6 +37,7 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode {
     /*  internal state  */
     private elevenlabs: ElevenLabs.ElevenLabsClient | null = null
+    private static speexInitialized = false
     /*  construct node  */
     constructor (id: string, opts: { [ id: string ]: any }, args: any[]) {
@@ -44,8 +46,10 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode {
         /*  declare node configuration parameters  */
         this.configure({
             key:      { type: "string", val: process.env.SPEECHFLOW_KEY_ELEVENLABS },
-            voice:    { type: "string", val: "Brian",  pos: 0 },
-            language: { type: "string", val: "de",     pos: 1 }
+            voice:    { type: "string", val: "Brian",   pos: 0, match: /^(?:.+)$/ },
+            language: { type: "string", val: "en",      pos: 1, match: /^(?:de|en)$/ },
+            speed:    { type: "number", val: 1.05,      pos: 2, match: (n: number) => n >= 0.7 && n <= 1.2 },
+            optimize: { type: "string", val: "latency", pos: 3, match: /^(?:latency|quality)$/ }
         })
         /*  declare node input/output format  */
@@ -55,39 +59,82 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode {
     /*  open node  */
     async open () {
+        /*  establish ElevenLabs API connection  */
         this.elevenlabs = new ElevenLabs.ElevenLabsClient({
             apiKey: this.params.key
         })
+        /*  determine maximum sample rate of ElevenLabs tier  */
+        const maxSampleRates = {
+            "free":                  16000,
+            "starter":               22050,
+            "creator":               24000,
+            "independent_publisher": 44100,
+            "growing_business":      44100,
+            "enterprise":            44100
+        }
+        const sub = await this.elevenlabs.user.getSubscription()
+        const tier = (sub.tier ?? "free") as keyof typeof maxSampleRates
+        this.log("info", `determined ElevenLabs tier: "${tier}"`)
+        let maxSampleRate = 16000
+        if (maxSampleRates[tier] !== undefined)
+            maxSampleRate = maxSampleRates[tier]
+        this.log("info", `determined maximum audio sample rate: ${maxSampleRate}`)
+        /*  determine voice for text-to-speech operation
+            (for details see https://elevenlabs.io/text-to-speech)  */
         const voices = await this.elevenlabs.voices.getAll()
-        const voice = voices.voices.find((voice) => voice.name === this.params.voice)
-        if (voice === undefined)
-            throw new Error(`invalid ElevenLabs voice "${this.params.voice}"`)
+        let voice = voices.voices.find((voice) => voice.name === this.params.voice)
+        if (voice === undefined) {
+            voice = voices.voices.find((voice) => voice.name!.startsWith(this.params.voice))
+            if (voice === undefined)
+                throw new Error(`invalid ElevenLabs voice "${this.params.voice}"`)
+        }
+        const info = Object.keys(voice.labels ?? {}).length > 0 ?
+            (", " + Object.entries(voice.labels!)
+                .map(([ key, val ]) => `${key}: "${val}"`).join(", ")) : ""
+        this.log("info", `selected voice: name: "${voice.name}"${info}`)
+        /*  perform text-to-speech operation with Elevenlabs API  */
+        const model = this.params.optimize === "quality" ?
+            "eleven_multilingual_v2" :
+            "eleven_flash_v2_5"
         const speechStream = (text: string) => {
             return this.elevenlabs!.textToSpeech.convert(voice.voice_id, {
                 text,
-                optimize_streaming_latency: 2,
-                output_format: "pcm_16000", // S16LE
-                model_id: "eleven_flash_v2_5",
-                /*
+                model_id:         model,
+                language_code:    this.params.language,
+                output_format:    `pcm_${maxSampleRate}` as ElevenLabs.ElevenLabs.OutputFormat,
+                seed:             815, /* arbitrary, but fixated by us */
                 voice_settings: {
-                    stability: 0,
-                    similarity_boost: 0
+                    speed:        this.params.speed
                 }
-                */
             }, {
                 timeoutInSeconds: 30,
-                maxRetries: 10
+                maxRetries:       10
             })
         }
+        /*  internal queue of results  */
         const queue = new EventEmitter()
+        /*  establish resampler from the tier-dependent ElevenLabs output
+            sample rate to our standard audio sample rate (48kHz)  */
+        if (!SpeechFlowNodeElevenlabs.speexInitialized) {
+            /*  at least once initialize resampler  */
+            await SpeexResampler.initPromise
+            SpeechFlowNodeElevenlabs.speexInitialized = true
+        }
+        const resampler = new SpeexResampler(1, maxSampleRate, this.config.audioSampleRate, 7)
+        /*  create duplex stream and connect it to the ElevenLabs API  */
         this.stream = new Stream.Duplex({
-            write (chunk: Buffer, encoding: BufferEncoding, callback: (error?: Error | null | undefined) => void) {
-                if (encoding !== "utf8" && encoding !== "utf-8")
-                    callback(new Error("only text input supported by Elevenlabs node"))
+            write (chunk: Buffer, encoding, callback) {
                 const data = chunk.toString()
                 speechStream(data).then((stream) => {
                     getStreamAsBuffer(stream).then((buffer) => {
-                        queue.emit("audio", buffer)
+                        const bufferResampled = resampler.processChunk(buffer)
+                        queue.emit("audio", bufferResampled)
                         callback()
                     }).catch((error) => {
                         callback(error)
@@ -96,7 +143,7 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode {
                     callback(error)
                 })
             },
-            read (size: number) {
+            read (size) {
                 queue.once("audio", (buffer: Buffer) => {
                     this.push(buffer, "binary")
                 })
@@ -111,6 +158,10 @@ export default class SpeechFlowNodeElevenlabs extends SpeechFlowNode {
             this.stream.destroy()
             this.stream = null
         }
+        /*  destroy ElevenLabs API  */
+        if (this.elevenlabs !== null)
+            this.elevenlabs = null
     }
 }

package/src/speechflow-node.ts CHANGED Viewed

@@ -37,7 +37,7 @@ export default class SpeechFlowNode extends Events.EventEmitter {
     }
     /*  INTERNAL: utility function: create "params" attribute from constructor of sub-classes  */
-    configure (spec: { [ id: string ]: { type: string, pos?: number, val?: any, match?: RegExp } }) {
+    configure (spec: { [ id: string ]: { type: string, pos?: number, val?: any, match?: RegExp | ((x: any) => boolean) } }) {
         for (const name of Object.keys(spec)) {
             if (this.opts[name] !== undefined) {
                 /*  named parameter  */
@@ -45,9 +45,11 @@ export default class SpeechFlowNode extends Events.EventEmitter {
                     throw new Error(`invalid type of named parameter "${name}" ` +
                         `(has to be ${spec[name].type})`)
                 if ("match" in spec[name]
-                    && this.opts[name].match(spec[name].match) === null)
-                    throw new Error(`invalid value of named parameter "${name}" ` +
-                        `(has to match ${spec[name].match})`)
+                    && (   (   spec[name].match instanceof RegExp
+                            && this.opts[name].match(spec[name].match) === null)
+                        || (   typeof spec[name].match === "function"
+                            && !spec[name].match(this.opts[name])    )          ))
+                    throw new Error(`invalid value "${this.opts[name]}" of named parameter "${name}"`)
                 this.params[name] = this.opts[name]
             }
             else if (this.opts[name] === undefined
@@ -55,14 +57,20 @@ export default class SpeechFlowNode extends Events.EventEmitter {
                 && typeof spec[name].pos === "number"
                 && spec[name].pos < this.args.length) {
                 /*  positional argument  */
-                if (typeof this.args[spec[name].pos!] !== spec[name].type)
+                if (typeof this.args[spec[name].pos] !== spec[name].type)
                     throw new Error(`invalid type of positional parameter "${name}" ` +
                         `(has to be ${spec[name].type})`)
                 if ("match" in spec[name]
-                    && this.args[spec[name].pos!].match(spec[name].match) === null)
+                    && this.args[spec[name].pos].match(spec[name].match) === null)
                     throw new Error(`invalid value of positional parameter "${name}" ` +
                         `(has to match ${spec[name].match})`)
-                this.params[name] = this.args[spec[name].pos!]
+                if ("match" in spec[name]
+                    && (   (   spec[name].match instanceof RegExp
+                            && this.args[spec[name].pos].match(spec[name].match) === null)
+                        || (   typeof spec[name].match === "function"
+                            && !spec[name].match(this.args[spec[name].pos])    )          ))
+                    throw new Error(`invalid value "${this.opts[name]}" of positional parameter "${name}"`)
+                this.params[name] = this.args[spec[name].pos]
             }
             else if ("val" in spec[name] && spec[name].val !== undefined)
                 /*  default argument  */