modelfusion 0.47.3 → 0.48.0

package/README.md CHANGED
@@ -274,7 +274,11 @@ Providers: [OpenAI (Whisper)](https://modelfusion.dev/integration/model-provider
 
 ### [Synthesize Speech](https://modelfusion.dev/guide/function/synthesize-speech)
 
-Turn text into speech (audio).
+Generate speech (audio) from text. Also called TTS (text-to-speech).
+
+Providers: [Eleven Labs](https://modelfusion.dev/integration/model-provider/elevenlabs), [LMNT](https://modelfusion.dev/integration/model-provider/lmnt)
+
+#### Standard mode
 
 ```ts
 // `speech` is a Buffer with MP3 audio data
@@ -289,7 +293,28 @@ const speech = await synthesizeSpeech(
 );
 ```
 
-Providers: [Eleven Labs](https://modelfusion.dev/integration/model-provider/elevenlabs), [LMNT](https://modelfusion.dev/integration/model-provider/lmnt)
+#### Duplex streaming mode
+
+```ts
+const textStream = await streamText(/* ... */);
+
+const speechStream = await synthesizeSpeech(
+  new ElevenLabsSpeechSynthesisModel({
+    voice: "pNInz6obpgDQGcFmaJgB", // Adam
+    model: "eleven_monolingual_v1",
+    voiceSettings: { stability: 1, similarityBoost: 0.35 },
+    generationConfig: {
+      chunkLengthSchedule: [50, 90, 120, 150, 200],
+    },
+  }),
+  textStream,
+  { mode: "stream-duplex" }
+);
+
+for await (const part of speechStream) {
+  // each part is a Buffer with MP3 audio data
+}
+```
 
 ### [Describe Image](https://modelfusion.dev/guide/function/describe-image)
 
@@ -603,6 +628,12 @@ Create a 19th century painting image for your input.
 
 Record audio with push-to-talk and transcribe it using Whisper, implemented as a Next.js app. The app shows a list of the transcriptions.
 
+### [Duplex Speech Streaming (Vite (React) + Fastify)](https://github.com/lgrammel/modelfusion/tree/main/examples/duplex-speech-streaming-vite-react-fastify)
+
+> _Speech Streaming_, _OpenAI_, _Elevenlabs streaming_, _Vite_, _Fastify_
+
+Given a prompt, the server returns both a text stream and a speech stream response.
+
 ### [BabyAGI Agent](https://github.com/lgrammel/modelfusion/tree/main/examples/babyagi-agent)
 
 > _terminal app_, _agent_, _BabyAGI_
@@ -627,6 +658,12 @@ Small agent that solves middle school math problems. It uses a calculator tool t
 
 Extracts information about a topic from a PDF and writes a tweet in your own style about it.
 
+### [Cloudflare Workers](https://github.com/lgrammel/modelfusion/tree/main/examples/cloudflare-workers)
+
+> _Cloudflare_, _OpenAI_
+
+Generate text on a Cloudflare Worker using ModelFusion and OpenAI.
+
 ## Contributing
 
 ### [Contributing Guide](https://github.com/lgrammel/modelfusion/blob/main/CONTRIBUTING.md)
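The duplex streaming mode documented above pairs a text stream with a speech stream. As a minimal sketch of how that stream could be forwarded to a client over HTTP (the pattern the Fastify example uses): the route shape, model construction, and `audio/mpeg` content type below are assumptions, not taken from this diff.

```ts
import Fastify from "fastify";
import {
  ElevenLabsSpeechSynthesisModel,
  OpenAIChatModel,
  streamText,
  synthesizeSpeech,
} from "modelfusion";

const fastify = Fastify();

// Hypothetical route: stream generated text into the duplex speech model
// and pipe the resulting MP3 chunks into the HTTP response as they arrive.
fastify.post("/speech", async (request, reply) => {
  const textStream = await streamText(
    new OpenAIChatModel({ model: "gpt-3.5-turbo" }),
    [{ role: "user" as const, content: String(request.body) }]
  );

  const speechStream = await synthesizeSpeech(
    new ElevenLabsSpeechSynthesisModel({ voice: "pNInz6obpgDQGcFmaJgB" }),
    textStream,
    { mode: "stream-duplex" }
  );

  reply.raw.writeHead(200, { "Content-Type": "audio/mpeg" });
  for await (const part of speechStream) {
    reply.raw.write(part); // each part is a Buffer with MP3 audio data
  }
  reply.raw.end();
});
```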
package/core/getRun.cjs CHANGED
@@ -25,10 +25,12 @@ var __importStar = (this && this.__importStar) || function (mod) {
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.withRun = exports.getRun = void 0;
 let runStorage;
-const isNode = typeof process !== "undefined" &&
-    process.versions != null &&
-    process.versions.node != null;
 async function ensureLoaded() {
+    // Note: using process[versions] instead of process.versions to avoid Next.js edge runtime warnings.
+    const versions = "versions";
+    const isNode = typeof process !== "undefined" &&
+        process[versions] != null &&
+        process[versions].node != null;
     if (!isNode)
         return Promise.resolve();
     if (!runStorage) {
package/core/getRun.js CHANGED
@@ -1,8 +1,10 @@
 let runStorage;
-const isNode = typeof process !== "undefined" &&
-    process.versions != null &&
-    process.versions.node != null;
 async function ensureLoaded() {
+    // Note: using process[versions] instead of process.versions to avoid Next.js edge runtime warnings.
+    const versions = "versions";
+    const isNode = typeof process !== "undefined" &&
+        process[versions] != null &&
+        process[versions].node != null;
     if (!isNode)
         return Promise.resolve();
     if (!runStorage) {
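The getRun change in both builds is the same: the Node check moved inside `ensureLoaded()` and reads `process[versions]` through a variable, so bundlers that statically scan for `process.versions` (such as the Next.js edge runtime checks) no longer flag the module. A minimal sketch of the overall pattern, assuming the guarded Node-only dependency is `AsyncLocalStorage` (only the check itself appears in this diff):

```ts
let runStorage: unknown;

async function ensureLoaded() {
  // Indexing with a variable defeats static analysis, so edge bundlers
  // never see a literal `process.versions` property access.
  const versions = "versions";
  const isNode =
    typeof process !== "undefined" &&
    (process as any)[versions] != null &&
    (process as any)[versions].node != null;

  if (!isNode) return; // no-op outside Node.js (browser, edge runtime)

  if (!runStorage) {
    // Dynamic import keeps the Node built-in out of non-Node bundles.
    // (Assumption: the module being guarded is node:async_hooks.)
    const { AsyncLocalStorage } = await import("node:async_hooks");
    runStorage = new AsyncLocalStorage();
  }
}
```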
package/index.cjs CHANGED
@@ -25,5 +25,6 @@ __exportStar(require("./observability/index.cjs"), exports);
 __exportStar(require("./retriever/index.cjs"), exports);
 __exportStar(require("./text-chunk/index.cjs"), exports);
 __exportStar(require("./tool/index.cjs"), exports);
+__exportStar(require("./ui/index.cjs"), exports);
 __exportStar(require("./util/index.cjs"), exports);
 __exportStar(require("./vector-index/index.cjs"), exports);
package/index.d.ts CHANGED
@@ -9,5 +9,6 @@ export * from "./observability/index.js";
 export * from "./retriever/index.js";
 export * from "./text-chunk/index.js";
 export * from "./tool/index.js";
+export * from "./ui/index.js";
 export * from "./util/index.js";
 export * from "./vector-index/index.js";
package/index.js CHANGED
@@ -9,5 +9,6 @@ export * from "./observability/index.js";
 export * from "./retriever/index.js";
 export * from "./text-chunk/index.js";
 export * from "./tool/index.js";
+export * from "./ui/index.js";
 export * from "./util/index.js";
 export * from "./vector-index/index.js";
package/model-function/synthesize-speech/SpeechSynthesisModel.d.ts CHANGED
@@ -1,5 +1,6 @@
 /// <reference types="node" />
 import { FunctionOptions } from "../../core/FunctionOptions.js";
+import { Delta } from "../../model-function/Delta.js";
 import { Model, ModelSettings } from "../Model.js";
 export interface SpeechSynthesisModelSettings extends ModelSettings {
 }
@@ -7,5 +8,8 @@ export interface SpeechSynthesisModel<SETTINGS extends SpeechSynthesisModelSetti
     /**
      * Generates an mp3 audio buffer that contains the speech for the given text.
      */
-    generateSpeechResponse: (text: string, options?: FunctionOptions) => PromiseLike<Buffer>;
+    doSynthesizeSpeechStandard(text: string, options?: FunctionOptions): PromiseLike<Buffer>;
+}
+export interface DuplexSpeechSynthesisModel<SETTINGS extends SpeechSynthesisModelSettings = SpeechSynthesisModelSettings> extends SpeechSynthesisModel<SETTINGS> {
+    doSynthesizeSpeechStreamDuplex(textStream: AsyncIterable<string>, options?: FunctionOptions): PromiseLike<AsyncIterable<Delta<Buffer>>>;
 }
package/model-function/synthesize-speech/synthesizeSpeech.cjs CHANGED
@@ -1,24 +1,67 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.synthesizeSpeech = void 0;
-const executeCall_js_1 = require("../executeCall.cjs");
+const AsyncIterableResultPromise_js_1 = require("../../model-function/AsyncIterableResultPromise.cjs");
 const ModelFunctionPromise_js_1 = require("../ModelFunctionPromise.cjs");
-/**
- * Synthesizes speech from text.
- */
+const executeCall_js_1 = require("../executeCall.cjs");
 function synthesizeSpeech(model, text, options) {
-    return new ModelFunctionPromise_js_1.ModelFunctionPromise((0, executeCall_js_1.executeCall)({
-        functionType: "speech-synthesis",
-        input: text,
-        model,
-        options,
-        generateResponse: async (options) => {
-            const response = await model.generateSpeechResponse(text, options);
-            return {
-                response,
-                extractedValue: response,
-            };
-        },
-    }));
+    const mode = options?.mode ?? "standard";
+    switch (mode) {
+        case "standard": {
+            if (typeof text !== "string") {
+                throw new Error(`The "standard" mode only supports a string input, but received ${text}`);
+            }
+            return new ModelFunctionPromise_js_1.ModelFunctionPromise((0, executeCall_js_1.executeCall)({
+                functionType: "speech-synthesis",
+                input: text,
+                model,
+                options,
+                generateResponse: async (options) => {
+                    const response = await model.doSynthesizeSpeechStandard(text, options);
+                    return {
+                        response,
+                        extractedValue: response,
+                    };
+                },
+            }));
+        }
+        case "stream-duplex": {
+            if (typeof text === "string") {
+                throw new Error(`The "stream-duplex" mode only supports an AsyncIterable<string> input, but received ${text}`);
+            }
+            if (!("doSynthesizeSpeechStreamDuplex" in model) ||
+                typeof model.doSynthesizeSpeechStreamDuplex !== "function") {
+                throw new Error(`The "stream-duplex" mode is not supported by this model.`);
+            }
+            return new AsyncIterableResultPromise_js_1.AsyncIterableResultPromise(doSynthesizeSpeechStreamDuplex(model, text, options));
+        }
+        default: {
+            const mode_ = mode;
+            throw new Error(`Unsupported mode: ${mode_}`);
+        }
+    }
 }
 exports.synthesizeSpeech = synthesizeSpeech;
+async function doSynthesizeSpeechStreamDuplex(model, text, options) {
+    const speechDeltas = await model.doSynthesizeSpeechStreamDuplex(text, options);
+    // Convert the speechDeltas (AsyncIterable<Delta<Buffer>>) to an AsyncIterable<Buffer>
+    const bufferStream = convertDeltasToBuffers(speechDeltas);
+    return {
+        output: bufferStream,
+        metadata: {
+            model: model.modelInformation,
+            callId: "test",
+            startTimestamp: new Date(),
+        },
+    };
+}
+async function* convertDeltasToBuffers(deltas) {
+    for await (const delta of deltas) {
+        switch (delta.type) {
+            case "error":
+                throw delta.error;
+            case "delta":
+                yield delta.valueDelta;
+        }
+    }
+}
package/model-function/synthesize-speech/synthesizeSpeech.d.ts CHANGED
@@ -1,8 +1,14 @@
 /// <reference types="node" />
 import { FunctionOptions } from "../../core/FunctionOptions.js";
+import { AsyncIterableResultPromise } from "../../model-function/AsyncIterableResultPromise.js";
 import { ModelFunctionPromise } from "../ModelFunctionPromise.js";
-import { SpeechSynthesisModel, SpeechSynthesisModelSettings } from "./SpeechSynthesisModel.js";
+import { DuplexSpeechSynthesisModel, SpeechSynthesisModel, SpeechSynthesisModelSettings } from "./SpeechSynthesisModel.js";
 /**
  * Synthesizes speech from text.
  */
-export declare function synthesizeSpeech(model: SpeechSynthesisModel<SpeechSynthesisModelSettings>, text: string, options?: FunctionOptions): ModelFunctionPromise<Buffer>;
+export declare function synthesizeSpeech(model: SpeechSynthesisModel<SpeechSynthesisModelSettings>, text: string, options?: FunctionOptions & {
+    mode?: "standard";
+}): ModelFunctionPromise<Buffer>;
+export declare function synthesizeSpeech(model: DuplexSpeechSynthesisModel<SpeechSynthesisModelSettings>, text: AsyncIterable<string>, options: FunctionOptions & {
+    mode: "stream-duplex";
+}): AsyncIterableResultPromise<Buffer>;
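These overloads encode the mode split in the types: a plain string with an optional `"standard"` mode yields a single `Buffer`, while an `AsyncIterable<string>` with a required `"stream-duplex"` mode yields a stream of buffers. A usage sketch matching the declarations (the voice ID and prompt are placeholders):

```ts
import {
  ElevenLabsSpeechSynthesisModel,
  synthesizeSpeech,
} from "modelfusion";

async function demo(textStream: AsyncIterable<string>) {
  const model = new ElevenLabsSpeechSynthesisModel({
    voice: "pNInz6obpgDQGcFmaJgB", // placeholder voice ID
  });

  // Overload 1: standard mode resolves to a single MP3 Buffer.
  const speech: Buffer = await synthesizeSpeech(model, "Hello, world!");

  // Overload 2: duplex mode resolves to an AsyncIterable<Buffer>.
  const speechStream = await synthesizeSpeech(model, textStream, {
    mode: "stream-duplex",
  });
  for await (const chunk of speechStream) {
    // chunk is a Buffer with MP3 audio data
  }
}
```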
package/model-function/synthesize-speech/synthesizeSpeech.js CHANGED
@@ -1,20 +1,63 @@
-import { executeCall } from "../executeCall.js";
+import { AsyncIterableResultPromise } from "../../model-function/AsyncIterableResultPromise.js";
 import { ModelFunctionPromise } from "../ModelFunctionPromise.js";
-/**
- * Synthesizes speech from text.
- */
+import { executeCall } from "../executeCall.js";
 export function synthesizeSpeech(model, text, options) {
-    return new ModelFunctionPromise(executeCall({
-        functionType: "speech-synthesis",
-        input: text,
-        model,
-        options,
-        generateResponse: async (options) => {
-            const response = await model.generateSpeechResponse(text, options);
-            return {
-                response,
-                extractedValue: response,
-            };
+    const mode = options?.mode ?? "standard";
+    switch (mode) {
+        case "standard": {
+            if (typeof text !== "string") {
+                throw new Error(`The "standard" mode only supports a string input, but received ${text}`);
+            }
+            return new ModelFunctionPromise(executeCall({
+                functionType: "speech-synthesis",
+                input: text,
+                model,
+                options,
+                generateResponse: async (options) => {
+                    const response = await model.doSynthesizeSpeechStandard(text, options);
+                    return {
+                        response,
+                        extractedValue: response,
+                    };
+                },
+            }));
+        }
+        case "stream-duplex": {
+            if (typeof text === "string") {
+                throw new Error(`The "stream-duplex" mode only supports an AsyncIterable<string> input, but received ${text}`);
+            }
+            if (!("doSynthesizeSpeechStreamDuplex" in model) ||
+                typeof model.doSynthesizeSpeechStreamDuplex !== "function") {
+                throw new Error(`The "stream-duplex" mode is not supported by this model.`);
+            }
+            return new AsyncIterableResultPromise(doSynthesizeSpeechStreamDuplex(model, text, options));
+        }
+        default: {
+            const mode_ = mode;
+            throw new Error(`Unsupported mode: ${mode_}`);
+        }
+    }
+}
+async function doSynthesizeSpeechStreamDuplex(model, text, options) {
+    const speechDeltas = await model.doSynthesizeSpeechStreamDuplex(text, options);
+    // Convert the speechDeltas (AsyncIterable<Delta<Buffer>>) to an AsyncIterable<Buffer>
+    const bufferStream = convertDeltasToBuffers(speechDeltas);
+    return {
+        output: bufferStream,
+        metadata: {
+            model: model.modelInformation,
+            callId: "test",
+            startTimestamp: new Date(),
         },
-    }));
+    };
+}
+async function* convertDeltasToBuffers(deltas) {
+    for await (const delta of deltas) {
+        switch (delta.type) {
+            case "error":
+                throw delta.error;
+            case "delta":
+                yield delta.valueDelta;
+        }
+    }
 }
package/model-provider/elevenlabs/ElevenLabsApiConfiguration.cjs CHANGED
@@ -18,5 +18,8 @@ class ElevenLabsApiConfiguration extends BaseUrlApiConfiguration_js_1.BaseUrlApi
             throttle,
         });
     }
+    get apiKey() {
+        return this.headers["xi-api-key"];
+    }
 }
 exports.ElevenLabsApiConfiguration = ElevenLabsApiConfiguration;
package/model-provider/elevenlabs/ElevenLabsApiConfiguration.d.ts CHANGED
@@ -8,4 +8,5 @@ export declare class ElevenLabsApiConfiguration extends BaseUrlApiConfiguration
         retry?: RetryFunction;
         throttle?: ThrottleFunction;
     });
+    get apiKey(): string;
 }
package/model-provider/elevenlabs/ElevenLabsApiConfiguration.js CHANGED
@@ -15,4 +15,7 @@ export class ElevenLabsApiConfiguration extends BaseUrlApiConfiguration {
             throttle,
         });
     }
+    get apiKey() {
+        return this.headers["xi-api-key"];
+    }
 }
package/model-provider/elevenlabs/ElevenLabsSpeechSynthesisModel.cjs CHANGED
@@ -1,11 +1,21 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.ElevenLabsSpeechSynthesisModel = void 0;
+const zod_1 = require("zod");
 const callWithRetryAndThrottle_js_1 = require("../../core/api/callWithRetryAndThrottle.cjs");
 const postToApi_js_1 = require("../../core/api/postToApi.cjs");
+const AsyncQueue_js_1 = require("../../event-source/AsyncQueue.cjs");
 const AbstractModel_js_1 = require("../../model-function/AbstractModel.cjs");
+const SimpleWebSocket_js_1 = require("../../util/SimpleWebSocket.cjs");
+const parseJSON_js_1 = require("../../util/parseJSON.cjs");
 const ElevenLabsApiConfiguration_js_1 = require("./ElevenLabsApiConfiguration.cjs");
 const ElevenLabsError_js_1 = require("./ElevenLabsError.cjs");
+const elevenLabsModels = [
+    "eleven_multilingual_v2",
+    "eleven_multilingual_v1",
+    "eleven_monolingual_v1",
+];
+const defaultModel = "eleven_multilingual_v2";
 /**
  * Synthesize speech using the ElevenLabs Text to Speech API.
  *
@@ -45,9 +55,101 @@ class ElevenLabsSpeechSynthesisModel extends AbstractModel_js_1.AbstractModel {
             voiceSettings: this.settings.voiceSettings,
         };
     }
-    generateSpeechResponse(text, options) {
+    doSynthesizeSpeechStandard(text, options) {
         return this.callAPI(text, options);
     }
+    async doSynthesizeSpeechStreamDuplex(textStream
+    // options?: FunctionOptions | undefined
+    ) {
+        const responseSchema = zod_1.z.union([
+            zod_1.z.object({
+                audio: zod_1.z.string(),
+                isFinal: zod_1.z.literal(false).nullable(),
+                normalizedAlignment: zod_1.z
+                    .object({
+                        chars: zod_1.z.array(zod_1.z.string()),
+                        charStartTimesMs: zod_1.z.array(zod_1.z.number()),
+                        charDurationsMs: zod_1.z.array(zod_1.z.number()),
+                    })
+                    .nullable(),
+            }),
+            zod_1.z.object({
+                isFinal: zod_1.z.literal(true),
+            }),
+            zod_1.z.object({
+                message: zod_1.z.string(),
+                error: zod_1.z.string(),
+                code: zod_1.z.number(),
+            }),
+        ]);
+        const queue = new AsyncQueue_js_1.AsyncQueue();
+        const model = this.settings.model ?? defaultModel;
+        const socket = await (0, SimpleWebSocket_js_1.createSimpleWebSocket)(`wss://api.elevenlabs.io/v1/text-to-speech/${this.settings.voice}/stream-input?model_id=${model}`);
+        socket.onopen = async () => {
+            const api = this.settings.api ?? new ElevenLabsApiConfiguration_js_1.ElevenLabsApiConfiguration();
+            // send begin-of-stream (BOS) message:
+            socket.send(JSON.stringify({
+                // The JS WebSocket API does not support authorization headers, so we send the API key in the BOS message.
+                // See https://stackoverflow.com/questions/4361173/http-headers-in-websockets-client-api
+                xi_api_key: api.apiKey,
+                text: " ",
+                voice_settings: toApiVoiceSettings(this.settings.voiceSettings),
+                generation_config: toGenerationConfig(this.settings.generationConfig),
+            }));
+            // send text in chunks:
+            let textBuffer = "";
+            for await (const textDelta of textStream) {
+                textBuffer += textDelta;
+                // using ". " as separator: sending in full sentences improves the quality
+                // of the audio output significantly.
+                const separator = textBuffer.lastIndexOf(". ");
+                if (separator === -1) {
+                    continue;
+                }
+                const textToProcess = textBuffer.slice(0, separator);
+                textBuffer = textBuffer.slice(separator + 1);
+                socket.send(JSON.stringify({
+                    text: textToProcess,
+                    try_trigger_generation: true,
+                }));
+            }
+            // send remaining text:
+            if (textBuffer.length > 0) {
+                socket.send(JSON.stringify({
+                    text: `${textBuffer} `,
+                    try_trigger_generation: true,
+                }));
+            }
+            // send end-of-stream (EOS) message:
+            socket.send(JSON.stringify({ text: "" }));
+        };
+        socket.onmessage = (event) => {
+            const parseResult = (0, parseJSON_js_1.safeParseJsonWithZod)(event.data, responseSchema);
+            if (!parseResult.success) {
+                queue.push({ type: "error", error: parseResult.error });
+                return;
+            }
+            const response = parseResult.data;
+            if ("error" in response) {
+                queue.push({ type: "error", error: response });
+                return;
+            }
+            if (!response.isFinal) {
+                queue.push({
+                    type: "delta",
+                    fullDelta: event,
+                    valueDelta: Buffer.from(response.audio, "base64"),
+                });
+            }
+        };
+        socket.onerror = (error) => {
+            queue.push({ type: "error", error });
+        };
+        socket.onclose = () => {
+            queue.close();
+        };
+        return queue;
+    }
     withSettings(additionalSettings) {
         return new ElevenLabsSpeechSynthesisModel({
             ...this.settings,
@@ -62,18 +164,28 @@ async function callElevenLabsTextToSpeechAPI({ api = new ElevenLabsApiConfigurat
         headers: api.headers,
         body: {
             text,
-            model_id: modelId,
-            voice_settings: voiceSettings != null
-                ? {
-                    stability: voiceSettings.stability,
-                    similarity_boost: voiceSettings.similarityBoost,
-                    style: voiceSettings.style,
-                    use_speaker_boost: voiceSettings.useSpeakerBoost,
-                }
-                : undefined,
+            model_id: modelId ?? defaultModel,
+            voice_settings: toApiVoiceSettings(voiceSettings),
         },
         failedResponseHandler: ElevenLabsError_js_1.failedElevenLabsCallResponseHandler,
         successfulResponseHandler: (0, postToApi_js_1.createAudioMpegResponseHandler)(),
         abortSignal,
    });
 }
+function toApiVoiceSettings(voiceSettings) {
+    return voiceSettings != null
+        ? {
+            stability: voiceSettings.stability,
+            similarity_boost: voiceSettings.similarityBoost,
+            style: voiceSettings.style,
+            use_speaker_boost: voiceSettings.useSpeakerBoost,
+        }
+        : undefined;
+}
+function toGenerationConfig(generationConfig) {
+    return generationConfig != null
+        ? {
+            chunk_length_schedule: generationConfig.chunkLengthSchedule,
+        }
+        : undefined;
+}
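One detail worth isolating from the `onopen` handler above: incoming text deltas are buffered and only flushed up to the last `". "` occurrence, because sending full sentences noticeably improves the audio quality. A standalone sketch of that rule (a hypothetical helper, not part of the package):

```ts
// Mirrors the buffering logic in doSynthesizeSpeechStreamDuplex: returns
// the sentence-aligned prefix to send now and the remainder to keep buffering.
function splitAtLastSentence(buffer: string): {
  send: string | null;
  rest: string;
} {
  const separator = buffer.lastIndexOf(". ");
  if (separator === -1) {
    return { send: null, rest: buffer }; // no complete sentence yet
  }
  return {
    send: buffer.slice(0, separator), // up to (not including) the final "."
    rest: buffer.slice(separator + 1), // remainder starts at the space
  };
}

// Example: splitAtLastSentence("Hello world. How are")
//   -> { send: "Hello world", rest: " How are" }
```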
package/model-provider/elevenlabs/ElevenLabsSpeechSynthesisModel.d.ts CHANGED
@@ -2,17 +2,24 @@
 import { FunctionOptions } from "../../core/FunctionOptions.js";
 import { ApiConfiguration } from "../../core/api/ApiConfiguration.js";
 import { AbstractModel } from "../../model-function/AbstractModel.js";
+import { Delta } from "../../model-function/Delta.js";
 import { SpeechSynthesisModel, SpeechSynthesisModelSettings } from "../../model-function/synthesize-speech/SpeechSynthesisModel.js";
+declare const elevenLabsModels: readonly ["eleven_multilingual_v2", "eleven_multilingual_v1", "eleven_monolingual_v1"];
 export interface ElevenLabsSpeechSynthesisModelSettings extends SpeechSynthesisModelSettings {
-    api?: ApiConfiguration;
+    api?: ApiConfiguration & {
+        apiKey: string;
+    };
     voice: string;
-    model?: string;
+    model?: (typeof elevenLabsModels)[number] | (string & {});
     voiceSettings?: {
         stability: number;
         similarityBoost: number;
         style?: number;
         useSpeakerBoost?: boolean;
     };
+    generationConfig?: {
+        chunkLengthSchedule: number[];
+    };
 }
 /**
  * Synthesize speech using the ElevenLabs Text to Speech API.
@@ -25,6 +32,8 @@ export declare class ElevenLabsSpeechSynthesisModel extends AbstractModel<Eleven
     get modelName(): string;
     private callAPI;
     get settingsForEvent(): Partial<ElevenLabsSpeechSynthesisModelSettings>;
-    generateSpeechResponse(text: string, options?: FunctionOptions): Promise<Buffer>;
+    doSynthesizeSpeechStandard(text: string, options?: FunctionOptions): Promise<Buffer>;
+    doSynthesizeSpeechStreamDuplex(textStream: AsyncIterable<string>): Promise<AsyncIterable<Delta<Buffer>>>;
     withSettings(additionalSettings: Partial<ElevenLabsSpeechSynthesisModelSettings>): this;
 }
+export {};
package/model-provider/elevenlabs/ElevenLabsSpeechSynthesisModel.js CHANGED
@@ -1,8 +1,18 @@
+import { z } from "zod";
 import { callWithRetryAndThrottle } from "../../core/api/callWithRetryAndThrottle.js";
 import { createAudioMpegResponseHandler, postJsonToApi, } from "../../core/api/postToApi.js";
+import { AsyncQueue } from "../../event-source/AsyncQueue.js";
 import { AbstractModel } from "../../model-function/AbstractModel.js";
+import { createSimpleWebSocket } from "../../util/SimpleWebSocket.js";
+import { safeParseJsonWithZod } from "../../util/parseJSON.js";
 import { ElevenLabsApiConfiguration } from "./ElevenLabsApiConfiguration.js";
 import { failedElevenLabsCallResponseHandler } from "./ElevenLabsError.js";
+const elevenLabsModels = [
+    "eleven_multilingual_v2",
+    "eleven_multilingual_v1",
+    "eleven_monolingual_v1",
+];
+const defaultModel = "eleven_multilingual_v2";
 /**
  * Synthesize speech using the ElevenLabs Text to Speech API.
  *
@@ -42,9 +52,101 @@ export class ElevenLabsSpeechSynthesisModel extends AbstractModel {
         voiceSettings: this.settings.voiceSettings,
        };
     }
-    generateSpeechResponse(text, options) {
+    doSynthesizeSpeechStandard(text, options) {
         return this.callAPI(text, options);
     }
+    async doSynthesizeSpeechStreamDuplex(textStream
+    // options?: FunctionOptions | undefined
+    ) {
+        const responseSchema = z.union([
+            z.object({
+                audio: z.string(),
+                isFinal: z.literal(false).nullable(),
+                normalizedAlignment: z
+                    .object({
+                        chars: z.array(z.string()),
+                        charStartTimesMs: z.array(z.number()),
+                        charDurationsMs: z.array(z.number()),
+                    })
+                    .nullable(),
+            }),
+            z.object({
+                isFinal: z.literal(true),
+            }),
+            z.object({
+                message: z.string(),
+                error: z.string(),
+                code: z.number(),
+            }),
+        ]);
+        const queue = new AsyncQueue();
+        const model = this.settings.model ?? defaultModel;
+        const socket = await createSimpleWebSocket(`wss://api.elevenlabs.io/v1/text-to-speech/${this.settings.voice}/stream-input?model_id=${model}`);
+        socket.onopen = async () => {
+            const api = this.settings.api ?? new ElevenLabsApiConfiguration();
+            // send begin-of-stream (BOS) message:
+            socket.send(JSON.stringify({
+                // The JS WebSocket API does not support authorization headers, so we send the API key in the BOS message.
+                // See https://stackoverflow.com/questions/4361173/http-headers-in-websockets-client-api
+                xi_api_key: api.apiKey,
+                text: " ",
+                voice_settings: toApiVoiceSettings(this.settings.voiceSettings),
+                generation_config: toGenerationConfig(this.settings.generationConfig),
+            }));
+            // send text in chunks:
+            let textBuffer = "";
+            for await (const textDelta of textStream) {
+                textBuffer += textDelta;
+                // using ". " as separator: sending in full sentences improves the quality
+                // of the audio output significantly.
+                const separator = textBuffer.lastIndexOf(". ");
+                if (separator === -1) {
+                    continue;
+                }
+                const textToProcess = textBuffer.slice(0, separator);
+                textBuffer = textBuffer.slice(separator + 1);
+                socket.send(JSON.stringify({
+                    text: textToProcess,
+                    try_trigger_generation: true,
+                }));
+            }
+            // send remaining text:
+            if (textBuffer.length > 0) {
+                socket.send(JSON.stringify({
+                    text: `${textBuffer} `,
+                    try_trigger_generation: true,
+                }));
+            }
+            // send end-of-stream (EOS) message:
+            socket.send(JSON.stringify({ text: "" }));
+        };
+        socket.onmessage = (event) => {
+            const parseResult = safeParseJsonWithZod(event.data, responseSchema);
+            if (!parseResult.success) {
+                queue.push({ type: "error", error: parseResult.error });
+                return;
+            }
+            const response = parseResult.data;
+            if ("error" in response) {
+                queue.push({ type: "error", error: response });
+                return;
+            }
+            if (!response.isFinal) {
+                queue.push({
+                    type: "delta",
+                    fullDelta: event,
+                    valueDelta: Buffer.from(response.audio, "base64"),
+                });
+            }
+        };
+        socket.onerror = (error) => {
+            queue.push({ type: "error", error });
+        };
+        socket.onclose = () => {
+            queue.close();
+        };
+        return queue;
+    }
     withSettings(additionalSettings) {
         return new ElevenLabsSpeechSynthesisModel({
             ...this.settings,
@@ -58,18 +160,28 @@ async function callElevenLabsTextToSpeechAPI({ api = new ElevenLabsApiConfigurat
         headers: api.headers,
         body: {
             text,
-            model_id: modelId,
-            voice_settings: voiceSettings != null
-                ? {
-                    stability: voiceSettings.stability,
-                    similarity_boost: voiceSettings.similarityBoost,
-                    style: voiceSettings.style,
-                    use_speaker_boost: voiceSettings.useSpeakerBoost,
-                }
-                : undefined,
+            model_id: modelId ?? defaultModel,
+            voice_settings: toApiVoiceSettings(voiceSettings),
         },
         failedResponseHandler: failedElevenLabsCallResponseHandler,
         successfulResponseHandler: createAudioMpegResponseHandler(),
         abortSignal,
     });
 }
+function toApiVoiceSettings(voiceSettings) {
+    return voiceSettings != null
+        ? {
+            stability: voiceSettings.stability,
+            similarity_boost: voiceSettings.similarityBoost,
+            style: voiceSettings.style,
+            use_speaker_boost: voiceSettings.useSpeakerBoost,
+        }
+        : undefined;
+}
+function toGenerationConfig(generationConfig) {
+    return generationConfig != null
+        ? {
+            chunk_length_schedule: generationConfig.chunkLengthSchedule,
+        }
+        : undefined;
+}
package/model-provider/lmnt/LmntSpeechSynthesisModel.cjs CHANGED
@@ -43,7 +43,7 @@ class LmntSpeechSynthesisModel extends AbstractModel_js_1.AbstractModel {
             length: this.settings.length,
         };
     }
-    generateSpeechResponse(text, options) {
+    doSynthesizeSpeechStandard(text, options) {
         return this.callAPI(text, options);
     }
     withSettings(additionalSettings) {
package/model-provider/lmnt/LmntSpeechSynthesisModel.d.ts CHANGED
@@ -21,6 +21,6 @@ export declare class LmntSpeechSynthesisModel extends AbstractModel<LmntSpeechSy
     get modelName(): string;
     private callAPI;
     get settingsForEvent(): Partial<LmntSpeechSynthesisModelSettings>;
-    generateSpeechResponse(text: string, options?: FunctionOptions): Promise<Buffer>;
+    doSynthesizeSpeechStandard(text: string, options?: FunctionOptions): Promise<Buffer>;
     withSettings(additionalSettings: Partial<LmntSpeechSynthesisModelSettings>): this;
 }
package/model-provider/lmnt/LmntSpeechSynthesisModel.js CHANGED
@@ -40,7 +40,7 @@ export class LmntSpeechSynthesisModel extends AbstractModel {
             length: this.settings.length,
         };
     }
-    generateSpeechResponse(text, options) {
+    doSynthesizeSpeechStandard(text, options) {
         return this.callAPI(text, options);
     }
     withSettings(additionalSettings) {
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "modelfusion",
   "description": "Build multimodal applications, chatbots, and agents with JavaScript and TypeScript.",
-  "version": "0.47.3",
+  "version": "0.48.0",
   "author": "Lars Grammel",
   "license": "MIT",
   "keywords": [
@@ -57,6 +57,7 @@
     "js-tiktoken": "1.0.7",
     "nanoid": "3.3.6",
     "secure-json-parse": "2.7.0",
+    "ws": "8.14.2",
     "zod": "3.22.4",
     "zod-to-json-schema": "3.21.4"
   },
@@ -64,6 +65,7 @@
     "@tsconfig/recommended": "1.0.3",
     "@types/deep-equal": "^1.0.2",
     "@types/node": "18.11.9",
+    "@types/ws": "^8.5.7",
     "@typescript-eslint/eslint-plugin": "^6.1.0",
     "@typescript-eslint/parser": "^6.1.0",
     "copyfiles": "2.4.1",
package/ui/MediaSourceAppender.cjs ADDED
@@ -0,0 +1,54 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.MediaSourceAppender = void 0;
+class MediaSourceAppender {
+    constructor(type) {
+        Object.defineProperty(this, "mediaSource", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: new MediaSource()
+        });
+        Object.defineProperty(this, "audioChunks", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: []
+        });
+        Object.defineProperty(this, "sourceBuffer", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        this.mediaSource.addEventListener("sourceopen", async () => {
+            this.sourceBuffer = this.mediaSource.addSourceBuffer(type);
+            this.sourceBuffer.addEventListener("updateend", () => {
+                this.tryAppendNextChunk();
+            });
+        });
+    }
+    tryAppendNextChunk() {
+        if (this.sourceBuffer != null &&
+            !this.sourceBuffer.updating &&
+            this.audioChunks.length > 0) {
+            this.sourceBuffer.appendBuffer(this.audioChunks.shift());
+        }
+    }
+    addBase64Data(base64Data) {
+        this.addData(Uint8Array.from(atob(base64Data), (char) => char.charCodeAt(0)).buffer);
+    }
+    addData(data) {
+        this.audioChunks.push(data);
+        this.tryAppendNextChunk();
+    }
+    close() {
+        if (this.mediaSource.readyState === "open") {
+            this.mediaSource.endOfStream();
+        }
+    }
+    get mediaSourceUrl() {
+        return URL.createObjectURL(this.mediaSource);
+    }
+}
+exports.MediaSourceAppender = MediaSourceAppender;
package/ui/MediaSourceAppender.d.ts ADDED
@@ -0,0 +1,11 @@
+export declare class MediaSourceAppender {
+    private readonly mediaSource;
+    private readonly audioChunks;
+    private sourceBuffer?;
+    constructor(type: string);
+    private tryAppendNextChunk;
+    addBase64Data(base64Data: string): void;
+    addData(data: ArrayBuffer): void;
+    close(): void;
+    get mediaSourceUrl(): string;
+}
package/ui/MediaSourceAppender.js ADDED
@@ -0,0 +1,50 @@
+export class MediaSourceAppender {
+    constructor(type) {
+        Object.defineProperty(this, "mediaSource", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: new MediaSource()
+        });
+        Object.defineProperty(this, "audioChunks", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: []
+        });
+        Object.defineProperty(this, "sourceBuffer", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        this.mediaSource.addEventListener("sourceopen", async () => {
+            this.sourceBuffer = this.mediaSource.addSourceBuffer(type);
+            this.sourceBuffer.addEventListener("updateend", () => {
+                this.tryAppendNextChunk();
+            });
+        });
+    }
+    tryAppendNextChunk() {
+        if (this.sourceBuffer != null &&
+            !this.sourceBuffer.updating &&
+            this.audioChunks.length > 0) {
+            this.sourceBuffer.appendBuffer(this.audioChunks.shift());
+        }
+    }
+    addBase64Data(base64Data) {
+        this.addData(Uint8Array.from(atob(base64Data), (char) => char.charCodeAt(0)).buffer);
+    }
+    addData(data) {
+        this.audioChunks.push(data);
+        this.tryAppendNextChunk();
+    }
+    close() {
+        if (this.mediaSource.readyState === "open") {
+            this.mediaSource.endOfStream();
+        }
+    }
+    get mediaSourceUrl() {
+        return URL.createObjectURL(this.mediaSource);
+    }
+}
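The new `MediaSourceAppender` turns incrementally arriving audio chunks into a playable object URL via the browser's MediaSource API. A browser-side usage sketch; the `<audio>` element, the `audio/mpeg` source type, and the chunk transport are assumptions, not part of this diff:

```ts
import { MediaSourceAppender } from "modelfusion";

// Play streamed MP3 audio as it arrives (e.g. from a fetch or WebSocket).
const appender = new MediaSourceAppender("audio/mpeg");

const audio = document.querySelector("audio")!;
audio.src = appender.mediaSourceUrl; // object URL backed by the MediaSource
audio.play();

// Called whenever a base64-encoded audio chunk arrives (transport assumed):
function onChunk(base64Audio: string) {
  appender.addBase64Data(base64Audio);
}

// Called once the stream is finished:
function onDone() {
  appender.close(); // signals end-of-stream to the MediaSource
}
```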
package/ui/index.cjs ADDED
@@ -0,0 +1,17 @@
+"use strict";
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+        desc = { enumerable: true, get: function() { return m[k]; } };
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+var __exportStar = (this && this.__exportStar) || function(m, exports) {
+    for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+__exportStar(require("./MediaSourceAppender.cjs"), exports);
package/ui/index.d.ts ADDED
@@ -0,0 +1 @@
+export * from "./MediaSourceAppender.js";
package/ui/index.js ADDED
@@ -0,0 +1 @@
+export * from "./MediaSourceAppender.js";
package/util/SimpleWebSocket.cjs ADDED
@@ -0,0 +1,41 @@
+"use strict";
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+        desc = { enumerable: true, get: function() { return m[k]; } };
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+    Object.defineProperty(o, "default", { enumerable: true, value: v });
+}) : function(o, v) {
+    o["default"] = v;
+});
+var __importStar = (this && this.__importStar) || function (mod) {
+    if (mod && mod.__esModule) return mod;
+    var result = {};
+    if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
+    __setModuleDefault(result, mod);
+    return result;
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.createSimpleWebSocket = void 0;
+/**
+ * Creates a simplified websocket connection. This function works in both Node.js and browser.
+ */
+async function createSimpleWebSocket(url) {
+    if (typeof window === "undefined") {
+        // Use ws library in Node.js:
+        const { default: WebSocket } = await Promise.resolve().then(() => __importStar(require("ws")));
+        return new WebSocket(url);
+    }
+    else {
+        // Use native WebSocket in browser:
+        return new WebSocket(url);
+    }
+}
+exports.createSimpleWebSocket = createSimpleWebSocket;
package/util/SimpleWebSocket.d.ts ADDED
@@ -0,0 +1,12 @@
+export interface SimpleWebSocket {
+    send(data: string): void;
+    onmessage: ((event: MessageEvent) => void) | null;
+    onopen: ((event: Event) => void) | null;
+    onclose: ((event: CloseEvent) => void) | null;
+    onerror: ((event: Event) => void) | null;
+    close(code?: number, reason?: string): void;
+}
+/**
+ * Creates a simplified websocket connection. This function works in both Node.js and browser.
+ */
+export declare function createSimpleWebSocket(url: string): Promise<SimpleWebSocket>;
package/util/SimpleWebSocket.js ADDED
@@ -0,0 +1,14 @@
+/**
+ * Creates a simplified websocket connection. This function works in both Node.js and browser.
+ */
+export async function createSimpleWebSocket(url) {
+    if (typeof window === "undefined") {
+        // Use ws library in Node.js:
+        const { default: WebSocket } = await import("ws");
+        return new WebSocket(url);
+    }
+    else {
+        // Use native WebSocket in browser:
+        return new WebSocket(url);
+    }
+}
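`createSimpleWebSocket` gives the ElevenLabs duplex model one socket surface across runtimes: the `ws` package in Node.js (hence the new dependency in package.json) and the native `WebSocket` in the browser. A usage sketch follows; whether the helper is re-exported from the package root is not shown in this diff, so the import path is an assumption.

```ts
import { createSimpleWebSocket } from "modelfusion"; // import path assumed

async function main() {
  // Works in Node.js (via ws) and in the browser (native WebSocket).
  const socket = await createSimpleWebSocket("wss://echo.websocket.org");

  socket.onopen = () => {
    socket.send("ping");
  };
  socket.onmessage = (event) => {
    console.log("received:", event.data);
    socket.close();
  };
  socket.onerror = (event) => {
    console.error("socket error:", event);
  };
}

main();
```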