modelfusion 0.47.3 → 0.48.0

package/README.md CHANGED
@@ -274,7 +274,11 @@ Providers: [OpenAI (Whisper)](https://modelfusion.dev/integration/model-provider
 
 ### [Synthesize Speech](https://modelfusion.dev/guide/function/synthesize-speech)
 
-Turn text into speech (audio).
+Generate speech (audio) from text. Also called TTS (text-to-speech).
+
+Providers: [Eleven Labs](https://modelfusion.dev/integration/model-provider/elevenlabs), [LMNT](https://modelfusion.dev/integration/model-provider/lmnt)
+
+#### Standard mode
 
 ```ts
 // `speech` is a Buffer with MP3 audio data
@@ -289,7 +293,28 @@ const speech = await synthesizeSpeech(
 );
 ```
 
-Providers: [Eleven Labs](https://modelfusion.dev/integration/model-provider/elevenlabs), [LMNT](https://modelfusion.dev/integration/model-provider/lmnt)
+#### Duplex streaming mode
+
+```ts
+const textStream = await streamText(/* ... */);
+
+const speechStream = await synthesizeSpeech(
+  new ElevenLabsSpeechSynthesisModel({
+    voice: "pNInz6obpgDQGcFmaJgB", // Adam
+    model: "eleven_monolingual_v1",
+    voiceSettings: { stability: 1, similarityBoost: 0.35 },
+    generationConfig: {
+      chunkLengthSchedule: [50, 90, 120, 150, 200],
+    },
+  }),
+  textStream,
+  { mode: "stream-duplex" }
+);
+
+for await (const part of speechStream) {
+  // each part is a Buffer with MP3 audio data
+}
+```
 
 ### [Describe Image](https://modelfusion.dev/guide/function/describe-image)
 
@@ -603,6 +628,12 @@ Create a 19th century painting image for your input.
 
 Record audio with push-to-talk and transcribe it using Whisper, implemented as a Next.js app. The app shows a list of the transcriptions.
 
+### [Duplex Speech Streaming (Vite (React) + Fastify)](https://github.com/lgrammel/modelfusion/tree/main/examples/duplex-speech-streaming-vite-react-fastify)
+
+> _Speech Streaming_, _OpenAI_, _Elevenlabs streaming_, _Vite_, _Fastify_
+
+Given a prompt, the server returns both a text stream and a speech stream response.
+
 ### [BabyAGI Agent](https://github.com/lgrammel/modelfusion/tree/main/examples/babyagi-agent)
 
 > _terminal app_, _agent_, _BabyAGI_
@@ -627,6 +658,12 @@ Small agent that solves middle school math problems. It uses a calculator tool t
 
 Extracts information about a topic from a PDF and writes a tweet in your own style about it.
 
+### [Cloudflare Workers](https://github.com/lgrammel/modelfusion/tree/main/examples/cloudflare-workers)
+
+> _Cloudflare_, _OpenAI_
+
+Generate text on a Cloudflare Worker using ModelFusion and OpenAI.
+
 ## Contributing
 
 ### [Contributing Guide](https://github.com/lgrammel/modelfusion/blob/main/CONTRIBUTING.md)
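The duplex streaming mode documented above pairs a text stream with a speech stream. As a minimal sketch of how that stream could be forwarded to a client over HTTP (the pattern the Fastify example uses): the route shape, model construction, and `audio/mpeg` content type below are assumptions, not taken from this diff.

```ts
import Fastify from "fastify";
import {
  ElevenLabsSpeechSynthesisModel,
  OpenAIChatModel,
  streamText,
  synthesizeSpeech,
} from "modelfusion";

const fastify = Fastify();

// Hypothetical route: stream generated text into the duplex speech model
// and pipe the resulting MP3 chunks into the HTTP response as they arrive.
fastify.post("/speech", async (request, reply) => {
  const textStream = await streamText(
    new OpenAIChatModel({ model: "gpt-3.5-turbo" }),
    [{ role: "user" as const, content: String(request.body) }]
  );

  const speechStream = await synthesizeSpeech(
    new ElevenLabsSpeechSynthesisModel({ voice: "pNInz6obpgDQGcFmaJgB" }),
    textStream,
    { mode: "stream-duplex" }
  );

  reply.raw.writeHead(200, { "Content-Type": "audio/mpeg" });
  for await (const part of speechStream) {
    reply.raw.write(part); // each part is a Buffer with MP3 audio data
  }
  reply.raw.end();
});
```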
package/core/getRun.cjs CHANGED
@@ -25,10 +25,12 @@ var __importStar = (this && this.__importStar) || function (mod) {
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.withRun = exports.getRun = void 0;
 let runStorage;
-const isNode = typeof process !== "undefined" &&
-    process.versions != null &&
-    process.versions.node != null;
 async function ensureLoaded() {
+    // Note: using process[versions] instead of process.versions to avoid Next.js edge runtime warnings.
+    const versions = "versions";
+    const isNode = typeof process !== "undefined" &&
+        process[versions] != null &&
+        process[versions].node != null;
     if (!isNode)
         return Promise.resolve();
     if (!runStorage) {
package/core/getRun.js CHANGED
@@ -1,8 +1,10 @@
 let runStorage;
-const isNode = typeof process !== "undefined" &&
-    process.versions != null &&
-    process.versions.node != null;
 async function ensureLoaded() {
+    // Note: using process[versions] instead of process.versions to avoid Next.js edge runtime warnings.
+    const versions = "versions";
+    const isNode = typeof process !== "undefined" &&
+        process[versions] != null &&
+        process[versions].node != null;
     if (!isNode)
         return Promise.resolve();
     if (!runStorage) {
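The getRun change in both builds is the same: the Node check moved inside `ensureLoaded()` and reads `process[versions]` through a variable, so bundlers that statically scan for `process.versions` (such as the Next.js edge runtime checks) no longer flag the module. A minimal sketch of the overall pattern, assuming the guarded Node-only dependency is `AsyncLocalStorage` (only the check itself appears in this diff):

```ts
let runStorage: unknown;

async function ensureLoaded() {
  // Indexing with a variable defeats static analysis, so edge bundlers
  // never see a literal `process.versions` property access.
  const versions = "versions";
  const isNode =
    typeof process !== "undefined" &&
    (process as any)[versions] != null &&
    (process as any)[versions].node != null;

  if (!isNode) return; // no-op outside Node.js (browser, edge runtime)

  if (!runStorage) {
    // Dynamic import keeps the Node built-in out of non-Node bundles.
    // (Assumption: the module being guarded is node:async_hooks.)
    const { AsyncLocalStorage } = await import("node:async_hooks");
    runStorage = new AsyncLocalStorage();
  }
}
```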
package/index.cjs CHANGED
@@ -25,5 +25,6 @@ __exportStar(require("./observability/index.cjs"), exports);
 __exportStar(require("./retriever/index.cjs"), exports);
 __exportStar(require("./text-chunk/index.cjs"), exports);
 __exportStar(require("./tool/index.cjs"), exports);
+__exportStar(require("./ui/index.cjs"), exports);
 __exportStar(require("./util/index.cjs"), exports);
 __exportStar(require("./vector-index/index.cjs"), exports);
package/index.d.ts CHANGED
@@ -9,5 +9,6 @@ export * from "./observability/index.js";
 export * from "./retriever/index.js";
 export * from "./text-chunk/index.js";
 export * from "./tool/index.js";
+export * from "./ui/index.js";
 export * from "./util/index.js";
 export * from "./vector-index/index.js";
package/index.js CHANGED
@@ -9,5 +9,6 @@ export * from "./observability/index.js";
 export * from "./retriever/index.js";
 export * from "./text-chunk/index.js";
 export * from "./tool/index.js";
+export * from "./ui/index.js";
 export * from "./util/index.js";
 export * from "./vector-index/index.js";
package/model-function/synthesize-speech/SpeechSynthesisModel.d.ts CHANGED
@@ -1,5 +1,6 @@
 /// <reference types="node" />
 import { FunctionOptions } from "../../core/FunctionOptions.js";
+import { Delta } from "../../model-function/Delta.js";
 import { Model, ModelSettings } from "../Model.js";
 export interface SpeechSynthesisModelSettings extends ModelSettings {
 }
@@ -7,5 +8,8 @@ export interface SpeechSynthesisModel<SETTINGS extends SpeechSynthesisModelSetti
     /**
      * Generates an mp3 audio buffer that contains the speech for the given text.
      */
-    generateSpeechResponse: (text: string, options?: FunctionOptions) => PromiseLike<Buffer>;
+    doSynthesizeSpeechStandard(text: string, options?: FunctionOptions): PromiseLike<Buffer>;
+}
+export interface DuplexSpeechSynthesisModel<SETTINGS extends SpeechSynthesisModelSettings = SpeechSynthesisModelSettings> extends SpeechSynthesisModel<SETTINGS> {
+    doSynthesizeSpeechStreamDuplex(textStream: AsyncIterable<string>, options?: FunctionOptions): PromiseLike<AsyncIterable<Delta<Buffer>>>;
 }
package/model-function/synthesize-speech/synthesizeSpeech.cjs CHANGED
@@ -1,24 +1,67 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.synthesizeSpeech = void 0;
-const executeCall_js_1 = require("../executeCall.cjs");
+const AsyncIterableResultPromise_js_1 = require("../../model-function/AsyncIterableResultPromise.cjs");
 const ModelFunctionPromise_js_1 = require("../ModelFunctionPromise.cjs");
-/**
- * Synthesizes speech from text.
- */
+const executeCall_js_1 = require("../executeCall.cjs");
 function synthesizeSpeech(model, text, options) {
-    return new ModelFunctionPromise_js_1.ModelFunctionPromise((0, executeCall_js_1.executeCall)({
-        functionType: "speech-synthesis",
-        input: text,
-        model,
-        options,
-        generateResponse: async (options) => {
-            const response = await model.generateSpeechResponse(text, options);
-            return {
-                response,
-                extractedValue: response,
-            };
-        },
-    }));
+    const mode = options?.mode ?? "standard";
+    switch (mode) {
+        case "standard": {
+            if (typeof text !== "string") {
+                throw new Error(`The "standard" mode only supports a string input, but received ${text}`);
+            }
+            return new ModelFunctionPromise_js_1.ModelFunctionPromise((0, executeCall_js_1.executeCall)({
+                functionType: "speech-synthesis",
+                input: text,
+                model,
+                options,
+                generateResponse: async (options) => {
+                    const response = await model.doSynthesizeSpeechStandard(text, options);
+                    return {
+                        response,
+                        extractedValue: response,
+                    };
+                },
+            }));
+        }
+        case "stream-duplex": {
+            if (typeof text === "string") {
+                throw new Error(`The "stream-duplex" mode only supports an AsyncIterable<string> input, but received ${text}`);
+            }
+            if (!("doSynthesizeSpeechStreamDuplex" in model) ||
+                typeof model.doSynthesizeSpeechStreamDuplex !== "function") {
+                throw new Error(`The "stream-duplex" mode is not supported by this model.`);
+            }
+            return new AsyncIterableResultPromise_js_1.AsyncIterableResultPromise(doSynthesizeSpeechStreamDuplex(model, text, options));
+        }
+        default: {
+            const mode_ = mode;
+            throw new Error(`Unsupported mode: ${mode_}`);
+        }
+    }
 }
 exports.synthesizeSpeech = synthesizeSpeech;
+async function doSynthesizeSpeechStreamDuplex(model, text, options) {
+    const speechDeltas = await model.doSynthesizeSpeechStreamDuplex(text, options);
+    // Convert the speechDeltas (AsyncIterable<Delta<Buffer>>) to an AsyncIterable<Buffer>
+    const bufferStream = convertDeltasToBuffers(speechDeltas);
+    return {
+        output: bufferStream,
+        metadata: {
+            model: model.modelInformation,
+            callId: "test",
+            startTimestamp: new Date(),
+        },
+    };
+}
+async function* convertDeltasToBuffers(deltas) {
+    for await (const delta of deltas) {
+        switch (delta.type) {
+            case "error":
+                throw delta.error;
+            case "delta":
+                yield delta.valueDelta;
+        }
+    }
+}
package/model-function/synthesize-speech/synthesizeSpeech.d.ts CHANGED
@@ -1,8 +1,14 @@
 /// <reference types="node" />
 import { FunctionOptions } from "../../core/FunctionOptions.js";
+import { AsyncIterableResultPromise } from "../../model-function/AsyncIterableResultPromise.js";
 import { ModelFunctionPromise } from "../ModelFunctionPromise.js";
-import { SpeechSynthesisModel, SpeechSynthesisModelSettings } from "./SpeechSynthesisModel.js";
+import { DuplexSpeechSynthesisModel, SpeechSynthesisModel, SpeechSynthesisModelSettings } from "./SpeechSynthesisModel.js";
 /**
  * Synthesizes speech from text.
  */
-export declare function synthesizeSpeech(model: SpeechSynthesisModel<SpeechSynthesisModelSettings>, text: string, options?: FunctionOptions): ModelFunctionPromise<Buffer>;
+export declare function synthesizeSpeech(model: SpeechSynthesisModel<SpeechSynthesisModelSettings>, text: string, options?: FunctionOptions & {
+    mode?: "standard";
+}): ModelFunctionPromise<Buffer>;
+export declare function synthesizeSpeech(model: DuplexSpeechSynthesisModel<SpeechSynthesisModelSettings>, text: AsyncIterable<string>, options: FunctionOptions & {
+    mode: "stream-duplex";
+}): AsyncIterableResultPromise<Buffer>;
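These overloads encode the mode split in the types: a plain string with an optional `"standard"` mode yields a single `Buffer`, while an `AsyncIterable<string>` with a required `"stream-duplex"` mode yields a stream of buffers. A usage sketch matching the declarations (the voice ID and prompt are placeholders):

```ts
import {
  ElevenLabsSpeechSynthesisModel,
  synthesizeSpeech,
} from "modelfusion";

async function demo(textStream: AsyncIterable<string>) {
  const model = new ElevenLabsSpeechSynthesisModel({
    voice: "pNInz6obpgDQGcFmaJgB", // placeholder voice ID
  });

  // Overload 1: standard mode resolves to a single MP3 Buffer.
  const speech: Buffer = await synthesizeSpeech(model, "Hello, world!");

  // Overload 2: duplex mode resolves to an AsyncIterable<Buffer>.
  const speechStream = await synthesizeSpeech(model, textStream, {
    mode: "stream-duplex",
  });
  for await (const chunk of speechStream) {
    // chunk is a Buffer with MP3 audio data
  }
}
```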
package/model-function/synthesize-speech/synthesizeSpeech.js CHANGED
@@ -1,20 +1,63 @@
-import { executeCall } from "../executeCall.js";
+import { AsyncIterableResultPromise } from "../../model-function/AsyncIterableResultPromise.js";
 import { ModelFunctionPromise } from "../ModelFunctionPromise.js";
-/**
- * Synthesizes speech from text.
- */
+import { executeCall } from "../executeCall.js";
 export function synthesizeSpeech(model, text, options) {
-    return new ModelFunctionPromise(executeCall({
-        functionType: "speech-synthesis",
-        input: text,
-        model,
-        options,
-        generateResponse: async (options) => {
-            const response = await model.generateSpeechResponse(text, options);
-            return {
-                response,
-                extractedValue: response,
-            };
+    const mode = options?.mode ?? "standard";
+    switch (mode) {
+        case "standard": {
+            if (typeof text !== "string") {
+                throw new Error(`The "standard" mode only supports a string input, but received ${text}`);
+            }
+            return new ModelFunctionPromise(executeCall({
+                functionType: "speech-synthesis",
+                input: text,
+                model,
+                options,
+                generateResponse: async (options) => {
+                    const response = await model.doSynthesizeSpeechStandard(text, options);
+                    return {
+                        response,
+                        extractedValue: response,
+                    };
+                },
+            }));
+        }
+        case "stream-duplex": {
+            if (typeof text === "string") {
+                throw new Error(`The "stream-duplex" mode only supports an AsyncIterable<string> input, but received ${text}`);
+            }
+            if (!("doSynthesizeSpeechStreamDuplex" in model) ||
+                typeof model.doSynthesizeSpeechStreamDuplex !== "function") {
+                throw new Error(`The "stream-duplex" mode is not supported by this model.`);
+            }
+            return new AsyncIterableResultPromise(doSynthesizeSpeechStreamDuplex(model, text, options));
+        }
+        default: {
+            const mode_ = mode;
+            throw new Error(`Unsupported mode: ${mode_}`);
+        }
+    }
+}
+async function doSynthesizeSpeechStreamDuplex(model, text, options) {
+    const speechDeltas = await model.doSynthesizeSpeechStreamDuplex(text, options);
+    // Convert the speechDeltas (AsyncIterable<Delta<Buffer>>) to an AsyncIterable<Buffer>
+    const bufferStream = convertDeltasToBuffers(speechDeltas);
+    return {
+        output: bufferStream,
+        metadata: {
+            model: model.modelInformation,
+            callId: "test",
+            startTimestamp: new Date(),
         },
-    }));
+    };
+}
+async function* convertDeltasToBuffers(deltas) {
+    for await (const delta of deltas) {
+        switch (delta.type) {
+            case "error":
+                throw delta.error;
+            case "delta":
+                yield delta.valueDelta;
+        }
+    }
 }
package/model-provider/elevenlabs/ElevenLabsApiConfiguration.cjs CHANGED
@@ -18,5 +18,8 @@ class ElevenLabsApiConfiguration extends BaseUrlApiConfiguration_js_1.BaseUrlApi
             throttle,
         });
     }
+    get apiKey() {
+        return this.headers["xi-api-key"];
+    }
 }
 exports.ElevenLabsApiConfiguration = ElevenLabsApiConfiguration;
package/model-provider/elevenlabs/ElevenLabsApiConfiguration.d.ts CHANGED
@@ -8,4 +8,5 @@ export declare class ElevenLabsApiConfiguration extends BaseUrlApiConfiguration
         retry?: RetryFunction;
         throttle?: ThrottleFunction;
     });
+    get apiKey(): string;
 }
package/model-provider/elevenlabs/ElevenLabsApiConfiguration.js CHANGED
@@ -15,4 +15,7 @@ export class ElevenLabsApiConfiguration extends BaseUrlApiConfiguration {
             throttle,
         });
     }
+    get apiKey() {
+        return this.headers["xi-api-key"];
+    }
 }
package/model-provider/elevenlabs/ElevenLabsSpeechSynthesisModel.cjs CHANGED
@@ -1,11 +1,21 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.ElevenLabsSpeechSynthesisModel = void 0;
+const zod_1 = require("zod");
 const callWithRetryAndThrottle_js_1 = require("../../core/api/callWithRetryAndThrottle.cjs");
 const postToApi_js_1 = require("../../core/api/postToApi.cjs");
+const AsyncQueue_js_1 = require("../../event-source/AsyncQueue.cjs");
 const AbstractModel_js_1 = require("../../model-function/AbstractModel.cjs");
+const SimpleWebSocket_js_1 = require("../../util/SimpleWebSocket.cjs");
+const parseJSON_js_1 = require("../../util/parseJSON.cjs");
 const ElevenLabsApiConfiguration_js_1 = require("./ElevenLabsApiConfiguration.cjs");
 const ElevenLabsError_js_1 = require("./ElevenLabsError.cjs");
+const elevenLabsModels = [
+    "eleven_multilingual_v2",
+    "eleven_multilingual_v1",
+    "eleven_monolingual_v1",
+];
+const defaultModel = "eleven_multilingual_v2";
 /**
  * Synthesize speech using the ElevenLabs Text to Speech API.
  *
@@ -45,9 +55,101 @@ class ElevenLabsSpeechSynthesisModel extends AbstractModel_js_1.AbstractModel {
             voiceSettings: this.settings.voiceSettings,
         };
     }
-    generateSpeechResponse(text, options) {
+    doSynthesizeSpeechStandard(text, options) {
         return this.callAPI(text, options);
     }
+    async doSynthesizeSpeechStreamDuplex(textStream
+    // options?: FunctionOptions | undefined
+    ) {
+        const responseSchema = zod_1.z.union([
+            zod_1.z.object({
+                audio: zod_1.z.string(),
+                isFinal: zod_1.z.literal(false).nullable(),
+                normalizedAlignment: zod_1.z
+                    .object({
+                        chars: zod_1.z.array(zod_1.z.string()),
+                        charStartTimesMs: zod_1.z.array(zod_1.z.number()),
+                        charDurationsMs: zod_1.z.array(zod_1.z.number()),
+                    })
+                    .nullable(),
+            }),
+            zod_1.z.object({
+                isFinal: zod_1.z.literal(true),
+            }),
+            zod_1.z.object({
+                message: zod_1.z.string(),
+                error: zod_1.z.string(),
+                code: zod_1.z.number(),
+            }),
+        ]);
+        const queue = new AsyncQueue_js_1.AsyncQueue();
+        const model = this.settings.model ?? defaultModel;
+        const socket = await (0, SimpleWebSocket_js_1.createSimpleWebSocket)(`wss://api.elevenlabs.io/v1/text-to-speech/${this.settings.voice}/stream-input?model_id=${model}`);
+        socket.onopen = async () => {
+            const api = this.settings.api ?? new ElevenLabsApiConfiguration_js_1.ElevenLabsApiConfiguration();
+            // send begin-of-stream (BOS) message:
+            socket.send(JSON.stringify({
+                // The JS WebSocket API does not support authorization headers, so we send the API key in the BOS message.
+                // See https://stackoverflow.com/questions/4361173/http-headers-in-websockets-client-api
+                xi_api_key: api.apiKey,
+                text: " ",
+                voice_settings: toApiVoiceSettings(this.settings.voiceSettings),
+                generation_config: toGenerationConfig(this.settings.generationConfig),
+            }));
+            // send text in chunks:
+            let textBuffer = "";
+            for await (const textDelta of textStream) {
+                textBuffer += textDelta;
+                // using ". " as separator: sending in full sentences improves the quality
+                // of the audio output significantly.
+                const separator = textBuffer.lastIndexOf(". ");
+                if (separator === -1) {
+                    continue;
+                }
+                const textToProcess = textBuffer.slice(0, separator);
+                textBuffer = textBuffer.slice(separator + 1);
+                socket.send(JSON.stringify({
+                    text: textToProcess,
+                    try_trigger_generation: true,
+                }));
+            }
+            // send remaining text:
+            if (textBuffer.length > 0) {
+                socket.send(JSON.stringify({
+                    text: `${textBuffer} `,
+                    try_trigger_generation: true,
+                }));
+            }
+            // send end-of-stream (EOS) message:
+            socket.send(JSON.stringify({ text: "" }));
+        };
+        socket.onmessage = (event) => {
+            const parseResult = (0, parseJSON_js_1.safeParseJsonWithZod)(event.data, responseSchema);
+            if (!parseResult.success) {
+                queue.push({ type: "error", error: parseResult.error });
+                return;
+            }
+            const response = parseResult.data;
+            if ("error" in response) {
+                queue.push({ type: "error", error: response });
+                return;
+            }
+            if (!response.isFinal) {
+                queue.push({
+                    type: "delta",
+                    fullDelta: event,
+                    valueDelta: Buffer.from(response.audio, "base64"),
+                });
+            }
+        };
+        socket.onerror = (error) => {
+            queue.push({ type: "error", error });
+        };
+        socket.onclose = () => {
+            queue.close();
+        };
+        return queue;
+    }
     withSettings(additionalSettings) {
         return new ElevenLabsSpeechSynthesisModel({
             ...this.settings,
@@ -62,18 +164,28 @@ async function callElevenLabsTextToSpeechAPI({ api = new ElevenLabsApiConfigurat
         headers: api.headers,
         body: {
             text,
-            model_id: modelId,
-            voice_settings: voiceSettings != null
-                ? {
-                    stability: voiceSettings.stability,
-                    similarity_boost: voiceSettings.similarityBoost,
-                    style: voiceSettings.style,
-                    use_speaker_boost: voiceSettings.useSpeakerBoost,
-                }
-                : undefined,
+            model_id: modelId ?? defaultModel,
+            voice_settings: toApiVoiceSettings(voiceSettings),
         },
         failedResponseHandler: ElevenLabsError_js_1.failedElevenLabsCallResponseHandler,
         successfulResponseHandler: (0, postToApi_js_1.createAudioMpegResponseHandler)(),
         abortSignal,
    });
 }
+function toApiVoiceSettings(voiceSettings) {
+    return voiceSettings != null
+        ? {
+            stability: voiceSettings.stability,
+            similarity_boost: voiceSettings.similarityBoost,
+            style: voiceSettings.style,
+            use_speaker_boost: voiceSettings.useSpeakerBoost,
+        }
+        : undefined;
+}
+function toGenerationConfig(generationConfig) {
+    return generationConfig != null
+        ? {
+            chunk_length_schedule: generationConfig.chunkLengthSchedule,
+        }
+        : undefined;
+}
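One detail worth isolating from the `onopen` handler above: incoming text deltas are buffered and only flushed up to the last `". "` occurrence, because sending full sentences noticeably improves the audio quality. A standalone sketch of that rule (a hypothetical helper, not part of the package):

```ts
// Mirrors the buffering logic in doSynthesizeSpeechStreamDuplex: returns
// the sentence-aligned prefix to send now and the remainder to keep buffering.
function splitAtLastSentence(buffer: string): {
  send: string | null;
  rest: string;
} {
  const separator = buffer.lastIndexOf(". ");
  if (separator === -1) {
    return { send: null, rest: buffer }; // no complete sentence yet
  }
  return {
    send: buffer.slice(0, separator), // up to (not including) the final "."
    rest: buffer.slice(separator + 1), // remainder starts at the space
  };
}

// Example: splitAtLastSentence("Hello world. How are")
//   -> { send: "Hello world", rest: " How are" }
```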
package/model-provider/elevenlabs/ElevenLabsSpeechSynthesisModel.d.ts CHANGED
@@ -2,17 +2,24 @@
 import { FunctionOptions } from "../../core/FunctionOptions.js";
 import { ApiConfiguration } from "../../core/api/ApiConfiguration.js";
 import { AbstractModel } from "../../model-function/AbstractModel.js";
+import { Delta } from "../../model-function/Delta.js";
 import { SpeechSynthesisModel, SpeechSynthesisModelSettings } from "../../model-function/synthesize-speech/SpeechSynthesisModel.js";
+declare const elevenLabsModels: readonly ["eleven_multilingual_v2", "eleven_multilingual_v1", "eleven_monolingual_v1"];
 export interface ElevenLabsSpeechSynthesisModelSettings extends SpeechSynthesisModelSettings {
-    api?: ApiConfiguration;
+    api?: ApiConfiguration & {
+        apiKey: string;
+    };
     voice: string;
-    model?: string;
+    model?: (typeof elevenLabsModels)[number] | (string & {});
     voiceSettings?: {
         stability: number;
         similarityBoost: number;
         style?: number;
         useSpeakerBoost?: boolean;
     };
+    generationConfig?: {
+        chunkLengthSchedule: number[];
+    };
 }
 /**
  * Synthesize speech using the ElevenLabs Text to Speech API.
@@ -25,6 +32,8 @@ export declare class ElevenLabsSpeechSynthesisModel extends AbstractModel<Eleven
     get modelName(): string;
     private callAPI;
     get settingsForEvent(): Partial<ElevenLabsSpeechSynthesisModelSettings>;
-    generateSpeechResponse(text: string, options?: FunctionOptions): Promise<Buffer>;
+    doSynthesizeSpeechStandard(text: string, options?: FunctionOptions): Promise<Buffer>;
+    doSynthesizeSpeechStreamDuplex(textStream: AsyncIterable<string>): Promise<AsyncIterable<Delta<Buffer>>>;
     withSettings(additionalSettings: Partial<ElevenLabsSpeechSynthesisModelSettings>): this;
 }
+export {};
package/model-provider/elevenlabs/ElevenLabsSpeechSynthesisModel.js CHANGED
@@ -1,8 +1,18 @@
+import { z } from "zod";
 import { callWithRetryAndThrottle } from "../../core/api/callWithRetryAndThrottle.js";
 import { createAudioMpegResponseHandler, postJsonToApi, } from "../../core/api/postToApi.js";
+import { AsyncQueue } from "../../event-source/AsyncQueue.js";
 import { AbstractModel } from "../../model-function/AbstractModel.js";
+import { createSimpleWebSocket } from "../../util/SimpleWebSocket.js";
+import { safeParseJsonWithZod } from "../../util/parseJSON.js";
 import { ElevenLabsApiConfiguration } from "./ElevenLabsApiConfiguration.js";
 import { failedElevenLabsCallResponseHandler } from "./ElevenLabsError.js";
+const elevenLabsModels = [
+    "eleven_multilingual_v2",
+    "eleven_multilingual_v1",
+    "eleven_monolingual_v1",
+];
+const defaultModel = "eleven_multilingual_v2";
 /**
  * Synthesize speech using the ElevenLabs Text to Speech API.
  *
@@ -42,9 +52,101 @@ export class ElevenLabsSpeechSynthesisModel extends AbstractModel {
         voiceSettings: this.settings.voiceSettings,
        };
     }
-    generateSpeechResponse(text, options) {
+    doSynthesizeSpeechStandard(text, options) {
         return this.callAPI(text, options);
     }
+    async doSynthesizeSpeechStreamDuplex(textStream
+    // options?: FunctionOptions | undefined
+    ) {
+        const responseSchema = z.union([
+            z.object({
+                audio: z.string(),
+                isFinal: z.literal(false).nullable(),
+                normalizedAlignment: z
+                    .object({
+                        chars: z.array(z.string()),
+                        charStartTimesMs: z.array(z.number()),
+                        charDurationsMs: z.array(z.number()),
+                    })
+                    .nullable(),
+            }),
+            z.object({
+                isFinal: z.literal(true),
+            }),
+            z.object({
+                message: z.string(),
+                error: z.string(),
+                code: z.number(),
+            }),
+        ]);
+        const queue = new AsyncQueue();
+        const model = this.settings.model ?? defaultModel;
+        const socket = await createSimpleWebSocket(`wss://api.elevenlabs.io/v1/text-to-speech/${this.settings.voice}/stream-input?model_id=${model}`);
+        socket.onopen = async () => {
+            const api = this.settings.api ?? new ElevenLabsApiConfiguration();
+            // send begin-of-stream (BOS) message:
+            socket.send(JSON.stringify({
+                // The JS WebSocket API does not support authorization headers, so we send the API key in the BOS message.
+                // See https://stackoverflow.com/questions/4361173/http-headers-in-websockets-client-api
+                xi_api_key: api.apiKey,
+                text: " ",
+                voice_settings: toApiVoiceSettings(this.settings.voiceSettings),
+                generation_config: toGenerationConfig(this.settings.generationConfig),
+            }));
+            // send text in chunks:
+            let textBuffer = "";
+            for await (const textDelta of textStream) {
+                textBuffer += textDelta;
+                // using ". " as separator: sending in full sentences improves the quality
+                // of the audio output significantly.
+                const separator = textBuffer.lastIndexOf(". ");
+                if (separator === -1) {
+                    continue;
+                }
+                const textToProcess = textBuffer.slice(0, separator);
+                textBuffer = textBuffer.slice(separator + 1);
+                socket.send(JSON.stringify({
+                    text: textToProcess,
+                    try_trigger_generation: true,
+                }));
+            }
+            // send remaining text:
+            if (textBuffer.length > 0) {
+                socket.send(JSON.stringify({
+                    text: `${textBuffer} `,
+                    try_trigger_generation: true,
+                }));
+            }
+            // send end-of-stream (EOS) message:
+            socket.send(JSON.stringify({ text: "" }));
+        };
+        socket.onmessage = (event) => {
+            const parseResult = safeParseJsonWithZod(event.data, responseSchema);
+            if (!parseResult.success) {
+                queue.push({ type: "error", error: parseResult.error });
+                return;
+            }
+            const response = parseResult.data;
+            if ("error" in response) {
+                queue.push({ type: "error", error: response });
+                return;
+            }
+            if (!response.isFinal) {
+                queue.push({
+                    type: "delta",
+                    fullDelta: event,
+                    valueDelta: Buffer.from(response.audio, "base64"),
+                });
+            }
+        };
+        socket.onerror = (error) => {
+            queue.push({ type: "error", error });
+        };
+        socket.onclose = () => {
+            queue.close();
+        };
+        return queue;
+    }
     withSettings(additionalSettings) {
         return new ElevenLabsSpeechSynthesisModel({
             ...this.settings,
@@ -58,18 +160,28 @@ async function callElevenLabsTextToSpeechAPI({ api = new ElevenLabsApiConfigurat
         headers: api.headers,
         body: {
             text,
-            model_id: modelId,
-            voice_settings: voiceSettings != null
-                ? {
-                    stability: voiceSettings.stability,
-                    similarity_boost: voiceSettings.similarityBoost,
-                    style: voiceSettings.style,
-                    use_speaker_boost: voiceSettings.useSpeakerBoost,
-                }
-                : undefined,
+            model_id: modelId ?? defaultModel,
+            voice_settings: toApiVoiceSettings(voiceSettings),
         },
         failedResponseHandler: failedElevenLabsCallResponseHandler,
         successfulResponseHandler: createAudioMpegResponseHandler(),
         abortSignal,
     });
 }
+function toApiVoiceSettings(voiceSettings) {
+    return voiceSettings != null
+        ? {
+            stability: voiceSettings.stability,
+            similarity_boost: voiceSettings.similarityBoost,
+            style: voiceSettings.style,
+            use_speaker_boost: voiceSettings.useSpeakerBoost,
+        }
+        : undefined;
+}
+function toGenerationConfig(generationConfig) {
+    return generationConfig != null
+        ? {
+            chunk_length_schedule: generationConfig.chunkLengthSchedule,
+        }
+        : undefined;
+}
package/model-provider/lmnt/LmntSpeechSynthesisModel.cjs CHANGED
@@ -43,7 +43,7 @@ class LmntSpeechSynthesisModel extends AbstractModel_js_1.AbstractModel {
             length: this.settings.length,
         };
     }
-    generateSpeechResponse(text, options) {
+    doSynthesizeSpeechStandard(text, options) {
         return this.callAPI(text, options);
     }
     withSettings(additionalSettings) {
package/model-provider/lmnt/LmntSpeechSynthesisModel.d.ts CHANGED
@@ -21,6 +21,6 @@ export declare class LmntSpeechSynthesisModel extends AbstractModel<LmntSpeechSy
     get modelName(): string;
     private callAPI;
     get settingsForEvent(): Partial<LmntSpeechSynthesisModelSettings>;
-    generateSpeechResponse(text: string, options?: FunctionOptions): Promise<Buffer>;
+    doSynthesizeSpeechStandard(text: string, options?: FunctionOptions): Promise<Buffer>;
     withSettings(additionalSettings: Partial<LmntSpeechSynthesisModelSettings>): this;
 }
package/model-provider/lmnt/LmntSpeechSynthesisModel.js CHANGED
@@ -40,7 +40,7 @@ export class LmntSpeechSynthesisModel extends AbstractModel {
             length: this.settings.length,
         };
     }
-    generateSpeechResponse(text, options) {
+    doSynthesizeSpeechStandard(text, options) {
         return this.callAPI(text, options);
     }
     withSettings(additionalSettings) {
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "modelfusion",
   "description": "Build multimodal applications, chatbots, and agents with JavaScript and TypeScript.",
-  "version": "0.47.3",
+  "version": "0.48.0",
   "author": "Lars Grammel",
   "license": "MIT",
   "keywords": [
@@ -57,6 +57,7 @@
     "js-tiktoken": "1.0.7",
     "nanoid": "3.3.6",
     "secure-json-parse": "2.7.0",
+    "ws": "8.14.2",
     "zod": "3.22.4",
     "zod-to-json-schema": "3.21.4"
   },
@@ -64,6 +65,7 @@
     "@tsconfig/recommended": "1.0.3",
     "@types/deep-equal": "^1.0.2",
     "@types/node": "18.11.9",
+    "@types/ws": "^8.5.7",
     "@typescript-eslint/eslint-plugin": "^6.1.0",
     "@typescript-eslint/parser": "^6.1.0",
     "copyfiles": "2.4.1",
package/ui/MediaSourceAppender.cjs ADDED
@@ -0,0 +1,54 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.MediaSourceAppender = void 0;
+class MediaSourceAppender {
+    constructor(type) {
+        Object.defineProperty(this, "mediaSource", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: new MediaSource()
+        });
+        Object.defineProperty(this, "audioChunks", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: []
+        });
+        Object.defineProperty(this, "sourceBuffer", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        this.mediaSource.addEventListener("sourceopen", async () => {
+            this.sourceBuffer = this.mediaSource.addSourceBuffer(type);
+            this.sourceBuffer.addEventListener("updateend", () => {
+                this.tryAppendNextChunk();
+            });
+        });
+    }
+    tryAppendNextChunk() {
+        if (this.sourceBuffer != null &&
+            !this.sourceBuffer.updating &&
+            this.audioChunks.length > 0) {
+            this.sourceBuffer.appendBuffer(this.audioChunks.shift());
+        }
+    }
+    addBase64Data(base64Data) {
+        this.addData(Uint8Array.from(atob(base64Data), (char) => char.charCodeAt(0)).buffer);
+    }
+    addData(data) {
+        this.audioChunks.push(data);
+        this.tryAppendNextChunk();
+    }
+    close() {
+        if (this.mediaSource.readyState === "open") {
+            this.mediaSource.endOfStream();
+        }
+    }
+    get mediaSourceUrl() {
+        return URL.createObjectURL(this.mediaSource);
+    }
+}
+exports.MediaSourceAppender = MediaSourceAppender;
package/ui/MediaSourceAppender.d.ts ADDED
@@ -0,0 +1,11 @@
+export declare class MediaSourceAppender {
+    private readonly mediaSource;
+    private readonly audioChunks;
+    private sourceBuffer?;
+    constructor(type: string);
+    private tryAppendNextChunk;
+    addBase64Data(base64Data: string): void;
+    addData(data: ArrayBuffer): void;
+    close(): void;
+    get mediaSourceUrl(): string;
+}
package/ui/MediaSourceAppender.js ADDED
@@ -0,0 +1,50 @@
+export class MediaSourceAppender {
+    constructor(type) {
+        Object.defineProperty(this, "mediaSource", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: new MediaSource()
+        });
+        Object.defineProperty(this, "audioChunks", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: []
+        });
+        Object.defineProperty(this, "sourceBuffer", {
+            enumerable: true,
+            configurable: true,
+            writable: true,
+            value: void 0
+        });
+        this.mediaSource.addEventListener("sourceopen", async () => {
+            this.sourceBuffer = this.mediaSource.addSourceBuffer(type);
+            this.sourceBuffer.addEventListener("updateend", () => {
+                this.tryAppendNextChunk();
+            });
+        });
+    }
+    tryAppendNextChunk() {
+        if (this.sourceBuffer != null &&
+            !this.sourceBuffer.updating &&
+            this.audioChunks.length > 0) {
+            this.sourceBuffer.appendBuffer(this.audioChunks.shift());
+        }
+    }
+    addBase64Data(base64Data) {
+        this.addData(Uint8Array.from(atob(base64Data), (char) => char.charCodeAt(0)).buffer);
+    }
+    addData(data) {
+        this.audioChunks.push(data);
+        this.tryAppendNextChunk();
+    }
+    close() {
+        if (this.mediaSource.readyState === "open") {
+            this.mediaSource.endOfStream();
+        }
+    }
+    get mediaSourceUrl() {
+        return URL.createObjectURL(this.mediaSource);
+    }
+}
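The new `MediaSourceAppender` turns incrementally arriving audio chunks into a playable object URL via the browser's MediaSource API. A browser-side usage sketch; the `<audio>` element, the `audio/mpeg` source type, and the chunk transport are assumptions, not part of this diff:

```ts
import { MediaSourceAppender } from "modelfusion";

// Play streamed MP3 audio as it arrives (e.g. from a fetch or WebSocket).
const appender = new MediaSourceAppender("audio/mpeg");

const audio = document.querySelector("audio")!;
audio.src = appender.mediaSourceUrl; // object URL backed by the MediaSource
audio.play();

// Called whenever a base64-encoded audio chunk arrives (transport assumed):
function onChunk(base64Audio: string) {
  appender.addBase64Data(base64Audio);
}

// Called once the stream is finished:
function onDone() {
  appender.close(); // signals end-of-stream to the MediaSource
}
```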
package/ui/index.cjs ADDED
@@ -0,0 +1,17 @@
+"use strict";
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+        desc = { enumerable: true, get: function() { return m[k]; } };
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+var __exportStar = (this && this.__exportStar) || function(m, exports) {
+    for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+__exportStar(require("./MediaSourceAppender.cjs"), exports);
package/ui/index.d.ts ADDED
@@ -0,0 +1 @@
+export * from "./MediaSourceAppender.js";
package/ui/index.js ADDED
@@ -0,0 +1 @@
+export * from "./MediaSourceAppender.js";
package/util/SimpleWebSocket.cjs ADDED
@@ -0,0 +1,41 @@
+"use strict";
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+        desc = { enumerable: true, get: function() { return m[k]; } };
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+    Object.defineProperty(o, "default", { enumerable: true, value: v });
+}) : function(o, v) {
+    o["default"] = v;
+});
+var __importStar = (this && this.__importStar) || function (mod) {
+    if (mod && mod.__esModule) return mod;
+    var result = {};
+    if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
+    __setModuleDefault(result, mod);
+    return result;
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.createSimpleWebSocket = void 0;
+/**
+ * Creates a simplified websocket connection. This function works in both Node.js and browser.
+ */
+async function createSimpleWebSocket(url) {
+    if (typeof window === "undefined") {
+        // Use ws library in Node.js:
+        const { default: WebSocket } = await Promise.resolve().then(() => __importStar(require("ws")));
+        return new WebSocket(url);
+    }
+    else {
+        // Use native WebSocket in browser:
+        return new WebSocket(url);
+    }
+}
+exports.createSimpleWebSocket = createSimpleWebSocket;
package/util/SimpleWebSocket.d.ts ADDED
@@ -0,0 +1,12 @@
+export interface SimpleWebSocket {
+    send(data: string): void;
+    onmessage: ((event: MessageEvent) => void) | null;
+    onopen: ((event: Event) => void) | null;
+    onclose: ((event: CloseEvent) => void) | null;
+    onerror: ((event: Event) => void) | null;
+    close(code?: number, reason?: string): void;
+}
+/**
+ * Creates a simplified websocket connection. This function works in both Node.js and browser.
+ */
+export declare function createSimpleWebSocket(url: string): Promise<SimpleWebSocket>;
package/util/SimpleWebSocket.js ADDED
@@ -0,0 +1,14 @@
+/**
+ * Creates a simplified websocket connection. This function works in both Node.js and browser.
+ */
+export async function createSimpleWebSocket(url) {
+    if (typeof window === "undefined") {
+        // Use ws library in Node.js:
+        const { default: WebSocket } = await import("ws");
+        return new WebSocket(url);
+    }
+    else {
+        // Use native WebSocket in browser:
+        return new WebSocket(url);
+    }
+}
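`createSimpleWebSocket` gives the ElevenLabs duplex model one socket surface across runtimes: the `ws` package in Node.js (hence the new dependency in package.json) and the native `WebSocket` in the browser. A usage sketch follows; whether the helper is re-exported from the package root is not shown in this diff, so the import path is an assumption.

```ts
import { createSimpleWebSocket } from "modelfusion"; // import path assumed

async function main() {
  // Works in Node.js (via ws) and in the browser (native WebSocket).
  const socket = await createSimpleWebSocket("wss://echo.websocket.org");

  socket.onopen = () => {
    socket.send("ping");
  };
  socket.onmessage = (event) => {
    console.log("received:", event.data);
    socket.close();
  };
  socket.onerror = (event) => {
    console.error("socket error:", event);
  };
}

main();
```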