modelfusion 0.50.0 → 0.51.0

package/README.md CHANGED
@@ -1,6 +1,6 @@
  # ModelFusion
 
- > ### Build multi-modal AI apps, chatbots, and agents with JavaScript and TypeScript.
+ > ### The TypeScript library for building multi-modal AI applications.
 
  [![NPM Version](https://img.shields.io/npm/v/modelfusion?color=33cd56&logo=npm)](https://www.npmjs.com/package/modelfusion)
  [![MIT License](https://img.shields.io/github/license/lgrammel/modelfusion)](https://opensource.org/licenses/MIT)
@@ -10,12 +10,9 @@
 
  [Introduction](#introduction) | [Quick Install](#quick-install) | [Usage](#usage-examples) | [Documentation](#documentation) | [Examples](#more-examples) | [Contributing](#contributing) | [modelfusion.dev](https://modelfusion.dev)
 
- > [!NOTE]
- > ModelFusion is in its initial development phase. Until version 1.0 there may be breaking changes, because I am still exploring the API design. Feedback and suggestions are welcome.
-
  ## Introduction
 
- ModelFusion is a library for building AI applications, chatbots, and agents. Here are the main features:
+ **ModelFusion** is a TypeScript library for building AI applications, chatbots, and agents.
 
  - **Multimodal**: ModelFusion supports a wide range of models including text generation, image generation, text-to-speech, speech-to-text, and embedding models.
  - **Streaming**: ModelFusion supports streaming for many generation models, e.g. text streaming, structure streaming, and full duplex speech streaming.
@@ -26,6 +23,9 @@ ModelFusion is a library for building AI applications, chatbots, and agents. Her
 
  ## Quick Install
 
+ > [!NOTE]
+ > ModelFusion is in its initial development phase. The main API is now mostly stable, but until version 1.0 there may be minor breaking changes. Feedback and suggestions are welcome.
+
  ```sh
  npm install modelfusion
  ```
@@ -118,7 +118,7 @@ const textStream = await streamText(/* ... */);
  const speechStream = await streamSpeech(
    new ElevenLabsSpeechModel({
      voice: "pNInz6obpgDQGcFmaJgB", // Adam
-     model: "eleven_monolingual_v1",
+     optimizeStreamingLatency: 1,
      voiceSettings: { stability: 1, similarityBoost: 0.35 },
      generationConfig: {
        chunkLengthSchedule: [50, 90, 120, 150, 200],
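
For context, the surrounding README example streams generated text directly into ElevenLabs speech synthesis. The sketch below shows roughly how the updated snippet fits together; it assumes the top-level `streamText`/`streamSpeech`/`ElevenLabsSpeechModel` exports the README uses, assumes `streamSpeech` accepts the text stream as its second argument, and leaves the `streamText` arguments elided as the README does:

```ts
import { streamText, streamSpeech, ElevenLabsSpeechModel } from "modelfusion";

const textStream = await streamText(/* ... */); // model + prompt elided, as in the README

const speechStream = await streamSpeech(
  new ElevenLabsSpeechModel({
    voice: "pNInz6obpgDQGcFmaJgB", // Adam
    optimizeStreamingLatency: 1, // new in 0.51.0: 0 (no optimization) to 4 (max)
    voiceSettings: { stability: 1, similarityBoost: 0.35 },
    generationConfig: {
      chunkLengthSchedule: [50, 90, 120, 150, 200],
    },
  }),
  textStream
);

for await (const part of speechStream) {
  // each part is an audio chunk; playback/handling is left to the caller
}
```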
@@ -5,7 +5,6 @@ const ApiCallError_js_1 = require("../../core/api/ApiCallError.cjs");
  const failedElevenLabsCallResponseHandler = async ({ response, url, requestBodyValues }) => {
      const responseBody = await response.text();
      try {
-         // TODO implement ElevenLabsError
          return new ApiCallError_js_1.ApiCallError({
              message: responseBody,
              statusCode: response.status,
@@ -2,7 +2,6 @@ import { ApiCallError } from "../../core/api/ApiCallError.js";
  export const failedElevenLabsCallResponseHandler = async ({ response, url, requestBodyValues }) => {
      const responseBody = await response.text();
      try {
-         // TODO implement ElevenLabsError
          return new ApiCallError({
              message: responseBody,
              statusCode: response.status,
@@ -15,11 +15,14 @@ const elevenLabsModels = [
      "eleven_multilingual_v1",
      "eleven_monolingual_v1",
  ];
- const defaultModel = "eleven_multilingual_v2";
+ const defaultModel = "eleven_monolingual_v1";
  /**
   * Synthesize speech using the ElevenLabs Text to Speech API.
   *
-  * @see https://api.elevenlabs.io/docs#/text-to-speech/Text_to_speech_v1_text_to_speech__voice_id__post
+  * Both regular text-to-speech and full duplex text-to-speech streaming are supported.
+  *
+  * @see https://docs.elevenlabs.io/api-reference/text-to-speech
+  * @see https://docs.elevenlabs.io/api-reference/text-to-speech-websockets
   */
  class ElevenLabsSpeechModel extends AbstractModel_js_1.AbstractModel {
      constructor(settings) {
@@ -84,7 +87,11 @@ class ElevenLabsSpeechModel extends AbstractModel_js_1.AbstractModel {
      ]);
      const queue = new AsyncQueue_js_1.AsyncQueue();
      const model = this.settings.model ?? defaultModel;
- const socket = await (0, SimpleWebSocket_js_1.createSimpleWebSocket)(`wss://api.elevenlabs.io/v1/text-to-speech/${this.settings.voice}/stream-input?model_id=${model}`);
+ const socket = await (0, SimpleWebSocket_js_1.createSimpleWebSocket)(`wss://api.elevenlabs.io/v1/text-to-speech/${this.settings.voice}/stream-input${assembleQuery({
+     model_id: model,
+     optimize_streaming_latency: this.settings.optimizeStreamingLatency,
+     output_format: this.settings.outputFormat,
+ })}`);
  socket.onopen = async () => {
      const api = this.settings.api ?? new ElevenLabsApiConfiguration_js_1.ElevenLabsApiConfiguration();
      // send begin-of-stream (BOS) message:
@@ -158,9 +165,12 @@ class ElevenLabsSpeechModel extends AbstractModel_js_1.AbstractModel {
      }
  }
  exports.ElevenLabsSpeechModel = ElevenLabsSpeechModel;
- async function callElevenLabsTextToSpeechAPI({ api = new ElevenLabsApiConfiguration_js_1.ElevenLabsApiConfiguration(), abortSignal, text, voiceId, modelId, voiceSettings, }) {
+ async function callElevenLabsTextToSpeechAPI({ api = new ElevenLabsApiConfiguration_js_1.ElevenLabsApiConfiguration(), abortSignal, text, voiceId, modelId, optimizeStreamingLatency, outputFormat, voiceSettings, }) {
      return (0, postToApi_js_1.postJsonToApi)({
-         url: api.assembleUrl(`/text-to-speech/${voiceId}`),
+         url: api.assembleUrl(`/text-to-speech/${voiceId}${assembleQuery({
+             optimize_streaming_latency: optimizeStreamingLatency,
+             output_format: outputFormat,
+         })}`),
          headers: api.headers,
          body: {
              text,
@@ -172,6 +182,24 @@ async function callElevenLabsTextToSpeechAPI({ api = new ElevenLabsApiConfigurat
          abortSignal,
      });
  }
+ function assembleQuery(parameters) {
+     let query = "";
+     let hasQuestionMark = false;
+     for (const [key, value] of Object.entries(parameters)) {
+         if (value == null) {
+             continue;
+         }
+         if (!hasQuestionMark) {
+             query += "?";
+             hasQuestionMark = true;
+         }
+         else {
+             query += "&";
+         }
+         query += `${key}=${value}`;
+     }
+     return query;
+ }
  function toApiVoiceSettings(voiceSettings) {
      return voiceSettings != null
          ? {
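
The new `assembleQuery` helper (added identically to the CJS build here and the ESM build further below) produces the optional query string for both the REST URL and the WebSocket URL. A condensed standalone TypeScript sketch of its behavior; the parameter type annotation is added for illustration:

```ts
// Condensed restatement of the helper above (behavior-identical).
function assembleQuery(
  parameters: Record<string, string | number | undefined>
): string {
  let query = "";
  let hasQuestionMark = false;
  for (const [key, value] of Object.entries(parameters)) {
    if (value == null) continue; // unset parameters are skipped entirely
    query += hasQuestionMark ? "&" : "?"; // "?" before the first parameter, "&" after
    hasQuestionMark = true;
    query += `${key}=${value}`; // values are not URL-encoded; fine for the enum/numeric settings used here
  }
  return query;
}

assembleQuery({ model_id: "eleven_monolingual_v1", optimize_streaming_latency: 1 });
// => "?model_id=eleven_monolingual_v1&optimize_streaming_latency=1"
assembleQuery({ optimize_streaming_latency: undefined, output_format: undefined });
// => "" (no stray "?" when every parameter is unset)
```

Because unset parameters are skipped, omitting both new settings leaves the REST URL unchanged from 0.50.0, while the WebSocket URL still carries `model_id` (the model always resolves via `?? defaultModel`).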
@@ -11,6 +11,8 @@ export interface ElevenLabsSpeechModelSettings extends SpeechGenerationModelSett
      };
      voice: string;
      model?: (typeof elevenLabsModels)[number] | (string & {});
+     optimizeStreamingLatency?: 0 | 1 | 2 | 3 | 4;
+     outputFormat?: "mp3_44100" | "pcm_16000" | "pcm_22050" | "pcm_24000" | "pcm_44100";
      voiceSettings?: {
          stability: number;
          similarityBoost: number;
@@ -24,7 +26,10 @@ export interface ElevenLabsSpeechModelSettings extends SpeechGenerationModelSett
  /**
   * Synthesize speech using the ElevenLabs Text to Speech API.
   *
-  * @see https://api.elevenlabs.io/docs#/text-to-speech/Text_to_speech_v1_text_to_speech__voice_id__post
+  * Both regular text-to-speech and full duplex text-to-speech streaming are supported.
+  *
+  * @see https://docs.elevenlabs.io/api-reference/text-to-speech
+  * @see https://docs.elevenlabs.io/api-reference/text-to-speech-websockets
   */
  export declare class ElevenLabsSpeechModel extends AbstractModel<ElevenLabsSpeechModelSettings> implements StreamingSpeechGenerationModel<ElevenLabsSpeechModelSettings> {
      constructor(settings: ElevenLabsSpeechModelSettings);
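
The two new optional settings are now part of the typed settings surface. A minimal construction sketch, assuming the top-level `ElevenLabsSpeechModel` export used in the README (all values illustrative):

```ts
import { ElevenLabsSpeechModel } from "modelfusion";

const model = new ElevenLabsSpeechModel({
  voice: "pNInz6obpgDQGcFmaJgB", // Adam (voice ID from the README example)
  model: "eleven_monolingual_v1", // also the new default when omitted
  optimizeStreamingLatency: 1, // 0 | 1 | 2 | 3 | 4
  outputFormat: "pcm_16000", // or "mp3_44100" | "pcm_22050" | "pcm_24000" | "pcm_44100"
  voiceSettings: { stability: 1, similarityBoost: 0.35 },
});
```

Both settings are forwarded as the `optimize_streaming_latency` and `output_format` query parameters on the REST and WebSocket requests, as the implementation diffs above and below show.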
@@ -12,11 +12,14 @@ const elevenLabsModels = [
      "eleven_multilingual_v1",
      "eleven_monolingual_v1",
  ];
- const defaultModel = "eleven_multilingual_v2";
+ const defaultModel = "eleven_monolingual_v1";
  /**
   * Synthesize speech using the ElevenLabs Text to Speech API.
   *
-  * @see https://api.elevenlabs.io/docs#/text-to-speech/Text_to_speech_v1_text_to_speech__voice_id__post
+  * Both regular text-to-speech and full duplex text-to-speech streaming are supported.
+  *
+  * @see https://docs.elevenlabs.io/api-reference/text-to-speech
+  * @see https://docs.elevenlabs.io/api-reference/text-to-speech-websockets
   */
  export class ElevenLabsSpeechModel extends AbstractModel {
      constructor(settings) {
@@ -81,7 +84,11 @@ export class ElevenLabsSpeechModel extends AbstractModel {
      ]);
      const queue = new AsyncQueue();
      const model = this.settings.model ?? defaultModel;
- const socket = await createSimpleWebSocket(`wss://api.elevenlabs.io/v1/text-to-speech/${this.settings.voice}/stream-input?model_id=${model}`);
+ const socket = await createSimpleWebSocket(`wss://api.elevenlabs.io/v1/text-to-speech/${this.settings.voice}/stream-input${assembleQuery({
+     model_id: model,
+     optimize_streaming_latency: this.settings.optimizeStreamingLatency,
+     output_format: this.settings.outputFormat,
+ })}`);
  socket.onopen = async () => {
      const api = this.settings.api ?? new ElevenLabsApiConfiguration();
      // send begin-of-stream (BOS) message:
@@ -154,9 +161,12 @@ export class ElevenLabsSpeechModel extends AbstractModel {
      });
  }
  }
- async function callElevenLabsTextToSpeechAPI({ api = new ElevenLabsApiConfiguration(), abortSignal, text, voiceId, modelId, voiceSettings, }) {
+ async function callElevenLabsTextToSpeechAPI({ api = new ElevenLabsApiConfiguration(), abortSignal, text, voiceId, modelId, optimizeStreamingLatency, outputFormat, voiceSettings, }) {
      return postJsonToApi({
-         url: api.assembleUrl(`/text-to-speech/${voiceId}`),
+         url: api.assembleUrl(`/text-to-speech/${voiceId}${assembleQuery({
+             optimize_streaming_latency: optimizeStreamingLatency,
+             output_format: outputFormat,
+         })}`),
          headers: api.headers,
          body: {
              text,
@@ -168,6 +178,24 @@ async function callElevenLabsTextToSpeechAPI({ api = new ElevenLabsApiConfigurat
          abortSignal,
      });
  }
+ function assembleQuery(parameters) {
+     let query = "";
+     let hasQuestionMark = false;
+     for (const [key, value] of Object.entries(parameters)) {
+         if (value == null) {
+             continue;
+         }
+         if (!hasQuestionMark) {
+             query += "?";
+             hasQuestionMark = true;
+         }
+         else {
+             query += "&";
+         }
+         query += `${key}=${value}`;
+     }
+     return query;
+ }
  function toApiVoiceSettings(voiceSettings) {
      return voiceSettings != null
          ? {
@@ -5,7 +5,6 @@ const ApiCallError_js_1 = require("../../core/api/ApiCallError.cjs");
  const failedLmntCallResponseHandler = async ({ response, url, requestBodyValues }) => {
      const responseBody = await response.text();
      try {
-         // TODO implement LmntError
          return new ApiCallError_js_1.ApiCallError({
              message: responseBody,
              statusCode: response.status,
@@ -2,7 +2,6 @@ import { ApiCallError } from "../../core/api/ApiCallError.js";
  export const failedLmntCallResponseHandler = async ({ response, url, requestBodyValues }) => {
      const responseBody = await response.text();
      try {
-         // TODO implement LmntError
          return new ApiCallError({
              message: responseBody,
              statusCode: response.status,
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
    "name": "modelfusion",
    "description": "Build multimodal applications, chatbots, and agents with JavaScript and TypeScript.",
-   "version": "0.50.0",
+   "version": "0.51.0",
    "author": "Lars Grammel",
    "license": "MIT",
    "keywords": [