@ai-sdk/elevenlabs 2.0.7 → 2.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -0
- package/dist/index.js +1 -1
- package/dist/index.mjs +1 -1
- package/package.json +5 -4
- package/src/elevenlabs-api-types.ts +100 -0
- package/src/elevenlabs-config.ts +9 -0
- package/src/elevenlabs-error.test.ts +34 -0
- package/src/elevenlabs-error.ts +16 -0
- package/src/elevenlabs-provider.ts +138 -0
- package/src/elevenlabs-speech-api-types.ts +23 -0
- package/src/elevenlabs-speech-model.test.ts +179 -0
- package/src/elevenlabs-speech-model.ts +258 -0
- package/src/elevenlabs-speech-options.ts +12 -0
- package/src/elevenlabs-transcription-model.test.ts +389 -0
- package/src/elevenlabs-transcription-model.ts +183 -0
- package/src/elevenlabs-transcription-options.ts +4 -0
- package/src/index.ts +10 -0
- package/src/transcript-test.mp3 +0 -0
- package/src/version.ts +6 -0
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
import { SpeechModelV3, SharedV3Warning } from '@ai-sdk/provider';
|
|
2
|
+
import {
|
|
3
|
+
combineHeaders,
|
|
4
|
+
createBinaryResponseHandler,
|
|
5
|
+
parseProviderOptions,
|
|
6
|
+
postJsonToApi,
|
|
7
|
+
} from '@ai-sdk/provider-utils';
|
|
8
|
+
import { z } from 'zod/v4';
|
|
9
|
+
import { ElevenLabsConfig } from './elevenlabs-config';
|
|
10
|
+
import { elevenlabsFailedResponseHandler } from './elevenlabs-error';
|
|
11
|
+
import { ElevenLabsSpeechAPITypes } from './elevenlabs-speech-api-types';
|
|
12
|
+
import {
|
|
13
|
+
ElevenLabsSpeechModelId,
|
|
14
|
+
ElevenLabsSpeechVoiceId,
|
|
15
|
+
} from './elevenlabs-speech-options';
|
|
16
|
+
|
|
17
|
+
// Schema for camelCase input from users
|
|
18
|
+
const ElevenLabsProviderOptionsSchema = z.object({
|
|
19
|
+
languageCode: z.string().optional(),
|
|
20
|
+
voiceSettings: z
|
|
21
|
+
.object({
|
|
22
|
+
stability: z.number().min(0).max(1).optional(),
|
|
23
|
+
similarityBoost: z.number().min(0).max(1).optional(),
|
|
24
|
+
style: z.number().min(0).max(1).optional(),
|
|
25
|
+
useSpeakerBoost: z.boolean().optional(),
|
|
26
|
+
})
|
|
27
|
+
.optional(),
|
|
28
|
+
pronunciationDictionaryLocators: z
|
|
29
|
+
.array(
|
|
30
|
+
z.object({
|
|
31
|
+
pronunciationDictionaryId: z.string(),
|
|
32
|
+
versionId: z.string().optional(),
|
|
33
|
+
}),
|
|
34
|
+
)
|
|
35
|
+
.max(3)
|
|
36
|
+
.optional(),
|
|
37
|
+
seed: z.number().min(0).max(4294967295).optional(),
|
|
38
|
+
previousText: z.string().optional(),
|
|
39
|
+
nextText: z.string().optional(),
|
|
40
|
+
previousRequestIds: z.array(z.string()).max(3).optional(),
|
|
41
|
+
nextRequestIds: z.array(z.string()).max(3).optional(),
|
|
42
|
+
applyTextNormalization: z.enum(['auto', 'on', 'off']).optional(),
|
|
43
|
+
applyLanguageTextNormalization: z.boolean().optional(),
|
|
44
|
+
enableLogging: z.boolean().optional(),
|
|
45
|
+
});
|
|
46
|
+
|
|
47
|
+
export type ElevenLabsSpeechCallOptions = z.infer<
|
|
48
|
+
typeof ElevenLabsProviderOptionsSchema
|
|
49
|
+
>;
|
|
50
|
+
|
|
51
|
+
interface ElevenLabsSpeechModelConfig extends ElevenLabsConfig {
|
|
52
|
+
_internal?: {
|
|
53
|
+
currentDate?: () => Date;
|
|
54
|
+
};
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
/**
 * SpeechModelV3 implementation for the ElevenLabs text-to-speech API.
 *
 * Translates the AI SDK's camelCase call options into the snake_case JSON
 * body and query parameters expected by `POST /v1/text-to-speech/{voiceId}`,
 * and returns the binary audio response.
 */
export class ElevenLabsSpeechModel implements SpeechModelV3 {
  readonly specificationVersion = 'v3';

  get provider(): string {
    return this.config.provider;
  }

  constructor(
    readonly modelId: ElevenLabsSpeechModelId,
    private readonly config: ElevenLabsSpeechModelConfig,
  ) {}

  /**
   * Builds the request body, query parameters, and warnings for a
   * `doGenerate` call.
   *
   * NOTE: property insertion order on `requestBody` is deliberate — the
   * request body is later serialized with `JSON.stringify`, so reordering
   * these assignments changes the observable `request.body` string.
   */
  private async getArgs({
    text,
    // Hard-coded default voice ID used when the caller passes no voice —
    // TODO(review): confirm which ElevenLabs premade voice this maps to.
    voice = '21m00Tcm4TlvDq8ikWAM',
    outputFormat = 'mp3_44100_128',
    instructions,
    language,
    speed,
    providerOptions,
  }: Parameters<SpeechModelV3['doGenerate']>[0]) {
    const warnings: SharedV3Warning[] = [];

    // Parse provider options (validated against the camelCase schema above;
    // undefined when no `elevenlabs` options were provided).
    const elevenLabsOptions = await parseProviderOptions({
      provider: 'elevenlabs',
      providerOptions,
      schema: ElevenLabsProviderOptionsSchema,
    });

    // Create request body
    const requestBody: ElevenLabsSpeechAPITypes = {
      text,
      model_id: this.modelId,
    };

    // Prepare query parameters
    const queryParams: Record<string, string> = {};

    // Map outputFormat to ElevenLabs format (as query param).
    // Short aliases (e.g. 'mp3', 'pcm') expand to full ElevenLabs format
    // strings; unrecognized values pass through unchanged so callers can use
    // native ElevenLabs format names directly.
    if (outputFormat) {
      const formatMap: Record<string, string> = {
        mp3: 'mp3_44100_128',
        mp3_32: 'mp3_44100_32',
        mp3_64: 'mp3_44100_64',
        mp3_96: 'mp3_44100_96',
        mp3_128: 'mp3_44100_128',
        mp3_192: 'mp3_44100_192',
        pcm: 'pcm_44100',
        pcm_16000: 'pcm_16000',
        pcm_22050: 'pcm_22050',
        pcm_24000: 'pcm_24000',
        pcm_44100: 'pcm_44100',
        ulaw: 'ulaw_8000',
      };

      const mappedFormat = formatMap[outputFormat] || outputFormat;
      queryParams.output_format = mappedFormat;
    }

    // Add language code if provided (the standard call option takes
    // precedence over providerOptions.languageCode below).
    if (language) {
      requestBody.language_code = language;
    }

    // Accumulated voice settings; only attached to the body if non-empty.
    const voiceSettings: typeof requestBody.voice_settings = {};

    if (speed != null) {
      voiceSettings.speed = speed;
    }

    // Add provider-specific options - map from camelCase to snake_case
    if (elevenLabsOptions) {
      if (elevenLabsOptions.voiceSettings) {
        // Map camelCase voice settings to snake_case for API
        if (elevenLabsOptions.voiceSettings.stability != null) {
          voiceSettings.stability = elevenLabsOptions.voiceSettings.stability;
        }
        if (elevenLabsOptions.voiceSettings.similarityBoost != null) {
          voiceSettings.similarity_boost =
            elevenLabsOptions.voiceSettings.similarityBoost;
        }
        if (elevenLabsOptions.voiceSettings.style != null) {
          voiceSettings.style = elevenLabsOptions.voiceSettings.style;
        }
        if (elevenLabsOptions.voiceSettings.useSpeakerBoost != null) {
          voiceSettings.use_speaker_boost =
            elevenLabsOptions.voiceSettings.useSpeakerBoost;
        }
      }
      // Add language code from provider options if not already set
      if (elevenLabsOptions.languageCode && !requestBody.language_code) {
        requestBody.language_code = elevenLabsOptions.languageCode;
      }

      // Map pronunciation dictionary locators
      if (elevenLabsOptions.pronunciationDictionaryLocators) {
        requestBody.pronunciation_dictionary_locators =
          elevenLabsOptions.pronunciationDictionaryLocators.map(locator => ({
            pronunciation_dictionary_id: locator.pronunciationDictionaryId,
            // version_id is omitted (not sent as undefined) when absent.
            ...(locator.versionId && { version_id: locator.versionId }),
          }));
      }

      if (elevenLabsOptions.seed != null) {
        requestBody.seed = elevenLabsOptions.seed;
      }
      if (elevenLabsOptions.previousText) {
        requestBody.previous_text = elevenLabsOptions.previousText;
      }
      if (elevenLabsOptions.nextText) {
        requestBody.next_text = elevenLabsOptions.nextText;
      }

      // Add previous and next request IDs
      if (elevenLabsOptions.previousRequestIds) {
        requestBody.previous_request_ids = elevenLabsOptions.previousRequestIds;
      }
      if (elevenLabsOptions.nextRequestIds) {
        requestBody.next_request_ids = elevenLabsOptions.nextRequestIds;
      }

      // Add text normalization options
      if (elevenLabsOptions.applyTextNormalization) {
        requestBody.apply_text_normalization =
          elevenLabsOptions.applyTextNormalization;
      }
      if (elevenLabsOptions.applyLanguageTextNormalization != null) {
        requestBody.apply_language_text_normalization =
          elevenLabsOptions.applyLanguageTextNormalization;
      }

      // enable_logging is a query parameter
      if (elevenLabsOptions.enableLogging != null) {
        queryParams.enable_logging = String(elevenLabsOptions.enableLogging);
      }
    }

    // Only add voice_settings if there are settings to add
    if (Object.keys(voiceSettings).length > 0) {
      requestBody.voice_settings = voiceSettings;
    }

    // ElevenLabs has no instructions parameter; surface a warning instead of
    // silently dropping the caller's input.
    if (instructions) {
      warnings.push({
        type: 'unsupported',
        feature: 'instructions',
        details: `ElevenLabs speech models do not support instructions. Instructions parameter was ignored.`,
      });
    }

    return {
      requestBody,
      queryParams,
      warnings,
      voiceId: voice as ElevenLabsSpeechVoiceId,
    };
  }

  /**
   * Performs a single text-to-speech request and returns the generated audio
   * bytes plus request/response metadata.
   */
  async doGenerate(
    options: Parameters<SpeechModelV3['doGenerate']>[0],
  ): Promise<Awaited<ReturnType<SpeechModelV3['doGenerate']>>> {
    // Injectable clock for deterministic timestamps in tests.
    const currentDate = this.config._internal?.currentDate?.() ?? new Date();
    const { requestBody, queryParams, warnings, voiceId } =
      await this.getArgs(options);

    const {
      value: audio,
      responseHeaders,
      rawValue: rawResponse,
    } = await postJsonToApi({
      // The voice is addressed in the URL path; query params (output_format,
      // enable_logging) are appended only when present.
      url: (() => {
        const baseUrl = this.config.url({
          path: `/v1/text-to-speech/${voiceId}`,
          modelId: this.modelId,
        });
        const queryString = new URLSearchParams(queryParams).toString();
        return queryString ? `${baseUrl}?${queryString}` : baseUrl;
      })(),
      headers: combineHeaders(this.config.headers(), options.headers),
      body: requestBody,
      failedResponseHandler: elevenlabsFailedResponseHandler,
      // The success response is raw audio, not JSON.
      successfulResponseHandler: createBinaryResponseHandler(),
      abortSignal: options.abortSignal,
      fetch: this.config.fetch,
    });

    return {
      audio,
      warnings,
      request: {
        body: JSON.stringify(requestBody),
      },
      response: {
        timestamp: currentDate,
        modelId: this.modelId,
        headers: responseHeaders,
        body: rawResponse,
      },
    };
  }
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
// Known ElevenLabs TTS model IDs. The trailing `(string & {})` keeps the
// type open to future/unknown model IDs while preserving editor
// autocompletion for the listed literals.
export type ElevenLabsSpeechModelId =
  | 'eleven_v3'
  | 'eleven_multilingual_v2'
  | 'eleven_flash_v2_5'
  | 'eleven_flash_v2'
  | 'eleven_turbo_v2_5'
  | 'eleven_turbo_v2'
  | 'eleven_monolingual_v1'
  | 'eleven_multilingual_v1'
  | (string & {});

// Voice IDs are opaque ElevenLabs identifiers (e.g. '21m00Tcm4TlvDq8ikWAM').
export type ElevenLabsSpeechVoiceId = string;
|
|
@@ -0,0 +1,389 @@
|
|
|
1
|
+
import { createTestServer } from '@ai-sdk/test-server/with-vitest';
|
|
2
|
+
import { ElevenLabsTranscriptionModel } from './elevenlabs-transcription-model';
|
|
3
|
+
import { createElevenLabs } from './elevenlabs-provider';
|
|
4
|
+
import { readFile } from 'node:fs/promises';
|
|
5
|
+
import path from 'node:path';
|
|
6
|
+
import { describe, it, expect, vi } from 'vitest';
|
|
7
|
+
|
|
8
|
+
// Pin the package version so the user-agent assertion below is stable.
// (vi.mock is hoisted by vitest above the imports at runtime.)
vi.mock('./version', () => ({
  VERSION: '0.0.0-test',
}));

// Shared fixtures: a real audio file, a provider with a test key, and the
// default transcription model under test.
const audioData = await readFile(path.join(__dirname, 'transcript-test.mp3'));
const provider = createElevenLabs({ apiKey: 'test-api-key' });
const model = provider.transcription('scribe_v1');

// Intercepts all calls to the ElevenLabs speech-to-text endpoint.
const server = createTestServer({
  'https://api.elevenlabs.io/v1/speech-to-text': {},
});

describe('doGenerate', () => {
  // Stubs a full successful transcription response (words + spacing +
  // additional_formats). Optional `headers` are echoed on the mock response.
  function prepareJsonResponse({
    headers,
  }: {
    headers?: Record<string, string>;
  } = {}) {
    server.urls['https://api.elevenlabs.io/v1/speech-to-text'].response = {
      type: 'json-value',
      headers,
      body: {
        language_code: 'en',
        language_probability: 0.98,
        text: 'Hello world!',
        words: [
          {
            text: 'Hello',
            type: 'word',
            start: 0,
            end: 0.5,
            speaker_id: 'speaker_1',
            characters: [
              {
                text: 'text',
                start: 0,
                end: 0.1,
              },
            ],
          },
          {
            text: ' ',
            type: 'spacing',
            start: 0.5,
            end: 0.5,
            speaker_id: 'speaker_1',
            characters: [
              {
                text: 'text',
                start: 0,
                end: 0.1,
              },
            ],
          },
          {
            text: 'world!',
            type: 'word',
            start: 0.5,
            end: 1.2,
            speaker_id: 'speaker_1',
            characters: [
              {
                text: 'text',
                start: 0,
                end: 0.1,
              },
            ],
          },
        ],
        additional_formats: [
          {
            requested_format: 'requested_format',
            file_extension: 'file_extension',
            content_type: 'content_type',
            is_base64_encoded: true,
            content: 'content',
          },
        ],
      },
    };
  }

  // The model ID must be sent as the multipart field `model_id`.
  it('should pass the model', async () => {
    prepareJsonResponse();

    await model.doGenerate({
      audio: audioData,
      mediaType: 'audio/wav',
    });

    expect(await server.calls[0].requestBodyMultipart).toMatchObject({
      model_id: 'scribe_v1',
    });
  });

  // Provider-level and per-request headers must both reach the wire,
  // alongside the API key and the SDK user-agent.
  it('should pass headers', async () => {
    prepareJsonResponse();

    const provider = createElevenLabs({
      apiKey: 'test-api-key',
      headers: {
        'Custom-Provider-Header': 'provider-header-value',
      },
    });

    await provider.transcription('scribe_v1').doGenerate({
      audio: audioData,
      mediaType: 'audio/wav',
      headers: {
        'Custom-Request-Header': 'request-header-value',
      },
    });

    expect(server.calls[0].requestHeaders).toMatchObject({
      'xi-api-key': 'test-api-key',
      'content-type': expect.stringMatching(
        /^multipart\/form-data; boundary=----formdata-undici-\d+$/,
      ),
      'custom-provider-header': 'provider-header-value',
      'custom-request-header': 'request-header-value',
    });
    expect(server.calls[0].requestUserAgent).toContain(
      `ai-sdk/elevenlabs/0.0.0-test`,
    );
  });

  it('should extract the transcription text', async () => {
    prepareJsonResponse();

    const result = await model.doGenerate({
      audio: audioData,
      mediaType: 'audio/wav',
    });

    expect(result.text).toBe('Hello world!');
  });

  it('should include response data with timestamp, modelId and headers', async () => {
    prepareJsonResponse({
      headers: {
        'x-request-id': 'test-request-id',
        'x-ratelimit-remaining': '123',
      },
    });

    // Inject a fixed clock so the response timestamp is deterministic.
    const testDate = new Date(0);
    const customModel = new ElevenLabsTranscriptionModel('scribe_v1', {
      provider: 'test-provider',
      url: () => 'https://api.elevenlabs.io/v1/speech-to-text',
      headers: () => ({}),
      _internal: {
        currentDate: () => testDate,
      },
    });

    const result = await customModel.doGenerate({
      audio: audioData,
      mediaType: 'audio/wav',
    });

    expect(result.response).toMatchObject({
      timestamp: testDate,
      modelId: 'scribe_v1',
      headers: {
        'content-type': 'application/json',
        'x-request-id': 'test-request-id',
        'x-ratelimit-remaining': '123',
      },
    });
  });

  // NOTE(review): despite the test name, this test DOES supply a custom
  // `currentDate` provider and asserts against it — the name appears to be
  // inherited from a template; behavior left unchanged.
  it('should use real date when no custom date provider is specified', async () => {
    prepareJsonResponse();

    const testDate = new Date(0);
    const customModel = new ElevenLabsTranscriptionModel('scribe_v1', {
      provider: 'test-provider',
      url: () => 'https://api.elevenlabs.io/v1/speech-to-text',
      headers: () => ({}),
      _internal: {
        currentDate: () => testDate,
      },
    });

    const result = await customModel.doGenerate({
      audio: audioData,
      mediaType: 'audio/wav',
    });

    expect(result.response.timestamp.getTime()).toEqual(testDate.getTime());
    expect(result.response.modelId).toBe('scribe_v1');
  });

  // `additional_formats` is optional in the API response; the model must
  // still produce text, segments, language and duration without it.
  it('should work when no additional formats are returned', async () => {
    server.urls['https://api.elevenlabs.io/v1/speech-to-text'].response = {
      type: 'json-value',
      body: {
        language_code: 'en',
        language_probability: 0.98,
        text: 'Hello world!',
        words: [
          {
            text: 'Hello',
            type: 'word',
            start: 0,
            end: 0.5,
            speaker_id: 'speaker_1',
            characters: [
              {
                text: 'text',
                start: 0,
                end: 0.1,
              },
            ],
          },
          {
            text: ' ',
            type: 'spacing',
            start: 0.5,
            end: 0.5,
            speaker_id: 'speaker_1',
            characters: [
              {
                text: 'text',
                start: 0,
                end: 0.1,
              },
            ],
          },
          {
            text: 'world!',
            type: 'word',
            start: 0.5,
            end: 1.2,
            speaker_id: 'speaker_1',
            characters: [
              {
                text: 'text',
                start: 0,
                end: 0.1,
              },
            ],
          },
        ],
      },
    };

    const testDate = new Date(0);
    const customModel = new ElevenLabsTranscriptionModel('scribe_v1', {
      provider: 'test-provider',
      url: () => 'https://api.elevenlabs.io/v1/speech-to-text',
      headers: () => ({}),
      _internal: {
        currentDate: () => testDate,
      },
    });

    const result = await customModel.doGenerate({
      audio: audioData,
      mediaType: 'audio/wav',
    });

    expect(result).toMatchInlineSnapshot(`
      {
        "durationInSeconds": 1.2,
        "language": "en",
        "response": {
          "body": {
            "language_code": "en",
            "language_probability": 0.98,
            "text": "Hello world!",
            "words": [
              {
                "characters": [
                  {
                    "end": 0.1,
                    "start": 0,
                    "text": "text",
                  },
                ],
                "end": 0.5,
                "speaker_id": "speaker_1",
                "start": 0,
                "text": "Hello",
                "type": "word",
              },
              {
                "characters": [
                  {
                    "end": 0.1,
                    "start": 0,
                    "text": "text",
                  },
                ],
                "end": 0.5,
                "speaker_id": "speaker_1",
                "start": 0.5,
                "text": " ",
                "type": "spacing",
              },
              {
                "characters": [
                  {
                    "end": 0.1,
                    "start": 0,
                    "text": "text",
                  },
                ],
                "end": 1.2,
                "speaker_id": "speaker_1",
                "start": 0.5,
                "text": "world!",
                "type": "word",
              },
            ],
          },
          "headers": {
            "content-length": "467",
            "content-type": "application/json",
          },
          "modelId": "scribe_v1",
          "timestamp": 1970-01-01T00:00:00.000Z,
        },
        "segments": [
          {
            "endSecond": 0.5,
            "startSecond": 0,
            "text": "Hello",
          },
          {
            "endSecond": 0.5,
            "startSecond": 0.5,
            "text": " ",
          },
          {
            "endSecond": 1.2,
            "startSecond": 0.5,
            "text": "world!",
          },
        ],
        "text": "Hello world!",
        "warnings": [],
      }
    `);
  });

  // camelCase provider options must be mapped to snake_case multipart
  // fields; non-string values are stringified by FormData.
  it('should pass provider options correctly', async () => {
    prepareJsonResponse();

    await model.doGenerate({
      audio: audioData,
      mediaType: 'audio/wav',
      providerOptions: {
        elevenlabs: {
          languageCode: 'en',
          fileFormat: 'pcm_s16le_16',
          tagAudioEvents: false,
          numSpeakers: 2,
          timestampsGranularity: 'character',
          diarize: true,
        },
      },
    });

    expect(await server.calls[0].requestBodyMultipart).toMatchInlineSnapshot(`
      {
        "diarize": "true",
        "file": File {
          Symbol(kHandle): Blob {},
          Symbol(kLength): 40169,
          Symbol(kType): "audio/wav",
        },
        "file_format": "pcm_s16le_16",
        "language_code": "en",
        "model_id": "scribe_v1",
        "num_speakers": "2",
        "tag_audio_events": "false",
        "timestamps_granularity": "character",
      }
    `);
  });
});
|