@ai-sdk/gladia 2.0.7 → 2.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -0
- package/dist/index.js +1 -1
- package/dist/index.mjs +1 -1
- package/package.json +5 -4
- package/src/gladia-api-types.ts +134 -0
- package/src/gladia-config.ts +9 -0
- package/src/gladia-error.test.ts +34 -0
- package/src/gladia-error.ts +16 -0
- package/src/gladia-provider.ts +117 -0
- package/src/gladia-transcription-model.test.ts +230 -0
- package/src/gladia-transcription-model.ts +652 -0
- package/src/index.ts +3 -0
- package/src/transcript-test.mp3 +0 -0
- package/src/version.ts +6 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,19 @@
|
|
|
1
1
|
# @ai-sdk/gladia
|
|
2
2
|
|
|
3
|
+
## 2.0.9
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- 8dc54db: chore: add src folders to package bundle
|
|
8
|
+
|
|
9
|
+
## 2.0.8
|
|
10
|
+
|
|
11
|
+
### Patch Changes
|
|
12
|
+
|
|
13
|
+
- Updated dependencies [5c090e7]
|
|
14
|
+
- @ai-sdk/provider@3.0.4
|
|
15
|
+
- @ai-sdk/provider-utils@4.0.8
|
|
16
|
+
|
|
3
17
|
## 2.0.7
|
|
4
18
|
|
|
5
19
|
### Patch Changes
|
package/dist/index.js
CHANGED
|
@@ -553,7 +553,7 @@ var gladiaTranscriptionResultResponseSchema = import_v42.z.object({
|
|
|
553
553
|
});
|
|
554
554
|
|
|
555
555
|
// src/version.ts
|
|
556
|
-
var VERSION = true ? "2.0.7" : "0.0.0-test";
|
|
556
|
+
var VERSION = true ? "2.0.9" : "0.0.0-test";
|
|
557
557
|
|
|
558
558
|
// src/gladia-provider.ts
|
|
559
559
|
function createGladia(options = {}) {
|
package/dist/index.mjs
CHANGED
|
@@ -542,7 +542,7 @@ var gladiaTranscriptionResultResponseSchema = z2.object({
|
|
|
542
542
|
});
|
|
543
543
|
|
|
544
544
|
// src/version.ts
|
|
545
|
-
var VERSION = true ? "2.0.7" : "0.0.0-test";
|
|
545
|
+
var VERSION = true ? "2.0.9" : "0.0.0-test";
|
|
546
546
|
|
|
547
547
|
// src/gladia-provider.ts
|
|
548
548
|
function createGladia(options = {}) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ai-sdk/gladia",
|
|
3
|
-
"version": "2.0.7",
|
|
3
|
+
"version": "2.0.9",
|
|
4
4
|
"license": "Apache-2.0",
|
|
5
5
|
"sideEffects": false,
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
"types": "./dist/index.d.ts",
|
|
9
9
|
"files": [
|
|
10
10
|
"dist/**/*",
|
|
11
|
+
"src",
|
|
11
12
|
"CHANGELOG.md",
|
|
12
13
|
"README.md"
|
|
13
14
|
],
|
|
@@ -20,15 +21,15 @@
|
|
|
20
21
|
}
|
|
21
22
|
},
|
|
22
23
|
"dependencies": {
|
|
23
|
-
"@ai-sdk/provider": "3.0.
|
|
24
|
-
"@ai-sdk/provider-utils": "4.0.
|
|
24
|
+
"@ai-sdk/provider": "3.0.4",
|
|
25
|
+
"@ai-sdk/provider-utils": "4.0.8"
|
|
25
26
|
},
|
|
26
27
|
"devDependencies": {
|
|
27
28
|
"@types/node": "20.17.24",
|
|
28
29
|
"tsup": "^8",
|
|
29
30
|
"typescript": "5.6.3",
|
|
30
31
|
"zod": "3.25.76",
|
|
31
|
-
"@ai-sdk/test-server": "1.0.
|
|
32
|
+
"@ai-sdk/test-server": "1.0.2",
|
|
32
33
|
"@vercel/ai-tsconfig": "0.0.0"
|
|
33
34
|
},
|
|
34
35
|
"peerDependencies": {
|
package/src/gladia-api-types.ts
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
export type GladiaTranscriptionInitiateAPITypes = {
|
|
2
|
+
/** URL to a Gladia file or to an external audio or video file */
|
|
3
|
+
audio_url: string;
|
|
4
|
+
/** [Alpha] Context to feed the transcription model with for possible better accuracy */
|
|
5
|
+
context_prompt?: string;
|
|
6
|
+
/** [Beta] Can be either boolean to enable custom_vocabulary or an array with specific vocabulary */
|
|
7
|
+
custom_vocabulary?: boolean | any[];
|
|
8
|
+
/** [Beta] Custom vocabulary configuration */
|
|
9
|
+
custom_vocabulary_config?: {
|
|
10
|
+
/** Vocabulary array with string or object containing value, intensity, pronunciations, and language */
|
|
11
|
+
vocabulary: Array<
|
|
12
|
+
| string
|
|
13
|
+
| {
|
|
14
|
+
/** Vocabulary value */
|
|
15
|
+
value: string;
|
|
16
|
+
/** Intensity of the vocabulary */
|
|
17
|
+
intensity?: number;
|
|
18
|
+
/** Pronunciation variations */
|
|
19
|
+
pronunciations?: string[];
|
|
20
|
+
/** Language of the vocabulary */
|
|
21
|
+
language?: string;
|
|
22
|
+
}
|
|
23
|
+
>;
|
|
24
|
+
/** Default intensity for vocabulary */
|
|
25
|
+
default_intensity?: number;
|
|
26
|
+
};
|
|
27
|
+
/** Detect the language from the given audio */
|
|
28
|
+
detect_language?: boolean;
|
|
29
|
+
/** Detect multiple languages in the given audio */
|
|
30
|
+
enable_code_switching?: boolean;
|
|
31
|
+
/** Configuration for code-switching */
|
|
32
|
+
code_switching_config?: {
|
|
33
|
+
/** Specify the languages you want to use when detecting multiple languages */
|
|
34
|
+
languages?: string[];
|
|
35
|
+
};
|
|
36
|
+
/** The original language in iso639-1 format */
|
|
37
|
+
language?: string;
|
|
38
|
+
/** Enable callback for this transcription */
|
|
39
|
+
callback?: boolean;
|
|
40
|
+
/** Configuration for callback */
|
|
41
|
+
callback_config?: {
|
|
42
|
+
/** The URL to be called with the result of the transcription */
|
|
43
|
+
url: string;
|
|
44
|
+
/** The HTTP method to be used */
|
|
45
|
+
method?: 'POST' | 'PUT';
|
|
46
|
+
};
|
|
47
|
+
/** Enable subtitles generation for this transcription */
|
|
48
|
+
subtitles?: boolean;
|
|
49
|
+
/** Configuration for subtitles */
|
|
50
|
+
subtitles_config?: {
|
|
51
|
+
/** Subtitles formats */
|
|
52
|
+
formats?: ('srt' | 'vtt')[];
|
|
53
|
+
/** Minimum duration of a subtitle in seconds */
|
|
54
|
+
minimum_duration?: number;
|
|
55
|
+
/** Maximum duration of a subtitle in seconds */
|
|
56
|
+
maximum_duration?: number;
|
|
57
|
+
/** Maximum number of characters per row */
|
|
58
|
+
maximum_characters_per_row?: number;
|
|
59
|
+
/** Maximum number of rows per caption */
|
|
60
|
+
maximum_rows_per_caption?: number;
|
|
61
|
+
/** Style of the subtitles */
|
|
62
|
+
style?: 'default' | 'compliance';
|
|
63
|
+
};
|
|
64
|
+
/** Enable speaker recognition (diarization) for this audio */
|
|
65
|
+
diarization?: boolean;
|
|
66
|
+
/** Configuration for diarization */
|
|
67
|
+
diarization_config?: {
|
|
68
|
+
/** Exact number of speakers in the audio */
|
|
69
|
+
number_of_speakers?: number;
|
|
70
|
+
/** Minimum number of speakers in the audio */
|
|
71
|
+
min_speakers?: number;
|
|
72
|
+
/** Maximum number of speakers in the audio */
|
|
73
|
+
max_speakers?: number;
|
|
74
|
+
/** [Alpha] Use enhanced diarization for this audio */
|
|
75
|
+
enhanced?: boolean;
|
|
76
|
+
};
|
|
77
|
+
/** [Beta] Enable translation for this audio */
|
|
78
|
+
translation?: boolean;
|
|
79
|
+
/** Configuration for translation */
|
|
80
|
+
translation_config?: {
|
|
81
|
+
/** The target language in iso639-1 format */
|
|
82
|
+
target_languages: string[];
|
|
83
|
+
/** Model for translation */
|
|
84
|
+
model?: 'base' | 'enhanced';
|
|
85
|
+
/** Align translated utterances with the original ones */
|
|
86
|
+
match_original_utterances?: boolean;
|
|
87
|
+
};
|
|
88
|
+
/** [Beta] Enable summarization for this audio */
|
|
89
|
+
summarization?: boolean;
|
|
90
|
+
/** Configuration for summarization */
|
|
91
|
+
summarization_config?: {
|
|
92
|
+
/** The type of summarization to apply */
|
|
93
|
+
type?: 'general' | 'bullet_points' | 'concise';
|
|
94
|
+
};
|
|
95
|
+
/** [Alpha] Enable moderation for this audio */
|
|
96
|
+
moderation?: boolean;
|
|
97
|
+
/** [Alpha] Enable named entity recognition for this audio */
|
|
98
|
+
named_entity_recognition?: boolean;
|
|
99
|
+
/** [Alpha] Enable chapterization for this audio */
|
|
100
|
+
chapterization?: boolean;
|
|
101
|
+
/** [Alpha] Enable names consistency for this audio */
|
|
102
|
+
name_consistency?: boolean;
|
|
103
|
+
/** [Alpha] Enable custom spelling for this audio */
|
|
104
|
+
custom_spelling?: boolean;
|
|
105
|
+
/** Configuration for custom spelling */
|
|
106
|
+
custom_spelling_config?: {
|
|
107
|
+
/** The list of spelling applied on the audio transcription */
|
|
108
|
+
spelling_dictionary: Record<string, string[]>;
|
|
109
|
+
};
|
|
110
|
+
/** [Alpha] Enable structured data extraction for this audio */
|
|
111
|
+
structured_data_extraction?: boolean;
|
|
112
|
+
/** Configuration for structured data extraction */
|
|
113
|
+
structured_data_extraction_config?: {
|
|
114
|
+
/** The list of classes to extract from the audio transcription */
|
|
115
|
+
classes: string[];
|
|
116
|
+
};
|
|
117
|
+
/** [Alpha] Enable sentiment analysis for this audio */
|
|
118
|
+
sentiment_analysis?: boolean;
|
|
119
|
+
/** [Alpha] Enable audio to llm processing for this audio */
|
|
120
|
+
audio_to_llm?: boolean;
|
|
121
|
+
/** Configuration for audio to llm */
|
|
122
|
+
audio_to_llm_config?: {
|
|
123
|
+
/** The list of prompts applied on the audio transcription */
|
|
124
|
+
prompts: string[];
|
|
125
|
+
};
|
|
126
|
+
/** Custom metadata you can attach to this transcription */
|
|
127
|
+
custom_metadata?: Record<string, any>;
|
|
128
|
+
/** Enable sentences for this audio */
|
|
129
|
+
sentences?: boolean;
|
|
130
|
+
/** [Alpha] Allows to change the output display_mode for this audio */
|
|
131
|
+
display_mode?: boolean;
|
|
132
|
+
/** [Alpha] Use enhanced punctuation for this audio */
|
|
133
|
+
punctuation_enhanced?: boolean;
|
|
134
|
+
};
|
package/src/gladia-config.ts
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import { FetchFunction } from '@ai-sdk/provider-utils';
|
|
2
|
+
|
|
3
|
+
export type GladiaConfig = {
|
|
4
|
+
provider: string;
|
|
5
|
+
url: (options: { modelId: string; path: string }) => string;
|
|
6
|
+
headers: () => Record<string, string | undefined>;
|
|
7
|
+
fetch?: FetchFunction;
|
|
8
|
+
generateId?: () => string;
|
|
9
|
+
};
|
package/src/gladia-error.test.ts
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import { safeParseJSON } from '@ai-sdk/provider-utils';
|
|
2
|
+
import { gladiaErrorDataSchema } from './gladia-error';
|
|
3
|
+
import { describe, it, expect } from 'vitest';
|
|
4
|
+
|
|
5
|
+
describe('gladiaErrorDataSchema', () => {
|
|
6
|
+
it('should parse Gladia resource exhausted error', async () => {
|
|
7
|
+
const error = `
|
|
8
|
+
{"error":{"message":"{\\n \\"error\\": {\\n \\"code\\": 429,\\n \\"message\\": \\"Resource has been exhausted (e.g. check quota).\\",\\n \\"status\\": \\"RESOURCE_EXHAUSTED\\"\\n }\\n}\\n","code":429}}
|
|
9
|
+
`;
|
|
10
|
+
|
|
11
|
+
const result = await safeParseJSON({
|
|
12
|
+
text: error,
|
|
13
|
+
schema: gladiaErrorDataSchema,
|
|
14
|
+
});
|
|
15
|
+
|
|
16
|
+
expect(result).toStrictEqual({
|
|
17
|
+
success: true,
|
|
18
|
+
value: {
|
|
19
|
+
error: {
|
|
20
|
+
message:
|
|
21
|
+
'{\n "error": {\n "code": 429,\n "message": "Resource has been exhausted (e.g. check quota).",\n "status": "RESOURCE_EXHAUSTED"\n }\n}\n',
|
|
22
|
+
code: 429,
|
|
23
|
+
},
|
|
24
|
+
},
|
|
25
|
+
rawValue: {
|
|
26
|
+
error: {
|
|
27
|
+
message:
|
|
28
|
+
'{\n "error": {\n "code": 429,\n "message": "Resource has been exhausted (e.g. check quota).",\n "status": "RESOURCE_EXHAUSTED"\n }\n}\n',
|
|
29
|
+
code: 429,
|
|
30
|
+
},
|
|
31
|
+
},
|
|
32
|
+
});
|
|
33
|
+
});
|
|
34
|
+
});
|
package/src/gladia-error.ts
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { z } from 'zod/v4';
|
|
2
|
+
import { createJsonErrorResponseHandler } from '@ai-sdk/provider-utils';
|
|
3
|
+
|
|
4
|
+
export const gladiaErrorDataSchema = z.object({
|
|
5
|
+
error: z.object({
|
|
6
|
+
message: z.string(),
|
|
7
|
+
code: z.number(),
|
|
8
|
+
}),
|
|
9
|
+
});
|
|
10
|
+
|
|
11
|
+
export type GladiaErrorData = z.infer<typeof gladiaErrorDataSchema>;
|
|
12
|
+
|
|
13
|
+
export const gladiaFailedResponseHandler = createJsonErrorResponseHandler({
|
|
14
|
+
errorSchema: gladiaErrorDataSchema,
|
|
15
|
+
errorToMessage: data => data.error.message,
|
|
16
|
+
});
|
package/src/gladia-provider.ts
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import {
|
|
2
|
+
TranscriptionModelV3,
|
|
3
|
+
ProviderV3,
|
|
4
|
+
NoSuchModelError,
|
|
5
|
+
} from '@ai-sdk/provider';
|
|
6
|
+
import {
|
|
7
|
+
FetchFunction,
|
|
8
|
+
loadApiKey,
|
|
9
|
+
withUserAgentSuffix,
|
|
10
|
+
} from '@ai-sdk/provider-utils';
|
|
11
|
+
import { GladiaTranscriptionModel } from './gladia-transcription-model';
|
|
12
|
+
import { VERSION } from './version';
|
|
13
|
+
|
|
14
|
+
export interface GladiaProvider extends ProviderV3 {
|
|
15
|
+
(): {
|
|
16
|
+
transcription: GladiaTranscriptionModel;
|
|
17
|
+
};
|
|
18
|
+
|
|
19
|
+
/**
|
|
20
|
+
Creates a model for transcription.
|
|
21
|
+
*/
|
|
22
|
+
transcription(): TranscriptionModelV3;
|
|
23
|
+
|
|
24
|
+
/**
|
|
25
|
+
* @deprecated Use `embeddingModel` instead.
|
|
26
|
+
*/
|
|
27
|
+
textEmbeddingModel(modelId: string): never;
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
export interface GladiaProviderSettings {
|
|
31
|
+
/**
|
|
32
|
+
API key for authenticating requests.
|
|
33
|
+
*/
|
|
34
|
+
apiKey?: string;
|
|
35
|
+
|
|
36
|
+
/**
|
|
37
|
+
Custom headers to include in the requests.
|
|
38
|
+
*/
|
|
39
|
+
headers?: Record<string, string>;
|
|
40
|
+
|
|
41
|
+
/**
|
|
42
|
+
Custom fetch implementation. You can use it as a middleware to intercept requests,
|
|
43
|
+
or to provide a custom fetch implementation for e.g. testing.
|
|
44
|
+
*/
|
|
45
|
+
fetch?: FetchFunction;
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
/**
|
|
49
|
+
Create a Gladia provider instance.
|
|
50
|
+
*/
|
|
51
|
+
export function createGladia(
|
|
52
|
+
options: GladiaProviderSettings = {},
|
|
53
|
+
): GladiaProvider {
|
|
54
|
+
const getHeaders = () =>
|
|
55
|
+
withUserAgentSuffix(
|
|
56
|
+
{
|
|
57
|
+
'x-gladia-key': loadApiKey({
|
|
58
|
+
apiKey: options.apiKey,
|
|
59
|
+
environmentVariableName: 'GLADIA_API_KEY',
|
|
60
|
+
description: 'Gladia',
|
|
61
|
+
}),
|
|
62
|
+
...options.headers,
|
|
63
|
+
},
|
|
64
|
+
`ai-sdk/gladia/${VERSION}`,
|
|
65
|
+
);
|
|
66
|
+
|
|
67
|
+
const createTranscriptionModel = () =>
|
|
68
|
+
new GladiaTranscriptionModel('default', {
|
|
69
|
+
provider: `gladia.transcription`,
|
|
70
|
+
url: ({ path }) => `https://api.gladia.io${path}`,
|
|
71
|
+
headers: getHeaders,
|
|
72
|
+
fetch: options.fetch,
|
|
73
|
+
});
|
|
74
|
+
|
|
75
|
+
const provider = function () {
|
|
76
|
+
return {
|
|
77
|
+
transcription: createTranscriptionModel(),
|
|
78
|
+
};
|
|
79
|
+
};
|
|
80
|
+
|
|
81
|
+
provider.specificationVersion = 'v3' as const;
|
|
82
|
+
provider.transcription = createTranscriptionModel;
|
|
83
|
+
provider.transcriptionModel = createTranscriptionModel;
|
|
84
|
+
|
|
85
|
+
// Required ProviderV3 methods that are not supported
|
|
86
|
+
provider.languageModel = (modelId: string) => {
|
|
87
|
+
throw new NoSuchModelError({
|
|
88
|
+
modelId,
|
|
89
|
+
modelType: 'languageModel',
|
|
90
|
+
message: 'Gladia does not provide language models',
|
|
91
|
+
});
|
|
92
|
+
};
|
|
93
|
+
|
|
94
|
+
provider.embeddingModel = (modelId: string) => {
|
|
95
|
+
throw new NoSuchModelError({
|
|
96
|
+
modelId,
|
|
97
|
+
modelType: 'embeddingModel',
|
|
98
|
+
message: 'Gladia does not provide embedding models',
|
|
99
|
+
});
|
|
100
|
+
};
|
|
101
|
+
provider.textEmbeddingModel = provider.embeddingModel;
|
|
102
|
+
|
|
103
|
+
provider.imageModel = (modelId: string) => {
|
|
104
|
+
throw new NoSuchModelError({
|
|
105
|
+
modelId,
|
|
106
|
+
modelType: 'imageModel',
|
|
107
|
+
message: 'Gladia does not provide image models',
|
|
108
|
+
});
|
|
109
|
+
};
|
|
110
|
+
|
|
111
|
+
return provider as GladiaProvider;
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
/**
|
|
115
|
+
Default Gladia provider instance.
|
|
116
|
+
*/
|
|
117
|
+
export const gladia = createGladia();
|
package/src/gladia-transcription-model.test.ts
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
import { createTestServer } from '@ai-sdk/test-server/with-vitest';
|
|
2
|
+
import { GladiaTranscriptionModel } from './gladia-transcription-model';
|
|
3
|
+
import { createGladia } from './gladia-provider';
|
|
4
|
+
import { readFile } from 'node:fs/promises';
|
|
5
|
+
import path from 'node:path';
|
|
6
|
+
import { describe, it, expect, vi } from 'vitest';
|
|
7
|
+
|
|
8
|
+
vi.mock('./version', () => ({
|
|
9
|
+
VERSION: '0.0.0-test',
|
|
10
|
+
}));
|
|
11
|
+
|
|
12
|
+
const audioData = await readFile(path.join(__dirname, 'transcript-test.mp3'));
|
|
13
|
+
const provider = createGladia({ apiKey: 'test-api-key' });
|
|
14
|
+
const model = provider.transcription();
|
|
15
|
+
|
|
16
|
+
const server = createTestServer({
|
|
17
|
+
'https://api.gladia.io/v2/upload': {
|
|
18
|
+
response: {
|
|
19
|
+
type: 'json-value',
|
|
20
|
+
body: {
|
|
21
|
+
audio_url: 'https://storage.gladia.io/mock-upload-url',
|
|
22
|
+
audio_metadata: {
|
|
23
|
+
id: 'test-id',
|
|
24
|
+
filename: 'test-file.mp3',
|
|
25
|
+
extension: 'mp3',
|
|
26
|
+
size: 1024,
|
|
27
|
+
audio_duration: 60,
|
|
28
|
+
number_of_channels: 2,
|
|
29
|
+
},
|
|
30
|
+
},
|
|
31
|
+
},
|
|
32
|
+
},
|
|
33
|
+
'https://api.gladia.io/v2/pre-recorded': {},
|
|
34
|
+
'https://api.gladia.io/v2/transcription/test-id': {},
|
|
35
|
+
});
|
|
36
|
+
|
|
37
|
+
describe('doGenerate', () => {
|
|
38
|
+
function prepareJsonResponse({
|
|
39
|
+
headers,
|
|
40
|
+
}: {
|
|
41
|
+
headers?: Record<string, string>;
|
|
42
|
+
} = {}) {
|
|
43
|
+
// No need to set the upload response here as it's already set in the server creation
|
|
44
|
+
server.urls['https://api.gladia.io/v2/pre-recorded'].response = {
|
|
45
|
+
type: 'json-value',
|
|
46
|
+
headers,
|
|
47
|
+
body: {
|
|
48
|
+
id: 'test-id',
|
|
49
|
+
result_url: 'https://api.gladia.io/v2/transcription/test-id',
|
|
50
|
+
},
|
|
51
|
+
};
|
|
52
|
+
server.urls['https://api.gladia.io/v2/transcription/test-id'].response = {
|
|
53
|
+
type: 'json-value',
|
|
54
|
+
headers,
|
|
55
|
+
body: {
|
|
56
|
+
id: '45463597-20b7-4af7-b3b3-f5fb778203ab',
|
|
57
|
+
request_id: 'G-45463597',
|
|
58
|
+
version: 2,
|
|
59
|
+
status: 'done',
|
|
60
|
+
created_at: '2023-12-28T09:04:17.210Z',
|
|
61
|
+
completed_at: '2023-12-28T09:04:37.210Z',
|
|
62
|
+
custom_metadata: {},
|
|
63
|
+
error_code: null,
|
|
64
|
+
kind: 'pre-recorded',
|
|
65
|
+
file: {
|
|
66
|
+
id: 'test-id',
|
|
67
|
+
filename: 'test-file.mp3',
|
|
68
|
+
source: 'upload',
|
|
69
|
+
audio_duration: 60,
|
|
70
|
+
number_of_channels: 2,
|
|
71
|
+
},
|
|
72
|
+
request_params: {
|
|
73
|
+
audio_url: 'https://storage.gladia.io/mock-upload-url',
|
|
74
|
+
},
|
|
75
|
+
result: {
|
|
76
|
+
metadata: {
|
|
77
|
+
audio_duration: 60,
|
|
78
|
+
number_of_distinct_channels: 2,
|
|
79
|
+
billing_time: 60,
|
|
80
|
+
transcription_time: 20,
|
|
81
|
+
},
|
|
82
|
+
transcription: {
|
|
83
|
+
full_transcript: 'Smoke from hundreds of wildfires.',
|
|
84
|
+
languages: ['en'],
|
|
85
|
+
utterances: [
|
|
86
|
+
{
|
|
87
|
+
language: 'en',
|
|
88
|
+
start: 0,
|
|
89
|
+
end: 3,
|
|
90
|
+
confidence: 0.95,
|
|
91
|
+
channel: 1,
|
|
92
|
+
speaker: 1,
|
|
93
|
+
words: [
|
|
94
|
+
{
|
|
95
|
+
word: 'Smoke',
|
|
96
|
+
start: 0,
|
|
97
|
+
end: 1,
|
|
98
|
+
confidence: 0.95,
|
|
99
|
+
},
|
|
100
|
+
{
|
|
101
|
+
word: 'from',
|
|
102
|
+
start: 1,
|
|
103
|
+
end: 2,
|
|
104
|
+
confidence: 0.95,
|
|
105
|
+
},
|
|
106
|
+
{
|
|
107
|
+
word: 'hundreds',
|
|
108
|
+
start: 2,
|
|
109
|
+
end: 3,
|
|
110
|
+
confidence: 0.95,
|
|
111
|
+
},
|
|
112
|
+
],
|
|
113
|
+
text: 'Smoke from hundreds of wildfires.',
|
|
114
|
+
},
|
|
115
|
+
],
|
|
116
|
+
},
|
|
117
|
+
},
|
|
118
|
+
},
|
|
119
|
+
};
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
it('should pass the model', async () => {
|
|
123
|
+
prepareJsonResponse();
|
|
124
|
+
|
|
125
|
+
await model.doGenerate({
|
|
126
|
+
audio: audioData,
|
|
127
|
+
mediaType: 'audio/wav',
|
|
128
|
+
});
|
|
129
|
+
|
|
130
|
+
expect(await server.calls[1].requestBodyJson).toMatchObject({
|
|
131
|
+
audio_url: 'https://storage.gladia.io/mock-upload-url',
|
|
132
|
+
});
|
|
133
|
+
});
|
|
134
|
+
|
|
135
|
+
it('should pass headers', async () => {
|
|
136
|
+
prepareJsonResponse();
|
|
137
|
+
|
|
138
|
+
const provider = createGladia({
|
|
139
|
+
apiKey: 'test-api-key',
|
|
140
|
+
headers: {
|
|
141
|
+
'Custom-Provider-Header': 'provider-header-value',
|
|
142
|
+
},
|
|
143
|
+
});
|
|
144
|
+
|
|
145
|
+
await provider.transcription().doGenerate({
|
|
146
|
+
audio: audioData,
|
|
147
|
+
mediaType: 'audio/wav',
|
|
148
|
+
headers: {
|
|
149
|
+
'Custom-Request-Header': 'request-header-value',
|
|
150
|
+
},
|
|
151
|
+
});
|
|
152
|
+
|
|
153
|
+
expect(server.calls[1].requestHeaders).toMatchObject({
|
|
154
|
+
'x-gladia-key': 'test-api-key',
|
|
155
|
+
'content-type': 'application/json',
|
|
156
|
+
'custom-provider-header': 'provider-header-value',
|
|
157
|
+
'custom-request-header': 'request-header-value',
|
|
158
|
+
});
|
|
159
|
+
expect(server.calls[0].requestUserAgent).toContain(
|
|
160
|
+
`ai-sdk/gladia/0.0.0-test`,
|
|
161
|
+
);
|
|
162
|
+
});
|
|
163
|
+
|
|
164
|
+
it('should extract the transcription text', async () => {
|
|
165
|
+
prepareJsonResponse();
|
|
166
|
+
|
|
167
|
+
const result = await model.doGenerate({
|
|
168
|
+
audio: audioData,
|
|
169
|
+
mediaType: 'audio/wav',
|
|
170
|
+
});
|
|
171
|
+
|
|
172
|
+
expect(result.text).toBe('Smoke from hundreds of wildfires.');
|
|
173
|
+
});
|
|
174
|
+
|
|
175
|
+
it('should include response data with timestamp, modelId and headers', async () => {
|
|
176
|
+
prepareJsonResponse({
|
|
177
|
+
headers: {
|
|
178
|
+
'x-request-id': 'test-request-id',
|
|
179
|
+
'x-ratelimit-remaining': '123',
|
|
180
|
+
},
|
|
181
|
+
});
|
|
182
|
+
|
|
183
|
+
const testDate = new Date(0);
|
|
184
|
+
const customModel = new GladiaTranscriptionModel('default', {
|
|
185
|
+
provider: 'test-provider',
|
|
186
|
+
url: ({ path }) => `https://api.gladia.io${path}`,
|
|
187
|
+
headers: () => ({}),
|
|
188
|
+
_internal: {
|
|
189
|
+
currentDate: () => testDate,
|
|
190
|
+
},
|
|
191
|
+
});
|
|
192
|
+
|
|
193
|
+
const result = await customModel.doGenerate({
|
|
194
|
+
audio: audioData,
|
|
195
|
+
mediaType: 'audio/wav',
|
|
196
|
+
});
|
|
197
|
+
|
|
198
|
+
expect(result.response).toMatchObject({
|
|
199
|
+
timestamp: testDate,
|
|
200
|
+
modelId: 'default',
|
|
201
|
+
headers: {
|
|
202
|
+
'content-type': 'application/json',
|
|
203
|
+
'x-request-id': 'test-request-id',
|
|
204
|
+
'x-ratelimit-remaining': '123',
|
|
205
|
+
},
|
|
206
|
+
});
|
|
207
|
+
});
|
|
208
|
+
|
|
209
|
+
it('should use real date when no custom date provider is specified', async () => {
|
|
210
|
+
prepareJsonResponse();
|
|
211
|
+
|
|
212
|
+
const testDate = new Date(0);
|
|
213
|
+
const customModel = new GladiaTranscriptionModel('default', {
|
|
214
|
+
provider: 'test-provider',
|
|
215
|
+
url: ({ path }) => `https://api.gladia.io${path}`,
|
|
216
|
+
headers: () => ({}),
|
|
217
|
+
_internal: {
|
|
218
|
+
currentDate: () => testDate,
|
|
219
|
+
},
|
|
220
|
+
});
|
|
221
|
+
|
|
222
|
+
const result = await customModel.doGenerate({
|
|
223
|
+
audio: audioData,
|
|
224
|
+
mediaType: 'audio/wav',
|
|
225
|
+
});
|
|
226
|
+
|
|
227
|
+
expect(result.response.timestamp.getTime()).toEqual(testDate.getTime());
|
|
228
|
+
expect(result.response.modelId).toBe('default');
|
|
229
|
+
});
|
|
230
|
+
});
|
package/src/gladia-transcription-model.ts
ADDED
|
@@ -0,0 +1,652 @@
|
|
|
1
|
+
import {
|
|
2
|
+
AISDKError,
|
|
3
|
+
TranscriptionModelV3,
|
|
4
|
+
SharedV3Warning,
|
|
5
|
+
} from '@ai-sdk/provider';
|
|
6
|
+
import {
|
|
7
|
+
combineHeaders,
|
|
8
|
+
convertBase64ToUint8Array,
|
|
9
|
+
createJsonResponseHandler,
|
|
10
|
+
mediaTypeToExtension,
|
|
11
|
+
delay,
|
|
12
|
+
getFromApi,
|
|
13
|
+
parseProviderOptions,
|
|
14
|
+
postFormDataToApi,
|
|
15
|
+
postJsonToApi,
|
|
16
|
+
} from '@ai-sdk/provider-utils';
|
|
17
|
+
import { z } from 'zod/v4';
|
|
18
|
+
import { GladiaConfig } from './gladia-config';
|
|
19
|
+
import { gladiaFailedResponseHandler } from './gladia-error';
|
|
20
|
+
import { GladiaTranscriptionInitiateAPITypes } from './gladia-api-types';
|
|
21
|
+
|
|
22
|
+
// https://docs.gladia.io/api-reference/v2/pre-recorded/init
|
|
23
|
+
const gladiaProviderOptionsSchema = z.object({
|
|
24
|
+
/**
|
|
25
|
+
* Optional context prompt to guide the transcription.
|
|
26
|
+
*/
|
|
27
|
+
contextPrompt: z.string().nullish(),
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* Custom vocabulary to improve transcription accuracy.
|
|
31
|
+
* Can be a boolean or an array of custom terms.
|
|
32
|
+
*/
|
|
33
|
+
customVocabulary: z.union([z.boolean(), z.array(z.any())]).nullish(),
|
|
34
|
+
|
|
35
|
+
/**
|
|
36
|
+
* Configuration for custom vocabulary.
|
|
37
|
+
*/
|
|
38
|
+
customVocabularyConfig: z
|
|
39
|
+
.object({
|
|
40
|
+
/**
|
|
41
|
+
* Array of vocabulary terms or objects with pronunciation details.
|
|
42
|
+
*/
|
|
43
|
+
vocabulary: z.array(
|
|
44
|
+
z.union([
|
|
45
|
+
z.string(),
|
|
46
|
+
z.object({
|
|
47
|
+
/**
|
|
48
|
+
* The vocabulary term.
|
|
49
|
+
*/
|
|
50
|
+
value: z.string(),
|
|
51
|
+
/**
|
|
52
|
+
* Intensity of the term in recognition (optional).
|
|
53
|
+
*/
|
|
54
|
+
intensity: z.number().nullish(),
|
|
55
|
+
/**
|
|
56
|
+
* Alternative pronunciations for the term (optional).
|
|
57
|
+
*/
|
|
58
|
+
pronunciations: z.array(z.string()).nullish(),
|
|
59
|
+
/**
|
|
60
|
+
* Language of the term (optional).
|
|
61
|
+
*/
|
|
62
|
+
language: z.string().nullish(),
|
|
63
|
+
}),
|
|
64
|
+
]),
|
|
65
|
+
),
|
|
66
|
+
/**
|
|
67
|
+
* Default intensity for all vocabulary terms.
|
|
68
|
+
*/
|
|
69
|
+
defaultIntensity: z.number().nullish(),
|
|
70
|
+
})
|
|
71
|
+
.nullish(),
|
|
72
|
+
|
|
73
|
+
/**
|
|
74
|
+
* Whether to automatically detect the language of the audio.
|
|
75
|
+
*/
|
|
76
|
+
detectLanguage: z.boolean().nullish(),
|
|
77
|
+
|
|
78
|
+
/**
|
|
79
|
+
* Whether to enable code switching (multiple languages in the same audio).
|
|
80
|
+
*/
|
|
81
|
+
enableCodeSwitching: z.boolean().nullish(),
|
|
82
|
+
|
|
83
|
+
/**
|
|
84
|
+
* Configuration for code switching.
|
|
85
|
+
*/
|
|
86
|
+
codeSwitchingConfig: z
|
|
87
|
+
.object({
|
|
88
|
+
/**
|
|
89
|
+
* Languages to consider for code switching.
|
|
90
|
+
*/
|
|
91
|
+
languages: z.array(z.string()).nullish(),
|
|
92
|
+
})
|
|
93
|
+
.nullish(),
|
|
94
|
+
|
|
95
|
+
/**
|
|
96
|
+
* Specific language for transcription.
|
|
97
|
+
*/
|
|
98
|
+
language: z.string().nullish(),
|
|
99
|
+
|
|
100
|
+
/**
|
|
101
|
+
* Whether to enable callback when transcription is complete.
|
|
102
|
+
*/
|
|
103
|
+
callback: z.boolean().nullish(),
|
|
104
|
+
|
|
105
|
+
/**
|
|
106
|
+
* Configuration for callback.
|
|
107
|
+
*/
|
|
108
|
+
callbackConfig: z
|
|
109
|
+
.object({
|
|
110
|
+
/**
|
|
111
|
+
* URL to send the callback to.
|
|
112
|
+
*/
|
|
113
|
+
url: z.string(),
|
|
114
|
+
/**
|
|
115
|
+
* HTTP method for the callback.
|
|
116
|
+
*/
|
|
117
|
+
method: z.enum(['POST', 'PUT']).nullish(),
|
|
118
|
+
})
|
|
119
|
+
.nullish(),
|
|
120
|
+
|
|
121
|
+
/**
|
|
122
|
+
* Whether to generate subtitles.
|
|
123
|
+
*/
|
|
124
|
+
subtitles: z.boolean().nullish(),
|
|
125
|
+
|
|
126
|
+
/**
|
|
127
|
+
* Configuration for subtitles generation.
|
|
128
|
+
*/
|
|
129
|
+
subtitlesConfig: z
|
|
130
|
+
.object({
|
|
131
|
+
/**
|
|
132
|
+
* Subtitle file formats to generate.
|
|
133
|
+
*/
|
|
134
|
+
formats: z.array(z.enum(['srt', 'vtt'])).nullish(),
|
|
135
|
+
/**
|
|
136
|
+
* Minimum duration for subtitle segments.
|
|
137
|
+
*/
|
|
138
|
+
minimumDuration: z.number().nullish(),
|
|
139
|
+
/**
|
|
140
|
+
* Maximum duration for subtitle segments.
|
|
141
|
+
*/
|
|
142
|
+
maximumDuration: z.number().nullish(),
|
|
143
|
+
/**
|
|
144
|
+
* Maximum characters per row in subtitles.
|
|
145
|
+
*/
|
|
146
|
+
maximumCharactersPerRow: z.number().nullish(),
|
|
147
|
+
/**
|
|
148
|
+
* Maximum rows per caption in subtitles.
|
|
149
|
+
*/
|
|
150
|
+
maximumRowsPerCaption: z.number().nullish(),
|
|
151
|
+
/**
|
|
152
|
+
* Style of subtitles.
|
|
153
|
+
*/
|
|
154
|
+
style: z.enum(['default', 'compliance']).nullish(),
|
|
155
|
+
})
|
|
156
|
+
.nullish(),
|
|
157
|
+
|
|
158
|
+
/**
|
|
159
|
+
* Whether to enable speaker diarization (speaker identification).
|
|
160
|
+
*/
|
|
161
|
+
diarization: z.boolean().nullish(),
|
|
162
|
+
|
|
163
|
+
/**
|
|
164
|
+
* Configuration for diarization.
|
|
165
|
+
*/
|
|
166
|
+
diarizationConfig: z
|
|
167
|
+
.object({
|
|
168
|
+
/**
|
|
169
|
+
* Exact number of speakers to identify.
|
|
170
|
+
*/
|
|
171
|
+
numberOfSpeakers: z.number().nullish(),
|
|
172
|
+
/**
|
|
173
|
+
* Minimum number of speakers to identify.
|
|
174
|
+
*/
|
|
175
|
+
minSpeakers: z.number().nullish(),
|
|
176
|
+
/**
|
|
177
|
+
* Maximum number of speakers to identify.
|
|
178
|
+
*/
|
|
179
|
+
maxSpeakers: z.number().nullish(),
|
|
180
|
+
/**
|
|
181
|
+
* Whether to use enhanced diarization.
|
|
182
|
+
*/
|
|
183
|
+
enhanced: z.boolean().nullish(),
|
|
184
|
+
})
|
|
185
|
+
.nullish(),
|
|
186
|
+
|
|
187
|
+
/**
|
|
188
|
+
* Whether to translate the transcription.
|
|
189
|
+
*/
|
|
190
|
+
translation: z.boolean().nullish(),
|
|
191
|
+
|
|
192
|
+
/**
|
|
193
|
+
* Configuration for translation.
|
|
194
|
+
*/
|
|
195
|
+
translationConfig: z
|
|
196
|
+
.object({
|
|
197
|
+
/**
|
|
198
|
+
* Target languages for translation.
|
|
199
|
+
*/
|
|
200
|
+
targetLanguages: z.array(z.string()),
|
|
201
|
+
/**
|
|
202
|
+
* Translation model to use.
|
|
203
|
+
*/
|
|
204
|
+
model: z.enum(['base', 'enhanced']).nullish(),
|
|
205
|
+
/**
|
|
206
|
+
* Whether to match original utterances in translation.
|
|
207
|
+
*/
|
|
208
|
+
matchOriginalUtterances: z.boolean().nullish(),
|
|
209
|
+
})
|
|
210
|
+
.nullish(),
|
|
211
|
+
|
|
212
|
+
/**
|
|
213
|
+
* Whether to generate a summary of the transcription.
|
|
214
|
+
*/
|
|
215
|
+
summarization: z.boolean().nullish(),
|
|
216
|
+
|
|
217
|
+
/**
|
|
218
|
+
* Configuration for summarization.
|
|
219
|
+
*/
|
|
220
|
+
summarizationConfig: z
|
|
221
|
+
.object({
|
|
222
|
+
/**
|
|
223
|
+
* Type of summary to generate.
|
|
224
|
+
*/
|
|
225
|
+
type: z.enum(['general', 'bullet_points', 'concise']).nullish(),
|
|
226
|
+
})
|
|
227
|
+
.nullish(),
|
|
228
|
+
|
|
229
|
+
/**
|
|
230
|
+
* Whether to enable content moderation.
|
|
231
|
+
*/
|
|
232
|
+
moderation: z.boolean().nullish(),
|
|
233
|
+
|
|
234
|
+
/**
|
|
235
|
+
* Whether to enable named entity recognition.
|
|
236
|
+
*/
|
|
237
|
+
namedEntityRecognition: z.boolean().nullish(),
|
|
238
|
+
|
|
239
|
+
/**
|
|
240
|
+
* Whether to enable automatic chapter creation.
|
|
241
|
+
*/
|
|
242
|
+
chapterization: z.boolean().nullish(),
|
|
243
|
+
|
|
244
|
+
/**
|
|
245
|
+
* Whether to ensure consistent naming of entities.
|
|
246
|
+
*/
|
|
247
|
+
nameConsistency: z.boolean().nullish(),
|
|
248
|
+
|
|
249
|
+
/**
|
|
250
|
+
* Whether to enable custom spelling.
|
|
251
|
+
*/
|
|
252
|
+
customSpelling: z.boolean().nullish(),
|
|
253
|
+
|
|
254
|
+
/**
|
|
255
|
+
* Configuration for custom spelling.
|
|
256
|
+
*/
|
|
257
|
+
customSpellingConfig: z
|
|
258
|
+
.object({
|
|
259
|
+
/**
|
|
260
|
+
* Dictionary of custom spellings.
|
|
261
|
+
*/
|
|
262
|
+
spellingDictionary: z.record(z.string(), z.array(z.string())),
|
|
263
|
+
})
|
|
264
|
+
.nullish(),
|
|
265
|
+
|
|
266
|
+
/**
|
|
267
|
+
* Whether to extract structured data from the transcription.
|
|
268
|
+
*/
|
|
269
|
+
structuredDataExtraction: z.boolean().nullish(),
|
|
270
|
+
|
|
271
|
+
/**
|
|
272
|
+
* Configuration for structured data extraction.
|
|
273
|
+
*/
|
|
274
|
+
structuredDataExtractionConfig: z
|
|
275
|
+
.object({
|
|
276
|
+
/**
|
|
277
|
+
* Classes of data to extract.
|
|
278
|
+
*/
|
|
279
|
+
classes: z.array(z.string()),
|
|
280
|
+
})
|
|
281
|
+
.nullish(),
|
|
282
|
+
|
|
283
|
+
/**
|
|
284
|
+
* Whether to perform sentiment analysis on the transcription.
|
|
285
|
+
*/
|
|
286
|
+
sentimentAnalysis: z.boolean().nullish(),
|
|
287
|
+
|
|
288
|
+
/**
|
|
289
|
+
* Whether to send audio to a language model for processing.
|
|
290
|
+
*/
|
|
291
|
+
audioToLlm: z.boolean().nullish(),
|
|
292
|
+
|
|
293
|
+
/**
|
|
294
|
+
* Configuration for audio to language model processing.
|
|
295
|
+
*/
|
|
296
|
+
audioToLlmConfig: z
|
|
297
|
+
.object({
|
|
298
|
+
/**
|
|
299
|
+
* Prompts to send to the language model.
|
|
300
|
+
*/
|
|
301
|
+
prompts: z.array(z.string()),
|
|
302
|
+
})
|
|
303
|
+
.nullish(),
|
|
304
|
+
|
|
305
|
+
/**
|
|
306
|
+
* Custom metadata to include with the transcription.
|
|
307
|
+
*/
|
|
308
|
+
customMetadata: z.record(z.string(), z.any()).nullish(),
|
|
309
|
+
|
|
310
|
+
/**
|
|
311
|
+
* Whether to include sentence-level segmentation.
|
|
312
|
+
*/
|
|
313
|
+
sentences: z.boolean().nullish(),
|
|
314
|
+
|
|
315
|
+
/**
|
|
316
|
+
* Whether to enable display mode.
|
|
317
|
+
*/
|
|
318
|
+
displayMode: z.boolean().nullish(),
|
|
319
|
+
|
|
320
|
+
/**
|
|
321
|
+
* Whether to enhance punctuation in the transcription.
|
|
322
|
+
*/
|
|
323
|
+
punctuationEnhanced: z.boolean().nullish(),
|
|
324
|
+
});
|
|
325
|
+
|
|
326
|
+
/**
 * Provider-specific call options for Gladia transcription, derived from
 * `gladiaProviderOptionsSchema` so the type always matches what is validated.
 */
export type GladiaTranscriptionCallOptions = z.infer<typeof gladiaProviderOptionsSchema>;
|
|
329
|
+
|
|
330
|
+
/**
 * Configuration for `GladiaTranscriptionModel`: the shared `GladiaConfig`
 * plus optional internal hooks.
 */
interface GladiaTranscriptionModelConfig extends GladiaConfig {
  /**
   * Internal hooks. When `currentDate` is provided, the model calls it to
   * obtain the response timestamp instead of using `new Date()`.
   */
  _internal?: { currentDate?: () => Date };
}
|
|
335
|
+
|
|
336
|
+
/**
 * Transcription model backed by the Gladia API.
 *
 * `doGenerate` uploads the audio (`/v2/upload`), starts a transcription job
 * (`/v2/pre-recorded`) and then polls the job's `result_url` until the job
 * reports `done` or `error`, or until the polling timeout elapses.
 */
export class GladiaTranscriptionModel implements TranscriptionModelV3 {
  readonly specificationVersion = 'v3';

  /** Provider identifier, read from the model configuration. */
  get provider(): string {
    return this.config.provider;
  }

  constructor(
    readonly modelId: 'default',
    private readonly config: GladiaTranscriptionModelConfig,
  ) {}

  /**
   * Validates the `gladia` provider options and converts them from the
   * camelCase option names to the snake_case request body fields.
   *
   * @returns The request body (without `audio_url`) and the list of warnings
   *   (currently always empty).
   */
  private async getArgs({
    providerOptions,
  }: Parameters<TranscriptionModelV3['doGenerate']>[0]) {
    const warnings: SharedV3Warning[] = [];

    // Parse provider options
    const gladiaOptions = await parseProviderOptions({
      provider: 'gladia',
      providerOptions,
      schema: gladiaProviderOptionsSchema,
    });

    const body: Omit<GladiaTranscriptionInitiateAPITypes, 'audio_url'> = {};

    // Add provider-specific options. `null` values are normalised to
    // `undefined` via `?? undefined`.
    if (gladiaOptions) {
      body.context_prompt = gladiaOptions.contextPrompt ?? undefined;
      body.custom_vocabulary = gladiaOptions.customVocabulary ?? undefined;
      body.detect_language = gladiaOptions.detectLanguage ?? undefined;
      body.enable_code_switching =
        gladiaOptions.enableCodeSwitching ?? undefined;
      body.language = gladiaOptions.language ?? undefined;
      body.callback = gladiaOptions.callback ?? undefined;
      body.subtitles = gladiaOptions.subtitles ?? undefined;
      body.diarization = gladiaOptions.diarization ?? undefined;
      body.translation = gladiaOptions.translation ?? undefined;
      body.summarization = gladiaOptions.summarization ?? undefined;
      body.moderation = gladiaOptions.moderation ?? undefined;
      body.named_entity_recognition =
        gladiaOptions.namedEntityRecognition ?? undefined;
      body.chapterization = gladiaOptions.chapterization ?? undefined;
      body.name_consistency = gladiaOptions.nameConsistency ?? undefined;
      body.custom_spelling = gladiaOptions.customSpelling ?? undefined;
      body.structured_data_extraction =
        gladiaOptions.structuredDataExtraction ?? undefined;
      body.structured_data_extraction_config =
        gladiaOptions.structuredDataExtractionConfig ?? undefined;
      body.sentiment_analysis = gladiaOptions.sentimentAnalysis ?? undefined;
      body.audio_to_llm = gladiaOptions.audioToLlm ?? undefined;
      body.audio_to_llm_config = gladiaOptions.audioToLlmConfig ?? undefined;
      body.custom_metadata = gladiaOptions.customMetadata ?? undefined;
      body.sentences = gladiaOptions.sentences ?? undefined;
      body.display_mode = gladiaOptions.displayMode ?? undefined;
      body.punctuation_enhanced =
        gladiaOptions.punctuationEnhanced ?? undefined;

      // Handle custom vocabulary config: entries are either plain strings or
      // objects with optional tuning fields.
      if (gladiaOptions.customVocabularyConfig) {
        body.custom_vocabulary_config = {
          vocabulary: gladiaOptions.customVocabularyConfig.vocabulary.map(
            item => {
              if (typeof item === 'string') return item;
              return {
                value: item.value,
                intensity: item.intensity ?? undefined,
                pronunciations: item.pronunciations ?? undefined,
                language: item.language ?? undefined,
              };
            },
          ),
          default_intensity:
            gladiaOptions.customVocabularyConfig.defaultIntensity ?? undefined,
        };
      }

      // Handle code switching config
      if (gladiaOptions.codeSwitchingConfig) {
        body.code_switching_config = {
          languages: gladiaOptions.codeSwitchingConfig.languages ?? undefined,
        };
      }

      // Handle callback config
      if (gladiaOptions.callbackConfig) {
        body.callback_config = {
          url: gladiaOptions.callbackConfig.url,
          method: gladiaOptions.callbackConfig.method ?? undefined,
        };
      }

      // Handle subtitles config
      if (gladiaOptions.subtitlesConfig) {
        body.subtitles_config = {
          formats: gladiaOptions.subtitlesConfig.formats ?? undefined,
          minimum_duration:
            gladiaOptions.subtitlesConfig.minimumDuration ?? undefined,
          maximum_duration:
            gladiaOptions.subtitlesConfig.maximumDuration ?? undefined,
          maximum_characters_per_row:
            gladiaOptions.subtitlesConfig.maximumCharactersPerRow ?? undefined,
          maximum_rows_per_caption:
            gladiaOptions.subtitlesConfig.maximumRowsPerCaption ?? undefined,
          style: gladiaOptions.subtitlesConfig.style ?? undefined,
        };
      }

      // Handle diarization config
      if (gladiaOptions.diarizationConfig) {
        body.diarization_config = {
          number_of_speakers:
            gladiaOptions.diarizationConfig.numberOfSpeakers ?? undefined,
          min_speakers:
            gladiaOptions.diarizationConfig.minSpeakers ?? undefined,
          max_speakers:
            gladiaOptions.diarizationConfig.maxSpeakers ?? undefined,
          enhanced: gladiaOptions.diarizationConfig.enhanced ?? undefined,
        };
      }

      // Handle translation config
      if (gladiaOptions.translationConfig) {
        body.translation_config = {
          target_languages: gladiaOptions.translationConfig.targetLanguages,
          model: gladiaOptions.translationConfig.model ?? undefined,
          match_original_utterances:
            gladiaOptions.translationConfig.matchOriginalUtterances ??
            undefined,
        };
      }

      // Handle summarization config
      if (gladiaOptions.summarizationConfig) {
        body.summarization_config = {
          type: gladiaOptions.summarizationConfig.type ?? undefined,
        };
      }

      // Handle custom spelling config
      if (gladiaOptions.customSpellingConfig) {
        body.custom_spelling_config = {
          spelling_dictionary:
            gladiaOptions.customSpellingConfig.spellingDictionary,
        };
      }
    }

    return {
      body,
      warnings,
    };
  }

  /**
   * Transcribes the given audio.
   *
   * @param options Call options: `audio` (a `Uint8Array` or a base64 string),
   *   `mediaType`, and optional `headers`, `abortSignal` and
   *   `providerOptions`.
   * @returns The full transcript, audio duration, first detected language,
   *   utterance segments, response metadata and the raw result under
   *   `providerMetadata.gladia`.
   * @throws AISDKError named `TranscriptionJobPollingTimedOut` when polling
   *   exceeds the timeout, `TranscriptionJobFailed` when the job reports
   *   `error`, and `TranscriptionResultEmpty` when a finished job carries no
   *   result.
   */
  async doGenerate(
    options: Parameters<TranscriptionModelV3['doGenerate']>[0],
  ): Promise<Awaited<ReturnType<TranscriptionModelV3['doGenerate']>>> {
    const currentDate = this.config._internal?.currentDate?.() ?? new Date();

    // Validate provider options before any network traffic so that invalid
    // options fail fast instead of after the audio has been uploaded.
    const { body, warnings } = await this.getArgs(options);

    // Create form data with base fields
    const audioBytes =
      options.audio instanceof Uint8Array
        ? options.audio
        : convertBase64ToUint8Array(options.audio);
    const blob = new Blob([audioBytes]);

    const fileExtension = mediaTypeToExtension(options.mediaType);
    const formData = new FormData();
    formData.append(
      'audio',
      new File([blob], 'audio', { type: options.mediaType }),
      `audio.${fileExtension}`,
    );

    const { value: uploadResponse } = await postFormDataToApi({
      url: this.config.url({
        path: '/v2/upload',
        modelId: this.modelId,
      }),
      headers: combineHeaders(this.config.headers(), options.headers),
      formData,
      failedResponseHandler: gladiaFailedResponseHandler,
      successfulResponseHandler: createJsonResponseHandler(
        gladiaUploadResponseSchema,
      ),
      abortSignal: options.abortSignal,
      fetch: this.config.fetch,
    });

    const { value: transcriptionInitResponse } = await postJsonToApi({
      url: this.config.url({
        path: '/v2/pre-recorded',
        modelId: this.modelId,
      }),
      headers: combineHeaders(this.config.headers(), options.headers),
      body: {
        ...body,
        audio_url: uploadResponse.audio_url,
      },
      failedResponseHandler: gladiaFailedResponseHandler,
      successfulResponseHandler: createJsonResponseHandler(
        gladiaTranscriptionInitializeResponseSchema,
      ),
      abortSignal: options.abortSignal,
      fetch: this.config.fetch,
    });

    // Poll the result URL until the transcription is done or an error occurs
    const resultUrl = transcriptionInitResponse.result_url;
    let transcriptionResult;
    let transcriptionResultHeaders;
    const timeoutMs = 60 * 1000; // 60 seconds timeout
    const startTime = Date.now();
    const pollingInterval = 1000;

    while (true) {
      // Check if we've exceeded the timeout. The last polled result (if any)
      // is attached as the cause.
      if (Date.now() - startTime > timeoutMs) {
        throw new AISDKError({
          message: 'Transcription job polling timed out',
          name: 'TranscriptionJobPollingTimedOut',
          cause: transcriptionResult,
        });
      }

      const response = await getFromApi({
        url: resultUrl,
        headers: combineHeaders(this.config.headers(), options.headers),
        failedResponseHandler: gladiaFailedResponseHandler,
        successfulResponseHandler: createJsonResponseHandler(
          gladiaTranscriptionResultResponseSchema,
        ),
        abortSignal: options.abortSignal,
        fetch: this.config.fetch,
      });

      transcriptionResult = response.value;
      transcriptionResultHeaders = response.responseHeaders;

      if (transcriptionResult.status === 'done') {
        break;
      }

      if (transcriptionResult.status === 'error') {
        throw new AISDKError({
          message: 'Transcription job failed',
          name: 'TranscriptionJobFailed',
          cause: transcriptionResult,
        });
      }

      // Wait for the configured polling interval before checking again.
      // Forward the abort signal so a cancelled call does not sit out the
      // remainder of the wait.
      await delay(pollingInterval, { abortSignal: options.abortSignal });
    }

    if (!transcriptionResult.result) {
      throw new AISDKError({
        message: 'Transcription result is empty',
        name: 'TranscriptionResultEmpty',
        cause: transcriptionResult,
      });
    }

    // Process the successful result
    return {
      text: transcriptionResult.result.transcription.full_transcript,
      durationInSeconds: transcriptionResult.result.metadata.audio_duration,
      language: transcriptionResult.result.transcription.languages.at(0),
      segments: transcriptionResult.result.transcription.utterances.map(
        utterance => ({
          text: utterance.text,
          startSecond: utterance.start,
          endSecond: utterance.end,
        }),
      ),
      response: {
        timestamp: currentDate,
        modelId: this.modelId,
        headers: transcriptionResultHeaders,
      },
      providerMetadata: {
        gladia: transcriptionResult,
      },
      warnings,
    };
  }
}
|
|
623
|
+
|
|
624
|
+
/**
 * Response shape validated after the audio upload request: only the
 * `audio_url` string is read.
 */
const gladiaUploadResponseSchema = z.object({ audio_url: z.string() });
|
|
627
|
+
|
|
628
|
+
/**
 * Response shape validated after the transcription job is started: only the
 * `result_url` string (the URL that is subsequently polled) is read.
 */
const gladiaTranscriptionInitializeResponseSchema = z.object({
  result_url: z.string(),
});
|
|
631
|
+
|
|
632
|
+
/** A single utterance of the transcript: numeric `start`/`end` and its text. */
const gladiaUtteranceSchema = z.object({
  start: z.number(),
  end: z.number(),
  text: z.string(),
});

/** Payload of a transcription job result: audio metadata plus the transcript. */
const gladiaTranscriptionResultPayloadSchema = z.object({
  metadata: z.object({
    audio_duration: z.number(),
  }),
  transcription: z.object({
    full_transcript: z.string(),
    languages: z.array(z.string()),
    utterances: z.array(gladiaUtteranceSchema),
  }),
});

/**
 * Response shape validated on every poll of the result URL. `result` may be
 * absent or `null`.
 */
const gladiaTranscriptionResultResponseSchema = z.object({
  status: z.enum(['queued', 'processing', 'done', 'error']),
  result: gladiaTranscriptionResultPayloadSchema.nullish(),
});
|
package/src/index.ts
ADDED
|
Binary file
|