@ai-sdk/gladia 2.0.7 → 2.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,19 @@
1
1
  # @ai-sdk/gladia
2
2
 
3
+ ## 2.0.9
4
+
5
+ ### Patch Changes
6
+
7
+ - 8dc54db: chore: add src folders to package bundle
8
+
9
+ ## 2.0.8
10
+
11
+ ### Patch Changes
12
+
13
+ - Updated dependencies [5c090e7]
14
+ - @ai-sdk/provider@3.0.4
15
+ - @ai-sdk/provider-utils@4.0.8
16
+
3
17
  ## 2.0.7
4
18
 
5
19
  ### Patch Changes
package/dist/index.js CHANGED
@@ -553,7 +553,7 @@ var gladiaTranscriptionResultResponseSchema = import_v42.z.object({
553
553
  });
554
554
 
555
555
  // src/version.ts
556
- var VERSION = true ? "2.0.7" : "0.0.0-test";
556
+ var VERSION = true ? "2.0.9" : "0.0.0-test";
557
557
 
558
558
  // src/gladia-provider.ts
559
559
  function createGladia(options = {}) {
package/dist/index.mjs CHANGED
@@ -542,7 +542,7 @@ var gladiaTranscriptionResultResponseSchema = z2.object({
542
542
  });
543
543
 
544
544
  // src/version.ts
545
- var VERSION = true ? "2.0.7" : "0.0.0-test";
545
+ var VERSION = true ? "2.0.9" : "0.0.0-test";
546
546
 
547
547
  // src/gladia-provider.ts
548
548
  function createGladia(options = {}) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ai-sdk/gladia",
3
- "version": "2.0.7",
3
+ "version": "2.0.9",
4
4
  "license": "Apache-2.0",
5
5
  "sideEffects": false,
6
6
  "main": "./dist/index.js",
@@ -8,6 +8,7 @@
8
8
  "types": "./dist/index.d.ts",
9
9
  "files": [
10
10
  "dist/**/*",
11
+ "src",
11
12
  "CHANGELOG.md",
12
13
  "README.md"
13
14
  ],
@@ -20,15 +21,15 @@
20
21
  }
21
22
  },
22
23
  "dependencies": {
23
- "@ai-sdk/provider": "3.0.3",
24
- "@ai-sdk/provider-utils": "4.0.7"
24
+ "@ai-sdk/provider": "3.0.4",
25
+ "@ai-sdk/provider-utils": "4.0.8"
25
26
  },
26
27
  "devDependencies": {
27
28
  "@types/node": "20.17.24",
28
29
  "tsup": "^8",
29
30
  "typescript": "5.6.3",
30
31
  "zod": "3.25.76",
31
- "@ai-sdk/test-server": "1.0.1",
32
+ "@ai-sdk/test-server": "1.0.2",
32
33
  "@vercel/ai-tsconfig": "0.0.0"
33
34
  },
34
35
  "peerDependencies": {
@@ -0,0 +1,134 @@
/**
 * Request body used to initiate a Gladia transcription.
 *
 * Property names are snake_case to match the wire format. `audio_url` is the
 * only required property; every other property switches on, or configures,
 * an optional feature. The [Alpha] / [Beta] tags are carried over from the
 * original field descriptions.
 */
export type GladiaTranscriptionInitiateAPITypes = {
  /** Where the media lives: a Gladia file URL or an external audio/video URL. */
  audio_url: string;

  /** [Alpha] Context handed to the transcription model for possibly better accuracy. */
  context_prompt?: string;

  /** [Beta] Either a boolean enabling custom vocabulary, or an array of specific vocabulary. */
  custom_vocabulary?: boolean | Array<any>;

  /** [Beta] Settings for custom vocabulary. */
  custom_vocabulary_config?: {
    /** Entries are plain strings or objects carrying value, intensity, pronunciations and language. */
    vocabulary: (
      | string
      | {
          /** The vocabulary value itself. */
          value: string;
          /** Intensity applied to this entry. */
          intensity?: number;
          /** Pronunciation variations of this entry. */
          pronunciations?: Array<string>;
          /** Language this entry belongs to. */
          language?: string;
        }
    )[];
    /** Intensity used by default for vocabulary entries. */
    default_intensity?: number;
  };

  /** Detect the language from the supplied audio. */
  detect_language?: boolean;

  /** Detect several languages within the supplied audio. */
  enable_code_switching?: boolean;

  /** Settings for code switching. */
  code_switching_config?: {
    /** Languages to use when detecting multiple languages. */
    languages?: Array<string>;
  };

  /** Original language, in iso639-1 format. */
  language?: string;

  /** Turn on the callback for this transcription. */
  callback?: boolean;

  /** Settings for the callback. */
  callback_config?: {
    /** URL that is called with the transcription result. */
    url: string;
    /** HTTP method used for that call. */
    method?: 'POST' | 'PUT';
  };

  /** Turn on subtitle generation for this transcription. */
  subtitles?: boolean;

  /** Settings for subtitles. */
  subtitles_config?: {
    /** Subtitle formats to produce. */
    formats?: Array<'srt' | 'vtt'>;
    /** Shortest allowed subtitle duration, in seconds. */
    minimum_duration?: number;
    /** Longest allowed subtitle duration, in seconds. */
    maximum_duration?: number;
    /** Upper bound on characters per row. */
    maximum_characters_per_row?: number;
    /** Upper bound on rows per caption. */
    maximum_rows_per_caption?: number;
    /** Subtitle style. */
    style?: 'default' | 'compliance';
  };

  /** Turn on speaker recognition (diarization) for this audio. */
  diarization?: boolean;

  /** Settings for diarization. */
  diarization_config?: {
    /** Exact number of speakers in the audio. */
    number_of_speakers?: number;
    /** Lower bound on the number of speakers in the audio. */
    min_speakers?: number;
    /** Upper bound on the number of speakers in the audio. */
    max_speakers?: number;
    /** [Alpha] Use enhanced diarization for this audio. */
    enhanced?: boolean;
  };

  /** [Beta] Turn on translation for this audio. */
  translation?: boolean;

  /** Settings for translation. */
  translation_config?: {
    /** Target languages, in iso639-1 format. */
    target_languages: Array<string>;
    /** Translation model. */
    model?: 'base' | 'enhanced';
    /** Align translated utterances with the original ones. */
    match_original_utterances?: boolean;
  };

  /** [Beta] Turn on summarization for this audio. */
  summarization?: boolean;

  /** Settings for summarization. */
  summarization_config?: {
    /** Kind of summarization to apply. */
    type?: 'general' | 'bullet_points' | 'concise';
  };

  /** [Alpha] Turn on moderation for this audio. */
  moderation?: boolean;

  /** [Alpha] Turn on named entity recognition for this audio. */
  named_entity_recognition?: boolean;

  /** [Alpha] Turn on chapterization for this audio. */
  chapterization?: boolean;

  /** [Alpha] Turn on names consistency for this audio. */
  name_consistency?: boolean;

  /** [Alpha] Turn on custom spelling for this audio. */
  custom_spelling?: boolean;

  /** Settings for custom spelling. */
  custom_spelling_config?: {
    /** Spellings applied to the audio transcription. */
    spelling_dictionary: Record<string, Array<string>>;
  };

  /** [Alpha] Turn on structured data extraction for this audio. */
  structured_data_extraction?: boolean;

  /** Settings for structured data extraction. */
  structured_data_extraction_config?: {
    /** Classes to extract from the audio transcription. */
    classes: Array<string>;
  };

  /** [Alpha] Turn on sentiment analysis for this audio. */
  sentiment_analysis?: boolean;

  /** [Alpha] Turn on audio-to-LLM processing for this audio. */
  audio_to_llm?: boolean;

  /** Settings for audio-to-LLM processing. */
  audio_to_llm_config?: {
    /** Prompts applied to the audio transcription. */
    prompts: Array<string>;
  };

  /** Custom metadata attached to this transcription. */
  custom_metadata?: Record<string, any>;

  /** Turn on sentences for this audio. */
  sentences?: boolean;

  /** [Alpha] Changes the output display_mode for this audio. */
  display_mode?: boolean;

  /** [Alpha] Use enhanced punctuation for this audio. */
  punctuation_enhanced?: boolean;
};
@@ -0,0 +1,9 @@
1
+ import { FetchFunction } from '@ai-sdk/provider-utils';
2
+
/** Arguments handed to `GladiaConfig.url` when a request URL is built. */
type GladiaUrlOptions = { modelId: string; path: string };

/**
 * Settings a Gladia model needs in order to talk to the API.
 */
export type GladiaConfig = {
  /** Provider identifier exposed by the model. */
  provider: string;

  /** Maps an API path (plus the model id) to the full request URL. */
  url: (options: GladiaUrlOptions) => string;

  /** Produces the base headers for a request; entries may be `undefined`. */
  headers: () => Record<string, string | undefined>;

  /** Optional replacement for the fetch implementation. */
  fetch?: FetchFunction;

  /** Optional id generator. NOTE(review): no caller is visible in this file — confirm usage. */
  generateId?: () => string;
};
@@ -0,0 +1,34 @@
1
+ import { safeParseJSON } from '@ai-sdk/provider-utils';
2
+ import { gladiaErrorDataSchema } from './gladia-error';
3
+ import { describe, it, expect } from 'vitest';
4
+
describe('gladiaErrorDataSchema', () => {
  it('should parse Gladia resource exhausted error', async () => {
    // Raw response text: the outer `message` is a JSON document escaped into a string.
    const responseText = `
{"error":{"message":"{\\n \\"error\\": {\\n \\"code\\": 429,\\n \\"message\\": \\"Resource has been exhausted (e.g. check quota).\\",\\n \\"status\\": \\"RESOURCE_EXHAUSTED\\"\\n }\\n}\\n","code":429}}
`;

    // The nested message once the outer JSON layer has been decoded.
    const decodedMessage =
      '{\n "error": {\n "code": 429,\n "message": "Resource has been exhausted (e.g. check quota).",\n "status": "RESOURCE_EXHAUSTED"\n }\n}\n';

    // Validated and raw values are expected to be structurally equal here.
    const expectedPayload = () => ({
      error: { message: decodedMessage, code: 429 },
    });

    const parsed = await safeParseJSON({
      text: responseText,
      schema: gladiaErrorDataSchema,
    });

    expect(parsed).toStrictEqual({
      success: true,
      value: expectedPayload(),
      rawValue: expectedPayload(),
    });
  });
});
@@ -0,0 +1,16 @@
1
+ import { z } from 'zod/v4';
2
+ import { createJsonErrorResponseHandler } from '@ai-sdk/provider-utils';
3
+
/** Inner `error` object of a Gladia error body: a message and a numeric code. */
const gladiaErrorDetailSchema = z.object({
  message: z.string(),
  code: z.number(),
});

/** Schema for a Gladia error body: `{ error: { message, code } }`. */
export const gladiaErrorDataSchema = z.object({
  error: gladiaErrorDetailSchema,
});

/** Static type inferred from `gladiaErrorDataSchema`. */
export type GladiaErrorData = z.infer<typeof gladiaErrorDataSchema>;
12
+
/**
 * Handler for failed Gladia responses: the body is checked against
 * `gladiaErrorDataSchema` and `error.message` is used as the error message.
 */
export const gladiaFailedResponseHandler = createJsonErrorResponseHandler({
  errorSchema: gladiaErrorDataSchema,
  errorToMessage: ({ error }) => error.message,
});
@@ -0,0 +1,117 @@
1
+ import {
2
+ TranscriptionModelV3,
3
+ ProviderV3,
4
+ NoSuchModelError,
5
+ } from '@ai-sdk/provider';
6
+ import {
7
+ FetchFunction,
8
+ loadApiKey,
9
+ withUserAgentSuffix,
10
+ } from '@ai-sdk/provider-utils';
11
+ import { GladiaTranscriptionModel } from './gladia-transcription-model';
12
+ import { VERSION } from './version';
13
+
/**
 * Gladia provider. Besides the `ProviderV3` members it is itself callable and
 * exposes a transcription model factory.
 */
export interface GladiaProvider extends ProviderV3 {
  /** Calling the provider directly yields an object holding a transcription model. */
  (): { transcription: GladiaTranscriptionModel };

  /** Creates a model for transcription. */
  transcription(): TranscriptionModelV3;

  /**
   * @deprecated Use `embeddingModel` instead.
   */
  textEmbeddingModel(modelId: string): never;
}
29
+
/** Options accepted by `createGladia`. All of them are optional. */
export interface GladiaProviderSettings {
  /** API key used to authenticate requests. */
  apiKey?: string;

  /** Extra headers added to the requests. */
  headers?: Record<string, string>;

  /**
   * Replacement fetch implementation. Useful as middleware to intercept
   * requests, or to plug in a custom fetch for e.g. testing.
   */
  fetch?: FetchFunction;
}
47
+
/**
 * Create a Gladia provider instance.
 *
 * The result is callable (returning `{ transcription }`) and also exposes
 * `transcription` / `transcriptionModel` factories. Language, embedding and
 * image models are not available, so those factories throw `NoSuchModelError`.
 *
 * @param options API key, extra headers and an optional fetch override.
 */
export function createGladia(
  options: GladiaProviderSettings = {},
): GladiaProvider {
  // Evaluated on every call, so the API key is looked up when a request is made
  // rather than when the provider is created.
  const getHeaders = () => {
    const apiKey = loadApiKey({
      apiKey: options.apiKey,
      environmentVariableName: 'GLADIA_API_KEY',
      description: 'Gladia',
    });

    // Spread after the key so caller-supplied headers take precedence.
    return withUserAgentSuffix(
      { 'x-gladia-key': apiKey, ...options.headers },
      `ai-sdk/gladia/${VERSION}`,
    );
  };

  const createTranscriptionModel = () =>
    new GladiaTranscriptionModel('default', {
      provider: `gladia.transcription`,
      url: ({ path }) => `https://api.gladia.io${path}`,
      headers: getHeaders,
      fetch: options.fetch,
    });

  // Builds a factory that always rejects the requested model type.
  const unsupported =
    (
      modelType: ConstructorParameters<typeof NoSuchModelError>[0]['modelType'],
      message: string,
    ) =>
    (modelId: string): never => {
      throw new NoSuchModelError({ modelId, modelType, message });
    };

  const provider = function () {
    return { transcription: createTranscriptionModel() };
  };

  provider.specificationVersion = 'v3' as const;
  provider.transcription = createTranscriptionModel;
  provider.transcriptionModel = createTranscriptionModel;

  // Required ProviderV3 methods that are not supported
  provider.languageModel = unsupported(
    'languageModel',
    'Gladia does not provide language models',
  );
  provider.embeddingModel = unsupported(
    'embeddingModel',
    'Gladia does not provide embedding models',
  );
  provider.textEmbeddingModel = provider.embeddingModel;
  provider.imageModel = unsupported(
    'imageModel',
    'Gladia does not provide image models',
  );

  return provider as GladiaProvider;
}
113
+
/**
 * Default Gladia provider instance, created with no explicit settings.
 */
export const gladia = createGladia();
@@ -0,0 +1,230 @@
1
+ import { createTestServer } from '@ai-sdk/test-server/with-vitest';
2
+ import { GladiaTranscriptionModel } from './gladia-transcription-model';
3
+ import { createGladia } from './gladia-provider';
4
+ import { readFile } from 'node:fs/promises';
5
+ import path from 'node:path';
6
+ import { describe, it, expect, vi } from 'vitest';
7
+
// Pin the version constant so the user-agent assertion does not depend on the release number.
vi.mock('./version', () => ({ VERSION: '0.0.0-test' }));

// Fixtures shared by the tests in this file.
const audioData = await readFile(path.join(__dirname, 'transcript-test.mp3'));
const provider = createGladia({ apiKey: 'test-api-key' });
const model = provider.transcription();

// Canned body for the upload endpoint; it is fixed for the whole file.
const uploadResponseBody = {
  audio_url: 'https://storage.gladia.io/mock-upload-url',
  audio_metadata: {
    id: 'test-id',
    filename: 'test-file.mp3',
    extension: 'mp3',
    size: 1024,
    audio_duration: 60,
    number_of_channels: 2,
  },
};

// The two remaining endpoints start empty; individual tests assign their responses.
const server = createTestServer({
  'https://api.gladia.io/v2/upload': {
    response: { type: 'json-value', body: uploadResponseBody },
  },
  'https://api.gladia.io/v2/pre-recorded': {},
  'https://api.gladia.io/v2/transcription/test-id': {},
});
36
+
describe('doGenerate', () => {
  /**
   * Installs canned responses for the "initiate" and "result" endpoints.
   * The upload endpoint is not touched here because its response is fixed
   * when the test server is created.
   *
   * @param headers Optional response headers attached to both responses.
   */
  function prepareJsonResponse({
    headers,
  }: {
    headers?: Record<string, string>;
  } = {}) {
    server.urls['https://api.gladia.io/v2/pre-recorded'].response = {
      type: 'json-value',
      headers,
      body: {
        id: 'test-id',
        result_url: 'https://api.gladia.io/v2/transcription/test-id',
      },
    };
    server.urls['https://api.gladia.io/v2/transcription/test-id'].response = {
      type: 'json-value',
      headers,
      body: {
        id: '45463597-20b7-4af7-b3b3-f5fb778203ab',
        request_id: 'G-45463597',
        version: 2,
        status: 'done',
        created_at: '2023-12-28T09:04:17.210Z',
        completed_at: '2023-12-28T09:04:37.210Z',
        custom_metadata: {},
        error_code: null,
        kind: 'pre-recorded',
        file: {
          id: 'test-id',
          filename: 'test-file.mp3',
          source: 'upload',
          audio_duration: 60,
          number_of_channels: 2,
        },
        request_params: {
          audio_url: 'https://storage.gladia.io/mock-upload-url',
        },
        result: {
          metadata: {
            audio_duration: 60,
            number_of_distinct_channels: 2,
            billing_time: 60,
            transcription_time: 20,
          },
          transcription: {
            full_transcript: 'Smoke from hundreds of wildfires.',
            languages: ['en'],
            utterances: [
              {
                language: 'en',
                start: 0,
                end: 3,
                confidence: 0.95,
                channel: 1,
                speaker: 1,
                words: [
                  {
                    word: 'Smoke',
                    start: 0,
                    end: 1,
                    confidence: 0.95,
                  },
                  {
                    word: 'from',
                    start: 1,
                    end: 2,
                    confidence: 0.95,
                  },
                  {
                    word: 'hundreds',
                    start: 2,
                    end: 3,
                    confidence: 0.95,
                  },
                ],
                text: 'Smoke from hundreds of wildfires.',
              },
            ],
          },
        },
      },
    };
  }

  it('should pass the model', async () => {
    prepareJsonResponse();

    await model.doGenerate({
      audio: audioData,
      mediaType: 'audio/wav',
    });

    // calls[0] is the upload; calls[1] is the initiate request carrying the uploaded URL.
    expect(await server.calls[1].requestBodyJson).toMatchObject({
      audio_url: 'https://storage.gladia.io/mock-upload-url',
    });
  });

  it('should pass headers', async () => {
    prepareJsonResponse();

    const provider = createGladia({
      apiKey: 'test-api-key',
      headers: {
        'Custom-Provider-Header': 'provider-header-value',
      },
    });

    await provider.transcription().doGenerate({
      audio: audioData,
      mediaType: 'audio/wav',
      headers: {
        'Custom-Request-Header': 'request-header-value',
      },
    });

    expect(server.calls[1].requestHeaders).toMatchObject({
      'x-gladia-key': 'test-api-key',
      'content-type': 'application/json',
      'custom-provider-header': 'provider-header-value',
      'custom-request-header': 'request-header-value',
    });
    expect(server.calls[0].requestUserAgent).toContain(
      `ai-sdk/gladia/0.0.0-test`,
    );
  });

  it('should extract the transcription text', async () => {
    prepareJsonResponse();

    const result = await model.doGenerate({
      audio: audioData,
      mediaType: 'audio/wav',
    });

    expect(result.text).toBe('Smoke from hundreds of wildfires.');
  });

  it('should include response data with timestamp, modelId and headers', async () => {
    prepareJsonResponse({
      headers: {
        'x-request-id': 'test-request-id',
        'x-ratelimit-remaining': '123',
      },
    });

    const testDate = new Date(0);
    const customModel = new GladiaTranscriptionModel('default', {
      provider: 'test-provider',
      url: ({ path }) => `https://api.gladia.io${path}`,
      headers: () => ({}),
      _internal: {
        currentDate: () => testDate,
      },
    });

    const result = await customModel.doGenerate({
      audio: audioData,
      mediaType: 'audio/wav',
    });

    expect(result.response).toMatchObject({
      timestamp: testDate,
      modelId: 'default',
      headers: {
        'content-type': 'application/json',
        'x-request-id': 'test-request-id',
        'x-ratelimit-remaining': '123',
      },
    });
  });

  it('should use real date when no custom date provider is specified', async () => {
    prepareJsonResponse();

    // Deliberately no `_internal.currentDate`: the model has to fall back to
    // the real clock, which is the behavior this test is named after.
    const customModel = new GladiaTranscriptionModel('default', {
      provider: 'test-provider',
      url: ({ path }) => `https://api.gladia.io${path}`,
      headers: () => ({}),
    });

    const beforeCall = Date.now();
    const result = await customModel.doGenerate({
      audio: audioData,
      mediaType: 'audio/wav',
    });
    const afterCall = Date.now();

    // The reported timestamp must lie within the window of the call itself.
    const timestamp = result.response.timestamp.getTime();
    expect(timestamp).toBeGreaterThanOrEqual(beforeCall);
    expect(timestamp).toBeLessThanOrEqual(afterCall);
    expect(result.response.modelId).toBe('default');
  });
});
@@ -0,0 +1,652 @@
1
+ import {
2
+ AISDKError,
3
+ TranscriptionModelV3,
4
+ SharedV3Warning,
5
+ } from '@ai-sdk/provider';
6
+ import {
7
+ combineHeaders,
8
+ convertBase64ToUint8Array,
9
+ createJsonResponseHandler,
10
+ mediaTypeToExtension,
11
+ delay,
12
+ getFromApi,
13
+ parseProviderOptions,
14
+ postFormDataToApi,
15
+ postJsonToApi,
16
+ } from '@ai-sdk/provider-utils';
17
+ import { z } from 'zod/v4';
18
+ import { GladiaConfig } from './gladia-config';
19
+ import { gladiaFailedResponseHandler } from './gladia-error';
20
+ import { GladiaTranscriptionInitiateAPITypes } from './gladia-api-types';
21
+
// https://docs.gladia.io/api-reference/v2/pre-recorded/init

// Leaf schema factories. Each call returns a fresh schema that accepts the
// base type, `null`, or an omitted value.
const optionalFlag = () => z.boolean().nullish();
const optionalNumber = () => z.number().nullish();
const optionalString = () => z.string().nullish();

/** A vocabulary entry: a bare term, or a term with recognition details. */
const vocabularyEntrySchema = z.union([
  z.string(),
  z.object({
    /** The vocabulary term. */
    value: z.string(),
    /** Intensity of the term in recognition (optional). */
    intensity: optionalNumber(),
    /** Alternative pronunciations for the term (optional). */
    pronunciations: z.array(z.string()).nullish(),
    /** Language of the term (optional). */
    language: optionalString(),
  }),
]);

/** Configuration for custom vocabulary. */
const customVocabularyConfigSchema = z.object({
  /** Vocabulary terms, or objects with pronunciation details. */
  vocabulary: z.array(vocabularyEntrySchema),
  /** Default intensity for all vocabulary terms. */
  defaultIntensity: optionalNumber(),
});

/** Configuration for code switching. */
const codeSwitchingConfigSchema = z.object({
  /** Languages to consider for code switching. */
  languages: z.array(z.string()).nullish(),
});

/** Configuration for the completion callback. */
const callbackConfigSchema = z.object({
  /** URL to send the callback to. */
  url: z.string(),
  /** HTTP method for the callback. */
  method: z.enum(['POST', 'PUT']).nullish(),
});

/** Configuration for subtitles generation. */
const subtitlesConfigSchema = z.object({
  /** Subtitle file formats to generate. */
  formats: z.array(z.enum(['srt', 'vtt'])).nullish(),
  /** Minimum duration for subtitle segments. */
  minimumDuration: optionalNumber(),
  /** Maximum duration for subtitle segments. */
  maximumDuration: optionalNumber(),
  /** Maximum characters per row in subtitles. */
  maximumCharactersPerRow: optionalNumber(),
  /** Maximum rows per caption in subtitles. */
  maximumRowsPerCaption: optionalNumber(),
  /** Style of subtitles. */
  style: z.enum(['default', 'compliance']).nullish(),
});

/** Configuration for diarization. */
const diarizationConfigSchema = z.object({
  /** Exact number of speakers to identify. */
  numberOfSpeakers: optionalNumber(),
  /** Minimum number of speakers to identify. */
  minSpeakers: optionalNumber(),
  /** Maximum number of speakers to identify. */
  maxSpeakers: optionalNumber(),
  /** Whether to use enhanced diarization. */
  enhanced: optionalFlag(),
});

/** Configuration for translation. */
const translationConfigSchema = z.object({
  /** Target languages for translation. */
  targetLanguages: z.array(z.string()),
  /** Translation model to use. */
  model: z.enum(['base', 'enhanced']).nullish(),
  /** Whether to match original utterances in translation. */
  matchOriginalUtterances: optionalFlag(),
});

/** Configuration for summarization. */
const summarizationConfigSchema = z.object({
  /** Type of summary to generate. */
  type: z.enum(['general', 'bullet_points', 'concise']).nullish(),
});

/** Configuration for custom spelling. */
const customSpellingConfigSchema = z.object({
  /** Dictionary of custom spellings. */
  spellingDictionary: z.record(z.string(), z.array(z.string())),
});

/** Configuration for structured data extraction. */
const structuredDataExtractionConfigSchema = z.object({
  /** Classes of data to extract. */
  classes: z.array(z.string()),
});

/** Configuration for audio to language model processing. */
const audioToLlmConfigSchema = z.object({
  /** Prompts to send to the language model. */
  prompts: z.array(z.string()),
});

/**
 * Provider options accepted under the `gladia` key. Keys are camelCase and
 * every top-level entry may be omitted or `null`.
 */
const gladiaProviderOptionsSchema = z.object({
  /** Optional context prompt to guide the transcription. */
  contextPrompt: optionalString(),

  /**
   * Custom vocabulary to improve transcription accuracy.
   * Can be a boolean or an array of custom terms.
   */
  customVocabulary: z.union([z.boolean(), z.array(z.any())]).nullish(),

  /** Configuration for custom vocabulary. */
  customVocabularyConfig: customVocabularyConfigSchema.nullish(),

  /** Whether to automatically detect the language of the audio. */
  detectLanguage: optionalFlag(),

  /** Whether to enable code switching (multiple languages in the same audio). */
  enableCodeSwitching: optionalFlag(),

  /** Configuration for code switching. */
  codeSwitchingConfig: codeSwitchingConfigSchema.nullish(),

  /** Specific language for transcription. */
  language: optionalString(),

  /** Whether to enable callback when transcription is complete. */
  callback: optionalFlag(),

  /** Configuration for callback. */
  callbackConfig: callbackConfigSchema.nullish(),

  /** Whether to generate subtitles. */
  subtitles: optionalFlag(),

  /** Configuration for subtitles generation. */
  subtitlesConfig: subtitlesConfigSchema.nullish(),

  /** Whether to enable speaker diarization (speaker identification). */
  diarization: optionalFlag(),

  /** Configuration for diarization. */
  diarizationConfig: diarizationConfigSchema.nullish(),

  /** Whether to translate the transcription. */
  translation: optionalFlag(),

  /** Configuration for translation. */
  translationConfig: translationConfigSchema.nullish(),

  /** Whether to generate a summary of the transcription. */
  summarization: optionalFlag(),

  /** Configuration for summarization. */
  summarizationConfig: summarizationConfigSchema.nullish(),

  /** Whether to enable content moderation. */
  moderation: optionalFlag(),

  /** Whether to enable named entity recognition. */
  namedEntityRecognition: optionalFlag(),

  /** Whether to enable automatic chapter creation. */
  chapterization: optionalFlag(),

  /** Whether to ensure consistent naming of entities. */
  nameConsistency: optionalFlag(),

  /** Whether to enable custom spelling. */
  customSpelling: optionalFlag(),

  /** Configuration for custom spelling. */
  customSpellingConfig: customSpellingConfigSchema.nullish(),

  /** Whether to extract structured data from the transcription. */
  structuredDataExtraction: optionalFlag(),

  /** Configuration for structured data extraction. */
  structuredDataExtractionConfig: structuredDataExtractionConfigSchema.nullish(),

  /** Whether to perform sentiment analysis on the transcription. */
  sentimentAnalysis: optionalFlag(),

  /** Whether to send audio to a language model for processing. */
  audioToLlm: optionalFlag(),

  /** Configuration for audio to language model processing. */
  audioToLlmConfig: audioToLlmConfigSchema.nullish(),

  /** Custom metadata to include with the transcription. */
  customMetadata: z.record(z.string(), z.any()).nullish(),

  /** Whether to include sentence-level segmentation. */
  sentences: optionalFlag(),

  /** Whether to enable display mode. */
  displayMode: optionalFlag(),

  /** Whether to enhance punctuation in the transcription. */
  punctuationEnhanced: optionalFlag(),
});
325
+
/** Static type of the provider options, inferred from `gladiaProviderOptionsSchema`. */
type GladiaProviderOptionsSchema = typeof gladiaProviderOptionsSchema;
export type GladiaTranscriptionCallOptions = z.infer<GladiaProviderOptionsSchema>;
329
+
/**
 * `GladiaConfig` plus an optional `_internal` hook for supplying the current date.
 */
interface GladiaTranscriptionModelConfig extends GladiaConfig {
  _internal?: {
    /** When provided, called to obtain the current date. */
    currentDate?: () => Date;
  };
}
335
+
336
+ export class GladiaTranscriptionModel implements TranscriptionModelV3 {
337
+ readonly specificationVersion = 'v3';
338
+
339
+ get provider(): string {
340
+ return this.config.provider;
341
+ }
342
+
343
+ constructor(
344
+ readonly modelId: 'default',
345
+ private readonly config: GladiaTranscriptionModelConfig,
346
+ ) {}
347
+
348
+ private async getArgs({
349
+ providerOptions,
350
+ }: Parameters<TranscriptionModelV3['doGenerate']>[0]) {
351
+ const warnings: SharedV3Warning[] = [];
352
+
353
+ // Parse provider options
354
+ const gladiaOptions = await parseProviderOptions({
355
+ provider: 'gladia',
356
+ providerOptions,
357
+ schema: gladiaProviderOptionsSchema,
358
+ });
359
+
360
+ const body: Omit<GladiaTranscriptionInitiateAPITypes, 'audio_url'> = {};
361
+
362
+ // Add provider-specific options
363
+ if (gladiaOptions) {
364
+ body.context_prompt = gladiaOptions.contextPrompt ?? undefined;
365
+ body.custom_vocabulary = gladiaOptions.customVocabulary ?? undefined;
366
+ body.detect_language = gladiaOptions.detectLanguage ?? undefined;
367
+ body.enable_code_switching =
368
+ gladiaOptions.enableCodeSwitching ?? undefined;
369
+ body.language = gladiaOptions.language ?? undefined;
370
+ body.callback = gladiaOptions.callback ?? undefined;
371
+ body.subtitles = gladiaOptions.subtitles ?? undefined;
372
+ body.diarization = gladiaOptions.diarization ?? undefined;
373
+ body.translation = gladiaOptions.translation ?? undefined;
374
+ body.summarization = gladiaOptions.summarization ?? undefined;
375
+ body.moderation = gladiaOptions.moderation ?? undefined;
376
+ body.named_entity_recognition =
377
+ gladiaOptions.namedEntityRecognition ?? undefined;
378
+ body.chapterization = gladiaOptions.chapterization ?? undefined;
379
+ body.name_consistency = gladiaOptions.nameConsistency ?? undefined;
380
+ body.custom_spelling = gladiaOptions.customSpelling ?? undefined;
381
+ body.structured_data_extraction =
382
+ gladiaOptions.structuredDataExtraction ?? undefined;
383
+ body.structured_data_extraction_config =
384
+ gladiaOptions.structuredDataExtractionConfig ?? undefined;
385
+ body.sentiment_analysis = gladiaOptions.sentimentAnalysis ?? undefined;
386
+ body.audio_to_llm = gladiaOptions.audioToLlm ?? undefined;
387
+ body.audio_to_llm_config = gladiaOptions.audioToLlmConfig ?? undefined;
388
+ body.custom_metadata = gladiaOptions.customMetadata ?? undefined;
389
+ body.sentences = gladiaOptions.sentences ?? undefined;
390
+ body.display_mode = gladiaOptions.displayMode ?? undefined;
391
+ body.punctuation_enhanced =
392
+ gladiaOptions.punctuationEnhanced ?? undefined;
393
+
394
+ if (gladiaOptions.customVocabularyConfig) {
395
+ body.custom_vocabulary_config = {
396
+ vocabulary: gladiaOptions.customVocabularyConfig.vocabulary.map(
397
+ item => {
398
+ if (typeof item === 'string') return item;
399
+ return {
400
+ value: item.value,
401
+ intensity: item.intensity ?? undefined,
402
+ pronunciations: item.pronunciations ?? undefined,
403
+ language: item.language ?? undefined,
404
+ };
405
+ },
406
+ ),
407
+ default_intensity:
408
+ gladiaOptions.customVocabularyConfig.defaultIntensity ?? undefined,
409
+ };
410
+ }
411
+
412
+ // Handle code switching config
413
+ if (gladiaOptions.codeSwitchingConfig) {
414
+ body.code_switching_config = {
415
+ languages: gladiaOptions.codeSwitchingConfig.languages ?? undefined,
416
+ };
417
+ }
418
+
419
+ // Handle callback config
420
+ if (gladiaOptions.callbackConfig) {
421
+ body.callback_config = {
422
+ url: gladiaOptions.callbackConfig.url,
423
+ method: gladiaOptions.callbackConfig.method ?? undefined,
424
+ };
425
+ }
426
+
427
+ // Handle subtitles config
428
+ if (gladiaOptions.subtitlesConfig) {
429
+ body.subtitles_config = {
430
+ formats: gladiaOptions.subtitlesConfig.formats ?? undefined,
431
+ minimum_duration:
432
+ gladiaOptions.subtitlesConfig.minimumDuration ?? undefined,
433
+ maximum_duration:
434
+ gladiaOptions.subtitlesConfig.maximumDuration ?? undefined,
435
+ maximum_characters_per_row:
436
+ gladiaOptions.subtitlesConfig.maximumCharactersPerRow ?? undefined,
437
+ maximum_rows_per_caption:
438
+ gladiaOptions.subtitlesConfig.maximumRowsPerCaption ?? undefined,
439
+ style: gladiaOptions.subtitlesConfig.style ?? undefined,
440
+ };
441
+ }
442
+
443
+ // Handle diarization config
444
+ if (gladiaOptions.diarizationConfig) {
445
+ body.diarization_config = {
446
+ number_of_speakers:
447
+ gladiaOptions.diarizationConfig.numberOfSpeakers ?? undefined,
448
+ min_speakers:
449
+ gladiaOptions.diarizationConfig.minSpeakers ?? undefined,
450
+ max_speakers:
451
+ gladiaOptions.diarizationConfig.maxSpeakers ?? undefined,
452
+ enhanced: gladiaOptions.diarizationConfig.enhanced ?? undefined,
453
+ };
454
+ }
455
+
456
+ // Handle translation config
457
+ if (gladiaOptions.translationConfig) {
458
+ body.translation_config = {
459
+ target_languages: gladiaOptions.translationConfig.targetLanguages,
460
+ model: gladiaOptions.translationConfig.model ?? undefined,
461
+ match_original_utterances:
462
+ gladiaOptions.translationConfig.matchOriginalUtterances ??
463
+ undefined,
464
+ };
465
+ }
466
+
467
+ // Handle summarization config
468
+ if (gladiaOptions.summarizationConfig) {
469
+ body.summarization_config = {
470
+ type: gladiaOptions.summarizationConfig.type ?? undefined,
471
+ };
472
+ }
473
+
474
+ // Handle custom spelling config
475
+ if (gladiaOptions.customSpellingConfig) {
476
+ body.custom_spelling_config = {
477
+ spelling_dictionary:
478
+ gladiaOptions.customSpellingConfig.spellingDictionary,
479
+ };
480
+ }
481
+ }
482
+
483
+ return {
484
+ body,
485
+ warnings,
486
+ };
487
+ }
488
+
489
/**
 * Transcribes audio through Gladia's pre-recorded transcription flow.
 *
 * The call performs three sequential steps:
 *  1. uploads the audio as multipart form data to `/v2/upload`,
 *  2. starts a transcription job via `/v2/pre-recorded` using the returned
 *     `audio_url` together with the body built by `getArgs`,
 *  3. polls the job's `result_url` once per second until its status is
 *     `done` or `error`, giving up after 60 seconds.
 *
 * @param options - Call options. `audio` is either a `Uint8Array` or a
 *   base64 string; `mediaType` determines the uploaded file's type and
 *   extension; `headers` and `abortSignal` are forwarded to every request.
 * @returns The full transcript, per-utterance segments, the first detected
 *   language (if any), the audio duration, response metadata taken from the
 *   final polling response, the raw result as provider metadata, and the
 *   warnings produced by `getArgs`.
 * @throws AISDKError named `TranscriptionJobPollingTimedOut` when polling
 *   exceeds the timeout, `TranscriptionJobFailed` when the job reports
 *   `error`, or `TranscriptionResultEmpty` when a finished job carries no
 *   result payload.
 */
async doGenerate(
  options: Parameters<TranscriptionModelV3['doGenerate']>[0],
): Promise<Awaited<ReturnType<TranscriptionModelV3['doGenerate']>>> {
  // Captured before any network traffic; `_internal.currentDate` lets the
  // configuration override the clock.
  const currentDate = this.config._internal?.currentDate?.() ?? new Date();

  // Create form data with base fields
  const formData = new FormData();
  const blob =
    options.audio instanceof Uint8Array
      ? new Blob([options.audio])
      : new Blob([convertBase64ToUint8Array(options.audio)]);

  const fileExtension = mediaTypeToExtension(options.mediaType);
  formData.append(
    'audio',
    new File([blob], 'audio', { type: options.mediaType }),
    // The explicit filename argument takes precedence over the File's name.
    `audio.${fileExtension}`,
  );

  const { value: uploadResponse } = await postFormDataToApi({
    url: this.config.url({
      path: '/v2/upload',
      modelId: 'default',
    }),
    headers: combineHeaders(this.config.headers(), options.headers),
    formData,
    failedResponseHandler: gladiaFailedResponseHandler,
    successfulResponseHandler: createJsonResponseHandler(
      gladiaUploadResponseSchema,
    ),
    abortSignal: options.abortSignal,
    fetch: this.config.fetch,
  });

  const { body, warnings } = await this.getArgs(options);

  const { value: transcriptionInitResponse } = await postJsonToApi({
    url: this.config.url({
      path: '/v2/pre-recorded',
      modelId: 'default',
    }),
    headers: combineHeaders(this.config.headers(), options.headers),
    body: {
      ...body,
      audio_url: uploadResponse.audio_url,
    },
    failedResponseHandler: gladiaFailedResponseHandler,
    successfulResponseHandler: createJsonResponseHandler(
      gladiaTranscriptionInitializeResponseSchema,
    ),
    abortSignal: options.abortSignal,
    fetch: this.config.fetch,
  });

  // Poll the result URL until the transcription is done or an error occurs
  const resultUrl = transcriptionInitResponse.result_url;
  let transcriptionResult;
  let transcriptionResultHeaders;
  const timeoutMs = 60 * 1000; // 60 seconds timeout
  const startTime = Date.now();
  const pollingInterval = 1000;

  while (true) {
    // Check if we've exceeded the timeout. The check runs before each
    // request, so the last observed result (if any) is attached as cause.
    if (Date.now() - startTime > timeoutMs) {
      throw new AISDKError({
        message: 'Transcription job polling timed out',
        name: 'TranscriptionJobPollingTimedOut',
        cause: transcriptionResult,
      });
    }

    const response = await getFromApi({
      url: resultUrl,
      headers: combineHeaders(this.config.headers(), options.headers),
      failedResponseHandler: gladiaFailedResponseHandler,
      successfulResponseHandler: createJsonResponseHandler(
        gladiaTranscriptionResultResponseSchema,
      ),
      abortSignal: options.abortSignal,
      fetch: this.config.fetch,
    });

    transcriptionResult = response.value;
    transcriptionResultHeaders = response.responseHeaders;

    if (transcriptionResult.status === 'done') {
      break;
    }

    if (transcriptionResult.status === 'error') {
      throw new AISDKError({
        message: 'Transcription job failed',
        name: 'TranscriptionJobFailed',
        cause: transcriptionResult,
      });
    }

    // Wait for the configured polling interval before checking again.
    // Forward the abort signal so a cancelled call stops waiting right away
    // instead of sleeping through the interval first.
    await delay(pollingInterval, { abortSignal: options.abortSignal });
  }

  // A job can report `done` while the schema still allows a missing result.
  if (!transcriptionResult.result) {
    throw new AISDKError({
      message: 'Transcription result is empty',
      name: 'TranscriptionResultEmpty',
      cause: transcriptionResult,
    });
  }

  // Process the successful result
  return {
    text: transcriptionResult.result.transcription.full_transcript,
    durationInSeconds: transcriptionResult.result.metadata.audio_duration,
    // `.at(0)` yields undefined when no language was reported.
    language: transcriptionResult.result.transcription.languages.at(0),
    segments: transcriptionResult.result.transcription.utterances.map(
      utterance => ({
        text: utterance.text,
        startSecond: utterance.start,
        endSecond: utterance.end,
      }),
    ),
    response: {
      timestamp: currentDate,
      modelId: 'default',
      headers: transcriptionResultHeaders,
    },
    providerMetadata: {
      gladia: transcriptionResult,
    },
    warnings,
  };
}
622
+ }
623
+
624
/**
 * Shape of the `/v2/upload` response that this module relies on: only the
 * `audio_url` string is validated and read.
 */
const gladiaUploadResponseSchema = z.object({ audio_url: z.string() });
627
+
628
/**
 * Shape of the `/v2/pre-recorded` response that this module relies on: only
 * the `result_url` string (the address polled for the job result) is
 * validated and read.
 */
const gladiaTranscriptionInitializeResponseSchema = z.object({
  result_url: z.string(),
});
631
+
632
/** One transcribed utterance: its text plus numeric start and end offsets. */
const gladiaTranscriptionUtteranceSchema = z.object({
  start: z.number(),
  end: z.number(),
  text: z.string(),
});

/** Payload of a transcription job: audio metadata and the transcription. */
const gladiaTranscriptionResultPayloadSchema = z.object({
  metadata: z.object({
    audio_duration: z.number(),
  }),
  transcription: z.object({
    full_transcript: z.string(),
    languages: z.array(z.string()),
    utterances: z.array(gladiaTranscriptionUtteranceSchema),
  }),
});

/**
 * Shape of a polling response for a transcription job.
 *
 * `status` is one of `queued`, `processing`, `done` or `error`; `result` may
 * be absent or `null`.
 */
const gladiaTranscriptionResultResponseSchema = z.object({
  status: z.enum(['queued', 'processing', 'done', 'error']),
  result: gladiaTranscriptionResultPayloadSchema.nullish(),
});
package/src/index.ts ADDED
@@ -0,0 +1,3 @@
1
+ export { createGladia, gladia } from './gladia-provider';
2
+ export type { GladiaProvider, GladiaProviderSettings } from './gladia-provider';
3
+ export { VERSION } from './version';
Binary file
package/src/version.ts ADDED
@@ -0,0 +1,6 @@
1
// Build-time placeholder: the bundler substitutes the package version for
// this identifier. When nothing is substituted, the identifier does not
// exist at runtime, which is why it is only ever probed through `typeof`.
declare const __PACKAGE_VERSION__: string | undefined;

/**
 * Version string of this package; falls back to `'0.0.0-test'` when no
 * version was injected.
 */
export const VERSION: string =
  typeof __PACKAGE_VERSION__ === 'undefined'
    ? '0.0.0-test'
    : __PACKAGE_VERSION__;