@ai-sdk/revai 0.0.0-70e0935a-20260114150030 → 0.0.0-98261322-20260122142521

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,516 @@
+ import {
+   AISDKError,
+   TranscriptionModelV3,
+   SharedV3Warning,
+ } from '@ai-sdk/provider';
+ import {
+   combineHeaders,
+   convertBase64ToUint8Array,
+   createJsonResponseHandler,
+   mediaTypeToExtension,
+   delay,
+   getFromApi,
+   parseProviderOptions,
+   postFormDataToApi,
+ } from '@ai-sdk/provider-utils';
+ import { z } from 'zod/v4';
+ import { RevaiConfig } from './revai-config';
+ import { revaiFailedResponseHandler } from './revai-error';
+ import { RevaiTranscriptionModelId } from './revai-transcription-options';
+ import { RevaiTranscriptionAPITypes } from './revai-api-types';
+
+ // https://docs.rev.ai/api/asynchronous/reference/#operation/SubmitTranscriptionJob
+ const revaiProviderOptionsSchema = z.object({
+   /**
+    * Optional metadata string to associate with the transcription job.
+    */
+   metadata: z.string().nullish(),
+   /**
+    * Configuration for webhook notifications when job is complete.
+    */
+   notification_config: z
+     .object({
+       /**
+        * URL to send the notification to.
+        */
+       url: z.string(),
+       /**
+        * Optional authorization headers for the notification request.
+        */
+       auth_headers: z
+         .object({
+           Authorization: z.string(),
+         })
+         .nullish(),
+     })
+     .nullish(),
+   /**
+    * Number of seconds after which the job will be automatically deleted.
+    */
+   delete_after_seconds: z.number().nullish(),
+   /**
+    * Whether to include filler words and false starts in the transcription.
+    */
+   verbatim: z.boolean().optional(),
+   /**
+    * Whether to prioritize the job for faster processing.
+    */
+   rush: z.boolean().nullish().default(false),
+   /**
+    * Whether to run the job in test mode.
+    */
+   test_mode: z.boolean().nullish().default(false),
+   /**
+    * Specific segments of the audio to transcribe.
+    */
+   segments_to_transcribe: z
+     .array(
+       z.object({
+         /**
+          * Start time of the segment in seconds.
+          */
+         start: z.number(),
+         /**
+          * End time of the segment in seconds.
+          */
+         end: z.number(),
+       }),
+     )
+     .nullish(),
+   /**
+    * Names to assign to speakers in the transcription.
+    */
+   speaker_names: z
+     .array(
+       z.object({
+         /**
+          * Display name for the speaker.
+          */
+         display_name: z.string(),
+       }),
+     )
+     .nullish(),
+   /**
+    * Whether to skip speaker diarization.
+    */
+   skip_diarization: z.boolean().nullish().default(false),
+   /**
+    * Whether to skip post-processing steps.
+    */
+   skip_postprocessing: z.boolean().nullish().default(false),
+   /**
+    * Whether to skip adding punctuation to the transcription.
+    */
+   skip_punctuation: z.boolean().nullish().default(false),
+   /**
+    * Whether to remove disfluencies (um, uh, etc.) from the transcription.
+    */
+   remove_disfluencies: z.boolean().nullish().default(false),
+   /**
+    * Whether to remove atmospheric sounds from the transcription.
+    */
+   remove_atmospherics: z.boolean().nullish().default(false),
+   /**
+    * Whether to filter profanity from the transcription.
+    */
+   filter_profanity: z.boolean().nullish().default(false),
+   /**
+    * Number of speaker channels in the audio.
+    */
+   speaker_channels_count: z.number().nullish(),
+   /**
+    * Expected number of speakers in the audio.
+    */
+   speakers_count: z.number().nullish(),
+   /**
+    * Type of diarization to use.
+    */
+   diarization_type: z
+     .enum(['standard', 'premium'])
+     .nullish()
+     .default('standard'),
+   /**
+    * ID of a custom vocabulary to use for the transcription.
+    */
+   custom_vocabulary_id: z.string().nullish(),
+   /**
+    * Custom vocabularies to use for the transcription.
+    */
+   custom_vocabularies: z.array(z.object({})).optional(),
+   /**
+    * Whether to strictly enforce custom vocabulary.
+    */
+   strict_custom_vocabulary: z.boolean().optional(),
+   /**
+    * Configuration for generating a summary of the transcription.
+    */
+   summarization_config: z
+     .object({
+       /**
+        * Model to use for summarization.
+        */
+       model: z.enum(['standard', 'premium']).nullish().default('standard'),
+       /**
+        * Format of the summary.
+        */
+       type: z.enum(['paragraph', 'bullets']).nullish().default('paragraph'),
+       /**
+        * Custom prompt for the summarization.
+        */
+       prompt: z.string().nullish(),
+     })
+     .nullish(),
+   /**
+    * Configuration for translating the transcription.
+    */
+   translation_config: z
+     .object({
+       /**
+        * Target languages for translation.
+        */
+       target_languages: z.array(
+         z.object({
+           /**
+            * Language code for translation target.
+            */
+           language: z.enum([
+             'en',
+             'en-us',
+             'en-gb',
+             'ar',
+             'pt',
+             'pt-br',
+             'pt-pt',
+             'fr',
+             'fr-ca',
+             'es',
+             'es-es',
+             'es-la',
+             'it',
+             'ja',
+             'ko',
+             'de',
+             'ru',
+           ]),
+         }),
+       ),
+       /**
+        * Model to use for translation.
+        */
+       model: z.enum(['standard', 'premium']).nullish().default('standard'),
+     })
+     .nullish(),
+   /**
+    * Language of the audio content.
+    */
+   language: z.string().nullish().default('en'),
+   /**
+    * Whether to perform forced alignment.
+    */
+   forced_alignment: z.boolean().nullish().default(false),
+ });
+
+ export type RevaiTranscriptionCallOptions = z.infer<
+   typeof revaiProviderOptionsSchema
+ >;
+
+ interface RevaiTranscriptionModelConfig extends RevaiConfig {
+   _internal?: {
+     currentDate?: () => Date;
+   };
+ }
+
+ export class RevaiTranscriptionModel implements TranscriptionModelV3 {
+   readonly specificationVersion = 'v3';
+
+   get provider(): string {
+     return this.config.provider;
+   }
+
+   constructor(
+     readonly modelId: RevaiTranscriptionModelId,
+     private readonly config: RevaiTranscriptionModelConfig,
+   ) {}
+
+   private async getArgs({
+     audio,
+     mediaType,
+     providerOptions,
+   }: Parameters<TranscriptionModelV3['doGenerate']>[0]) {
+     const warnings: SharedV3Warning[] = [];
+
+     // Parse provider options
+     const revaiOptions = await parseProviderOptions({
+       provider: 'revai',
+       providerOptions,
+       schema: revaiProviderOptionsSchema,
+     });
+
+     // Create form data with base fields
+     const formData = new FormData();
+     const blob =
+       audio instanceof Uint8Array
+         ? new Blob([audio])
+         : new Blob([convertBase64ToUint8Array(audio)]);
+
+     const fileExtension = mediaTypeToExtension(mediaType);
+     formData.append(
+       'media',
+       new File([blob], 'audio', { type: mediaType }),
+       `audio.${fileExtension}`,
+     );
+     const transcriptionModelOptions: RevaiTranscriptionAPITypes = {
+       transcriber: this.modelId,
+     };
+
+     // Add provider-specific options
+     if (revaiOptions) {
+       const formDataConfig: RevaiTranscriptionAPITypes = {
+         metadata: revaiOptions.metadata ?? undefined,
+         notification_config: revaiOptions.notification_config ?? undefined,
+         delete_after_seconds: revaiOptions.delete_after_seconds ?? undefined,
+         verbatim: revaiOptions.verbatim ?? undefined,
+         rush: revaiOptions.rush ?? undefined,
+         test_mode: revaiOptions.test_mode ?? undefined,
+         segments_to_transcribe:
+           revaiOptions.segments_to_transcribe ?? undefined,
+         speaker_names: revaiOptions.speaker_names ?? undefined,
+         skip_diarization: revaiOptions.skip_diarization ?? undefined,
+         skip_postprocessing: revaiOptions.skip_postprocessing ?? undefined,
+         skip_punctuation: revaiOptions.skip_punctuation ?? undefined,
+         remove_disfluencies: revaiOptions.remove_disfluencies ?? undefined,
+         remove_atmospherics: revaiOptions.remove_atmospherics ?? undefined,
+         filter_profanity: revaiOptions.filter_profanity ?? undefined,
+         speaker_channels_count:
+           revaiOptions.speaker_channels_count ?? undefined,
+         speakers_count: revaiOptions.speakers_count ?? undefined,
+         diarization_type: revaiOptions.diarization_type ?? undefined,
+         custom_vocabulary_id: revaiOptions.custom_vocabulary_id ?? undefined,
+         custom_vocabularies: revaiOptions.custom_vocabularies ?? undefined,
+         strict_custom_vocabulary:
+           revaiOptions.strict_custom_vocabulary ?? undefined,
+         summarization_config: revaiOptions.summarization_config ?? undefined,
+         translation_config: revaiOptions.translation_config ?? undefined,
+         language: revaiOptions.language ?? undefined,
+         forced_alignment: revaiOptions.forced_alignment ?? undefined,
+       };
+
+       for (const key in formDataConfig) {
+         const value = formDataConfig[key as keyof RevaiTranscriptionAPITypes];
+         if (value !== undefined) {
+           (transcriptionModelOptions as Record<string, unknown>)[
+             key as keyof RevaiTranscriptionAPITypes
+           ] = value;
+         }
+       }
+     }
+
+     formData.append('config', JSON.stringify(transcriptionModelOptions));
+
+     return {
+       formData,
+       warnings,
+     };
+   }
+
+   async doGenerate(
+     options: Parameters<TranscriptionModelV3['doGenerate']>[0],
+   ): Promise<Awaited<ReturnType<TranscriptionModelV3['doGenerate']>>> {
+     const currentDate = this.config._internal?.currentDate?.() ?? new Date();
+     const { formData, warnings } = await this.getArgs(options);
+
+     const { value: submissionResponse } = await postFormDataToApi({
+       url: this.config.url({
+         path: '/speechtotext/v1/jobs',
+         modelId: this.modelId,
+       }),
+       headers: combineHeaders(this.config.headers(), options.headers),
+       formData,
+       failedResponseHandler: revaiFailedResponseHandler,
+       successfulResponseHandler: createJsonResponseHandler(
+         revaiTranscriptionJobResponseSchema,
+       ),
+       abortSignal: options.abortSignal,
+       fetch: this.config.fetch,
+     });
+
+     if (submissionResponse.status === 'failed') {
+       throw new AISDKError({
+         message: 'Failed to submit transcription job to Rev.ai',
+         name: 'TranscriptionJobSubmissionFailed',
+         cause: submissionResponse,
+       });
+     }
+
+     const jobId = submissionResponse.id;
+     const timeoutMs = 60 * 1000; // 60 seconds timeout
+     const startTime = Date.now();
+     const pollingInterval = 1000;
+     let jobResponse = submissionResponse;
+
+     while (jobResponse.status !== 'transcribed') {
+       // Check if we've exceeded the timeout
+       if (Date.now() - startTime > timeoutMs) {
+         throw new AISDKError({
+           message: 'Transcription job polling timed out',
+           name: 'TranscriptionJobPollingTimedOut',
+           cause: submissionResponse,
+         });
+       }
+
+       // Poll for job status
+       const pollingResult = await getFromApi({
+         url: this.config.url({
+           path: `/speechtotext/v1/jobs/${jobId}`,
+           modelId: this.modelId,
+         }),
+         headers: combineHeaders(this.config.headers(), options.headers),
+         failedResponseHandler: revaiFailedResponseHandler,
+         successfulResponseHandler: createJsonResponseHandler(
+           revaiTranscriptionJobResponseSchema,
+         ),
+         abortSignal: options.abortSignal,
+         fetch: this.config.fetch,
+       });
+
+       jobResponse = pollingResult.value;
+
+       if (jobResponse.status === 'failed') {
+         throw new AISDKError({
+           message: 'Transcription job failed',
+           name: 'TranscriptionJobFailed',
+           cause: jobResponse,
+         });
+       }
+
+       // Wait before polling again (only if we need to continue polling)
+       if (jobResponse.status !== 'transcribed') {
+         await delay(pollingInterval);
+       }
+     }
+
+     const {
+       value: transcriptionResult,
+       responseHeaders,
+       rawValue: rawResponse,
+     } = await getFromApi({
+       url: this.config.url({
+         path: `/speechtotext/v1/jobs/${jobId}/transcript`,
+         modelId: this.modelId,
+       }),
+       headers: combineHeaders(this.config.headers(), options.headers),
+       failedResponseHandler: revaiFailedResponseHandler,
+       successfulResponseHandler: createJsonResponseHandler(
+         revaiTranscriptionResponseSchema,
+       ),
+       abortSignal: options.abortSignal,
+       fetch: this.config.fetch,
+     });
+
+     let durationInSeconds = 0;
+     const segments: {
+       text: string;
+       startSecond: number;
+       endSecond: number;
+     }[] = [];
+
+     for (const monologue of transcriptionResult.monologues ?? []) {
+       // Process each monologue to extract segments with timing information
+       let currentSegmentText = '';
+       let segmentStartSecond = 0;
+       let hasStartedSegment = false;
+
+       for (const element of monologue?.elements ?? []) {
+         // Add the element value to the current segment text
+         currentSegmentText += element.value;
+
+         // For text elements, track timing information
+         if (element.type === 'text') {
+           // Update the overall duration if this is the latest timestamp
+           if (element.end_ts && element.end_ts > durationInSeconds) {
+             durationInSeconds = element.end_ts;
+           }
+
+           // If this is the first text element in a segment, mark the start time
+           if (!hasStartedSegment && typeof element.ts === 'number') {
+             segmentStartSecond = element.ts;
+             hasStartedSegment = true;
+           }
+
+           // If we have an end timestamp, we can complete a segment
+           if (typeof element.end_ts === 'number' && hasStartedSegment) {
+             // Only add non-empty segments
+             if (currentSegmentText.trim()) {
+               segments.push({
+                 text: currentSegmentText.trim(),
+                 startSecond: segmentStartSecond,
+                 endSecond: element.end_ts,
+               });
+             }
+
+             // Reset for the next segment
+             currentSegmentText = '';
+             hasStartedSegment = false;
+           }
+         }
+       }
+
+       // Handle any remaining segment text that wasn't added
+       if (hasStartedSegment && currentSegmentText.trim()) {
+         const endSecond =
+           durationInSeconds > segmentStartSecond
+             ? durationInSeconds
+             : segmentStartSecond + 1;
+         segments.push({
+           text: currentSegmentText.trim(),
+           startSecond: segmentStartSecond,
+           endSecond: endSecond,
+         });
+       }
+     }
+
+     return {
+       text:
+         transcriptionResult.monologues
+           ?.map(monologue =>
+             monologue?.elements?.map(element => element.value).join(''),
+           )
+           .join(' ') ?? '',
+       segments,
+       language: submissionResponse.language ?? undefined,
+       durationInSeconds,
+       warnings,
+       response: {
+         timestamp: currentDate,
+         modelId: this.modelId,
+         headers: responseHeaders,
+         body: rawResponse,
+       },
+     };
+   }
+ }
+
+ const revaiTranscriptionJobResponseSchema = z.object({
+   id: z.string().nullish(),
+   status: z.string().nullish(),
+   language: z.string().nullish(),
+ });
+
+ const revaiTranscriptionResponseSchema = z.object({
+   monologues: z
+     .array(
+       z.object({
+         elements: z
+           .array(
+             z.object({
+               type: z.string().nullish(),
+               value: z.string().nullish(),
+               ts: z.number().nullish(),
+               end_ts: z.number().nullish(),
+             }),
+           )
+           .nullish(),
+       }),
+     )
+     .nullish(),
+ });
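For orientation (not part of the diff itself): per the getArgs logic above, the multipart request carries the audio under the 'media' field and a JSON 'config' field built from the model id plus any parsed provider options, with schema defaults applied and null/undefined values dropped. A minimal sketch of that mapping, using made-up option values:

// Illustrative trace only; the option values below are hypothetical.
const providerOptions = {
  revai: {
    skip_diarization: true,
    summarization_config: { type: 'bullets' },
  },
};

// For modelId 'machine', getArgs() would serialize roughly this object into the
// 'config' form field: booleans such as rush, test_mode, and filter_profanity
// pick up their schema defaults of false, and nested defaults (e.g. the
// summarization model) are filled in as well.
const approximateConfig = {
  transcriber: 'machine',
  rush: false,
  test_mode: false,
  skip_diarization: true,
  skip_postprocessing: false,
  skip_punctuation: false,
  remove_disfluencies: false,
  remove_atmospherics: false,
  filter_profanity: false,
  diarization_type: 'standard',
  summarization_config: { model: 'standard', type: 'bullets' },
  language: 'en',
  forced_alignment: false,
};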
@@ -0,0 +1 @@
+ export type RevaiTranscriptionModelId = 'machine' | 'low_cost' | 'fusion';
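A minimal usage sketch under stated assumptions: the package's index is not part of this diff, so the `revai` provider object and its `transcription(modelId)` factory are assumed here, as is wiring through the AI SDK's `experimental_transcribe` entry point.

// Hedged usage sketch; the `revai` provider export and its factory are
// assumptions, since the index file does not appear in this diff.
import { experimental_transcribe as transcribe } from 'ai';
import { revai } from '@ai-sdk/revai';
import { readFile } from 'node:fs/promises';

const result = await transcribe({
  model: revai.transcription('machine'),
  // doGenerate accepts a Uint8Array or a base64-encoded string.
  audio: await readFile('meeting.mp3'),
  providerOptions: {
    revai: {
      skip_diarization: true,
      summarization_config: { type: 'bullets' },
    },
  },
});

console.log(result.text);
console.log(result.segments); // [{ text, startSecond, endSecond }, ...]
console.log(result.durationInSeconds);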
Binary file
package/src/version.ts ADDED
@@ -0,0 +1,6 @@
+ // Version string of this package injected at build time.
+ declare const __PACKAGE_VERSION__: string | undefined;
+ export const VERSION: string =
+   typeof __PACKAGE_VERSION__ !== 'undefined'
+     ? __PACKAGE_VERSION__
+     : '0.0.0-test';
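The `__PACKAGE_VERSION__` global is expected to be substituted by the build tooling. The build configuration is not included in this diff, but a typical setup would use a bundler define, for example (hypothetically) with tsup:

// tsup.config.ts (hypothetical sketch, not part of this package diff).
// Importing package.json like this requires "resolveJsonModule" in tsconfig.
import { defineConfig } from 'tsup';
import { version } from './package.json';

export default defineConfig({
  entry: ['src/index.ts'],
  define: {
    // Replaced textually in the emitted bundle; JSON.stringify keeps it a valid
    // string literal. The '0.0.0-test' fallback then only applies when the
    // define is absent, e.g. when running the source directly in tests.
    __PACKAGE_VERSION__: JSON.stringify(version),
  },
});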