@ai-sdk/google-vertex 4.0.145 → 4.0.147

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,214 @@
1
+ import type { SharedV3Warning, TranscriptionModelV3 } from '@ai-sdk/provider';
2
+ import {
3
+ combineHeaders,
4
+ convertUint8ArrayToBase64,
5
+ createJsonResponseHandler,
6
+ parseProviderOptions,
7
+ postJsonToApi,
8
+ resolve,
9
+ type FetchFunction,
10
+ type Resolvable,
11
+ } from '@ai-sdk/provider-utils';
12
+ import { z } from 'zod/v4';
13
+ import { googleVertexFailedResponseHandler } from './google-vertex-error';
14
+ import {
15
+ googleVertexTranscriptionProviderOptionsSchema,
16
+ type GoogleVertexTranscriptionModelId,
17
+ type GoogleVertexTranscriptionModelOptions,
18
+ } from './google-vertex-transcription-model-options';
19
+
20
/**
 * Construction-time settings for `GoogleVertexTranscriptionModel`.
 */
interface GoogleVertexTranscriptionModelConfig {
  /** Provider identifier exposed through the model's `provider` getter. */
  provider: string;

  /** Google Cloud project id; interpolated into the request URL. */
  project: string;

  /**
   * Default Speech-to-Text region. A `region` supplied through provider
   * options takes precedence over this value.
   */
  location: string;

  /**
   * Base request headers (may be lazily resolved). They are combined with the
   * per-call headers when the request is sent.
   */
  headers?: Resolvable<Record<string, string | undefined>>;

  /** Optional custom fetch implementation forwarded to the HTTP helper. */
  fetch?: FetchFunction;

  /** Internal hooks. */
  _internal?: {
    /** Overrides the clock used for the response timestamp. */
    currentDate?: () => Date;
  };
}
32
+
33
// A decimal number (optional sign, optional fraction) followed by an optional
// `s` unit suffix, with nothing else but surrounding whitespace.
const DURATION_PATTERN = /^\s*([+-]?(?:\d+\.?\d*|\.\d+))s?\s*$/;

/**
 * Parses a Speech-to-Text duration string such as `"1.200s"` into seconds.
 *
 * The whole string must be a decimal number, optionally followed by the `s`
 * suffix. Anything else (for example `"5min"` or `"1.2.3s"`) is rejected
 * instead of being truncated to its numeric prefix.
 *
 * @param value - Duration string, or `null`/`undefined` when absent.
 * @returns The duration in seconds, or `undefined` when the value is missing,
 *   malformed, or not a finite number.
 */
function parseDurationSeconds(
  value: string | null | undefined,
): number | undefined {
  if (value == null) {
    return undefined;
  }

  const numericPart = DURATION_PATTERN.exec(value)?.[1];
  if (numericPart === undefined) {
    return undefined;
  }

  const seconds = Number.parseFloat(numericPart);
  // Extremely long digit strings overflow to Infinity; treat those as invalid.
  return Number.isFinite(seconds) ? seconds : undefined;
}
43
+
44
/**
 * Extracts the two-letter language subtag from a BCP 47 language tag.
 *
 * @param value - Language tag (e.g. `"en-US"`), or `null`/`undefined`.
 * @returns The language subtag reported by `Intl.Locale` when it is exactly
 *   two characters long; `undefined` when the input is missing, when
 *   `Intl.Locale` rejects the tag, or when the subtag has another length.
 */
function convertBcp47ToIso6391(
  value: string | null | undefined,
): string | undefined {
  if (value === null || value === undefined) {
    return undefined;
  }

  let primarySubtag: string;
  try {
    primarySubtag = new Intl.Locale(value).language;
  } catch {
    // Tag could not be parsed as a locale.
    return undefined;
  }

  if (primarySubtag.length !== 2) {
    return undefined;
  }
  return primarySubtag;
}
58
+
59
/**
 * Transcription model that sends audio to the Speech-to-Text v2 `recognize`
 * endpoint (`/v2/projects/{project}/locations/{region}/recognizers/_:recognize`)
 * and maps the response onto the `TranscriptionModelV3` result shape.
 */
export class GoogleVertexTranscriptionModel implements TranscriptionModelV3 {
  readonly specificationVersion = 'v3';

  /** Provider identifier taken from the model configuration. */
  get provider(): string {
    return this.config.provider;
  }

  /**
   * @param modelId - Speech-to-Text model id sent as `config.model`.
   * @param config - Project, default region, headers and fetch settings.
   */
  constructor(
    readonly modelId: GoogleVertexTranscriptionModelId,
    private readonly config: GoogleVertexTranscriptionModelConfig,
  ) {}

  /**
   * Transcribes the given audio with a single `recognize` request.
   *
   * @param options - Call options: `audio` (base64 string or bytes), optional
   *   `providerOptions`, `headers` and `abortSignal`.
   * @returns The joined transcript, word-level segments, the detected
   *   two-letter language (when available), the duration parsed from
   *   `metadata.totalBilledDuration`, warnings, and raw response metadata.
   */
  async doGenerate(
    options: Parameters<TranscriptionModelV3['doGenerate']>[0],
  ): Promise<Awaited<ReturnType<TranscriptionModelV3['doGenerate']>>> {
    const currentDate = this.config._internal?.currentDate?.() ?? new Date();
    const warnings: SharedV3Warning[] = [];

    // Provider options may be passed under `googleVertex`, `vertex`, or `google`
    // (matching the Vertex language + embedding models). The first key that
    // yields options wins.
    let googleOptions: GoogleVertexTranscriptionModelOptions | undefined;
    for (const provider of ['googleVertex', 'vertex', 'google'] as const) {
      googleOptions = await parseProviderOptions({
        provider,
        providerOptions: options.providerOptions,
        schema: googleVertexTranscriptionProviderOptionsSchema,
      });
      if (googleOptions != null) {
        break;
      }
    }

    const region = googleOptions?.region ?? this.config.location;
    const languageCodes = googleOptions?.languageCodes ?? ['auto'];

    // The recognize API takes base64-encoded audio in the `content` field. A
    // string audio input is already base64-encoded per the spec.
    const content =
      typeof options.audio === 'string'
        ? options.audio
        : convertUint8ArrayToBase64(options.audio);

    const requestBody = {
      config: {
        model: this.modelId,
        languageCodes,
        // Let Speech-to-Text auto-detect the audio encoding (wav/mp3/flac/…).
        autoDecodingConfig: {},
        features: {
          // Word timing populates `segments`.
          enableWordTimeOffsets: googleOptions?.enableWordTimeOffsets ?? true,
          enableAutomaticPunctuation:
            googleOptions?.enableAutomaticPunctuation ?? true,
        },
      },
      content,
    };

    // The `global` location uses the un-prefixed host; every other region uses
    // a `<region>-` prefixed host.
    const host =
      region === 'global'
        ? 'speech.googleapis.com'
        : `${region}-speech.googleapis.com`;

    const url =
      `https://${host}/v2/projects/` +
      `${this.config.project}/locations/${region}/recognizers/_:recognize`;

    const {
      value: response,
      responseHeaders,
      rawValue: rawResponse,
    } = await postJsonToApi({
      url,
      headers: combineHeaders(
        this.config.headers ? await resolve(this.config.headers) : undefined,
        options.headers,
      ),
      body: requestBody,
      failedResponseHandler: googleVertexFailedResponseHandler,
      successfulResponseHandler: createJsonResponseHandler(
        googleVertexTranscriptionResponseSchema,
      ),
      abortSignal: options.abortSignal,
      fetch: this.config.fetch,
    });

    // Results are sequential portions of the audio; concatenate their primary
    // alternatives into the full transcript and collect word-level segments.
    const results = response.results ?? [];

    // Trim each piece and drop empty ones before joining so that results
    // without a transcript (or with surrounding whitespace) do not leave
    // doubled spaces inside the final text.
    const text = results
      .map(result => result.alternatives?.[0]?.transcript?.trim() ?? '')
      .filter(transcript => transcript.length > 0)
      .join(' ');

    const segments = results.flatMap(
      result =>
        result.alternatives?.[0]?.words?.flatMap(word => {
          const startSecond = parseDurationSeconds(word.startOffset);
          const endSecond = parseDurationSeconds(word.endOffset);

          // Skip words that lack text or a parseable start/end offset.
          return word.word == null || startSecond == null || endSecond == null
            ? []
            : [{ text: word.word, startSecond, endSecond }];
        }) ?? [],
    );

    // Use the first result that actually reports a language code rather than
    // only looking at the first result.
    const language = convertBcp47ToIso6391(
      results.find(result => result.languageCode != null)?.languageCode,
    );

    return {
      text,
      segments,
      language,
      durationInSeconds: parseDurationSeconds(
        response.metadata?.totalBilledDuration,
      ),
      warnings,
      response: {
        timestamp: currentDate,
        modelId: this.modelId,
        headers: responseHeaders,
        body: rawResponse,
      },
    };
  }
}
182
+
183
// Minimal response schema, built bottom-up from named pieces. It lists only
// the fields the implementation reads, and every field is `.nullish()` so
// provider API changes don't break parsing.

// One recognized word with its start/end offsets (duration strings).
const recognizedWordSchema = z.object({
  word: z.string().nullish(),
  startOffset: z.string().nullish(),
  endOffset: z.string().nullish(),
});

// One recognition hypothesis: transcript plus optional word list.
const recognitionAlternativeSchema = z.object({
  transcript: z.string().nullish(),
  words: z.array(recognizedWordSchema).nullish(),
});

// One sequential portion of the audio.
const recognitionResultSchema = z.object({
  alternatives: z.array(recognitionAlternativeSchema).nullish(),
  languageCode: z.string().nullish(),
});

// Response-level metadata.
const recognitionMetadataSchema = z.object({
  totalBilledDuration: z.string().nullish(),
});

const googleVertexTranscriptionResponseSchema = z.object({
  results: z.array(recognitionResultSchema).nullish(),
  metadata: recognitionMetadataSchema.nullish(),
});
package/src/index.ts CHANGED
@@ -1,4 +1,8 @@
1
1
  export type { GoogleVertexEmbeddingModelOptions } from './google-vertex-embedding-options';
2
+ export type {
3
+ GoogleVertexTranscriptionModelId,
4
+ GoogleVertexTranscriptionModelOptions,
5
+ } from './google-vertex-transcription-model-options';
2
6
  export type {
3
7
  GoogleVertexImageModelOptions,
4
8
  /** @deprecated Use `GoogleVertexImageModelOptions` instead. */