@ai-sdk/provider 4.0.0-beta.14 → 4.0.0-beta.19

This diff compares two publicly available versions of the package as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -5,7 +5,6 @@ import type {
5
5
  SharedV4FileDataUrl,
6
6
  } from '../../shared/v4/shared-v4-file-data';
7
7
  import type { SharedV4ProviderOptions } from '../../shared/v4/shared-v4-provider-options';
8
- import type { SharedV4ProviderReference } from '../../shared/v4/shared-v4-provider-reference';
9
8
 
10
9
  /**
11
10
  * A prompt is a list of messages.
@@ -359,15 +358,28 @@ export type LanguageModelV4ToolResultOutput =
359
358
  providerOptions?: SharedV4ProviderOptions;
360
359
  }
361
360
  | {
362
- type: 'file-data';
361
+ type: 'file';
363
362
 
364
363
  /**
365
- * Base-64 encoded media data.
364
+ * File data as a tagged discriminated union:
365
+ *
366
+ * - `{ type: 'data', data }`: raw bytes (Uint8Array) or base64-encoded string.
367
+ * - `{ type: 'url', url }`: a URL that points to the file.
368
+ * - `{ type: 'reference', reference }`: a provider reference (`{ [provider]: id }`).
369
+ * - `{ type: 'text', text }`: inline text content (e.g. an inline text document).
366
370
  */
367
- data: string;
371
+ data: SharedV4FileData;
368
372
 
369
373
  /**
370
- * IANA media type.
374
+ * Either a full IANA media type (`type/subtype`, e.g. `image/png`) or just
375
+ * the top-level IANA segment (e.g. `image`, `audio`, `video`, `text`).
376
+ *
377
+ * `*`-subtype wildcards (e.g. `image/*`) are normalized as equivalent to the
378
+ * top-level segment alone (e.g. `image`). Providers can use the helpers in
379
+ * `@ai-sdk/provider-utils` (`isFullMediaType`, `getTopLevelMediaType`,
380
+ * `detectMediaType`) to resolve the field according to their API
381
+ * requirements.
382
+ *
371
383
  * @see https://www.iana.org/assignments/media-types/media-types.xhtml
372
384
  */
373
385
  mediaType: string;
@@ -377,39 +389,6 @@ export type LanguageModelV4ToolResultOutput =
377
389
  */
378
390
  filename?: string;
379
391
 
380
- /**
381
- * Provider-specific options.
382
- */
383
- providerOptions?: SharedV4ProviderOptions;
384
- }
385
- | {
386
- type: 'file-url';
387
-
388
- /**
389
- * URL of the file.
390
- */
391
- url: string;
392
-
393
- /**
394
- * IANA media type.
395
- * @see https://www.iana.org/assignments/media-types/media-types.xhtml
396
- */
397
- mediaType: string;
398
-
399
- /**
400
- * Provider-specific options.
401
- */
402
- providerOptions?: SharedV4ProviderOptions;
403
- }
404
- | {
405
- type: 'file-reference';
406
-
407
- /**
408
- * Provider-specific references for the file.
409
- * The key is the provider name, e.g. 'openai' or 'anthropic'.
410
- */
411
- providerReference: SharedV4ProviderReference;
412
-
413
392
  /**
414
393
  * Provider-specific options.
415
394
  */
@@ -0,0 +1 @@
1
+ export * from './v4/index';
@@ -0,0 +1,20 @@
1
+ export type {
2
+ RealtimeFactoryV4 as Experimental_RealtimeFactoryV4,
3
+ RealtimeFactoryV4GetTokenOptions as Experimental_RealtimeFactoryV4GetTokenOptions,
4
+ RealtimeFactoryV4GetTokenResult as Experimental_RealtimeFactoryV4GetTokenResult,
5
+ } from './realtime-factory-v4';
6
+ export type { RealtimeModelV4 as Experimental_RealtimeModelV4 } from './realtime-model-v4';
7
+ export type { RealtimeModelV4ClientEvent as Experimental_RealtimeModelV4ClientEvent } from './realtime-model-v4-client-event';
8
+ export type {
9
+ RealtimeModelV4ClientSecretOptions as Experimental_RealtimeModelV4ClientSecretOptions,
10
+ RealtimeModelV4ClientSecretResult as Experimental_RealtimeModelV4ClientSecretResult,
11
+ } from './realtime-model-v4-client-secret';
12
+ export type {
13
+ RealtimeModelV4ConversationItem as Experimental_RealtimeModelV4ConversationItem,
14
+ RealtimeModelV4TextMessage as Experimental_RealtimeModelV4TextMessage,
15
+ RealtimeModelV4AudioMessage as Experimental_RealtimeModelV4AudioMessage,
16
+ RealtimeModelV4FunctionCallOutput as Experimental_RealtimeModelV4FunctionCallOutput,
17
+ } from './realtime-model-v4-conversation-item';
18
+ export type { RealtimeModelV4ServerEvent as Experimental_RealtimeModelV4ServerEvent } from './realtime-model-v4-server-event';
19
+ export type { RealtimeModelV4SessionConfig as Experimental_RealtimeModelV4SessionConfig } from './realtime-model-v4-session-config';
20
+ export type { RealtimeModelV4ToolDefinition as Experimental_RealtimeModelV4ToolDefinition } from './realtime-model-v4-tool-definition';
@@ -0,0 +1,20 @@
1
+ import type { RealtimeModelV4 } from './realtime-model-v4';
2
+ import type { RealtimeModelV4ClientSecretOptions } from './realtime-model-v4-client-secret';
3
+
4
+ export type RealtimeFactoryV4GetTokenOptions = {
5
+ model: string;
6
+ } & RealtimeModelV4ClientSecretOptions;
7
+
8
+ export type RealtimeFactoryV4GetTokenResult = {
9
+ token: string;
10
+ url: string;
11
+ expiresAt?: number;
12
+ };
13
+
14
+ export interface RealtimeFactoryV4 {
15
+ (modelId: string): RealtimeModelV4;
16
+
17
+ getToken(
18
+ options: RealtimeFactoryV4GetTokenOptions,
19
+ ): Promise<RealtimeFactoryV4GetTokenResult>;
20
+ }
@@ -0,0 +1,68 @@
1
+ import type { RealtimeModelV4ConversationItem } from './realtime-model-v4-conversation-item';
2
+ import type { RealtimeModelV4SessionConfig } from './realtime-model-v4-session-config';
3
+
4
+ /**
5
+ * Normalized events sent from the browser to the realtime model.
6
+ * Each provider maps this to its native event format before sending
7
+ * over the WebSocket.
8
+ */
9
+ export type RealtimeModelV4ClientEvent =
10
+ // ── Session ────────────────────────────────────────────────────────
11
+
12
+ | {
13
+ type: 'session-update';
14
+ config: RealtimeModelV4SessionConfig;
15
+ }
16
+
17
+ // ── Input audio buffer ─────────────────────────────────────────────
18
+ | {
19
+ type: 'input-audio-append';
20
+
21
+ /**
22
+ * Base64-encoded audio chunk to append to the input buffer.
23
+ */
24
+ audio: string;
25
+ }
26
+ | {
27
+ type: 'input-audio-commit';
28
+ }
29
+ | {
30
+ type: 'input-audio-clear';
31
+ }
32
+
33
+ // ── Conversation items ─────────────────────────────────────────────
34
+ | {
35
+ type: 'conversation-item-create';
36
+ item: RealtimeModelV4ConversationItem;
37
+ }
38
+ | {
39
+ type: 'conversation-item-truncate';
40
+
41
+ /**
42
+ * The ID of the assistant message item to truncate.
43
+ */
44
+ itemId: string;
45
+
46
+ /**
47
+ * The index of the content part to truncate.
48
+ */
49
+ contentIndex: number;
50
+
51
+ /**
52
+ * Truncate audio after this many milliseconds.
53
+ */
54
+ audioEndMs: number;
55
+ }
56
+
57
+ // ── Response control ───────────────────────────────────────────────
58
+ | {
59
+ type: 'response-create';
60
+ options?: {
61
+ modalities?: string[];
62
+ instructions?: string;
63
+ metadata?: Record<string, unknown>;
64
+ };
65
+ }
66
+ | {
67
+ type: 'response-cancel';
68
+ };
@@ -0,0 +1,40 @@
1
+ import type { RealtimeModelV4SessionConfig } from './realtime-model-v4-session-config';
2
+
3
+ /**
4
+ * Options for creating an ephemeral client secret for browser-side
5
+ * WebSocket connections to a realtime model.
6
+ */
7
+ export type RealtimeModelV4ClientSecretOptions = {
8
+ /**
9
+ * Number of seconds until the client secret expires.
10
+ */
11
+ expiresAfterSeconds?: number;
12
+
13
+ /**
14
+ * Optional session configuration to embed in the token request.
15
+ * Some providers (e.g. Google) require the full session config at token creation time.
16
+ */
17
+ sessionConfig?: RealtimeModelV4SessionConfig;
18
+ };
19
+
20
+ /**
21
+ * Result of creating an ephemeral client secret.
22
+ */
23
+ export type RealtimeModelV4ClientSecretResult = {
24
+ /**
25
+ * The ephemeral token value. Used as a Bearer token or in the
26
+ * WebSocket subprotocol header for authentication.
27
+ */
28
+ token: string;
29
+
30
+ /**
31
+ * The WebSocket URL to connect to. Includes any provider-specific
32
+ * query parameters (e.g. model ID).
33
+ */
34
+ url: string;
35
+
36
+ /**
37
+ * Unix timestamp (seconds) when this client secret expires.
38
+ */
39
+ expiresAt?: number;
40
+ };
@@ -0,0 +1,55 @@
1
+ /**
2
+ * A conversation item that can be created by the client and sent to
3
+ * the model via the `conversation-item-create` client event.
4
+ */
5
+ export type RealtimeModelV4ConversationItem =
6
+ | RealtimeModelV4TextMessage
7
+ | RealtimeModelV4AudioMessage
8
+ | RealtimeModelV4FunctionCallOutput;
9
+
10
+ /**
11
+ * A text message from the user.
12
+ */
13
+ export type RealtimeModelV4TextMessage = {
14
+ type: 'text-message';
15
+ role: 'user';
16
+ text: string;
17
+ };
18
+
19
+ /**
20
+ * An audio message from the user (complete audio, not streamed).
21
+ */
22
+ export type RealtimeModelV4AudioMessage = {
23
+ type: 'audio-message';
24
+ role: 'user';
25
+
26
+ /**
27
+ * Base64-encoded audio data.
28
+ */
29
+ audio: string;
30
+ };
31
+
32
+ /**
33
+ * The output of a function call, sent back to the model so it can
34
+ * continue generating a response using the tool result.
35
+ */
36
+ export type RealtimeModelV4FunctionCallOutput = {
37
+ type: 'function-call-output';
38
+
39
+ /**
40
+ * The call ID from the function-call-arguments-done event.
41
+ * Must match so the model knows which function call this result is for.
42
+ */
43
+ callId: string;
44
+
45
+ /**
46
+ * The name of the function that was called.
47
+ * Required by some providers (e.g. Google) to route the tool response.
48
+ */
49
+ name?: string;
50
+
51
+ /**
52
+ * JSON string containing the function call result.
53
+ */
54
+ output: string;
55
+ };
@@ -0,0 +1,199 @@
1
+ /**
2
+ * Normalized events emitted by the realtime model (model → browser).
3
+ * Each provider maps its native event format to this discriminated union.
4
+ *
5
+ * Every event includes a `raw` field with the original provider-specific
6
+ * event data for debugging and provider-specific access.
7
+ */
8
+ export type RealtimeModelV4ServerEvent =
9
+ // ── Session lifecycle ──────────────────────────────────────────────
10
+
11
+ | {
12
+ type: 'session-created';
13
+ sessionId?: string;
14
+ raw: unknown;
15
+ }
16
+ | {
17
+ type: 'session-updated';
18
+ raw: unknown;
19
+ }
20
+
21
+ // ── Input audio buffer ─────────────────────────────────────────────
22
+ | {
23
+ type: 'speech-started';
24
+ itemId?: string;
25
+ raw: unknown;
26
+ }
27
+ | {
28
+ type: 'speech-stopped';
29
+ itemId?: string;
30
+ raw: unknown;
31
+ }
32
+ | {
33
+ type: 'audio-committed';
34
+ itemId?: string;
35
+ previousItemId?: string;
36
+ raw: unknown;
37
+ }
38
+
39
+ // ── Conversation items ─────────────────────────────────────────────
40
+ | {
41
+ type: 'conversation-item-added';
42
+ itemId: string;
43
+ item: unknown;
44
+ raw: unknown;
45
+ }
46
+ | {
47
+ type: 'input-transcription-completed';
48
+ itemId: string;
49
+ transcript: string;
50
+ raw: unknown;
51
+ }
52
+
53
+ // ── Response lifecycle ─────────────────────────────────────────────
54
+ | {
55
+ type: 'response-created';
56
+ responseId: string;
57
+ raw: unknown;
58
+ }
59
+ | {
60
+ type: 'response-done';
61
+ responseId: string;
62
+ status: string;
63
+ raw: unknown;
64
+ }
65
+
66
+ // ── Output item lifecycle ──────────────────────────────────────────
67
+ | {
68
+ type: 'output-item-added';
69
+ responseId: string;
70
+ itemId: string;
71
+ raw: unknown;
72
+ }
73
+ | {
74
+ type: 'output-item-done';
75
+ responseId: string;
76
+ itemId: string;
77
+ raw: unknown;
78
+ }
79
+ | {
80
+ type: 'content-part-added';
81
+ responseId: string;
82
+ itemId: string;
83
+ raw: unknown;
84
+ }
85
+ | {
86
+ type: 'content-part-done';
87
+ responseId: string;
88
+ itemId: string;
89
+ raw: unknown;
90
+ }
91
+
92
+ // ── Audio output ───────────────────────────────────────────────────
93
+ | {
94
+ type: 'audio-delta';
95
+ responseId: string;
96
+ itemId: string;
97
+
98
+ /**
99
+ * Base64-encoded audio chunk.
100
+ */
101
+ delta: string;
102
+ raw: unknown;
103
+ }
104
+ | {
105
+ type: 'audio-done';
106
+ responseId: string;
107
+ itemId: string;
108
+ raw: unknown;
109
+ }
110
+
111
+ // ── Audio transcript output ────────────────────────────────────────
112
+ | {
113
+ type: 'audio-transcript-delta';
114
+ responseId: string;
115
+ itemId: string;
116
+
117
+ /**
118
+ * Text chunk of the audio transcript.
119
+ */
120
+ delta: string;
121
+ raw: unknown;
122
+ }
123
+ | {
124
+ type: 'audio-transcript-done';
125
+ responseId: string;
126
+ itemId: string;
127
+ transcript?: string;
128
+ raw: unknown;
129
+ }
130
+
131
+ // ── Text output ────────────────────────────────────────────────────
132
+ | {
133
+ type: 'text-delta';
134
+ responseId: string;
135
+ itemId: string;
136
+
137
+ /**
138
+ * Text chunk of the model's text response.
139
+ */
140
+ delta: string;
141
+ raw: unknown;
142
+ }
143
+ | {
144
+ type: 'text-done';
145
+ responseId: string;
146
+ itemId: string;
147
+ text?: string;
148
+ raw: unknown;
149
+ }
150
+
151
+ // ── Function calling ───────────────────────────────────────────────
152
+ | {
153
+ type: 'function-call-arguments-delta';
154
+ responseId: string;
155
+ itemId: string;
156
+ callId: string;
157
+
158
+ /**
159
+ * Partial JSON string of function call arguments.
160
+ */
161
+ delta: string;
162
+ raw: unknown;
163
+ }
164
+ | {
165
+ type: 'function-call-arguments-done';
166
+ responseId: string;
167
+ itemId: string;
168
+ callId: string;
169
+
170
+ /**
171
+ * The name of the function to call.
172
+ */
173
+ name: string;
174
+
175
+ /**
176
+ * Complete JSON string of function call arguments.
177
+ */
178
+ arguments: string;
179
+ raw: unknown;
180
+ }
181
+
182
+ // ── Error ──────────────────────────────────────────────────────────
183
+ | {
184
+ type: 'error';
185
+ message: string;
186
+ code?: string;
187
+ raw: unknown;
188
+ }
189
+
190
+ // ── Custom / provider-specific ────────────────────────────────────
191
+ | {
192
+ type: 'custom';
193
+
194
+ /**
195
+ * The original event type string from the provider.
196
+ */
197
+ rawType: string;
198
+ raw: unknown;
199
+ };
@@ -0,0 +1,142 @@
1
+ import type { RealtimeModelV4ToolDefinition } from './realtime-model-v4-tool-definition';
2
+
3
+ /**
4
+ * Provider-neutral configuration for a realtime session.
5
+ * Each provider maps this to their specific session.update payload.
6
+ */
7
+ export type RealtimeModelV4SessionConfig = {
8
+ /**
9
+ * System instructions for the model.
10
+ */
11
+ instructions?: string;
12
+
13
+ /**
14
+ * Voice to use for audio output.
15
+ */
16
+ voice?: string;
17
+
18
+ /**
19
+ * Which output modalities the model should produce.
20
+ */
21
+ outputModalities?: Array<'text' | 'audio'>;
22
+
23
+ /**
24
+ * Audio format configuration for input audio.
25
+ */
26
+ inputAudioFormat?: {
27
+ /**
28
+ * Audio format type (e.g. "audio/pcm", "audio/pcmu", "audio/pcma").
29
+ */
30
+ type: string;
31
+
32
+ /**
33
+ * Sample rate in Hz. Only applicable for PCM format.
34
+ */
35
+ rate?: number;
36
+ };
37
+
38
+ /**
39
+ * Input audio transcription configuration.
40
+ *
41
+ * When enabled, providers that support input transcription emit normalized
42
+ * `input-transcription-completed` events that can be rendered as user
43
+ * messages.
44
+ */
45
+ inputAudioTranscription?: {
46
+ /**
47
+ * Provider-specific transcription model.
48
+ */
49
+ model?: string;
50
+
51
+ /**
52
+ * Optional language hint for the input audio.
53
+ */
54
+ language?: string;
55
+
56
+ /**
57
+ * Optional prompt to guide transcription.
58
+ */
59
+ prompt?: string;
60
+ };
61
+
62
+ /**
63
+ * Output audio transcription configuration.
64
+ *
65
+ * When enabled, providers that support output transcription emit normalized
66
+ * `audio-transcript-delta` / `audio-transcript-done` events for the model's
67
+ * spoken response. Some providers transcribe output by default; setting this
68
+ * makes the behavior explicit rather than relying on that default.
69
+ */
70
+ outputAudioTranscription?: {
71
+ /**
72
+ * Provider-specific transcription model.
73
+ */
74
+ model?: string;
75
+
76
+ /**
77
+ * Optional language hint for the output audio.
78
+ */
79
+ language?: string;
80
+
81
+ /**
82
+ * Optional prompt to guide transcription.
83
+ */
84
+ prompt?: string;
85
+ };
86
+
87
+ /**
88
+ * Audio format configuration for output audio.
89
+ */
90
+ outputAudioFormat?: {
91
+ /**
92
+ * Audio format type (e.g. "audio/pcm", "audio/pcmu", "audio/pcma").
93
+ */
94
+ type: string;
95
+
96
+ /**
97
+ * Sample rate in Hz. Only applicable for PCM format.
98
+ */
99
+ rate?: number;
100
+ };
101
+
102
+ /**
103
+ * Voice activity detection configuration.
104
+ * Set to `null`, or set `type` to 'disabled', to turn off VAD (push-to-talk mode).
105
+ */
106
+ turnDetection?: {
107
+ /**
108
+ * VAD mode. 'server-vad' for automatic detection,
109
+ * 'semantic-vad' for OpenAI's semantic detection,
110
+ * 'disabled' to turn off VAD.
111
+ */
112
+ type: 'server-vad' | 'semantic-vad' | 'disabled';
113
+
114
+ /**
115
+ * VAD activation threshold (0.0-1.0).
116
+ * Higher values require louder audio to trigger.
117
+ */
118
+ threshold?: number;
119
+
120
+ /**
121
+ * How long the user must be silent (in ms) before
122
+ * the server ends the turn.
123
+ */
124
+ silenceDurationMs?: number;
125
+
126
+ /**
127
+ * Amount of audio (in ms) to include before the
128
+ * detected start of speech.
129
+ */
130
+ prefixPaddingMs?: number;
131
+ } | null;
132
+
133
+ /**
134
+ * Tool definitions available to the model in this session.
135
+ */
136
+ tools?: RealtimeModelV4ToolDefinition[];
137
+
138
+ /**
139
+ * Provider-specific options that are passed through to the provider.
140
+ */
141
+ providerOptions?: Record<string, unknown>;
142
+ };
@@ -0,0 +1,28 @@
1
+ import type { JSONSchema7 } from 'json-schema';
2
+
3
+ /**
4
+ * A tool definition for realtime models. Sent as part of the session
5
+ * configuration so the model knows which functions it can call.
6
+ */
7
+ export type RealtimeModelV4ToolDefinition = {
8
+ /**
9
+ * The type of the tool (always 'function').
10
+ */
11
+ type: 'function';
12
+
13
+ /**
14
+ * The name of the tool. Unique within the session.
15
+ */
16
+ name: string;
17
+
18
+ /**
19
+ * A description of what the tool does. The model uses this to decide
20
+ * whether to call the tool.
21
+ */
22
+ description?: string;
23
+
24
+ /**
25
+ * JSON Schema describing the parameters the tool expects.
26
+ */
27
+ parameters: JSONSchema7;
28
+ };