@ai-sdk/provider 4.0.0-beta.13 → 4.0.0-beta.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +50 -0
- package/dist/index.d.ts +626 -67
- package/dist/index.js.map +1 -1
- package/package.json +4 -4
- package/src/embedding-model/v2/embedding-model-v2.ts +2 -2
- package/src/embedding-model/v3/embedding-model-v3-call-options.ts +1 -1
- package/src/embedding-model/v3/embedding-model-v3-result.ts +2 -2
- package/src/embedding-model/v3/embedding-model-v3.ts +2 -2
- package/src/embedding-model/v4/embedding-model-v4-call-options.ts +1 -1
- package/src/embedding-model/v4/embedding-model-v4-result.ts +2 -2
- package/src/embedding-model/v4/embedding-model-v4.ts +2 -2
- package/src/embedding-model-middleware/v3/embedding-model-v3-middleware.ts +2 -2
- package/src/embedding-model-middleware/v4/embedding-model-v4-middleware.ts +2 -2
- package/src/errors/no-such-provider-reference-error.ts +1 -1
- package/src/files/v4/files-v4-upload-file-call-options.ts +10 -3
- package/src/files/v4/files-v4-upload-file-result.ts +3 -3
- package/src/files/v4/files-v4.ts +2 -2
- package/src/image-model/v2/image-model-v2-call-options.ts +1 -1
- package/src/image-model/v2/image-model-v2-call-warning.ts +1 -1
- package/src/image-model/v2/image-model-v2.ts +3 -3
- package/src/image-model/v3/image-model-v3-call-options.ts +2 -2
- package/src/image-model/v3/image-model-v3-file.ts +1 -1
- package/src/image-model/v3/image-model-v3.ts +4 -4
- package/src/image-model/v4/image-model-v4-call-options.ts +2 -2
- package/src/image-model/v4/image-model-v4-file.ts +1 -1
- package/src/image-model/v4/image-model-v4-result.ts +3 -3
- package/src/image-model/v4/image-model-v4.ts +2 -2
- package/src/image-model-middleware/v3/image-model-v3-middleware.ts +2 -2
- package/src/image-model-middleware/v4/image-model-v4-middleware.ts +2 -2
- package/src/index.ts +1 -0
- package/src/json-value/is-json.ts +1 -1
- package/src/language-model/v2/language-model-v2-call-options.ts +6 -6
- package/src/language-model/v2/language-model-v2-call-warning.ts +3 -3
- package/src/language-model/v2/language-model-v2-content.ts +6 -6
- package/src/language-model/v2/language-model-v2-function-tool.ts +2 -2
- package/src/language-model/v2/language-model-v2-prompt.ts +3 -3
- package/src/language-model/v2/language-model-v2-reasoning.ts +1 -1
- package/src/language-model/v2/language-model-v2-source.ts +1 -1
- package/src/language-model/v2/language-model-v2-stream-part.ts +9 -9
- package/src/language-model/v2/language-model-v2-text.ts +1 -1
- package/src/language-model/v2/language-model-v2-tool-call.ts +1 -1
- package/src/language-model/v2/language-model-v2-tool-result.ts +1 -1
- package/src/language-model/v2/language-model-v2.ts +9 -9
- package/src/language-model/v3/language-model-v3-call-options.ts +6 -6
- package/src/language-model/v3/language-model-v3-content.ts +7 -7
- package/src/language-model/v3/language-model-v3-file.ts +1 -1
- package/src/language-model/v3/language-model-v3-function-tool.ts +3 -3
- package/src/language-model/v3/language-model-v3-generate-result.ts +6 -6
- package/src/language-model/v3/language-model-v3-prompt.ts +3 -3
- package/src/language-model/v3/language-model-v3-reasoning.ts +1 -1
- package/src/language-model/v3/language-model-v3-source.ts +1 -1
- package/src/language-model/v3/language-model-v3-stream-part.ts +10 -10
- package/src/language-model/v3/language-model-v3-stream-result.ts +2 -2
- package/src/language-model/v3/language-model-v3-text.ts +1 -1
- package/src/language-model/v3/language-model-v3-tool-approval-request.ts +1 -1
- package/src/language-model/v3/language-model-v3-tool-call.ts +1 -1
- package/src/language-model/v3/language-model-v3-tool-result.ts +2 -2
- package/src/language-model/v3/language-model-v3-usage.ts +1 -1
- package/src/language-model/v3/language-model-v3.ts +3 -3
- package/src/language-model/v4/index.ts +0 -1
- package/src/language-model/v4/language-model-v4-call-options.ts +6 -6
- package/src/language-model/v4/language-model-v4-content.ts +9 -9
- package/src/language-model/v4/language-model-v4-custom-content.ts +1 -1
- package/src/language-model/v4/language-model-v4-file.ts +10 -3
- package/src/language-model/v4/language-model-v4-function-tool.ts +3 -3
- package/src/language-model/v4/language-model-v4-generate-result.ts +6 -6
- package/src/language-model/v4/language-model-v4-prompt.ts +43 -48
- package/src/language-model/v4/language-model-v4-reasoning-file.ts +10 -3
- package/src/language-model/v4/language-model-v4-reasoning.ts +1 -1
- package/src/language-model/v4/language-model-v4-source.ts +1 -1
- package/src/language-model/v4/language-model-v4-stream-part.ts +12 -12
- package/src/language-model/v4/language-model-v4-stream-result.ts +2 -2
- package/src/language-model/v4/language-model-v4-text.ts +1 -1
- package/src/language-model/v4/language-model-v4-tool-approval-request.ts +1 -1
- package/src/language-model/v4/language-model-v4-tool-call.ts +1 -1
- package/src/language-model/v4/language-model-v4-tool-result.ts +2 -2
- package/src/language-model/v4/language-model-v4-usage.ts +1 -1
- package/src/language-model/v4/language-model-v4.ts +3 -3
- package/src/language-model-middleware/v2/language-model-v2-middleware.ts +2 -2
- package/src/language-model-middleware/v3/language-model-v3-middleware.ts +4 -4
- package/src/language-model-middleware/v4/language-model-v4-middleware.ts +4 -4
- package/src/provider/v2/provider-v2.ts +5 -5
- package/src/provider/v3/provider-v3.ts +6 -6
- package/src/provider/v4/provider-v4.ts +8 -8
- package/src/realtime-model/index.ts +1 -0
- package/src/realtime-model/v4/index.ts +20 -0
- package/src/realtime-model/v4/realtime-factory-v4.ts +20 -0
- package/src/realtime-model/v4/realtime-model-v4-client-event.ts +68 -0
- package/src/realtime-model/v4/realtime-model-v4-client-secret.ts +40 -0
- package/src/realtime-model/v4/realtime-model-v4-conversation-item.ts +55 -0
- package/src/realtime-model/v4/realtime-model-v4-server-event.ts +199 -0
- package/src/realtime-model/v4/realtime-model-v4-session-config.ts +142 -0
- package/src/realtime-model/v4/realtime-model-v4-tool-definition.ts +28 -0
- package/src/realtime-model/v4/realtime-model-v4.ts +89 -0
- package/src/reranking-model/v3/reranking-model-v3-call-options.ts +2 -2
- package/src/reranking-model/v3/reranking-model-v3.ts +2 -2
- package/src/reranking-model/v4/reranking-model-v4-call-options.ts +2 -2
- package/src/reranking-model/v4/reranking-model-v4-result.ts +1 -1
- package/src/reranking-model/v4/reranking-model-v4.ts +2 -2
- package/src/shared/v2/shared-v2-provider-metadata.ts +1 -1
- package/src/shared/v2/shared-v2-provider-options.ts +1 -1
- package/src/shared/v3/shared-v3-provider-metadata.ts +1 -1
- package/src/shared/v3/shared-v3-provider-options.ts +1 -1
- package/src/shared/v4/index.ts +1 -0
- package/src/shared/v4/shared-v4-file-data.ts +49 -0
- package/src/shared/v4/shared-v4-provider-metadata.ts +1 -1
- package/src/shared/v4/shared-v4-provider-options.ts +1 -1
- package/src/shared/v4/shared-v4-provider-reference.ts +8 -1
- package/src/skills/v4/skills-v4-upload-skill-call-options.ts +10 -3
- package/src/skills/v4/skills-v4-upload-skill-result.ts +3 -3
- package/src/skills/v4/skills-v4.ts +2 -2
- package/src/speech-model/v2/speech-model-v2-call-options.ts +1 -1
- package/src/speech-model/v2/speech-model-v2-call-warning.ts +1 -1
- package/src/speech-model/v2/speech-model-v2.ts +4 -4
- package/src/speech-model/v3/speech-model-v3-call-options.ts +1 -1
- package/src/speech-model/v3/speech-model-v3.ts +4 -4
- package/src/speech-model/v4/speech-model-v4-call-options.ts +1 -1
- package/src/speech-model/v4/speech-model-v4-result.ts +3 -3
- package/src/speech-model/v4/speech-model-v4.ts +2 -2
- package/src/transcription-model/v2/transcription-model-v2-call-options.ts +1 -1
- package/src/transcription-model/v2/transcription-model-v2-call-warning.ts +1 -1
- package/src/transcription-model/v2/transcription-model-v2.ts +4 -4
- package/src/transcription-model/v3/transcription-model-v3-call-options.ts +1 -1
- package/src/transcription-model/v3/transcription-model-v3.ts +4 -4
- package/src/transcription-model/v4/transcription-model-v4-call-options.ts +1 -1
- package/src/transcription-model/v4/transcription-model-v4-result.ts +3 -3
- package/src/transcription-model/v4/transcription-model-v4.ts +2 -2
- package/src/video-model/v3/video-model-v3-call-options.ts +2 -2
- package/src/video-model/v3/video-model-v3-file.ts +1 -1
- package/src/video-model/v4/video-model-v4-call-options.ts +2 -2
- package/src/video-model/v4/video-model-v4-file.ts +1 -1
- package/src/language-model/v4/language-model-v4-data-content.ts +0 -4
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
import { EmbeddingModelV4 } from '../../embedding-model/v4/embedding-model-v4';
|
|
2
|
-
import { FilesV4 } from '../../files/v4/files-v4';
|
|
3
|
-
import { ImageModelV4 } from '../../image-model/v4/image-model-v4';
|
|
4
|
-
import { LanguageModelV4 } from '../../language-model/v4/language-model-v4';
|
|
5
|
-
import { RerankingModelV4 } from '../../reranking-model/v4/reranking-model-v4';
|
|
6
|
-
import { SpeechModelV4 } from '../../speech-model/v4/speech-model-v4';
|
|
7
|
-
import { TranscriptionModelV4 } from '../../transcription-model/v4/transcription-model-v4';
|
|
8
|
-
import { SkillsV4 } from '../../skills/v4/skills-v4';
|
|
1
|
+
import type { EmbeddingModelV4 } from '../../embedding-model/v4/embedding-model-v4';
|
|
2
|
+
import type { FilesV4 } from '../../files/v4/files-v4';
|
|
3
|
+
import type { ImageModelV4 } from '../../image-model/v4/image-model-v4';
|
|
4
|
+
import type { LanguageModelV4 } from '../../language-model/v4/language-model-v4';
|
|
5
|
+
import type { RerankingModelV4 } from '../../reranking-model/v4/reranking-model-v4';
|
|
6
|
+
import type { SpeechModelV4 } from '../../speech-model/v4/speech-model-v4';
|
|
7
|
+
import type { TranscriptionModelV4 } from '../../transcription-model/v4/transcription-model-v4';
|
|
8
|
+
import type { SkillsV4 } from '../../skills/v4/skills-v4';
|
|
9
9
|
|
|
10
10
|
/**
|
|
11
11
|
* Provider for language, text embedding, and image generation models.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export * from './v4/index';
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
export type {
|
|
2
|
+
RealtimeFactoryV4 as Experimental_RealtimeFactoryV4,
|
|
3
|
+
RealtimeFactoryV4GetTokenOptions as Experimental_RealtimeFactoryV4GetTokenOptions,
|
|
4
|
+
RealtimeFactoryV4GetTokenResult as Experimental_RealtimeFactoryV4GetTokenResult,
|
|
5
|
+
} from './realtime-factory-v4';
|
|
6
|
+
export type { RealtimeModelV4 as Experimental_RealtimeModelV4 } from './realtime-model-v4';
|
|
7
|
+
export type { RealtimeModelV4ClientEvent as Experimental_RealtimeModelV4ClientEvent } from './realtime-model-v4-client-event';
|
|
8
|
+
export type {
|
|
9
|
+
RealtimeModelV4ClientSecretOptions as Experimental_RealtimeModelV4ClientSecretOptions,
|
|
10
|
+
RealtimeModelV4ClientSecretResult as Experimental_RealtimeModelV4ClientSecretResult,
|
|
11
|
+
} from './realtime-model-v4-client-secret';
|
|
12
|
+
export type {
|
|
13
|
+
RealtimeModelV4ConversationItem as Experimental_RealtimeModelV4ConversationItem,
|
|
14
|
+
RealtimeModelV4TextMessage as Experimental_RealtimeModelV4TextMessage,
|
|
15
|
+
RealtimeModelV4AudioMessage as Experimental_RealtimeModelV4AudioMessage,
|
|
16
|
+
RealtimeModelV4FunctionCallOutput as Experimental_RealtimeModelV4FunctionCallOutput,
|
|
17
|
+
} from './realtime-model-v4-conversation-item';
|
|
18
|
+
export type { RealtimeModelV4ServerEvent as Experimental_RealtimeModelV4ServerEvent } from './realtime-model-v4-server-event';
|
|
19
|
+
export type { RealtimeModelV4SessionConfig as Experimental_RealtimeModelV4SessionConfig } from './realtime-model-v4-session-config';
|
|
20
|
+
export type { RealtimeModelV4ToolDefinition as Experimental_RealtimeModelV4ToolDefinition } from './realtime-model-v4-tool-definition';
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import type { RealtimeModelV4 } from './realtime-model-v4';
|
|
2
|
+
import type { RealtimeModelV4ClientSecretOptions } from './realtime-model-v4-client-secret';
|
|
3
|
+
|
|
4
|
+
export type RealtimeFactoryV4GetTokenOptions = {
|
|
5
|
+
model: string;
|
|
6
|
+
} & RealtimeModelV4ClientSecretOptions;
|
|
7
|
+
|
|
8
|
+
export type RealtimeFactoryV4GetTokenResult = {
|
|
9
|
+
token: string;
|
|
10
|
+
url: string;
|
|
11
|
+
expiresAt?: number;
|
|
12
|
+
};
|
|
13
|
+
|
|
14
|
+
export interface RealtimeFactoryV4 {
|
|
15
|
+
(modelId: string): RealtimeModelV4;
|
|
16
|
+
|
|
17
|
+
getToken(
|
|
18
|
+
options: RealtimeFactoryV4GetTokenOptions,
|
|
19
|
+
): Promise<RealtimeFactoryV4GetTokenResult>;
|
|
20
|
+
}
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import type { RealtimeModelV4ConversationItem } from './realtime-model-v4-conversation-item';
|
|
2
|
+
import type { RealtimeModelV4SessionConfig } from './realtime-model-v4-session-config';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Normalized events sent from the browser to the realtime model.
|
|
6
|
+
* Each provider maps this to its native event format before sending
|
|
7
|
+
* over the WebSocket.
|
|
8
|
+
*/
|
|
9
|
+
export type RealtimeModelV4ClientEvent =
|
|
10
|
+
// ── Session ────────────────────────────────────────────────────────
|
|
11
|
+
|
|
12
|
+
| {
|
|
13
|
+
type: 'session-update';
|
|
14
|
+
config: RealtimeModelV4SessionConfig;
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
// ── Input audio buffer ─────────────────────────────────────────────
|
|
18
|
+
| {
|
|
19
|
+
type: 'input-audio-append';
|
|
20
|
+
|
|
21
|
+
/**
|
|
22
|
+
* Base64-encoded audio chunk to append to the input buffer.
|
|
23
|
+
*/
|
|
24
|
+
audio: string;
|
|
25
|
+
}
|
|
26
|
+
| {
|
|
27
|
+
type: 'input-audio-commit';
|
|
28
|
+
}
|
|
29
|
+
| {
|
|
30
|
+
type: 'input-audio-clear';
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
// ── Conversation items ─────────────────────────────────────────────
|
|
34
|
+
| {
|
|
35
|
+
type: 'conversation-item-create';
|
|
36
|
+
item: RealtimeModelV4ConversationItem;
|
|
37
|
+
}
|
|
38
|
+
| {
|
|
39
|
+
type: 'conversation-item-truncate';
|
|
40
|
+
|
|
41
|
+
/**
|
|
42
|
+
* The ID of the assistant message item to truncate.
|
|
43
|
+
*/
|
|
44
|
+
itemId: string;
|
|
45
|
+
|
|
46
|
+
/**
|
|
47
|
+
* The index of the content part to truncate.
|
|
48
|
+
*/
|
|
49
|
+
contentIndex: number;
|
|
50
|
+
|
|
51
|
+
/**
|
|
52
|
+
* Truncate audio after this many milliseconds.
|
|
53
|
+
*/
|
|
54
|
+
audioEndMs: number;
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
// ── Response control ───────────────────────────────────────────────
|
|
58
|
+
| {
|
|
59
|
+
type: 'response-create';
|
|
60
|
+
options?: {
|
|
61
|
+
modalities?: string[];
|
|
62
|
+
instructions?: string;
|
|
63
|
+
metadata?: Record<string, unknown>;
|
|
64
|
+
};
|
|
65
|
+
}
|
|
66
|
+
| {
|
|
67
|
+
type: 'response-cancel';
|
|
68
|
+
};
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import type { RealtimeModelV4SessionConfig } from './realtime-model-v4-session-config';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Options for creating an ephemeral client secret for browser-side
|
|
5
|
+
* WebSocket connections to a realtime model.
|
|
6
|
+
*/
|
|
7
|
+
export type RealtimeModelV4ClientSecretOptions = {
|
|
8
|
+
/**
|
|
9
|
+
* Number of seconds until the client secret expires.
|
|
10
|
+
*/
|
|
11
|
+
expiresAfterSeconds?: number;
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* Optional session configuration to embed in the token request.
|
|
15
|
+
* Some providers (e.g. Google) require the full session config at token creation time.
|
|
16
|
+
*/
|
|
17
|
+
sessionConfig?: RealtimeModelV4SessionConfig;
|
|
18
|
+
};
|
|
19
|
+
|
|
20
|
+
/**
|
|
21
|
+
* Result of creating an ephemeral client secret.
|
|
22
|
+
*/
|
|
23
|
+
export type RealtimeModelV4ClientSecretResult = {
|
|
24
|
+
/**
|
|
25
|
+
* The ephemeral token value. Used as a Bearer token or in the
|
|
26
|
+
* WebSocket subprotocol header for authentication.
|
|
27
|
+
*/
|
|
28
|
+
token: string;
|
|
29
|
+
|
|
30
|
+
/**
|
|
31
|
+
* The WebSocket URL to connect to. Includes any provider-specific
|
|
32
|
+
* query parameters (e.g. model ID).
|
|
33
|
+
*/
|
|
34
|
+
url: string;
|
|
35
|
+
|
|
36
|
+
/**
|
|
37
|
+
* Unix timestamp (seconds) when this client secret expires.
|
|
38
|
+
*/
|
|
39
|
+
expiresAt?: number;
|
|
40
|
+
};
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* A conversation item that can be created by the client and sent to
|
|
3
|
+
* the model via the conversation.item.create event.
|
|
4
|
+
*/
|
|
5
|
+
export type RealtimeModelV4ConversationItem =
|
|
6
|
+
| RealtimeModelV4TextMessage
|
|
7
|
+
| RealtimeModelV4AudioMessage
|
|
8
|
+
| RealtimeModelV4FunctionCallOutput;
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* A text message from the user.
|
|
12
|
+
*/
|
|
13
|
+
export type RealtimeModelV4TextMessage = {
|
|
14
|
+
type: 'text-message';
|
|
15
|
+
role: 'user';
|
|
16
|
+
text: string;
|
|
17
|
+
};
|
|
18
|
+
|
|
19
|
+
/**
|
|
20
|
+
* An audio message from the user (complete audio, not streamed).
|
|
21
|
+
*/
|
|
22
|
+
export type RealtimeModelV4AudioMessage = {
|
|
23
|
+
type: 'audio-message';
|
|
24
|
+
role: 'user';
|
|
25
|
+
|
|
26
|
+
/**
|
|
27
|
+
* Base64-encoded audio data.
|
|
28
|
+
*/
|
|
29
|
+
audio: string;
|
|
30
|
+
};
|
|
31
|
+
|
|
32
|
+
/**
|
|
33
|
+
* The output of a function call, sent back to the model so it can
|
|
34
|
+
* continue generating a response using the tool result.
|
|
35
|
+
*/
|
|
36
|
+
export type RealtimeModelV4FunctionCallOutput = {
|
|
37
|
+
type: 'function-call-output';
|
|
38
|
+
|
|
39
|
+
/**
|
|
40
|
+
* The call ID from the function-call-arguments-done event.
|
|
41
|
+
* Must match so the model knows which function call this result is for.
|
|
42
|
+
*/
|
|
43
|
+
callId: string;
|
|
44
|
+
|
|
45
|
+
/**
|
|
46
|
+
* The name of the function that was called.
|
|
47
|
+
* Required by some providers (e.g. Google) in the tool response routing.
|
|
48
|
+
*/
|
|
49
|
+
name?: string;
|
|
50
|
+
|
|
51
|
+
/**
|
|
52
|
+
* JSON string containing the function call result.
|
|
53
|
+
*/
|
|
54
|
+
output: string;
|
|
55
|
+
};
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Normalized events emitted by the realtime model (model → browser).
|
|
3
|
+
* Each provider maps its native event format to this discriminated union.
|
|
4
|
+
*
|
|
5
|
+
* Every event includes a `raw` field with the original provider-specific
|
|
6
|
+
* event data for debugging and provider-specific access.
|
|
7
|
+
*/
|
|
8
|
+
export type RealtimeModelV4ServerEvent =
|
|
9
|
+
// ── Session lifecycle ──────────────────────────────────────────────
|
|
10
|
+
|
|
11
|
+
| {
|
|
12
|
+
type: 'session-created';
|
|
13
|
+
sessionId?: string;
|
|
14
|
+
raw: unknown;
|
|
15
|
+
}
|
|
16
|
+
| {
|
|
17
|
+
type: 'session-updated';
|
|
18
|
+
raw: unknown;
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
// ── Input audio buffer ─────────────────────────────────────────────
|
|
22
|
+
| {
|
|
23
|
+
type: 'speech-started';
|
|
24
|
+
itemId?: string;
|
|
25
|
+
raw: unknown;
|
|
26
|
+
}
|
|
27
|
+
| {
|
|
28
|
+
type: 'speech-stopped';
|
|
29
|
+
itemId?: string;
|
|
30
|
+
raw: unknown;
|
|
31
|
+
}
|
|
32
|
+
| {
|
|
33
|
+
type: 'audio-committed';
|
|
34
|
+
itemId?: string;
|
|
35
|
+
previousItemId?: string;
|
|
36
|
+
raw: unknown;
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
// ── Conversation items ─────────────────────────────────────────────
|
|
40
|
+
| {
|
|
41
|
+
type: 'conversation-item-added';
|
|
42
|
+
itemId: string;
|
|
43
|
+
item: unknown;
|
|
44
|
+
raw: unknown;
|
|
45
|
+
}
|
|
46
|
+
| {
|
|
47
|
+
type: 'input-transcription-completed';
|
|
48
|
+
itemId: string;
|
|
49
|
+
transcript: string;
|
|
50
|
+
raw: unknown;
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
// ── Response lifecycle ─────────────────────────────────────────────
|
|
54
|
+
| {
|
|
55
|
+
type: 'response-created';
|
|
56
|
+
responseId: string;
|
|
57
|
+
raw: unknown;
|
|
58
|
+
}
|
|
59
|
+
| {
|
|
60
|
+
type: 'response-done';
|
|
61
|
+
responseId: string;
|
|
62
|
+
status: string;
|
|
63
|
+
raw: unknown;
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
// ── Output item lifecycle ──────────────────────────────────────────
|
|
67
|
+
| {
|
|
68
|
+
type: 'output-item-added';
|
|
69
|
+
responseId: string;
|
|
70
|
+
itemId: string;
|
|
71
|
+
raw: unknown;
|
|
72
|
+
}
|
|
73
|
+
| {
|
|
74
|
+
type: 'output-item-done';
|
|
75
|
+
responseId: string;
|
|
76
|
+
itemId: string;
|
|
77
|
+
raw: unknown;
|
|
78
|
+
}
|
|
79
|
+
| {
|
|
80
|
+
type: 'content-part-added';
|
|
81
|
+
responseId: string;
|
|
82
|
+
itemId: string;
|
|
83
|
+
raw: unknown;
|
|
84
|
+
}
|
|
85
|
+
| {
|
|
86
|
+
type: 'content-part-done';
|
|
87
|
+
responseId: string;
|
|
88
|
+
itemId: string;
|
|
89
|
+
raw: unknown;
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
// ── Audio output ───────────────────────────────────────────────────
|
|
93
|
+
| {
|
|
94
|
+
type: 'audio-delta';
|
|
95
|
+
responseId: string;
|
|
96
|
+
itemId: string;
|
|
97
|
+
|
|
98
|
+
/**
|
|
99
|
+
* Base64-encoded audio chunk.
|
|
100
|
+
*/
|
|
101
|
+
delta: string;
|
|
102
|
+
raw: unknown;
|
|
103
|
+
}
|
|
104
|
+
| {
|
|
105
|
+
type: 'audio-done';
|
|
106
|
+
responseId: string;
|
|
107
|
+
itemId: string;
|
|
108
|
+
raw: unknown;
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
// ── Audio transcript output ────────────────────────────────────────
|
|
112
|
+
| {
|
|
113
|
+
type: 'audio-transcript-delta';
|
|
114
|
+
responseId: string;
|
|
115
|
+
itemId: string;
|
|
116
|
+
|
|
117
|
+
/**
|
|
118
|
+
* Text chunk of the audio transcript.
|
|
119
|
+
*/
|
|
120
|
+
delta: string;
|
|
121
|
+
raw: unknown;
|
|
122
|
+
}
|
|
123
|
+
| {
|
|
124
|
+
type: 'audio-transcript-done';
|
|
125
|
+
responseId: string;
|
|
126
|
+
itemId: string;
|
|
127
|
+
transcript?: string;
|
|
128
|
+
raw: unknown;
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
// ── Text output ────────────────────────────────────────────────────
|
|
132
|
+
| {
|
|
133
|
+
type: 'text-delta';
|
|
134
|
+
responseId: string;
|
|
135
|
+
itemId: string;
|
|
136
|
+
|
|
137
|
+
/**
|
|
138
|
+
* Text chunk of the model's text response.
|
|
139
|
+
*/
|
|
140
|
+
delta: string;
|
|
141
|
+
raw: unknown;
|
|
142
|
+
}
|
|
143
|
+
| {
|
|
144
|
+
type: 'text-done';
|
|
145
|
+
responseId: string;
|
|
146
|
+
itemId: string;
|
|
147
|
+
text?: string;
|
|
148
|
+
raw: unknown;
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
// ── Function calling ───────────────────────────────────────────────
|
|
152
|
+
| {
|
|
153
|
+
type: 'function-call-arguments-delta';
|
|
154
|
+
responseId: string;
|
|
155
|
+
itemId: string;
|
|
156
|
+
callId: string;
|
|
157
|
+
|
|
158
|
+
/**
|
|
159
|
+
* Partial JSON string of function call arguments.
|
|
160
|
+
*/
|
|
161
|
+
delta: string;
|
|
162
|
+
raw: unknown;
|
|
163
|
+
}
|
|
164
|
+
| {
|
|
165
|
+
type: 'function-call-arguments-done';
|
|
166
|
+
responseId: string;
|
|
167
|
+
itemId: string;
|
|
168
|
+
callId: string;
|
|
169
|
+
|
|
170
|
+
/**
|
|
171
|
+
* The name of the function to call.
|
|
172
|
+
*/
|
|
173
|
+
name: string;
|
|
174
|
+
|
|
175
|
+
/**
|
|
176
|
+
* Complete JSON string of function call arguments.
|
|
177
|
+
*/
|
|
178
|
+
arguments: string;
|
|
179
|
+
raw: unknown;
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
// ── Error ──────────────────────────────────────────────────────────
|
|
183
|
+
| {
|
|
184
|
+
type: 'error';
|
|
185
|
+
message: string;
|
|
186
|
+
code?: string;
|
|
187
|
+
raw: unknown;
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
// ── Custom / provider-specific ────────────────────────────────────
|
|
191
|
+
| {
|
|
192
|
+
type: 'custom';
|
|
193
|
+
|
|
194
|
+
/**
|
|
195
|
+
* The original event type string from the provider.
|
|
196
|
+
*/
|
|
197
|
+
rawType: string;
|
|
198
|
+
raw: unknown;
|
|
199
|
+
};
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
import type { RealtimeModelV4ToolDefinition } from './realtime-model-v4-tool-definition';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Provider-neutral configuration for a realtime session.
|
|
5
|
+
* Each provider maps this to their specific session.update payload.
|
|
6
|
+
*/
|
|
7
|
+
export type RealtimeModelV4SessionConfig = {
|
|
8
|
+
/**
|
|
9
|
+
* System instructions for the model.
|
|
10
|
+
*/
|
|
11
|
+
instructions?: string;
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* Voice to use for audio output.
|
|
15
|
+
*/
|
|
16
|
+
voice?: string;
|
|
17
|
+
|
|
18
|
+
/**
|
|
19
|
+
* Which output modalities the model should produce.
|
|
20
|
+
*/
|
|
21
|
+
outputModalities?: Array<'text' | 'audio'>;
|
|
22
|
+
|
|
23
|
+
/**
|
|
24
|
+
* Audio format configuration for input audio.
|
|
25
|
+
*/
|
|
26
|
+
inputAudioFormat?: {
|
|
27
|
+
/**
|
|
28
|
+
* Audio format type (e.g. "audio/pcm", "audio/pcmu", "audio/pcma").
|
|
29
|
+
*/
|
|
30
|
+
type: string;
|
|
31
|
+
|
|
32
|
+
/**
|
|
33
|
+
* Sample rate in Hz. Only applicable for PCM format.
|
|
34
|
+
*/
|
|
35
|
+
rate?: number;
|
|
36
|
+
};
|
|
37
|
+
|
|
38
|
+
/**
|
|
39
|
+
* Input audio transcription configuration.
|
|
40
|
+
*
|
|
41
|
+
* When enabled, providers that support input transcription emit normalized
|
|
42
|
+
* `input-transcription-completed` events that can be rendered as user
|
|
43
|
+
* messages.
|
|
44
|
+
*/
|
|
45
|
+
inputAudioTranscription?: {
|
|
46
|
+
/**
|
|
47
|
+
* Provider-specific transcription model.
|
|
48
|
+
*/
|
|
49
|
+
model?: string;
|
|
50
|
+
|
|
51
|
+
/**
|
|
52
|
+
* Optional language hint for the input audio.
|
|
53
|
+
*/
|
|
54
|
+
language?: string;
|
|
55
|
+
|
|
56
|
+
/**
|
|
57
|
+
* Optional prompt to guide transcription.
|
|
58
|
+
*/
|
|
59
|
+
prompt?: string;
|
|
60
|
+
};
|
|
61
|
+
|
|
62
|
+
/**
|
|
63
|
+
* Output audio transcription configuration.
|
|
64
|
+
*
|
|
65
|
+
* When enabled, providers that support output transcription emit normalized
|
|
66
|
+
* `audio-transcript-delta` / `audio-transcript-done` events for the model's
|
|
67
|
+
* spoken response. Some providers transcribe output by default; setting this
|
|
68
|
+
* makes the behavior explicit rather than relying on that default.
|
|
69
|
+
*/
|
|
70
|
+
outputAudioTranscription?: {
|
|
71
|
+
/**
|
|
72
|
+
* Provider-specific transcription model.
|
|
73
|
+
*/
|
|
74
|
+
model?: string;
|
|
75
|
+
|
|
76
|
+
/**
|
|
77
|
+
* Optional language hint for the output audio.
|
|
78
|
+
*/
|
|
79
|
+
language?: string;
|
|
80
|
+
|
|
81
|
+
/**
|
|
82
|
+
* Optional prompt to guide transcription.
|
|
83
|
+
*/
|
|
84
|
+
prompt?: string;
|
|
85
|
+
};
|
|
86
|
+
|
|
87
|
+
/**
|
|
88
|
+
* Audio format configuration for output audio.
|
|
89
|
+
*/
|
|
90
|
+
outputAudioFormat?: {
|
|
91
|
+
/**
|
|
92
|
+
* Audio format type (e.g. "audio/pcm", "audio/pcmu", "audio/pcma").
|
|
93
|
+
*/
|
|
94
|
+
type: string;
|
|
95
|
+
|
|
96
|
+
/**
|
|
97
|
+
* Sample rate in Hz. Only applicable for PCM format.
|
|
98
|
+
*/
|
|
99
|
+
rate?: number;
|
|
100
|
+
};
|
|
101
|
+
|
|
102
|
+
/**
|
|
103
|
+
* Voice activity detection configuration.
|
|
104
|
+
* Set to null or type 'disabled' to turn off VAD (push-to-talk mode).
|
|
105
|
+
*/
|
|
106
|
+
turnDetection?: {
|
|
107
|
+
/**
|
|
108
|
+
* VAD mode. 'server-vad' for automatic detection,
|
|
109
|
+
* 'semantic-vad' for OpenAI's semantic detection,
|
|
110
|
+
* 'disabled' to turn off VAD.
|
|
111
|
+
*/
|
|
112
|
+
type: 'server-vad' | 'semantic-vad' | 'disabled';
|
|
113
|
+
|
|
114
|
+
/**
|
|
115
|
+
* VAD activation threshold (0.0-1.0).
|
|
116
|
+
* Higher values require louder audio to trigger.
|
|
117
|
+
*/
|
|
118
|
+
threshold?: number;
|
|
119
|
+
|
|
120
|
+
/**
|
|
121
|
+
* How long the user must be silent (in ms) before
|
|
122
|
+
* the server ends the turn.
|
|
123
|
+
*/
|
|
124
|
+
silenceDurationMs?: number;
|
|
125
|
+
|
|
126
|
+
/**
|
|
127
|
+
* Amount of audio (in ms) to include before the
|
|
128
|
+
* detected start of speech.
|
|
129
|
+
*/
|
|
130
|
+
prefixPaddingMs?: number;
|
|
131
|
+
} | null;
|
|
132
|
+
|
|
133
|
+
/**
|
|
134
|
+
* Tool definitions available to the model in this session.
|
|
135
|
+
*/
|
|
136
|
+
tools?: RealtimeModelV4ToolDefinition[];
|
|
137
|
+
|
|
138
|
+
/**
|
|
139
|
+
* Provider-specific options that are passed through to the provider.
|
|
140
|
+
*/
|
|
141
|
+
providerOptions?: Record<string, unknown>;
|
|
142
|
+
};
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import type { JSONSchema7 } from 'json-schema';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* A tool definition for realtime models. Sent as part of the session
|
|
5
|
+
* configuration so the model knows which functions it can call.
|
|
6
|
+
*/
|
|
7
|
+
export type RealtimeModelV4ToolDefinition = {
|
|
8
|
+
/**
|
|
9
|
+
* The type of the tool (always 'function').
|
|
10
|
+
*/
|
|
11
|
+
type: 'function';
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* The name of the tool. Unique within the session.
|
|
15
|
+
*/
|
|
16
|
+
name: string;
|
|
17
|
+
|
|
18
|
+
/**
|
|
19
|
+
* A description of what the tool does. The model uses this to decide
|
|
20
|
+
* whether to call the tool.
|
|
21
|
+
*/
|
|
22
|
+
description?: string;
|
|
23
|
+
|
|
24
|
+
/**
|
|
25
|
+
* JSON Schema describing the parameters the tool expects.
|
|
26
|
+
*/
|
|
27
|
+
parameters: JSONSchema7;
|
|
28
|
+
};
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import type {
|
|
2
|
+
RealtimeModelV4ClientSecretOptions,
|
|
3
|
+
RealtimeModelV4ClientSecretResult,
|
|
4
|
+
} from './realtime-model-v4-client-secret';
|
|
5
|
+
import type { RealtimeModelV4ClientEvent } from './realtime-model-v4-client-event';
|
|
6
|
+
import type { RealtimeModelV4ServerEvent } from './realtime-model-v4-server-event';
|
|
7
|
+
import type { RealtimeModelV4SessionConfig } from './realtime-model-v4-session-config';
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* Specification for a realtime model that supports bidirectional
|
|
11
|
+
* audio/text communication over WebSocket.
|
|
12
|
+
*
|
|
13
|
+
* Providers implement this interface to enable realtime voice
|
|
14
|
+
* conversations through the AI SDK.
|
|
15
|
+
*/
|
|
16
|
+
export type RealtimeModelV4 = {
|
|
17
|
+
/**
|
|
18
|
+
* The realtime model must specify which interface version it implements.
|
|
19
|
+
*/
|
|
20
|
+
readonly specificationVersion: 'v4';
|
|
21
|
+
|
|
22
|
+
/**
|
|
23
|
+
* Provider ID (e.g. 'openai', 'xai').
|
|
24
|
+
*/
|
|
25
|
+
readonly provider: string;
|
|
26
|
+
|
|
27
|
+
/**
|
|
28
|
+
* Provider-specific model ID (e.g. 'gpt-4o-realtime', 'grok-3').
|
|
29
|
+
*/
|
|
30
|
+
readonly modelId: string;
|
|
31
|
+
|
|
32
|
+
/**
|
|
33
|
+
* Server-side: Creates an ephemeral client secret for authenticating
|
|
34
|
+
* browser-side WebSocket connections. The secret is short-lived and
|
|
35
|
+
* safe to expose to client code.
|
|
36
|
+
*
|
|
37
|
+
* Naming: "do" prefix to prevent accidental direct usage by the user.
|
|
38
|
+
*/
|
|
39
|
+
doCreateClientSecret(
|
|
40
|
+
options: RealtimeModelV4ClientSecretOptions,
|
|
41
|
+
): PromiseLike<RealtimeModelV4ClientSecretResult>;
|
|
42
|
+
|
|
43
|
+
/**
|
|
44
|
+
* Browser-side: Returns the WebSocket URL and subprotocols to use
|
|
45
|
+
* when connecting. Each provider has its own authentication mechanism
|
|
46
|
+
* (e.g. OpenAI uses subprotocol headers, xAI may use query params).
|
|
47
|
+
*/
|
|
48
|
+
getWebSocketConfig(options: { token: string; url: string }): {
|
|
49
|
+
url: string;
|
|
50
|
+
protocols?: string[];
|
|
51
|
+
};
|
|
52
|
+
|
|
53
|
+
/**
|
|
54
|
+
* Browser-side: Parses a raw JSON event received over the WebSocket
|
|
55
|
+
* and returns one or more normalized events. Providers map their native
|
|
56
|
+
* event format to the common RealtimeModelV4ServerEvent union.
|
|
57
|
+
*
|
|
58
|
+
* Returns an array when a single provider message maps to multiple
|
|
59
|
+
* normalized events (e.g. Google's serverContent can contain audio,
|
|
60
|
+
* text, and turn-complete data in one message).
|
|
61
|
+
*/
|
|
62
|
+
parseServerEvent(
|
|
63
|
+
raw: unknown,
|
|
64
|
+
): RealtimeModelV4ServerEvent | RealtimeModelV4ServerEvent[];
|
|
65
|
+
|
|
66
|
+
/**
|
|
67
|
+
* Browser-side: Serializes a normalized client event into the
|
|
68
|
+
* provider's native JSON format for sending over the WebSocket.
|
|
69
|
+
*/
|
|
70
|
+
serializeClientEvent(
|
|
71
|
+
event: RealtimeModelV4ClientEvent,
|
|
72
|
+
): unknown | PromiseLike<unknown>;
|
|
73
|
+
|
|
74
|
+
/**
|
|
75
|
+
* Browser-side: Builds the provider-specific session configuration
|
|
76
|
+
* payload from a normalized session config. Used to construct the
|
|
77
|
+
* session.update event sent after WebSocket connection.
|
|
78
|
+
*/
|
|
79
|
+
buildSessionConfig(config: RealtimeModelV4SessionConfig): unknown;
|
|
80
|
+
|
|
81
|
+
/**
|
|
82
|
+
* Browser-side: Returns a message to auto-send back over the WebSocket
|
|
83
|
+
* in response to a raw incoming message, or null if no response is needed.
|
|
84
|
+
*
|
|
85
|
+
* Used for provider-specific keepalive protocols (e.g. ping/pong).
|
|
86
|
+
* Called by the session layer before parseServerEvent.
|
|
87
|
+
*/
|
|
88
|
+
getHealthCheckResponse?(raw: unknown): unknown | null;
|
|
89
|
+
};
|