@ai-sdk/google 4.0.0-canary.77 → 4.0.0-canary.79

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,31 @@
1
1
  # @ai-sdk/google
2
2
 
3
+ ## 4.0.0-canary.79
4
+
5
+ ### Patch Changes
6
+
7
+ - ce769dd: feat(provider): add experimental Realtime API support for voice conversations
8
+
9
+ Adds first-class support for realtime (speech-to-speech) APIs:
10
+
11
+ - `Experimental_RealtimeModelV4` spec in `@ai-sdk/provider` with normalized event types and factory
12
+ - OpenAI, Google, and xAI realtime provider implementations
13
+ - `openai.experimental_realtime()` / `google.experimental_realtime()` / `xai.experimental_realtime()` work in both server and browser
14
+ - `.getToken()` static method on each provider for server-side ephemeral token creation
15
+ - `experimental_getRealtimeToolDefinitions` helper for provider session tool definitions
16
+ - `experimental_useRealtime` hook in `@ai-sdk/react` returning `UIMessage[]` (aligned with `useChat`), with `onToolCall` and `addToolOutput` for client-driven tool execution
17
+ - `inputAudioTranscription` session config for showing transcribed user audio messages when supported by the provider
18
+
19
+ - Updated dependencies [ce769dd]
20
+ - @ai-sdk/provider@4.0.0-canary.18
21
+ - @ai-sdk/provider-utils@5.0.0-canary.46
22
+
23
+ ## 4.0.0-canary.78
24
+
25
+ ### Patch Changes
26
+
27
+ - 2ce3c65: feat(provider/google-vertex): add Gemini text-to-speech (speech) model support
28
+
3
29
  ## 4.0.0-canary.77
4
30
 
5
31
  ### Patch Changes
package/dist/index.d.ts CHANGED
@@ -1,6 +1,6 @@
1
1
  import * as _ai_sdk_provider_utils from '@ai-sdk/provider-utils';
2
2
  import { InferSchema, FetchFunction } from '@ai-sdk/provider-utils';
3
- import { ProviderV4, LanguageModelV4, ImageModelV4, EmbeddingModelV4, Experimental_VideoModelV4, SpeechModelV4, FilesV4 } from '@ai-sdk/provider';
3
+ import { ProviderV4, LanguageModelV4, ImageModelV4, EmbeddingModelV4, Experimental_VideoModelV4, SpeechModelV4, FilesV4, Experimental_RealtimeFactoryV4, Experimental_RealtimeModelV4, Experimental_RealtimeModelV4ClientSecretOptions, Experimental_RealtimeModelV4ClientSecretResult, Experimental_RealtimeModelV4ServerEvent, Experimental_RealtimeModelV4ClientEvent, Experimental_RealtimeModelV4SessionConfig } from '@ai-sdk/provider';
4
4
 
5
5
  declare const googleErrorDataSchema: _ai_sdk_provider_utils.LazySchema<{
6
6
  error: {
@@ -554,6 +554,7 @@ interface GoogleProvider extends ProviderV4 {
554
554
  } | {
555
555
  managedAgent: string;
556
556
  }): LanguageModelV4;
557
+ experimental_realtime: Experimental_RealtimeFactoryV4;
557
558
  tools: typeof googleTools;
558
559
  }
559
560
  interface GoogleProviderSettings {
@@ -595,6 +596,32 @@ declare function createGoogle(options?: GoogleProviderSettings): GoogleProvider;
595
596
  */
596
597
  declare const google: GoogleProvider;
597
598
 
599
/**
 * Construction options for `GoogleRealtimeModel`.
 *
 * - `provider`: identifier exposed as the model's `provider` property.
 * - `baseURL`: API base URL from which the auth-token and WebSocket endpoints
 *   are derived.
 * - `headers`: called when a client secret is created; the `x-goog-api-key`
 *   entry is read from the returned record.
 * - `fetch`: optional fetch implementation; the global `fetch` is used when
 *   it is omitted.
 */
+ type GoogleRealtimeModelConfig = {
600
+ provider: string;
601
+ baseURL: string;
602
+ headers: () => Record<string, string | undefined>;
603
+ fetch?: FetchFunction;
604
+ };
605
/**
 * Google implementation of the experimental realtime model specification
 * (`specificationVersion` "v4").
 *
 * - `doCreateClientSecret`: requests a token from the Google auth-tokens
 *   endpoint and resolves with the token, the WebSocket URL and an optional
 *   expiry.
 * - `getWebSocketConfig`: builds the connection URL that carries the token as
 *   an `access_token` query parameter.
 * - `parseServerEvent` / `serializeClientEvent`: translate between Google's
 *   wire messages and the normalized realtime events; both delegate to the
 *   private `mapper`.
 * - `buildSessionConfig`: converts a normalized session config into the
 *   provider-specific setup payload.
 */
+ declare class GoogleRealtimeModel implements Experimental_RealtimeModelV4 {
606
+ readonly specificationVersion: "v4";
607
+ readonly provider: string;
608
+ readonly modelId: string;
609
+ private readonly config;
610
+ private readonly mapper;
611
+ constructor(modelId: string, config: GoogleRealtimeModelConfig);
612
+ doCreateClientSecret(options: Experimental_RealtimeModelV4ClientSecretOptions): Promise<Experimental_RealtimeModelV4ClientSecretResult>;
613
+ getWebSocketConfig(options: {
614
+ token: string;
615
+ url: string;
616
+ }): {
617
+ url: string;
618
+ protocols?: string[];
619
+ };
620
+ parseServerEvent(raw: unknown): Experimental_RealtimeModelV4ServerEvent | Experimental_RealtimeModelV4ServerEvent[];
621
+ serializeClientEvent(event: Experimental_RealtimeModelV4ClientEvent): ReturnType<Experimental_RealtimeModelV4['serializeClientEvent']>;
622
+ buildSessionConfig(config: Experimental_RealtimeModelV4SessionConfig): Record<string, unknown>;
623
+ }
624
+
598
625
  declare const VERSION: string;
599
626
 
600
- export { type GoogleEmbeddingModelOptions, type GoogleErrorData, type GoogleFilesUploadOptions, type GoogleEmbeddingModelOptions as GoogleGenerativeAIEmbeddingProviderOptions, type GoogleImageModelOptions as GoogleGenerativeAIImageProviderOptions, type GoogleProvider as GoogleGenerativeAIProvider, type GoogleProviderMetadata as GoogleGenerativeAIProviderMetadata, type GoogleLanguageModelOptions as GoogleGenerativeAIProviderOptions, type GoogleProviderSettings as GoogleGenerativeAIProviderSettings, type GoogleVideoModelId as GoogleGenerativeAIVideoModelId, type GoogleVideoModelOptions as GoogleGenerativeAIVideoProviderOptions, type GoogleImageModelOptions, type GoogleInteractionsAgentName, type GoogleInteractionsModelId, type GoogleInteractionsProviderMetadata, type GoogleLanguageModelInteractionsOptions, type GoogleLanguageModelOptions, type GoogleProvider, type GoogleProviderMetadata, type GoogleProviderSettings, type GoogleSpeechModelId, type GoogleSpeechModelOptions, type GoogleVideoModelId, type GoogleVideoModelOptions, VERSION, createGoogle, createGoogle as createGoogleGenerativeAI, google };
627
+ export { GoogleRealtimeModel as Experimental_GoogleRealtimeModel, type GoogleRealtimeModelConfig as Experimental_GoogleRealtimeModelConfig, type GoogleEmbeddingModelOptions, type GoogleErrorData, type GoogleFilesUploadOptions, type GoogleEmbeddingModelOptions as GoogleGenerativeAIEmbeddingProviderOptions, type GoogleImageModelOptions as GoogleGenerativeAIImageProviderOptions, type GoogleProvider as GoogleGenerativeAIProvider, type GoogleProviderMetadata as GoogleGenerativeAIProviderMetadata, type GoogleLanguageModelOptions as GoogleGenerativeAIProviderOptions, type GoogleProviderSettings as GoogleGenerativeAIProviderSettings, type GoogleVideoModelId as GoogleGenerativeAIVideoModelId, type GoogleVideoModelOptions as GoogleGenerativeAIVideoProviderOptions, type GoogleImageModelOptions, type GoogleInteractionsAgentName, type GoogleInteractionsModelId, type GoogleInteractionsProviderMetadata, type GoogleLanguageModelInteractionsOptions, type GoogleLanguageModelOptions, type GoogleProvider, type GoogleProviderMetadata, type GoogleProviderSettings, type GoogleSpeechModelId, type GoogleSpeechModelOptions, type GoogleVideoModelId, type GoogleVideoModelOptions, VERSION, createGoogle, createGoogle as createGoogleGenerativeAI, google };
package/dist/index.js CHANGED
@@ -7,7 +7,7 @@ import {
7
7
  } from "@ai-sdk/provider-utils";
8
8
 
9
9
  // src/version.ts
10
- var VERSION = true ? "4.0.0-canary.77" : "0.0.0-test";
10
+ var VERSION = true ? "4.0.0-canary.79" : "0.0.0-test";
11
11
 
12
12
  // src/google-embedding-model.ts
13
13
  import {
@@ -3675,11 +3675,25 @@ var GoogleSpeechModel = class _GoogleSpeechModel {
3675
3675
  providerOptions
3676
3676
  }) {
3677
3677
  const warnings = [];
3678
- const googleOptions = await parseProviderOptions6({
3679
- provider: "google",
3680
- providerOptions,
3681
- schema: googleSpeechProviderOptionsSchema
3682
- });
3678
+ const providerOptionsNames = this.config.provider.includes("vertex") ? ["googleVertex", "vertex"] : ["google"];
3679
+ let googleOptions;
3680
+ for (const name of providerOptionsNames) {
3681
+ googleOptions = await parseProviderOptions6({
3682
+ provider: name,
3683
+ providerOptions,
3684
+ schema: googleSpeechProviderOptionsSchema
3685
+ });
3686
+ if (googleOptions != null) {
3687
+ break;
3688
+ }
3689
+ }
3690
+ if (googleOptions == null && !providerOptionsNames.includes("google")) {
3691
+ googleOptions = await parseProviderOptions6({
3692
+ provider: "google",
3693
+ providerOptions,
3694
+ schema: googleSpeechProviderOptionsSchema
3695
+ });
3696
+ }
3683
3697
  const multiSpeakerVoiceConfig = googleOptions == null ? void 0 : googleOptions.multiSpeakerVoiceConfig;
3684
3698
  const speechConfig = multiSpeakerVoiceConfig ? { multiSpeakerVoiceConfig } : { voiceConfig: { prebuiltVoiceConfig: { voiceName: voice } } };
3685
3699
  let promptText = text;
@@ -3719,7 +3733,7 @@ var GoogleSpeechModel = class _GoogleSpeechModel {
3719
3733
  });
3720
3734
  }
3721
3735
  const requestBody = {
3722
- contents: [{ parts: [{ text: promptText }] }],
3736
+ contents: [{ role: "user", parts: [{ text: promptText }] }],
3723
3737
  generationConfig: {
3724
3738
  responseModalities: ["AUDIO"],
3725
3739
  speechConfig
@@ -5632,14 +5646,14 @@ async function cancelGoogleInteraction({
5632
5646
  baseURL,
5633
5647
  interactionId,
5634
5648
  headers,
5635
- fetch = getOriginalFetch()
5649
+ fetch: fetch2 = getOriginalFetch()
5636
5650
  }) {
5637
5651
  if (interactionId == null || interactionId.length === 0) {
5638
5652
  return;
5639
5653
  }
5640
5654
  const url = `${baseURL}/interactions/${encodeURIComponent(interactionId)}/cancel`;
5641
5655
  try {
5642
- const response = await fetch(url, {
5656
+ const response = await fetch2(url, {
5643
5657
  method: "POST",
5644
5658
  headers: withUserAgentSuffix(
5645
5659
  combineHeaders7({ "Content-Type": "application/json" }, headers),
@@ -5667,7 +5681,7 @@ async function pollGoogleInteractionUntilTerminal({
5667
5681
  baseURL,
5668
5682
  interactionId,
5669
5683
  headers,
5670
- fetch,
5684
+ fetch: fetch2,
5671
5685
  abortSignal,
5672
5686
  initialDelayMs = DEFAULT_INITIAL_DELAY_MS,
5673
5687
  maxDelayMs = DEFAULT_MAX_DELAY_MS,
@@ -5681,7 +5695,7 @@ async function pollGoogleInteractionUntilTerminal({
5681
5695
  const startedAt = Date.now();
5682
5696
  let nextDelayMs = initialDelayMs;
5683
5697
  const url = `${baseURL}/interactions/${encodeURIComponent(interactionId)}`;
5684
- const cancelOnServer = () => cancelGoogleInteraction({ baseURL, interactionId, headers, fetch });
5698
+ const cancelOnServer = () => cancelGoogleInteraction({ baseURL, interactionId, headers, fetch: fetch2 });
5685
5699
  try {
5686
5700
  while (true) {
5687
5701
  if (abortSignal == null ? void 0 : abortSignal.aborted) {
@@ -5706,7 +5720,7 @@ async function pollGoogleInteractionUntilTerminal({
5706
5720
  googleInteractionsResponseSchema
5707
5721
  ),
5708
5722
  abortSignal,
5709
- fetch
5723
+ fetch: fetch2
5710
5724
  });
5711
5725
  if (isTerminalStatus(response.status)) {
5712
5726
  return { response, rawResponse, responseHeaders };
@@ -5882,7 +5896,7 @@ function streamGoogleInteractionEvents({
5882
5896
  baseURL,
5883
5897
  interactionId,
5884
5898
  headers,
5885
- fetch,
5899
+ fetch: fetch2,
5886
5900
  abortSignal,
5887
5901
  maxRetries = DEFAULT_MAX_RETRIES,
5888
5902
  retryDelayMs = DEFAULT_RETRY_DELAY_MS
@@ -5930,7 +5944,7 @@ function streamGoogleInteractionEvents({
5930
5944
  googleInteractionsEventSchema
5931
5945
  ),
5932
5946
  abortSignal: effectiveSignal,
5933
- fetch
5947
+ fetch: fetch2
5934
5948
  });
5935
5949
  return stream.getReader();
5936
5950
  }
@@ -6023,7 +6037,7 @@ function streamGoogleInteractionEvents({
6023
6037
  baseURL,
6024
6038
  interactionId,
6025
6039
  headers,
6026
- fetch
6040
+ fetch: fetch2
6027
6041
  });
6028
6042
  }
6029
6043
  }
@@ -6676,6 +6690,388 @@ function pruneUndefined(obj) {
6676
6690
  return result;
6677
6691
  }
6678
6692
 
6693
+ // src/realtime/google-realtime-event-mapper.ts
6694
+ import { safeParseJSON } from "@ai-sdk/provider-utils";
6695
+ var GoogleRealtimeEventMapper = class {
6696
+ constructor() {
6697
+ this.turnCounter = 0;
6698
+ this.hasAudio = false;
6699
+ this.hasText = false;
6700
+ this.hasTranscript = false;
6701
+ this.turnClosed = false;
6702
+ this.inputAudioRate = 16e3;
6703
+ }
6704
+ get responseId() {
6705
+ return `google-resp-${this.turnCounter}`;
6706
+ }
6707
+ get itemId() {
6708
+ return `google-item-${this.turnCounter}`;
6709
+ }
6710
+ /**
6711
+ * Rolls over to the next turn lazily, only once new model content actually
6712
+ * arrives. `turnComplete` merely marks the current turn closed; the counter
6713
+ * is not advanced until the next response begins. This keeps a transcript
6714
+ * that arrives shortly after `turnComplete` attached to the turn it belongs
6715
+ * to, since Google delivers transcription independently with no guaranteed
6716
+ * ordering relative to `turnComplete`.
6717
+ */
6718
+ beginTurnIfClosed() {
6719
+ if (!this.turnClosed) return;
6720
+ this.turnCounter++;
6721
+ this.hasAudio = false;
6722
+ this.hasText = false;
6723
+ this.hasTranscript = false;
6724
+ this.turnClosed = false;
6725
+ }
6726
+ parseServerEvent(raw) {
6727
+ var _a, _b;
6728
+ const data = raw;
6729
+ if (data.setupComplete != null) {
6730
+ return { type: "session-created", raw };
6731
+ }
6732
+ if (data.toolCall != null) {
6733
+ this.beginTurnIfClosed();
6734
+ const functionCalls = (_a = data.toolCall.functionCalls) != null ? _a : [];
6735
+ return functionCalls.flatMap((functionCall) => {
6736
+ var _a2;
6737
+ const args = JSON.stringify((_a2 = functionCall.args) != null ? _a2 : {});
6738
+ return [
6739
+ {
6740
+ type: "function-call-arguments-delta",
6741
+ responseId: this.responseId,
6742
+ itemId: this.itemId,
6743
+ callId: functionCall.id,
6744
+ delta: args,
6745
+ raw
6746
+ },
6747
+ {
6748
+ type: "function-call-arguments-done",
6749
+ responseId: this.responseId,
6750
+ itemId: this.itemId,
6751
+ callId: functionCall.id,
6752
+ name: functionCall.name,
6753
+ arguments: args,
6754
+ raw
6755
+ }
6756
+ ];
6757
+ });
6758
+ }
6759
+ if (data.toolCallCancellation != null) {
6760
+ return {
6761
+ type: "custom",
6762
+ rawType: "toolCallCancellation",
6763
+ raw
6764
+ };
6765
+ }
6766
+ if (data.serverContent != null) {
6767
+ return this.parseServerContent(data.serverContent, raw);
6768
+ }
6769
+ if (((_b = data.inputTranscription) == null ? void 0 : _b.text) != null) {
6770
+ return {
6771
+ type: "input-transcription-completed",
6772
+ itemId: `google-input-${this.turnCounter}`,
6773
+ transcript: data.inputTranscription.text,
6774
+ raw
6775
+ };
6776
+ }
6777
+ return { type: "custom", rawType: String(Object.keys(data)[0]), raw };
6778
+ }
6779
+ parseServerContent(serverContent, raw) {
6780
+ var _a, _b, _c, _d;
6781
+ const events = [];
6782
+ if (serverContent.interrupted) {
6783
+ events.push({
6784
+ type: "speech-started",
6785
+ raw
6786
+ });
6787
+ }
6788
+ if ((_a = serverContent.modelTurn) == null ? void 0 : _a.parts) {
6789
+ this.beginTurnIfClosed();
6790
+ for (const part of serverContent.modelTurn.parts) {
6791
+ if ((_b = part.inlineData) == null ? void 0 : _b.data) {
6792
+ this.hasAudio = true;
6793
+ events.push({
6794
+ type: "audio-delta",
6795
+ responseId: this.responseId,
6796
+ itemId: this.itemId,
6797
+ delta: part.inlineData.data,
6798
+ raw
6799
+ });
6800
+ }
6801
+ if (part.text) {
6802
+ this.hasText = true;
6803
+ events.push({
6804
+ type: "text-delta",
6805
+ responseId: this.responseId,
6806
+ itemId: this.itemId,
6807
+ delta: part.text,
6808
+ raw
6809
+ });
6810
+ }
6811
+ }
6812
+ }
6813
+ if ((_c = serverContent.outputTranscription) == null ? void 0 : _c.text) {
6814
+ this.hasTranscript = true;
6815
+ events.push({
6816
+ type: "audio-transcript-delta",
6817
+ responseId: this.responseId,
6818
+ itemId: this.itemId,
6819
+ delta: serverContent.outputTranscription.text,
6820
+ raw
6821
+ });
6822
+ }
6823
+ if ((_d = serverContent.inputTranscription) == null ? void 0 : _d.text) {
6824
+ events.push({
6825
+ type: "input-transcription-completed",
6826
+ itemId: `google-input-${this.turnCounter}`,
6827
+ transcript: serverContent.inputTranscription.text,
6828
+ raw
6829
+ });
6830
+ }
6831
+ if (serverContent.turnComplete) {
6832
+ if (this.hasAudio) {
6833
+ events.push({
6834
+ type: "audio-done",
6835
+ responseId: this.responseId,
6836
+ itemId: this.itemId,
6837
+ raw
6838
+ });
6839
+ }
6840
+ if (this.hasText) {
6841
+ events.push({
6842
+ type: "text-done",
6843
+ responseId: this.responseId,
6844
+ itemId: this.itemId,
6845
+ raw
6846
+ });
6847
+ }
6848
+ if (this.hasTranscript) {
6849
+ events.push({
6850
+ type: "audio-transcript-done",
6851
+ responseId: this.responseId,
6852
+ itemId: this.itemId,
6853
+ raw
6854
+ });
6855
+ }
6856
+ events.push({
6857
+ type: "response-done",
6858
+ responseId: this.responseId,
6859
+ status: "completed",
6860
+ raw
6861
+ });
6862
+ this.turnClosed = true;
6863
+ }
6864
+ if (events.length === 0) {
6865
+ return { type: "custom", rawType: "serverContent", raw };
6866
+ }
6867
+ return events.length === 1 ? events[0] : events;
6868
+ }
6869
+ serializeClientEvent(event, modelId) {
6870
+ var _a;
6871
+ switch (event.type) {
6872
+ case "session-update":
6873
+ if (((_a = event.config.inputAudioFormat) == null ? void 0 : _a.rate) != null) {
6874
+ this.inputAudioRate = event.config.inputAudioFormat.rate;
6875
+ }
6876
+ return {
6877
+ setup: buildGoogleSessionConfig(event.config, modelId)
6878
+ };
6879
+ case "input-audio-append":
6880
+ return {
6881
+ realtimeInput: {
6882
+ audio: {
6883
+ data: event.audio,
6884
+ mimeType: `audio/pcm;rate=${this.inputAudioRate}`
6885
+ }
6886
+ }
6887
+ };
6888
+ case "input-audio-commit":
6889
+ case "input-audio-clear":
6890
+ case "response-create":
6891
+ case "response-cancel":
6892
+ case "conversation-item-truncate":
6893
+ return null;
6894
+ case "conversation-item-create": {
6895
+ const item = event.item;
6896
+ switch (item.type) {
6897
+ case "text-message":
6898
+ return {
6899
+ realtimeInput: {
6900
+ text: item.text
6901
+ }
6902
+ };
6903
+ case "function-call-output":
6904
+ return serializeFunctionCallOutput(item);
6905
+ case "audio-message":
6906
+ return null;
6907
+ }
6908
+ break;
6909
+ }
6910
+ }
6911
+ return null;
6912
+ }
6913
+ };
6914
/**
 * Builds the `toolResponse` message that reports a tool result to Google.
 *
 * The `response` field is sent as a JSON object. Output that already parses
 * to a plain object is forwarded unchanged. Anything else (plain text, or
 * JSON that parses to a scalar, array or null) is wrapped as
 * `{ output: ... }` so the tool result is not silently dropped.
 *
 * @param item Function-call-output item; `callId`, `name` and `output` are read.
 * @returns A promise of the wire message containing one function response.
 */
async function serializeFunctionCallOutput(item) {
  const parseResult = await safeParseJSON({ text: item.output });
  let response;
  if (!parseResult.success) {
    // Not JSON: keep the raw text instead of replacing it with `{}`.
    response = { output: item.output };
  } else {
    const value = parseResult.value;
    const isPlainObject = value !== null && typeof value === "object" && !Array.isArray(value);
    response = isPlainObject ? value : { output: value };
  }
  return {
    toolResponse: {
      functionResponses: [
        {
          id: item.callId,
          name: item.name,
          response
        }
      ]
    }
  };
}
6929
/**
 * Translates a normalized realtime session config into Google's setup
 * payload.
 *
 * @param config Normalized session config; may be null or undefined.
 * @param modelId Model id, converted to a model path for the `model` field.
 * @returns The setup object. Entries of `config.providerOptions` are copied
 *   on last and therefore override the generated fields.
 */
function buildGoogleSessionConfig(config, modelId) {
  const setup = {
    model: getModelPath(modelId)
  };
  // Treat a missing config as an empty one so each field check stays flat.
  const session = config == null ? {} : config;

  const generationConfig = {};
  // Audio is the response modality when the caller does not choose any.
  generationConfig.responseModalities = session.outputModalities != null ? session.outputModalities.map((modality) => modality.toUpperCase()) : ["AUDIO"];
  if (session.voice != null) {
    const prebuiltVoiceConfig = { voiceName: session.voice };
    generationConfig.speechConfig = { voiceConfig: { prebuiltVoiceConfig } };
  }
  setup.generationConfig = generationConfig;

  if (session.instructions != null) {
    setup.systemInstruction = { parts: [{ text: session.instructions }] };
  }

  const hasTools = session.tools != null && session.tools.length > 0;
  if (hasTools) {
    const functionDeclarations = session.tools.map((tool) => ({
      name: tool.name,
      description: tool.description,
      parameters: convertJSONSchemaToOpenAPISchema(tool.parameters)
    }));
    setup.tools = [{ functionDeclarations }];
  }

  // Transcription is switched on by presence; an empty object is sent.
  if (session.inputAudioTranscription != null) {
    setup.inputAudioTranscription = {};
  }
  if (session.outputAudioTranscription != null) {
    setup.outputAudioTranscription = {};
  }

  if (session.providerOptions != null) {
    Object.assign(setup, session.providerOptions);
  }
  return setup;
}
6978
+
6979
+ // src/realtime/google-realtime-model.ts
6980
+ var realtimeWebSocketPath = "google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContentConstrained";
6981
/**
 * Parses the configured base URL and removes a trailing API version segment
 * (`v1beta` or `v1alpha`) so callers can append their own versioned paths.
 *
 * Trailing slashes are ignored when looking for the version segment, so
 * `https://host/v1beta/` is handled the same way as `https://host/v1beta`.
 * URLs without a recognized version segment are returned unmodified.
 *
 * @param baseURL Absolute base URL of the API.
 * @returns A new `URL` instance.
 * @throws TypeError when `baseURL` is not a valid absolute URL.
 */
function getRealtimeBaseURL(baseURL) {
  const url = new URL(baseURL);
  const pathSegments = url.pathname.split("/");
  // Skip the empty segments produced by trailing slashes; index 0 is the
  // empty segment before the leading slash and is never skipped.
  let lastIndex = pathSegments.length - 1;
  while (lastIndex > 0 && pathSegments[lastIndex] === "") {
    lastIndex--;
  }
  const version = pathSegments[lastIndex];
  if (version === "v1beta" || version === "v1alpha") {
    url.pathname = pathSegments.slice(0, lastIndex).join("/") || "/";
  }
  return url;
}
6991
/**
 * Derives the auth-token endpoint from the configured base URL.
 *
 * @param baseURL Absolute base URL of the API.
 * @returns The endpoint as a string, ending in `/v1alpha/auth_tokens`.
 */
function getAuthTokensURL(baseURL) {
  const endpoint = getRealtimeBaseURL(baseURL);
  // Drop one trailing slash so the join never produces "//".
  const prefix = endpoint.pathname.replace(/\/$/, "");
  endpoint.pathname = prefix + "/v1alpha/auth_tokens";
  return endpoint.href;
}
6996
/**
 * Derives the realtime WebSocket endpoint from the configured base URL.
 *
 * @param baseURL Absolute base URL of the API.
 * @returns The endpoint as a string: `wss:` when the base URL uses `https:`,
 *   `ws:` otherwise.
 */
function getWebSocketURL(baseURL) {
  const socketURL = getRealtimeBaseURL(baseURL);
  const isSecure = socketURL.protocol === "https:";
  socketURL.protocol = isSecure ? "wss:" : "ws:";
  // Drop one trailing slash so the join never produces "//".
  const prefix = socketURL.pathname.replace(/\/$/, "");
  socketURL.pathname = prefix + "/ws/" + realtimeWebSocketPath;
  return socketURL.href;
}
7002
/**
 * Google implementation of the experimental realtime model specification.
 *
 * It creates tokens for client connections, builds the WebSocket connection
 * URL, and delegates event translation to a `GoogleRealtimeEventMapper`.
 */
var GoogleRealtimeModel = class {
  /**
   * @param modelId Model id used when building session setup payloads.
   * @param config Provider name, base URL, header factory and optional fetch.
   */
  constructor(modelId, config) {
    this.specificationVersion = "v4";
    this.mapper = new GoogleRealtimeEventMapper();
    this.modelId = modelId;
    this.provider = config.provider;
    this.config = config;
  }
  /**
   * Requests a token from the auth-tokens endpoint.
   *
   * @param options `sessionConfig` is baked into the token request;
   *   `expiresAfterSeconds` (default 60) is the window in which a new session
   *   may be started. The token itself expires 30 minutes after that window.
   * @returns The token name, the WebSocket URL and, when the response carries
   *   a parseable `expireTime`, the expiry in seconds since the epoch.
   * @throws Error when no API key is configured, when the request fails, or
   *   when the response does not contain a token name.
   */
  async doCreateClientSecret(options) {
    const fetchFn = this.config.fetch != null ? this.config.fetch : fetch;
    const headers = this.config.headers();
    const apiKey = headers["x-goog-api-key"];
    if (!apiKey) {
      throw new Error(
        "Google Generative AI API key is required for realtime token creation."
      );
    }
    const now = Date.now();
    const openWindowSeconds = options.expiresAfterSeconds != null ? options.expiresAfterSeconds : 60;
    const openWindowMs = openWindowSeconds * 1e3;
    const newSessionExpireTime = new Date(now + openWindowMs).toISOString();
    const expireTime = new Date(
      now + openWindowMs + 30 * 60 * 1e3
    ).toISOString();
    const setupPayload = buildGoogleSessionConfig(
      options.sessionConfig,
      this.modelId
    );
    // NOTE(review): the API key travels in the query string, where it can end
    // up in access logs; consider sending it as a header if the endpoint
    // accepts that - confirm before changing.
    const response = await fetchFn(
      `${getAuthTokensURL(this.config.baseURL)}?key=${encodeURIComponent(apiKey)}`,
      {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({
          // `uses: 0` means no limit is applied to how many times the token can
          // start a session (per the AuthToken spec). An unset value would
          // default to 1, which breaks WebSocket reconnects within the session.
          uses: 0,
          expireTime,
          newSessionExpireTime,
          bidiGenerateContentSetup: setupPayload
        })
      }
    );
    if (!response.ok) {
      const text = await response.text();
      throw new Error(
        `Google realtime auth token request failed: ${response.status} ${text}`
      );
    }
    const data = await response.json();
    // Fail here rather than handing back `token: undefined`, which would only
    // surface later as a rejected WebSocket connection.
    if (data == null || typeof data.name !== "string" || data.name.length === 0) {
      throw new Error(
        "Google realtime auth token response did not include a token name."
      );
    }
    const expiresAtMs = data.expireTime ? new Date(data.expireTime).getTime() : NaN;
    return {
      token: data.name,
      url: getWebSocketURL(this.config.baseURL),
      // An unparseable timestamp is reported as "no expiry" instead of NaN.
      expiresAt: Number.isNaN(expiresAtMs) ? void 0 : Math.floor(expiresAtMs / 1e3)
    };
  }
  /**
   * Builds the WebSocket connection settings for a token.
   *
   * @param options `url` is the WebSocket endpoint, `token` the token to
   *   attach as the `access_token` query parameter.
   * @returns An object with the final connection `url`. The token is appended
   *   with `&` when the URL already has a query string.
   */
  getWebSocketConfig(options) {
    const baseUrl = options.url;
    let separator = "?";
    if (baseUrl.includes("?")) {
      // Do not add a second separator after a dangling "?" or "&".
      separator = /[?&]$/.test(baseUrl) ? "" : "&";
    }
    return {
      url: `${baseUrl}${separator}access_token=${encodeURIComponent(options.token)}`
    };
  }
  /** Converts a decoded server message into normalized event(s). */
  parseServerEvent(raw) {
    return this.mapper.parseServerEvent(raw);
  }
  /** Converts a normalized client event into Google's wire message. */
  serializeClientEvent(event) {
    return this.mapper.serializeClientEvent(event, this.modelId);
  }
  /** Converts a normalized session config into Google's setup payload. */
  buildSessionConfig(config) {
    return buildGoogleSessionConfig(config, this.modelId);
  }
};
7074
+
6679
7075
  // src/google-provider.ts
6680
7076
  function createGoogle(options = {}) {
6681
7077
  var _a, _b;
@@ -6742,12 +7138,35 @@ function createGoogle(options = {}) {
6742
7138
  generateId: (_a2 = options.generateId) != null ? _a2 : generateId2
6743
7139
  });
6744
7140
  };
7141
+ const createRealtimeModel = (modelId) => new GoogleRealtimeModel(modelId, {
7142
+ provider: `${providerName}.realtime`,
7143
+ baseURL,
7144
+ headers: getHeaders,
7145
+ fetch: options.fetch
7146
+ });
6745
7147
  const createSpeechModel = (modelId) => new GoogleSpeechModel(modelId, {
6746
7148
  provider: `${providerName}.speech`,
6747
7149
  baseURL,
6748
7150
  headers: getHeaders,
6749
7151
  fetch: options.fetch
6750
7152
  });
7153
+ const experimentalRealtimeFactory = Object.assign(
7154
+ (modelId) => createRealtimeModel(modelId),
7155
+ {
7156
+ getToken: async (tokenOptions) => {
7157
+ const model = createRealtimeModel(tokenOptions.model);
7158
+ const secret = await model.doCreateClientSecret({
7159
+ sessionConfig: tokenOptions.sessionConfig,
7160
+ expiresAfterSeconds: tokenOptions.expiresAfterSeconds
7161
+ });
7162
+ return {
7163
+ token: secret.token,
7164
+ url: secret.url,
7165
+ expiresAt: secret.expiresAt
7166
+ };
7167
+ }
7168
+ }
7169
+ );
6751
7170
  const createInteractionsModel = (modelIdOrAgent) => {
6752
7171
  var _a2;
6753
7172
  return new GoogleInteractionsLanguageModel(
@@ -6781,6 +7200,7 @@ function createGoogle(options = {}) {
6781
7200
  provider.imageModel = createImageModel;
6782
7201
  provider.video = createVideoModel;
6783
7202
  provider.videoModel = createVideoModel;
7203
+ provider.experimental_realtime = experimentalRealtimeFactory;
6784
7204
  provider.files = createFiles;
6785
7205
  provider.speech = createSpeechModel;
6786
7206
  provider.speechModel = createSpeechModel;
@@ -6790,6 +7210,7 @@ function createGoogle(options = {}) {
6790
7210
  }
6791
7211
  var google = createGoogle();
6792
7212
  export {
7213
+ GoogleRealtimeModel as Experimental_GoogleRealtimeModel,
6793
7214
  VERSION,
6794
7215
  createGoogle,
6795
7216
  createGoogle as createGoogleGenerativeAI,