@firebase/ai 2.4.0-canary.91c218db2 → 2.4.0-canary.bc5a7c4a7

@@ -53,32 +53,65 @@ export declare class LiveSession {
   */
  send(request: string | Array<string | Part>, turnComplete?: boolean): Promise<void>;
  /**
- * Sends realtime input to the server.
+ * Sends text to the server in realtime.
  *
- * @param mediaChunks - The media chunks to send.
+ * @example
+ * ```javascript
+ * liveSession.sendTextRealtime("Hello, how are you?");
+ * ```
+ *
+ * @param text - The text data to send.
  * @throws If this session has been closed.
  *
  * @beta
  */
- sendMediaChunks(mediaChunks: GenerativeContentBlob[]): Promise<void>;
+ sendTextRealtime(text: string): Promise<void>;
  /**
- * Sends function responses to the server.
+ * Sends audio data to the server in realtime.
  *
- * @param functionResponses - The function responses to send.
+ * @remarks The server requires that the audio data is base64-encoded 16-bit PCM at 16kHz
+ * little-endian.
+ *
+ * @example
+ * ```javascript
+ * // const pcmData = ... base64-encoded 16-bit PCM at 16kHz little-endian.
+ * const blob = { mimeType: "audio/pcm", data: pcmData };
+ * liveSession.sendAudioRealtime(blob);
+ * ```
+ *
+ * @param blob - The base64-encoded PCM data to send to the server in realtime.
  * @throws If this session has been closed.
  *
  * @beta
  */
- sendFunctionResponses(functionResponses: FunctionResponse[]): Promise<void>;
+ sendAudioRealtime(blob: GenerativeContentBlob): Promise<void>;
  /**
- * Sends a stream of {@link GenerativeContentBlob}.
+ * Sends video data to the server in realtime.
  *
- * @param mediaChunkStream - The stream of {@link GenerativeContentBlob} to send.
+ * @remarks The server requires that the video is sent as individual video frames at 1 FPS. It
+ * is recommended to set `mimeType` to `image/jpeg`.
+ *
+ * @example
+ * ```javascript
+ * // const videoFrame = ... base64-encoded JPEG data
+ * const blob = { mimeType: "image/jpeg", data: videoFrame };
+ * liveSession.sendVideoRealtime(blob);
+ * ```
+ * @param blob - The base64-encoded video data to send to the server in realtime.
  * @throws If this session has been closed.
  *
  * @beta
  */
- sendMediaStream(mediaChunkStream: ReadableStream<GenerativeContentBlob>): Promise<void>;
+ sendVideoRealtime(blob: GenerativeContentBlob): Promise<void>;
+ /**
+ * Sends function responses to the server.
+ *
+ * @param functionResponses - The function responses to send.
+ * @throws If this session has been closed.
+ *
+ * @beta
+ */
+ sendFunctionResponses(functionResponses: FunctionResponse[]): Promise<void>;
  /**
  * Yields messages received from the server.
  * This can only be used by one consumer at a time.
@@ -96,4 +129,26 @@ export declare class LiveSession {
  * @beta
  */
  close(): Promise<void>;
+ /**
+ * Sends realtime input to the server.
+ *
+ * @deprecated Use `sendTextRealtime()`, `sendAudioRealtime()`, and `sendVideoRealtime()` instead.
+ *
+ * @param mediaChunks - The media chunks to send.
+ * @throws If this session has been closed.
+ *
+ * @beta
+ */
+ sendMediaChunks(mediaChunks: GenerativeContentBlob[]): Promise<void>;
+ /**
+ * @deprecated Use `sendTextRealtime()`, `sendAudioRealtime()`, and `sendVideoRealtime()` instead.
+ *
+ * Sends a stream of {@link GenerativeContentBlob}.
+ *
+ * @param mediaChunkStream - The stream of {@link GenerativeContentBlob} to send.
+ * @throws If this session has been closed.
+ *
+ * @beta
+ */
+ sendMediaStream(mediaChunkStream: ReadableStream<GenerativeContentBlob>): Promise<void>;
  }
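
Taken together, the declaration changes replace the catch-all `sendMediaChunks()` with one method per media type. A minimal usage sketch in TypeScript; the `session` argument is assumed to come from `LiveGenerativeModel.connect()` (not shown in this diff), and the base64 payloads are placeholders:

```ts
import type { LiveSession } from '@firebase/ai';

// Hedged sketch of the new per-type methods. `pcmBase64` and `jpegBase64`
// are assumed to be media captured and encoded elsewhere.
async function sendOneOfEach(
  session: LiveSession,
  pcmBase64: string,
  jpegBase64: string
): Promise<void> {
  // Text no longer needs to be wrapped in a GenerativeContentBlob.
  await session.sendTextRealtime('Hello, how are you?');
  // Audio: base64-encoded 16-bit PCM at 16kHz little-endian, per the docs above.
  await session.sendAudioRealtime({ mimeType: 'audio/pcm', data: pcmBase64 });
  // Video: individual frames at 1 FPS; JPEG is the recommended format.
  await session.sendVideoRealtime({ mimeType: 'image/jpeg', data: jpegBase64 });
}
```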
@@ -15,7 +15,8 @@
  * limitations under the License.
  */
  import { Content, FunctionResponse, GenerativeContentBlob, Part } from './content';
- import { LiveGenerationConfig, Tool, ToolConfig } from './requests';
+ import { AudioTranscriptionConfig, LiveGenerationConfig, Tool, ToolConfig } from './requests';
+ import { Transcription } from './responses';
  /**
  * User input that is sent to the model.
  *
@@ -25,6 +26,8 @@ export interface _LiveClientContent {
  clientContent: {
  turns: [Content];
  turnComplete: boolean;
+ inputTranscription?: Transcription;
+ outputTranscription?: Transcription;
  };
  }
  /**
@@ -34,7 +37,13 @@ export interface _LiveClientContent {
  */
  export interface _LiveClientRealtimeInput {
  realtimeInput: {
- mediaChunks: GenerativeContentBlob[];
+ text?: string;
+ audio?: GenerativeContentBlob;
+ video?: GenerativeContentBlob;
+ /**
+ * @deprecated Use `text`, `audio`, and `video` instead.
+ */
+ mediaChunks?: GenerativeContentBlob[];
  };
  }
  /**
@@ -53,9 +62,18 @@ export interface _LiveClientToolResponse {
  export interface _LiveClientSetup {
  setup: {
  model: string;
- generationConfig?: LiveGenerationConfig;
+ generationConfig?: _LiveGenerationConfig;
  tools?: Tool[];
  toolConfig?: ToolConfig;
  systemInstruction?: string | Part | Content;
+ inputAudioTranscription?: AudioTranscriptionConfig;
+ outputAudioTranscription?: AudioTranscriptionConfig;
  };
  }
+ /**
+ * The Live Generation Config.
+ *
+ * The public API ({@link LiveGenerationConfig}) has `inputAudioTranscription` and `outputAudioTranscription`,
+ * but the server expects these fields to be in the top-level `setup` message. This was a conscious API decision.
+ */
+ export type _LiveGenerationConfig = Omit<LiveGenerationConfig, 'inputAudioTranscription' | 'outputAudioTranscription'>;
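
Under these internal types, a fully populated `setup` frame with transcription enabled would look roughly as below; the model path and config values are illustrative, not taken from the package:

```ts
// Illustrative only (values made up): a _LiveClientSetup frame with
// transcription enabled. The transcription configs sit beside, not inside,
// generationConfig, which is why _LiveGenerationConfig omits them.
const setupMessage = {
  setup: {
    model: 'projects/my-project/locations/us-central1/publishers/google/models/some-model',
    generationConfig: { responseModalities: ['AUDIO'] },
    inputAudioTranscription: {},
    outputAudioTranscription: {}
  }
};
```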
@@ -167,6 +167,24 @@ export interface LiveGenerationConfig {
  * The modalities of the response.
  */
  responseModalities?: ResponseModality[];
+ /**
+ * Enables transcription of audio input.
+ *
+ * When enabled, the model will respond with transcriptions of your audio input in the `inputTranscription` property
+ * in {@link LiveServerContent} messages. Note that the transcriptions are broken up across
+ * messages, so you may only receive small amounts of text per message. For example, if you ask the model
+ * "How are you today?", the model may transcribe that input across three messages, broken up as "How a", "re yo", "u today?".
+ */
+ inputAudioTranscription?: AudioTranscriptionConfig;
+ /**
+ * Enables transcription of audio output.
+ *
+ * When enabled, the model will respond with transcriptions of its audio output in the `outputTranscription` property
+ * in {@link LiveServerContent} messages. Note that the transcriptions are broken up across
+ * messages, so you may only receive small amounts of text per message. For example, if the model says
+ * "How are you today?", it may transcribe that output across three messages, broken up as "How a", "re yo", "u today?".
+ */
+ outputAudioTranscription?: AudioTranscriptionConfig;
  }
  /**
  * Params for {@link GenerativeModel.startChat}.
@@ -439,3 +457,8 @@ export interface SpeechConfig {
  */
  voiceConfig?: VoiceConfig;
  }
+ /**
+ * The audio transcription configuration.
+ */
+ export interface AudioTranscriptionConfig {
+ }
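
On the public API side, transcription is opted into by passing the (currently field-less) config objects through `LiveGenerationConfig`. A hedged sketch; the entry points are from the package's existing surface, and the model name is a placeholder:

```ts
import { initializeApp } from '@firebase/app';
import { getAI, getLiveGenerativeModel, ResponseModality } from '@firebase/ai';

const app = initializeApp({ /* project config */ });
const ai = getAI(app);
const model = getLiveGenerativeModel(ai, {
  model: 'my-live-model', // placeholder model name
  generationConfig: {
    responseModalities: [ResponseModality.AUDIO],
    // Empty objects opt in; AudioTranscriptionConfig defines no fields yet.
    inputAudioTranscription: {},
    outputAudioTranscription: {}
  }
});
```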
@@ -516,6 +516,27 @@ export interface LiveServerContent {
  * model was not interrupted.
  */
  interrupted?: boolean;
+ /**
+ * Transcription of the audio that was input to the model.
+ */
+ inputTranscription?: Transcription;
+ /**
+ * Transcription of the audio output from the model.
+ */
+ outputTranscription?: Transcription;
+ }
+ /**
+ * Transcription of audio. This can be returned from a {@link LiveGenerativeModel} if transcription
+ * is enabled with the `inputAudioTranscription` or `outputAudioTranscription` properties on
+ * the {@link LiveGenerationConfig}.
+ *
+ * @beta
+ */
+ export interface Transcription {
+ /**
+ * The text transcription of the audio.
+ */
+ text?: string;
  }
  /**
  * A request from the model for the client to execute one or more functions.
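
Since the transcript arrives in fragments spread over many messages, a consumer has to concatenate them. A sketch, assuming the message generator is the `receive()` method whose doc comment appears in the hunks above, and that server messages carry a `type` discriminator (neither signature is shown in this diff):

```ts
import type { LiveSession } from '@firebase/ai';

// Stitches fragmented output transcription ("How a" + "re yo" + "u today?")
// back together until the model's turn completes. Hedged sketch.
async function collectOutputTranscript(session: LiveSession): Promise<string> {
  let transcript = '';
  for await (const message of session.receive()) {
    if (message.type === 'serverContent') {
      transcript += message.outputTranscription?.text ?? '';
      if (message.turnComplete) {
        break;
      }
    }
  }
  return transcript;
}
```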
package/dist/index.cjs.js CHANGED
@@ -8,7 +8,7 @@ var util = require('@firebase/util');
  var logger$1 = require('@firebase/logger');
 
  var name = "@firebase/ai";
- var version = "2.4.0-canary.91c218db2";
+ var version = "2.4.0-canary.bc5a7c4a7";
 
  /**
  * @license
@@ -2874,75 +2874,104 @@ class LiveSession {
  this.webSocketHandler.send(JSON.stringify(message));
  }
  /**
- * Sends realtime input to the server.
+ * Sends text to the server in realtime.
  *
- * @param mediaChunks - The media chunks to send.
+ * @example
+ * ```javascript
+ * liveSession.sendTextRealtime("Hello, how are you?");
+ * ```
+ *
+ * @param text - The text data to send.
  * @throws If this session has been closed.
  *
  * @beta
  */
- async sendMediaChunks(mediaChunks) {
+ async sendTextRealtime(text) {
  if (this.isClosed) {
  throw new AIError(AIErrorCode.REQUEST_ERROR, 'This LiveSession has been closed and cannot be used.');
  }
- // The backend does not support sending more than one mediaChunk in one message.
- // Work around this limitation by sending mediaChunks in separate messages.
- mediaChunks.forEach(mediaChunk => {
- const message = {
- realtimeInput: { mediaChunks: [mediaChunk] }
- };
- this.webSocketHandler.send(JSON.stringify(message));
- });
+ const message = {
+ realtimeInput: {
+ text
+ }
+ };
+ this.webSocketHandler.send(JSON.stringify(message));
  }
  /**
- * Sends function responses to the server.
+ * Sends audio data to the server in realtime.
  *
- * @param functionResponses - The function responses to send.
+ * @remarks The server requires that the audio data is base64-encoded 16-bit PCM at 16kHz
+ * little-endian.
+ *
+ * @example
+ * ```javascript
+ * // const pcmData = ... base64-encoded 16-bit PCM at 16kHz little-endian.
+ * const blob = { mimeType: "audio/pcm", data: pcmData };
+ * liveSession.sendAudioRealtime(blob);
+ * ```
+ *
+ * @param blob - The base64-encoded PCM data to send to the server in realtime.
  * @throws If this session has been closed.
  *
  * @beta
  */
- async sendFunctionResponses(functionResponses) {
+ async sendAudioRealtime(blob) {
  if (this.isClosed) {
  throw new AIError(AIErrorCode.REQUEST_ERROR, 'This LiveSession has been closed and cannot be used.');
  }
  const message = {
- toolResponse: {
- functionResponses
+ realtimeInput: {
+ audio: blob
  }
  };
  this.webSocketHandler.send(JSON.stringify(message));
  }
  /**
- * Sends a stream of {@link GenerativeContentBlob}.
+ * Sends video data to the server in realtime.
  *
- * @param mediaChunkStream - The stream of {@link GenerativeContentBlob} to send.
+ * @remarks The server requires that the video is sent as individual video frames at 1 FPS. It
+ * is recommended to set `mimeType` to `image/jpeg`.
+ *
+ * @example
+ * ```javascript
+ * // const videoFrame = ... base64-encoded JPEG data
+ * const blob = { mimeType: "image/jpeg", data: videoFrame };
+ * liveSession.sendVideoRealtime(blob);
+ * ```
+ * @param blob - The base64-encoded video data to send to the server in realtime.
  * @throws If this session has been closed.
  *
  * @beta
  */
- async sendMediaStream(mediaChunkStream) {
+ async sendVideoRealtime(blob) {
  if (this.isClosed) {
  throw new AIError(AIErrorCode.REQUEST_ERROR, 'This LiveSession has been closed and cannot be used.');
  }
- const reader = mediaChunkStream.getReader();
- while (true) {
- try {
- const { done, value } = await reader.read();
- if (done) {
- break;
- }
- else if (!value) {
- throw new Error('Missing chunk in reader, but reader is not done.');
- }
- await this.sendMediaChunks([value]);
- }
- catch (e) {
- // Re-throw any errors that occur during stream consumption or sending.
- const message = e instanceof Error ? e.message : 'Error processing media stream.';
- throw new AIError(AIErrorCode.REQUEST_ERROR, message);
+ const message = {
+ realtimeInput: {
+ video: blob
  }
+ };
+ this.webSocketHandler.send(JSON.stringify(message));
+ }
+ /**
+ * Sends function responses to the server.
+ *
+ * @param functionResponses - The function responses to send.
+ * @throws If this session has been closed.
+ *
+ * @beta
+ */
+ async sendFunctionResponses(functionResponses) {
+ if (this.isClosed) {
+ throw new AIError(AIErrorCode.REQUEST_ERROR, 'This LiveSession has been closed and cannot be used.');
  }
+ const message = {
+ toolResponse: {
+ functionResponses
+ }
+ };
+ this.webSocketHandler.send(JSON.stringify(message));
  }
  /**
  * Yields messages received from the server.
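
Net effect on the wire: each new method serializes exactly one `realtimeInput` frame keyed by media type, where the deprecated path wrapped everything in a `mediaChunks` array. Roughly (illustrative values):

```ts
// Frames produced by the new methods, per _LiveClientRealtimeInput:
const textFrame  = { realtimeInput: { text: 'Hello, how are you?' } };
const audioFrame = { realtimeInput: { audio: { mimeType: 'audio/pcm', data: 'PCM_BASE64' } } };
const videoFrame = { realtimeInput: { video: { mimeType: 'image/jpeg', data: 'JPEG_BASE64' } } };
// Deprecated form, one chunk per message:
const legacyFrame = { realtimeInput: { mediaChunks: [{ mimeType: 'audio/pcm', data: 'PCM_BASE64' }] } };
```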
@@ -3000,6 +3029,62 @@ class LiveSession {
  await this.webSocketHandler.close(1000, 'Client closed session.');
  }
  }
+ /**
+ * Sends realtime input to the server.
+ *
+ * @deprecated Use `sendTextRealtime()`, `sendAudioRealtime()`, and `sendVideoRealtime()` instead.
+ *
+ * @param mediaChunks - The media chunks to send.
+ * @throws If this session has been closed.
+ *
+ * @beta
+ */
+ async sendMediaChunks(mediaChunks) {
+ if (this.isClosed) {
+ throw new AIError(AIErrorCode.REQUEST_ERROR, 'This LiveSession has been closed and cannot be used.');
+ }
+ // The backend does not support sending more than one mediaChunk in one message.
+ // Work around this limitation by sending mediaChunks in separate messages.
+ mediaChunks.forEach(mediaChunk => {
+ const message = {
+ realtimeInput: { mediaChunks: [mediaChunk] }
+ };
+ this.webSocketHandler.send(JSON.stringify(message));
+ });
+ }
+ /**
+ * @deprecated Use `sendTextRealtime()`, `sendAudioRealtime()`, and `sendVideoRealtime()` instead.
+ *
+ * Sends a stream of {@link GenerativeContentBlob}.
+ *
+ * @param mediaChunkStream - The stream of {@link GenerativeContentBlob} to send.
+ * @throws If this session has been closed.
+ *
+ * @beta
+ */
+ async sendMediaStream(mediaChunkStream) {
+ if (this.isClosed) {
+ throw new AIError(AIErrorCode.REQUEST_ERROR, 'This LiveSession has been closed and cannot be used.');
+ }
+ const reader = mediaChunkStream.getReader();
+ while (true) {
+ try {
+ const { done, value } = await reader.read();
+ if (done) {
+ break;
+ }
+ else if (!value) {
+ throw new Error('Missing chunk in reader, but reader is not done.');
+ }
+ await this.sendMediaChunks([value]);
+ }
+ catch (e) {
+ // Re-throw any errors that occur during stream consumption or sending.
+ const message = e instanceof Error ? e.message : 'Error processing media stream.';
+ throw new AIError(AIErrorCode.REQUEST_ERROR, message);
+ }
+ }
+ }
  }
 
  /**
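
Callers migrating off `sendMediaStream()` can drain the stream themselves and dispatch each chunk to the matching typed method. A hedged sketch; routing on the `mimeType` prefix is an assumption that mirrors the formats the new docs recommend:

```ts
import type { GenerativeContentBlob, LiveSession } from '@firebase/ai';

// Replacement for the deprecated sendMediaStream(): route each chunk
// to sendAudioRealtime() or sendVideoRealtime() by MIME type.
async function sendMediaStreamRealtime(
  session: LiveSession,
  stream: ReadableStream<GenerativeContentBlob>
): Promise<void> {
  const reader = stream.getReader();
  while (true) {
    const { done, value } = await reader.read();
    if (done) {
      break;
    }
    if (value.mimeType.startsWith('audio/')) {
      await session.sendAudioRealtime(value);
    } else {
      await session.sendVideoRealtime(value);
    }
  }
}
```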
@@ -3060,13 +3145,18 @@ class LiveGenerativeModel extends AIModel {
  else {
  fullModelPath = `projects/${this._apiSettings.project}/locations/${this._apiSettings.location}/${this.model}`;
  }
+ // inputAudioTranscription and outputAudioTranscription are on the generation config in the public API,
+ // but the backend expects them to be in the `setup` message.
+ const { inputAudioTranscription, outputAudioTranscription, ...generationConfig } = this.generationConfig;
  const setupMessage = {
  setup: {
  model: fullModelPath,
- generationConfig: this.generationConfig,
+ generationConfig,
  tools: this.tools,
  toolConfig: this.toolConfig,
- systemInstruction: this.systemInstruction
+ systemInstruction: this.systemInstruction,
+ inputAudioTranscription,
+ outputAudioTranscription
  }
  };
  try {
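
The rest-destructure above is the runtime half of the `_LiveGenerationConfig` split: the two transcription fields are pulled off the user's config and hoisted into `setup`, and everything else passes through untouched. A standalone illustration:

```ts
// Plain destructuring, same pattern as the compiled code above.
const userConfig = {
  responseModalities: ['AUDIO'],
  inputAudioTranscription: {},
  outputAudioTranscription: {}
};
const { inputAudioTranscription, outputAudioTranscription, ...generationConfig } = userConfig;
console.log(generationConfig);                                  // { responseModalities: ['AUDIO'] }
console.log(inputAudioTranscription, outputAudioTranscription); // {} {} — sent at the top level of `setup`
```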
@@ -3772,7 +3862,7 @@ class AudioConversationRunner {
  mimeType: 'audio/pcm',
  data: base64
  };
- void this.liveSession.sendMediaChunks([chunk]);
+ void this.liveSession.sendAudioRealtime(chunk);
  };
  }
  /**