@firebase/ai 2.4.0 → 2.5.0-20251028194003

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -192,6 +192,12 @@ export declare interface AudioConversationController {
192
192
  stop: () => Promise<void>;
193
193
  }
194
194
 
195
+ /**
196
+ * The audio transcription configuration.
197
+ */
198
+ export declare interface AudioTranscriptionConfig {
199
+ }
200
+
195
201
  /**
196
202
  * Abstract base class representing the configuration for an AI service backend.
197
203
  * This class should not be instantiated directly. Use its subclasses; {@link GoogleAIBackend} for
@@ -558,6 +564,12 @@ export declare interface EnhancedGenerateContentResponse extends GenerateContent
558
564
  * set to `true`.
559
565
  */
560
566
  thoughtSummary: () => string | undefined;
567
+ /**
568
+ * Indicates whether inference happened on-device or in-cloud.
569
+ *
570
+ * @beta
571
+ */
572
+ inferenceSource?: InferenceSource;
561
573
  }
562
574
 
563
575
  /**
@@ -1833,6 +1845,23 @@ export declare const InferenceMode: {
1833
1845
  */
1834
1846
  export declare type InferenceMode = (typeof InferenceMode)[keyof typeof InferenceMode];
1835
1847
 
1848
+ /**
1849
+ * Indicates whether inference happened on-device or in-cloud.
1850
+ *
1851
+ * @beta
1852
+ */
1853
+ export declare const InferenceSource: {
1854
+ readonly ON_DEVICE: "on_device";
1855
+ readonly IN_CLOUD: "in_cloud";
1856
+ };
1857
+
1858
+ /**
1859
+ * Indicates whether inference happened on-device or in-cloud.
1860
+ *
1861
+ * @beta
1862
+ */
1863
+ export declare type InferenceSource = (typeof InferenceSource)[keyof typeof InferenceSource];
1864
+
1836
1865
  /**
1837
1866
  * Content part interface if the part represents an image.
1838
1867
  * @public
@@ -1997,6 +2026,24 @@ export declare interface LiveGenerationConfig {
1997
2026
  * The modalities of the response.
1998
2027
  */
1999
2028
  responseModalities?: ResponseModality[];
2029
+ /**
2030
+ * Enables transcription of audio input.
2031
+ *
2032
+ * When enabled, the model will respond with transcriptions of your audio input in the `inputTranscription` property
2033
+ * in {@link LiveServerContent} messages. Note that the transcriptions are broken up across
2034
+ * messages, so you may only receive small amounts of text per message. For example, if you ask the model
2035
+ * "How are you today?", the model may transcribe that input across three messages, broken up as "How a", "re yo", "u today?".
2036
+ */
2037
+ inputAudioTranscription?: AudioTranscriptionConfig;
2038
+ /**
2039
+ * Enables transcription of audio output.
2040
+ *
2041
+ * When enabled, the model will respond with transcriptions of its audio output in the `outputTranscription` property
2042
+ * in {@link LiveServerContent} messages. Note that the transcriptions are broken up across
2043
+ * messages, so you may only receive small amounts of text per message. For example, if the model says
2044
+ * "How are you today?", the model may transcribe that output across three messages, broken up as "How a", "re yo", "u today?".
2045
+ */
2046
+ outputAudioTranscription?: AudioTranscriptionConfig;
2000
2047
  }
2001
2048
 
2002
2049
  /**
@@ -2078,6 +2125,14 @@ export declare interface LiveServerContent {
2078
2125
  * model was not interrupted.
2079
2126
  */
2080
2127
  interrupted?: boolean;
2128
+ /**
2129
+ * Transcription of the audio that was input to the model.
2130
+ */
2131
+ inputTranscription?: Transcription;
2132
+ /**
2133
+ * Transcription of the audio output from the model.
2134
+ */
2135
+ outputTranscription?: Transcription;
2081
2136
  }
2082
2137
 
2083
2138
  /**
@@ -2140,32 +2195,65 @@ export declare class LiveSession {
2140
2195
  */
2141
2196
  send(request: string | Array<string | Part>, turnComplete?: boolean): Promise<void>;
2142
2197
  /**
2143
- * Sends realtime input to the server.
2198
+ * Sends text to the server in realtime.
2144
2199
  *
2145
- * @param mediaChunks - The media chunks to send.
2200
+ * @example
2201
+ * ```javascript
2202
+ * liveSession.sendTextRealtime("Hello, how are you?");
2203
+ * ```
2204
+ *
2205
+ * @param text - The text data to send.
2146
2206
  * @throws If this session has been closed.
2147
2207
  *
2148
2208
  * @beta
2149
2209
  */
2150
- sendMediaChunks(mediaChunks: GenerativeContentBlob[]): Promise<void>;
2210
+ sendTextRealtime(text: string): Promise<void>;
2151
2211
  /**
2152
- * Sends function responses to the server.
2212
+ * Sends audio data to the server in realtime.
2153
2213
  *
2154
- * @param functionResponses - The function responses to send.
2214
+ * @remarks The server requires that the audio data is base64-encoded 16-bit PCM at 16kHz
2215
+ * little-endian.
2216
+ *
2217
+ * @example
2218
+ * ```javascript
2219
+ * // const pcmData = ... base64-encoded 16-bit PCM at 16kHz little-endian.
2220
+ * const blob = { mimeType: "audio/pcm", data: pcmData };
2221
+ * liveSession.sendAudioRealtime(blob);
2222
+ * ```
2223
+ *
2224
+ * @param blob - The base64-encoded PCM data to send to the server in realtime.
2155
2225
  * @throws If this session has been closed.
2156
2226
  *
2157
2227
  * @beta
2158
2228
  */
2159
- sendFunctionResponses(functionResponses: FunctionResponse[]): Promise<void>;
2229
+ sendAudioRealtime(blob: GenerativeContentBlob): Promise<void>;
2160
2230
  /**
2161
- * Sends a stream of {@link GenerativeContentBlob}.
2231
+ * Sends video data to the server in realtime.
2162
2232
  *
2163
- * @param mediaChunkStream - The stream of {@link GenerativeContentBlob} to send.
2233
+ * @remarks The server requires that the video is sent as individual video frames at 1 FPS. It
2234
+ * is recommended to set `mimeType` to `image/jpeg`.
2235
+ *
2236
+ * @example
2237
+ * ```javascript
2238
+ * // const videoFrame = ... base64-encoded JPEG data
2239
+ * const blob = { mimeType: "image/jpeg", data: videoFrame };
2240
+ * liveSession.sendVideoRealtime(blob);
2241
+ * ```
2242
+ * @param blob - The base64-encoded video data to send to the server in realtime.
2164
2243
  * @throws If this session has been closed.
2165
2244
  *
2166
2245
  * @beta
2167
2246
  */
2168
- sendMediaStream(mediaChunkStream: ReadableStream<GenerativeContentBlob>): Promise<void>;
2247
+ sendVideoRealtime(blob: GenerativeContentBlob): Promise<void>;
2248
+ /**
2249
+ * Sends function responses to the server.
2250
+ *
2251
+ * @param functionResponses - The function responses to send.
2252
+ * @throws If this session has been closed.
2253
+ *
2254
+ * @beta
2255
+ */
2256
+ sendFunctionResponses(functionResponses: FunctionResponse[]): Promise<void>;
2169
2257
  /**
2170
2258
  * Yields messages received from the server.
2171
2259
  * This can only be used by one consumer at a time.
@@ -2183,6 +2271,28 @@ export declare class LiveSession {
2183
2271
  * @beta
2184
2272
  */
2185
2273
  close(): Promise<void>;
2274
+ /**
2275
+ * Sends realtime input to the server.
2276
+ *
2277
+ * @deprecated Use `sendTextRealtime()`, `sendAudioRealtime()`, and `sendVideoRealtime()` instead.
2278
+ *
2279
+ * @param mediaChunks - The media chunks to send.
2280
+ * @throws If this session has been closed.
2281
+ *
2282
+ * @beta
2283
+ */
2284
+ sendMediaChunks(mediaChunks: GenerativeContentBlob[]): Promise<void>;
2285
+ /**
2286
+ * @deprecated Use `sendTextRealtime()`, `sendAudioRealtime()`, and `sendVideoRealtime()` instead.
2287
+ *
2288
+ * Sends a stream of {@link GenerativeContentBlob}.
2289
+ *
2290
+ * @param mediaChunkStream - The stream of {@link GenerativeContentBlob} to send.
2291
+ * @throws If this session has been closed.
2292
+ *
2293
+ * @beta
2294
+ */
2295
+ sendMediaStream(mediaChunkStream: ReadableStream<GenerativeContentBlob>): Promise<void>;
2186
2296
  }
2187
2297
 
2188
2298
  /**
@@ -2874,6 +2984,20 @@ export declare interface ToolConfig {
2874
2984
  functionCallingConfig?: FunctionCallingConfig;
2875
2985
  }
2876
2986
 
2987
+ /**
2988
+ * Transcription of audio. This can be returned from a {@link LiveGenerativeModel} if transcription
2989
+ * is enabled with the `inputAudioTranscription` or `outputAudioTranscription` properties on
2990
+ * the {@link LiveGenerationConfig}.
2991
+ *
2992
+ * @beta
2993
+ */
2994
+ export declare interface Transcription {
2995
+ /**
2996
+ * The text transcription of the audio.
2997
+ */
2998
+ text?: string;
2999
+ }
3000
+
2877
3001
  /**
2878
3002
  * A type that includes all specific Schema types.
2879
3003
  * @public
package/dist/ai.d.ts CHANGED
@@ -232,6 +232,12 @@ export declare interface AudioConversationController {
232
232
  stop: () => Promise<void>;
233
233
  }
234
234
 
235
+ /**
236
+ * The audio transcription configuration.
237
+ */
238
+ export declare interface AudioTranscriptionConfig {
239
+ }
240
+
235
241
  /**
236
242
  * Abstract base class representing the configuration for an AI service backend.
237
243
  * This class should not be instantiated directly. Use its subclasses; {@link GoogleAIBackend} for
@@ -604,6 +610,12 @@ export declare interface EnhancedGenerateContentResponse extends GenerateContent
604
610
  * set to `true`.
605
611
  */
606
612
  thoughtSummary: () => string | undefined;
613
+ /**
614
+ * Indicates whether inference happened on-device or in-cloud.
615
+ *
616
+ * @beta
617
+ */
618
+ inferenceSource?: InferenceSource;
607
619
  }
608
620
 
609
621
  /**
@@ -1945,6 +1957,23 @@ export declare const InferenceMode: {
1945
1957
  */
1946
1958
  export declare type InferenceMode = (typeof InferenceMode)[keyof typeof InferenceMode];
1947
1959
 
1960
+ /**
1961
+ * Indicates whether inference happened on-device or in-cloud.
1962
+ *
1963
+ * @beta
1964
+ */
1965
+ export declare const InferenceSource: {
1966
+ readonly ON_DEVICE: "on_device";
1967
+ readonly IN_CLOUD: "in_cloud";
1968
+ };
1969
+
1970
+ /**
1971
+ * Indicates whether inference happened on-device or in-cloud.
1972
+ *
1973
+ * @beta
1974
+ */
1975
+ export declare type InferenceSource = (typeof InferenceSource)[keyof typeof InferenceSource];
1976
+
1948
1977
  /**
1949
1978
  * Content part interface if the part represents an image.
1950
1979
  * @public
@@ -2112,6 +2141,24 @@ export declare interface LiveGenerationConfig {
2112
2141
  * The modalities of the response.
2113
2142
  */
2114
2143
  responseModalities?: ResponseModality[];
2144
+ /**
2145
+ * Enables transcription of audio input.
2146
+ *
2147
+ * When enabled, the model will respond with transcriptions of your audio input in the `inputTranscription` property
2148
+ * in {@link LiveServerContent} messages. Note that the transcriptions are broken up across
2149
+ * messages, so you may only receive small amounts of text per message. For example, if you ask the model
2150
+ * "How are you today?", the model may transcribe that input across three messages, broken up as "How a", "re yo", "u today?".
2151
+ */
2152
+ inputAudioTranscription?: AudioTranscriptionConfig;
2153
+ /**
2154
+ * Enables transcription of audio output.
2155
+ *
2156
+ * When enabled, the model will respond with transcriptions of its audio output in the `outputTranscription` property
2157
+ * in {@link LiveServerContent} messages. Note that the transcriptions are broken up across
2158
+ * messages, so you may only receive small amounts of text per message. For example, if the model says
2159
+ * "How are you today?", the model may transcribe that output across three messages, broken up as "How a", "re yo", "u today?".
2160
+ */
2161
+ outputAudioTranscription?: AudioTranscriptionConfig;
2115
2162
  }
2116
2163
 
2117
2164
  /**
@@ -2203,6 +2250,14 @@ export declare interface LiveServerContent {
2203
2250
  * model was not interrupted.
2204
2251
  */
2205
2252
  interrupted?: boolean;
2253
+ /**
2254
+ * Transcription of the audio that was input to the model.
2255
+ */
2256
+ inputTranscription?: Transcription;
2257
+ /**
2258
+ * Transcription of the audio output from the model.
2259
+ */
2260
+ outputTranscription?: Transcription;
2206
2261
  }
2207
2262
 
2208
2263
  /**
@@ -2268,32 +2323,65 @@ export declare class LiveSession {
2268
2323
  */
2269
2324
  send(request: string | Array<string | Part>, turnComplete?: boolean): Promise<void>;
2270
2325
  /**
2271
- * Sends realtime input to the server.
2326
+ * Sends text to the server in realtime.
2272
2327
  *
2273
- * @param mediaChunks - The media chunks to send.
2328
+ * @example
2329
+ * ```javascript
2330
+ * liveSession.sendTextRealtime("Hello, how are you?");
2331
+ * ```
2332
+ *
2333
+ * @param text - The text data to send.
2274
2334
  * @throws If this session has been closed.
2275
2335
  *
2276
2336
  * @beta
2277
2337
  */
2278
- sendMediaChunks(mediaChunks: GenerativeContentBlob[]): Promise<void>;
2338
+ sendTextRealtime(text: string): Promise<void>;
2279
2339
  /**
2280
- * Sends function responses to the server.
2340
+ * Sends audio data to the server in realtime.
2281
2341
  *
2282
- * @param functionResponses - The function responses to send.
2342
+ * @remarks The server requires that the audio data is base64-encoded 16-bit PCM at 16kHz
2343
+ * little-endian.
2344
+ *
2345
+ * @example
2346
+ * ```javascript
2347
+ * // const pcmData = ... base64-encoded 16-bit PCM at 16kHz little-endian.
2348
+ * const blob = { mimeType: "audio/pcm", data: pcmData };
2349
+ * liveSession.sendAudioRealtime(blob);
2350
+ * ```
2351
+ *
2352
+ * @param blob - The base64-encoded PCM data to send to the server in realtime.
2283
2353
  * @throws If this session has been closed.
2284
2354
  *
2285
2355
  * @beta
2286
2356
  */
2287
- sendFunctionResponses(functionResponses: FunctionResponse[]): Promise<void>;
2357
+ sendAudioRealtime(blob: GenerativeContentBlob): Promise<void>;
2288
2358
  /**
2289
- * Sends a stream of {@link GenerativeContentBlob}.
2359
+ * Sends video data to the server in realtime.
2290
2360
  *
2291
- * @param mediaChunkStream - The stream of {@link GenerativeContentBlob} to send.
2361
+ * @remarks The server requires that the video is sent as individual video frames at 1 FPS. It
2362
+ * is recommended to set `mimeType` to `image/jpeg`.
2363
+ *
2364
+ * @example
2365
+ * ```javascript
2366
+ * // const videoFrame = ... base64-encoded JPEG data
2367
+ * const blob = { mimeType: "image/jpeg", data: videoFrame };
2368
+ * liveSession.sendVideoRealtime(blob);
2369
+ * ```
2370
+ * @param blob - The base64-encoded video data to send to the server in realtime.
2292
2371
  * @throws If this session has been closed.
2293
2372
  *
2294
2373
  * @beta
2295
2374
  */
2296
- sendMediaStream(mediaChunkStream: ReadableStream<GenerativeContentBlob>): Promise<void>;
2375
+ sendVideoRealtime(blob: GenerativeContentBlob): Promise<void>;
2376
+ /**
2377
+ * Sends function responses to the server.
2378
+ *
2379
+ * @param functionResponses - The function responses to send.
2380
+ * @throws If this session has been closed.
2381
+ *
2382
+ * @beta
2383
+ */
2384
+ sendFunctionResponses(functionResponses: FunctionResponse[]): Promise<void>;
2297
2385
  /**
2298
2386
  * Yields messages received from the server.
2299
2387
  * This can only be used by one consumer at a time.
@@ -2311,6 +2399,28 @@ export declare class LiveSession {
2311
2399
  * @beta
2312
2400
  */
2313
2401
  close(): Promise<void>;
2402
+ /**
2403
+ * Sends realtime input to the server.
2404
+ *
2405
+ * @deprecated Use `sendTextRealtime()`, `sendAudioRealtime()`, and `sendVideoRealtime()` instead.
2406
+ *
2407
+ * @param mediaChunks - The media chunks to send.
2408
+ * @throws If this session has been closed.
2409
+ *
2410
+ * @beta
2411
+ */
2412
+ sendMediaChunks(mediaChunks: GenerativeContentBlob[]): Promise<void>;
2413
+ /**
2414
+ * @deprecated Use `sendTextRealtime()`, `sendAudioRealtime()`, and `sendVideoRealtime()` instead.
2415
+ *
2416
+ * Sends a stream of {@link GenerativeContentBlob}.
2417
+ *
2418
+ * @param mediaChunkStream - The stream of {@link GenerativeContentBlob} to send.
2419
+ * @throws If this session has been closed.
2420
+ *
2421
+ * @beta
2422
+ */
2423
+ sendMediaStream(mediaChunkStream: ReadableStream<GenerativeContentBlob>): Promise<void>;
2314
2424
  }
2315
2425
 
2316
2426
  /**
@@ -3016,6 +3126,20 @@ export declare interface ToolConfig {
3016
3126
  functionCallingConfig?: FunctionCallingConfig;
3017
3127
  }
3018
3128
 
3129
+ /**
3130
+ * Transcription of audio. This can be returned from a {@link LiveGenerativeModel} if transcription
3131
+ * is enabled with the `inputAudioTranscription` or `outputAudioTranscription` properties on
3132
+ * the {@link LiveGenerationConfig}.
3133
+ *
3134
+ * @beta
3135
+ */
3136
+ export declare interface Transcription {
3137
+ /**
3138
+ * The text transcription of the audio.
3139
+ */
3140
+ text?: string;
3141
+ }
3142
+
3019
3143
  /**
3020
3144
  * A type that includes all specific Schema types.
3021
3145
  * @public