voice-router-dev 0.9.4 → 0.9.6

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
@@ -223,6 +223,9 @@ var listenTranscribeQueryParams = zod.object({
223
223
  diarize: zod.boolean().optional().describe(
224
224
  "Recognize speaker changes. Each word in the transcript will be assigned a speaker number starting at 0"
225
225
  ),
226
+ diarize_model: zod.enum(["latest", "v1", "v2"]).optional().describe(
227
+ "Select and enable a specific batch diarization model version. If specifying this parameter, you should not set the deprecated `diarize=true` parameter. Not accepted on streaming requests."
228
+ ),
226
229
  dictation: zod.boolean().optional().describe("Dictation mode for controlling formatting with dictated speech"),
227
230
  encoding: zod.enum(["linear16", "flac", "mulaw", "amr-nb", "amr-wb", "opus", "speex", "g729"]).optional().describe("Specify the expected encoding of your submitted audio"),
228
231
  filler_words: zod.boolean().optional().describe('Filler Words can help transcribe interruptions in your audio, like "uh" and "um"'),
@@ -487,6 +490,7 @@ var listenTranscribeResponse = zod.object({
487
490
  );
488
491
  var speakGenerateQueryCallbackMethodDefault = "POST";
489
492
  var speakGenerateQueryModelDefault = "aura-asteria-en";
493
+ var speakGenerateQuerySpeedDefault = 1;
490
494
  var speakGenerateQueryParams = zod.object({
491
495
  callback: zod.string().optional().describe("URL to which we'll make the callback request"),
492
496
  callback_method: zod.enum(["POST", "PUT"]).default(speakGenerateQueryCallbackMethodDefault).describe("HTTP method by which the callback request will be made"),
@@ -598,6 +602,9 @@ var speakGenerateQueryParams = zod.object({
598
602
  zod.enum(["22050"]).describe("Encoding - mp3. Sample rate is fixed and not configurable (22050 Hz).")
599
603
  ).or(zod.enum(["48000"]).describe("Encoding - opus. Sample rate is fixed at 48000 Hz.")).optional().describe(
600
604
  "Sample Rate specifies the sample rate for the output audio. Based on the encoding, different sample rates are supported. For some encodings, the sample rate is not configurable"
605
+ ),
606
+ speed: zod.number().default(speakGenerateQuerySpeedDefault).describe(
607
+ "Speaking rate multiplier that adjusts the pace of generated speech while preserving natural prosody and voice quality. Not yet supported in all languages."
601
608
  )
602
609
  });
603
610
  var speakGenerateHeader = zod.object({
@@ -937,7 +944,7 @@ var createTranscriptBody = zod3.object({
937
944
  "Customize how words are spelled and formatted using to and from values. See [Custom Spelling](https://www.assemblyai.com/docs/pre-recorded-audio/correct-spelling-of-terms) for more details."
938
945
  ),
939
946
  disfluencies: zod3.boolean().optional().describe(
940
- 'Transcribe [Filler Words](https://www.assemblyai.com/docs/pre-recorded-audio/include-filler-words), like "umm", in your media file; can be true or false'
947
+ 'Transcribe [Filler Words](https://www.assemblyai.com/docs/pre-recorded-audio/include-filler-words), like "umm", in your media file; can be true or false. Supported on Universal-3 Pro and Universal-2.'
941
948
  ),
942
949
  domain: zod3.string().nullish().describe(
943
950
  'Enable domain-specific transcription models to improve accuracy for specialized terminology. Set to `"medical-v1"` to enable [Medical Mode](https://www.assemblyai.com/docs/pre-recorded-audio/medical-mode) for improved accuracy of medical terms such as medications, procedures, conditions, and dosages.\n\nSupported languages: English (`en`), Spanish (`es`), German (`de`), French (`fr`). If used with an unsupported language, the parameter is ignored and a warning is returned.\n'
@@ -1244,12 +1251,20 @@ var createTranscriptBody = zod3.object({
1244
1251
  "email_address",
1245
1252
  "event",
1246
1253
  "filename",
1254
+ "gender",
1247
1255
  "gender_sexuality",
1248
1256
  "healthcare_number",
1249
1257
  "injury",
1250
1258
  "ip_address",
1251
1259
  "language",
1252
1260
  "location",
1261
+ "location_address",
1262
+ "location_address_street",
1263
+ "location_city",
1264
+ "location_coordinate",
1265
+ "location_country",
1266
+ "location_state",
1267
+ "location_zip",
1253
1268
  "marital_status",
1254
1269
  "medical_condition",
1255
1270
  "medical_process",
@@ -1258,6 +1273,7 @@ var createTranscriptBody = zod3.object({
1258
1273
  "number_sequence",
1259
1274
  "occupation",
1260
1275
  "organization",
1276
+ "organization_medical_facility",
1261
1277
  "passport_number",
1262
1278
  "password",
1263
1279
  "person_age",
@@ -1266,6 +1282,7 @@ var createTranscriptBody = zod3.object({
1266
1282
  "physical_attribute",
1267
1283
  "political_affiliation",
1268
1284
  "religion",
1285
+ "sexuality",
1269
1286
  "statistics",
1270
1287
  "time",
1271
1288
  "url",
@@ -1273,15 +1290,20 @@ var createTranscriptBody = zod3.object({
1273
1290
  "username",
1274
1291
  "vehicle_id",
1275
1292
  "zodiac_sign"
1276
- ]).describe("The type of PII to redact")
1293
+ ]).describe(
1294
+ "The type of PII to redact. See [PII redaction](https://www.assemblyai.com/docs/streaming/pii-redaction) for the full list of policies and their descriptions."
1295
+ )
1277
1296
  ).optional().describe(
1278
1297
  "The list of PII Redaction policies to enable. See [PII redaction](https://www.assemblyai.com/docs/pii-redaction) for more details."
1279
1298
  ),
1280
1299
  redact_pii_sub: zod3.enum(["entity_name", "hash"]).describe(
1281
- "The replacement logic for detected PII, can be `entity_name` or `hash`. See [PII redaction](https://www.assemblyai.com/docs/pii-redaction) for more details."
1300
+ "The replacement logic for detected PII, can be `entity_name` or `hash`. See [PII redaction](https://www.assemblyai.com/docs/streaming/pii-redaction) for more details."
1282
1301
  ).or(zod3.null()).optional().describe(
1283
1302
  "The replacement logic for detected PII, can be `entity_type` or `hash`. See [PII redaction](https://www.assemblyai.com/docs/pii-redaction) for more details."
1284
1303
  ),
1304
+ redact_pii_return_unredacted: zod3.boolean().optional().describe(
1305
+ "When set to `true`, returns the original unredacted transcript alongside the redacted one in the same response. Requires `redact_pii` to be `true`, otherwise a 400 error is returned.\n\nWhen enabled, the response includes the additional fields `unredacted_text`, `unredacted_words`, and `unredacted_utterances`. The existing `text`, `words`, and `utterances` fields remain fully redacted. When disabled (default), the response is unchanged and contains only the redacted transcript. See [PII redaction](https://www.assemblyai.com/docs/pii-redaction) for more details.\n"
1306
+ ),
1285
1307
  sentiment_analysis: zod3.boolean().optional().describe(
1286
1308
  "Enable [Sentiment Analysis](https://www.assemblyai.com/docs/speech-understanding/analyze-sentiment-of-speech), can be true or false"
1287
1309
  ),
@@ -1379,10 +1401,10 @@ var createTranscriptBody = zod3.object({
1379
1401
  ),
1380
1402
  summary_model: zod3.enum(["informative", "conversational", "catchy"]).optional().describe("The model to summarize the transcript"),
1381
1403
  summary_type: zod3.enum(["bullets", "bullets_verbose", "gist", "headline", "paragraph"]).optional().describe("The type of summary"),
1382
- remove_audio_tags: zod3.enum(["all"]).describe(
1383
- 'Remove [audio event tags](https://www.assemblyai.com/docs/pre-recorded-audio/universal-3-pro#audio-event-tags) from the transcript text. Set to `"all"` to remove all audio tags.\n\nNote: This parameter is only supported for the Universal-3 Pro model.\n'
1404
+ remove_audio_tags: zod3.enum(["all", "speaker"]).describe(
1405
+ 'Universal-3 Pro generates rich transcripts that can include inline annotations such as audio event markers and speaker cues. Set to `"all"` to remove all inline annotations, or `"speaker"` to remove only speaker cues while keeping other annotations.\n\nNote: This parameter is only supported for the Universal-3 Pro model.\n'
1384
1406
  ).or(zod3.null()).optional().describe(
1385
- 'Remove [audio event tags](https://www.assemblyai.com/docs/pre-recorded-audio/universal-3-pro#audio-event-tags) from the transcript text. Set to `"all"` to remove all audio tags.\n\nNote: This parameter is only supported for the Universal-3 Pro model.\n'
1407
+ 'Universal-3 Pro generates rich transcripts that can include inline annotations such as audio event markers and speaker cues. Set to `"all"` to remove all inline annotations, or `"speaker"` to remove only speaker cues while keeping other annotations.\n\nNote: This parameter is only supported for the Universal-3 Pro model.\n'
1386
1408
  ),
1387
1409
  temperature: zod3.number().optional().describe(
1388
1410
  "Control the amount of randomness injected into the model's response. See the [Prompting Guide](https://www.assemblyai.com/docs/pre-recorded-audio/prompting) for more details.\n\nNote: This parameter can only be used with the Universal-3 Pro model.\n"
@@ -1513,7 +1535,7 @@ var createTranscriptResponse = zod3.object({
1513
1535
  "Customize how words are spelled and formatted using to and from values. See [Custom Spelling](https://www.assemblyai.com/docs/pre-recorded-audio/correct-spelling-of-terms) for more details."
1514
1536
  ),
1515
1537
  disfluencies: zod3.boolean().nullish().describe(
1516
- 'Transcribe [Filler Words](https://www.assemblyai.com/docs/pre-recorded-audio/include-filler-words), like "umm", in your media file; can be true or false'
1538
+ 'Transcribe [Filler Words](https://www.assemblyai.com/docs/pre-recorded-audio/include-filler-words), like "umm", in your media file; can be true or false. Supported on Universal-3 Pro and Universal-2.'
1517
1539
  ),
1518
1540
  domain: zod3.string().nullish().describe(
1519
1541
  'The domain-specific model applied to the transcript. When set to `"medical-v1"`, [Medical Mode](https://www.assemblyai.com/docs/pre-recorded-audio/medical-mode) was used to improve accuracy for medical terminology.\n'
@@ -1536,12 +1558,20 @@ var createTranscriptResponse = zod3.object({
1536
1558
  "email_address",
1537
1559
  "event",
1538
1560
  "filename",
1561
+ "gender",
1539
1562
  "gender_sexuality",
1540
1563
  "healthcare_number",
1541
1564
  "injury",
1542
1565
  "ip_address",
1543
1566
  "language",
1544
1567
  "location",
1568
+ "location_address",
1569
+ "location_address_street",
1570
+ "location_city",
1571
+ "location_coordinate",
1572
+ "location_country",
1573
+ "location_state",
1574
+ "location_zip",
1545
1575
  "marital_status",
1546
1576
  "medical_condition",
1547
1577
  "medical_process",
@@ -1550,6 +1580,7 @@ var createTranscriptResponse = zod3.object({
1550
1580
  "number_sequence",
1551
1581
  "occupation",
1552
1582
  "organization",
1583
+ "organization_medical_facility",
1553
1584
  "passport_number",
1554
1585
  "password",
1555
1586
  "person_age",
@@ -1558,6 +1589,7 @@ var createTranscriptResponse = zod3.object({
1558
1589
  "physical_attribute",
1559
1590
  "political_affiliation",
1560
1591
  "religion",
1592
+ "sexuality",
1561
1593
  "statistics",
1562
1594
  "time",
1563
1595
  "url",
@@ -1862,6 +1894,24 @@ var createTranscriptResponse = zod3.object({
1862
1894
  }).optional().describe(
1863
1895
  "Specify options for [Automatic Language Detection](https://www.assemblyai.com/docs/pre-recorded-audio/language-detection)."
1864
1896
  ),
1897
+ metadata: zod3.object({
1898
+ domain_used: zod3.string().nullish().describe(
1899
+ 'The domain-specific model that was applied to the transcription (for example, `"medical-v1"` when [Medical Mode](https://www.assemblyai.com/docs/pre-recorded-audio/medical-mode) was applied), or `null` if no domain-specific model was used. Always present when `metadata` is present.\n'
1900
+ ),
1901
+ warnings: zod3.array(
1902
+ zod3.object({
1903
+ message: zod3.string().describe("A human-readable description of the warning.")
1904
+ }).describe(
1905
+ "A warning message emitted while processing a transcription request. Warnings are surfaced on the transcript response under `metadata.warnings`.\n"
1906
+ )
1907
+ ).optional().describe(
1908
+ "Warning messages emitted while processing the request. Each warning is an object with a human-readable `message`. When there are no warnings to report, this field is omitted from the `metadata` object entirely.\n"
1909
+ )
1910
+ }).describe(
1911
+ "Additional metadata about the transcription returned on the `Transcript` object under `metadata`. Only present when there is information to report \u2014 when all of its fields would be empty, the `metadata` object is omitted from the response entirely.\n"
1912
+ ).or(zod3.null()).optional().describe(
1913
+ "Additional metadata about the transcription, including any warnings emitted while processing the request. Only present when there is information to report; if no fields would be populated, `metadata` is omitted from the response entirely.\n"
1914
+ ),
1865
1915
  multichannel: zod3.boolean().nullish().describe(
1866
1916
  "Whether [Multichannel transcription](https://www.assemblyai.com/docs/pre-recorded-audio/transcribe-multiple-audio-channels) was enabled in the transcription request, either true or false"
1867
1917
  ),
@@ -1909,12 +1959,20 @@ var createTranscriptResponse = zod3.object({
1909
1959
  "email_address",
1910
1960
  "event",
1911
1961
  "filename",
1962
+ "gender",
1912
1963
  "gender_sexuality",
1913
1964
  "healthcare_number",
1914
1965
  "injury",
1915
1966
  "ip_address",
1916
1967
  "language",
1917
1968
  "location",
1969
+ "location_address",
1970
+ "location_address_street",
1971
+ "location_city",
1972
+ "location_coordinate",
1973
+ "location_country",
1974
+ "location_state",
1975
+ "location_zip",
1918
1976
  "marital_status",
1919
1977
  "medical_condition",
1920
1978
  "medical_process",
@@ -1923,6 +1981,7 @@ var createTranscriptResponse = zod3.object({
1923
1981
  "number_sequence",
1924
1982
  "occupation",
1925
1983
  "organization",
1984
+ "organization_medical_facility",
1926
1985
  "passport_number",
1927
1986
  "password",
1928
1987
  "person_age",
@@ -1931,6 +1990,7 @@ var createTranscriptResponse = zod3.object({
1931
1990
  "physical_attribute",
1932
1991
  "political_affiliation",
1933
1992
  "religion",
1993
+ "sexuality",
1934
1994
  "statistics",
1935
1995
  "time",
1936
1996
  "url",
@@ -1938,12 +1998,17 @@ var createTranscriptResponse = zod3.object({
1938
1998
  "username",
1939
1999
  "vehicle_id",
1940
2000
  "zodiac_sign"
1941
- ]).describe("The type of PII to redact")
2001
+ ]).describe(
2002
+ "The type of PII to redact. See [PII redaction](https://www.assemblyai.com/docs/streaming/pii-redaction) for the full list of policies and their descriptions."
2003
+ )
1942
2004
  ).nullish().describe(
1943
2005
  "The list of PII Redaction policies that were enabled, if PII Redaction is enabled.\nSee [PII redaction](https://www.assemblyai.com/docs/pii-redaction) for more information.\n"
1944
2006
  ),
1945
2007
  redact_pii_sub: zod3.enum(["entity_name", "hash"]).optional().describe(
1946
- "The replacement logic for detected PII, can be `entity_name` or `hash`. See [PII redaction](https://www.assemblyai.com/docs/pii-redaction) for more details."
2008
+ "The replacement logic for detected PII, can be `entity_name` or `hash`. See [PII redaction](https://www.assemblyai.com/docs/streaming/pii-redaction) for more details."
2009
+ ),
2010
+ redact_pii_return_unredacted: zod3.boolean().nullish().describe(
2011
+ "Whether the original unredacted transcript was also returned alongside the redacted one. When `true`, the response includes `unredacted_text`, `unredacted_words`, and `unredacted_utterances`. See [PII redaction](https://www.assemblyai.com/docs/pii-redaction) for more information.\n"
1947
2012
  ),
1948
2013
  sentiment_analysis: zod3.boolean().nullish().describe(
1949
2014
  "Whether [Sentiment Analysis](https://www.assemblyai.com/docs/speech-understanding/analyze-sentiment-of-speech) is enabled, can be true or false"
@@ -2080,20 +2145,23 @@ var createTranscriptResponse = zod3.object({
2080
2145
  "The generated summary of the media file, if [Summarization](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) is enabled. Deprecated - use [LLM Gateway](https://www.assemblyai.com/docs/llm-gateway/overview) instead for more flexible summaries. See the [updated Summarization page](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) for details."
2081
2146
  ),
2082
2147
  summary_model: zod3.string().nullish().describe(
2083
- "The Summarization model used to generate the summary,\nif [Summarization](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts#summary-models) is enabled. Deprecated - use [LLM Gateway](https://www.assemblyai.com/docs/llm-gateway/overview) instead for more flexible summaries. See the [updated Summarization page](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) for details.\n"
2148
+ "The Summarization model used to generate the summary,\nif [Summarization](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) is enabled. Deprecated - use [LLM Gateway](https://www.assemblyai.com/docs/llm-gateway/overview) instead for more flexible summaries. See the [updated Summarization page](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) for details.\n"
2084
2149
  ),
2085
2150
  summary_type: zod3.string().nullish().describe(
2086
- "The type of summary generated, if [Summarization](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts#summary-types) is enabled. Deprecated - use [LLM Gateway](https://www.assemblyai.com/docs/llm-gateway/overview) instead for more flexible summaries. See the [updated Summarization page](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) for details."
2151
+ "The type of summary generated, if [Summarization](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) is enabled. Deprecated - use [LLM Gateway](https://www.assemblyai.com/docs/llm-gateway/overview) instead for more flexible summaries. See the [updated Summarization page](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) for details."
2087
2152
  ),
2088
- remove_audio_tags: zod3.enum(["all"]).describe(
2089
- "Whether [audio event tags](https://www.assemblyai.com/docs/pre-recorded-audio/universal-3-pro#audio-event-tags) were removed from the transcript text.\n\nNote: This parameter is only supported for the Universal-3 Pro model.\n"
2153
+ remove_audio_tags: zod3.enum(["all", "speaker"]).describe(
2154
+ 'Universal-3 Pro generates rich transcripts that can include inline annotations such as audio event markers and speaker cues. Set to `"all"` to remove all inline annotations, or `"speaker"` to remove only speaker cues while keeping other annotations.\n\nNote: This parameter is only supported for the Universal-3 Pro model.\n'
2090
2155
  ).or(zod3.null()).optional().describe(
2091
- "Whether [audio event tags](https://www.assemblyai.com/docs/pre-recorded-audio/universal-3-pro#audio-event-tags) were removed from the transcript text.\n\nNote: This parameter is only supported for the Universal-3 Pro model.\n"
2156
+ 'Universal-3 Pro generates rich transcripts that can include inline annotations such as audio event markers and speaker cues. Set to `"all"` to remove all inline annotations, or `"speaker"` to remove only speaker cues while keeping other annotations.\n\nNote: This parameter is only supported for the Universal-3 Pro model.\n'
2092
2157
  ),
2093
2158
  temperature: zod3.number().nullish().describe(
2094
2159
  "The temperature that was used for the model's response. See the [Prompting Guide](https://www.assemblyai.com/docs/pre-recorded-audio/prompting) for more details.\n\nNote: This parameter can only be used with the Universal-3 Pro model.\n"
2095
2160
  ),
2096
2161
  text: zod3.string().nullish().describe("The textual transcript of your media file"),
2162
+ unredacted_text: zod3.string().nullish().describe(
2163
+ "The original textual transcript of your media file before PII redaction was applied. Only returned when `redact_pii_return_unredacted` was set to `true` on the transcription request, otherwise this field is omitted and the `text` field remains fully redacted. See [PII redaction](https://www.assemblyai.com/docs/pii-redaction) for more information.\n"
2164
+ ),
2097
2165
  throttled: zod3.boolean().nullish().describe(
2098
2166
  "True while a request is throttled and false when a request is no longer throttled"
2099
2167
  ),
@@ -2130,6 +2198,39 @@ var createTranscriptResponse = zod3.object({
2130
2198
  ).nullish().describe(
2131
2199
  "When multichannel or speaker_labels is enabled, a list of turn-by-turn utterance objects.\nSee [Speaker diarization](https://www.assemblyai.com/docs/pre-recorded-audio/label-speakers) and [Multichannel transcription](https://www.assemblyai.com/docs/pre-recorded-audio/transcribe-multiple-audio-channels) for more information.\n"
2132
2200
  ),
2201
+ unredacted_utterances: zod3.array(
2202
+ zod3.object({
2203
+ confidence: zod3.number().describe("The confidence score for the transcript of this utterance"),
2204
+ start: zod3.number().describe("The starting time, in milliseconds, of the utterance in the audio file"),
2205
+ end: zod3.number().describe("The ending time, in milliseconds, of the utterance in the audio file"),
2206
+ text: zod3.string().describe("The text for this utterance"),
2207
+ words: zod3.array(
2208
+ zod3.object({
2209
+ confidence: zod3.number().describe("The confidence score for the transcript of this word"),
2210
+ start: zod3.number().describe("The starting time, in milliseconds, for the word"),
2211
+ end: zod3.number().describe("The ending time, in milliseconds, for the word"),
2212
+ text: zod3.string().describe("The text of the word"),
2213
+ channel: zod3.string().nullish().describe(
2214
+ "The channel of the word. The left and right channels are channels 1 and 2. Additional channels increment the channel number sequentially."
2215
+ ),
2216
+ speaker: zod3.string().nullable().describe(
2217
+ "The speaker of the word if [Speaker Diarization](https://www.assemblyai.com/docs/pre-recorded-audio/label-speakers) is enabled, else null"
2218
+ )
2219
+ })
2220
+ ).describe("The words in the utterance."),
2221
+ channel: zod3.string().nullish().describe(
2222
+ "The channel of this utterance. The left and right channels are channels 1 and 2. Additional channels increment the channel number sequentially."
2223
+ ),
2224
+ speaker: zod3.string().describe(
2225
+ 'The speaker of this utterance, where each speaker is assigned a sequential capital letter - e.g. "A" for Speaker A, "B" for Speaker B, etc.'
2226
+ ),
2227
+ translated_texts: zod3.record(zod3.string(), zod3.string()).optional().describe(
2228
+ 'Translations keyed by language code (e.g., `{"es": "Texto traducido", "de": "\xDCbersetzter Text"}`). Only present when `match_original_utterance` is enabled with translation.'
2229
+ )
2230
+ })
2231
+ ).nullish().describe(
2232
+ "The original turn-by-turn utterance objects before PII redaction was applied. Same shape as `utterances`. Only returned when `redact_pii_return_unredacted` was set to `true` on the transcription request, otherwise this field is omitted and the `utterances` field remains fully redacted. See [PII redaction](https://www.assemblyai.com/docs/pii-redaction) for more information.\n"
2233
+ ),
2133
2234
  webhook_auth: zod3.boolean().describe(
2134
2235
  "Whether [webhook](https://www.assemblyai.com/docs/deployment/webhooks-for-pre-recorded-audio) authentication details were provided"
2135
2236
  ),
@@ -2158,6 +2259,22 @@ var createTranscriptResponse = zod3.object({
2158
2259
  ).nullish().describe(
2159
2260
  "An array of temporally-sequential word objects, one for each word in the transcript.\n"
2160
2261
  ),
2262
+ unredacted_words: zod3.array(
2263
+ zod3.object({
2264
+ confidence: zod3.number().describe("The confidence score for the transcript of this word"),
2265
+ start: zod3.number().describe("The starting time, in milliseconds, for the word"),
2266
+ end: zod3.number().describe("The ending time, in milliseconds, for the word"),
2267
+ text: zod3.string().describe("The text of the word"),
2268
+ channel: zod3.string().nullish().describe(
2269
+ "The channel of the word. The left and right channels are channels 1 and 2. Additional channels increment the channel number sequentially."
2270
+ ),
2271
+ speaker: zod3.string().nullable().describe(
2272
+ "The speaker of the word if [Speaker Diarization](https://www.assemblyai.com/docs/pre-recorded-audio/label-speakers) is enabled, else null"
2273
+ )
2274
+ })
2275
+ ).nullish().describe(
2276
+ "The original temporally-sequential word objects before PII redaction was applied. Same shape as `words`. Only returned when `redact_pii_return_unredacted` was set to `true` on the transcription request, otherwise this field is omitted and the `words` field remains fully redacted. See [PII redaction](https://www.assemblyai.com/docs/pii-redaction) for more information.\n"
2277
+ ),
2161
2278
  acoustic_model: zod3.string().describe("This parameter does not currently have any functionality attached to it."),
2162
2279
  custom_topics: zod3.boolean().nullish().describe("This parameter does not currently have any functionality attached to it."),
2163
2280
  language_model: zod3.string().describe("This parameter does not currently have any functionality attached to it."),
@@ -2329,7 +2446,7 @@ var getTranscriptResponse = zod3.object({
2329
2446
  "Customize how words are spelled and formatted using to and from values. See [Custom Spelling](https://www.assemblyai.com/docs/pre-recorded-audio/correct-spelling-of-terms) for more details."
2330
2447
  ),
2331
2448
  disfluencies: zod3.boolean().nullish().describe(
2332
- 'Transcribe [Filler Words](https://www.assemblyai.com/docs/pre-recorded-audio/include-filler-words), like "umm", in your media file; can be true or false'
2449
+ 'Transcribe [Filler Words](https://www.assemblyai.com/docs/pre-recorded-audio/include-filler-words), like "umm", in your media file; can be true or false. Supported on Universal-3 Pro and Universal-2.'
2333
2450
  ),
2334
2451
  domain: zod3.string().nullish().describe(
2335
2452
  'The domain-specific model applied to the transcript. When set to `"medical-v1"`, [Medical Mode](https://www.assemblyai.com/docs/pre-recorded-audio/medical-mode) was used to improve accuracy for medical terminology.\n'
@@ -2352,12 +2469,20 @@ var getTranscriptResponse = zod3.object({
2352
2469
  "email_address",
2353
2470
  "event",
2354
2471
  "filename",
2472
+ "gender",
2355
2473
  "gender_sexuality",
2356
2474
  "healthcare_number",
2357
2475
  "injury",
2358
2476
  "ip_address",
2359
2477
  "language",
2360
2478
  "location",
2479
+ "location_address",
2480
+ "location_address_street",
2481
+ "location_city",
2482
+ "location_coordinate",
2483
+ "location_country",
2484
+ "location_state",
2485
+ "location_zip",
2361
2486
  "marital_status",
2362
2487
  "medical_condition",
2363
2488
  "medical_process",
@@ -2366,6 +2491,7 @@ var getTranscriptResponse = zod3.object({
2366
2491
  "number_sequence",
2367
2492
  "occupation",
2368
2493
  "organization",
2494
+ "organization_medical_facility",
2369
2495
  "passport_number",
2370
2496
  "password",
2371
2497
  "person_age",
@@ -2374,6 +2500,7 @@ var getTranscriptResponse = zod3.object({
2374
2500
  "physical_attribute",
2375
2501
  "political_affiliation",
2376
2502
  "religion",
2503
+ "sexuality",
2377
2504
  "statistics",
2378
2505
  "time",
2379
2506
  "url",
@@ -2678,6 +2805,24 @@ var getTranscriptResponse = zod3.object({
2678
2805
  }).optional().describe(
2679
2806
  "Specify options for [Automatic Language Detection](https://www.assemblyai.com/docs/pre-recorded-audio/language-detection)."
2680
2807
  ),
2808
+ metadata: zod3.object({
2809
+ domain_used: zod3.string().nullish().describe(
2810
+ 'The domain-specific model that was applied to the transcription (for example, `"medical-v1"` when [Medical Mode](https://www.assemblyai.com/docs/pre-recorded-audio/medical-mode) was applied), or `null` if no domain-specific model was used. Always present when `metadata` is present.\n'
2811
+ ),
2812
+ warnings: zod3.array(
2813
+ zod3.object({
2814
+ message: zod3.string().describe("A human-readable description of the warning.")
2815
+ }).describe(
2816
+ "A warning message emitted while processing a transcription request. Warnings are surfaced on the transcript response under `metadata.warnings`.\n"
2817
+ )
2818
+ ).optional().describe(
2819
+ "Warning messages emitted while processing the request. Each warning is an object with a human-readable `message`. When there are no warnings to report, this field is omitted from the `metadata` object entirely.\n"
2820
+ )
2821
+ }).describe(
2822
+ "Additional metadata about the transcription returned on the `Transcript` object under `metadata`. Only present when there is information to report \u2014 when all of its fields would be empty, the `metadata` object is omitted from the response entirely.\n"
2823
+ ).or(zod3.null()).optional().describe(
2824
+ "Additional metadata about the transcription, including any warnings emitted while processing the request. Only present when there is information to report; if no fields would be populated, `metadata` is omitted from the response entirely.\n"
2825
+ ),
2681
2826
  multichannel: zod3.boolean().nullish().describe(
2682
2827
  "Whether [Multichannel transcription](https://www.assemblyai.com/docs/pre-recorded-audio/transcribe-multiple-audio-channels) was enabled in the transcription request, either true or false"
2683
2828
  ),
@@ -2725,12 +2870,20 @@ var getTranscriptResponse = zod3.object({
2725
2870
  "email_address",
2726
2871
  "event",
2727
2872
  "filename",
2873
+ "gender",
2728
2874
  "gender_sexuality",
2729
2875
  "healthcare_number",
2730
2876
  "injury",
2731
2877
  "ip_address",
2732
2878
  "language",
2733
2879
  "location",
2880
+ "location_address",
2881
+ "location_address_street",
2882
+ "location_city",
2883
+ "location_coordinate",
2884
+ "location_country",
2885
+ "location_state",
2886
+ "location_zip",
2734
2887
  "marital_status",
2735
2888
  "medical_condition",
2736
2889
  "medical_process",
@@ -2739,6 +2892,7 @@ var getTranscriptResponse = zod3.object({
2739
2892
  "number_sequence",
2740
2893
  "occupation",
2741
2894
  "organization",
2895
+ "organization_medical_facility",
2742
2896
  "passport_number",
2743
2897
  "password",
2744
2898
  "person_age",
@@ -2747,6 +2901,7 @@ var getTranscriptResponse = zod3.object({
2747
2901
  "physical_attribute",
2748
2902
  "political_affiliation",
2749
2903
  "religion",
2904
+ "sexuality",
2750
2905
  "statistics",
2751
2906
  "time",
2752
2907
  "url",
@@ -2754,12 +2909,17 @@ var getTranscriptResponse = zod3.object({
2754
2909
  "username",
2755
2910
  "vehicle_id",
2756
2911
  "zodiac_sign"
2757
- ]).describe("The type of PII to redact")
2912
+ ]).describe(
2913
+ "The type of PII to redact. See [PII redaction](https://www.assemblyai.com/docs/streaming/pii-redaction) for the full list of policies and their descriptions."
2914
+ )
2758
2915
  ).nullish().describe(
2759
2916
  "The list of PII Redaction policies that were enabled, if PII Redaction is enabled.\nSee [PII redaction](https://www.assemblyai.com/docs/pii-redaction) for more information.\n"
2760
2917
  ),
2761
2918
  redact_pii_sub: zod3.enum(["entity_name", "hash"]).optional().describe(
2762
- "The replacement logic for detected PII, can be `entity_name` or `hash`. See [PII redaction](https://www.assemblyai.com/docs/pii-redaction) for more details."
2919
+ "The replacement logic for detected PII, can be `entity_name` or `hash`. See [PII redaction](https://www.assemblyai.com/docs/streaming/pii-redaction) for more details."
2920
+ ),
2921
+ redact_pii_return_unredacted: zod3.boolean().nullish().describe(
2922
+ "Whether the original unredacted transcript was also returned alongside the redacted one. When `true`, the response includes `unredacted_text`, `unredacted_words`, and `unredacted_utterances`. See [PII redaction](https://www.assemblyai.com/docs/pii-redaction) for more information.\n"
2763
2923
  ),
2764
2924
  sentiment_analysis: zod3.boolean().nullish().describe(
2765
2925
  "Whether [Sentiment Analysis](https://www.assemblyai.com/docs/speech-understanding/analyze-sentiment-of-speech) is enabled, can be true or false"
@@ -2896,20 +3056,23 @@ var getTranscriptResponse = zod3.object({
2896
3056
  "The generated summary of the media file, if [Summarization](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) is enabled. Deprecated - use [LLM Gateway](https://www.assemblyai.com/docs/llm-gateway/overview) instead for more flexible summaries. See the [updated Summarization page](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) for details."
2897
3057
  ),
2898
3058
  summary_model: zod3.string().nullish().describe(
2899
- "The Summarization model used to generate the summary,\nif [Summarization](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts#summary-models) is enabled. Deprecated - use [LLM Gateway](https://www.assemblyai.com/docs/llm-gateway/overview) instead for more flexible summaries. See the [updated Summarization page](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) for details.\n"
3059
+ "The Summarization model used to generate the summary,\nif [Summarization](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) is enabled. Deprecated - use [LLM Gateway](https://www.assemblyai.com/docs/llm-gateway/overview) instead for more flexible summaries. See the [updated Summarization page](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) for details.\n"
2900
3060
  ),
2901
3061
  summary_type: zod3.string().nullish().describe(
2902
- "The type of summary generated, if [Summarization](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts#summary-types) is enabled. Deprecated - use [LLM Gateway](https://www.assemblyai.com/docs/llm-gateway/overview) instead for more flexible summaries. See the [updated Summarization page](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) for details."
3062
+ "The type of summary generated, if [Summarization](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) is enabled. Deprecated - use [LLM Gateway](https://www.assemblyai.com/docs/llm-gateway/overview) instead for more flexible summaries. See the [updated Summarization page](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) for details."
2903
3063
  ),
2904
- remove_audio_tags: zod3.enum(["all"]).describe(
2905
- "Whether [audio event tags](https://www.assemblyai.com/docs/pre-recorded-audio/universal-3-pro#audio-event-tags) were removed from the transcript text.\n\nNote: This parameter is only supported for the Universal-3 Pro model.\n"
3064
+ remove_audio_tags: zod3.enum(["all", "speaker"]).describe(
3065
+ 'Universal-3 Pro generates rich transcripts that can include inline annotations such as audio event markers and speaker cues. Set to `"all"` to remove all inline annotations, or `"speaker"` to remove only speaker cues while keeping other annotations.\n\nNote: This parameter is only supported for the Universal-3 Pro model.\n'
2906
3066
  ).or(zod3.null()).optional().describe(
2907
- "Whether [audio event tags](https://www.assemblyai.com/docs/pre-recorded-audio/universal-3-pro#audio-event-tags) were removed from the transcript text.\n\nNote: This parameter is only supported for the Universal-3 Pro model.\n"
3067
+ 'Universal-3 Pro generates rich transcripts that can include inline annotations such as audio event markers and speaker cues. Set to `"all"` to remove all inline annotations, or `"speaker"` to remove only speaker cues while keeping other annotations.\n\nNote: This parameter is only supported for the Universal-3 Pro model.\n'
2908
3068
  ),
2909
3069
  temperature: zod3.number().nullish().describe(
2910
3070
  "The temperature that was used for the model's response. See the [Prompting Guide](https://www.assemblyai.com/docs/pre-recorded-audio/prompting) for more details.\n\nNote: This parameter can only be used with the Universal-3 Pro model.\n"
2911
3071
  ),
2912
3072
  text: zod3.string().nullish().describe("The textual transcript of your media file"),
3073
+ unredacted_text: zod3.string().nullish().describe(
3074
+ "The original textual transcript of your media file before PII redaction was applied. Only returned when `redact_pii_return_unredacted` was set to `true` on the transcription request, otherwise this field is omitted and the `text` field remains fully redacted. See [PII redaction](https://www.assemblyai.com/docs/pii-redaction) for more information.\n"
3075
+ ),
2913
3076
  throttled: zod3.boolean().nullish().describe(
2914
3077
  "True while a request is throttled and false when a request is no longer throttled"
2915
3078
  ),
@@ -2946,6 +3109,39 @@ var getTranscriptResponse = zod3.object({
2946
3109
  ).nullish().describe(
2947
3110
  "When multichannel or speaker_labels is enabled, a list of turn-by-turn utterance objects.\nSee [Speaker diarization](https://www.assemblyai.com/docs/pre-recorded-audio/label-speakers) and [Multichannel transcription](https://www.assemblyai.com/docs/pre-recorded-audio/transcribe-multiple-audio-channels) for more information.\n"
2948
3111
  ),
3112
+ unredacted_utterances: zod3.array(
3113
+ zod3.object({
3114
+ confidence: zod3.number().describe("The confidence score for the transcript of this utterance"),
3115
+ start: zod3.number().describe("The starting time, in milliseconds, of the utterance in the audio file"),
3116
+ end: zod3.number().describe("The ending time, in milliseconds, of the utterance in the audio file"),
3117
+ text: zod3.string().describe("The text for this utterance"),
3118
+ words: zod3.array(
3119
+ zod3.object({
3120
+ confidence: zod3.number().describe("The confidence score for the transcript of this word"),
3121
+ start: zod3.number().describe("The starting time, in milliseconds, for the word"),
3122
+ end: zod3.number().describe("The ending time, in milliseconds, for the word"),
3123
+ text: zod3.string().describe("The text of the word"),
3124
+ channel: zod3.string().nullish().describe(
3125
+ "The channel of the word. The left and right channels are channels 1 and 2. Additional channels increment the channel number sequentially."
3126
+ ),
3127
+ speaker: zod3.string().nullable().describe(
3128
+ "The speaker of the word if [Speaker Diarization](https://www.assemblyai.com/docs/pre-recorded-audio/label-speakers) is enabled, else null"
3129
+ )
3130
+ })
3131
+ ).describe("The words in the utterance."),
3132
+ channel: zod3.string().nullish().describe(
3133
+ "The channel of this utterance. The left and right channels are channels 1 and 2. Additional channels increment the channel number sequentially."
3134
+ ),
3135
+ speaker: zod3.string().describe(
3136
+ 'The speaker of this utterance, where each speaker is assigned a sequential capital letter - e.g. "A" for Speaker A, "B" for Speaker B, etc.'
3137
+ ),
3138
+ translated_texts: zod3.record(zod3.string(), zod3.string()).optional().describe(
3139
+ 'Translations keyed by language code (e.g., `{"es": "Texto traducido", "de": "\xDCbersetzter Text"}`). Only present when `match_original_utterance` is enabled with translation.'
3140
+ )
3141
+ })
3142
+ ).nullish().describe(
3143
+ "The original turn-by-turn utterance objects before PII redaction was applied. Same shape as `utterances`. Only returned when `redact_pii_return_unredacted` was set to `true` on the transcription request, otherwise this field is omitted and the `utterances` field remains fully redacted. See [PII redaction](https://www.assemblyai.com/docs/pii-redaction) for more information.\n"
3144
+ ),
2949
3145
  webhook_auth: zod3.boolean().describe(
2950
3146
  "Whether [webhook](https://www.assemblyai.com/docs/deployment/webhooks-for-pre-recorded-audio) authentication details were provided"
2951
3147
  ),
@@ -2974,6 +3170,22 @@ var getTranscriptResponse = zod3.object({
2974
3170
  ).nullish().describe(
2975
3171
  "An array of temporally-sequential word objects, one for each word in the transcript.\n"
2976
3172
  ),
3173
+ unredacted_words: zod3.array(
3174
+ zod3.object({
3175
+ confidence: zod3.number().describe("The confidence score for the transcript of this word"),
3176
+ start: zod3.number().describe("The starting time, in milliseconds, for the word"),
3177
+ end: zod3.number().describe("The ending time, in milliseconds, for the word"),
3178
+ text: zod3.string().describe("The text of the word"),
3179
+ channel: zod3.string().nullish().describe(
3180
+ "The channel of the word. The left and right channels are channels 1 and 2. Additional channels increment the channel number sequentially."
3181
+ ),
3182
+ speaker: zod3.string().nullable().describe(
3183
+ "The speaker of the word if [Speaker Diarization](https://www.assemblyai.com/docs/pre-recorded-audio/label-speakers) is enabled, else null"
3184
+ )
3185
+ })
3186
+ ).nullish().describe(
3187
+ "The original temporally-sequential word objects before PII redaction was applied. Same shape as `words`. Only returned when `redact_pii_return_unredacted` was set to `true` on the transcription request, otherwise this field is omitted and the `words` field remains fully redacted. See [PII redaction](https://www.assemblyai.com/docs/pii-redaction) for more information.\n"
3188
+ ),
2977
3189
  acoustic_model: zod3.string().describe("This parameter does not currently have any functionality attached to it."),
2978
3190
  custom_topics: zod3.boolean().nullish().describe("This parameter does not currently have any functionality attached to it."),
2979
3191
  language_model: zod3.string().describe("This parameter does not currently have any functionality attached to it."),
@@ -3106,7 +3318,7 @@ var deleteTranscriptResponse = zod3.object({
3106
3318
  "Customize how words are spelled and formatted using to and from values. See [Custom Spelling](https://www.assemblyai.com/docs/pre-recorded-audio/correct-spelling-of-terms) for more details."
3107
3319
  ),
3108
3320
  disfluencies: zod3.boolean().nullish().describe(
3109
- 'Transcribe [Filler Words](https://www.assemblyai.com/docs/pre-recorded-audio/include-filler-words), like "umm", in your media file; can be true or false'
3321
+ 'Transcribe [Filler Words](https://www.assemblyai.com/docs/pre-recorded-audio/include-filler-words), like "umm", in your media file; can be true or false. Supported on Universal-3 Pro and Universal-2.'
3110
3322
  ),
3111
3323
  domain: zod3.string().nullish().describe(
3112
3324
  'The domain-specific model applied to the transcript. When set to `"medical-v1"`, [Medical Mode](https://www.assemblyai.com/docs/pre-recorded-audio/medical-mode) was used to improve accuracy for medical terminology.\n'
@@ -3129,12 +3341,20 @@ var deleteTranscriptResponse = zod3.object({
3129
3341
  "email_address",
3130
3342
  "event",
3131
3343
  "filename",
3344
+ "gender",
3132
3345
  "gender_sexuality",
3133
3346
  "healthcare_number",
3134
3347
  "injury",
3135
3348
  "ip_address",
3136
3349
  "language",
3137
3350
  "location",
3351
+ "location_address",
3352
+ "location_address_street",
3353
+ "location_city",
3354
+ "location_coordinate",
3355
+ "location_country",
3356
+ "location_state",
3357
+ "location_zip",
3138
3358
  "marital_status",
3139
3359
  "medical_condition",
3140
3360
  "medical_process",
@@ -3143,6 +3363,7 @@ var deleteTranscriptResponse = zod3.object({
3143
3363
  "number_sequence",
3144
3364
  "occupation",
3145
3365
  "organization",
3366
+ "organization_medical_facility",
3146
3367
  "passport_number",
3147
3368
  "password",
3148
3369
  "person_age",
@@ -3151,6 +3372,7 @@ var deleteTranscriptResponse = zod3.object({
3151
3372
  "physical_attribute",
3152
3373
  "political_affiliation",
3153
3374
  "religion",
3375
+ "sexuality",
3154
3376
  "statistics",
3155
3377
  "time",
3156
3378
  "url",
@@ -3455,6 +3677,24 @@ var deleteTranscriptResponse = zod3.object({
3455
3677
  }).optional().describe(
3456
3678
  "Specify options for [Automatic Language Detection](https://www.assemblyai.com/docs/pre-recorded-audio/language-detection)."
3457
3679
  ),
3680
+ metadata: zod3.object({
3681
+ domain_used: zod3.string().nullish().describe(
3682
+ 'The domain-specific model that was applied to the transcription (for example, `"medical-v1"` when [Medical Mode](https://www.assemblyai.com/docs/pre-recorded-audio/medical-mode) was applied), or `null` if no domain-specific model was used. Always present when `metadata` is present.\n'
3683
+ ),
3684
+ warnings: zod3.array(
3685
+ zod3.object({
3686
+ message: zod3.string().describe("A human-readable description of the warning.")
3687
+ }).describe(
3688
+ "A warning message emitted while processing a transcription request. Warnings are surfaced on the transcript response under `metadata.warnings`.\n"
3689
+ )
3690
+ ).optional().describe(
3691
+ "Warning messages emitted while processing the request. Each warning is an object with a human-readable `message`. When there are no warnings to report, this field is omitted from the `metadata` object entirely.\n"
3692
+ )
3693
+ }).describe(
3694
+ "Additional metadata about the transcription returned on the `Transcript` object under `metadata`. Only present when there is information to report \u2014 when all of its fields would be empty, the `metadata` object is omitted from the response entirely.\n"
3695
+ ).or(zod3.null()).optional().describe(
3696
+ "Additional metadata about the transcription, including any warnings emitted while processing the request. Only present when there is information to report; if no fields would be populated, `metadata` is omitted from the response entirely.\n"
3697
+ ),
3458
3698
  multichannel: zod3.boolean().nullish().describe(
3459
3699
  "Whether [Multichannel transcription](https://www.assemblyai.com/docs/pre-recorded-audio/transcribe-multiple-audio-channels) was enabled in the transcription request, either true or false"
3460
3700
  ),
@@ -3502,12 +3742,20 @@ var deleteTranscriptResponse = zod3.object({
3502
3742
  "email_address",
3503
3743
  "event",
3504
3744
  "filename",
3745
+ "gender",
3505
3746
  "gender_sexuality",
3506
3747
  "healthcare_number",
3507
3748
  "injury",
3508
3749
  "ip_address",
3509
3750
  "language",
3510
3751
  "location",
3752
+ "location_address",
3753
+ "location_address_street",
3754
+ "location_city",
3755
+ "location_coordinate",
3756
+ "location_country",
3757
+ "location_state",
3758
+ "location_zip",
3511
3759
  "marital_status",
3512
3760
  "medical_condition",
3513
3761
  "medical_process",
@@ -3516,6 +3764,7 @@ var deleteTranscriptResponse = zod3.object({
3516
3764
  "number_sequence",
3517
3765
  "occupation",
3518
3766
  "organization",
3767
+ "organization_medical_facility",
3519
3768
  "passport_number",
3520
3769
  "password",
3521
3770
  "person_age",
@@ -3524,6 +3773,7 @@ var deleteTranscriptResponse = zod3.object({
3524
3773
  "physical_attribute",
3525
3774
  "political_affiliation",
3526
3775
  "religion",
3776
+ "sexuality",
3527
3777
  "statistics",
3528
3778
  "time",
3529
3779
  "url",
@@ -3531,12 +3781,17 @@ var deleteTranscriptResponse = zod3.object({
3531
3781
  "username",
3532
3782
  "vehicle_id",
3533
3783
  "zodiac_sign"
3534
- ]).describe("The type of PII to redact")
3784
+ ]).describe(
3785
+ "The type of PII to redact. See [PII redaction](https://www.assemblyai.com/docs/streaming/pii-redaction) for the full list of policies and their descriptions."
3786
+ )
3535
3787
  ).nullish().describe(
3536
3788
  "The list of PII Redaction policies that were enabled, if PII Redaction is enabled.\nSee [PII redaction](https://www.assemblyai.com/docs/pii-redaction) for more information.\n"
3537
3789
  ),
3538
3790
  redact_pii_sub: zod3.enum(["entity_name", "hash"]).optional().describe(
3539
- "The replacement logic for detected PII, can be `entity_name` or `hash`. See [PII redaction](https://www.assemblyai.com/docs/pii-redaction) for more details."
3791
+ "The replacement logic for detected PII, can be `entity_name` or `hash`. See [PII redaction](https://www.assemblyai.com/docs/streaming/pii-redaction) for more details."
3792
+ ),
3793
+ redact_pii_return_unredacted: zod3.boolean().nullish().describe(
3794
+ "Whether the original unredacted transcript was also returned alongside the redacted one. When `true`, the response includes `unredacted_text`, `unredacted_words`, and `unredacted_utterances`. See [PII redaction](https://www.assemblyai.com/docs/pii-redaction) for more information.\n"
3540
3795
  ),
3541
3796
  sentiment_analysis: zod3.boolean().nullish().describe(
3542
3797
  "Whether [Sentiment Analysis](https://www.assemblyai.com/docs/speech-understanding/analyze-sentiment-of-speech) is enabled, can be true or false"
@@ -3673,20 +3928,23 @@ var deleteTranscriptResponse = zod3.object({
3673
3928
  "The generated summary of the media file, if [Summarization](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) is enabled. Deprecated - use [LLM Gateway](https://www.assemblyai.com/docs/llm-gateway/overview) instead for more flexible summaries. See the [updated Summarization page](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) for details."
3674
3929
  ),
3675
3930
  summary_model: zod3.string().nullish().describe(
3676
- "The Summarization model used to generate the summary,\nif [Summarization](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts#summary-models) is enabled. Deprecated - use [LLM Gateway](https://www.assemblyai.com/docs/llm-gateway/overview) instead for more flexible summaries. See the [updated Summarization page](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) for details.\n"
3931
+ "The Summarization model used to generate the summary,\nif [Summarization](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) is enabled. Deprecated - use [LLM Gateway](https://www.assemblyai.com/docs/llm-gateway/overview) instead for more flexible summaries. See the [updated Summarization page](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) for details.\n"
3677
3932
  ),
3678
3933
  summary_type: zod3.string().nullish().describe(
3679
- "The type of summary generated, if [Summarization](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts#summary-types) is enabled. Deprecated - use [LLM Gateway](https://www.assemblyai.com/docs/llm-gateway/overview) instead for more flexible summaries. See the [updated Summarization page](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) for details."
3934
+ "The type of summary generated, if [Summarization](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) is enabled. Deprecated - use [LLM Gateway](https://www.assemblyai.com/docs/llm-gateway/overview) instead for more flexible summaries. See the [updated Summarization page](https://www.assemblyai.com/docs/speech-understanding/summarize-transcripts) for details."
3680
3935
  ),
3681
- remove_audio_tags: zod3.enum(["all"]).describe(
3682
- "Whether [audio event tags](https://www.assemblyai.com/docs/pre-recorded-audio/universal-3-pro#audio-event-tags) were removed from the transcript text.\n\nNote: This parameter is only supported for the Universal-3 Pro model.\n"
3936
+ remove_audio_tags: zod3.enum(["all", "speaker"]).describe(
3937
+ 'Universal-3 Pro generates rich transcripts that can include inline annotations such as audio event markers and speaker cues. Set to `"all"` to remove all inline annotations, or `"speaker"` to remove only speaker cues while keeping other annotations.\n\nNote: This parameter is only supported for the Universal-3 Pro model.\n'
3683
3938
  ).or(zod3.null()).optional().describe(
3684
- "Whether [audio event tags](https://www.assemblyai.com/docs/pre-recorded-audio/universal-3-pro#audio-event-tags) were removed from the transcript text.\n\nNote: This parameter is only supported for the Universal-3 Pro model.\n"
3939
+ 'Universal-3 Pro generates rich transcripts that can include inline annotations such as audio event markers and speaker cues. Set to `"all"` to remove all inline annotations, or `"speaker"` to remove only speaker cues while keeping other annotations.\n\nNote: This parameter is only supported for the Universal-3 Pro model.\n'
3685
3940
  ),
3686
3941
  temperature: zod3.number().nullish().describe(
3687
3942
  "The temperature that was used for the model's response. See the [Prompting Guide](https://www.assemblyai.com/docs/pre-recorded-audio/prompting) for more details.\n\nNote: This parameter can only be used with the Universal-3 Pro model.\n"
3688
3943
  ),
3689
3944
  text: zod3.string().nullish().describe("The textual transcript of your media file"),
3945
+ unredacted_text: zod3.string().nullish().describe(
3946
+ "The original textual transcript of your media file before PII redaction was applied. Only returned when `redact_pii_return_unredacted` was set to `true` on the transcription request, otherwise this field is omitted and the `text` field remains fully redacted. See [PII redaction](https://www.assemblyai.com/docs/pii-redaction) for more information.\n"
3947
+ ),
3690
3948
  throttled: zod3.boolean().nullish().describe(
3691
3949
  "True while a request is throttled and false when a request is no longer throttled"
3692
3950
  ),
@@ -3723,6 +3981,39 @@ var deleteTranscriptResponse = zod3.object({
3723
3981
  ).nullish().describe(
3724
3982
  "When multichannel or speaker_labels is enabled, a list of turn-by-turn utterance objects.\nSee [Speaker diarization](https://www.assemblyai.com/docs/pre-recorded-audio/label-speakers) and [Multichannel transcription](https://www.assemblyai.com/docs/pre-recorded-audio/transcribe-multiple-audio-channels) for more information.\n"
3725
3983
  ),
3984
+ unredacted_utterances: zod3.array(
3985
+ zod3.object({
3986
+ confidence: zod3.number().describe("The confidence score for the transcript of this utterance"),
3987
+ start: zod3.number().describe("The starting time, in milliseconds, of the utterance in the audio file"),
3988
+ end: zod3.number().describe("The ending time, in milliseconds, of the utterance in the audio file"),
3989
+ text: zod3.string().describe("The text for this utterance"),
3990
+ words: zod3.array(
3991
+ zod3.object({
3992
+ confidence: zod3.number().describe("The confidence score for the transcript of this word"),
3993
+ start: zod3.number().describe("The starting time, in milliseconds, for the word"),
3994
+ end: zod3.number().describe("The ending time, in milliseconds, for the word"),
3995
+ text: zod3.string().describe("The text of the word"),
3996
+ channel: zod3.string().nullish().describe(
3997
+ "The channel of the word. The left and right channels are channels 1 and 2. Additional channels increment the channel number sequentially."
3998
+ ),
3999
+ speaker: zod3.string().nullable().describe(
4000
+ "The speaker of the word if [Speaker Diarization](https://www.assemblyai.com/docs/pre-recorded-audio/label-speakers) is enabled, else null"
4001
+ )
4002
+ })
4003
+ ).describe("The words in the utterance."),
4004
+ channel: zod3.string().nullish().describe(
4005
+ "The channel of this utterance. The left and right channels are channels 1 and 2. Additional channels increment the channel number sequentially."
4006
+ ),
4007
+ speaker: zod3.string().describe(
4008
+ 'The speaker of this utterance, where each speaker is assigned a sequential capital letter - e.g. "A" for Speaker A, "B" for Speaker B, etc.'
4009
+ ),
4010
+ translated_texts: zod3.record(zod3.string(), zod3.string()).optional().describe(
4011
+ 'Translations keyed by language code (e.g., `{"es": "Texto traducido", "de": "\xDCbersetzter Text"}`). Only present when `match_original_utterance` is enabled with translation.'
4012
+ )
4013
+ })
4014
+ ).nullish().describe(
4015
+ "The original turn-by-turn utterance objects before PII redaction was applied. Same shape as `utterances`. Only returned when `redact_pii_return_unredacted` was set to `true` on the transcription request, otherwise this field is omitted and the `utterances` field remains fully redacted. See [PII redaction](https://www.assemblyai.com/docs/pii-redaction) for more information.\n"
4016
+ ),
3726
4017
  webhook_auth: zod3.boolean().describe(
3727
4018
  "Whether [webhook](https://www.assemblyai.com/docs/deployment/webhooks-for-pre-recorded-audio) authentication details were provided"
3728
4019
  ),
@@ -3751,6 +4042,22 @@ var deleteTranscriptResponse = zod3.object({
3751
4042
  ).nullish().describe(
3752
4043
  "An array of temporally-sequential word objects, one for each word in the transcript.\n"
3753
4044
  ),
4045
+ unredacted_words: zod3.array(
4046
+ zod3.object({
4047
+ confidence: zod3.number().describe("The confidence score for the transcript of this word"),
4048
+ start: zod3.number().describe("The starting time, in milliseconds, for the word"),
4049
+ end: zod3.number().describe("The ending time, in milliseconds, for the word"),
4050
+ text: zod3.string().describe("The text of the word"),
4051
+ channel: zod3.string().nullish().describe(
4052
+ "The channel of the word. The left and right channels are channels 1 and 2. Additional channels increment the channel number sequentially."
4053
+ ),
4054
+ speaker: zod3.string().nullable().describe(
4055
+ "The speaker of the word if [Speaker Diarization](https://www.assemblyai.com/docs/pre-recorded-audio/label-speakers) is enabled, else null"
4056
+ )
4057
+ })
4058
+ ).nullish().describe(
4059
+ "The original temporally-sequential word objects before PII redaction was applied. Same shape as `words`. Only returned when `redact_pii_return_unredacted` was set to `true` on the transcription request, otherwise this field is omitted and the `words` field remains fully redacted. See [PII redaction](https://www.assemblyai.com/docs/pii-redaction) for more information.\n"
4060
+ ),
3754
4061
  acoustic_model: zod3.string().describe("This parameter does not currently have any functionality attached to it."),
3755
4062
  custom_topics: zod3.boolean().nullish().describe("This parameter does not currently have any functionality attached to it."),
3756
4063
  language_model: zod3.string().describe("This parameter does not currently have any functionality attached to it."),
@@ -3906,7 +4213,21 @@ var streamingTranscriberParams = zod4.object({
3906
4213
  inactivityTimeout: zod4.number().optional().describe("From SDK v3"),
3907
4214
  speakerLabels: zod4.boolean().optional().describe("From SDK v3"),
3908
4215
  maxSpeakers: zod4.number().optional().describe("From SDK v3"),
3909
- llmGateway: zod4.unknown().optional().describe("From SDK v3")
4216
+ voiceFocus: zod4.unknown().optional().describe("From SDK v3"),
4217
+ voiceFocusThreshold: zod4.number().optional().describe("From SDK v3"),
4218
+ continuousPartials: zod4.boolean().optional().describe("From SDK v3"),
4219
+ interruptionDelay: zod4.number().optional().describe("From SDK v3"),
4220
+ turnLeftPadMs: zod4.number().optional().describe("From SDK v3"),
4221
+ customerSupportAudioCapture: zod4.boolean().optional().describe("From SDK v3"),
4222
+ includePartialTurns: zod4.boolean().optional().describe("From SDK v3"),
4223
+ redactPii: zod4.boolean().optional().describe("From SDK v3"),
4224
+ redactPiiPolicies: zod4.unknown().optional().describe("From SDK v3"),
4225
+ redactPiiSub: zod4.unknown().optional().describe("From SDK v3"),
4226
+ llmGateway: zod4.unknown().optional().describe("From SDK v3"),
4227
+ webhookUrl: zod4.string().optional().describe("From SDK v3"),
4228
+ webhookAuthHeaderName: zod4.string().optional().describe("From SDK v3"),
4229
+ webhookAuthHeaderValue: zod4.string().optional().describe("From SDK v3"),
4230
+ mode: zod4.unknown().describe("From SDK v3")
3910
4231
  });
3911
4232
  var streamingUpdateConfigParams = zod4.object({
3912
4233
  end_utterance_silence_threshold: zod4.number().min(0).max(2e4).optional().describe("The duration threshold in milliseconds"),
@@ -3918,7 +4239,9 @@ var streamingUpdateConfigParams = zod4.object({
3918
4239
  format_turns: zod4.boolean().optional().describe("From SDK v3"),
3919
4240
  keyterms_prompt: zod4.array(zod4.string()).optional().describe("From SDK v3"),
3920
4241
  prompt: zod4.string().optional().describe("From SDK v3"),
3921
- filter_profanity: zod4.boolean().optional().describe("From SDK v3")
4242
+ filter_profanity: zod4.boolean().optional().describe("From SDK v3"),
4243
+ interruption_delay: zod4.number().optional().describe("From SDK v3"),
4244
+ turn_left_pad_ms: zod4.number().optional().describe("From SDK v3")
3922
4245
  });
3923
4246
 
3924
4247
  // src/generated/gladia/api/gladiaControlAPI.zod.ts
@@ -3955,7 +4278,7 @@ var preRecordedControllerInitPreRecordedJobV2BodyTranslationConfigMatchOriginalU
3955
4278
  var preRecordedControllerInitPreRecordedJobV2BodyTranslationConfigLipsyncDefault = true;
3956
4279
  var preRecordedControllerInitPreRecordedJobV2BodyTranslationConfigContextAdaptationDefault = true;
3957
4280
  var preRecordedControllerInitPreRecordedJobV2BodySummarizationConfigTypeDefault = "general";
3958
- var preRecordedControllerInitPreRecordedJobV2BodyAudioToLlmConfigModelDefault = "openai/gpt-3.5-turbo";
4281
+ var preRecordedControllerInitPreRecordedJobV2BodyAudioToLlmConfigModelDefault = "openai/gpt-5.4-nano";
3959
4282
  var preRecordedControllerInitPreRecordedJobV2BodyLanguageConfigLanguagesDefault = [];
3960
4283
  var preRecordedControllerInitPreRecordedJobV2Body = zod5.object({
3961
4284
  custom_vocabulary: zod5.boolean().optional().describe(
@@ -4240,23 +4563,23 @@ var preRecordedControllerInitPreRecordedJobV2Body = zod5.object({
4240
4563
  "Forces the translation to use informal language forms when available in the target language."
4241
4564
  )
4242
4565
  }).optional().describe("**[Beta]** Translation configuration, if `translation` is enabled"),
4243
- summarization: zod5.boolean().optional().describe("**[Beta]** Enable summarization for this audio"),
4566
+ summarization: zod5.boolean().optional().describe("Enable summarization for this audio"),
4244
4567
  summarization_config: zod5.object({
4245
4568
  type: zod5.enum(["general", "bullet_points", "concise"]).describe("The type of summarization to apply").default(preRecordedControllerInitPreRecordedJobV2BodySummarizationConfigTypeDefault).describe("The type of summarization to apply")
4246
- }).optional().describe("**[Beta]** Summarization configuration, if `summarization` is enabled"),
4569
+ }).optional().describe("Summarization configuration, if `summarization` is enabled"),
4247
4570
  named_entity_recognition: zod5.boolean().optional().describe("**[Alpha]** Enable named entity recognition for this audio"),
4248
4571
  custom_spelling: zod5.boolean().optional().describe("**[Alpha]** Enable custom spelling for this audio"),
4249
4572
  custom_spelling_config: zod5.object({
4250
4573
  spelling_dictionary: zod5.record(zod5.string(), zod5.array(zod5.string())).describe("The list of spelling applied on the audio transcription")
4251
4574
  }).optional().describe("**[Alpha]** Custom spelling configuration, if `custom_spelling` is enabled"),
4252
4575
  sentiment_analysis: zod5.boolean().optional().describe("Enable sentiment analysis for this audio"),
4253
- audio_to_llm: zod5.boolean().optional().describe("**[Alpha]** Enable audio to llm processing for this audio"),
4576
+ audio_to_llm: zod5.boolean().optional().describe("Enable audio to LLM processing for this audio"),
4254
4577
  audio_to_llm_config: zod5.object({
4255
4578
  prompts: zod5.array(zod5.array(zod5.unknown())).min(1).describe("The list of prompts applied on the audio transcription"),
4256
4579
  model: zod5.string().default(preRecordedControllerInitPreRecordedJobV2BodyAudioToLlmConfigModelDefault).describe(
4257
4580
  "The model to use for the prompt execution. You can find the list of supported models [here](https://openrouter.ai/models)."
4258
4581
  )
4259
- }).optional().describe("**[Alpha]** Audio to llm configuration, if `audio_to_llm` is enabled"),
4582
+ }).optional().describe("Audio to LLM configuration, if `audio_to_llm` is enabled"),
4260
4583
  pii_redaction: zod5.boolean().optional().describe("Enable PII redaction for this audio"),
4261
4584
  pii_redaction_config: zod5.object({
4262
4585
  entity_types: zod5.enum([
@@ -4498,7 +4821,7 @@ var preRecordedControllerGetPreRecordedJobsV2ResponseItemsItemRequestParamsTrans
4498
4821
  var preRecordedControllerGetPreRecordedJobsV2ResponseItemsItemRequestParamsTranslationConfigLipsyncDefault = true;
4499
4822
  var preRecordedControllerGetPreRecordedJobsV2ResponseItemsItemRequestParamsTranslationConfigContextAdaptationDefault = true;
4500
4823
  var preRecordedControllerGetPreRecordedJobsV2ResponseItemsItemRequestParamsSummarizationConfigTypeDefault = "general";
4501
- var preRecordedControllerGetPreRecordedJobsV2ResponseItemsItemRequestParamsAudioToLlmConfigModelDefault = "openai/gpt-3.5-turbo";
4824
+ var preRecordedControllerGetPreRecordedJobsV2ResponseItemsItemRequestParamsAudioToLlmConfigModelDefault = "openai/gpt-5.4-nano";
4502
4825
  var preRecordedControllerGetPreRecordedJobsV2ResponseItemsItemRequestParamsLanguageConfigLanguagesDefault = [];
4503
4826
  var preRecordedControllerGetPreRecordedJobsV2ResponseItemsItemResultTranscriptionUtterancesItemChannelMin = 0;
4504
4827
  var preRecordedControllerGetPreRecordedJobsV2ResponseItemsItemResultTranscriptionUtterancesItemSpeakerMin = 0;
@@ -4842,12 +5165,12 @@ var preRecordedControllerGetPreRecordedJobsV2Response = zod5.object({
4842
5165
  "Forces the translation to use informal language forms when available in the target language."
4843
5166
  )
4844
5167
  }).optional().describe("**[Beta]** Translation configuration, if `translation` is enabled"),
4845
- summarization: zod5.boolean().optional().describe("**[Beta]** Enable summarization for this audio"),
5168
+ summarization: zod5.boolean().optional().describe("Enable summarization for this audio"),
4846
5169
  summarization_config: zod5.object({
4847
5170
  type: zod5.enum(["general", "bullet_points", "concise"]).describe("The type of summarization to apply").default(
4848
5171
  preRecordedControllerGetPreRecordedJobsV2ResponseItemsItemRequestParamsSummarizationConfigTypeDefault
4849
5172
  ).describe("The type of summarization to apply")
4850
- }).optional().describe("**[Beta]** Summarization configuration, if `summarization` is enabled"),
5173
+ }).optional().describe("Summarization configuration, if `summarization` is enabled"),
4851
5174
  named_entity_recognition: zod5.boolean().optional().describe("**[Alpha]** Enable named entity recognition for this audio"),
4852
5175
  custom_spelling: zod5.boolean().optional().describe("**[Alpha]** Enable custom spelling for this audio"),
4853
5176
  custom_spelling_config: zod5.object({
@@ -4856,7 +5179,7 @@ var preRecordedControllerGetPreRecordedJobsV2Response = zod5.object({
4856
5179
  "**[Alpha]** Custom spelling configuration, if `custom_spelling` is enabled"
4857
5180
  ),
4858
5181
  sentiment_analysis: zod5.boolean().optional().describe("Enable sentiment analysis for this audio"),
4859
- audio_to_llm: zod5.boolean().optional().describe("**[Alpha]** Enable audio to llm processing for this audio"),
5182
+ audio_to_llm: zod5.boolean().optional().describe("Enable audio to LLM processing for this audio"),
4860
5183
  audio_to_llm_config: zod5.object({
4861
5184
  prompts: zod5.array(zod5.array(zod5.unknown())).min(1).describe("The list of prompts applied on the audio transcription"),
4862
5185
  model: zod5.string().default(
@@ -4864,7 +5187,7 @@ var preRecordedControllerGetPreRecordedJobsV2Response = zod5.object({
4864
5187
  ).describe(
4865
5188
  "The model to use for the prompt execution. You can find the list of supported models [here](https://openrouter.ai/models)."
4866
5189
  )
4867
- }).optional().describe("**[Alpha]** Audio to llm configuration, if `audio_to_llm` is enabled"),
5190
+ }).optional().describe("Audio to LLM configuration, if `audio_to_llm` is enabled"),
4868
5191
  pii_redaction: zod5.boolean().optional().describe("Enable PII redaction for this audio"),
4869
5192
  pii_redaction_config: zod5.object({
4870
5193
  entity_types: zod5.enum([
@@ -5989,7 +6312,7 @@ var preRecordedControllerGetPreRecordedJobV2ResponseRequestParamsTranslationConf
5989
6312
  var preRecordedControllerGetPreRecordedJobV2ResponseRequestParamsTranslationConfigLipsyncDefault = true;
5990
6313
  var preRecordedControllerGetPreRecordedJobV2ResponseRequestParamsTranslationConfigContextAdaptationDefault = true;
5991
6314
  var preRecordedControllerGetPreRecordedJobV2ResponseRequestParamsSummarizationConfigTypeDefault = "general";
5992
- var preRecordedControllerGetPreRecordedJobV2ResponseRequestParamsAudioToLlmConfigModelDefault = "openai/gpt-3.5-turbo";
6315
+ var preRecordedControllerGetPreRecordedJobV2ResponseRequestParamsAudioToLlmConfigModelDefault = "openai/gpt-5.4-nano";
5993
6316
  var preRecordedControllerGetPreRecordedJobV2ResponseRequestParamsLanguageConfigLanguagesDefault = [];
5994
6317
  var preRecordedControllerGetPreRecordedJobV2ResponseResultTranscriptionUtterancesItemChannelMin = 0;
5995
6318
  var preRecordedControllerGetPreRecordedJobV2ResponseResultTranscriptionUtterancesItemSpeakerMin = 0;
@@ -6326,19 +6649,19 @@ var preRecordedControllerGetPreRecordedJobV2Response = zod5.object({
6326
6649
  "Forces the translation to use informal language forms when available in the target language."
6327
6650
  )
6328
6651
  }).optional().describe("**[Beta]** Translation configuration, if `translation` is enabled"),
6329
- summarization: zod5.boolean().optional().describe("**[Beta]** Enable summarization for this audio"),
6652
+ summarization: zod5.boolean().optional().describe("Enable summarization for this audio"),
6330
6653
  summarization_config: zod5.object({
6331
6654
  type: zod5.enum(["general", "bullet_points", "concise"]).describe("The type of summarization to apply").default(
6332
6655
  preRecordedControllerGetPreRecordedJobV2ResponseRequestParamsSummarizationConfigTypeDefault
6333
6656
  ).describe("The type of summarization to apply")
6334
- }).optional().describe("**[Beta]** Summarization configuration, if `summarization` is enabled"),
6657
+ }).optional().describe("Summarization configuration, if `summarization` is enabled"),
6335
6658
  named_entity_recognition: zod5.boolean().optional().describe("**[Alpha]** Enable named entity recognition for this audio"),
6336
6659
  custom_spelling: zod5.boolean().optional().describe("**[Alpha]** Enable custom spelling for this audio"),
6337
6660
  custom_spelling_config: zod5.object({
6338
6661
  spelling_dictionary: zod5.record(zod5.string(), zod5.array(zod5.string())).describe("The list of spelling applied on the audio transcription")
6339
6662
  }).optional().describe("**[Alpha]** Custom spelling configuration, if `custom_spelling` is enabled"),
6340
6663
  sentiment_analysis: zod5.boolean().optional().describe("Enable sentiment analysis for this audio"),
6341
- audio_to_llm: zod5.boolean().optional().describe("**[Alpha]** Enable audio to llm processing for this audio"),
6664
+ audio_to_llm: zod5.boolean().optional().describe("Enable audio to LLM processing for this audio"),
6342
6665
  audio_to_llm_config: zod5.object({
6343
6666
  prompts: zod5.array(zod5.array(zod5.unknown())).min(1).describe("The list of prompts applied on the audio transcription"),
6344
6667
  model: zod5.string().default(
@@ -6346,7 +6669,7 @@ var preRecordedControllerGetPreRecordedJobV2Response = zod5.object({
6346
6669
  ).describe(
6347
6670
  "The model to use for the prompt execution. You can find the list of supported models [here](https://openrouter.ai/models)."
6348
6671
  )
6349
- }).optional().describe("**[Alpha]** Audio to llm configuration, if `audio_to_llm` is enabled"),
6672
+ }).optional().describe("Audio to LLM configuration, if `audio_to_llm` is enabled"),
6350
6673
  pii_redaction: zod5.boolean().optional().describe("Enable PII redaction for this audio"),
6351
6674
  pii_redaction_config: zod5.object({
6352
6675
  entity_types: zod5.enum([
@@ -7448,7 +7771,7 @@ var transcriptionControllerInitPreRecordedJobV2BodyTranslationConfigMatchOrigina
7448
7771
  var transcriptionControllerInitPreRecordedJobV2BodyTranslationConfigLipsyncDefault = true;
7449
7772
  var transcriptionControllerInitPreRecordedJobV2BodyTranslationConfigContextAdaptationDefault = true;
7450
7773
  var transcriptionControllerInitPreRecordedJobV2BodySummarizationConfigTypeDefault = "general";
7451
- var transcriptionControllerInitPreRecordedJobV2BodyAudioToLlmConfigModelDefault = "openai/gpt-3.5-turbo";
7774
+ var transcriptionControllerInitPreRecordedJobV2BodyAudioToLlmConfigModelDefault = "openai/gpt-5.4-nano";
7452
7775
  var transcriptionControllerInitPreRecordedJobV2BodyLanguageConfigLanguagesDefault = [];
7453
7776
  var transcriptionControllerInitPreRecordedJobV2Body = zod5.object({
7454
7777
  custom_vocabulary: zod5.boolean().optional().describe(
@@ -7737,23 +8060,23 @@ var transcriptionControllerInitPreRecordedJobV2Body = zod5.object({
7737
8060
  "Forces the translation to use informal language forms when available in the target language."
7738
8061
  )
7739
8062
  }).optional().describe("**[Beta]** Translation configuration, if `translation` is enabled"),
7740
- summarization: zod5.boolean().optional().describe("**[Beta]** Enable summarization for this audio"),
8063
+ summarization: zod5.boolean().optional().describe("Enable summarization for this audio"),
7741
8064
  summarization_config: zod5.object({
7742
8065
  type: zod5.enum(["general", "bullet_points", "concise"]).describe("The type of summarization to apply").default(transcriptionControllerInitPreRecordedJobV2BodySummarizationConfigTypeDefault).describe("The type of summarization to apply")
7743
- }).optional().describe("**[Beta]** Summarization configuration, if `summarization` is enabled"),
8066
+ }).optional().describe("Summarization configuration, if `summarization` is enabled"),
7744
8067
  named_entity_recognition: zod5.boolean().optional().describe("**[Alpha]** Enable named entity recognition for this audio"),
7745
8068
  custom_spelling: zod5.boolean().optional().describe("**[Alpha]** Enable custom spelling for this audio"),
7746
8069
  custom_spelling_config: zod5.object({
7747
8070
  spelling_dictionary: zod5.record(zod5.string(), zod5.array(zod5.string())).describe("The list of spelling applied on the audio transcription")
7748
8071
  }).optional().describe("**[Alpha]** Custom spelling configuration, if `custom_spelling` is enabled"),
7749
8072
  sentiment_analysis: zod5.boolean().optional().describe("Enable sentiment analysis for this audio"),
7750
- audio_to_llm: zod5.boolean().optional().describe("**[Alpha]** Enable audio to llm processing for this audio"),
8073
+ audio_to_llm: zod5.boolean().optional().describe("Enable audio to LLM processing for this audio"),
7751
8074
  audio_to_llm_config: zod5.object({
7752
8075
  prompts: zod5.array(zod5.array(zod5.unknown())).min(1).describe("The list of prompts applied on the audio transcription"),
7753
8076
  model: zod5.string().default(transcriptionControllerInitPreRecordedJobV2BodyAudioToLlmConfigModelDefault).describe(
7754
8077
  "The model to use for the prompt execution. You can find the list of supported models [here](https://openrouter.ai/models)."
7755
8078
  )
7756
- }).optional().describe("**[Alpha]** Audio to llm configuration, if `audio_to_llm` is enabled"),
8079
+ }).optional().describe("Audio to LLM configuration, if `audio_to_llm` is enabled"),
7757
8080
  pii_redaction: zod5.boolean().optional().describe("Enable PII redaction for this audio"),
7758
8081
  pii_redaction_config: zod5.object({
7759
8082
  entity_types: zod5.enum([
@@ -7998,7 +8321,7 @@ var transcriptionControllerListV2ResponseItemsItemRequestParamsTranslationConfig
7998
8321
  var transcriptionControllerListV2ResponseItemsItemRequestParamsTranslationConfigLipsyncDefault = true;
7999
8322
  var transcriptionControllerListV2ResponseItemsItemRequestParamsTranslationConfigContextAdaptationDefault = true;
8000
8323
  var transcriptionControllerListV2ResponseItemsItemRequestParamsSummarizationConfigTypeDefault = "general";
8001
- var transcriptionControllerListV2ResponseItemsItemRequestParamsAudioToLlmConfigModelDefault = "openai/gpt-3.5-turbo";
8324
+ var transcriptionControllerListV2ResponseItemsItemRequestParamsAudioToLlmConfigModelDefault = "openai/gpt-5.4-nano";
8002
8325
  var transcriptionControllerListV2ResponseItemsItemRequestParamsLanguageConfigLanguagesDefault = [];
8003
8326
  var transcriptionControllerListV2ResponseItemsItemResultTranscriptionUtterancesItemChannelMin = 0;
8004
8327
  var transcriptionControllerListV2ResponseItemsItemResultTranscriptionUtterancesItemSpeakerMin = 0;
@@ -8387,12 +8710,12 @@ var transcriptionControllerListV2Response = zod5.object({
8387
8710
  "Forces the translation to use informal language forms when available in the target language."
8388
8711
  )
8389
8712
  }).optional().describe("**[Beta]** Translation configuration, if `translation` is enabled"),
8390
- summarization: zod5.boolean().optional().describe("**[Beta]** Enable summarization for this audio"),
8713
+ summarization: zod5.boolean().optional().describe("Enable summarization for this audio"),
8391
8714
  summarization_config: zod5.object({
8392
8715
  type: zod5.enum(["general", "bullet_points", "concise"]).describe("The type of summarization to apply").default(
8393
8716
  transcriptionControllerListV2ResponseItemsItemRequestParamsSummarizationConfigTypeDefault
8394
8717
  ).describe("The type of summarization to apply")
8395
- }).optional().describe("**[Beta]** Summarization configuration, if `summarization` is enabled"),
8718
+ }).optional().describe("Summarization configuration, if `summarization` is enabled"),
8396
8719
  named_entity_recognition: zod5.boolean().optional().describe("**[Alpha]** Enable named entity recognition for this audio"),
8397
8720
  custom_spelling: zod5.boolean().optional().describe("**[Alpha]** Enable custom spelling for this audio"),
8398
8721
  custom_spelling_config: zod5.object({
@@ -8401,7 +8724,7 @@ var transcriptionControllerListV2Response = zod5.object({
8401
8724
  "**[Alpha]** Custom spelling configuration, if `custom_spelling` is enabled"
8402
8725
  ),
8403
8726
  sentiment_analysis: zod5.boolean().optional().describe("Enable sentiment analysis for this audio"),
8404
- audio_to_llm: zod5.boolean().optional().describe("**[Alpha]** Enable audio to llm processing for this audio"),
8727
+ audio_to_llm: zod5.boolean().optional().describe("Enable audio to LLM processing for this audio"),
8405
8728
  audio_to_llm_config: zod5.object({
8406
8729
  prompts: zod5.array(zod5.array(zod5.unknown())).min(1).describe("The list of prompts applied on the audio transcription"),
8407
8730
  model: zod5.string().default(
@@ -8409,7 +8732,7 @@ var transcriptionControllerListV2Response = zod5.object({
8409
8732
  ).describe(
8410
8733
  "The model to use for the prompt execution. You can find the list of supported models [here](https://openrouter.ai/models)."
8411
8734
  )
8412
- }).optional().describe("**[Alpha]** Audio to llm configuration, if `audio_to_llm` is enabled"),
8735
+ }).optional().describe("Audio to LLM configuration, if `audio_to_llm` is enabled"),
8413
8736
  pii_redaction: zod5.boolean().optional().describe("Enable PII redaction for this audio"),
8414
8737
  pii_redaction_config: zod5.object({
8415
8738
  entity_types: zod5.enum([
@@ -10715,7 +11038,7 @@ var transcriptionControllerGetTranscriptV2ResponseRequestParamsTranslationConfig
10715
11038
  var transcriptionControllerGetTranscriptV2ResponseRequestParamsTranslationConfigLipsyncDefault = true;
10716
11039
  var transcriptionControllerGetTranscriptV2ResponseRequestParamsTranslationConfigContextAdaptationDefault = true;
10717
11040
  var transcriptionControllerGetTranscriptV2ResponseRequestParamsSummarizationConfigTypeDefault = "general";
10718
- var transcriptionControllerGetTranscriptV2ResponseRequestParamsAudioToLlmConfigModelDefault = "openai/gpt-3.5-turbo";
11041
+ var transcriptionControllerGetTranscriptV2ResponseRequestParamsAudioToLlmConfigModelDefault = "openai/gpt-5.4-nano";
10719
11042
  var transcriptionControllerGetTranscriptV2ResponseRequestParamsLanguageConfigLanguagesDefault = [];
10720
11043
  var transcriptionControllerGetTranscriptV2ResponseResultTranscriptionUtterancesItemChannelMin = 0;
10721
11044
  var transcriptionControllerGetTranscriptV2ResponseResultTranscriptionUtterancesItemSpeakerMin = 0;
@@ -11098,19 +11421,19 @@ var transcriptionControllerGetTranscriptV2Response = zod5.discriminatedUnion("ki
11098
11421
  "Forces the translation to use informal language forms when available in the target language."
11099
11422
  )
11100
11423
  }).optional().describe("**[Beta]** Translation configuration, if `translation` is enabled"),
11101
- summarization: zod5.boolean().optional().describe("**[Beta]** Enable summarization for this audio"),
11424
+ summarization: zod5.boolean().optional().describe("Enable summarization for this audio"),
11102
11425
  summarization_config: zod5.object({
11103
11426
  type: zod5.enum(["general", "bullet_points", "concise"]).describe("The type of summarization to apply").default(
11104
11427
  transcriptionControllerGetTranscriptV2ResponseRequestParamsSummarizationConfigTypeDefault
11105
11428
  ).describe("The type of summarization to apply")
11106
- }).optional().describe("**[Beta]** Summarization configuration, if `summarization` is enabled"),
11429
+ }).optional().describe("Summarization configuration, if `summarization` is enabled"),
11107
11430
  named_entity_recognition: zod5.boolean().optional().describe("**[Alpha]** Enable named entity recognition for this audio"),
11108
11431
  custom_spelling: zod5.boolean().optional().describe("**[Alpha]** Enable custom spelling for this audio"),
11109
11432
  custom_spelling_config: zod5.object({
11110
11433
  spelling_dictionary: zod5.record(zod5.string(), zod5.array(zod5.string())).describe("The list of spelling applied on the audio transcription")
11111
11434
  }).optional().describe("**[Alpha]** Custom spelling configuration, if `custom_spelling` is enabled"),
11112
11435
  sentiment_analysis: zod5.boolean().optional().describe("Enable sentiment analysis for this audio"),
11113
- audio_to_llm: zod5.boolean().optional().describe("**[Alpha]** Enable audio to llm processing for this audio"),
11436
+ audio_to_llm: zod5.boolean().optional().describe("Enable audio to LLM processing for this audio"),
11114
11437
  audio_to_llm_config: zod5.object({
11115
11438
  prompts: zod5.array(zod5.array(zod5.unknown())).min(1).describe("The list of prompts applied on the audio transcription"),
11116
11439
  model: zod5.string().default(
@@ -11118,7 +11441,7 @@ var transcriptionControllerGetTranscriptV2Response = zod5.discriminatedUnion("ki
11118
11441
  ).describe(
11119
11442
  "The model to use for the prompt execution. You can find the list of supported models [here](https://openrouter.ai/models)."
11120
11443
  )
11121
- }).optional().describe("**[Alpha]** Audio to llm configuration, if `audio_to_llm` is enabled"),
11444
+ }).optional().describe("Audio to LLM configuration, if `audio_to_llm` is enabled"),
11122
11445
  pii_redaction: zod5.boolean().optional().describe("Enable PII redaction for this audio"),
11123
11446
  pii_redaction_config: zod5.object({
11124
11447
  entity_types: zod5.enum([
@@ -13809,7 +14132,7 @@ var historyControllerGetListV1ResponseItemsItemRequestParamsTranslationConfigMat
13809
14132
  var historyControllerGetListV1ResponseItemsItemRequestParamsTranslationConfigLipsyncDefault = true;
13810
14133
  var historyControllerGetListV1ResponseItemsItemRequestParamsTranslationConfigContextAdaptationDefault = true;
13811
14134
  var historyControllerGetListV1ResponseItemsItemRequestParamsSummarizationConfigTypeDefault = "general";
13812
- var historyControllerGetListV1ResponseItemsItemRequestParamsAudioToLlmConfigModelDefault = "openai/gpt-3.5-turbo";
14135
+ var historyControllerGetListV1ResponseItemsItemRequestParamsAudioToLlmConfigModelDefault = "openai/gpt-5.4-nano";
13813
14136
  var historyControllerGetListV1ResponseItemsItemRequestParamsLanguageConfigLanguagesDefault = [];
13814
14137
  var historyControllerGetListV1ResponseItemsItemResultTranscriptionUtterancesItemChannelMin = 0;
13815
14138
  var historyControllerGetListV1ResponseItemsItemResultTranscriptionUtterancesItemSpeakerMin = 0;
@@ -14198,12 +14521,12 @@ var historyControllerGetListV1Response = zod5.object({
14198
14521
  "Forces the translation to use informal language forms when available in the target language."
14199
14522
  )
14200
14523
  }).optional().describe("**[Beta]** Translation configuration, if `translation` is enabled"),
14201
- summarization: zod5.boolean().optional().describe("**[Beta]** Enable summarization for this audio"),
14524
+ summarization: zod5.boolean().optional().describe("Enable summarization for this audio"),
14202
14525
  summarization_config: zod5.object({
14203
14526
  type: zod5.enum(["general", "bullet_points", "concise"]).describe("The type of summarization to apply").default(
14204
14527
  historyControllerGetListV1ResponseItemsItemRequestParamsSummarizationConfigTypeDefault
14205
14528
  ).describe("The type of summarization to apply")
14206
- }).optional().describe("**[Beta]** Summarization configuration, if `summarization` is enabled"),
14529
+ }).optional().describe("Summarization configuration, if `summarization` is enabled"),
14207
14530
  named_entity_recognition: zod5.boolean().optional().describe("**[Alpha]** Enable named entity recognition for this audio"),
14208
14531
  custom_spelling: zod5.boolean().optional().describe("**[Alpha]** Enable custom spelling for this audio"),
14209
14532
  custom_spelling_config: zod5.object({
@@ -14212,7 +14535,7 @@ var historyControllerGetListV1Response = zod5.object({
14212
14535
  "**[Alpha]** Custom spelling configuration, if `custom_spelling` is enabled"
14213
14536
  ),
14214
14537
  sentiment_analysis: zod5.boolean().optional().describe("Enable sentiment analysis for this audio"),
14215
- audio_to_llm: zod5.boolean().optional().describe("**[Alpha]** Enable audio to llm processing for this audio"),
14538
+ audio_to_llm: zod5.boolean().optional().describe("Enable audio to LLM processing for this audio"),
14216
14539
  audio_to_llm_config: zod5.object({
14217
14540
  prompts: zod5.array(zod5.array(zod5.unknown())).min(1).describe("The list of prompts applied on the audio transcription"),
14218
14541
  model: zod5.string().default(
@@ -14220,7 +14543,7 @@ var historyControllerGetListV1Response = zod5.object({
14220
14543
  ).describe(
14221
14544
  "The model to use for the prompt execution. You can find the list of supported models [here](https://openrouter.ai/models)."
14222
14545
  )
14223
- }).optional().describe("**[Alpha]** Audio to llm configuration, if `audio_to_llm` is enabled"),
14546
+ }).optional().describe("Audio to LLM configuration, if `audio_to_llm` is enabled"),
14224
14547
  pii_redaction: zod5.boolean().optional().describe("Enable PII redaction for this audio"),
14225
14548
  pii_redaction_config: zod5.object({
14226
14549
  entity_types: zod5.enum([
@@ -19673,6 +19996,7 @@ var createRealtimeClientSecretBodySessionAudioOutputSpeedDefault = 1;
19673
19996
  var createRealtimeClientSecretBodySessionAudioOutputSpeedMin = 0.25;
19674
19997
  var createRealtimeClientSecretBodySessionAudioOutputSpeedMax = 1.5;
19675
19998
  var createRealtimeClientSecretBodySessionToolChoiceDefault = "auto";
19999
+ var createRealtimeClientSecretBodySessionReasoningEffortDefault = "low";
19676
20000
  var createRealtimeClientSecretBodySessionTruncationRetentionRatioMin = 0;
19677
20001
  var createRealtimeClientSecretBodySessionTruncationRetentionRatioMax = 1;
19678
20002
  var createRealtimeClientSecretBodySessionTruncationTokenLimitsPostInstructionsMin = 0;
@@ -19703,6 +20027,7 @@ var createRealtimeClientSecretBody = zod6.object({
19703
20027
  zod6.enum([
19704
20028
  "gpt-realtime",
19705
20029
  "gpt-realtime-1.5",
20030
+ "gpt-realtime-2",
19706
20031
  "gpt-realtime-2025-08-28",
19707
20032
  "gpt-4o-realtime-preview",
19708
20033
  "gpt-4o-realtime-preview-2024-10-01",
@@ -19743,16 +20068,20 @@ var createRealtimeClientSecretBody = zod6.object({
19743
20068
  "gpt-4o-mini-transcribe",
19744
20069
  "gpt-4o-mini-transcribe-2025-12-15",
19745
20070
  "gpt-4o-transcribe",
19746
- "gpt-4o-transcribe-diarize"
20071
+ "gpt-4o-transcribe-diarize",
20072
+ "gpt-realtime-whisper"
19747
20073
  ])
19748
20074
  ).optional().describe(
19749
- "The model to use for transcription. Current options are `whisper-1`, `gpt-4o-mini-transcribe`, `gpt-4o-mini-transcribe-2025-12-15`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`. Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.\n"
20075
+ "The model to use for transcription. Current options are `whisper-1`, `gpt-4o-mini-transcribe`, `gpt-4o-mini-transcribe-2025-12-15`, `gpt-4o-transcribe`, `gpt-4o-transcribe-diarize`, and `gpt-realtime-whisper`. Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.\n"
19750
20076
  ),
19751
20077
  language: zod6.string().optional().describe(
19752
20078
  "The language of the input audio. Supplying the input language in\n[ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) format\nwill improve accuracy and latency.\n"
19753
20079
  ),
19754
20080
  prompt: zod6.string().optional().describe(
19755
- 'An optional text to guide the model\'s style or continue a previous audio\nsegment.\nFor `whisper-1`, the [prompt is a list of keywords](/docs/guides/speech-to-text#prompting).\nFor `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the prompt is a free text string, for example "expect words related to technology".\n'
20081
+ 'An optional text to guide the model\'s style or continue a previous audio\nsegment.\nFor `whisper-1`, the [prompt is a list of keywords](/docs/guides/speech-to-text#prompting).\nFor `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the prompt is a free text string, for example "expect words related to technology".\nPrompt is not supported with `gpt-realtime-whisper` in GA Realtime sessions.\n'
20082
+ ),
20083
+ delay: zod6.enum(["minimal", "low", "medium", "high", "xhigh"]).optional().describe(
20084
+ "Controls how long the model waits before emitting transcription text.\nHigher values can improve transcription accuracy at the cost of latency.\nOnly supported with `gpt-realtime-whisper` in GA Realtime sessions.\n"
19756
20085
  )
19757
20086
  }).optional(),
19758
20087
  noise_reduction: zod6.object({
@@ -19819,7 +20148,7 @@ var createRealtimeClientSecretBody = zod6.object({
19819
20148
  "Server-side semantic turn detection which uses a model to determine when the user has finished speaking."
19820
20149
  )
19821
20150
  ]).describe(
19822
- 'Configuration for turn detection, ether Server VAD or Semantic VAD. This can be set to `null` to turn off, in which case the client must manually trigger model response.\n\nServer VAD means that the model will detect the start and end of speech based on audio volume and respond at the end of user speech.\n\nSemantic VAD is more advanced and uses a turn detection model (in conjunction with VAD) to semantically estimate whether the user has finished speaking, then dynamically sets a timeout based on this probability. For example, if user audio trails off with "uhhm", the model will score a low probability of turn end and wait longer for the user to continue speaking. This can be useful for more natural conversations, but may have a higher latency.\n'
20151
+ 'Configuration for turn detection, ether Server VAD or Semantic VAD. This can be set to `null` to turn off, in which case the client must manually trigger model response.\n\nServer VAD means that the model will detect the start and end of speech based on audio volume and respond at the end of user speech.\n\nSemantic VAD is more advanced and uses a turn detection model (in conjunction with VAD) to semantically estimate whether the user has finished speaking, then dynamically sets a timeout based on this probability. For example, if user audio trails off with "uhhm", the model will score a low probability of turn end and wait longer for the user to continue speaking. This can be useful for more natural conversations, but may have a higher latency.\n\nFor `gpt-realtime-whisper` transcription sessions, turn detection must be\nset to `null`; VAD is not supported.\n'
19823
20152
  ).or(zod6.null()).optional()
19824
20153
  }).optional(),
19825
20154
  output: zod6.object({
@@ -19892,7 +20221,7 @@ var createRealtimeClientSecretBody = zod6.object({
19892
20221
  server_label: zod6.string().describe(
19893
20222
  "A label for this MCP server, used to identify it in tool calls.\n"
19894
20223
  ),
19895
- server_url: zod6.string().optional().describe(
20224
+ server_url: zod6.string().url().optional().describe(
19896
20225
  "The URL for the MCP server. One of `server_url` or `connector_id` must be\nprovided.\n"
19897
20226
  ),
19898
20227
  connector_id: zod6.enum([
@@ -19970,6 +20299,16 @@ var createRealtimeClientSecretBody = zod6.object({
19970
20299
  ).default(createRealtimeClientSecretBodySessionToolChoiceDefault).describe(
19971
20300
  "How the model chooses tools. Provide one of the string modes or force a specific\nfunction/MCP tool.\n"
19972
20301
  ),
20302
+ parallel_tool_calls: zod6.boolean().optional().describe(
20303
+ "Whether the model may call multiple tools in parallel. Only supported by\nreasoning Realtime models such as `gpt-realtime-2`.\n"
20304
+ ),
20305
+ reasoning: zod6.object({
20306
+ effort: zod6.enum(["minimal", "low", "medium", "high", "xhigh"]).default(createRealtimeClientSecretBodySessionReasoningEffortDefault).describe(
20307
+ "Constrains effort on reasoning for reasoning-capable Realtime models such as\n`gpt-realtime-2`.\n"
20308
+ )
20309
+ }).optional().describe(
20310
+ "Configuration for reasoning-capable Realtime models such as `gpt-realtime-2`.\n"
20311
+ ),
19973
20312
  max_output_tokens: zod6.number().or(zod6.enum(["inf"])).optional().describe(
19974
20313
  "Maximum number of output tokens for a single assistant response,\ninclusive of tool calls. Provide an integer between 1 and 4096 to\nlimit output tokens, or `inf` for the maximum available tokens for a\ngiven model. Defaults to `inf`.\n"
19975
20314
  ),
@@ -20009,7 +20348,7 @@ var createRealtimeClientSecretBody = zod6.object({
20009
20348
  ).or(
20010
20349
  zod6.object({
20011
20350
  type: zod6.enum(["input_image"]).describe("The type of the input item. Always `input_image`."),
20012
- image_url: zod6.string().describe(
20351
+ image_url: zod6.string().url().describe(
20013
20352
  "The URL of the image to be sent to the model. A fully qualified URL or base64 encoded image in a data URL."
20014
20353
  ).or(zod6.null()).optional(),
20015
20354
  file_id: zod6.string().describe("The ID of the file to be sent to the model.").or(zod6.null()).optional(),
@@ -20023,7 +20362,7 @@ var createRealtimeClientSecretBody = zod6.object({
20023
20362
  file_id: zod6.string().describe("The ID of the file to be sent to the model.").or(zod6.null()).optional(),
20024
20363
  filename: zod6.string().optional().describe("The name of the file to be sent to the model."),
20025
20364
  file_data: zod6.string().optional().describe("The content of the file to be sent to the model.\n"),
20026
- file_url: zod6.string().optional().describe("The URL of the file to be sent to the model."),
20365
+ file_url: zod6.string().url().optional().describe("The URL of the file to be sent to the model."),
20027
20366
  detail: zod6.enum(["low", "high"]).optional()
20028
20367
  }).describe("A file input to the model.")
20029
20368
  )
@@ -20059,16 +20398,20 @@ var createRealtimeClientSecretBody = zod6.object({
20059
20398
  "gpt-4o-mini-transcribe",
20060
20399
  "gpt-4o-mini-transcribe-2025-12-15",
20061
20400
  "gpt-4o-transcribe",
20062
- "gpt-4o-transcribe-diarize"
20401
+ "gpt-4o-transcribe-diarize",
20402
+ "gpt-realtime-whisper"
20063
20403
  ])
20064
20404
  ).optional().describe(
20065
- "The model to use for transcription. Current options are `whisper-1`, `gpt-4o-mini-transcribe`, `gpt-4o-mini-transcribe-2025-12-15`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`. Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.\n"
20405
+ "The model to use for transcription. Current options are `whisper-1`, `gpt-4o-mini-transcribe`, `gpt-4o-mini-transcribe-2025-12-15`, `gpt-4o-transcribe`, `gpt-4o-transcribe-diarize`, and `gpt-realtime-whisper`. Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.\n"
20066
20406
  ),
20067
20407
  language: zod6.string().optional().describe(
20068
20408
  "The language of the input audio. Supplying the input language in\n[ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) format\nwill improve accuracy and latency.\n"
20069
20409
  ),
20070
20410
  prompt: zod6.string().optional().describe(
20071
- 'An optional text to guide the model\'s style or continue a previous audio\nsegment.\nFor `whisper-1`, the [prompt is a list of keywords](/docs/guides/speech-to-text#prompting).\nFor `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the prompt is a free text string, for example "expect words related to technology".\n'
20411
+ 'An optional text to guide the model\'s style or continue a previous audio\nsegment.\nFor `whisper-1`, the [prompt is a list of keywords](/docs/guides/speech-to-text#prompting).\nFor `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the prompt is a free text string, for example "expect words related to technology".\nPrompt is not supported with `gpt-realtime-whisper` in GA Realtime sessions.\n'
20412
+ ),
20413
+ delay: zod6.enum(["minimal", "low", "medium", "high", "xhigh"]).optional().describe(
20414
+ "Controls how long the model waits before emitting transcription text.\nHigher values can improve transcription accuracy at the cost of latency.\nOnly supported with `gpt-realtime-whisper` in GA Realtime sessions.\n"
20072
20415
  )
20073
20416
  }).optional(),
20074
20417
  noise_reduction: zod6.object({
@@ -20135,7 +20478,7 @@ var createRealtimeClientSecretBody = zod6.object({
20135
20478
  "Server-side semantic turn detection which uses a model to determine when the user has finished speaking."
20136
20479
  )
20137
20480
  ]).describe(
20138
- 'Configuration for turn detection, ether Server VAD or Semantic VAD. This can be set to `null` to turn off, in which case the client must manually trigger model response.\n\nServer VAD means that the model will detect the start and end of speech based on audio volume and respond at the end of user speech.\n\nSemantic VAD is more advanced and uses a turn detection model (in conjunction with VAD) to semantically estimate whether the user has finished speaking, then dynamically sets a timeout based on this probability. For example, if user audio trails off with "uhhm", the model will score a low probability of turn end and wait longer for the user to continue speaking. This can be useful for more natural conversations, but may have a higher latency.\n'
20481
+ 'Configuration for turn detection, ether Server VAD or Semantic VAD. This can be set to `null` to turn off, in which case the client must manually trigger model response.\n\nServer VAD means that the model will detect the start and end of speech based on audio volume and respond at the end of user speech.\n\nSemantic VAD is more advanced and uses a turn detection model (in conjunction with VAD) to semantically estimate whether the user has finished speaking, then dynamically sets a timeout based on this probability. For example, if user audio trails off with "uhhm", the model will score a low probability of turn end and wait longer for the user to continue speaking. This can be useful for more natural conversations, but may have a higher latency.\n\nFor `gpt-realtime-whisper` transcription sessions, turn detection must be\nset to `null`; VAD is not supported.\n'
20139
20482
  ).or(zod6.null()).optional()
20140
20483
  }).optional()
20141
20484
  }).optional().describe("Configuration for input and output audio.\n"),
@@ -20161,23 +20504,21 @@ var createRealtimeClientSecretResponseSessionAudioOutputSpeedDefault = 1;
20161
20504
  var createRealtimeClientSecretResponseSessionAudioOutputSpeedMin = 0.25;
20162
20505
  var createRealtimeClientSecretResponseSessionAudioOutputSpeedMax = 1.5;
20163
20506
  var createRealtimeClientSecretResponseSessionToolChoiceDefault = "auto";
20507
+ var createRealtimeClientSecretResponseSessionReasoningEffortDefault = "low";
20164
20508
  var createRealtimeClientSecretResponseSessionTruncationRetentionRatioMin = 0;
20165
20509
  var createRealtimeClientSecretResponseSessionTruncationRetentionRatioMax = 1;
20166
20510
  var createRealtimeClientSecretResponseSessionTruncationTokenLimitsPostInstructionsMin = 0;
20167
20511
  var createRealtimeClientSecretResponse = zod6.object({
20168
20512
  value: zod6.string().describe("The generated client secret value."),
20169
20513
  expires_at: zod6.number().describe("Expiration timestamp for the client secret, in seconds since epoch."),
20170
- session: zod6.discriminatedUnion("type", [
20514
+ session: zod6.union([
20171
20515
  zod6.object({
20172
- client_secret: zod6.object({
20173
- value: zod6.string().describe(
20174
- "Ephemeral key usable in client environments to authenticate connections to the Realtime API. Use this in client-side environments rather than a standard API token, which should only be used server-side.\n"
20175
- ),
20176
- expires_at: zod6.number().describe(
20177
- "Timestamp for when the token expires. Currently, all tokens expire\nafter one minute.\n"
20178
- )
20179
- }).describe("Ephemeral key returned by the API."),
20180
20516
  type: zod6.enum(["realtime"]).describe("The type of session to create. Always `realtime` for the Realtime API.\n"),
20517
+ id: zod6.string().describe(
20518
+ "Unique identifier for the session that looks like `sess_1234567890abcdef`.\n"
20519
+ ),
20520
+ object: zod6.enum(["realtime.session"]).describe("The object type. Always `realtime.session`."),
20521
+ expires_at: zod6.number().optional().describe("Expiration timestamp for the session, in seconds since epoch."),
20181
20522
  output_modalities: zod6.array(zod6.enum(["text", "audio"])).default(createRealtimeClientSecretResponseSessionOutputModalitiesDefault).describe(
20182
20523
  'The set of modalities the model can respond with. It defaults to `["audio"]`, indicating\nthat the model will respond with audio plus a transcript. `["text"]` can be used to make\nthe model respond with text only. It is not possible to request both `text` and `audio` at the same time.\n'
20183
20524
  ),
@@ -20185,6 +20526,7 @@ var createRealtimeClientSecretResponse = zod6.object({
20185
20526
  zod6.enum([
20186
20527
  "gpt-realtime",
20187
20528
  "gpt-realtime-1.5",
20529
+ "gpt-realtime-2",
20188
20530
  "gpt-realtime-2025-08-28",
20189
20531
  "gpt-4o-realtime-preview",
20190
20532
  "gpt-4o-realtime-preview-2024-10-01",
@@ -20207,15 +20549,15 @@ var createRealtimeClientSecretResponse = zod6.object({
20207
20549
  audio: zod6.object({
20208
20550
  input: zod6.object({
20209
20551
  format: zod6.object({
20210
- type: zod6.enum(["audio/pcm"]).describe("The audio format. Always `audio/pcm`."),
20211
- rate: zod6.literal(24e3).describe("The sample rate of the audio. Always `24000`.")
20552
+ type: zod6.enum(["audio/pcm"]).optional().describe("The audio format. Always `audio/pcm`."),
20553
+ rate: zod6.literal(24e3).optional().describe("The sample rate of the audio. Always `24000`.")
20212
20554
  }).describe("The PCM audio format. Only a 24kHz sample rate is supported.").or(
20213
20555
  zod6.object({
20214
- type: zod6.enum(["audio/pcmu"]).describe("The audio format. Always `audio/pcmu`.")
20556
+ type: zod6.enum(["audio/pcmu"]).optional().describe("The audio format. Always `audio/pcmu`.")
20215
20557
  }).describe("The G.711 \u03BC-law format.")
20216
20558
  ).or(
20217
20559
  zod6.object({
20218
- type: zod6.enum(["audio/pcma"]).describe("The audio format. Always `audio/pcma`.")
20560
+ type: zod6.enum(["audio/pcma"]).optional().describe("The audio format. Always `audio/pcma`.")
20219
20561
  }).describe("The G.711 A-law format.")
20220
20562
  ).optional(),
20221
20563
  transcription: zod6.object({
@@ -20225,20 +20567,19 @@ var createRealtimeClientSecretResponse = zod6.object({
20225
20567
  "gpt-4o-mini-transcribe",
20226
20568
  "gpt-4o-mini-transcribe-2025-12-15",
20227
20569
  "gpt-4o-transcribe",
20228
- "gpt-4o-transcribe-diarize"
20570
+ "gpt-4o-transcribe-diarize",
20571
+ "gpt-realtime-whisper"
20229
20572
  ])
20230
20573
  ).optional().describe(
20231
- "The model to use for transcription. Current options are `whisper-1`, `gpt-4o-mini-transcribe`, `gpt-4o-mini-transcribe-2025-12-15`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`. Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.\n"
20232
- ),
20233
- language: zod6.string().optional().describe(
20234
- "The language of the input audio. Supplying the input language in\n[ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) format\nwill improve accuracy and latency.\n"
20574
+ "The model used for transcription. Current options are `whisper-1`, `gpt-4o-mini-transcribe`, `gpt-4o-mini-transcribe-2025-12-15`, `gpt-4o-transcribe`, `gpt-4o-transcribe-diarize`, and `gpt-realtime-whisper`.\n"
20235
20575
  ),
20576
+ language: zod6.string().optional().describe("The language of the input audio.\n"),
20236
20577
  prompt: zod6.string().optional().describe(
20237
- 'An optional text to guide the model\'s style or continue a previous audio\nsegment.\nFor `whisper-1`, the [prompt is a list of keywords](/docs/guides/speech-to-text#prompting).\nFor `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the prompt is a free text string, for example "expect words related to technology".\n'
20578
+ "The prompt configured for input audio transcription, when present.\n"
20238
20579
  )
20239
20580
  }).optional(),
20240
20581
  noise_reduction: zod6.object({
20241
- type: zod6.enum(["near_field", "far_field"]).describe(
20582
+ type: zod6.enum(["near_field", "far_field"]).optional().describe(
20242
20583
  "Type of noise reduction. `near_field` is for close-talking microphones such as headphones, `far_field` is for far-field microphones such as laptop or conference room microphones.\n"
20243
20584
  )
20244
20585
  }).optional().describe(
@@ -20301,20 +20642,20 @@ var createRealtimeClientSecretResponse = zod6.object({
20301
20642
  "Server-side semantic turn detection which uses a model to determine when the user has finished speaking."
20302
20643
  )
20303
20644
  ]).describe(
20304
- 'Configuration for turn detection, ether Server VAD or Semantic VAD. This can be set to `null` to turn off, in which case the client must manually trigger model response.\n\nServer VAD means that the model will detect the start and end of speech based on audio volume and respond at the end of user speech.\n\nSemantic VAD is more advanced and uses a turn detection model (in conjunction with VAD) to semantically estimate whether the user has finished speaking, then dynamically sets a timeout based on this probability. For example, if user audio trails off with "uhhm", the model will score a low probability of turn end and wait longer for the user to continue speaking. This can be useful for more natural conversations, but may have a higher latency.\n'
20645
+ 'Configuration for turn detection, ether Server VAD or Semantic VAD. This can be set to `null` to turn off, in which case the client must manually trigger model response.\n\nServer VAD means that the model will detect the start and end of speech based on audio volume and respond at the end of user speech.\n\nSemantic VAD is more advanced and uses a turn detection model (in conjunction with VAD) to semantically estimate whether the user has finished speaking, then dynamically sets a timeout based on this probability. For example, if user audio trails off with "uhhm", the model will score a low probability of turn end and wait longer for the user to continue speaking. This can be useful for more natural conversations, but may have a higher latency.\n\nFor `gpt-realtime-whisper` transcription sessions, turn detection must be\nset to `null`; VAD is not supported.\n'
20305
20646
  ).or(zod6.null()).optional()
20306
20647
  }).optional(),
20307
20648
  output: zod6.object({
20308
20649
  format: zod6.object({
20309
- type: zod6.enum(["audio/pcm"]).describe("The audio format. Always `audio/pcm`."),
20310
- rate: zod6.literal(24e3).describe("The sample rate of the audio. Always `24000`.")
20650
+ type: zod6.enum(["audio/pcm"]).optional().describe("The audio format. Always `audio/pcm`."),
20651
+ rate: zod6.literal(24e3).optional().describe("The sample rate of the audio. Always `24000`.")
20311
20652
  }).describe("The PCM audio format. Only a 24kHz sample rate is supported.").or(
20312
20653
  zod6.object({
20313
- type: zod6.enum(["audio/pcmu"]).describe("The audio format. Always `audio/pcmu`.")
20654
+ type: zod6.enum(["audio/pcmu"]).optional().describe("The audio format. Always `audio/pcmu`.")
20314
20655
  }).describe("The G.711 \u03BC-law format.")
20315
20656
  ).or(
20316
20657
  zod6.object({
20317
- type: zod6.enum(["audio/pcma"]).describe("The audio format. Always `audio/pcma`.")
20658
+ type: zod6.enum(["audio/pcma"]).optional().describe("The audio format. Always `audio/pcma`.")
20318
20659
  }).describe("The G.711 A-law format.")
20319
20660
  ).optional(),
20320
20661
  voice: zod6.string().or(
@@ -20358,7 +20699,7 @@ var createRealtimeClientSecretResponse = zod6.object({
20358
20699
  ).or(zod6.null()).optional(),
20359
20700
  tools: zod6.array(
20360
20701
  zod6.object({
20361
- type: zod6.enum(["function"]).describe("The type of the tool, i.e. `function`."),
20702
+ type: zod6.enum(["function"]).optional().describe("The type of the tool, i.e. `function`."),
20362
20703
  name: zod6.string().optional().describe("The name of the function."),
20363
20704
  description: zod6.string().optional().describe(
20364
20705
  "The description of the function, including guidance on when and how\nto call it, and guidance about what to tell the user when calling\n(if anything).\n"
@@ -20370,7 +20711,7 @@ var createRealtimeClientSecretResponse = zod6.object({
20370
20711
  server_label: zod6.string().describe(
20371
20712
  "A label for this MCP server, used to identify it in tool calls.\n"
20372
20713
  ),
20373
- server_url: zod6.string().optional().describe(
20714
+ server_url: zod6.string().url().optional().describe(
20374
20715
  "The URL for the MCP server. One of `server_url` or `connector_id` must be\nprovided.\n"
20375
20716
  ),
20376
20717
  connector_id: zod6.enum([
@@ -20382,7 +20723,7 @@ var createRealtimeClientSecretResponse = zod6.object({
20382
20723
  "connector_outlookcalendar",
20383
20724
  "connector_outlookemail",
20384
20725
  "connector_sharepoint"
20385
- ]).describe(
20726
+ ]).optional().describe(
20386
20727
  "Identifier for service connectors, like those available in ChatGPT. One of\n`server_url` or `connector_id` must be provided. Learn more about service\nconnectors [here](/docs/guides/tools-remote-mcp#connectors).\n\nCurrently supported `connector_id` values are:\n\n- Dropbox: `connector_dropbox`\n- Gmail: `connector_gmail`\n- Google Calendar: `connector_googlecalendar`\n- Google Drive: `connector_googledrive`\n- Microsoft Teams: `connector_microsoftteams`\n- Outlook Calendar: `connector_outlookcalendar`\n- Outlook Email: `connector_outlookemail`\n- SharePoint: `connector_sharepoint`\n"
20387
20728
  ),
20388
20729
  authorization: zod6.string().optional().describe(
@@ -20448,6 +20789,13 @@ var createRealtimeClientSecretResponse = zod6.object({
20448
20789
  ).default(createRealtimeClientSecretResponseSessionToolChoiceDefault).describe(
20449
20790
  "How the model chooses tools. Provide one of the string modes or force a specific\nfunction/MCP tool.\n"
20450
20791
  ),
20792
+ reasoning: zod6.object({
20793
+ effort: zod6.enum(["minimal", "low", "medium", "high", "xhigh"]).default(createRealtimeClientSecretResponseSessionReasoningEffortDefault).describe(
20794
+ "Constrains effort on reasoning for reasoning-capable Realtime models such as\n`gpt-realtime-2`.\n"
20795
+ )
20796
+ }).optional().describe(
20797
+ "Configuration for reasoning-capable Realtime models such as `gpt-realtime-2`.\n"
20798
+ ),
20451
20799
  max_output_tokens: zod6.number().or(zod6.enum(["inf"])).optional().describe(
20452
20800
  "Maximum number of output tokens for a single assistant response,\ninclusive of tool calls. Provide an integer between 1 and 4096 to\nlimit output tokens, or `inf` for the maximum available tokens for a\ngiven model. Defaults to `inf`.\n"
20453
20801
  ),
@@ -20487,7 +20835,7 @@ var createRealtimeClientSecretResponse = zod6.object({
20487
20835
  ).or(
20488
20836
  zod6.object({
20489
20837
  type: zod6.enum(["input_image"]).describe("The type of the input item. Always `input_image`."),
20490
- image_url: zod6.string().describe(
20838
+ image_url: zod6.string().url().describe(
20491
20839
  "The URL of the image to be sent to the model. A fully qualified URL or base64 encoded image in a data URL."
20492
20840
  ).or(zod6.null()).optional(),
20493
20841
  file_id: zod6.string().describe("The ID of the file to be sent to the model.").or(zod6.null()).optional(),
@@ -20501,8 +20849,8 @@ var createRealtimeClientSecretResponse = zod6.object({
20501
20849
  file_id: zod6.string().describe("The ID of the file to be sent to the model.").or(zod6.null()).optional(),
20502
20850
  filename: zod6.string().optional().describe("The name of the file to be sent to the model."),
20503
20851
  file_data: zod6.string().optional().describe("The content of the file to be sent to the model.\n"),
20504
- file_url: zod6.string().optional().describe("The URL of the file to be sent to the model."),
20505
- detail: zod6.enum(["low", "high"])
20852
+ file_url: zod6.string().url().optional().describe("The URL of the file to be sent to the model."),
20853
+ detail: zod6.enum(["low", "high"]).optional()
20506
20854
  }).describe("A file input to the model.")
20507
20855
  )
20508
20856
  ).describe(
@@ -20511,9 +20859,7 @@ var createRealtimeClientSecretResponse = zod6.object({
20511
20859
  }).describe(
20512
20860
  "Reference to a prompt template and its variables.\n[Learn more](/docs/guides/text?api-mode=responses#reusable-prompts).\n"
20513
20861
  ).or(zod6.null()).optional()
20514
- }).describe(
20515
- "A new Realtime session configuration, with an ephemeral key. Default TTL\nfor keys is one minute.\n"
20516
- ),
20862
+ }).describe("A Realtime session configuration object.\n"),
20517
20863
  zod6.object({
20518
20864
  type: zod6.enum(["transcription"]).describe(
20519
20865
  "The type of session. Always `transcription` for transcription sessions.\n"
@@ -20529,15 +20875,15 @@ var createRealtimeClientSecretResponse = zod6.object({
20529
20875
  audio: zod6.object({
20530
20876
  input: zod6.object({
20531
20877
  format: zod6.object({
20532
- type: zod6.enum(["audio/pcm"]).describe("The audio format. Always `audio/pcm`."),
20533
- rate: zod6.literal(24e3).describe("The sample rate of the audio. Always `24000`.")
20878
+ type: zod6.enum(["audio/pcm"]).optional().describe("The audio format. Always `audio/pcm`."),
20879
+ rate: zod6.literal(24e3).optional().describe("The sample rate of the audio. Always `24000`.")
20534
20880
  }).describe("The PCM audio format. Only a 24kHz sample rate is supported.").or(
20535
20881
  zod6.object({
20536
- type: zod6.enum(["audio/pcmu"]).describe("The audio format. Always `audio/pcmu`.")
20882
+ type: zod6.enum(["audio/pcmu"]).optional().describe("The audio format. Always `audio/pcmu`.")
20537
20883
  }).describe("The G.711 \u03BC-law format.")
20538
20884
  ).or(
20539
20885
  zod6.object({
20540
- type: zod6.enum(["audio/pcma"]).describe("The audio format. Always `audio/pcma`.")
20886
+ type: zod6.enum(["audio/pcma"]).optional().describe("The audio format. Always `audio/pcma`.")
20541
20887
  }).describe("The G.711 A-law format.")
20542
20888
  ).optional(),
20543
20889
  transcription: zod6.object({
@@ -20547,20 +20893,19 @@ var createRealtimeClientSecretResponse = zod6.object({
20547
20893
  "gpt-4o-mini-transcribe",
20548
20894
  "gpt-4o-mini-transcribe-2025-12-15",
20549
20895
  "gpt-4o-transcribe",
20550
- "gpt-4o-transcribe-diarize"
20896
+ "gpt-4o-transcribe-diarize",
20897
+ "gpt-realtime-whisper"
20551
20898
  ])
20552
20899
  ).optional().describe(
20553
- "The model to use for transcription. Current options are `whisper-1`, `gpt-4o-mini-transcribe`, `gpt-4o-mini-transcribe-2025-12-15`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`. Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.\n"
20554
- ),
20555
- language: zod6.string().optional().describe(
20556
- "The language of the input audio. Supplying the input language in\n[ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) format\nwill improve accuracy and latency.\n"
20900
+ "The model used for transcription. Current options are `whisper-1`, `gpt-4o-mini-transcribe`, `gpt-4o-mini-transcribe-2025-12-15`, `gpt-4o-transcribe`, `gpt-4o-transcribe-diarize`, and `gpt-realtime-whisper`.\n"
20557
20901
  ),
20902
+ language: zod6.string().optional().describe("The language of the input audio.\n"),
20558
20903
  prompt: zod6.string().optional().describe(
20559
- 'An optional text to guide the model\'s style or continue a previous audio\nsegment.\nFor `whisper-1`, the [prompt is a list of keywords](/docs/guides/speech-to-text#prompting).\nFor `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the prompt is a free text string, for example "expect words related to technology".\n'
20904
+ "The prompt configured for input audio transcription, when present.\n"
20560
20905
  )
20561
20906
  }).optional(),
20562
20907
  noise_reduction: zod6.object({
20563
- type: zod6.enum(["near_field", "far_field"]).describe(
20908
+ type: zod6.enum(["near_field", "far_field"]).optional().describe(
20564
20909
  "Type of noise reduction. `near_field` is for close-talking microphones such as headphones, `far_field` is for far-field microphones such as laptop or conference room microphones.\n"
20565
20910
  )
20566
20911
  }).optional().describe("Configuration for input audio noise reduction.\n"),
@@ -20577,8 +20922,10 @@ var createRealtimeClientSecretResponse = zod6.object({
20577
20922
  silence_duration_ms: zod6.number().optional().describe(
20578
20923
  "Duration of silence to detect speech stop (in milliseconds). Defaults\nto 500ms. With shorter values the model will respond more quickly,\nbut may jump in on short pauses from the user.\n"
20579
20924
  )
20580
- }).optional().describe(
20581
- "Configuration for turn detection. Can be set to `null` to turn off. Server\nVAD means that the model will detect the start and end of speech based on\naudio volume and respond at the end of user speech.\n"
20925
+ }).describe(
20926
+ "Configuration for turn detection. Can be set to `null` to turn off. Server\nVAD means that the model will detect the start and end of speech based on\naudio volume and respond at the end of user speech. For `gpt-realtime-whisper`, this must be `null`; VAD is not supported.\n"
20927
+ ).or(zod6.null()).optional().describe(
20928
+ "Configuration for turn detection. For `gpt-realtime-whisper`, this must be `null`; VAD is not supported.\n"
20582
20929
  )
20583
20930
  }).optional()
20584
20931
  }).optional().describe("Configuration for input audio for the session.\n")
@@ -20714,7 +21061,7 @@ var createRealtimeSessionBody = zod6.object({
20714
21061
  ).or(
20715
21062
  zod6.object({
20716
21063
  type: zod6.enum(["input_image"]).describe("The type of the input item. Always `input_image`."),
20717
- image_url: zod6.string().describe(
21064
+ image_url: zod6.string().url().describe(
20718
21065
  "The URL of the image to be sent to the model. A fully qualified URL or base64 encoded image in a data URL."
20719
21066
  ).or(zod6.null()).optional(),
20720
21067
  file_id: zod6.string().describe("The ID of the file to be sent to the model.").or(zod6.null()).optional(),
@@ -20728,7 +21075,7 @@ var createRealtimeSessionBody = zod6.object({
20728
21075
  file_id: zod6.string().describe("The ID of the file to be sent to the model.").or(zod6.null()).optional(),
20729
21076
  filename: zod6.string().optional().describe("The name of the file to be sent to the model."),
20730
21077
  file_data: zod6.string().optional().describe("The content of the file to be sent to the model.\n"),
20731
- file_url: zod6.string().optional().describe("The URL of the file to be sent to the model."),
21078
+ file_url: zod6.string().url().optional().describe("The URL of the file to be sent to the model."),
20732
21079
  detail: zod6.enum(["low", "high"]).optional()
20733
21080
  }).describe("A file input to the model.")
20734
21081
  )
@@ -20776,17 +21123,14 @@ var createRealtimeSessionResponse = zod6.object({
20776
21123
  "gpt-4o-mini-transcribe",
20777
21124
  "gpt-4o-mini-transcribe-2025-12-15",
20778
21125
  "gpt-4o-transcribe",
20779
- "gpt-4o-transcribe-diarize"
21126
+ "gpt-4o-transcribe-diarize",
21127
+ "gpt-realtime-whisper"
20780
21128
  ])
20781
21129
  ).optional().describe(
20782
- "The model to use for transcription. Current options are `whisper-1`, `gpt-4o-mini-transcribe`, `gpt-4o-mini-transcribe-2025-12-15`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`. Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.\n"
21130
+ "The model used for transcription. Current options are `whisper-1`, `gpt-4o-mini-transcribe`, `gpt-4o-mini-transcribe-2025-12-15`, `gpt-4o-transcribe`, `gpt-4o-transcribe-diarize`, and `gpt-realtime-whisper`.\n"
20783
21131
  ),
20784
- language: zod6.string().optional().describe(
20785
- "The language of the input audio. Supplying the input language in\n[ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) format\nwill improve accuracy and latency.\n"
20786
- ),
20787
- prompt: zod6.string().optional().describe(
20788
- 'An optional text to guide the model\'s style or continue a previous audio\nsegment.\nFor `whisper-1`, the [prompt is a list of keywords](/docs/guides/speech-to-text#prompting).\nFor `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the prompt is a free text string, for example "expect words related to technology".\n'
20789
- )
21132
+ language: zod6.string().optional().describe("The language of the input audio.\n"),
21133
+ prompt: zod6.string().optional().describe("The prompt configured for input audio transcription, when present.\n")
20790
21134
  }).optional(),
20791
21135
  noise_reduction: zod6.object({
20792
21136
  type: zod6.enum(["near_field", "far_field"]).optional().describe(
@@ -20911,16 +21255,20 @@ var createRealtimeTranscriptionSessionBody = zod6.object({
20911
21255
  "gpt-4o-mini-transcribe",
20912
21256
  "gpt-4o-mini-transcribe-2025-12-15",
20913
21257
  "gpt-4o-transcribe",
20914
- "gpt-4o-transcribe-diarize"
21258
+ "gpt-4o-transcribe-diarize",
21259
+ "gpt-realtime-whisper"
20915
21260
  ])
20916
21261
  ).optional().describe(
20917
- "The model to use for transcription. Current options are `whisper-1`, `gpt-4o-mini-transcribe`, `gpt-4o-mini-transcribe-2025-12-15`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`. Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.\n"
21262
+ "The model to use for transcription. Current options are `whisper-1`, `gpt-4o-mini-transcribe`, `gpt-4o-mini-transcribe-2025-12-15`, `gpt-4o-transcribe`, `gpt-4o-transcribe-diarize`, and `gpt-realtime-whisper`. Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.\n"
20918
21263
  ),
20919
21264
  language: zod6.string().optional().describe(
20920
21265
  "The language of the input audio. Supplying the input language in\n[ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) format\nwill improve accuracy and latency.\n"
20921
21266
  ),
20922
21267
  prompt: zod6.string().optional().describe(
20923
- 'An optional text to guide the model\'s style or continue a previous audio\nsegment.\nFor `whisper-1`, the [prompt is a list of keywords](/docs/guides/speech-to-text#prompting).\nFor `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the prompt is a free text string, for example "expect words related to technology".\n'
21268
+ 'An optional text to guide the model\'s style or continue a previous audio\nsegment.\nFor `whisper-1`, the [prompt is a list of keywords](/docs/guides/speech-to-text#prompting).\nFor `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the prompt is a free text string, for example "expect words related to technology".\nPrompt is not supported with `gpt-realtime-whisper` in GA Realtime sessions.\n'
21269
+ ),
21270
+ delay: zod6.enum(["minimal", "low", "medium", "high", "xhigh"]).optional().describe(
21271
+ "Controls how long the model waits before emitting transcription text.\nHigher values can improve transcription accuracy at the cost of latency.\nOnly supported with `gpt-realtime-whisper` in GA Realtime sessions.\n"
20924
21272
  )
20925
21273
  }).optional(),
20926
21274
  include: zod6.array(zod6.enum(["item.input_audio_transcription.logprobs"])).optional().describe(
@@ -20949,17 +21297,14 @@ var createRealtimeTranscriptionSessionResponse = zod6.object({
20949
21297
  "gpt-4o-mini-transcribe",
20950
21298
  "gpt-4o-mini-transcribe-2025-12-15",
20951
21299
  "gpt-4o-transcribe",
20952
- "gpt-4o-transcribe-diarize"
21300
+ "gpt-4o-transcribe-diarize",
21301
+ "gpt-realtime-whisper"
20953
21302
  ])
20954
21303
  ).optional().describe(
20955
- "The model to use for transcription. Current options are `whisper-1`, `gpt-4o-mini-transcribe`, `gpt-4o-mini-transcribe-2025-12-15`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`. Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels.\n"
20956
- ),
20957
- language: zod6.string().optional().describe(
20958
- "The language of the input audio. Supplying the input language in\n[ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) format\nwill improve accuracy and latency.\n"
21304
+ "The model used for transcription. Current options are `whisper-1`, `gpt-4o-mini-transcribe`, `gpt-4o-mini-transcribe-2025-12-15`, `gpt-4o-transcribe`, `gpt-4o-transcribe-diarize`, and `gpt-realtime-whisper`.\n"
20959
21305
  ),
20960
- prompt: zod6.string().optional().describe(
20961
- 'An optional text to guide the model\'s style or continue a previous audio\nsegment.\nFor `whisper-1`, the [prompt is a list of keywords](/docs/guides/speech-to-text#prompting).\nFor `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the prompt is a free text string, for example "expect words related to technology".\n'
20962
- )
21306
+ language: zod6.string().optional().describe("The language of the input audio.\n"),
21307
+ prompt: zod6.string().optional().describe("The prompt configured for input audio transcription, when present.\n")
20963
21308
  }).optional(),
20964
21309
  turn_detection: zod6.object({
20965
21310
  type: zod6.string().optional().describe("Type of turn detection, only `server_vad` is currently supported.\n"),
@@ -24572,11 +24917,73 @@ var getModelsResponse = zod10.object({
24572
24917
  })
24573
24918
  ).describe("List of available models and their attributes.")
24574
24919
  });
24920
+ var getTtsModelsResponse = zod10.object({
24921
+ models: zod10.array(
24922
+ zod10.object({
24923
+ id: zod10.string().describe("Unique identifier of the model."),
24924
+ aliased_model_id: zod10.string().or(zod10.null()).describe("If this is an alias, the id of the aliased model."),
24925
+ name: zod10.string().describe("Name of the model."),
24926
+ voices: zod10.array(
24927
+ zod10.object({
24928
+ id: zod10.string().describe("Unique identifier of the voice."),
24929
+ description: zod10.string().describe("Description of the TTS voice."),
24930
+ gender: zod10.enum(["male", "female", "neutral"])
24931
+ })
24932
+ ).describe("List of available voices for this model."),
24933
+ languages: zod10.array(
24934
+ zod10.object({
24935
+ code: zod10.string().describe("2-letter language code."),
24936
+ name: zod10.string().describe("Language name.")
24937
+ })
24938
+ ).describe("List of languages supported by the model.")
24939
+ })
24940
+ ).describe("List of available TTS models and their attributes.")
24941
+ });
24942
+ var getUsageLogsQueryLimitDefault = 1e3;
24943
+ var getUsageLogsQueryLimitMax = 1e3;
24944
+ var getUsageLogsQuerySortDefault = "end_time_asc";
24945
+ var getUsageLogsQueryParams = zod10.object({
24946
+ start_time: zod10.string().describe("Start of the time window (inclusive). Filters by request end time."),
24947
+ end_time: zod10.string().describe("End of the time window (exclusive). Filters by request end time."),
24948
+ limit: zod10.number().min(1).max(getUsageLogsQueryLimitMax).default(getUsageLogsQueryLimitDefault).describe("Maximum number of usage log entries to return."),
24949
+ sort: zod10.enum(["end_time_asc", "end_time_desc"]).default(getUsageLogsQuerySortDefault).describe(
24950
+ "Sort order by end_time.Use `end_time_desc` to get the most recent entries first. When paginating, pass the same `sort` value alongside the cursor."
24951
+ ),
24952
+ cursor: zod10.string().or(zod10.null()).optional().describe("Pagination cursor for the next page of results.")
24953
+ });
24954
+ var getUsageLogsResponse = zod10.object({
24955
+ usage_logs: zod10.array(
24956
+ zod10.object({
24957
+ uuid: zod10.string().uuid().describe("Unique identifier of the request."),
24958
+ request_scope: zod10.string().describe("Scope of the request (api / playground)."),
24959
+ client_reference_id: zod10.string().describe("Client reference ID supplied on the original request. Empty string if none."),
24960
+ model: zod10.string().describe("Model identifier."),
24961
+ start_time: zod10.string().datetime({}).describe("When the request started."),
24962
+ end_time: zod10.string().datetime({}).describe("When the request ended."),
24963
+ input_text_tokens: zod10.number(),
24964
+ input_audio_tokens: zod10.number(),
24965
+ input_audio_duration_ms: zod10.number(),
24966
+ output_text_tokens: zod10.number(),
24967
+ output_audio_tokens: zod10.number(),
24968
+ output_audio_duration_ms: zod10.number(),
24969
+ cost_usd: zod10.string(),
24970
+ input_cost_usd: zod10.string(),
24971
+ input_text_cost_usd: zod10.string(),
24972
+ input_audio_cost_usd: zod10.string(),
24973
+ output_cost_usd: zod10.string(),
24974
+ output_text_cost_usd: zod10.string(),
24975
+ output_audio_cost_usd: zod10.string()
24976
+ })
24977
+ ).describe("Per-request usage log entries ordered by end_time, uuid (per `sort`)."),
24978
+ next_page_cursor: zod10.string().or(zod10.null()).optional().describe(
24979
+ "A pagination token that references the next page of results. When more data is available, this field contains a value to pass in the cursor parameter of a subsequent request. When null, no additional results are available."
24980
+ )
24981
+ });
24575
24982
  var createTemporaryApiKeyBodyExpiresInSecondsMax = 3600;
24576
24983
  var createTemporaryApiKeyBodyClientReferenceIdMaxOne = 256;
24577
24984
  var createTemporaryApiKeyBodyMaxSessionDurationSecondsMaxOne = 18e3;
24578
24985
  var createTemporaryApiKeyBody = zod10.object({
24579
- usage_type: zod10.enum(["transcribe_websocket"]),
24986
+ usage_type: zod10.enum(["transcribe_websocket", "tts_rt"]),
24580
24987
  expires_in_seconds: zod10.number().min(1).max(createTemporaryApiKeyBodyExpiresInSecondsMax).describe("Duration in seconds until the temporary API key expires."),
24581
24988
  client_reference_id: zod10.string().max(createTemporaryApiKeyBodyClientReferenceIdMaxOne).or(zod10.null()).optional().describe("Optional tracking identifier string. Does not need to be unique."),
24582
24989
  single_use: zod10.boolean().or(zod10.null()).optional().describe("If true, the temporary API key can be used only once."),
@@ -24584,6 +24991,28 @@ var createTemporaryApiKeyBody = zod10.object({
24584
24991
  "Maximum WebSocket connection duration in seconds. If exceeded, the connection will be dropped. If not set, no limit is applied."
24585
24992
  )
24586
24993
  });
24994
+ var getConcurrencyLimitsResponse = zod10.object({
24995
+ project: zod10.object({
24996
+ current: zod10.object({
24997
+ transcribe_concurrent: zod10.number(),
24998
+ tts_concurrent: zod10.number()
24999
+ }).describe("Live counts read from Redis"),
25000
+ limits: zod10.object({
25001
+ transcribe_concurrent: zod10.number().or(zod10.null()),
25002
+ tts_concurrent: zod10.number().or(zod10.null())
25003
+ }).describe("Configured limits")
25004
+ }),
25005
+ organization: zod10.object({
25006
+ current: zod10.object({
25007
+ transcribe_concurrent: zod10.number(),
25008
+ tts_concurrent: zod10.number()
25009
+ }).describe("Live counts read from Redis"),
25010
+ limits: zod10.object({
25011
+ transcribe_concurrent: zod10.number().or(zod10.null()),
25012
+ tts_concurrent: zod10.number().or(zod10.null())
25013
+ }).describe("Configured limits")
25014
+ })
25015
+ });
24587
25016
 
24588
25017
  // src/generated/soniox/streaming-types.zod.ts
24589
25018
  import { z as zod11 } from "zod";
@@ -24649,10 +25078,10 @@ var sonioxStructuredContextSchema = zod11.object({
24649
25078
  var sonioxContextSchema = zod11.union([sonioxStructuredContextSchema, zod11.string()]);
24650
25079
  var sonioxRealtimeModelSchema = zod11.enum([
24651
25080
  "stt-rt-v4",
24652
- "stt-rt-v3",
24653
25081
  "stt-rt-preview",
24654
25082
  "stt-rt-v3-preview",
24655
- "stt-rt-preview-v2"
25083
+ "stt-rt-preview-v2",
25084
+ "stt-rt-v3"
24656
25085
  ]);
24657
25086
  var streamingTranscriberParams3 = zod11.object({
24658
25087
  model: sonioxRealtimeModelSchema,
@@ -24660,12 +25089,16 @@ var streamingTranscriberParams3 = zod11.object({
24660
25089
  sampleRate: zod11.number().optional(),
24661
25090
  numChannels: zod11.number().optional(),
24662
25091
  languageHints: zod11.array(zod11.string()).optional(),
25092
+ languageHintsStrict: zod11.boolean().optional(),
24663
25093
  context: sonioxContextSchema.optional(),
24664
25094
  enableSpeakerDiarization: zod11.boolean().optional(),
24665
25095
  enableLanguageIdentification: zod11.boolean().optional(),
24666
25096
  enableEndpointDetection: zod11.boolean().optional(),
25097
+ maxEndpointDelayMs: zod11.number().optional(),
24667
25098
  translation: sonioxTranslationConfigSchema.optional(),
24668
- clientReferenceId: zod11.string().optional()
25099
+ clientReferenceId: zod11.string().optional(),
25100
+ keepaliveIntervalMs: zod11.number().optional(),
25101
+ connectTimeoutMs: zod11.number().optional()
24669
25102
  });
24670
25103
  var sonioxTranslationStatusSchema = zod11.enum(["original", "translation", "none"]);
24671
25104
  var sonioxTokenSchema = zod11.object({