voice-router-dev 0.8.0 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -84,6 +84,13 @@ __export(src_exports, {
84
84
  DeepgramTranscriptionSchema: () => DeepgramTranscriptionSchema,
85
85
  DeepgramTypes: () => schema_exports4,
86
86
  DeepgramZodSchemas: () => deepgramAPISpecification_zod_exports,
87
+ ElevenLabsAdapter: () => ElevenLabsAdapter,
88
+ ElevenLabsCapabilities: () => ElevenLabsCapabilities,
89
+ ElevenLabsLanguageCodes: () => ElevenLabsLanguageCodes,
90
+ ElevenLabsLanguageLabels: () => ElevenLabsLanguageLabels,
91
+ ElevenLabsLanguages: () => ElevenLabsLanguages,
92
+ ElevenLabsTypes: () => schema_exports8,
93
+ ElevenLabsZodSchemas: () => elevenLabsSpeechToTextAPI_zod_exports,
87
94
  GladiaAdapter: () => GladiaAdapter,
88
95
  GladiaBitDepth: () => GladiaBitDepth,
89
96
  GladiaCapabilities: () => GladiaCapabilities,
@@ -170,6 +177,7 @@ __export(src_exports, {
170
177
  createAssemblyAIAdapter: () => createAssemblyAIAdapter,
171
178
  createAzureSTTAdapter: () => createAzureSTTAdapter,
172
179
  createDeepgramAdapter: () => createDeepgramAdapter,
180
+ createElevenLabsAdapter: () => createElevenLabsAdapter,
173
181
  createGladiaAdapter: () => createGladiaAdapter,
174
182
  createOpenAIWhisperAdapter: () => createOpenAIWhisperAdapter,
175
183
  createSonioxAdapter: () => createSonioxAdapter,
@@ -969,6 +977,7 @@ var SonioxLanguage = {
969
977
 
970
978
  // src/generated/soniox/models.ts
971
979
  var SonioxModels = [
980
+ { id: "stt-rt-v4", name: "Speech-to-Text Real-time v4", mode: "real_time" },
972
981
  { id: "stt-rt-v3", name: "Speech-to-Text Real-time v3", mode: "real_time" },
973
982
  { id: "stt-async-v4", name: "Speech-to-Text Async v4", mode: "async" },
974
983
  { id: "stt-async-v3", name: "Speech-to-Text Async v3", mode: "async" },
@@ -979,6 +988,7 @@ var SonioxModels = [
979
988
  { id: "stt-async-preview-v1", name: "Speech-to-Text Async Preview v1", mode: "async", aliasOf: "stt-async-v3" }
980
989
  ];
981
990
  var SonioxModelCodes = [
991
+ "stt-rt-v4",
982
992
  "stt-rt-v3",
983
993
  "stt-async-v4",
984
994
  "stt-async-v3",
@@ -989,6 +999,7 @@ var SonioxModelCodes = [
989
999
  "stt-async-preview-v1"
990
1000
  ];
991
1001
  var SonioxModelLabels = {
1002
+ "stt-rt-v4": "Speech-to-Text Real-time v4",
992
1003
  "stt-rt-v3": "Speech-to-Text Real-time v3",
993
1004
  "stt-async-v4": "Speech-to-Text Async v4",
994
1005
  "stt-async-v3": "Speech-to-Text Async v3",
@@ -999,6 +1010,7 @@ var SonioxModelLabels = {
999
1010
  "stt-async-preview-v1": "Speech-to-Text Async Preview v1"
1000
1011
  };
1001
1012
  var SonioxModel = {
1013
+ stt_rt_v4: "stt-rt-v4",
1002
1014
  stt_rt_v3: "stt-rt-v3",
1003
1015
  stt_async_v4: "stt-async-v4",
1004
1016
  stt_async_v3: "stt-async-v3",
@@ -1009,6 +1021,7 @@ var SonioxModel = {
1009
1021
  stt_async_preview_v1: "stt-async-preview-v1"
1010
1022
  };
1011
1023
  var SonioxRealtimeModel = {
1024
+ stt_rt_v4: "stt-rt-v4",
1012
1025
  stt_rt_v3: "stt-rt-v3",
1013
1026
  stt_rt_preview: "stt-rt-preview",
1014
1027
  stt_rt_v3_preview: "stt-rt-v3-preview",
@@ -1025,6 +1038,7 @@ var SonioxAsyncModel = {
1025
1038
  var SpeechmaticsLanguages = [
1026
1039
  { code: "auto", name: "Automatic Detection" },
1027
1040
  { code: "ar", name: "Arabic" },
1041
+ { code: "ar_en", name: "Arabic / English" },
1028
1042
  { code: "ba", name: "Bashkir" },
1029
1043
  { code: "be", name: "Belarusian" },
1030
1044
  { code: "bg", name: "Bulgarian" },
@@ -1089,6 +1103,7 @@ var SpeechmaticsLanguages = [
1089
1103
  var SpeechmaticsLanguageCodes = [
1090
1104
  "auto",
1091
1105
  "ar",
1106
+ "ar_en",
1092
1107
  "ba",
1093
1108
  "be",
1094
1109
  "bg",
@@ -1153,6 +1168,7 @@ var SpeechmaticsLanguageCodes = [
1153
1168
  var SpeechmaticsLanguageLabels = {
1154
1169
  "auto": "Automatic Detection",
1155
1170
  "ar": "Arabic",
1171
+ "ar_en": "Arabic / English",
1156
1172
  "ba": "Bashkir",
1157
1173
  "be": "Belarusian",
1158
1174
  "bg": "Bulgarian",
@@ -1217,6 +1233,7 @@ var SpeechmaticsLanguageLabels = {
1217
1233
  var SpeechmaticsLanguage = {
1218
1234
  "auto": "auto",
1219
1235
  "ar": "ar",
1236
+ "ar_en": "ar_en",
1220
1237
  "ba": "ba",
1221
1238
  "be": "be",
1222
1239
  "bg": "bg",
@@ -1416,7 +1433,9 @@ var AzureLocales = [
1416
1433
  { code: "sl-SI", name: "Slovenian (Slovenia)" },
1417
1434
  { code: "so-SO", name: "Somali (Somalia)" },
1418
1435
  { code: "sq-AL", name: "Albanian (Albania)" },
1436
+ { code: "sr-ME", name: "Serbian (ME)" },
1419
1437
  { code: "sr-RS", name: "Serbian (Serbia)" },
1438
+ { code: "sr-XK", name: "Serbian (XK)" },
1420
1439
  { code: "su-ID", name: "Sundanese (Indonesia)" },
1421
1440
  { code: "sv-SE", name: "Swedish (Sweden)" },
1422
1441
  { code: "sw-KE", name: "Swahili (Kenya)" },
@@ -1571,7 +1590,9 @@ var AzureLocaleCodes = [
1571
1590
  "sl-SI",
1572
1591
  "so-SO",
1573
1592
  "sq-AL",
1593
+ "sr-ME",
1574
1594
  "sr-RS",
1595
+ "sr-XK",
1575
1596
  "su-ID",
1576
1597
  "sv-SE",
1577
1598
  "sw-KE",
@@ -1726,7 +1747,9 @@ var AzureLocaleLabels = {
1726
1747
  "sl-SI": "Slovenian (Slovenia)",
1727
1748
  "so-SO": "Somali (Somalia)",
1728
1749
  "sq-AL": "Albanian (Albania)",
1750
+ "sr-ME": "Serbian (ME)",
1729
1751
  "sr-RS": "Serbian (Serbia)",
1752
+ "sr-XK": "Serbian (XK)",
1730
1753
  "su-ID": "Sundanese (Indonesia)",
1731
1754
  "sv-SE": "Swedish (Sweden)",
1732
1755
  "sw-KE": "Swahili (Kenya)",
@@ -1881,7 +1904,9 @@ var AzureLocale = {
1881
1904
  "sl-SI": "sl-SI",
1882
1905
  "so-SO": "so-SO",
1883
1906
  "sq-AL": "sq-AL",
1907
+ "sr-ME": "sr-ME",
1884
1908
  "sr-RS": "sr-RS",
1909
+ "sr-XK": "sr-XK",
1885
1910
  "su-ID": "su-ID",
1886
1911
  "sv-SE": "sv-SE",
1887
1912
  "sw-KE": "sw-KE",
@@ -1907,6 +1932,311 @@ var AzureLocale = {
1907
1932
  "zu-ZA": "zu-ZA"
1908
1933
  };
1909
1934
 
1935
+ // src/generated/elevenlabs/languages.ts
1936
/**
 * ElevenLabs language table (from src/generated/elevenlabs/languages.ts).
 * Each entry is `{ code, name }`; the order of this list is the order used by
 * the derived code list and label map below.
 */
var ElevenLabsLanguages = [
  ["en", "English"],
  ["zh", "Chinese"],
  ["de", "German"],
  ["es", "Spanish"],
  ["ru", "Russian"],
  ["ko", "Korean"],
  ["fr", "French"],
  ["ja", "Japanese"],
  ["pt", "Portuguese"],
  ["tr", "Turkish"],
  ["pl", "Polish"],
  ["ca", "Catalan"],
  ["nl", "Dutch"],
  ["ar", "Arabic"],
  ["sv", "Swedish"],
  ["it", "Italian"],
  ["id", "Indonesian"],
  ["hi", "Hindi"],
  ["fi", "Finnish"],
  ["vi", "Vietnamese"],
  ["he", "Hebrew"],
  ["uk", "Ukrainian"],
  ["el", "Greek"],
  ["ms", "Malay"],
  ["cs", "Czech"],
  ["ro", "Romanian"],
  ["da", "Danish"],
  ["hu", "Hungarian"],
  ["ta", "Tamil"],
  ["no", "Norwegian"],
  ["th", "Thai"],
  ["ur", "Urdu"],
  ["hr", "Croatian"],
  ["bg", "Bulgarian"],
  ["lt", "Lithuanian"],
  ["la", "Latin"],
  ["mi", "Maori"],
  ["ml", "Malayalam"],
  ["cy", "Welsh"],
  ["sk", "Slovak"],
  ["te", "Telugu"],
  ["fa", "Persian"],
  ["lv", "Latvian"],
  ["bn", "Bengali"],
  ["sr", "Serbian"],
  ["az", "Azerbaijani"],
  ["sl", "Slovenian"],
  ["kn", "Kannada"],
  ["et", "Estonian"],
  ["mk", "Macedonian"],
  ["br", "Breton"],
  ["eu", "Basque"],
  ["is", "Icelandic"],
  ["hy", "Armenian"],
  ["ne", "Nepali"],
  ["mn", "Mongolian"],
  ["bs", "Bosnian"],
  ["kk", "Kazakh"],
  ["sq", "Albanian"],
  ["sw", "Swahili"],
  ["gl", "Galician"],
  ["mr", "Marathi"],
  ["pa", "Punjabi"],
  ["si", "Sinhala"],
  ["km", "Khmer"],
  ["sn", "Shona"],
  ["yo", "Yoruba"],
  ["so", "Somali"],
  ["af", "Afrikaans"],
  ["oc", "Occitan"],
  ["ka", "Georgian"],
  ["be", "Belarusian"],
  ["tg", "Tajik"],
  ["sd", "Sindhi"],
  ["gu", "Gujarati"],
  ["am", "Amharic"],
  ["yi", "Yiddish"],
  ["lo", "Lao"],
  ["uz", "Uzbek"],
  ["fo", "Faroese"],
  ["ht", "Haitian Creole"],
  ["ps", "Pashto"],
  ["tk", "Turkmen"],
  ["nn", "Norwegian Nynorsk"],
  ["mt", "Maltese"],
  ["sa", "Sanskrit"],
  ["lb", "Luxembourgish"],
  ["my", "Burmese"],
  ["bo", "Tibetan"],
  ["tl", "Tagalog"],
  ["mg", "Malagasy"],
  ["as", "Assamese"],
  ["tt", "Tatar"],
  ["haw", "Hawaiian"],
  ["ln", "Lingala"],
  ["ha", "Hausa"],
  ["ba", "Bashkir"],
  ["jw", "Javanese"],
  ["su", "Sundanese"]
].map(([code, name]) => ({ code, name }));

/** Language codes only, in the same order as `ElevenLabsLanguages`. */
var ElevenLabsLanguageCodes = ElevenLabsLanguages.map((language) => language.code);

/** Lookup from language code to its display name. */
var ElevenLabsLanguageLabels = Object.fromEntries(
  ElevenLabsLanguages.map((language) => [language.code, language.name])
);
2239
+
1910
2240
  // src/generated/gladia/schema/streamingSupportedBitDepthEnum.ts
1911
2241
  var StreamingSupportedBitDepthEnum = {
1912
2242
  NUMBER_8: 8,
@@ -2448,6 +2778,16 @@ var DeepgramSampleRate = {
2448
2778
  NUMBER_44100: 44100,
2449
2779
  NUMBER_48000: 48e3
2450
2780
  };
2781
/**
 * ElevenLabs API region identifiers.
 * Every key maps to the identical string value, so members can be used
 * interchangeably with the raw region strings.
 */
var ElevenLabsRegion = {
  "global": "global", // Global endpoint (default)
  "us": "us", // United States
  "eu": "eu", // European Union
  "in": "in" // India
};
2451
2791
  var GladiaEncoding = StreamingSupportedEncodingEnum;
2452
2792
  var GladiaSampleRate = StreamingSupportedSampleRateEnum;
2453
2793
  var GladiaBitDepth = StreamingSupportedBitDepthEnum;
@@ -2866,6 +3206,70 @@ function extractWords(words, mapper) {
2866
3206
  const normalizedWords = words.map(mapper);
2867
3207
  return normalizedWords.length > 0 ? normalizedWords : void 0;
2868
3208
  }
3209
/**
 * Group a flat list of words into speaker-attributed utterances.
 *
 * Words are consumed in the order given. A new utterance begins whenever a
 * speaker-tagged word has a different speaker from the previous speaker-tagged
 * word. Words that carry no speaker are skipped: they are not added to any
 * utterance and they do not close the utterance in progress.
 *
 * @param {Array<{word: string, start: number, end: number, speaker?: (string|number)}>} words
 *   Words to group; each utterance's `text` joins its `word` values with a space.
 * @returns {Array<{text: string, start: number, end: number, speaker: (string|number), words: Array}>}
 *   One utterance per run of same-speaker words; empty when no word has a speaker.
 */
function buildUtterancesFromWords(words) {
  const utterances = [];
  let currentSpeaker;
  let currentWords = [];

  // A plain truthiness test would discard a numeric speaker id of 0, so 0 is
  // accepted explicitly; undefined, null and "" still mean "no speaker".
  const hasSpeaker = (speaker) => Boolean(speaker) || speaker === 0;

  // Emit the utterance accumulated so far, if there is one.
  const flush = () => {
    if (!hasSpeaker(currentSpeaker) || currentWords.length === 0) return;
    utterances.push({
      text: currentWords.map((w) => w.word).join(" "),
      start: currentWords[0].start,
      end: currentWords[currentWords.length - 1].end,
      speaker: currentSpeaker,
      words: currentWords
    });
  };

  for (const word of words) {
    if (!hasSpeaker(word.speaker)) continue;
    if (word.speaker !== currentSpeaker) {
      flush();
      currentSpeaker = word.speaker;
      currentWords = [word];
    } else {
      currentWords.push(word);
    }
  }
  flush();
  return utterances;
}
3244
/**
 * Assemble display text from Speechmatics result items, honouring the
 * punctuation `attaches_to` hint so that no stray spaces appear around
 * punctuation marks.
 *
 * Only items of type "word" or "punctuation" that have a non-empty first
 * alternative contribute to the text. Spacing rules:
 *  - punctuation attaching to "previous" or "both" is appended with no
 *    leading space;
 *  - punctuation attaching to "next" or "both" suppresses the space before
 *    the following item;
 *  - everything else is separated from the preceding text by one space.
 *
 * @param {Array<{type: string, attaches_to?: string, alternatives?: Array<{content?: string}>}>} results
 *   Result items in transcript order.
 * @returns {string} The joined transcript text ("" when nothing contributes).
 */
function buildTextFromSpeechmaticsResults(results) {
  const parts = [];
  // True when the previously emitted item glues itself to whatever follows.
  let attachNext = false;
  for (const result of results) {
    const isPunctuation = result.type === "punctuation";
    if (!isPunctuation && result.type !== "word") continue;
    const content = result.alternatives?.[0]?.content;
    if (!content) continue;
    // Words never carry an attachment hint; only punctuation does.
    const attaches = isPunctuation ? result.attaches_to : void 0;
    const attachesToPrevious = attaches === "previous" || attaches === "both";
    // A pending attachNext applies to every kind of item, including
    // next-attaching punctuation, so `(` followed by `"` stays `("`.
    if (!attachesToPrevious && !attachNext && parts.length > 0) parts.push(" ");
    parts.push(content);
    attachNext = attaches === "next" || attaches === "both";
  }
  return parts.join("");
}
2869
3273
  var STATUS_MAPPINGS = {
2870
3274
  gladia: {
2871
3275
  queued: "queued",
@@ -6482,7 +6886,7 @@ var DeepgramAdapter = class extends BaseAdapter {
6482
6886
  start: w.start || 0,
6483
6887
  end: w.end || 0,
6484
6888
  confidence: w.confidence
6485
- }))
6889
+ })) ?? []
6486
6890
  }));
6487
6891
  }
6488
6892
  /**
@@ -6891,7 +7295,7 @@ var DeepgramAdapter = class extends BaseAdapter {
6891
7295
  start: w.start,
6892
7296
  end: w.end,
6893
7297
  confidence: w.confidence
6894
- }))
7298
+ })) ?? []
6895
7299
  });
6896
7300
  }
6897
7301
  break;
@@ -8130,7 +8534,8 @@ var OpenAIWhisperAdapter = class extends BaseAdapter {
8130
8534
  callbacks?.onUtterance?.({
8131
8535
  text: transcription.transcript,
8132
8536
  start: 0,
8133
- end: 0
8537
+ end: 0,
8538
+ words: []
8134
8539
  });
8135
8540
  break;
8136
8541
  }
@@ -8193,7 +8598,8 @@ var OpenAIWhisperAdapter = class extends BaseAdapter {
8193
8598
  text: segment.text,
8194
8599
  start: segment.start,
8195
8600
  end: segment.end,
8196
- confidence: void 0
8601
+ confidence: void 0,
8602
+ words: []
8197
8603
  }));
8198
8604
  const requestId2 = `openai-${Date.now()}`;
8199
8605
  return {
@@ -8559,7 +8965,7 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
8559
8965
  * Normalize Speechmatics response to unified format
8560
8966
  */
8561
8967
  normalizeResponse(response) {
8562
- const text = response.results.filter((r) => r.type === "word" && r.alternatives).map((r) => r.alternatives[0]?.content || "").join(" ");
8968
+ const text = buildTextFromSpeechmaticsResults(response.results);
8563
8969
  const words = response.results.filter((r) => r.type === "word" && r.start_time !== void 0 && r.end_time !== void 0).map((result) => ({
8564
8970
  word: result.alternatives?.[0]?.content || "",
8565
8971
  start: result.start_time,
@@ -8568,51 +8974,14 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
8568
8974
  speaker: result.alternatives?.[0]?.speaker
8569
8975
  }));
8570
8976
  const speakerSet = /* @__PURE__ */ new Set();
8571
- response.results.forEach((r) => {
8572
- if (r.alternatives) {
8573
- const speaker = r.alternatives[0]?.speaker;
8574
- if (speaker) speakerSet.add(speaker);
8575
- }
8977
+ words.forEach((w) => {
8978
+ if (w.speaker) speakerSet.add(w.speaker);
8576
8979
  });
8577
8980
  const speakers = speakerSet.size > 0 ? Array.from(speakerSet).map((id) => ({
8578
8981
  id,
8579
8982
  label: `Speaker ${id}`
8580
8983
  })) : void 0;
8581
- const utterances = [];
8582
- if (speakers) {
8583
- let currentSpeaker;
8584
- let currentUtterance = [];
8585
- let utteranceStart = 0;
8586
- response.results.filter((r) => r.type === "word" && r.alternatives).forEach((result, idx) => {
8587
- const speaker = result.alternatives[0]?.speaker;
8588
- const word = result.alternatives[0]?.content || "";
8589
- if (speaker !== currentSpeaker) {
8590
- if (currentSpeaker && currentUtterance.length > 0) {
8591
- const prevResult = response.results.filter((r) => r.type === "word")[idx - 1];
8592
- utterances.push({
8593
- speaker: currentSpeaker,
8594
- text: currentUtterance.join(" "),
8595
- start: utteranceStart || 0,
8596
- end: prevResult?.end_time || result.start_time || 0
8597
- });
8598
- }
8599
- currentSpeaker = speaker;
8600
- currentUtterance = [word];
8601
- utteranceStart = result.start_time || 0;
8602
- } else {
8603
- currentUtterance.push(word);
8604
- }
8605
- });
8606
- if (currentSpeaker && currentUtterance.length > 0) {
8607
- const lastWord = response.results.filter((r) => r.type === "word").pop();
8608
- utterances.push({
8609
- speaker: currentSpeaker,
8610
- text: currentUtterance.join(" "),
8611
- start: utteranceStart,
8612
- end: lastWord?.end_time || utteranceStart
8613
- });
8614
- }
8615
- }
8984
+ const utterances = buildUtterancesFromWords(words);
8616
8985
  return {
8617
8986
  success: true,
8618
8987
  provider: this.name,
@@ -9134,45 +9503,14 @@ var SonioxAdapter = class extends BaseAdapter {
9134
9503
  * Build utterances from tokens based on speaker changes
9135
9504
  */
9136
9505
  buildUtterancesFromTokens(tokens) {
9137
- const utterances = [];
9138
- let currentSpeaker;
9139
- let currentWords = [];
9140
- let utteranceStart = 0;
9141
- for (const token of tokens) {
9142
- const word = {
9143
- word: token.text,
9144
- start: token.start_ms ? token.start_ms / 1e3 : 0,
9145
- end: token.end_ms ? token.end_ms / 1e3 : 0,
9146
- confidence: token.confidence,
9147
- speaker: token.speaker
9148
- };
9149
- if (token.speaker !== currentSpeaker) {
9150
- if (currentSpeaker && currentWords.length > 0) {
9151
- utterances.push({
9152
- text: currentWords.map((w) => w.word).join(" "),
9153
- start: utteranceStart,
9154
- end: currentWords[currentWords.length - 1].end,
9155
- speaker: currentSpeaker,
9156
- words: currentWords
9157
- });
9158
- }
9159
- currentSpeaker = token.speaker;
9160
- currentWords = [word];
9161
- utteranceStart = word.start;
9162
- } else {
9163
- currentWords.push(word);
9164
- }
9165
- }
9166
- if (currentSpeaker && currentWords.length > 0) {
9167
- utterances.push({
9168
- text: currentWords.map((w) => w.word).join(" "),
9169
- start: utteranceStart,
9170
- end: currentWords[currentWords.length - 1].end,
9171
- speaker: currentSpeaker,
9172
- words: currentWords
9173
- });
9174
- }
9175
- return utterances;
9506
+ const words = tokens.map((token) => ({
9507
+ word: token.text,
9508
+ start: token.start_ms ? token.start_ms / 1e3 : 0,
9509
+ end: token.end_ms ? token.end_ms / 1e3 : 0,
9510
+ confidence: token.confidence,
9511
+ speaker: token.speaker
9512
+ }));
9513
+ return buildUtterancesFromWords(words);
9176
9514
  }
9177
9515
  /**
9178
9516
  * Normalize Soniox response to unified format
@@ -9196,7 +9534,7 @@ var SonioxAdapter = class extends BaseAdapter {
9196
9534
  id,
9197
9535
  label: `Speaker ${id}`
9198
9536
  })) : void 0;
9199
- const utterances = response.tokens ? this.buildUtterancesFromTokens(response.tokens) : [];
9537
+ const utterances = response.tokens ? this.buildUtterancesFromTokens(response.tokens.filter((t) => t.is_final)) : [];
9200
9538
  const language = response.tokens?.find((t) => t.language)?.language;
9201
9539
  return {
9202
9540
  success: true,
@@ -9225,6 +9563,501 @@ function createSonioxAdapter(config) {
9225
9563
  return adapter;
9226
9564
  }
9227
9565
 
9566
+ // src/adapters/elevenlabs-adapter.ts
9567
+ var import_axios10 = __toESM(require("axios"));
9568
+ var ElevenLabsAdapter = class extends BaseAdapter {
9569
+ constructor() {
9570
+ super(...arguments);
9571
+ this.name = "elevenlabs";
9572
+ this.capabilities = {
9573
+ streaming: true,
9574
+ diarization: true,
9575
+ wordTimestamps: true,
9576
+ languageDetection: true,
9577
+ customVocabulary: true,
9578
+ summarization: false,
9579
+ sentimentAnalysis: false,
9580
+ entityDetection: true,
9581
+ piiRedaction: true,
9582
+ listTranscripts: false,
9583
+ deleteTranscript: false
9584
+ };
9585
+ this.region = ElevenLabsRegion.global;
9586
+ this.defaultModel = "scribe_v2";
9587
+ }
9588
+ /**
9589
+ * Get regional API host based on configured region
9590
+ */
9591
+ getRegionalHost() {
9592
+ switch (this.region) {
9593
+ case ElevenLabsRegion.us:
9594
+ return "api.us.elevenlabs.io";
9595
+ case ElevenLabsRegion.eu:
9596
+ return "api.eu.residency.elevenlabs.io";
9597
+ case ElevenLabsRegion.in:
9598
+ return "api.in.residency.elevenlabs.io";
9599
+ case ElevenLabsRegion.global:
9600
+ default:
9601
+ return "api.elevenlabs.io";
9602
+ }
9603
+ }
9604
+ /**
9605
+ * Get the base URL for API requests
9606
+ */
9607
+ get baseUrl() {
9608
+ if (this.config?.baseUrl) return this.config.baseUrl;
9609
+ return `https://${this.getRegionalHost()}`;
9610
+ }
9611
+ initialize(config) {
9612
+ super.initialize(config);
9613
+ if (config.region) {
9614
+ this.region = config.region;
9615
+ }
9616
+ if (config.model) {
9617
+ this.defaultModel = config.model;
9618
+ }
9619
+ this.client = import_axios10.default.create({
9620
+ baseURL: this.baseUrl,
9621
+ timeout: config.timeout || 12e4,
9622
+ headers: {
9623
+ "xi-api-key": config.apiKey,
9624
+ ...config.headers
9625
+ }
9626
+ });
9627
+ }
9628
+ /**
9629
+ * Get current region
9630
+ */
9631
+ getRegion() {
9632
+ return this.region;
9633
+ }
9634
+ /**
9635
+ * Set regional endpoint
9636
+ */
9637
+ setRegion(region) {
9638
+ this.region = region;
9639
+ if (this.config?.apiKey) {
9640
+ this.client = import_axios10.default.create({
9641
+ baseURL: this.baseUrl,
9642
+ timeout: this.config.timeout || 12e4,
9643
+ headers: {
9644
+ "xi-api-key": this.config.apiKey,
9645
+ ...this.config.headers
9646
+ }
9647
+ });
9648
+ }
9649
+ }
9650
+ /**
9651
+ * Submit audio for transcription
9652
+ *
9653
+ * ElevenLabs batch is synchronous - the API returns the result directly.
9654
+ */
9655
+ async transcribe(audio, options) {
9656
+ this.validateConfig();
9657
+ try {
9658
+ const formData = new FormData();
9659
+ const modelId = options?.model || this.defaultModel;
9660
+ formData.append("model_id", modelId);
9661
+ if (audio.type === "url") {
9662
+ formData.append("cloud_storage_url", audio.url);
9663
+ } else if (audio.type === "file") {
9664
+ const audioBlob = audio.file instanceof Blob ? audio.file : new Blob([audio.file], { type: audio.mimeType || "audio/wav" });
9665
+ formData.append("file", audioBlob, audio.filename || "audio.wav");
9666
+ } else {
9667
+ return {
9668
+ success: false,
9669
+ provider: this.name,
9670
+ error: {
9671
+ code: "INVALID_INPUT",
9672
+ message: "ElevenLabs only supports URL and File audio input"
9673
+ }
9674
+ };
9675
+ }
9676
+ if (options?.language) {
9677
+ formData.append("language_code", options.language);
9678
+ }
9679
+ if (options?.diarization) {
9680
+ formData.append("diarize", "true");
9681
+ }
9682
+ formData.append("timestamps_granularity", "word");
9683
+ if (options?.speakersExpected) {
9684
+ formData.append("num_speakers", String(options.speakersExpected));
9685
+ }
9686
+ if (options?.customVocabulary && options.customVocabulary.length > 0) {
9687
+ for (const term of options.customVocabulary) {
9688
+ formData.append("keyterms", term);
9689
+ }
9690
+ }
9691
+ if (options?.entityDetection) {
9692
+ formData.append("entity_detection", "all");
9693
+ }
9694
+ const elevenlabsOpts = options?.elevenlabs;
9695
+ if (elevenlabsOpts) {
9696
+ for (const [key, value] of Object.entries(elevenlabsOpts)) {
9697
+ if (value === void 0 || value === null) continue;
9698
+ if (formData.has(key)) continue;
9699
+ if (typeof value === "boolean") {
9700
+ formData.append(key, String(value));
9701
+ } else if (Array.isArray(value)) {
9702
+ for (const item of value) {
9703
+ formData.append(key, typeof item === "object" ? JSON.stringify(item) : String(item));
9704
+ }
9705
+ } else if (typeof value === "object") {
9706
+ formData.append(key, JSON.stringify(value));
9707
+ } else {
9708
+ formData.append(key, String(value));
9709
+ }
9710
+ }
9711
+ }
9712
+ const response = await this.client.post("/v1/speech-to-text", formData, {
9713
+ headers: {
9714
+ "Content-Type": "multipart/form-data"
9715
+ }
9716
+ });
9717
+ return this.normalizeResponse(response.data);
9718
+ } catch (error) {
9719
+ return this.createErrorResponse(error);
9720
+ }
9721
+ }
9722
+ /**
9723
+ * Get transcription result by ID
9724
+ *
9725
+ * ElevenLabs batch is synchronous, but supports transcript retrieval.
9726
+ */
9727
+ async getTranscript(transcriptId) {
9728
+ this.validateConfig();
9729
+ try {
9730
+ const response = await this.client.get(`/v1/speech-to-text/transcripts/${transcriptId}`);
9731
+ return this.normalizeResponse(response.data);
9732
+ } catch (error) {
9733
+ return this.createErrorResponse(error);
9734
+ }
9735
+ }
9736
+ /**
9737
+ * Stream audio for real-time transcription
9738
+ *
9739
+ * Creates a WebSocket connection to ElevenLabs realtime STT endpoint.
9740
+ * Audio is sent as base64-encoded JSON messages.
9741
+ */
9742
+ async transcribeStream(options, callbacks) {
9743
+ this.validateConfig();
9744
+ const sessionId = `elevenlabs_${Date.now()}_${Math.random().toString(36).substring(7)}`;
9745
+ const createdAt = /* @__PURE__ */ new Date();
9746
+ const wsBase = this.config?.wsBaseUrl || (this.config?.baseUrl ? this.deriveWsUrl(this.config.baseUrl) : `wss://${this.getRegionalHost()}`);
9747
+ const wsUrl = new URL(`${wsBase}/v1/speech-to-text/realtime`);
9748
+ const elOpts = options?.elevenlabsStreaming;
9749
+ const modelId = elOpts?.model || "scribe_v2_realtime";
9750
+ wsUrl.searchParams.set("model_id", modelId);
9751
+ const audioFormat = elOpts?.audioFormat || "pcm_16000";
9752
+ wsUrl.searchParams.set("audio_format", audioFormat);
9753
+ const langCode = elOpts?.languageCode || options?.language;
9754
+ if (langCode) {
9755
+ wsUrl.searchParams.set("language_code", langCode);
9756
+ }
9757
+ if (elOpts?.includeTimestamps !== void 0) {
9758
+ wsUrl.searchParams.set("include_timestamps", String(elOpts.includeTimestamps));
9759
+ }
9760
+ if (elOpts?.includeLanguageDetection || options?.languageDetection) {
9761
+ wsUrl.searchParams.set("include_language_detection", "true");
9762
+ }
9763
+ if (elOpts?.commitStrategy) {
9764
+ wsUrl.searchParams.set("commit_strategy", elOpts.commitStrategy);
9765
+ }
9766
+ if (elOpts?.vadSilenceThresholdSecs !== void 0) {
9767
+ wsUrl.searchParams.set("vad_silence_threshold_secs", String(elOpts.vadSilenceThresholdSecs));
9768
+ }
9769
+ if (elOpts?.vadThreshold !== void 0) {
9770
+ wsUrl.searchParams.set("vad_threshold", String(elOpts.vadThreshold));
9771
+ }
9772
+ if (elOpts?.minSpeechDurationMs !== void 0) {
9773
+ wsUrl.searchParams.set("min_speech_duration_ms", String(elOpts.minSpeechDurationMs));
9774
+ }
9775
+ if (elOpts?.minSilenceDurationMs !== void 0) {
9776
+ wsUrl.searchParams.set("min_silence_duration_ms", String(elOpts.minSilenceDurationMs));
9777
+ }
9778
+ if (elOpts?.previousText) {
9779
+ wsUrl.searchParams.set("previous_text", elOpts.previousText);
9780
+ }
9781
+ if (!elOpts?.audioFormat && options?.encoding) {
9782
+ const encodingMap = {
9783
+ linear16: "pcm_16000",
9784
+ pcm: "pcm_16000",
9785
+ mulaw: "ulaw_8000"
9786
+ };
9787
+ const mappedFormat = encodingMap[options.encoding];
9788
+ if (mappedFormat) {
9789
+ wsUrl.searchParams.set("audio_format", mappedFormat);
9790
+ }
9791
+ }
9792
+ let status = "connecting";
9793
+ let openedAt = null;
9794
+ let receivedData = false;
9795
+ const WebSocketImpl = typeof WebSocket !== "undefined" ? WebSocket : require("ws");
9796
+ const ws = new WebSocketImpl(wsUrl.toString(), {
9797
+ headers: {
9798
+ "xi-api-key": this.config.apiKey
9799
+ }
9800
+ });
9801
+ ws.onopen = () => {
9802
+ status = "open";
9803
+ openedAt = Date.now();
9804
+ callbacks?.onOpen?.();
9805
+ };
9806
+ ws.onmessage = (event) => {
9807
+ receivedData = true;
9808
+ const rawPayload = typeof event.data === "string" ? event.data : event.data.toString();
9809
+ let messageType;
9810
+ try {
9811
+ const data = JSON.parse(rawPayload);
9812
+ if (data.error) {
9813
+ messageType = "error";
9814
+ } else if (data.message_type === "session_started") {
9815
+ messageType = "session_started";
9816
+ } else if (data.message_type === "partial_transcript") {
9817
+ messageType = "partial_transcript";
9818
+ } else if (data.message_type === "committed_transcript") {
9819
+ messageType = "committed_transcript";
9820
+ } else if (data.message_type === "committed_transcript_with_timestamps") {
9821
+ messageType = "committed_transcript_with_timestamps";
9822
+ }
9823
+ if (callbacks?.onRawMessage) {
9824
+ callbacks.onRawMessage({
9825
+ provider: this.name,
9826
+ direction: "incoming",
9827
+ timestamp: Date.now(),
9828
+ payload: rawPayload,
9829
+ messageType
9830
+ });
9831
+ }
9832
+ if (data.error) {
9833
+ callbacks?.onError?.({
9834
+ code: data.error_code?.toString() || "STREAM_ERROR",
9835
+ message: data.error
9836
+ });
9837
+ return;
9838
+ }
9839
+ if (data.message_type === "session_started") {
9840
+ return;
9841
+ }
9842
+ if (data.message_type === "partial_transcript") {
9843
+ const streamEvent = {
9844
+ type: "transcript",
9845
+ text: data.text || "",
9846
+ isFinal: false,
9847
+ confidence: void 0,
9848
+ language: data.language_code
9849
+ };
9850
+ callbacks?.onTranscript?.(streamEvent);
9851
+ return;
9852
+ }
9853
+ if (data.message_type === "committed_transcript" || data.message_type === "committed_transcript_with_timestamps") {
9854
+ const words = data.words ? data.words.map((w) => ({
9855
+ word: w.text || "",
9856
+ start: w.start || 0,
9857
+ end: w.end || 0,
9858
+ confidence: w.logprob !== void 0 ? Math.exp(w.logprob) : void 0,
9859
+ speaker: w.speaker_id
9860
+ })) : [];
9861
+ const streamEvent = {
9862
+ type: "transcript",
9863
+ text: data.text || "",
9864
+ isFinal: true,
9865
+ words: words.length > 0 ? words : void 0,
9866
+ speaker: words[0]?.speaker,
9867
+ language: data.language_code,
9868
+ confidence: void 0
9869
+ };
9870
+ callbacks?.onTranscript?.(streamEvent);
9871
+ if (options?.diarization && words.length > 0) {
9872
+ const utterances = buildUtterancesFromWords(words);
9873
+ for (const utterance of utterances) {
9874
+ callbacks?.onUtterance?.(utterance);
9875
+ }
9876
+ }
9877
+ }
9878
+ } catch (error) {
9879
+ callbacks?.onError?.({
9880
+ code: "PARSE_ERROR",
9881
+ message: `Failed to parse message: ${error}`
9882
+ });
9883
+ }
9884
+ };
9885
+ ws.onerror = () => {
9886
+ callbacks?.onError?.({
9887
+ code: "WEBSOCKET_ERROR",
9888
+ message: "WebSocket error occurred"
9889
+ });
9890
+ };
9891
+ ws.onclose = (event) => {
9892
+ status = "closed";
9893
+ const timeSinceOpen = openedAt ? Date.now() - openedAt : null;
9894
+ const isImmediateClose = timeSinceOpen !== null && timeSinceOpen < 1e3 && !receivedData;
9895
+ if (isImmediateClose && event.code === 1e3) {
9896
+ callbacks?.onError?.({
9897
+ code: "ELEVENLABS_CONFIG_REJECTED",
9898
+ message: [
9899
+ "ElevenLabs closed connection immediately after opening.",
9900
+ `Current config: region=${this.region}, model=${modelId}`,
9901
+ "Likely causes:",
9902
+ " - Invalid API key",
9903
+ " - Unsupported audio format or model",
9904
+ event.reason ? `Server reason: ${event.reason}` : null
9905
+ ].filter(Boolean).join("\n")
9906
+ });
9907
+ }
9908
+ callbacks?.onClose?.(event.code, event.reason);
9909
+ };
9910
+ await new Promise((resolve, reject) => {
9911
+ const timeout = setTimeout(() => {
9912
+ reject(new Error("WebSocket connection timeout"));
9913
+ }, 1e4);
9914
+ const checkOpen = () => {
9915
+ if (status === "open") {
9916
+ clearTimeout(timeout);
9917
+ resolve();
9918
+ } else if (status === "closed") {
9919
+ clearTimeout(timeout);
9920
+ reject(new Error("WebSocket connection failed"));
9921
+ } else {
9922
+ setTimeout(checkOpen, 100);
9923
+ }
9924
+ };
9925
+ checkOpen();
9926
+ });
9927
+ return {
9928
+ id: sessionId,
9929
+ provider: this.name,
9930
+ createdAt,
9931
+ getStatus: () => status,
9932
+ sendAudio: async (chunk) => {
9933
+ if (status !== "open") {
9934
+ throw new Error("Session is not open");
9935
+ }
9936
+ let base64Audio;
9937
+ if (chunk.data instanceof ArrayBuffer) {
9938
+ base64Audio = Buffer.from(chunk.data).toString("base64");
9939
+ } else if (chunk.data instanceof Uint8Array) {
9940
+ base64Audio = Buffer.from(
9941
+ chunk.data.buffer,
9942
+ chunk.data.byteOffset,
9943
+ chunk.data.byteLength
9944
+ ).toString("base64");
9945
+ } else {
9946
+ base64Audio = Buffer.from(chunk.data).toString("base64");
9947
+ }
9948
+ const message = JSON.stringify({
9949
+ message_type: "input_audio_chunk",
9950
+ audio_base_64: base64Audio
9951
+ });
9952
+ if (callbacks?.onRawMessage) {
9953
+ callbacks.onRawMessage({
9954
+ provider: this.name,
9955
+ direction: "outgoing",
9956
+ timestamp: Date.now(),
9957
+ payload: message,
9958
+ messageType: "audio"
9959
+ });
9960
+ }
9961
+ ws.send(message);
9962
+ },
9963
+ close: async () => {
9964
+ if (status === "open") {
9965
+ status = "closing";
9966
+ ws.send(JSON.stringify({ message_type: "end_of_stream" }));
9967
+ ws.close(1e3, "Client requested close");
9968
+ }
9969
+ }
9970
+ };
9971
+ }
9972
+ /**
9973
+ * Normalize ElevenLabs response to unified format
9974
+ *
9975
+ * ElevenLabs returns either:
9976
+ * - Single channel: `SpeechToTextChunkResponseModel` directly (text, words, etc.)
9977
+ * - Multi-channel: `MultichannelSpeechToTextResponseModel` with `transcripts[]`
9978
+ */
9979
+ normalizeResponse(response) {
9980
+ const chunks = response.transcripts ? response.transcripts : [response];
9981
+ const text = chunks.map((c) => c.text).join(" ");
9982
+ const words = [];
9983
+ const speakerSet = /* @__PURE__ */ new Set();
9984
+ const audioEvents = [];
9985
+ for (const chunk of chunks) {
9986
+ if (!chunk.words) continue;
9987
+ for (const w of chunk.words) {
9988
+ if (w.type === "audio_event") {
9989
+ audioEvents.push({
9990
+ text: w.text,
9991
+ start: typeof w.start === "number" ? w.start : 0,
9992
+ end: typeof w.end === "number" ? w.end : 0
9993
+ });
9994
+ continue;
9995
+ }
9996
+ const speakerId = w.speaker_id ?? void 0;
9997
+ const word = {
9998
+ word: w.text,
9999
+ start: typeof w.start === "number" ? w.start : 0,
10000
+ end: typeof w.end === "number" ? w.end : 0,
10001
+ confidence: w.logprob !== void 0 ? Math.exp(w.logprob) : void 0,
10002
+ speaker: speakerId ?? void 0
10003
+ };
10004
+ words.push(word);
10005
+ if (speakerId) {
10006
+ speakerSet.add(speakerId);
10007
+ }
10008
+ }
10009
+ }
10010
+ const speakers = speakerSet.size > 0 ? Array.from(speakerSet).map((id) => ({
10011
+ id,
10012
+ label: `Speaker ${id}`
10013
+ })) : void 0;
10014
+ const utterances = words.length > 0 ? buildUtterancesFromWords(words) : [];
10015
+ const language = chunks[0]?.language_code;
10016
+ const languageProbability = chunks[0]?.language_probability;
10017
+ const entities = [];
10018
+ for (const chunk of chunks) {
10019
+ if (chunk.entities && Array.isArray(chunk.entities)) {
10020
+ for (const entity of chunk.entities) {
10021
+ entities.push({
10022
+ text: entity.text,
10023
+ entity_type: entity.entity_type,
10024
+ start_char: entity.start_char,
10025
+ end_char: entity.end_char
10026
+ });
10027
+ }
10028
+ }
10029
+ }
10030
+ const transcriptionId = response.transcription_id || chunks[0]?.transcription_id || `elevenlabs_${Date.now()}`;
10031
+ return {
10032
+ success: true,
10033
+ provider: this.name,
10034
+ data: {
10035
+ id: transcriptionId,
10036
+ text,
10037
+ status: "completed",
10038
+ language,
10039
+ speakers,
10040
+ words: words.length > 0 ? words : void 0,
10041
+ utterances: utterances.length > 0 ? utterances : void 0
10042
+ },
10043
+ extended: {
10044
+ entities: entities.length > 0 ? entities : void 0,
10045
+ audioEvents: audioEvents.length > 0 ? audioEvents : void 0,
10046
+ languageProbability
10047
+ },
10048
+ tracking: {
10049
+ requestId: transcriptionId
10050
+ },
10051
+ raw: response
10052
+ };
10053
+ }
10054
+ };
10055
/**
 * Factory helper: build an `ElevenLabsAdapter` and initialize it in one call.
 *
 * @param {object} config - Adapter configuration, passed unchanged to `initialize`.
 * @returns {ElevenLabsAdapter} The initialized adapter instance.
 */
function createElevenLabsAdapter(config) {
  const instance = new ElevenLabsAdapter();
  instance.initialize(config);
  return instance;
}
10060
+
9228
10061
  // src/utils/zod-to-field-configs.ts
9229
10062
  function unwrapZodType(schema) {
9230
10063
  let inner = schema;
@@ -36625,6 +37458,21 @@ var SonioxCapabilities = {
36625
37458
  listTranscripts: false,
36626
37459
  deleteTranscript: false
36627
37460
  };
37461
/**
 * Capability flags advertised for the ElevenLabs provider.
 * `true` marks a feature as available, `false` as not available.
 */
var ElevenLabsCapabilities = {
  streaming: true,
  diarization: true,
  wordTimestamps: true,
  languageDetection: true,
  // Custom vocabulary is provided via the keyterms parameter.
  customVocabulary: true,
  summarization: false,
  sentimentAnalysis: false,
  entityDetection: true,
  // PII redaction is provided via entity_detection with PII categories.
  piiRedaction: true,
  listTranscripts: false,
  deleteTranscript: false
};
36628
37476
  var ProviderCapabilitiesMap = {
36629
37477
  gladia: GladiaCapabilities,
36630
37478
  assemblyai: AssemblyAICapabilities,
@@ -36632,7 +37480,8 @@ var ProviderCapabilitiesMap = {
36632
37480
  "openai-whisper": OpenAICapabilities,
36633
37481
  "azure-stt": AzureCapabilities,
36634
37482
  speechmatics: SpeechmaticsCapabilities,
36635
- soniox: SonioxCapabilities
37483
+ soniox: SonioxCapabilities,
37484
+ elevenlabs: ElevenLabsCapabilities
36636
37485
  };
36637
37486
  var CapabilityKeys = [
36638
37487
  "streaming",
@@ -36834,7 +37683,8 @@ var AllLanguageCodes = {
36834
37683
  // BCP-47 locale codes (e.g., "en-US")
36835
37684
  speechmatics: SpeechmaticsLanguageCodes,
36836
37685
  // ISO 639-1 codes with multilingual packs
36837
- soniox: SonioxLanguageCodes
37686
+ soniox: SonioxLanguageCodes,
37687
+ elevenlabs: ElevenLabsLanguageCodes
36838
37688
  };
36839
37689
  var ProviderDisplayNames = {
36840
37690
  gladia: "Gladia",
@@ -36843,7 +37693,8 @@ var ProviderDisplayNames = {
36843
37693
  "openai-whisper": "OpenAI Whisper",
36844
37694
  "azure-stt": "Azure Speech",
36845
37695
  speechmatics: "Speechmatics",
36846
- soniox: "Soniox"
37696
+ soniox: "Soniox",
37697
+ elevenlabs: "ElevenLabs"
36847
37698
  };
36848
37699
  var ProviderWebsites = {
36849
37700
  gladia: "https://gladia.io",
@@ -36852,7 +37703,8 @@ var ProviderWebsites = {
36852
37703
  "openai-whisper": "https://openai.com",
36853
37704
  "azure-stt": "https://azure.microsoft.com/services/cognitive-services/speech-to-text/",
36854
37705
  speechmatics: "https://speechmatics.com",
36855
- soniox: "https://soniox.com"
37706
+ soniox: "https://soniox.com",
37707
+ elevenlabs: "https://elevenlabs.io"
36856
37708
  };
36857
37709
  var ProviderDocs = {
36858
37710
  gladia: "https://docs.gladia.io",
@@ -36861,7 +37713,8 @@ var ProviderDocs = {
36861
37713
  "openai-whisper": "https://platform.openai.com/docs/guides/speech-to-text",
36862
37714
  "azure-stt": "https://learn.microsoft.com/azure/cognitive-services/speech-service/",
36863
37715
  speechmatics: "https://docs.speechmatics.com",
36864
- soniox: "https://soniox.com/docs/stt/"
37716
+ soniox: "https://soniox.com/docs/stt/",
37717
+ elevenlabs: "https://elevenlabs.io/docs/capabilities/speech-to-text"
36865
37718
  };
36866
37719
  var AllProviders = [
36867
37720
  "gladia",
@@ -36870,7 +37723,8 @@ var AllProviders = [
36870
37723
  "openai-whisper",
36871
37724
  "azure-stt",
36872
37725
  "speechmatics",
36873
- "soniox"
37726
+ "soniox",
37727
+ "elevenlabs"
36874
37728
  ];
36875
37729
  var StreamingProviders = AllProviders.filter(
36876
37730
  (p) => ProviderCapabilitiesMap[p].streaming
@@ -37595,6 +38449,77 @@ var TranslationConfigType = {
37595
38449
  two_way: "two_way"
37596
38450
  };
37597
38451
 
38452
+ // src/generated/elevenlabs/schema/index.ts
38453
// Namespace object for the generated ElevenLabs schema enums. Each property is
// registered through `__export` as an accessor function, so the enum object is
// looked up only when the property is read.
var schema_exports8 = {};
__export(schema_exports8, {
  BodySpeechToTextV1SpeechToTextPostFileFormat() {
    return BodySpeechToTextV1SpeechToTextPostFileFormat;
  },
  BodySpeechToTextV1SpeechToTextPostModelId() {
    return BodySpeechToTextV1SpeechToTextPostModelId;
  },
  BodySpeechToTextV1SpeechToTextPostTimestampsGranularity() {
    return BodySpeechToTextV1SpeechToTextPostTimestampsGranularity;
  },
  DocxExportOptionsFormat() {
    return DocxExportOptionsFormat;
  },
  HtmlExportOptionsFormat() {
    return HtmlExportOptionsFormat;
  },
  PdfExportOptionsFormat() {
    return PdfExportOptionsFormat;
  },
  SegmentedJsonExportOptionsFormat() {
    return SegmentedJsonExportOptionsFormat;
  },
  SpeechToTextWordResponseModelType() {
    return SpeechToTextWordResponseModelType;
  },
  SrtExportOptionsFormat() {
    return SrtExportOptionsFormat;
  },
  TxtExportOptionsFormat() {
    return TxtExportOptionsFormat;
  }
});
38466
+
38467
+ // src/generated/elevenlabs/schema/bodySpeechToTextV1SpeechToTextPostFileFormat.ts
38468
// Generated ElevenLabs enum objects. Every object maps each allowed string
// value to itself, so members can be referenced by name instead of by literal.

// Accepted values for the `file_format` request field.
var BodySpeechToTextV1SpeechToTextPostFileFormat = {
  pcm_s16le_16: "pcm_s16le_16",
  other: "other"
};

// src/generated/elevenlabs/schema/bodySpeechToTextV1SpeechToTextPostModelId.ts
// Accepted values for the `model_id` request field.
var BodySpeechToTextV1SpeechToTextPostModelId = {
  scribe_v1: "scribe_v1",
  scribe_v2: "scribe_v2"
};

// src/generated/elevenlabs/schema/bodySpeechToTextV1SpeechToTextPostTimestampsGranularity.ts
// Accepted values for the `timestamps_granularity` request field.
var BodySpeechToTextV1SpeechToTextPostTimestampsGranularity = {
  none: "none",
  word: "word",
  character: "character"
};

// src/generated/elevenlabs/schema/docxExportOptionsFormat.ts
var DocxExportOptionsFormat = { docx: "docx" };

// src/generated/elevenlabs/schema/htmlExportOptionsFormat.ts
var HtmlExportOptionsFormat = { html: "html" };

// src/generated/elevenlabs/schema/pdfExportOptionsFormat.ts
var PdfExportOptionsFormat = { pdf: "pdf" };

// src/generated/elevenlabs/schema/segmentedJsonExportOptionsFormat.ts
var SegmentedJsonExportOptionsFormat = { segmented_json: "segmented_json" };

// src/generated/elevenlabs/schema/speechToTextWordResponseModelType.ts
// Kinds of entry that can appear in a response `words` array.
var SpeechToTextWordResponseModelType = {
  word: "word",
  spacing: "spacing",
  audio_event: "audio_event"
};

// src/generated/elevenlabs/schema/srtExportOptionsFormat.ts
var SrtExportOptionsFormat = { srt: "srt" };

// src/generated/elevenlabs/schema/txtExportOptionsFormat.ts
var TxtExportOptionsFormat = { txt: "txt" };
38522
+
37598
38523
  // src/generated/speechmatics/api/speechmaticsASRRESTAPI.zod.ts
37599
38524
  var speechmaticsASRRESTAPI_zod_exports = {};
37600
38525
  __export(speechmaticsASRRESTAPI_zod_exports, {
@@ -38707,6 +39632,448 @@ var getUsageResponse = import_zod12.z.object({
38707
39632
  })
38708
39633
  )
38709
39634
  });
39635
+
39636
+ // src/generated/elevenlabs/api/elevenLabsSpeechToTextAPI.zod.ts
39637
// Namespace object for the generated ElevenLabs speech-to-text Zod schemas and
// their default/limit constants. Each property is registered through
// `__export` as an accessor function, so the value is looked up on read.
var elevenLabsSpeechToTextAPI_zod_exports = {};
__export(elevenLabsSpeechToTextAPI_zod_exports, {
  deleteTranscriptByIdHeader() { return deleteTranscriptByIdHeader; },
  deleteTranscriptByIdParams() { return deleteTranscriptByIdParams; },
  deleteTranscriptByIdResponse() { return deleteTranscriptByIdResponse; },
  getTranscriptByIdHeader() { return getTranscriptByIdHeader; },
  getTranscriptByIdParams() { return getTranscriptByIdParams; },
  getTranscriptByIdResponse() { return getTranscriptByIdResponse; },
  speechToTextBody() { return speechToTextBody; },
  speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefault() {
    return speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefault;
  },
  speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFive() {
    return speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFive;
  },
  speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFour() {
    return speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFour;
  },
  speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultOne() {
    return speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultOne;
  },
  speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultThree() {
    return speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultThree;
  },
  speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultTwo() {
    return speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultTwo;
  },
  speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefault() {
    return speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefault;
  },
  speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFive() {
    return speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFive;
  },
  speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFour() {
    return speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFour;
  },
  speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultOne() {
    return speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultOne;
  },
  speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultThree() {
    return speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultThree;
  },
  speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultTwo() {
    return speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultTwo;
  },
  speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefault() {
    return speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefault;
  },
  speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefaultThree() {
    return speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefaultThree;
  },
  speechToTextBodyAdditionalFormatsItemMaxSegmentCharsDefaultOnefive() {
    return speechToTextBodyAdditionalFormatsItemMaxSegmentCharsDefaultOnefive;
  },
  speechToTextBodyAdditionalFormatsItemMaxSegmentDurationSDefaultOnefive() {
    return speechToTextBodyAdditionalFormatsItemMaxSegmentDurationSDefaultOnefive;
  },
  speechToTextBodyAdditionalFormatsItemSegmentOnSilenceLongerThanSDefaultOnefive() {
    return speechToTextBodyAdditionalFormatsItemSegmentOnSilenceLongerThanSDefaultOnefive;
  },
  speechToTextBodyAdditionalFormatsMax() { return speechToTextBodyAdditionalFormatsMax; },
  speechToTextBodyDiarizationThresholdMaxOne() { return speechToTextBodyDiarizationThresholdMaxOne; },
  speechToTextBodyDiarizationThresholdMinOne() { return speechToTextBodyDiarizationThresholdMinOne; },
  speechToTextBodyDiarizeDefault() { return speechToTextBodyDiarizeDefault; },
  speechToTextBodyFileFormatDefault() { return speechToTextBodyFileFormatDefault; },
  speechToTextBodyKeytermsDefault() { return speechToTextBodyKeytermsDefault; },
  speechToTextBodyNoVerbatimDefault() { return speechToTextBodyNoVerbatimDefault; },
  speechToTextBodyNumSpeakersMaxOne() { return speechToTextBodyNumSpeakersMaxOne; },
  speechToTextBodySeedMaxOne() { return speechToTextBodySeedMaxOne; },
  speechToTextBodySeedMinOne() { return speechToTextBodySeedMinOne; },
  speechToTextBodyTagAudioEventsDefault() { return speechToTextBodyTagAudioEventsDefault; },
  speechToTextBodyTemperatureMaxOne() { return speechToTextBodyTemperatureMaxOne; },
  speechToTextBodyTemperatureMinOne() { return speechToTextBodyTemperatureMinOne; },
  speechToTextBodyTimestampsGranularityDefault() { return speechToTextBodyTimestampsGranularityDefault; },
  speechToTextBodyUseMultiChannelDefault() { return speechToTextBodyUseMultiChannelDefault; },
  speechToTextBodyWebhookDefault() { return speechToTextBodyWebhookDefault; },
  speechToTextHeader() { return speechToTextHeader; },
  speechToTextQueryEnableLoggingDefault() { return speechToTextQueryEnableLoggingDefault; },
  speechToTextQueryParams() { return speechToTextQueryParams; },
  speechToTextResponse() { return speechToTextResponse; }
});
39684
+ var import_zod13 = require("zod");
39685
// Default applied to the `enable_logging` query flag when the caller omits it.
var speechToTextQueryEnableLoggingDefault = true;

// Query-string schema for the speech-to-text request.
var speechToTextQueryParams = import_zod13.z.object({
  enable_logging: import_zod13.z
    .boolean()
    .default(speechToTextQueryEnableLoggingDefault)
    .describe(
      "When enable_logging is set to false zero retention mode will be used for the request. This will mean log and transcript storage features are unavailable for this request. Zero retention mode may only be used by enterprise customers."
    )
});

// Header schema for the speech-to-text request; the API key may be a string,
// null, or omitted.
var speechToTextHeader = import_zod13.z.object({
  "xi-api-key": import_zod13.z
    .string()
    .or(import_zod13.z.null())
    .optional()
    .describe(
      "Your API key. This is required by most endpoints to access our API programmatically. You can view your xi-api-key using the 'Profile' tab on the website."
    )
});
39696
// Generated defaults and numeric limits for the speech-to-text request body.

// Top-level transcription options.
var speechToTextBodyTagAudioEventsDefault = true;
var speechToTextBodyNumSpeakersMaxOne = 32;
var speechToTextBodyTimestampsGranularityDefault = "word";
var speechToTextBodyDiarizeDefault = false;
var speechToTextBodyDiarizationThresholdMinOne = 0.1;
var speechToTextBodyDiarizationThresholdMaxOne = 0.4;

// Per-variant defaults for `additional_formats` items. The unsuffixed and
// One..Five names are distinct generated constants and are kept in their
// original declaration order.
var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefault = true;
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefault = true;

var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultOne = true;
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultOne = true;

var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultTwo = true;
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultTwo = true;

var speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefault = 100;
var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultThree = true;
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultThree = true;

var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFour = true;
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFour = true;

var speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefaultThree = 42;
var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFive = false;
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFive = true;
var speechToTextBodyAdditionalFormatsItemSegmentOnSilenceLongerThanSDefaultOnefive = 0.8;
var speechToTextBodyAdditionalFormatsItemMaxSegmentDurationSDefaultOnefive = 4;
var speechToTextBodyAdditionalFormatsItemMaxSegmentCharsDefaultOnefive = 84;

// Upper bound on the number of `additional_formats` entries.
var speechToTextBodyAdditionalFormatsMax = 10;

// Remaining request-body defaults and ranges.
var speechToTextBodyFileFormatDefault = "other";
var speechToTextBodyWebhookDefault = false;
var speechToTextBodyTemperatureMinOne = 0;
var speechToTextBodyTemperatureMaxOne = 2;
var speechToTextBodySeedMinOne = 0;
var speechToTextBodySeedMaxOne = 2147483647;
var speechToTextBodyUseMultiChannelDefault = false;
var speechToTextBodyNoVerbatimDefault = false;
var speechToTextBodyKeytermsDefault = [];
39729
+ var speechToTextBody = import_zod13.z.object({
39730
+ model_id: import_zod13.z.enum(["scribe_v1", "scribe_v2"]).describe("The ID of the model to use for transcription."),
39731
+ file: import_zod13.z.instanceof(File).or(import_zod13.z.null()).optional().describe(
39732
+ "The file to transcribe. All major audio and video formats are supported. Exactly one of the file or cloud_storage_url parameters must be provided. The file size must be less than 3.0GB."
39733
+ ),
39734
+ language_code: import_zod13.z.string().or(import_zod13.z.null()).optional().describe(
39735
+ "An ISO-639-1 or ISO-639-3 language_code corresponding to the language of the audio file. Can sometimes improve transcription performance if known beforehand. Defaults to null, in this case the language is predicted automatically."
39736
+ ),
39737
+ tag_audio_events: import_zod13.z.boolean().default(speechToTextBodyTagAudioEventsDefault).describe(
39738
+ "Whether to tag audio events like (laughter), (footsteps), etc. in the transcription."
39739
+ ),
39740
+ num_speakers: import_zod13.z.number().min(1).max(speechToTextBodyNumSpeakersMaxOne).or(import_zod13.z.null()).optional().describe(
39741
+ "The maximum amount of speakers talking in the uploaded file. Can help with predicting who speaks when. The maximum amount of speakers that can be predicted is 32. Defaults to null, in this case the amount of speakers is set to the maximum value the model supports."
39742
+ ),
39743
+ timestamps_granularity: import_zod13.z.enum(["none", "word", "character"]).default(speechToTextBodyTimestampsGranularityDefault).describe(
39744
+ "The granularity of the timestamps in the transcription. 'word' provides word-level timestamps and 'character' provides character-level timestamps per word."
39745
+ ),
39746
+ diarize: import_zod13.z.boolean().optional().describe("Whether to annotate which speaker is currently talking in the uploaded file."),
39747
+ diarization_threshold: import_zod13.z.number().min(speechToTextBodyDiarizationThresholdMinOne).max(speechToTextBodyDiarizationThresholdMaxOne).or(import_zod13.z.null()).optional().describe(
39748
+ "Diarization threshold to apply during speaker diarization. A higher value means there will be a lower chance of one speaker being diarized as two different speakers but also a higher chance of two different speakers being diarized as one speaker (less total speakers predicted). A low value means there will be a higher chance of one speaker being diarized as two different speakers but also a lower chance of two different speakers being diarized as one speaker (more total speakers predicted). Can only be set when diarize=True and num_speakers=None. Defaults to None, in which case we will choose a threshold based on the model_id (0.22 usually)."
39749
+ ),
39750
+ additional_formats: import_zod13.z.array(
39751
+ import_zod13.z.discriminatedUnion("format", [
39752
+ import_zod13.z.object({
39753
+ include_speakers: import_zod13.z.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefault),
39754
+ include_timestamps: import_zod13.z.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefault),
39755
+ format: import_zod13.z.enum(["segmented_json"]),
39756
+ segment_on_silence_longer_than_s: import_zod13.z.number().or(import_zod13.z.null()).optional(),
39757
+ max_segment_duration_s: import_zod13.z.number().or(import_zod13.z.null()).optional(),
39758
+ max_segment_chars: import_zod13.z.number().or(import_zod13.z.null()).optional()
39759
+ }),
39760
+ import_zod13.z.object({
39761
+ include_speakers: import_zod13.z.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultOne),
39762
+ include_timestamps: import_zod13.z.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultOne),
39763
+ format: import_zod13.z.enum(["docx"]),
39764
+ segment_on_silence_longer_than_s: import_zod13.z.number().or(import_zod13.z.null()).optional(),
39765
+ max_segment_duration_s: import_zod13.z.number().or(import_zod13.z.null()).optional(),
39766
+ max_segment_chars: import_zod13.z.number().or(import_zod13.z.null()).optional()
39767
+ }),
39768
+ import_zod13.z.object({
39769
+ include_speakers: import_zod13.z.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultTwo),
39770
+ include_timestamps: import_zod13.z.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultTwo),
39771
+ format: import_zod13.z.enum(["pdf"]),
39772
+ segment_on_silence_longer_than_s: import_zod13.z.number().or(import_zod13.z.null()).optional(),
39773
+ max_segment_duration_s: import_zod13.z.number().or(import_zod13.z.null()).optional(),
39774
+ max_segment_chars: import_zod13.z.number().or(import_zod13.z.null()).optional()
39775
+ }),
39776
+ import_zod13.z.object({
39777
+ max_characters_per_line: import_zod13.z.number().or(import_zod13.z.null()).default(speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefault),
39778
+ include_speakers: import_zod13.z.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultThree),
39779
+ include_timestamps: import_zod13.z.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultThree),
39780
+ format: import_zod13.z.enum(["txt"]),
39781
+ segment_on_silence_longer_than_s: import_zod13.z.number().or(import_zod13.z.null()).optional(),
39782
+ max_segment_duration_s: import_zod13.z.number().or(import_zod13.z.null()).optional(),
39783
+ max_segment_chars: import_zod13.z.number().or(import_zod13.z.null()).optional()
39784
+ }),
39785
+ import_zod13.z.object({
39786
+ include_speakers: import_zod13.z.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFour),
39787
+ include_timestamps: import_zod13.z.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFour),
39788
+ format: import_zod13.z.enum(["html"]),
39789
+ segment_on_silence_longer_than_s: import_zod13.z.number().or(import_zod13.z.null()).optional(),
39790
+ max_segment_duration_s: import_zod13.z.number().or(import_zod13.z.null()).optional(),
39791
+ max_segment_chars: import_zod13.z.number().or(import_zod13.z.null()).optional()
39792
+ }),
39793
+ import_zod13.z.object({
39794
+ max_characters_per_line: import_zod13.z.number().or(import_zod13.z.null()).default(speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefaultThree),
39795
+ include_speakers: import_zod13.z.boolean().optional(),
39796
+ include_timestamps: import_zod13.z.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFive),
39797
+ format: import_zod13.z.enum(["srt"]),
39798
+ segment_on_silence_longer_than_s: import_zod13.z.number().or(import_zod13.z.null()).default(
39799
+ speechToTextBodyAdditionalFormatsItemSegmentOnSilenceLongerThanSDefaultOnefive
39800
+ ),
39801
+ max_segment_duration_s: import_zod13.z.number().or(import_zod13.z.null()).default(speechToTextBodyAdditionalFormatsItemMaxSegmentDurationSDefaultOnefive),
39802
+ max_segment_chars: import_zod13.z.number().or(import_zod13.z.null()).default(speechToTextBodyAdditionalFormatsItemMaxSegmentCharsDefaultOnefive)
39803
+ })
39804
+ ])
39805
+ ).max(speechToTextBodyAdditionalFormatsMax).optional(),
39806
+ file_format: import_zod13.z.enum(["pcm_s16le_16", "other"]).default(speechToTextBodyFileFormatDefault).describe(
39807
+ "The format of input audio. Options are 'pcm_s16le_16' or 'other' For `pcm_s16le_16`, the input audio must be 16-bit PCM at a 16kHz sample rate, single channel (mono), and little-endian byte order. Latency will be lower than with passing an encoded waveform."
39808
+ ),
39809
+ cloud_storage_url: import_zod13.z.string().or(import_zod13.z.null()).optional().describe(
39810
+ "The HTTPS URL of the file to transcribe. Exactly one of the file or cloud_storage_url parameters must be provided. The file must be accessible via HTTPS and the file size must be less than 2GB. Any valid HTTPS URL is accepted, including URLs from cloud storage providers (AWS S3, Google Cloud Storage, Cloudflare R2, etc.), CDNs, or any other HTTPS source. URLs can be pre-signed or include authentication tokens in query parameters."
39811
+ ),
39812
+ webhook: import_zod13.z.boolean().optional().describe(
39813
+ "Whether to send the transcription result to configured speech-to-text webhooks. If set the request will return early without the transcription, which will be delivered later via webhook."
39814
+ ),
39815
+ webhook_id: import_zod13.z.string().or(import_zod13.z.null()).optional().describe(
39816
+ "Optional specific webhook ID to send the transcription result to. Only valid when webhook is set to true. If not provided, transcription will be sent to all configured speech-to-text webhooks."
39817
+ ),
39818
+ temperature: import_zod13.z.number().min(speechToTextBodyTemperatureMinOne).max(speechToTextBodyTemperatureMaxOne).or(import_zod13.z.null()).optional().describe(
39819
+ "Controls the randomness of the transcription output. Accepts values between 0.0 and 2.0, where higher values result in more diverse and less deterministic results. If omitted, we will use a temperature based on the model you selected which is usually 0."
39820
+ ),
39821
+ seed: import_zod13.z.number().min(speechToTextBodySeedMinOne).max(speechToTextBodySeedMaxOne).or(import_zod13.z.null()).optional().describe(
39822
+ "If specified, our system will make a best effort to sample deterministically, such that repeated requests with the same seed and parameters should return the same result. Determinism is not guaranteed. Must be an integer between 0 and 2147483647."
39823
+ ),
39824
+ use_multi_channel: import_zod13.z.boolean().optional().describe(
39825
+ "Whether the audio file contains multiple channels where each channel contains a single speaker. When enabled, each channel will be transcribed independently and the results will be combined. Each word in the response will include a 'channel_index' field indicating which channel it was spoken on. A maximum of 5 channels is supported."
39826
+ ),
39827
+ webhook_metadata: import_zod13.z.string().or(import_zod13.z.record(import_zod13.z.string(), import_zod13.z.any())).or(import_zod13.z.null()).optional().describe(
39828
+ "Optional metadata to be included in the webhook response. This should be a JSON string representing an object with a maximum depth of 2 levels and maximum size of 16KB. Useful for tracking internal IDs, job references, or other contextual information."
39829
+ ),
39830
+ entity_detection: import_zod13.z.string().or(import_zod13.z.array(import_zod13.z.string())).or(import_zod13.z.null()).optional().describe(
39831
+ "Detect entities in the transcript. Can be 'all' to detect all entities, a single entity type or category string, or a list of entity types/categories. Categories include 'pii', 'phi', 'pci', 'other', 'offensive_language'. When enabled, detected entities will be returned in the 'entities' field with their text, type, and character positions. Usage of this parameter will incur additional costs."
39832
+ ),
39833
+ no_verbatim: import_zod13.z.boolean().optional().describe(
39834
+ "If true, the transcription will not have any filler words, false starts and non-speech sounds. Only supported with scribe_v2 model."
39835
+ ),
39836
+ keyterms: import_zod13.z.array(import_zod13.z.string()).default(speechToTextBodyKeytermsDefault).describe(
39837
+ 'A list of keyterms to bias the transcription towards. The keyterms are words or phrases you want the model to recognise more accurately. The number of keyterms cannot exceed 100. The length of each keyterm must be less than 50 characters. Keyterms can contain at most 5 words (after normalisation). For example ["hello", "world", "technical term"]. Usage of this parameter will incur additional costs. '
39838
+ )
39839
+ });
39840
+ var speechToTextResponse = import_zod13.z.object({ // Zod schema for the speech-to-text response: a single transcript object, or the multichannel object supplied to .or() further down
39841
+ language_code: import_zod13.z.string().describe("The detected language code (e.g. 'eng' for English)."),
39842
+ language_probability: import_zod13.z.number().describe("The confidence score of the language detection (0 to 1)."),
39843
+ text: import_zod13.z.string().describe("The raw text of the transcription."),
39844
+ words: import_zod13.z.array( // Required array; each item is typed as word, spacing or audio_event
39845
+ import_zod13.z.object({
39846
+ text: import_zod13.z.string().describe("The word or sound that was transcribed."),
39847
+ start: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The start time of the word or sound in seconds."),
39848
+ end: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The end time of the word or sound in seconds."),
39849
+ type: import_zod13.z.enum(["word", "spacing", "audio_event"]).describe(
39850
+ "The type of the word or sound. 'audio_event' is used for non-word sounds like laughter or footsteps."
39851
+ ),
39852
+ speaker_id: import_zod13.z.string().or(import_zod13.z.null()).optional().describe("Unique identifier for the speaker of this word."),
39853
+ logprob: import_zod13.z.number().describe(
39854
+ "The log of the probability with which this word was predicted. Logprobs are in range [-infinity, 0], higher logprobs indicate a higher confidence the model has in its predictions."
39855
+ ),
39856
+ characters: import_zod13.z.array( // Per-character timing; the whole array may be null or omitted
39857
+ import_zod13.z.object({
39858
+ text: import_zod13.z.string().describe("The character that was transcribed."),
39859
+ start: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The start time of the character in seconds."),
39860
+ end: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The end time of the character in seconds.")
39861
+ })
39862
+ ).or(import_zod13.z.null()).optional().describe("The characters that make up the word and their timing information.")
39863
+ }).describe("Word-level detail of the transcription with timing information.")
39864
+ ).describe("List of words with their timing information."),
39865
+ channel_index: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The channel index this transcript belongs to (for multichannel audio)."),
39866
+ additional_formats: import_zod13.z.array( // Optional and nullable list of extra transcript renderings
39867
+ import_zod13.z.object({
39868
+ requested_format: import_zod13.z.string().describe("The requested format."),
39869
+ file_extension: import_zod13.z.string().describe("The file extension of the additional format."),
39870
+ content_type: import_zod13.z.string().describe("The content type of the additional format."),
39871
+ is_base64_encoded: import_zod13.z.boolean().describe("Whether the content is base64 encoded."),
39872
+ content: import_zod13.z.string().describe("The content of the additional format.")
39873
+ }).or(import_zod13.z.null()) // Individual list entries are also allowed to be null
39874
+ ).or(import_zod13.z.null()).optional().describe("Requested additional formats of the transcript."),
39875
+ transcription_id: import_zod13.z.string().or(import_zod13.z.null()).optional().describe("The transcription ID of the response."),
39876
+ entities: import_zod13.z.array( // Optional and nullable; all four fields of each entity are required
39877
+ import_zod13.z.object({
39878
+ text: import_zod13.z.string().describe("The text that was identified as an entity."),
39879
+ entity_type: import_zod13.z.string().describe(
39880
+ "The type of entity detected (e.g., 'credit_card', 'email_address', 'person_name')."
39881
+ ),
39882
+ start_char: import_zod13.z.number().describe("Start character position in the transcript text."),
39883
+ end_char: import_zod13.z.number().describe("End character position in the transcript text.")
39884
+ })
39885
+ ).or(import_zod13.z.null()).optional().describe(
39886
+ "List of detected entities with their text, type, and character positions in the transcript."
39887
+ )
39888
+ }).describe("Chunk-level detail of the transcription with timing information.").or( // Second union member: the multichannel response object
39889
+ import_zod13.z.object({
39890
+ transcripts: import_zod13.z.array( // Required array; each element repeats the single-transcript field set declared above
39891
+ import_zod13.z.object({
39892
+ language_code: import_zod13.z.string().describe("The detected language code (e.g. 'eng' for English)."),
39893
+ language_probability: import_zod13.z.number().describe("The confidence score of the language detection (0 to 1)."),
39894
+ text: import_zod13.z.string().describe("The raw text of the transcription."),
39895
+ words: import_zod13.z.array(
39896
+ import_zod13.z.object({
39897
+ text: import_zod13.z.string().describe("The word or sound that was transcribed."),
39898
+ start: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The start time of the word or sound in seconds."),
39899
+ end: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The end time of the word or sound in seconds."),
39900
+ type: import_zod13.z.enum(["word", "spacing", "audio_event"]).describe(
39901
+ "The type of the word or sound. 'audio_event' is used for non-word sounds like laughter or footsteps."
39902
+ ),
39903
+ speaker_id: import_zod13.z.string().or(import_zod13.z.null()).optional().describe("Unique identifier for the speaker of this word."),
39904
+ logprob: import_zod13.z.number().describe(
39905
+ "The log of the probability with which this word was predicted. Logprobs are in range [-infinity, 0], higher logprobs indicate a higher confidence the model has in its predictions."
39906
+ ),
39907
+ characters: import_zod13.z.array(
39908
+ import_zod13.z.object({
39909
+ text: import_zod13.z.string().describe("The character that was transcribed."),
39910
+ start: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The start time of the character in seconds."),
39911
+ end: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The end time of the character in seconds.")
39912
+ })
39913
+ ).or(import_zod13.z.null()).optional().describe(
39914
+ "The characters that make up the word and their timing information."
39915
+ )
39916
+ }).describe("Word-level detail of the transcription with timing information.")
39917
+ ).describe("List of words with their timing information."),
39918
+ channel_index: import_zod13.z.number().or(import_zod13.z.null()).optional().describe(
39919
+ "The channel index this transcript belongs to (for multichannel audio)."
39920
+ ),
39921
+ additional_formats: import_zod13.z.array(
39922
+ import_zod13.z.object({
39923
+ requested_format: import_zod13.z.string().describe("The requested format."),
39924
+ file_extension: import_zod13.z.string().describe("The file extension of the additional format."),
39925
+ content_type: import_zod13.z.string().describe("The content type of the additional format."),
39926
+ is_base64_encoded: import_zod13.z.boolean().describe("Whether the content is base64 encoded."),
39927
+ content: import_zod13.z.string().describe("The content of the additional format.")
39928
+ }).or(import_zod13.z.null())
39929
+ ).or(import_zod13.z.null()).optional().describe("Requested additional formats of the transcript."),
39930
+ transcription_id: import_zod13.z.string().or(import_zod13.z.null()).optional().describe("The transcription ID of the response."),
39931
+ entities: import_zod13.z.array(
39932
+ import_zod13.z.object({
39933
+ text: import_zod13.z.string().describe("The text that was identified as an entity."),
39934
+ entity_type: import_zod13.z.string().describe(
39935
+ "The type of entity detected (e.g., 'credit_card', 'email_address', 'person_name')."
39936
+ ),
39937
+ start_char: import_zod13.z.number().describe("Start character position in the transcript text."),
39938
+ end_char: import_zod13.z.number().describe("End character position in the transcript text.")
39939
+ })
39940
+ ).or(import_zod13.z.null()).optional().describe(
39941
+ "List of detected entities with their text, type, and character positions in the transcript."
39942
+ )
39943
+ }).describe("Chunk-level detail of the transcription with timing information.")
39944
+ ).describe(
39945
+ "List of transcripts, one for each audio channel. Each transcript contains the text and word-level details for its respective channel."
39946
+ ),
39947
+ transcription_id: import_zod13.z.string().or(import_zod13.z.null()).optional().describe("The transcription ID of the response.") // Top-level ID on the multichannel wrapper, separate from the per-transcript IDs
39948
+ }).describe("Response model for multichannel speech-to-text transcription.")
39949
+ );
39950
+ var getTranscriptByIdParams = import_zod13.z.object({ // Zod schema for the parameters of the get-transcript-by-id operation
39951
+ transcription_id: import_zod13.z.string().describe("The unique ID of the transcript to retrieve") // Required string; no format or length constraint is applied here
39952
+ });
39953
+ var getTranscriptByIdHeader = import_zod13.z.object({ // Zod schema for the request headers of the get-transcript-by-id operation
39954
+ "xi-api-key": import_zod13.z.string().or(import_zod13.z.null()).optional().describe( // Accepts a string, null, or an omitted key. NOTE(review): the schema does not enforce presence of the key - confirm authentication is checked elsewhere
39955
+ "Your API key. This is required by most endpoints to access our API programmatically. You can view your xi-api-key using the 'Profile' tab on the website."
39956
+ )
39957
+ });
39958
+ var getTranscriptByIdResponse = import_zod13.z.object({ // Zod schema for the get-transcript-by-id response: a single transcript object, or the multichannel object supplied to .or() further down
39959
+ language_code: import_zod13.z.string().describe("The detected language code (e.g. 'eng' for English)."),
39960
+ language_probability: import_zod13.z.number().describe("The confidence score of the language detection (0 to 1)."),
39961
+ text: import_zod13.z.string().describe("The raw text of the transcription."),
39962
+ words: import_zod13.z.array( // Required array; each item is typed as word, spacing or audio_event
39963
+ import_zod13.z.object({
39964
+ text: import_zod13.z.string().describe("The word or sound that was transcribed."),
39965
+ start: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The start time of the word or sound in seconds."),
39966
+ end: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The end time of the word or sound in seconds."),
39967
+ type: import_zod13.z.enum(["word", "spacing", "audio_event"]).describe(
39968
+ "The type of the word or sound. 'audio_event' is used for non-word sounds like laughter or footsteps."
39969
+ ),
39970
+ speaker_id: import_zod13.z.string().or(import_zod13.z.null()).optional().describe("Unique identifier for the speaker of this word."),
39971
+ logprob: import_zod13.z.number().describe(
39972
+ "The log of the probability with which this word was predicted. Logprobs are in range [-infinity, 0], higher logprobs indicate a higher confidence the model has in its predictions."
39973
+ ),
39974
+ characters: import_zod13.z.array( // Per-character timing; the whole array may be null or omitted
39975
+ import_zod13.z.object({
39976
+ text: import_zod13.z.string().describe("The character that was transcribed."),
39977
+ start: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The start time of the character in seconds."),
39978
+ end: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The end time of the character in seconds.")
39979
+ })
39980
+ ).or(import_zod13.z.null()).optional().describe("The characters that make up the word and their timing information.")
39981
+ }).describe("Word-level detail of the transcription with timing information.")
39982
+ ).describe("List of words with their timing information."),
39983
+ channel_index: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The channel index this transcript belongs to (for multichannel audio)."),
39984
+ additional_formats: import_zod13.z.array( // Optional and nullable list of extra transcript renderings
39985
+ import_zod13.z.object({
39986
+ requested_format: import_zod13.z.string().describe("The requested format."),
39987
+ file_extension: import_zod13.z.string().describe("The file extension of the additional format."),
39988
+ content_type: import_zod13.z.string().describe("The content type of the additional format."),
39989
+ is_base64_encoded: import_zod13.z.boolean().describe("Whether the content is base64 encoded."),
39990
+ content: import_zod13.z.string().describe("The content of the additional format.")
39991
+ }).or(import_zod13.z.null()) // Individual list entries are also allowed to be null
39992
+ ).or(import_zod13.z.null()).optional().describe("Requested additional formats of the transcript."),
39993
+ transcription_id: import_zod13.z.string().or(import_zod13.z.null()).optional().describe("The transcription ID of the response."),
39994
+ entities: import_zod13.z.array( // Optional and nullable; all four fields of each entity are required
39995
+ import_zod13.z.object({
39996
+ text: import_zod13.z.string().describe("The text that was identified as an entity."),
39997
+ entity_type: import_zod13.z.string().describe(
39998
+ "The type of entity detected (e.g., 'credit_card', 'email_address', 'person_name')."
39999
+ ),
40000
+ start_char: import_zod13.z.number().describe("Start character position in the transcript text."),
40001
+ end_char: import_zod13.z.number().describe("End character position in the transcript text.")
40002
+ })
40003
+ ).or(import_zod13.z.null()).optional().describe(
40004
+ "List of detected entities with their text, type, and character positions in the transcript."
40005
+ )
40006
+ }).describe("Chunk-level detail of the transcription with timing information.").or( // Second union member: the multichannel response object
40007
+ import_zod13.z.object({
40008
+ transcripts: import_zod13.z.array( // Required array; each element repeats the single-transcript field set declared above
40009
+ import_zod13.z.object({
40010
+ language_code: import_zod13.z.string().describe("The detected language code (e.g. 'eng' for English)."),
40011
+ language_probability: import_zod13.z.number().describe("The confidence score of the language detection (0 to 1)."),
40012
+ text: import_zod13.z.string().describe("The raw text of the transcription."),
40013
+ words: import_zod13.z.array(
40014
+ import_zod13.z.object({
40015
+ text: import_zod13.z.string().describe("The word or sound that was transcribed."),
40016
+ start: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The start time of the word or sound in seconds."),
40017
+ end: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The end time of the word or sound in seconds."),
40018
+ type: import_zod13.z.enum(["word", "spacing", "audio_event"]).describe(
40019
+ "The type of the word or sound. 'audio_event' is used for non-word sounds like laughter or footsteps."
40020
+ ),
40021
+ speaker_id: import_zod13.z.string().or(import_zod13.z.null()).optional().describe("Unique identifier for the speaker of this word."),
40022
+ logprob: import_zod13.z.number().describe(
40023
+ "The log of the probability with which this word was predicted. Logprobs are in range [-infinity, 0], higher logprobs indicate a higher confidence the model has in its predictions."
40024
+ ),
40025
+ characters: import_zod13.z.array(
40026
+ import_zod13.z.object({
40027
+ text: import_zod13.z.string().describe("The character that was transcribed."),
40028
+ start: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The start time of the character in seconds."),
40029
+ end: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The end time of the character in seconds.")
40030
+ })
40031
+ ).or(import_zod13.z.null()).optional().describe(
40032
+ "The characters that make up the word and their timing information."
40033
+ )
40034
+ }).describe("Word-level detail of the transcription with timing information.")
40035
+ ).describe("List of words with their timing information."),
40036
+ channel_index: import_zod13.z.number().or(import_zod13.z.null()).optional().describe(
40037
+ "The channel index this transcript belongs to (for multichannel audio)."
40038
+ ),
40039
+ additional_formats: import_zod13.z.array(
40040
+ import_zod13.z.object({
40041
+ requested_format: import_zod13.z.string().describe("The requested format."),
40042
+ file_extension: import_zod13.z.string().describe("The file extension of the additional format."),
40043
+ content_type: import_zod13.z.string().describe("The content type of the additional format."),
40044
+ is_base64_encoded: import_zod13.z.boolean().describe("Whether the content is base64 encoded."),
40045
+ content: import_zod13.z.string().describe("The content of the additional format.")
40046
+ }).or(import_zod13.z.null())
40047
+ ).or(import_zod13.z.null()).optional().describe("Requested additional formats of the transcript."),
40048
+ transcription_id: import_zod13.z.string().or(import_zod13.z.null()).optional().describe("The transcription ID of the response."),
40049
+ entities: import_zod13.z.array(
40050
+ import_zod13.z.object({
40051
+ text: import_zod13.z.string().describe("The text that was identified as an entity."),
40052
+ entity_type: import_zod13.z.string().describe(
40053
+ "The type of entity detected (e.g., 'credit_card', 'email_address', 'person_name')."
40054
+ ),
40055
+ start_char: import_zod13.z.number().describe("Start character position in the transcript text."),
40056
+ end_char: import_zod13.z.number().describe("End character position in the transcript text.")
40057
+ })
40058
+ ).or(import_zod13.z.null()).optional().describe(
40059
+ "List of detected entities with their text, type, and character positions in the transcript."
40060
+ )
40061
+ }).describe("Chunk-level detail of the transcription with timing information.")
40062
+ ).describe(
40063
+ "List of transcripts, one for each audio channel. Each transcript contains the text and word-level details for its respective channel."
40064
+ ),
40065
+ transcription_id: import_zod13.z.string().or(import_zod13.z.null()).optional().describe("The transcription ID of the response.") // Top-level ID on the multichannel wrapper, separate from the per-transcript IDs
40066
+ }).describe("Response model for multichannel speech-to-text transcription.")
40067
+ );
40068
+ var deleteTranscriptByIdParams = import_zod13.z.object({ // Zod schema for the parameters of the delete-transcript-by-id operation
40069
+ transcription_id: import_zod13.z.string().describe("The unique ID of the transcript to delete") // Required string; no format or length constraint is applied here
40070
+ });
40071
+ var deleteTranscriptByIdHeader = import_zod13.z.object({ // Zod schema for the request headers of the delete-transcript-by-id operation
40072
+ "xi-api-key": import_zod13.z.string().or(import_zod13.z.null()).optional().describe( // Accepts a string, null, or an omitted key. NOTE(review): the schema does not enforce presence of the key - confirm authentication is checked elsewhere
40073
+ "Your API key. This is required by most endpoints to access our API programmatically. You can view your xi-api-key using the 'Profile' tab on the website."
40074
+ )
40075
+ });
40076
+ var deleteTranscriptByIdResponse = import_zod13.z.any(); // Delete response schema: z.any() accepts every value, so no response structure is validated
38710
40077
  // Annotate the CommonJS export names for ESM import in node:
38711
40078
  0 && (module.exports = {
38712
40079
  AllLanguageCodes,
@@ -38762,6 +40129,13 @@ var getUsageResponse = import_zod12.z.object({
38762
40129
  DeepgramTranscriptionSchema,
38763
40130
  DeepgramTypes,
38764
40131
  DeepgramZodSchemas,
40132
+ ElevenLabsAdapter,
40133
+ ElevenLabsCapabilities,
40134
+ ElevenLabsLanguageCodes,
40135
+ ElevenLabsLanguageLabels,
40136
+ ElevenLabsLanguages,
40137
+ ElevenLabsTypes,
40138
+ ElevenLabsZodSchemas,
38765
40139
  GladiaAdapter,
38766
40140
  GladiaBitDepth,
38767
40141
  GladiaCapabilities,
@@ -38848,6 +40222,7 @@ var getUsageResponse = import_zod12.z.object({
38848
40222
  createAssemblyAIAdapter,
38849
40223
  createAzureSTTAdapter,
38850
40224
  createDeepgramAdapter,
40225
+ createElevenLabsAdapter,
38851
40226
  createGladiaAdapter,
38852
40227
  createOpenAIWhisperAdapter,
38853
40228
  createSonioxAdapter,