voice-router-dev 0.7.9 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +202 -1
- package/README.md +21 -2
- package/dist/constants.d.mts +600 -12
- package/dist/constants.d.ts +600 -12
- package/dist/constants.js +548 -5
- package/dist/constants.mjs +537 -5
- package/dist/{field-configs-CaXYfrJg.d.mts → field-configs-CDVygOte.d.mts} +26 -20
- package/dist/{field-configs-CaXYfrJg.d.ts → field-configs-CDVygOte.d.ts} +26 -20
- package/dist/field-configs.d.mts +1 -1
- package/dist/field-configs.d.ts +1 -1
- package/dist/field-configs.js +7 -4
- package/dist/field-configs.mjs +7 -4
- package/dist/index.d.mts +3184 -1367
- package/dist/index.d.ts +3184 -1367
- package/dist/index.js +1529 -105
- package/dist/index.mjs +1521 -105
- package/dist/{provider-metadata-DVQcYIHe.d.mts → provider-metadata-BnkedpXm.d.mts} +34 -4
- package/dist/{provider-metadata-Derls1wa.d.ts → provider-metadata-DbsSGAO7.d.ts} +34 -4
- package/dist/provider-metadata.d.mts +2 -2
- package/dist/provider-metadata.d.ts +2 -2
- package/dist/provider-metadata.js +349 -9
- package/dist/provider-metadata.mjs +345 -9
- package/dist/{transcriptWebhookNotification-BTxv69ck.d.ts → transcriptWebhookNotification-BJk1CEF5.d.ts} +712 -9
- package/dist/{transcriptWebhookNotification-DCcbnAKP.d.mts → transcriptWebhookNotification-CNFpns9f.d.mts} +712 -9
- package/dist/webhooks.d.mts +102 -5
- package/dist/webhooks.d.ts +102 -5
- package/dist/webhooks.js +342 -39
- package/dist/webhooks.mjs +340 -39
- package/package.json +11 -5
package/dist/index.js
CHANGED
|
@@ -84,6 +84,13 @@ __export(src_exports, {
|
|
|
84
84
|
DeepgramTranscriptionSchema: () => DeepgramTranscriptionSchema,
|
|
85
85
|
DeepgramTypes: () => schema_exports4,
|
|
86
86
|
DeepgramZodSchemas: () => deepgramAPISpecification_zod_exports,
|
|
87
|
+
ElevenLabsAdapter: () => ElevenLabsAdapter,
|
|
88
|
+
ElevenLabsCapabilities: () => ElevenLabsCapabilities,
|
|
89
|
+
ElevenLabsLanguageCodes: () => ElevenLabsLanguageCodes,
|
|
90
|
+
ElevenLabsLanguageLabels: () => ElevenLabsLanguageLabels,
|
|
91
|
+
ElevenLabsLanguages: () => ElevenLabsLanguages,
|
|
92
|
+
ElevenLabsTypes: () => schema_exports8,
|
|
93
|
+
ElevenLabsZodSchemas: () => elevenLabsSpeechToTextAPI_zod_exports,
|
|
87
94
|
GladiaAdapter: () => GladiaAdapter,
|
|
88
95
|
GladiaBitDepth: () => GladiaBitDepth,
|
|
89
96
|
GladiaCapabilities: () => GladiaCapabilities,
|
|
@@ -170,6 +177,7 @@ __export(src_exports, {
|
|
|
170
177
|
createAssemblyAIAdapter: () => createAssemblyAIAdapter,
|
|
171
178
|
createAzureSTTAdapter: () => createAzureSTTAdapter,
|
|
172
179
|
createDeepgramAdapter: () => createDeepgramAdapter,
|
|
180
|
+
createElevenLabsAdapter: () => createElevenLabsAdapter,
|
|
173
181
|
createGladiaAdapter: () => createGladiaAdapter,
|
|
174
182
|
createOpenAIWhisperAdapter: () => createOpenAIWhisperAdapter,
|
|
175
183
|
createSonioxAdapter: () => createSonioxAdapter,
|
|
@@ -639,6 +647,22 @@ var DeepgramLanguage = {
|
|
|
639
647
|
yo: "yo",
|
|
640
648
|
zh: "zh",
|
|
641
649
|
// Regional variants
|
|
650
|
+
"ar-AE": "ar-AE",
|
|
651
|
+
"ar-DZ": "ar-DZ",
|
|
652
|
+
"ar-EG": "ar-EG",
|
|
653
|
+
"ar-IQ": "ar-IQ",
|
|
654
|
+
"ar-IR": "ar-IR",
|
|
655
|
+
"ar-JO": "ar-JO",
|
|
656
|
+
"ar-KW": "ar-KW",
|
|
657
|
+
"ar-LB": "ar-LB",
|
|
658
|
+
"ar-MA": "ar-MA",
|
|
659
|
+
"ar-PS": "ar-PS",
|
|
660
|
+
"ar-QA": "ar-QA",
|
|
661
|
+
"ar-SA": "ar-SA",
|
|
662
|
+
"ar-SD": "ar-SD",
|
|
663
|
+
"ar-SY": "ar-SY",
|
|
664
|
+
"ar-TD": "ar-TD",
|
|
665
|
+
"ar-TN": "ar-TN",
|
|
642
666
|
"be-BY": "be-BY",
|
|
643
667
|
"bn-IN": "bn-IN",
|
|
644
668
|
"bs-BA": "bs-BA",
|
|
@@ -953,7 +977,9 @@ var SonioxLanguage = {
|
|
|
953
977
|
|
|
954
978
|
// src/generated/soniox/models.ts
|
|
955
979
|
var SonioxModels = [
|
|
980
|
+
{ id: "stt-rt-v4", name: "Speech-to-Text Real-time v4", mode: "real_time" },
|
|
956
981
|
{ id: "stt-rt-v3", name: "Speech-to-Text Real-time v3", mode: "real_time" },
|
|
982
|
+
{ id: "stt-async-v4", name: "Speech-to-Text Async v4", mode: "async" },
|
|
957
983
|
{ id: "stt-async-v3", name: "Speech-to-Text Async v3", mode: "async" },
|
|
958
984
|
{ id: "stt-rt-preview", name: "Speech-to-Text Real-time Preview", mode: "real_time", aliasOf: "stt-rt-v3" },
|
|
959
985
|
{ id: "stt-async-preview", name: "Speech-to-Text Async Preview", mode: "async", aliasOf: "stt-async-v3" },
|
|
@@ -962,7 +988,9 @@ var SonioxModels = [
|
|
|
962
988
|
{ id: "stt-async-preview-v1", name: "Speech-to-Text Async Preview v1", mode: "async", aliasOf: "stt-async-v3" }
|
|
963
989
|
];
|
|
964
990
|
var SonioxModelCodes = [
|
|
991
|
+
"stt-rt-v4",
|
|
965
992
|
"stt-rt-v3",
|
|
993
|
+
"stt-async-v4",
|
|
966
994
|
"stt-async-v3",
|
|
967
995
|
"stt-rt-preview",
|
|
968
996
|
"stt-async-preview",
|
|
@@ -971,7 +999,9 @@ var SonioxModelCodes = [
|
|
|
971
999
|
"stt-async-preview-v1"
|
|
972
1000
|
];
|
|
973
1001
|
var SonioxModelLabels = {
|
|
1002
|
+
"stt-rt-v4": "Speech-to-Text Real-time v4",
|
|
974
1003
|
"stt-rt-v3": "Speech-to-Text Real-time v3",
|
|
1004
|
+
"stt-async-v4": "Speech-to-Text Async v4",
|
|
975
1005
|
"stt-async-v3": "Speech-to-Text Async v3",
|
|
976
1006
|
"stt-rt-preview": "Speech-to-Text Real-time Preview",
|
|
977
1007
|
"stt-async-preview": "Speech-to-Text Async Preview",
|
|
@@ -980,7 +1010,9 @@ var SonioxModelLabels = {
|
|
|
980
1010
|
"stt-async-preview-v1": "Speech-to-Text Async Preview v1"
|
|
981
1011
|
};
|
|
982
1012
|
var SonioxModel = {
|
|
1013
|
+
stt_rt_v4: "stt-rt-v4",
|
|
983
1014
|
stt_rt_v3: "stt-rt-v3",
|
|
1015
|
+
stt_async_v4: "stt-async-v4",
|
|
984
1016
|
stt_async_v3: "stt-async-v3",
|
|
985
1017
|
stt_rt_preview: "stt-rt-preview",
|
|
986
1018
|
stt_async_preview: "stt-async-preview",
|
|
@@ -989,12 +1021,14 @@ var SonioxModel = {
|
|
|
989
1021
|
stt_async_preview_v1: "stt-async-preview-v1"
|
|
990
1022
|
};
|
|
991
1023
|
var SonioxRealtimeModel = {
|
|
1024
|
+
stt_rt_v4: "stt-rt-v4",
|
|
992
1025
|
stt_rt_v3: "stt-rt-v3",
|
|
993
1026
|
stt_rt_preview: "stt-rt-preview",
|
|
994
1027
|
stt_rt_v3_preview: "stt-rt-v3-preview",
|
|
995
1028
|
stt_rt_preview_v2: "stt-rt-preview-v2"
|
|
996
1029
|
};
|
|
997
1030
|
var SonioxAsyncModel = {
|
|
1031
|
+
stt_async_v4: "stt-async-v4",
|
|
998
1032
|
stt_async_v3: "stt-async-v3",
|
|
999
1033
|
stt_async_preview: "stt-async-preview",
|
|
1000
1034
|
stt_async_preview_v1: "stt-async-preview-v1"
|
|
@@ -1004,6 +1038,7 @@ var SonioxAsyncModel = {
|
|
|
1004
1038
|
var SpeechmaticsLanguages = [
|
|
1005
1039
|
{ code: "auto", name: "Automatic Detection" },
|
|
1006
1040
|
{ code: "ar", name: "Arabic" },
|
|
1041
|
+
{ code: "ar_en", name: "Arabic / English" },
|
|
1007
1042
|
{ code: "ba", name: "Bashkir" },
|
|
1008
1043
|
{ code: "be", name: "Belarusian" },
|
|
1009
1044
|
{ code: "bg", name: "Bulgarian" },
|
|
@@ -1068,6 +1103,7 @@ var SpeechmaticsLanguages = [
|
|
|
1068
1103
|
var SpeechmaticsLanguageCodes = [
|
|
1069
1104
|
"auto",
|
|
1070
1105
|
"ar",
|
|
1106
|
+
"ar_en",
|
|
1071
1107
|
"ba",
|
|
1072
1108
|
"be",
|
|
1073
1109
|
"bg",
|
|
@@ -1132,6 +1168,7 @@ var SpeechmaticsLanguageCodes = [
|
|
|
1132
1168
|
var SpeechmaticsLanguageLabels = {
|
|
1133
1169
|
"auto": "Automatic Detection",
|
|
1134
1170
|
"ar": "Arabic",
|
|
1171
|
+
"ar_en": "Arabic / English",
|
|
1135
1172
|
"ba": "Bashkir",
|
|
1136
1173
|
"be": "Belarusian",
|
|
1137
1174
|
"bg": "Bulgarian",
|
|
@@ -1196,6 +1233,7 @@ var SpeechmaticsLanguageLabels = {
|
|
|
1196
1233
|
var SpeechmaticsLanguage = {
|
|
1197
1234
|
"auto": "auto",
|
|
1198
1235
|
"ar": "ar",
|
|
1236
|
+
"ar_en": "ar_en",
|
|
1199
1237
|
"ba": "ba",
|
|
1200
1238
|
"be": "be",
|
|
1201
1239
|
"bg": "bg",
|
|
@@ -1382,7 +1420,6 @@ var AzureLocales = [
|
|
|
1382
1420
|
{ code: "ne-NP", name: "Nepali (Nepal)" },
|
|
1383
1421
|
{ code: "nl-BE", name: "Dutch (Belgium)" },
|
|
1384
1422
|
{ code: "nl-NL", name: "Dutch (Netherlands)" },
|
|
1385
|
-
{ code: "non-HD", name: "Norse (Historical)" },
|
|
1386
1423
|
{ code: "or-IN", name: "Odia (India)" },
|
|
1387
1424
|
{ code: "pa-IN", name: "Punjabi (India)" },
|
|
1388
1425
|
{ code: "pl-PL", name: "Polish (Poland)" },
|
|
@@ -1396,7 +1433,9 @@ var AzureLocales = [
|
|
|
1396
1433
|
{ code: "sl-SI", name: "Slovenian (Slovenia)" },
|
|
1397
1434
|
{ code: "so-SO", name: "Somali (Somalia)" },
|
|
1398
1435
|
{ code: "sq-AL", name: "Albanian (Albania)" },
|
|
1436
|
+
{ code: "sr-ME", name: "Serbian (ME)" },
|
|
1399
1437
|
{ code: "sr-RS", name: "Serbian (Serbia)" },
|
|
1438
|
+
{ code: "sr-XK", name: "Serbian (XK)" },
|
|
1400
1439
|
{ code: "su-ID", name: "Sundanese (Indonesia)" },
|
|
1401
1440
|
{ code: "sv-SE", name: "Swedish (Sweden)" },
|
|
1402
1441
|
{ code: "sw-KE", name: "Swahili (Kenya)" },
|
|
@@ -1538,7 +1577,6 @@ var AzureLocaleCodes = [
|
|
|
1538
1577
|
"ne-NP",
|
|
1539
1578
|
"nl-BE",
|
|
1540
1579
|
"nl-NL",
|
|
1541
|
-
"non-HD",
|
|
1542
1580
|
"or-IN",
|
|
1543
1581
|
"pa-IN",
|
|
1544
1582
|
"pl-PL",
|
|
@@ -1552,7 +1590,9 @@ var AzureLocaleCodes = [
|
|
|
1552
1590
|
"sl-SI",
|
|
1553
1591
|
"so-SO",
|
|
1554
1592
|
"sq-AL",
|
|
1593
|
+
"sr-ME",
|
|
1555
1594
|
"sr-RS",
|
|
1595
|
+
"sr-XK",
|
|
1556
1596
|
"su-ID",
|
|
1557
1597
|
"sv-SE",
|
|
1558
1598
|
"sw-KE",
|
|
@@ -1694,7 +1734,6 @@ var AzureLocaleLabels = {
|
|
|
1694
1734
|
"ne-NP": "Nepali (Nepal)",
|
|
1695
1735
|
"nl-BE": "Dutch (Belgium)",
|
|
1696
1736
|
"nl-NL": "Dutch (Netherlands)",
|
|
1697
|
-
"non-HD": "Norse (Historical)",
|
|
1698
1737
|
"or-IN": "Odia (India)",
|
|
1699
1738
|
"pa-IN": "Punjabi (India)",
|
|
1700
1739
|
"pl-PL": "Polish (Poland)",
|
|
@@ -1708,7 +1747,9 @@ var AzureLocaleLabels = {
|
|
|
1708
1747
|
"sl-SI": "Slovenian (Slovenia)",
|
|
1709
1748
|
"so-SO": "Somali (Somalia)",
|
|
1710
1749
|
"sq-AL": "Albanian (Albania)",
|
|
1750
|
+
"sr-ME": "Serbian (ME)",
|
|
1711
1751
|
"sr-RS": "Serbian (Serbia)",
|
|
1752
|
+
"sr-XK": "Serbian (XK)",
|
|
1712
1753
|
"su-ID": "Sundanese (Indonesia)",
|
|
1713
1754
|
"sv-SE": "Swedish (Sweden)",
|
|
1714
1755
|
"sw-KE": "Swahili (Kenya)",
|
|
@@ -1850,7 +1891,6 @@ var AzureLocale = {
|
|
|
1850
1891
|
"ne-NP": "ne-NP",
|
|
1851
1892
|
"nl-BE": "nl-BE",
|
|
1852
1893
|
"nl-NL": "nl-NL",
|
|
1853
|
-
"non-HD": "non-HD",
|
|
1854
1894
|
"or-IN": "or-IN",
|
|
1855
1895
|
"pa-IN": "pa-IN",
|
|
1856
1896
|
"pl-PL": "pl-PL",
|
|
@@ -1864,7 +1904,9 @@ var AzureLocale = {
|
|
|
1864
1904
|
"sl-SI": "sl-SI",
|
|
1865
1905
|
"so-SO": "so-SO",
|
|
1866
1906
|
"sq-AL": "sq-AL",
|
|
1907
|
+
"sr-ME": "sr-ME",
|
|
1867
1908
|
"sr-RS": "sr-RS",
|
|
1909
|
+
"sr-XK": "sr-XK",
|
|
1868
1910
|
"su-ID": "su-ID",
|
|
1869
1911
|
"sv-SE": "sv-SE",
|
|
1870
1912
|
"sw-KE": "sw-KE",
|
|
@@ -1890,6 +1932,311 @@ var AzureLocale = {
|
|
|
1890
1932
|
"zu-ZA": "zu-ZA"
|
|
1891
1933
|
};
|
|
1892
1934
|
|
|
1935
|
+
// src/generated/elevenlabs/languages.ts
|
|
1936
|
+
var ElevenLabsLanguages = [
|
|
1937
|
+
{ code: "en", name: "English" },
|
|
1938
|
+
{ code: "zh", name: "Chinese" },
|
|
1939
|
+
{ code: "de", name: "German" },
|
|
1940
|
+
{ code: "es", name: "Spanish" },
|
|
1941
|
+
{ code: "ru", name: "Russian" },
|
|
1942
|
+
{ code: "ko", name: "Korean" },
|
|
1943
|
+
{ code: "fr", name: "French" },
|
|
1944
|
+
{ code: "ja", name: "Japanese" },
|
|
1945
|
+
{ code: "pt", name: "Portuguese" },
|
|
1946
|
+
{ code: "tr", name: "Turkish" },
|
|
1947
|
+
{ code: "pl", name: "Polish" },
|
|
1948
|
+
{ code: "ca", name: "Catalan" },
|
|
1949
|
+
{ code: "nl", name: "Dutch" },
|
|
1950
|
+
{ code: "ar", name: "Arabic" },
|
|
1951
|
+
{ code: "sv", name: "Swedish" },
|
|
1952
|
+
{ code: "it", name: "Italian" },
|
|
1953
|
+
{ code: "id", name: "Indonesian" },
|
|
1954
|
+
{ code: "hi", name: "Hindi" },
|
|
1955
|
+
{ code: "fi", name: "Finnish" },
|
|
1956
|
+
{ code: "vi", name: "Vietnamese" },
|
|
1957
|
+
{ code: "he", name: "Hebrew" },
|
|
1958
|
+
{ code: "uk", name: "Ukrainian" },
|
|
1959
|
+
{ code: "el", name: "Greek" },
|
|
1960
|
+
{ code: "ms", name: "Malay" },
|
|
1961
|
+
{ code: "cs", name: "Czech" },
|
|
1962
|
+
{ code: "ro", name: "Romanian" },
|
|
1963
|
+
{ code: "da", name: "Danish" },
|
|
1964
|
+
{ code: "hu", name: "Hungarian" },
|
|
1965
|
+
{ code: "ta", name: "Tamil" },
|
|
1966
|
+
{ code: "no", name: "Norwegian" },
|
|
1967
|
+
{ code: "th", name: "Thai" },
|
|
1968
|
+
{ code: "ur", name: "Urdu" },
|
|
1969
|
+
{ code: "hr", name: "Croatian" },
|
|
1970
|
+
{ code: "bg", name: "Bulgarian" },
|
|
1971
|
+
{ code: "lt", name: "Lithuanian" },
|
|
1972
|
+
{ code: "la", name: "Latin" },
|
|
1973
|
+
{ code: "mi", name: "Maori" },
|
|
1974
|
+
{ code: "ml", name: "Malayalam" },
|
|
1975
|
+
{ code: "cy", name: "Welsh" },
|
|
1976
|
+
{ code: "sk", name: "Slovak" },
|
|
1977
|
+
{ code: "te", name: "Telugu" },
|
|
1978
|
+
{ code: "fa", name: "Persian" },
|
|
1979
|
+
{ code: "lv", name: "Latvian" },
|
|
1980
|
+
{ code: "bn", name: "Bengali" },
|
|
1981
|
+
{ code: "sr", name: "Serbian" },
|
|
1982
|
+
{ code: "az", name: "Azerbaijani" },
|
|
1983
|
+
{ code: "sl", name: "Slovenian" },
|
|
1984
|
+
{ code: "kn", name: "Kannada" },
|
|
1985
|
+
{ code: "et", name: "Estonian" },
|
|
1986
|
+
{ code: "mk", name: "Macedonian" },
|
|
1987
|
+
{ code: "br", name: "Breton" },
|
|
1988
|
+
{ code: "eu", name: "Basque" },
|
|
1989
|
+
{ code: "is", name: "Icelandic" },
|
|
1990
|
+
{ code: "hy", name: "Armenian" },
|
|
1991
|
+
{ code: "ne", name: "Nepali" },
|
|
1992
|
+
{ code: "mn", name: "Mongolian" },
|
|
1993
|
+
{ code: "bs", name: "Bosnian" },
|
|
1994
|
+
{ code: "kk", name: "Kazakh" },
|
|
1995
|
+
{ code: "sq", name: "Albanian" },
|
|
1996
|
+
{ code: "sw", name: "Swahili" },
|
|
1997
|
+
{ code: "gl", name: "Galician" },
|
|
1998
|
+
{ code: "mr", name: "Marathi" },
|
|
1999
|
+
{ code: "pa", name: "Punjabi" },
|
|
2000
|
+
{ code: "si", name: "Sinhala" },
|
|
2001
|
+
{ code: "km", name: "Khmer" },
|
|
2002
|
+
{ code: "sn", name: "Shona" },
|
|
2003
|
+
{ code: "yo", name: "Yoruba" },
|
|
2004
|
+
{ code: "so", name: "Somali" },
|
|
2005
|
+
{ code: "af", name: "Afrikaans" },
|
|
2006
|
+
{ code: "oc", name: "Occitan" },
|
|
2007
|
+
{ code: "ka", name: "Georgian" },
|
|
2008
|
+
{ code: "be", name: "Belarusian" },
|
|
2009
|
+
{ code: "tg", name: "Tajik" },
|
|
2010
|
+
{ code: "sd", name: "Sindhi" },
|
|
2011
|
+
{ code: "gu", name: "Gujarati" },
|
|
2012
|
+
{ code: "am", name: "Amharic" },
|
|
2013
|
+
{ code: "yi", name: "Yiddish" },
|
|
2014
|
+
{ code: "lo", name: "Lao" },
|
|
2015
|
+
{ code: "uz", name: "Uzbek" },
|
|
2016
|
+
{ code: "fo", name: "Faroese" },
|
|
2017
|
+
{ code: "ht", name: "Haitian Creole" },
|
|
2018
|
+
{ code: "ps", name: "Pashto" },
|
|
2019
|
+
{ code: "tk", name: "Turkmen" },
|
|
2020
|
+
{ code: "nn", name: "Norwegian Nynorsk" },
|
|
2021
|
+
{ code: "mt", name: "Maltese" },
|
|
2022
|
+
{ code: "sa", name: "Sanskrit" },
|
|
2023
|
+
{ code: "lb", name: "Luxembourgish" },
|
|
2024
|
+
{ code: "my", name: "Burmese" },
|
|
2025
|
+
{ code: "bo", name: "Tibetan" },
|
|
2026
|
+
{ code: "tl", name: "Tagalog" },
|
|
2027
|
+
{ code: "mg", name: "Malagasy" },
|
|
2028
|
+
{ code: "as", name: "Assamese" },
|
|
2029
|
+
{ code: "tt", name: "Tatar" },
|
|
2030
|
+
{ code: "haw", name: "Hawaiian" },
|
|
2031
|
+
{ code: "ln", name: "Lingala" },
|
|
2032
|
+
{ code: "ha", name: "Hausa" },
|
|
2033
|
+
{ code: "ba", name: "Bashkir" },
|
|
2034
|
+
{ code: "jw", name: "Javanese" },
|
|
2035
|
+
{ code: "su", name: "Sundanese" }
|
|
2036
|
+
];
|
|
2037
|
+
var ElevenLabsLanguageCodes = [
|
|
2038
|
+
"en",
|
|
2039
|
+
"zh",
|
|
2040
|
+
"de",
|
|
2041
|
+
"es",
|
|
2042
|
+
"ru",
|
|
2043
|
+
"ko",
|
|
2044
|
+
"fr",
|
|
2045
|
+
"ja",
|
|
2046
|
+
"pt",
|
|
2047
|
+
"tr",
|
|
2048
|
+
"pl",
|
|
2049
|
+
"ca",
|
|
2050
|
+
"nl",
|
|
2051
|
+
"ar",
|
|
2052
|
+
"sv",
|
|
2053
|
+
"it",
|
|
2054
|
+
"id",
|
|
2055
|
+
"hi",
|
|
2056
|
+
"fi",
|
|
2057
|
+
"vi",
|
|
2058
|
+
"he",
|
|
2059
|
+
"uk",
|
|
2060
|
+
"el",
|
|
2061
|
+
"ms",
|
|
2062
|
+
"cs",
|
|
2063
|
+
"ro",
|
|
2064
|
+
"da",
|
|
2065
|
+
"hu",
|
|
2066
|
+
"ta",
|
|
2067
|
+
"no",
|
|
2068
|
+
"th",
|
|
2069
|
+
"ur",
|
|
2070
|
+
"hr",
|
|
2071
|
+
"bg",
|
|
2072
|
+
"lt",
|
|
2073
|
+
"la",
|
|
2074
|
+
"mi",
|
|
2075
|
+
"ml",
|
|
2076
|
+
"cy",
|
|
2077
|
+
"sk",
|
|
2078
|
+
"te",
|
|
2079
|
+
"fa",
|
|
2080
|
+
"lv",
|
|
2081
|
+
"bn",
|
|
2082
|
+
"sr",
|
|
2083
|
+
"az",
|
|
2084
|
+
"sl",
|
|
2085
|
+
"kn",
|
|
2086
|
+
"et",
|
|
2087
|
+
"mk",
|
|
2088
|
+
"br",
|
|
2089
|
+
"eu",
|
|
2090
|
+
"is",
|
|
2091
|
+
"hy",
|
|
2092
|
+
"ne",
|
|
2093
|
+
"mn",
|
|
2094
|
+
"bs",
|
|
2095
|
+
"kk",
|
|
2096
|
+
"sq",
|
|
2097
|
+
"sw",
|
|
2098
|
+
"gl",
|
|
2099
|
+
"mr",
|
|
2100
|
+
"pa",
|
|
2101
|
+
"si",
|
|
2102
|
+
"km",
|
|
2103
|
+
"sn",
|
|
2104
|
+
"yo",
|
|
2105
|
+
"so",
|
|
2106
|
+
"af",
|
|
2107
|
+
"oc",
|
|
2108
|
+
"ka",
|
|
2109
|
+
"be",
|
|
2110
|
+
"tg",
|
|
2111
|
+
"sd",
|
|
2112
|
+
"gu",
|
|
2113
|
+
"am",
|
|
2114
|
+
"yi",
|
|
2115
|
+
"lo",
|
|
2116
|
+
"uz",
|
|
2117
|
+
"fo",
|
|
2118
|
+
"ht",
|
|
2119
|
+
"ps",
|
|
2120
|
+
"tk",
|
|
2121
|
+
"nn",
|
|
2122
|
+
"mt",
|
|
2123
|
+
"sa",
|
|
2124
|
+
"lb",
|
|
2125
|
+
"my",
|
|
2126
|
+
"bo",
|
|
2127
|
+
"tl",
|
|
2128
|
+
"mg",
|
|
2129
|
+
"as",
|
|
2130
|
+
"tt",
|
|
2131
|
+
"haw",
|
|
2132
|
+
"ln",
|
|
2133
|
+
"ha",
|
|
2134
|
+
"ba",
|
|
2135
|
+
"jw",
|
|
2136
|
+
"su"
|
|
2137
|
+
];
|
|
2138
|
+
var ElevenLabsLanguageLabels = {
|
|
2139
|
+
en: "English",
|
|
2140
|
+
zh: "Chinese",
|
|
2141
|
+
de: "German",
|
|
2142
|
+
es: "Spanish",
|
|
2143
|
+
ru: "Russian",
|
|
2144
|
+
ko: "Korean",
|
|
2145
|
+
fr: "French",
|
|
2146
|
+
ja: "Japanese",
|
|
2147
|
+
pt: "Portuguese",
|
|
2148
|
+
tr: "Turkish",
|
|
2149
|
+
pl: "Polish",
|
|
2150
|
+
ca: "Catalan",
|
|
2151
|
+
nl: "Dutch",
|
|
2152
|
+
ar: "Arabic",
|
|
2153
|
+
sv: "Swedish",
|
|
2154
|
+
it: "Italian",
|
|
2155
|
+
id: "Indonesian",
|
|
2156
|
+
hi: "Hindi",
|
|
2157
|
+
fi: "Finnish",
|
|
2158
|
+
vi: "Vietnamese",
|
|
2159
|
+
he: "Hebrew",
|
|
2160
|
+
uk: "Ukrainian",
|
|
2161
|
+
el: "Greek",
|
|
2162
|
+
ms: "Malay",
|
|
2163
|
+
cs: "Czech",
|
|
2164
|
+
ro: "Romanian",
|
|
2165
|
+
da: "Danish",
|
|
2166
|
+
hu: "Hungarian",
|
|
2167
|
+
ta: "Tamil",
|
|
2168
|
+
no: "Norwegian",
|
|
2169
|
+
th: "Thai",
|
|
2170
|
+
ur: "Urdu",
|
|
2171
|
+
hr: "Croatian",
|
|
2172
|
+
bg: "Bulgarian",
|
|
2173
|
+
lt: "Lithuanian",
|
|
2174
|
+
la: "Latin",
|
|
2175
|
+
mi: "Maori",
|
|
2176
|
+
ml: "Malayalam",
|
|
2177
|
+
cy: "Welsh",
|
|
2178
|
+
sk: "Slovak",
|
|
2179
|
+
te: "Telugu",
|
|
2180
|
+
fa: "Persian",
|
|
2181
|
+
lv: "Latvian",
|
|
2182
|
+
bn: "Bengali",
|
|
2183
|
+
sr: "Serbian",
|
|
2184
|
+
az: "Azerbaijani",
|
|
2185
|
+
sl: "Slovenian",
|
|
2186
|
+
kn: "Kannada",
|
|
2187
|
+
et: "Estonian",
|
|
2188
|
+
mk: "Macedonian",
|
|
2189
|
+
br: "Breton",
|
|
2190
|
+
eu: "Basque",
|
|
2191
|
+
is: "Icelandic",
|
|
2192
|
+
hy: "Armenian",
|
|
2193
|
+
ne: "Nepali",
|
|
2194
|
+
mn: "Mongolian",
|
|
2195
|
+
bs: "Bosnian",
|
|
2196
|
+
kk: "Kazakh",
|
|
2197
|
+
sq: "Albanian",
|
|
2198
|
+
sw: "Swahili",
|
|
2199
|
+
gl: "Galician",
|
|
2200
|
+
mr: "Marathi",
|
|
2201
|
+
pa: "Punjabi",
|
|
2202
|
+
si: "Sinhala",
|
|
2203
|
+
km: "Khmer",
|
|
2204
|
+
sn: "Shona",
|
|
2205
|
+
yo: "Yoruba",
|
|
2206
|
+
so: "Somali",
|
|
2207
|
+
af: "Afrikaans",
|
|
2208
|
+
oc: "Occitan",
|
|
2209
|
+
ka: "Georgian",
|
|
2210
|
+
be: "Belarusian",
|
|
2211
|
+
tg: "Tajik",
|
|
2212
|
+
sd: "Sindhi",
|
|
2213
|
+
gu: "Gujarati",
|
|
2214
|
+
am: "Amharic",
|
|
2215
|
+
yi: "Yiddish",
|
|
2216
|
+
lo: "Lao",
|
|
2217
|
+
uz: "Uzbek",
|
|
2218
|
+
fo: "Faroese",
|
|
2219
|
+
ht: "Haitian Creole",
|
|
2220
|
+
ps: "Pashto",
|
|
2221
|
+
tk: "Turkmen",
|
|
2222
|
+
nn: "Norwegian Nynorsk",
|
|
2223
|
+
mt: "Maltese",
|
|
2224
|
+
sa: "Sanskrit",
|
|
2225
|
+
lb: "Luxembourgish",
|
|
2226
|
+
my: "Burmese",
|
|
2227
|
+
bo: "Tibetan",
|
|
2228
|
+
tl: "Tagalog",
|
|
2229
|
+
mg: "Malagasy",
|
|
2230
|
+
as: "Assamese",
|
|
2231
|
+
tt: "Tatar",
|
|
2232
|
+
haw: "Hawaiian",
|
|
2233
|
+
ln: "Lingala",
|
|
2234
|
+
ha: "Hausa",
|
|
2235
|
+
ba: "Bashkir",
|
|
2236
|
+
jw: "Javanese",
|
|
2237
|
+
su: "Sundanese"
|
|
2238
|
+
};
|
|
2239
|
+
|
|
1893
2240
|
// src/generated/gladia/schema/streamingSupportedBitDepthEnum.ts
|
|
1894
2241
|
var StreamingSupportedBitDepthEnum = {
|
|
1895
2242
|
NUMBER_8: 8,
|
|
@@ -2431,6 +2778,16 @@ var DeepgramSampleRate = {
|
|
|
2431
2778
|
NUMBER_44100: 44100,
|
|
2432
2779
|
NUMBER_48000: 48e3
|
|
2433
2780
|
};
|
|
2781
|
+
var ElevenLabsRegion = {
|
|
2782
|
+
/** Global endpoint (default) */
|
|
2783
|
+
global: "global",
|
|
2784
|
+
/** United States */
|
|
2785
|
+
us: "us",
|
|
2786
|
+
/** European Union */
|
|
2787
|
+
eu: "eu",
|
|
2788
|
+
/** India */
|
|
2789
|
+
in: "in"
|
|
2790
|
+
};
|
|
2434
2791
|
var GladiaEncoding = StreamingSupportedEncodingEnum;
|
|
2435
2792
|
var GladiaSampleRate = StreamingSupportedSampleRateEnum;
|
|
2436
2793
|
var GladiaBitDepth = StreamingSupportedBitDepthEnum;
|
|
@@ -2641,6 +2998,20 @@ var BaseAdapter = class {
|
|
|
2641
2998
|
throw new Error(`API key is required for ${this.name} provider`);
|
|
2642
2999
|
}
|
|
2643
3000
|
}
|
|
3001
|
+
/**
|
|
3002
|
+
* Derive a WebSocket URL from an HTTP base URL
|
|
3003
|
+
*
|
|
3004
|
+
* Converts `https://` → `wss://` and `http://` → `ws://`
|
|
3005
|
+
*/
|
|
3006
|
+
deriveWsUrl(httpUrl) {
|
|
3007
|
+
if (httpUrl.startsWith("https://")) {
|
|
3008
|
+
return httpUrl.replace(/^https:\/\//, "wss://");
|
|
3009
|
+
}
|
|
3010
|
+
if (httpUrl.startsWith("http://")) {
|
|
3011
|
+
return httpUrl.replace(/^http:\/\//, "ws://");
|
|
3012
|
+
}
|
|
3013
|
+
return httpUrl;
|
|
3014
|
+
}
|
|
2644
3015
|
/**
|
|
2645
3016
|
* Build axios config for generated API client functions
|
|
2646
3017
|
*
|
|
@@ -2835,6 +3206,70 @@ function extractWords(words, mapper) {
|
|
|
2835
3206
|
const normalizedWords = words.map(mapper);
|
|
2836
3207
|
return normalizedWords.length > 0 ? normalizedWords : void 0;
|
|
2837
3208
|
}
|
|
3209
|
+
function buildUtterancesFromWords(words) {
|
|
3210
|
+
const utterances = [];
|
|
3211
|
+
let currentSpeaker;
|
|
3212
|
+
let currentWords = [];
|
|
3213
|
+
let utteranceStart = 0;
|
|
3214
|
+
for (const word of words) {
|
|
3215
|
+
if (!word.speaker) continue;
|
|
3216
|
+
if (word.speaker !== currentSpeaker) {
|
|
3217
|
+
if (currentSpeaker && currentWords.length > 0) {
|
|
3218
|
+
utterances.push({
|
|
3219
|
+
text: currentWords.map((w) => w.word).join(" "),
|
|
3220
|
+
start: utteranceStart,
|
|
3221
|
+
end: currentWords[currentWords.length - 1].end,
|
|
3222
|
+
speaker: currentSpeaker,
|
|
3223
|
+
words: currentWords
|
|
3224
|
+
});
|
|
3225
|
+
}
|
|
3226
|
+
currentSpeaker = word.speaker;
|
|
3227
|
+
currentWords = [word];
|
|
3228
|
+
utteranceStart = word.start;
|
|
3229
|
+
} else {
|
|
3230
|
+
currentWords.push(word);
|
|
3231
|
+
}
|
|
3232
|
+
}
|
|
3233
|
+
if (currentSpeaker && currentWords.length > 0) {
|
|
3234
|
+
utterances.push({
|
|
3235
|
+
text: currentWords.map((w) => w.word).join(" "),
|
|
3236
|
+
start: utteranceStart,
|
|
3237
|
+
end: currentWords[currentWords.length - 1].end,
|
|
3238
|
+
speaker: currentSpeaker,
|
|
3239
|
+
words: currentWords
|
|
3240
|
+
});
|
|
3241
|
+
}
|
|
3242
|
+
return utterances;
|
|
3243
|
+
}
|
|
3244
|
+
function buildTextFromSpeechmaticsResults(results) {
|
|
3245
|
+
const parts = [];
|
|
3246
|
+
let attachNext = false;
|
|
3247
|
+
for (const result of results) {
|
|
3248
|
+
if (result.type !== "word" && result.type !== "punctuation") continue;
|
|
3249
|
+
const content = result.alternatives?.[0]?.content;
|
|
3250
|
+
if (!content) continue;
|
|
3251
|
+
if (result.type === "punctuation") {
|
|
3252
|
+
const attaches = result.attaches_to;
|
|
3253
|
+
if (attaches === "previous" || attaches === "both") {
|
|
3254
|
+
parts.push(content);
|
|
3255
|
+
attachNext = attaches === "both";
|
|
3256
|
+
} else if (attaches === "next") {
|
|
3257
|
+
if (parts.length > 0) parts.push(" ");
|
|
3258
|
+
parts.push(content);
|
|
3259
|
+
attachNext = true;
|
|
3260
|
+
} else {
|
|
3261
|
+
if (parts.length > 0 && !attachNext) parts.push(" ");
|
|
3262
|
+
parts.push(content);
|
|
3263
|
+
attachNext = false;
|
|
3264
|
+
}
|
|
3265
|
+
} else {
|
|
3266
|
+
if (parts.length > 0 && !attachNext) parts.push(" ");
|
|
3267
|
+
parts.push(content);
|
|
3268
|
+
attachNext = false;
|
|
3269
|
+
}
|
|
3270
|
+
}
|
|
3271
|
+
return parts.join("");
|
|
3272
|
+
}
|
|
2838
3273
|
var STATUS_MAPPINGS = {
|
|
2839
3274
|
gladia: {
|
|
2840
3275
|
queued: "queued",
|
|
@@ -4445,7 +4880,8 @@ var GladiaAdapter = class extends BaseAdapter {
|
|
|
4445
4880
|
options?.region ? { region: options.region } : void 0,
|
|
4446
4881
|
this.getAxiosConfig()
|
|
4447
4882
|
);
|
|
4448
|
-
const { id, url:
|
|
4883
|
+
const { id, url: apiWsUrl } = initResponse.data;
|
|
4884
|
+
const wsUrl = this.config?.wsBaseUrl || apiWsUrl;
|
|
4449
4885
|
const ws = new import_ws.default(wsUrl);
|
|
4450
4886
|
let sessionStatus = "connecting";
|
|
4451
4887
|
setupWebSocketHandlers(ws, callbacks, (status) => {
|
|
@@ -5175,6 +5611,14 @@ var AssemblyAIAdapter = class extends BaseAdapter {
|
|
|
5175
5611
|
this.wsBaseUrl = "wss://streaming.assemblyai.com/v3/ws";
|
|
5176
5612
|
}
|
|
5177
5613
|
// v3 Universal Streaming endpoint
|
|
5614
|
+
initialize(config) {
|
|
5615
|
+
super.initialize(config);
|
|
5616
|
+
if (config.wsBaseUrl) {
|
|
5617
|
+
this.wsBaseUrl = config.wsBaseUrl;
|
|
5618
|
+
} else if (config.baseUrl) {
|
|
5619
|
+
this.wsBaseUrl = `${this.deriveWsUrl(config.baseUrl)}/v3/ws`;
|
|
5620
|
+
}
|
|
5621
|
+
}
|
|
5178
5622
|
/**
|
|
5179
5623
|
* Get axios config for generated API client functions
|
|
5180
5624
|
* Configures headers and base URL using authorization header
|
|
@@ -6086,7 +6530,7 @@ var DeepgramAdapter = class extends BaseAdapter {
|
|
|
6086
6530
|
this.projectId = config.projectId;
|
|
6087
6531
|
const host = this.getRegionalHost(config.region);
|
|
6088
6532
|
this.baseUrl = config.baseUrl || `https://${host}/v1`;
|
|
6089
|
-
this.wsBaseUrl = `wss://${host}/v1/listen
|
|
6533
|
+
this.wsBaseUrl = config.wsBaseUrl || (config.baseUrl ? `${this.deriveWsUrl(config.baseUrl)}/listen` : `wss://${host}/v1/listen`);
|
|
6090
6534
|
this.client = import_axios3.default.create({
|
|
6091
6535
|
baseURL: this.baseUrl,
|
|
6092
6536
|
timeout: config.timeout || 6e4,
|
|
@@ -6121,9 +6565,13 @@ var DeepgramAdapter = class extends BaseAdapter {
|
|
|
6121
6565
|
*/
|
|
6122
6566
|
setRegion(region) {
|
|
6123
6567
|
this.validateConfig();
|
|
6124
|
-
|
|
6125
|
-
|
|
6126
|
-
|
|
6568
|
+
if (!this.config.baseUrl) {
|
|
6569
|
+
const host = this.getRegionalHost(region);
|
|
6570
|
+
this.baseUrl = `https://${host}/v1`;
|
|
6571
|
+
if (!this.config.wsBaseUrl) {
|
|
6572
|
+
this.wsBaseUrl = `wss://${host}/v1/listen`;
|
|
6573
|
+
}
|
|
6574
|
+
}
|
|
6127
6575
|
this.client = import_axios3.default.create({
|
|
6128
6576
|
baseURL: this.baseUrl,
|
|
6129
6577
|
timeout: this.config.timeout || 6e4,
|
|
@@ -6438,7 +6886,7 @@ var DeepgramAdapter = class extends BaseAdapter {
|
|
|
6438
6886
|
start: w.start || 0,
|
|
6439
6887
|
end: w.end || 0,
|
|
6440
6888
|
confidence: w.confidence
|
|
6441
|
-
}))
|
|
6889
|
+
})) ?? []
|
|
6442
6890
|
}));
|
|
6443
6891
|
}
|
|
6444
6892
|
/**
|
|
@@ -6847,7 +7295,7 @@ var DeepgramAdapter = class extends BaseAdapter {
|
|
|
6847
7295
|
start: w.start,
|
|
6848
7296
|
end: w.end,
|
|
6849
7297
|
confidence: w.confidence
|
|
6850
|
-
}))
|
|
7298
|
+
})) ?? []
|
|
6851
7299
|
});
|
|
6852
7300
|
}
|
|
6853
7301
|
break;
|
|
@@ -8086,7 +8534,8 @@ var OpenAIWhisperAdapter = class extends BaseAdapter {
|
|
|
8086
8534
|
callbacks?.onUtterance?.({
|
|
8087
8535
|
text: transcription.transcript,
|
|
8088
8536
|
start: 0,
|
|
8089
|
-
end: 0
|
|
8537
|
+
end: 0,
|
|
8538
|
+
words: []
|
|
8090
8539
|
});
|
|
8091
8540
|
break;
|
|
8092
8541
|
}
|
|
@@ -8149,7 +8598,8 @@ var OpenAIWhisperAdapter = class extends BaseAdapter {
|
|
|
8149
8598
|
text: segment.text,
|
|
8150
8599
|
start: segment.start,
|
|
8151
8600
|
end: segment.end,
|
|
8152
|
-
confidence: void 0
|
|
8601
|
+
confidence: void 0,
|
|
8602
|
+
words: []
|
|
8153
8603
|
}));
|
|
8154
8604
|
const requestId2 = `openai-${Date.now()}`;
|
|
8155
8605
|
return {
|
|
@@ -8515,7 +8965,7 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
|
|
|
8515
8965
|
* Normalize Speechmatics response to unified format
|
|
8516
8966
|
*/
|
|
8517
8967
|
normalizeResponse(response) {
|
|
8518
|
-
const text = response.results
|
|
8968
|
+
const text = buildTextFromSpeechmaticsResults(response.results);
|
|
8519
8969
|
const words = response.results.filter((r) => r.type === "word" && r.start_time !== void 0 && r.end_time !== void 0).map((result) => ({
|
|
8520
8970
|
word: result.alternatives?.[0]?.content || "",
|
|
8521
8971
|
start: result.start_time,
|
|
@@ -8524,51 +8974,14 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
|
|
|
8524
8974
|
speaker: result.alternatives?.[0]?.speaker
|
|
8525
8975
|
}));
|
|
8526
8976
|
const speakerSet = /* @__PURE__ */ new Set();
|
|
8527
|
-
|
|
8528
|
-
if (
|
|
8529
|
-
const speaker = r.alternatives[0]?.speaker;
|
|
8530
|
-
if (speaker) speakerSet.add(speaker);
|
|
8531
|
-
}
|
|
8977
|
+
words.forEach((w) => {
|
|
8978
|
+
if (w.speaker) speakerSet.add(w.speaker);
|
|
8532
8979
|
});
|
|
8533
8980
|
const speakers = speakerSet.size > 0 ? Array.from(speakerSet).map((id) => ({
|
|
8534
8981
|
id,
|
|
8535
8982
|
label: `Speaker ${id}`
|
|
8536
8983
|
})) : void 0;
|
|
8537
|
-
const utterances =
|
|
8538
|
-
if (speakers) {
|
|
8539
|
-
let currentSpeaker;
|
|
8540
|
-
let currentUtterance = [];
|
|
8541
|
-
let utteranceStart = 0;
|
|
8542
|
-
response.results.filter((r) => r.type === "word" && r.alternatives).forEach((result, idx) => {
|
|
8543
|
-
const speaker = result.alternatives[0]?.speaker;
|
|
8544
|
-
const word = result.alternatives[0]?.content || "";
|
|
8545
|
-
if (speaker !== currentSpeaker) {
|
|
8546
|
-
if (currentSpeaker && currentUtterance.length > 0) {
|
|
8547
|
-
const prevResult = response.results.filter((r) => r.type === "word")[idx - 1];
|
|
8548
|
-
utterances.push({
|
|
8549
|
-
speaker: currentSpeaker,
|
|
8550
|
-
text: currentUtterance.join(" "),
|
|
8551
|
-
start: utteranceStart || 0,
|
|
8552
|
-
end: prevResult?.end_time || result.start_time || 0
|
|
8553
|
-
});
|
|
8554
|
-
}
|
|
8555
|
-
currentSpeaker = speaker;
|
|
8556
|
-
currentUtterance = [word];
|
|
8557
|
-
utteranceStart = result.start_time || 0;
|
|
8558
|
-
} else {
|
|
8559
|
-
currentUtterance.push(word);
|
|
8560
|
-
}
|
|
8561
|
-
});
|
|
8562
|
-
if (currentSpeaker && currentUtterance.length > 0) {
|
|
8563
|
-
const lastWord = response.results.filter((r) => r.type === "word").pop();
|
|
8564
|
-
utterances.push({
|
|
8565
|
-
speaker: currentSpeaker,
|
|
8566
|
-
text: currentUtterance.join(" "),
|
|
8567
|
-
start: utteranceStart,
|
|
8568
|
-
end: lastWord?.end_time || utteranceStart
|
|
8569
|
-
});
|
|
8570
|
-
}
|
|
8571
|
-
}
|
|
8984
|
+
const utterances = buildUtterancesFromWords(words);
|
|
8572
8985
|
return {
|
|
8573
8986
|
success: true,
|
|
8574
8987
|
provider: this.name,
|
|
@@ -8666,6 +9079,7 @@ var SonioxAdapter = class extends BaseAdapter {
|
|
|
8666
9079
|
* Get the base URL for API requests
|
|
8667
9080
|
*/
|
|
8668
9081
|
get baseUrl() {
|
|
9082
|
+
if (this.config?.baseUrl) return this.config.baseUrl;
|
|
8669
9083
|
return `https://${this.getRegionalHost()}/v1`;
|
|
8670
9084
|
}
|
|
8671
9085
|
initialize(config) {
|
|
@@ -8829,7 +9243,8 @@ var SonioxAdapter = class extends BaseAdapter {
|
|
|
8829
9243
|
this.validateConfig();
|
|
8830
9244
|
const sessionId = `soniox_${Date.now()}_${Math.random().toString(36).substring(7)}`;
|
|
8831
9245
|
const createdAt = /* @__PURE__ */ new Date();
|
|
8832
|
-
const
|
|
9246
|
+
const wsBase = this.config?.wsBaseUrl || (this.config?.baseUrl ? this.deriveWsUrl(this.config.baseUrl) : `wss://${this.getRegionalWsHost()}`);
|
|
9247
|
+
const wsUrl = new URL(`${wsBase}/transcribe-websocket`);
|
|
8833
9248
|
wsUrl.searchParams.set("api_key", this.config.apiKey);
|
|
8834
9249
|
const modelId = options?.sonioxStreaming?.model || options?.model || "stt-rt-preview";
|
|
8835
9250
|
wsUrl.searchParams.set("model", modelId);
|
|
@@ -9088,45 +9503,14 @@ var SonioxAdapter = class extends BaseAdapter {
|
|
|
9088
9503
|
* Build utterances from tokens based on speaker changes
|
|
9089
9504
|
*/
|
|
9090
9505
|
buildUtterancesFromTokens(tokens) {
|
|
9091
|
-
const
|
|
9092
|
-
|
|
9093
|
-
|
|
9094
|
-
|
|
9095
|
-
|
|
9096
|
-
|
|
9097
|
-
|
|
9098
|
-
|
|
9099
|
-
end: token.end_ms ? token.end_ms / 1e3 : 0,
|
|
9100
|
-
confidence: token.confidence,
|
|
9101
|
-
speaker: token.speaker
|
|
9102
|
-
};
|
|
9103
|
-
if (token.speaker !== currentSpeaker) {
|
|
9104
|
-
if (currentSpeaker && currentWords.length > 0) {
|
|
9105
|
-
utterances.push({
|
|
9106
|
-
text: currentWords.map((w) => w.word).join(" "),
|
|
9107
|
-
start: utteranceStart,
|
|
9108
|
-
end: currentWords[currentWords.length - 1].end,
|
|
9109
|
-
speaker: currentSpeaker,
|
|
9110
|
-
words: currentWords
|
|
9111
|
-
});
|
|
9112
|
-
}
|
|
9113
|
-
currentSpeaker = token.speaker;
|
|
9114
|
-
currentWords = [word];
|
|
9115
|
-
utteranceStart = word.start;
|
|
9116
|
-
} else {
|
|
9117
|
-
currentWords.push(word);
|
|
9118
|
-
}
|
|
9119
|
-
}
|
|
9120
|
-
if (currentSpeaker && currentWords.length > 0) {
|
|
9121
|
-
utterances.push({
|
|
9122
|
-
text: currentWords.map((w) => w.word).join(" "),
|
|
9123
|
-
start: utteranceStart,
|
|
9124
|
-
end: currentWords[currentWords.length - 1].end,
|
|
9125
|
-
speaker: currentSpeaker,
|
|
9126
|
-
words: currentWords
|
|
9127
|
-
});
|
|
9128
|
-
}
|
|
9129
|
-
return utterances;
|
|
9506
|
+
const words = tokens.map((token) => ({
|
|
9507
|
+
word: token.text,
|
|
9508
|
+
start: token.start_ms ? token.start_ms / 1e3 : 0,
|
|
9509
|
+
end: token.end_ms ? token.end_ms / 1e3 : 0,
|
|
9510
|
+
confidence: token.confidence,
|
|
9511
|
+
speaker: token.speaker
|
|
9512
|
+
}));
|
|
9513
|
+
return buildUtterancesFromWords(words);
|
|
9130
9514
|
}
|
|
9131
9515
|
/**
|
|
9132
9516
|
* Normalize Soniox response to unified format
|
|
@@ -9150,7 +9534,7 @@ var SonioxAdapter = class extends BaseAdapter {
|
|
|
9150
9534
|
id,
|
|
9151
9535
|
label: `Speaker ${id}`
|
|
9152
9536
|
})) : void 0;
|
|
9153
|
-
const utterances = response.tokens ? this.buildUtterancesFromTokens(response.tokens) : [];
|
|
9537
|
+
const utterances = response.tokens ? this.buildUtterancesFromTokens(response.tokens.filter((t) => t.is_final)) : [];
|
|
9154
9538
|
const language = response.tokens?.find((t) => t.language)?.language;
|
|
9155
9539
|
return {
|
|
9156
9540
|
success: true,
|
|
@@ -9179,6 +9563,501 @@ function createSonioxAdapter(config) {
|
|
|
9179
9563
|
return adapter;
|
|
9180
9564
|
}
|
|
9181
9565
|
|
|
9566
|
+
// src/adapters/elevenlabs-adapter.ts
|
|
9567
|
+
var import_axios10 = __toESM(require("axios"));
|
|
9568
|
+
// Adapter for the ElevenLabs speech-to-text API (batch multipart upload +
// realtime WebSocket streaming). Batch transcription is synchronous: the
// POST returns the finished result directly.
var ElevenLabsAdapter = class extends BaseAdapter {
  constructor() {
    super(...arguments);
    this.name = "elevenlabs";
    this.capabilities = {
      streaming: true,
      diarization: true,
      wordTimestamps: true,
      languageDetection: true,
      customVocabulary: true,
      summarization: false,
      sentimentAnalysis: false,
      entityDetection: true,
      piiRedaction: true,
      listTranscripts: false,
      deleteTranscript: false
    };
    // Defaults: global (non-residency) endpoint and the scribe_v2 batch model.
    this.region = ElevenLabsRegion.global;
    this.defaultModel = "scribe_v2";
  }
  /**
   * Get regional API host based on configured region
   */
  getRegionalHost() {
    switch (this.region) {
      case ElevenLabsRegion.us:
        return "api.us.elevenlabs.io";
      case ElevenLabsRegion.eu:
        return "api.eu.residency.elevenlabs.io";
      case ElevenLabsRegion.in:
        return "api.in.residency.elevenlabs.io";
      case ElevenLabsRegion.global:
      default:
        return "api.elevenlabs.io";
    }
  }
  /**
   * Get the base URL for API requests
   * (config.baseUrl overrides the regional default).
   */
  get baseUrl() {
    if (this.config?.baseUrl) return this.config.baseUrl;
    return `https://${this.getRegionalHost()}`;
  }
  // Stores config via the base class, then (re)creates the axios client with
  // the xi-api-key auth header. Region/model overrides are applied first so
  // baseUrl resolves against the requested region.
  initialize(config) {
    super.initialize(config);
    if (config.region) {
      this.region = config.region;
    }
    if (config.model) {
      this.defaultModel = config.model;
    }
    this.client = import_axios10.default.create({
      baseURL: this.baseUrl,
      timeout: config.timeout || 12e4,
      headers: {
        "xi-api-key": config.apiKey,
        ...config.headers
      }
    });
  }
  /**
   * Get current region
   */
  getRegion() {
    return this.region;
  }
  /**
   * Set regional endpoint
   * Rebuilds the HTTP client so subsequent requests hit the new host;
   * skipped when no apiKey has been configured yet.
   */
  setRegion(region) {
    this.region = region;
    if (this.config?.apiKey) {
      this.client = import_axios10.default.create({
        baseURL: this.baseUrl,
        timeout: this.config.timeout || 12e4,
        headers: {
          "xi-api-key": this.config.apiKey,
          ...this.config.headers
        }
      });
    }
  }
  /**
   * Submit audio for transcription
   *
   * ElevenLabs batch is synchronous - the API returns the result directly.
   * Only URL and File inputs are supported; anything else returns an
   * INVALID_INPUT error result (no throw).
   */
  async transcribe(audio, options) {
    this.validateConfig();
    try {
      const formData = new FormData();
      const modelId = options?.model || this.defaultModel;
      formData.append("model_id", modelId);
      if (audio.type === "url") {
        formData.append("cloud_storage_url", audio.url);
      } else if (audio.type === "file") {
        const audioBlob = audio.file instanceof Blob ? audio.file : new Blob([audio.file], { type: audio.mimeType || "audio/wav" });
        formData.append("file", audioBlob, audio.filename || "audio.wav");
      } else {
        return {
          success: false,
          provider: this.name,
          error: {
            code: "INVALID_INPUT",
            message: "ElevenLabs only supports URL and File audio input"
          }
        };
      }
      if (options?.language) {
        formData.append("language_code", options.language);
      }
      if (options?.diarization) {
        formData.append("diarize", "true");
      }
      // Word-level timestamps are always requested so normalizeResponse can
      // build words/utterances.
      formData.append("timestamps_granularity", "word");
      if (options?.speakersExpected) {
        formData.append("num_speakers", String(options.speakersExpected));
      }
      if (options?.customVocabulary && options.customVocabulary.length > 0) {
        for (const term of options.customVocabulary) {
          formData.append("keyterms", term);
        }
      }
      if (options?.entityDetection) {
        formData.append("entity_detection", "all");
      }
      // Provider-specific passthrough options; keys already set above win
      // (formData.has guard), arrays/objects are serialized per entry.
      const elevenlabsOpts = options?.elevenlabs;
      if (elevenlabsOpts) {
        for (const [key, value] of Object.entries(elevenlabsOpts)) {
          if (value === void 0 || value === null) continue;
          if (formData.has(key)) continue;
          if (typeof value === "boolean") {
            formData.append(key, String(value));
          } else if (Array.isArray(value)) {
            for (const item of value) {
              formData.append(key, typeof item === "object" ? JSON.stringify(item) : String(item));
            }
          } else if (typeof value === "object") {
            formData.append(key, JSON.stringify(value));
          } else {
            formData.append(key, String(value));
          }
        }
      }
      const response = await this.client.post("/v1/speech-to-text", formData, {
        headers: {
          "Content-Type": "multipart/form-data"
        }
      });
      return this.normalizeResponse(response.data);
    } catch (error) {
      return this.createErrorResponse(error);
    }
  }
  /**
   * Get transcription result by ID
   *
   * ElevenLabs batch is synchronous, but supports transcript retrieval.
   */
  async getTranscript(transcriptId) {
    this.validateConfig();
    try {
      const response = await this.client.get(`/v1/speech-to-text/transcripts/${transcriptId}`);
      return this.normalizeResponse(response.data);
    } catch (error) {
      return this.createErrorResponse(error);
    }
  }
  /**
   * Stream audio for real-time transcription
   *
   * Creates a WebSocket connection to ElevenLabs realtime STT endpoint.
   * Audio is sent as base64-encoded JSON messages.
   * Resolves with a session handle once the socket opens (10s timeout).
   */
  async transcribeStream(options, callbacks) {
    this.validateConfig();
    const sessionId = `elevenlabs_${Date.now()}_${Math.random().toString(36).substring(7)}`;
    const createdAt = /* @__PURE__ */ new Date();
    // Precedence: explicit wsBaseUrl > ws URL derived from baseUrl > regional wss host.
    const wsBase = this.config?.wsBaseUrl || (this.config?.baseUrl ? this.deriveWsUrl(this.config.baseUrl) : `wss://${this.getRegionalHost()}`);
    const wsUrl = new URL(`${wsBase}/v1/speech-to-text/realtime`);
    const elOpts = options?.elevenlabsStreaming;
    const modelId = elOpts?.model || "scribe_v2_realtime";
    wsUrl.searchParams.set("model_id", modelId);
    const audioFormat = elOpts?.audioFormat || "pcm_16000";
    wsUrl.searchParams.set("audio_format", audioFormat);
    const langCode = elOpts?.languageCode || options?.language;
    if (langCode) {
      wsUrl.searchParams.set("language_code", langCode);
    }
    if (elOpts?.includeTimestamps !== void 0) {
      wsUrl.searchParams.set("include_timestamps", String(elOpts.includeTimestamps));
    }
    if (elOpts?.includeLanguageDetection || options?.languageDetection) {
      wsUrl.searchParams.set("include_language_detection", "true");
    }
    if (elOpts?.commitStrategy) {
      wsUrl.searchParams.set("commit_strategy", elOpts.commitStrategy);
    }
    if (elOpts?.vadSilenceThresholdSecs !== void 0) {
      wsUrl.searchParams.set("vad_silence_threshold_secs", String(elOpts.vadSilenceThresholdSecs));
    }
    if (elOpts?.vadThreshold !== void 0) {
      wsUrl.searchParams.set("vad_threshold", String(elOpts.vadThreshold));
    }
    if (elOpts?.minSpeechDurationMs !== void 0) {
      wsUrl.searchParams.set("min_speech_duration_ms", String(elOpts.minSpeechDurationMs));
    }
    if (elOpts?.minSilenceDurationMs !== void 0) {
      wsUrl.searchParams.set("min_silence_duration_ms", String(elOpts.minSilenceDurationMs));
    }
    if (elOpts?.previousText) {
      wsUrl.searchParams.set("previous_text", elOpts.previousText);
    }
    // Fall back to mapping generic encoding names onto ElevenLabs formats only
    // when the caller did not pick an explicit audioFormat.
    if (!elOpts?.audioFormat && options?.encoding) {
      const encodingMap = {
        linear16: "pcm_16000",
        pcm: "pcm_16000",
        mulaw: "ulaw_8000"
      };
      const mappedFormat = encodingMap[options.encoding];
      if (mappedFormat) {
        wsUrl.searchParams.set("audio_format", mappedFormat);
      }
    }
    let status = "connecting";
    let openedAt = null;
    let receivedData = false;
    // Browser WebSocket if present, otherwise the Node `ws` package.
    // NOTE(review): the second constructor argument (headers) is only honored
    // by the `ws` package, not by the browser WebSocket — confirm intended.
    const WebSocketImpl = typeof WebSocket !== "undefined" ? WebSocket : require("ws");
    const ws = new WebSocketImpl(wsUrl.toString(), {
      headers: {
        "xi-api-key": this.config.apiKey
      }
    });
    ws.onopen = () => {
      status = "open";
      openedAt = Date.now();
      callbacks?.onOpen?.();
    };
    ws.onmessage = (event) => {
      receivedData = true;
      const rawPayload = typeof event.data === "string" ? event.data : event.data.toString();
      let messageType;
      try {
        const data = JSON.parse(rawPayload);
        // Classify the message so onRawMessage consumers can filter by type.
        if (data.error) {
          messageType = "error";
        } else if (data.message_type === "session_started") {
          messageType = "session_started";
        } else if (data.message_type === "partial_transcript") {
          messageType = "partial_transcript";
        } else if (data.message_type === "committed_transcript") {
          messageType = "committed_transcript";
        } else if (data.message_type === "committed_transcript_with_timestamps") {
          messageType = "committed_transcript_with_timestamps";
        }
        if (callbacks?.onRawMessage) {
          callbacks.onRawMessage({
            provider: this.name,
            direction: "incoming",
            timestamp: Date.now(),
            payload: rawPayload,
            messageType
          });
        }
        if (data.error) {
          callbacks?.onError?.({
            code: data.error_code?.toString() || "STREAM_ERROR",
            message: data.error
          });
          return;
        }
        if (data.message_type === "session_started") {
          return;
        }
        if (data.message_type === "partial_transcript") {
          const streamEvent = {
            type: "transcript",
            text: data.text || "",
            isFinal: false,
            confidence: void 0,
            language: data.language_code
          };
          callbacks?.onTranscript?.(streamEvent);
          return;
        }
        if (data.message_type === "committed_transcript" || data.message_type === "committed_transcript_with_timestamps") {
          // logprob -> probability via exp(); speaker comes from speaker_id.
          const words = data.words ? data.words.map((w) => ({
            word: w.text || "",
            start: w.start || 0,
            end: w.end || 0,
            confidence: w.logprob !== void 0 ? Math.exp(w.logprob) : void 0,
            speaker: w.speaker_id
          })) : [];
          const streamEvent = {
            type: "transcript",
            text: data.text || "",
            isFinal: true,
            words: words.length > 0 ? words : void 0,
            speaker: words[0]?.speaker,
            language: data.language_code,
            confidence: void 0
          };
          callbacks?.onTranscript?.(streamEvent);
          if (options?.diarization && words.length > 0) {
            const utterances = buildUtterancesFromWords(words);
            for (const utterance of utterances) {
              callbacks?.onUtterance?.(utterance);
            }
          }
        }
      } catch (error) {
        callbacks?.onError?.({
          code: "PARSE_ERROR",
          message: `Failed to parse message: ${error}`
        });
      }
    };
    ws.onerror = () => {
      callbacks?.onError?.({
        code: "WEBSOCKET_ERROR",
        message: "WebSocket error occurred"
      });
    };
    ws.onclose = (event) => {
      status = "closed";
      // A clean close (code 1000) within 1s of opening, with no data received,
      // is treated as the server rejecting the configuration.
      const timeSinceOpen = openedAt ? Date.now() - openedAt : null;
      const isImmediateClose = timeSinceOpen !== null && timeSinceOpen < 1e3 && !receivedData;
      if (isImmediateClose && event.code === 1e3) {
        callbacks?.onError?.({
          code: "ELEVENLABS_CONFIG_REJECTED",
          message: [
            "ElevenLabs closed connection immediately after opening.",
            `Current config: region=${this.region}, model=${modelId}`,
            "Likely causes:",
            " - Invalid API key",
            " - Unsupported audio format or model",
            event.reason ? `Server reason: ${event.reason}` : null
          ].filter(Boolean).join("\n")
        });
      }
      callbacks?.onClose?.(event.code, event.reason);
    };
    // Poll (100ms) until the socket opens or closes; reject after 10s.
    await new Promise((resolve, reject) => {
      const timeout = setTimeout(() => {
        reject(new Error("WebSocket connection timeout"));
      }, 1e4);
      const checkOpen = () => {
        if (status === "open") {
          clearTimeout(timeout);
          resolve();
        } else if (status === "closed") {
          clearTimeout(timeout);
          reject(new Error("WebSocket connection failed"));
        } else {
          setTimeout(checkOpen, 100);
        }
      };
      checkOpen();
    });
    // Session handle: audio is base64-encoded and wrapped in an
    // input_audio_chunk JSON message; close() sends end_of_stream first.
    return {
      id: sessionId,
      provider: this.name,
      createdAt,
      getStatus: () => status,
      sendAudio: async (chunk) => {
        if (status !== "open") {
          throw new Error("Session is not open");
        }
        let base64Audio;
        if (chunk.data instanceof ArrayBuffer) {
          base64Audio = Buffer.from(chunk.data).toString("base64");
        } else if (chunk.data instanceof Uint8Array) {
          // Respect the view's offset/length rather than the whole backing buffer.
          base64Audio = Buffer.from(
            chunk.data.buffer,
            chunk.data.byteOffset,
            chunk.data.byteLength
          ).toString("base64");
        } else {
          base64Audio = Buffer.from(chunk.data).toString("base64");
        }
        const message = JSON.stringify({
          message_type: "input_audio_chunk",
          audio_base_64: base64Audio
        });
        if (callbacks?.onRawMessage) {
          callbacks.onRawMessage({
            provider: this.name,
            direction: "outgoing",
            timestamp: Date.now(),
            payload: message,
            messageType: "audio"
          });
        }
        ws.send(message);
      },
      close: async () => {
        if (status === "open") {
          status = "closing";
          ws.send(JSON.stringify({ message_type: "end_of_stream" }));
          ws.close(1e3, "Client requested close");
        }
      }
    };
  }
  /**
   * Normalize ElevenLabs response to unified format
   *
   * ElevenLabs returns either:
   * - Single channel: `SpeechToTextChunkResponseModel` directly (text, words, etc.)
   * - Multi-channel: `MultichannelSpeechToTextResponseModel` with `transcripts[]`
   */
  normalizeResponse(response) {
    const chunks = response.transcripts ? response.transcripts : [response];
    const text = chunks.map((c) => c.text).join(" ");
    const words = [];
    const speakerSet = /* @__PURE__ */ new Set();
    const audioEvents = [];
    for (const chunk of chunks) {
      if (!chunk.words) continue;
      for (const w of chunk.words) {
        // audio_event tokens (e.g. non-speech sounds) are split out separately.
        if (w.type === "audio_event") {
          audioEvents.push({
            text: w.text,
            start: typeof w.start === "number" ? w.start : 0,
            end: typeof w.end === "number" ? w.end : 0
          });
          continue;
        }
        // NOTE(review): tokens with type "spacing" are not filtered here and
        // end up in `words` — confirm that is intended.
        const speakerId = w.speaker_id ?? void 0;
        const word = {
          word: w.text,
          start: typeof w.start === "number" ? w.start : 0,
          end: typeof w.end === "number" ? w.end : 0,
          confidence: w.logprob !== void 0 ? Math.exp(w.logprob) : void 0,
          speaker: speakerId ?? void 0
        };
        words.push(word);
        if (speakerId) {
          speakerSet.add(speakerId);
        }
      }
    }
    const speakers = speakerSet.size > 0 ? Array.from(speakerSet).map((id) => ({
      id,
      label: `Speaker ${id}`
    })) : void 0;
    const utterances = words.length > 0 ? buildUtterancesFromWords(words) : [];
    // Language info is taken from the first chunk only.
    const language = chunks[0]?.language_code;
    const languageProbability = chunks[0]?.language_probability;
    const entities = [];
    for (const chunk of chunks) {
      if (chunk.entities && Array.isArray(chunk.entities)) {
        for (const entity of chunk.entities) {
          entities.push({
            text: entity.text,
            entity_type: entity.entity_type,
            start_char: entity.start_char,
            end_char: entity.end_char
          });
        }
      }
    }
    const transcriptionId = response.transcription_id || chunks[0]?.transcription_id || `elevenlabs_${Date.now()}`;
    return {
      success: true,
      provider: this.name,
      data: {
        id: transcriptionId,
        text,
        status: "completed",
        language,
        speakers,
        words: words.length > 0 ? words : void 0,
        utterances: utterances.length > 0 ? utterances : void 0
      },
      extended: {
        entities: entities.length > 0 ? entities : void 0,
        audioEvents: audioEvents.length > 0 ? audioEvents : void 0,
        languageProbability
      },
      tracking: {
        requestId: transcriptionId
      },
      raw: response
    };
  }
};
|
|
10055
|
+
/**
 * Factory: create and initialize an ElevenLabs adapter in one step.
 * @param config - Adapter configuration (apiKey, region, model, ...).
 * @returns A ready-to-use ElevenLabsAdapter instance.
 */
function createElevenLabsAdapter(config) {
  const instance = new ElevenLabsAdapter();
  instance.initialize(config);
  return instance;
}
|
|
10060
|
+
|
|
9182
10061
|
// src/utils/zod-to-field-configs.ts
|
|
9183
10062
|
function unwrapZodType(schema) {
|
|
9184
10063
|
let inner = schema;
|
|
@@ -35778,7 +36657,8 @@ var speechmaticsTranscriptionConfigSchema = import_zod8.z.object({
|
|
|
35778
36657
|
enable_entities: import_zod8.z.boolean().optional(),
|
|
35779
36658
|
operating_point: import_zod8.z.enum(["standard", "enhanced"]).optional(),
|
|
35780
36659
|
punctuation_overrides: import_zod8.z.unknown().optional(),
|
|
35781
|
-
conversation_config: import_zod8.z.unknown().optional()
|
|
36660
|
+
conversation_config: import_zod8.z.unknown().optional(),
|
|
36661
|
+
channel_diarization_labels: import_zod8.z.array(import_zod8.z.string()).optional()
|
|
35782
36662
|
});
|
|
35783
36663
|
var speechmaticsMidSessionConfigSchema = import_zod8.z.object({
|
|
35784
36664
|
language: import_zod8.z.string().optional().describe(
|
|
@@ -35795,18 +36675,19 @@ var speechmaticsMidSessionConfigSchema = import_zod8.z.object({
|
|
|
35795
36675
|
conversation_config: import_zod8.z.unknown().optional()
|
|
35796
36676
|
});
|
|
35797
36677
|
// Zod schema for Speechmatics speaker-diarization configuration.
var speechmaticsSpeakerDiarizationConfigSchema = import_zod8.z.object({
  max_speakers: import_zod8.z.number().min(2).optional().describe(
    "Configure the maximum number of speakers to detect. See [Max Speakers](http://docs.speechmatics.com/speech-to-text/features/diarization#max-speakers)."
  ),
  prefer_current_speaker: import_zod8.z.boolean().optional().describe(
    "When set to `true`, reduces the likelihood of incorrectly switching between similar sounding speakers. See [Prefer Current Speaker](https://docs.speechmatics.com/speech-to-text/features/diarization#prefer-current-speaker)."
  ),
  speaker_sensitivity: import_zod8.z.number().min(0).max(1).optional(),
  get_speakers: import_zod8.z.boolean().optional().describe("If true, speaker identifiers will be returned at the end of transcript."),
  speakers: import_zod8.z.array(
    import_zod8.z.unknown()
    /* TODO: resolve SpeakersInputItem */
  ).optional().describe(
    "Use this option to provide speaker labels linked to their speaker identifiers. When passed, the transcription system will tag spoken words in the transcript with the provided speaker labels whenever any of the specified speakers is detected in the audio. A maximum of 50 speakers identifiers across all speakers can be provided."
  )
});
|
|
35812
36693
|
var speechmaticsConversationConfigSchema = import_zod8.z.object({
|
|
@@ -35832,7 +36713,8 @@ var streamingTranscriberParams2 = import_zod8.z.object({
|
|
|
35832
36713
|
"Whether or not to send Partials (i.e. `AddPartialTranslation` messages) as well as Finals (i.e. `AddTranslation` messages) See [Partial transcripts](https://docs.speechmatics.com/speech-to-text/realtime/output#partial-transcripts)."
|
|
35833
36714
|
),
|
|
35834
36715
|
enable_entities: import_zod8.z.boolean().optional(),
|
|
35835
|
-
operating_point: import_zod8.z.enum(["standard", "enhanced"]).optional()
|
|
36716
|
+
operating_point: import_zod8.z.enum(["standard", "enhanced"]).optional(),
|
|
36717
|
+
channel_diarization_labels: import_zod8.z.array(import_zod8.z.string()).optional()
|
|
35836
36718
|
});
|
|
35837
36719
|
var streamingUpdateConfigParams2 = import_zod8.z.object({
|
|
35838
36720
|
language: import_zod8.z.string().optional().describe(
|
|
@@ -36576,6 +37458,21 @@ var SonioxCapabilities = {
|
|
|
36576
37458
|
listTranscripts: false,
|
|
36577
37459
|
deleteTranscript: false
|
|
36578
37460
|
};
|
|
37461
|
+
// Capability matrix advertised for the ElevenLabs provider.
// Values match those set in ElevenLabsAdapter's constructor.
var ElevenLabsCapabilities = {
  streaming: true,
  diarization: true,
  wordTimestamps: true,
  languageDetection: true,
  customVocabulary: true,
  // Via keyterms parameter
  summarization: false,
  sentimentAnalysis: false,
  entityDetection: true,
  piiRedaction: true,
  // Via entity_detection with PII categories
  listTranscripts: false,
  deleteTranscript: false
};
|
|
36579
37476
|
var ProviderCapabilitiesMap = {
|
|
36580
37477
|
gladia: GladiaCapabilities,
|
|
36581
37478
|
assemblyai: AssemblyAICapabilities,
|
|
@@ -36583,7 +37480,8 @@ var ProviderCapabilitiesMap = {
|
|
|
36583
37480
|
"openai-whisper": OpenAICapabilities,
|
|
36584
37481
|
"azure-stt": AzureCapabilities,
|
|
36585
37482
|
speechmatics: SpeechmaticsCapabilities,
|
|
36586
|
-
soniox: SonioxCapabilities
|
|
37483
|
+
soniox: SonioxCapabilities,
|
|
37484
|
+
elevenlabs: ElevenLabsCapabilities
|
|
36587
37485
|
};
|
|
36588
37486
|
var CapabilityKeys = [
|
|
36589
37487
|
"streaming",
|
|
@@ -36785,7 +37683,8 @@ var AllLanguageCodes = {
|
|
|
36785
37683
|
// BCP-47 locale codes (e.g., "en-US")
|
|
36786
37684
|
speechmatics: SpeechmaticsLanguageCodes,
|
|
36787
37685
|
// ISO 639-1 codes with multilingual packs
|
|
36788
|
-
soniox: SonioxLanguageCodes
|
|
37686
|
+
soniox: SonioxLanguageCodes,
|
|
37687
|
+
elevenlabs: ElevenLabsLanguageCodes
|
|
36789
37688
|
};
|
|
36790
37689
|
var ProviderDisplayNames = {
|
|
36791
37690
|
gladia: "Gladia",
|
|
@@ -36794,7 +37693,8 @@ var ProviderDisplayNames = {
|
|
|
36794
37693
|
"openai-whisper": "OpenAI Whisper",
|
|
36795
37694
|
"azure-stt": "Azure Speech",
|
|
36796
37695
|
speechmatics: "Speechmatics",
|
|
36797
|
-
soniox: "Soniox"
|
|
37696
|
+
soniox: "Soniox",
|
|
37697
|
+
elevenlabs: "ElevenLabs"
|
|
36798
37698
|
};
|
|
36799
37699
|
var ProviderWebsites = {
|
|
36800
37700
|
gladia: "https://gladia.io",
|
|
@@ -36803,7 +37703,8 @@ var ProviderWebsites = {
|
|
|
36803
37703
|
"openai-whisper": "https://openai.com",
|
|
36804
37704
|
"azure-stt": "https://azure.microsoft.com/services/cognitive-services/speech-to-text/",
|
|
36805
37705
|
speechmatics: "https://speechmatics.com",
|
|
36806
|
-
soniox: "https://soniox.com"
|
|
37706
|
+
soniox: "https://soniox.com",
|
|
37707
|
+
elevenlabs: "https://elevenlabs.io"
|
|
36807
37708
|
};
|
|
36808
37709
|
var ProviderDocs = {
|
|
36809
37710
|
gladia: "https://docs.gladia.io",
|
|
@@ -36812,7 +37713,8 @@ var ProviderDocs = {
|
|
|
36812
37713
|
"openai-whisper": "https://platform.openai.com/docs/guides/speech-to-text",
|
|
36813
37714
|
"azure-stt": "https://learn.microsoft.com/azure/cognitive-services/speech-service/",
|
|
36814
37715
|
speechmatics: "https://docs.speechmatics.com",
|
|
36815
|
-
soniox: "https://soniox.com/docs/stt/"
|
|
37716
|
+
soniox: "https://soniox.com/docs/stt/",
|
|
37717
|
+
elevenlabs: "https://elevenlabs.io/docs/capabilities/speech-to-text"
|
|
36816
37718
|
};
|
|
36817
37719
|
var AllProviders = [
|
|
36818
37720
|
"gladia",
|
|
@@ -36821,7 +37723,8 @@ var AllProviders = [
|
|
|
36821
37723
|
"openai-whisper",
|
|
36822
37724
|
"azure-stt",
|
|
36823
37725
|
"speechmatics",
|
|
36824
|
-
"soniox"
|
|
37726
|
+
"soniox",
|
|
37727
|
+
"elevenlabs"
|
|
36825
37728
|
];
|
|
36826
37729
|
var StreamingProviders = AllProviders.filter(
|
|
36827
37730
|
(p) => ProviderCapabilitiesMap[p].streaming
|
|
@@ -37546,6 +38449,77 @@ var TranslationConfigType = {
|
|
|
37546
38449
|
two_way: "two_way"
|
|
37547
38450
|
};
|
|
37548
38451
|
|
|
38452
|
+
// src/generated/elevenlabs/schema/index.ts
|
|
38453
|
+
// Namespace object re-exporting the generated ElevenLabs schema constants
// (populated via the bundler's __export helper).
var schema_exports8 = {};
__export(schema_exports8, {
  BodySpeechToTextV1SpeechToTextPostFileFormat: () => BodySpeechToTextV1SpeechToTextPostFileFormat,
  BodySpeechToTextV1SpeechToTextPostModelId: () => BodySpeechToTextV1SpeechToTextPostModelId,
  BodySpeechToTextV1SpeechToTextPostTimestampsGranularity: () => BodySpeechToTextV1SpeechToTextPostTimestampsGranularity,
  DocxExportOptionsFormat: () => DocxExportOptionsFormat,
  HtmlExportOptionsFormat: () => HtmlExportOptionsFormat,
  PdfExportOptionsFormat: () => PdfExportOptionsFormat,
  SegmentedJsonExportOptionsFormat: () => SegmentedJsonExportOptionsFormat,
  SpeechToTextWordResponseModelType: () => SpeechToTextWordResponseModelType,
  SrtExportOptionsFormat: () => SrtExportOptionsFormat,
  TxtExportOptionsFormat: () => TxtExportOptionsFormat
});
|
|
38466
|
+
|
|
38467
|
+
// src/generated/elevenlabs/schema/bodySpeechToTextV1SpeechToTextPostFileFormat.ts
|
|
38468
|
+
var BodySpeechToTextV1SpeechToTextPostFileFormat = {
|
|
38469
|
+
pcm_s16le_16: "pcm_s16le_16",
|
|
38470
|
+
other: "other"
|
|
38471
|
+
};
|
|
38472
|
+
|
|
38473
|
+
// src/generated/elevenlabs/schema/bodySpeechToTextV1SpeechToTextPostModelId.ts
|
|
38474
|
+
var BodySpeechToTextV1SpeechToTextPostModelId = {
|
|
38475
|
+
scribe_v1: "scribe_v1",
|
|
38476
|
+
scribe_v2: "scribe_v2"
|
|
38477
|
+
};
|
|
38478
|
+
|
|
38479
|
+
// src/generated/elevenlabs/schema/bodySpeechToTextV1SpeechToTextPostTimestampsGranularity.ts
|
|
38480
|
+
var BodySpeechToTextV1SpeechToTextPostTimestampsGranularity = {
|
|
38481
|
+
none: "none",
|
|
38482
|
+
word: "word",
|
|
38483
|
+
character: "character"
|
|
38484
|
+
};
|
|
38485
|
+
|
|
38486
|
+
// src/generated/elevenlabs/schema/docxExportOptionsFormat.ts
|
|
38487
|
+
var DocxExportOptionsFormat = {
|
|
38488
|
+
docx: "docx"
|
|
38489
|
+
};
|
|
38490
|
+
|
|
38491
|
+
// src/generated/elevenlabs/schema/htmlExportOptionsFormat.ts
|
|
38492
|
+
var HtmlExportOptionsFormat = {
|
|
38493
|
+
html: "html"
|
|
38494
|
+
};
|
|
38495
|
+
|
|
38496
|
+
// src/generated/elevenlabs/schema/pdfExportOptionsFormat.ts
|
|
38497
|
+
var PdfExportOptionsFormat = {
|
|
38498
|
+
pdf: "pdf"
|
|
38499
|
+
};
|
|
38500
|
+
|
|
38501
|
+
// src/generated/elevenlabs/schema/segmentedJsonExportOptionsFormat.ts
|
|
38502
|
+
var SegmentedJsonExportOptionsFormat = {
|
|
38503
|
+
segmented_json: "segmented_json"
|
|
38504
|
+
};
|
|
38505
|
+
|
|
38506
|
+
// src/generated/elevenlabs/schema/speechToTextWordResponseModelType.ts
|
|
38507
|
+
var SpeechToTextWordResponseModelType = {
|
|
38508
|
+
word: "word",
|
|
38509
|
+
spacing: "spacing",
|
|
38510
|
+
audio_event: "audio_event"
|
|
38511
|
+
};
|
|
38512
|
+
|
|
38513
|
+
// src/generated/elevenlabs/schema/srtExportOptionsFormat.ts
|
|
38514
|
+
var SrtExportOptionsFormat = {
|
|
38515
|
+
srt: "srt"
|
|
38516
|
+
};
|
|
38517
|
+
|
|
38518
|
+
// src/generated/elevenlabs/schema/txtExportOptionsFormat.ts
|
|
38519
|
+
var TxtExportOptionsFormat = {
|
|
38520
|
+
txt: "txt"
|
|
38521
|
+
};
|
|
38522
|
+
|
|
37549
38523
|
// src/generated/speechmatics/api/speechmaticsASRRESTAPI.zod.ts
|
|
37550
38524
|
var speechmaticsASRRESTAPI_zod_exports = {};
|
|
37551
38525
|
__export(speechmaticsASRRESTAPI_zod_exports, {
|
|
@@ -38658,6 +39632,448 @@ var getUsageResponse = import_zod12.z.object({
|
|
|
38658
39632
|
})
|
|
38659
39633
|
)
|
|
38660
39634
|
});
|
|
39635
|
+
|
|
39636
|
+
// src/generated/elevenlabs/api/elevenLabsSpeechToTextAPI.zod.ts
|
|
39637
|
+
var elevenLabsSpeechToTextAPI_zod_exports = {};
|
|
39638
|
+
__export(elevenLabsSpeechToTextAPI_zod_exports, {
|
|
39639
|
+
deleteTranscriptByIdHeader: () => deleteTranscriptByIdHeader,
|
|
39640
|
+
deleteTranscriptByIdParams: () => deleteTranscriptByIdParams,
|
|
39641
|
+
deleteTranscriptByIdResponse: () => deleteTranscriptByIdResponse,
|
|
39642
|
+
getTranscriptByIdHeader: () => getTranscriptByIdHeader,
|
|
39643
|
+
getTranscriptByIdParams: () => getTranscriptByIdParams,
|
|
39644
|
+
getTranscriptByIdResponse: () => getTranscriptByIdResponse,
|
|
39645
|
+
speechToTextBody: () => speechToTextBody,
|
|
39646
|
+
speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefault: () => speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefault,
|
|
39647
|
+
speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFive: () => speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFive,
|
|
39648
|
+
speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFour: () => speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFour,
|
|
39649
|
+
speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultOne: () => speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultOne,
|
|
39650
|
+
speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultThree: () => speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultThree,
|
|
39651
|
+
speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultTwo: () => speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultTwo,
|
|
39652
|
+
speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefault: () => speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefault,
|
|
39653
|
+
speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFive: () => speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFive,
|
|
39654
|
+
speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFour: () => speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFour,
|
|
39655
|
+
speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultOne: () => speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultOne,
|
|
39656
|
+
speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultThree: () => speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultThree,
|
|
39657
|
+
speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultTwo: () => speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultTwo,
|
|
39658
|
+
speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefault: () => speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefault,
|
|
39659
|
+
speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefaultThree: () => speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefaultThree,
|
|
39660
|
+
speechToTextBodyAdditionalFormatsItemMaxSegmentCharsDefaultOnefive: () => speechToTextBodyAdditionalFormatsItemMaxSegmentCharsDefaultOnefive,
|
|
39661
|
+
speechToTextBodyAdditionalFormatsItemMaxSegmentDurationSDefaultOnefive: () => speechToTextBodyAdditionalFormatsItemMaxSegmentDurationSDefaultOnefive,
|
|
39662
|
+
speechToTextBodyAdditionalFormatsItemSegmentOnSilenceLongerThanSDefaultOnefive: () => speechToTextBodyAdditionalFormatsItemSegmentOnSilenceLongerThanSDefaultOnefive,
|
|
39663
|
+
speechToTextBodyAdditionalFormatsMax: () => speechToTextBodyAdditionalFormatsMax,
|
|
39664
|
+
speechToTextBodyDiarizationThresholdMaxOne: () => speechToTextBodyDiarizationThresholdMaxOne,
|
|
39665
|
+
speechToTextBodyDiarizationThresholdMinOne: () => speechToTextBodyDiarizationThresholdMinOne,
|
|
39666
|
+
speechToTextBodyDiarizeDefault: () => speechToTextBodyDiarizeDefault,
|
|
39667
|
+
speechToTextBodyFileFormatDefault: () => speechToTextBodyFileFormatDefault,
|
|
39668
|
+
speechToTextBodyKeytermsDefault: () => speechToTextBodyKeytermsDefault,
|
|
39669
|
+
speechToTextBodyNoVerbatimDefault: () => speechToTextBodyNoVerbatimDefault,
|
|
39670
|
+
speechToTextBodyNumSpeakersMaxOne: () => speechToTextBodyNumSpeakersMaxOne,
|
|
39671
|
+
speechToTextBodySeedMaxOne: () => speechToTextBodySeedMaxOne,
|
|
39672
|
+
speechToTextBodySeedMinOne: () => speechToTextBodySeedMinOne,
|
|
39673
|
+
speechToTextBodyTagAudioEventsDefault: () => speechToTextBodyTagAudioEventsDefault,
|
|
39674
|
+
speechToTextBodyTemperatureMaxOne: () => speechToTextBodyTemperatureMaxOne,
|
|
39675
|
+
speechToTextBodyTemperatureMinOne: () => speechToTextBodyTemperatureMinOne,
|
|
39676
|
+
speechToTextBodyTimestampsGranularityDefault: () => speechToTextBodyTimestampsGranularityDefault,
|
|
39677
|
+
speechToTextBodyUseMultiChannelDefault: () => speechToTextBodyUseMultiChannelDefault,
|
|
39678
|
+
speechToTextBodyWebhookDefault: () => speechToTextBodyWebhookDefault,
|
|
39679
|
+
speechToTextHeader: () => speechToTextHeader,
|
|
39680
|
+
speechToTextQueryEnableLoggingDefault: () => speechToTextQueryEnableLoggingDefault,
|
|
39681
|
+
speechToTextQueryParams: () => speechToTextQueryParams,
|
|
39682
|
+
speechToTextResponse: () => speechToTextResponse
|
|
39683
|
+
});
|
|
39684
|
+
var import_zod13 = require("zod");
|
|
39685
|
+
var speechToTextQueryEnableLoggingDefault = true;
|
|
39686
|
+
var speechToTextQueryParams = import_zod13.z.object({
|
|
39687
|
+
enable_logging: import_zod13.z.boolean().default(speechToTextQueryEnableLoggingDefault).describe(
|
|
39688
|
+
"When enable_logging is set to false zero retention mode will be used for the request. This will mean log and transcript storage features are unavailable for this request. Zero retention mode may only be used by enterprise customers."
|
|
39689
|
+
)
|
|
39690
|
+
});
|
|
39691
|
+
var speechToTextHeader = import_zod13.z.object({
|
|
39692
|
+
"xi-api-key": import_zod13.z.string().or(import_zod13.z.null()).optional().describe(
|
|
39693
|
+
"Your API key. This is required by most endpoints to access our API programmatically. You can view your xi-api-key using the 'Profile' tab on the website."
|
|
39694
|
+
)
|
|
39695
|
+
});
|
|
39696
|
+
var speechToTextBodyTagAudioEventsDefault = true;
|
|
39697
|
+
var speechToTextBodyNumSpeakersMaxOne = 32;
|
|
39698
|
+
var speechToTextBodyTimestampsGranularityDefault = "word";
|
|
39699
|
+
var speechToTextBodyDiarizeDefault = false;
|
|
39700
|
+
var speechToTextBodyDiarizationThresholdMinOne = 0.1;
|
|
39701
|
+
var speechToTextBodyDiarizationThresholdMaxOne = 0.4;
|
|
39702
|
+
var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefault = true;
|
|
39703
|
+
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefault = true;
|
|
39704
|
+
var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultOne = true;
|
|
39705
|
+
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultOne = true;
|
|
39706
|
+
var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultTwo = true;
|
|
39707
|
+
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultTwo = true;
|
|
39708
|
+
var speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefault = 100;
|
|
39709
|
+
var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultThree = true;
|
|
39710
|
+
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultThree = true;
|
|
39711
|
+
var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFour = true;
|
|
39712
|
+
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFour = true;
|
|
39713
|
+
var speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefaultThree = 42;
|
|
39714
|
+
var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFive = false;
|
|
39715
|
+
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFive = true;
|
|
39716
|
+
var speechToTextBodyAdditionalFormatsItemSegmentOnSilenceLongerThanSDefaultOnefive = 0.8;
|
|
39717
|
+
var speechToTextBodyAdditionalFormatsItemMaxSegmentDurationSDefaultOnefive = 4;
|
|
39718
|
+
var speechToTextBodyAdditionalFormatsItemMaxSegmentCharsDefaultOnefive = 84;
|
|
39719
|
+
var speechToTextBodyAdditionalFormatsMax = 10;
|
|
39720
|
+
var speechToTextBodyFileFormatDefault = "other";
|
|
39721
|
+
var speechToTextBodyWebhookDefault = false;
|
|
39722
|
+
var speechToTextBodyTemperatureMinOne = 0;
|
|
39723
|
+
var speechToTextBodyTemperatureMaxOne = 2;
|
|
39724
|
+
var speechToTextBodySeedMinOne = 0;
|
|
39725
|
+
var speechToTextBodySeedMaxOne = 2147483647;
|
|
39726
|
+
var speechToTextBodyUseMultiChannelDefault = false;
|
|
39727
|
+
var speechToTextBodyNoVerbatimDefault = false;
|
|
39728
|
+
var speechToTextBodyKeytermsDefault = [];
|
|
39729
|
+
var speechToTextBody = import_zod13.z.object({
|
|
39730
|
+
model_id: import_zod13.z.enum(["scribe_v1", "scribe_v2"]).describe("The ID of the model to use for transcription."),
|
|
39731
|
+
file: import_zod13.z.instanceof(File).or(import_zod13.z.null()).optional().describe(
|
|
39732
|
+
"The file to transcribe. All major audio and video formats are supported. Exactly one of the file or cloud_storage_url parameters must be provided. The file size must be less than 3.0GB."
|
|
39733
|
+
),
|
|
39734
|
+
language_code: import_zod13.z.string().or(import_zod13.z.null()).optional().describe(
|
|
39735
|
+
"An ISO-639-1 or ISO-639-3 language_code corresponding to the language of the audio file. Can sometimes improve transcription performance if known beforehand. Defaults to null, in this case the language is predicted automatically."
|
|
39736
|
+
),
|
|
39737
|
+
tag_audio_events: import_zod13.z.boolean().default(speechToTextBodyTagAudioEventsDefault).describe(
|
|
39738
|
+
"Whether to tag audio events like (laughter), (footsteps), etc. in the transcription."
|
|
39739
|
+
),
|
|
39740
|
+
num_speakers: import_zod13.z.number().min(1).max(speechToTextBodyNumSpeakersMaxOne).or(import_zod13.z.null()).optional().describe(
|
|
39741
|
+
"The maximum amount of speakers talking in the uploaded file. Can help with predicting who speaks when. The maximum amount of speakers that can be predicted is 32. Defaults to null, in this case the amount of speakers is set to the maximum value the model supports."
|
|
39742
|
+
),
|
|
39743
|
+
timestamps_granularity: import_zod13.z.enum(["none", "word", "character"]).default(speechToTextBodyTimestampsGranularityDefault).describe(
|
|
39744
|
+
"The granularity of the timestamps in the transcription. 'word' provides word-level timestamps and 'character' provides character-level timestamps per word."
|
|
39745
|
+
),
|
|
39746
|
+
diarize: import_zod13.z.boolean().optional().describe("Whether to annotate which speaker is currently talking in the uploaded file."),
|
|
39747
|
+
diarization_threshold: import_zod13.z.number().min(speechToTextBodyDiarizationThresholdMinOne).max(speechToTextBodyDiarizationThresholdMaxOne).or(import_zod13.z.null()).optional().describe(
|
|
39748
|
+
"Diarization threshold to apply during speaker diarization. A higher value means there will be a lower chance of one speaker being diarized as two different speakers but also a higher chance of two different speakers being diarized as one speaker (less total speakers predicted). A low value means there will be a higher chance of one speaker being diarized as two different speakers but also a lower chance of two different speakers being diarized as one speaker (more total speakers predicted). Can only be set when diarize=True and num_speakers=None. Defaults to None, in which case we will choose a threshold based on the model_id (0.22 usually)."
|
|
39749
|
+
),
|
|
39750
|
+
additional_formats: import_zod13.z.array(
|
|
39751
|
+
import_zod13.z.discriminatedUnion("format", [
|
|
39752
|
+
import_zod13.z.object({
|
|
39753
|
+
include_speakers: import_zod13.z.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefault),
|
|
39754
|
+
include_timestamps: import_zod13.z.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefault),
|
|
39755
|
+
format: import_zod13.z.enum(["segmented_json"]),
|
|
39756
|
+
segment_on_silence_longer_than_s: import_zod13.z.number().or(import_zod13.z.null()).optional(),
|
|
39757
|
+
max_segment_duration_s: import_zod13.z.number().or(import_zod13.z.null()).optional(),
|
|
39758
|
+
max_segment_chars: import_zod13.z.number().or(import_zod13.z.null()).optional()
|
|
39759
|
+
}),
|
|
39760
|
+
import_zod13.z.object({
|
|
39761
|
+
include_speakers: import_zod13.z.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultOne),
|
|
39762
|
+
include_timestamps: import_zod13.z.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultOne),
|
|
39763
|
+
format: import_zod13.z.enum(["docx"]),
|
|
39764
|
+
segment_on_silence_longer_than_s: import_zod13.z.number().or(import_zod13.z.null()).optional(),
|
|
39765
|
+
max_segment_duration_s: import_zod13.z.number().or(import_zod13.z.null()).optional(),
|
|
39766
|
+
max_segment_chars: import_zod13.z.number().or(import_zod13.z.null()).optional()
|
|
39767
|
+
}),
|
|
39768
|
+
import_zod13.z.object({
|
|
39769
|
+
include_speakers: import_zod13.z.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultTwo),
|
|
39770
|
+
include_timestamps: import_zod13.z.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultTwo),
|
|
39771
|
+
format: import_zod13.z.enum(["pdf"]),
|
|
39772
|
+
segment_on_silence_longer_than_s: import_zod13.z.number().or(import_zod13.z.null()).optional(),
|
|
39773
|
+
max_segment_duration_s: import_zod13.z.number().or(import_zod13.z.null()).optional(),
|
|
39774
|
+
max_segment_chars: import_zod13.z.number().or(import_zod13.z.null()).optional()
|
|
39775
|
+
}),
|
|
39776
|
+
import_zod13.z.object({
|
|
39777
|
+
max_characters_per_line: import_zod13.z.number().or(import_zod13.z.null()).default(speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefault),
|
|
39778
|
+
include_speakers: import_zod13.z.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultThree),
|
|
39779
|
+
include_timestamps: import_zod13.z.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultThree),
|
|
39780
|
+
format: import_zod13.z.enum(["txt"]),
|
|
39781
|
+
segment_on_silence_longer_than_s: import_zod13.z.number().or(import_zod13.z.null()).optional(),
|
|
39782
|
+
max_segment_duration_s: import_zod13.z.number().or(import_zod13.z.null()).optional(),
|
|
39783
|
+
max_segment_chars: import_zod13.z.number().or(import_zod13.z.null()).optional()
|
|
39784
|
+
}),
|
|
39785
|
+
import_zod13.z.object({
|
|
39786
|
+
include_speakers: import_zod13.z.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFour),
|
|
39787
|
+
include_timestamps: import_zod13.z.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFour),
|
|
39788
|
+
format: import_zod13.z.enum(["html"]),
|
|
39789
|
+
segment_on_silence_longer_than_s: import_zod13.z.number().or(import_zod13.z.null()).optional(),
|
|
39790
|
+
max_segment_duration_s: import_zod13.z.number().or(import_zod13.z.null()).optional(),
|
|
39791
|
+
max_segment_chars: import_zod13.z.number().or(import_zod13.z.null()).optional()
|
|
39792
|
+
}),
|
|
39793
|
+
import_zod13.z.object({
|
|
39794
|
+
max_characters_per_line: import_zod13.z.number().or(import_zod13.z.null()).default(speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefaultThree),
|
|
39795
|
+
include_speakers: import_zod13.z.boolean().optional(),
|
|
39796
|
+
include_timestamps: import_zod13.z.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFive),
|
|
39797
|
+
format: import_zod13.z.enum(["srt"]),
|
|
39798
|
+
segment_on_silence_longer_than_s: import_zod13.z.number().or(import_zod13.z.null()).default(
|
|
39799
|
+
speechToTextBodyAdditionalFormatsItemSegmentOnSilenceLongerThanSDefaultOnefive
|
|
39800
|
+
),
|
|
39801
|
+
max_segment_duration_s: import_zod13.z.number().or(import_zod13.z.null()).default(speechToTextBodyAdditionalFormatsItemMaxSegmentDurationSDefaultOnefive),
|
|
39802
|
+
max_segment_chars: import_zod13.z.number().or(import_zod13.z.null()).default(speechToTextBodyAdditionalFormatsItemMaxSegmentCharsDefaultOnefive)
|
|
39803
|
+
})
|
|
39804
|
+
])
|
|
39805
|
+
).max(speechToTextBodyAdditionalFormatsMax).optional(),
|
|
39806
|
+
file_format: import_zod13.z.enum(["pcm_s16le_16", "other"]).default(speechToTextBodyFileFormatDefault).describe(
|
|
39807
|
+
"The format of input audio. Options are 'pcm_s16le_16' or 'other' For `pcm_s16le_16`, the input audio must be 16-bit PCM at a 16kHz sample rate, single channel (mono), and little-endian byte order. Latency will be lower than with passing an encoded waveform."
|
|
39808
|
+
),
|
|
39809
|
+
cloud_storage_url: import_zod13.z.string().or(import_zod13.z.null()).optional().describe(
|
|
39810
|
+
"The HTTPS URL of the file to transcribe. Exactly one of the file or cloud_storage_url parameters must be provided. The file must be accessible via HTTPS and the file size must be less than 2GB. Any valid HTTPS URL is accepted, including URLs from cloud storage providers (AWS S3, Google Cloud Storage, Cloudflare R2, etc.), CDNs, or any other HTTPS source. URLs can be pre-signed or include authentication tokens in query parameters."
|
|
39811
|
+
),
|
|
39812
|
+
webhook: import_zod13.z.boolean().optional().describe(
|
|
39813
|
+
"Whether to send the transcription result to configured speech-to-text webhooks. If set the request will return early without the transcription, which will be delivered later via webhook."
|
|
39814
|
+
),
|
|
39815
|
+
webhook_id: import_zod13.z.string().or(import_zod13.z.null()).optional().describe(
|
|
39816
|
+
"Optional specific webhook ID to send the transcription result to. Only valid when webhook is set to true. If not provided, transcription will be sent to all configured speech-to-text webhooks."
|
|
39817
|
+
),
|
|
39818
|
+
temperature: import_zod13.z.number().min(speechToTextBodyTemperatureMinOne).max(speechToTextBodyTemperatureMaxOne).or(import_zod13.z.null()).optional().describe(
|
|
39819
|
+
"Controls the randomness of the transcription output. Accepts values between 0.0 and 2.0, where higher values result in more diverse and less deterministic results. If omitted, we will use a temperature based on the model you selected which is usually 0."
|
|
39820
|
+
),
|
|
39821
|
+
seed: import_zod13.z.number().min(speechToTextBodySeedMinOne).max(speechToTextBodySeedMaxOne).or(import_zod13.z.null()).optional().describe(
|
|
39822
|
+
"If specified, our system will make a best effort to sample deterministically, such that repeated requests with the same seed and parameters should return the same result. Determinism is not guaranteed. Must be an integer between 0 and 2147483647."
|
|
39823
|
+
),
|
|
39824
|
+
use_multi_channel: import_zod13.z.boolean().optional().describe(
|
|
39825
|
+
"Whether the audio file contains multiple channels where each channel contains a single speaker. When enabled, each channel will be transcribed independently and the results will be combined. Each word in the response will include a 'channel_index' field indicating which channel it was spoken on. A maximum of 5 channels is supported."
|
|
39826
|
+
),
|
|
39827
|
+
webhook_metadata: import_zod13.z.string().or(import_zod13.z.record(import_zod13.z.string(), import_zod13.z.any())).or(import_zod13.z.null()).optional().describe(
|
|
39828
|
+
"Optional metadata to be included in the webhook response. This should be a JSON string representing an object with a maximum depth of 2 levels and maximum size of 16KB. Useful for tracking internal IDs, job references, or other contextual information."
|
|
39829
|
+
),
|
|
39830
|
+
entity_detection: import_zod13.z.string().or(import_zod13.z.array(import_zod13.z.string())).or(import_zod13.z.null()).optional().describe(
|
|
39831
|
+
"Detect entities in the transcript. Can be 'all' to detect all entities, a single entity type or category string, or a list of entity types/categories. Categories include 'pii', 'phi', 'pci', 'other', 'offensive_language'. When enabled, detected entities will be returned in the 'entities' field with their text, type, and character positions. Usage of this parameter will incur additional costs."
|
|
39832
|
+
),
|
|
39833
|
+
no_verbatim: import_zod13.z.boolean().optional().describe(
|
|
39834
|
+
"If true, the transcription will not have any filler words, false starts and non-speech sounds. Only supported with scribe_v2 model."
|
|
39835
|
+
),
|
|
39836
|
+
keyterms: import_zod13.z.array(import_zod13.z.string()).default(speechToTextBodyKeytermsDefault).describe(
|
|
39837
|
+
'A list of keyterms to bias the transcription towards. The keyterms are words or phrases you want the model to recognise more accurately. The number of keyterms cannot exceed 100. The length of each keyterm must be less than 50 characters. Keyterms can contain at most 5 words (after normalisation). For example ["hello", "world", "technical term"]. Usage of this parameter will incur additional costs. '
|
|
39838
|
+
)
|
|
39839
|
+
});
|
|
39840
|
+
var speechToTextResponse = import_zod13.z.object({
|
|
39841
|
+
language_code: import_zod13.z.string().describe("The detected language code (e.g. 'eng' for English)."),
|
|
39842
|
+
language_probability: import_zod13.z.number().describe("The confidence score of the language detection (0 to 1)."),
|
|
39843
|
+
text: import_zod13.z.string().describe("The raw text of the transcription."),
|
|
39844
|
+
words: import_zod13.z.array(
|
|
39845
|
+
import_zod13.z.object({
|
|
39846
|
+
text: import_zod13.z.string().describe("The word or sound that was transcribed."),
|
|
39847
|
+
start: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The start time of the word or sound in seconds."),
|
|
39848
|
+
end: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The end time of the word or sound in seconds."),
|
|
39849
|
+
type: import_zod13.z.enum(["word", "spacing", "audio_event"]).describe(
|
|
39850
|
+
"The type of the word or sound. 'audio_event' is used for non-word sounds like laughter or footsteps."
|
|
39851
|
+
),
|
|
39852
|
+
speaker_id: import_zod13.z.string().or(import_zod13.z.null()).optional().describe("Unique identifier for the speaker of this word."),
|
|
39853
|
+
logprob: import_zod13.z.number().describe(
|
|
39854
|
+
"The log of the probability with which this word was predicted. Logprobs are in range [-infinity, 0], higher logprobs indicate a higher confidence the model has in its predictions."
|
|
39855
|
+
),
|
|
39856
|
+
characters: import_zod13.z.array(
|
|
39857
|
+
import_zod13.z.object({
|
|
39858
|
+
text: import_zod13.z.string().describe("The character that was transcribed."),
|
|
39859
|
+
start: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The start time of the character in seconds."),
|
|
39860
|
+
end: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The end time of the character in seconds.")
|
|
39861
|
+
})
|
|
39862
|
+
).or(import_zod13.z.null()).optional().describe("The characters that make up the word and their timing information.")
|
|
39863
|
+
}).describe("Word-level detail of the transcription with timing information.")
|
|
39864
|
+
).describe("List of words with their timing information."),
|
|
39865
|
+
channel_index: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The channel index this transcript belongs to (for multichannel audio)."),
|
|
39866
|
+
additional_formats: import_zod13.z.array(
|
|
39867
|
+
import_zod13.z.object({
|
|
39868
|
+
requested_format: import_zod13.z.string().describe("The requested format."),
|
|
39869
|
+
file_extension: import_zod13.z.string().describe("The file extension of the additional format."),
|
|
39870
|
+
content_type: import_zod13.z.string().describe("The content type of the additional format."),
|
|
39871
|
+
is_base64_encoded: import_zod13.z.boolean().describe("Whether the content is base64 encoded."),
|
|
39872
|
+
content: import_zod13.z.string().describe("The content of the additional format.")
|
|
39873
|
+
}).or(import_zod13.z.null())
|
|
39874
|
+
).or(import_zod13.z.null()).optional().describe("Requested additional formats of the transcript."),
|
|
39875
|
+
transcription_id: import_zod13.z.string().or(import_zod13.z.null()).optional().describe("The transcription ID of the response."),
|
|
39876
|
+
entities: import_zod13.z.array(
|
|
39877
|
+
import_zod13.z.object({
|
|
39878
|
+
text: import_zod13.z.string().describe("The text that was identified as an entity."),
|
|
39879
|
+
entity_type: import_zod13.z.string().describe(
|
|
39880
|
+
"The type of entity detected (e.g., 'credit_card', 'email_address', 'person_name')."
|
|
39881
|
+
),
|
|
39882
|
+
start_char: import_zod13.z.number().describe("Start character position in the transcript text."),
|
|
39883
|
+
end_char: import_zod13.z.number().describe("End character position in the transcript text.")
|
|
39884
|
+
})
|
|
39885
|
+
).or(import_zod13.z.null()).optional().describe(
|
|
39886
|
+
"List of detected entities with their text, type, and character positions in the transcript."
|
|
39887
|
+
)
|
|
39888
|
+
}).describe("Chunk-level detail of the transcription with timing information.").or(
|
|
39889
|
+
import_zod13.z.object({
|
|
39890
|
+
transcripts: import_zod13.z.array(
|
|
39891
|
+
import_zod13.z.object({
|
|
39892
|
+
language_code: import_zod13.z.string().describe("The detected language code (e.g. 'eng' for English)."),
|
|
39893
|
+
language_probability: import_zod13.z.number().describe("The confidence score of the language detection (0 to 1)."),
|
|
39894
|
+
text: import_zod13.z.string().describe("The raw text of the transcription."),
|
|
39895
|
+
words: import_zod13.z.array(
|
|
39896
|
+
import_zod13.z.object({
|
|
39897
|
+
text: import_zod13.z.string().describe("The word or sound that was transcribed."),
|
|
39898
|
+
start: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The start time of the word or sound in seconds."),
|
|
39899
|
+
end: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The end time of the word or sound in seconds."),
|
|
39900
|
+
type: import_zod13.z.enum(["word", "spacing", "audio_event"]).describe(
|
|
39901
|
+
"The type of the word or sound. 'audio_event' is used for non-word sounds like laughter or footsteps."
|
|
39902
|
+
),
|
|
39903
|
+
speaker_id: import_zod13.z.string().or(import_zod13.z.null()).optional().describe("Unique identifier for the speaker of this word."),
|
|
39904
|
+
logprob: import_zod13.z.number().describe(
|
|
39905
|
+
"The log of the probability with which this word was predicted. Logprobs are in range [-infinity, 0], higher logprobs indicate a higher confidence the model has in its predictions."
|
|
39906
|
+
),
|
|
39907
|
+
characters: import_zod13.z.array(
|
|
39908
|
+
import_zod13.z.object({
|
|
39909
|
+
text: import_zod13.z.string().describe("The character that was transcribed."),
|
|
39910
|
+
start: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The start time of the character in seconds."),
|
|
39911
|
+
end: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The end time of the character in seconds.")
|
|
39912
|
+
})
|
|
39913
|
+
).or(import_zod13.z.null()).optional().describe(
|
|
39914
|
+
"The characters that make up the word and their timing information."
|
|
39915
|
+
)
|
|
39916
|
+
}).describe("Word-level detail of the transcription with timing information.")
|
|
39917
|
+
).describe("List of words with their timing information."),
|
|
39918
|
+
channel_index: import_zod13.z.number().or(import_zod13.z.null()).optional().describe(
|
|
39919
|
+
"The channel index this transcript belongs to (for multichannel audio)."
|
|
39920
|
+
),
|
|
39921
|
+
additional_formats: import_zod13.z.array(
|
|
39922
|
+
import_zod13.z.object({
|
|
39923
|
+
requested_format: import_zod13.z.string().describe("The requested format."),
|
|
39924
|
+
file_extension: import_zod13.z.string().describe("The file extension of the additional format."),
|
|
39925
|
+
content_type: import_zod13.z.string().describe("The content type of the additional format."),
|
|
39926
|
+
is_base64_encoded: import_zod13.z.boolean().describe("Whether the content is base64 encoded."),
|
|
39927
|
+
content: import_zod13.z.string().describe("The content of the additional format.")
|
|
39928
|
+
}).or(import_zod13.z.null())
|
|
39929
|
+
).or(import_zod13.z.null()).optional().describe("Requested additional formats of the transcript."),
|
|
39930
|
+
transcription_id: import_zod13.z.string().or(import_zod13.z.null()).optional().describe("The transcription ID of the response."),
|
|
39931
|
+
entities: import_zod13.z.array(
|
|
39932
|
+
import_zod13.z.object({
|
|
39933
|
+
text: import_zod13.z.string().describe("The text that was identified as an entity."),
|
|
39934
|
+
entity_type: import_zod13.z.string().describe(
|
|
39935
|
+
"The type of entity detected (e.g., 'credit_card', 'email_address', 'person_name')."
|
|
39936
|
+
),
|
|
39937
|
+
start_char: import_zod13.z.number().describe("Start character position in the transcript text."),
|
|
39938
|
+
end_char: import_zod13.z.number().describe("End character position in the transcript text.")
|
|
39939
|
+
})
|
|
39940
|
+
).or(import_zod13.z.null()).optional().describe(
|
|
39941
|
+
"List of detected entities with their text, type, and character positions in the transcript."
|
|
39942
|
+
)
|
|
39943
|
+
}).describe("Chunk-level detail of the transcription with timing information.")
|
|
39944
|
+
).describe(
|
|
39945
|
+
"List of transcripts, one for each audio channel. Each transcript contains the text and word-level details for its respective channel."
|
|
39946
|
+
),
|
|
39947
|
+
transcription_id: import_zod13.z.string().or(import_zod13.z.null()).optional().describe("The transcription ID of the response.")
|
|
39948
|
+
}).describe("Response model for multichannel speech-to-text transcription.")
|
|
39949
|
+
);
|
|
39950
|
+
var getTranscriptByIdParams = import_zod13.z.object({
|
|
39951
|
+
transcription_id: import_zod13.z.string().describe("The unique ID of the transcript to retrieve")
|
|
39952
|
+
});
|
|
39953
|
+
var getTranscriptByIdHeader = import_zod13.z.object({
|
|
39954
|
+
"xi-api-key": import_zod13.z.string().or(import_zod13.z.null()).optional().describe(
|
|
39955
|
+
"Your API key. This is required by most endpoints to access our API programmatically. You can view your xi-api-key using the 'Profile' tab on the website."
|
|
39956
|
+
)
|
|
39957
|
+
});
|
|
39958
|
+
var getTranscriptByIdResponse = import_zod13.z.object({
|
|
39959
|
+
language_code: import_zod13.z.string().describe("The detected language code (e.g. 'eng' for English)."),
|
|
39960
|
+
language_probability: import_zod13.z.number().describe("The confidence score of the language detection (0 to 1)."),
|
|
39961
|
+
text: import_zod13.z.string().describe("The raw text of the transcription."),
|
|
39962
|
+
words: import_zod13.z.array(
|
|
39963
|
+
import_zod13.z.object({
|
|
39964
|
+
text: import_zod13.z.string().describe("The word or sound that was transcribed."),
|
|
39965
|
+
start: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The start time of the word or sound in seconds."),
|
|
39966
|
+
end: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The end time of the word or sound in seconds."),
|
|
39967
|
+
type: import_zod13.z.enum(["word", "spacing", "audio_event"]).describe(
|
|
39968
|
+
"The type of the word or sound. 'audio_event' is used for non-word sounds like laughter or footsteps."
|
|
39969
|
+
),
|
|
39970
|
+
speaker_id: import_zod13.z.string().or(import_zod13.z.null()).optional().describe("Unique identifier for the speaker of this word."),
|
|
39971
|
+
logprob: import_zod13.z.number().describe(
|
|
39972
|
+
"The log of the probability with which this word was predicted. Logprobs are in range [-infinity, 0], higher logprobs indicate a higher confidence the model has in its predictions."
|
|
39973
|
+
),
|
|
39974
|
+
characters: import_zod13.z.array(
|
|
39975
|
+
import_zod13.z.object({
|
|
39976
|
+
text: import_zod13.z.string().describe("The character that was transcribed."),
|
|
39977
|
+
start: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The start time of the character in seconds."),
|
|
39978
|
+
end: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The end time of the character in seconds.")
|
|
39979
|
+
})
|
|
39980
|
+
).or(import_zod13.z.null()).optional().describe("The characters that make up the word and their timing information.")
|
|
39981
|
+
}).describe("Word-level detail of the transcription with timing information.")
|
|
39982
|
+
).describe("List of words with their timing information."),
|
|
39983
|
+
channel_index: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The channel index this transcript belongs to (for multichannel audio)."),
|
|
39984
|
+
additional_formats: import_zod13.z.array(
|
|
39985
|
+
import_zod13.z.object({
|
|
39986
|
+
requested_format: import_zod13.z.string().describe("The requested format."),
|
|
39987
|
+
file_extension: import_zod13.z.string().describe("The file extension of the additional format."),
|
|
39988
|
+
content_type: import_zod13.z.string().describe("The content type of the additional format."),
|
|
39989
|
+
is_base64_encoded: import_zod13.z.boolean().describe("Whether the content is base64 encoded."),
|
|
39990
|
+
content: import_zod13.z.string().describe("The content of the additional format.")
|
|
39991
|
+
}).or(import_zod13.z.null())
|
|
39992
|
+
).or(import_zod13.z.null()).optional().describe("Requested additional formats of the transcript."),
|
|
39993
|
+
transcription_id: import_zod13.z.string().or(import_zod13.z.null()).optional().describe("The transcription ID of the response."),
|
|
39994
|
+
entities: import_zod13.z.array(
|
|
39995
|
+
import_zod13.z.object({
|
|
39996
|
+
text: import_zod13.z.string().describe("The text that was identified as an entity."),
|
|
39997
|
+
entity_type: import_zod13.z.string().describe(
|
|
39998
|
+
"The type of entity detected (e.g., 'credit_card', 'email_address', 'person_name')."
|
|
39999
|
+
),
|
|
40000
|
+
start_char: import_zod13.z.number().describe("Start character position in the transcript text."),
|
|
40001
|
+
end_char: import_zod13.z.number().describe("End character position in the transcript text.")
|
|
40002
|
+
})
|
|
40003
|
+
).or(import_zod13.z.null()).optional().describe(
|
|
40004
|
+
"List of detected entities with their text, type, and character positions in the transcript."
|
|
40005
|
+
)
|
|
40006
|
+
}).describe("Chunk-level detail of the transcription with timing information.").or(
|
|
40007
|
+
import_zod13.z.object({
|
|
40008
|
+
transcripts: import_zod13.z.array(
|
|
40009
|
+
import_zod13.z.object({
|
|
40010
|
+
language_code: import_zod13.z.string().describe("The detected language code (e.g. 'eng' for English)."),
|
|
40011
|
+
language_probability: import_zod13.z.number().describe("The confidence score of the language detection (0 to 1)."),
|
|
40012
|
+
text: import_zod13.z.string().describe("The raw text of the transcription."),
|
|
40013
|
+
words: import_zod13.z.array(
|
|
40014
|
+
import_zod13.z.object({
|
|
40015
|
+
text: import_zod13.z.string().describe("The word or sound that was transcribed."),
|
|
40016
|
+
start: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The start time of the word or sound in seconds."),
|
|
40017
|
+
end: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The end time of the word or sound in seconds."),
|
|
40018
|
+
type: import_zod13.z.enum(["word", "spacing", "audio_event"]).describe(
|
|
40019
|
+
"The type of the word or sound. 'audio_event' is used for non-word sounds like laughter or footsteps."
|
|
40020
|
+
),
|
|
40021
|
+
speaker_id: import_zod13.z.string().or(import_zod13.z.null()).optional().describe("Unique identifier for the speaker of this word."),
|
|
40022
|
+
logprob: import_zod13.z.number().describe(
|
|
40023
|
+
"The log of the probability with which this word was predicted. Logprobs are in range [-infinity, 0], higher logprobs indicate a higher confidence the model has in its predictions."
|
|
40024
|
+
),
|
|
40025
|
+
characters: import_zod13.z.array(
|
|
40026
|
+
import_zod13.z.object({
|
|
40027
|
+
text: import_zod13.z.string().describe("The character that was transcribed."),
|
|
40028
|
+
start: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The start time of the character in seconds."),
|
|
40029
|
+
end: import_zod13.z.number().or(import_zod13.z.null()).optional().describe("The end time of the character in seconds.")
|
|
40030
|
+
})
|
|
40031
|
+
).or(import_zod13.z.null()).optional().describe(
|
|
40032
|
+
"The characters that make up the word and their timing information."
|
|
40033
|
+
)
|
|
40034
|
+
}).describe("Word-level detail of the transcription with timing information.")
|
|
40035
|
+
).describe("List of words with their timing information."),
|
|
40036
|
+
channel_index: import_zod13.z.number().or(import_zod13.z.null()).optional().describe(
|
|
40037
|
+
"The channel index this transcript belongs to (for multichannel audio)."
|
|
40038
|
+
),
|
|
40039
|
+
additional_formats: import_zod13.z.array(
|
|
40040
|
+
import_zod13.z.object({
|
|
40041
|
+
requested_format: import_zod13.z.string().describe("The requested format."),
|
|
40042
|
+
file_extension: import_zod13.z.string().describe("The file extension of the additional format."),
|
|
40043
|
+
content_type: import_zod13.z.string().describe("The content type of the additional format."),
|
|
40044
|
+
is_base64_encoded: import_zod13.z.boolean().describe("Whether the content is base64 encoded."),
|
|
40045
|
+
content: import_zod13.z.string().describe("The content of the additional format.")
|
|
40046
|
+
}).or(import_zod13.z.null())
|
|
40047
|
+
).or(import_zod13.z.null()).optional().describe("Requested additional formats of the transcript."),
|
|
40048
|
+
transcription_id: import_zod13.z.string().or(import_zod13.z.null()).optional().describe("The transcription ID of the response."),
|
|
40049
|
+
entities: import_zod13.z.array(
|
|
40050
|
+
import_zod13.z.object({
|
|
40051
|
+
text: import_zod13.z.string().describe("The text that was identified as an entity."),
|
|
40052
|
+
entity_type: import_zod13.z.string().describe(
|
|
40053
|
+
"The type of entity detected (e.g., 'credit_card', 'email_address', 'person_name')."
|
|
40054
|
+
),
|
|
40055
|
+
start_char: import_zod13.z.number().describe("Start character position in the transcript text."),
|
|
40056
|
+
end_char: import_zod13.z.number().describe("End character position in the transcript text.")
|
|
40057
|
+
})
|
|
40058
|
+
).or(import_zod13.z.null()).optional().describe(
|
|
40059
|
+
"List of detected entities with their text, type, and character positions in the transcript."
|
|
40060
|
+
)
|
|
40061
|
+
}).describe("Chunk-level detail of the transcription with timing information.")
|
|
40062
|
+
).describe(
|
|
40063
|
+
"List of transcripts, one for each audio channel. Each transcript contains the text and word-level details for its respective channel."
|
|
40064
|
+
),
|
|
40065
|
+
transcription_id: import_zod13.z.string().or(import_zod13.z.null()).optional().describe("The transcription ID of the response.")
|
|
40066
|
+
}).describe("Response model for multichannel speech-to-text transcription.")
|
|
40067
|
+
);
|
|
40068
|
+
var deleteTranscriptByIdParams = import_zod13.z.object({
|
|
40069
|
+
transcription_id: import_zod13.z.string().describe("The unique ID of the transcript to delete")
|
|
40070
|
+
});
|
|
40071
|
+
var deleteTranscriptByIdHeader = import_zod13.z.object({
|
|
40072
|
+
"xi-api-key": import_zod13.z.string().or(import_zod13.z.null()).optional().describe(
|
|
40073
|
+
"Your API key. This is required by most endpoints to access our API programmatically. You can view your xi-api-key using the 'Profile' tab on the website."
|
|
40074
|
+
)
|
|
40075
|
+
});
|
|
40076
|
+
var deleteTranscriptByIdResponse = import_zod13.z.any();
|
|
38661
40077
|
// Annotate the CommonJS export names for ESM import in node:
|
|
38662
40078
|
0 && (module.exports = {
|
|
38663
40079
|
AllLanguageCodes,
|
|
@@ -38713,6 +40129,13 @@ var getUsageResponse = import_zod12.z.object({
|
|
|
38713
40129
|
DeepgramTranscriptionSchema,
|
|
38714
40130
|
DeepgramTypes,
|
|
38715
40131
|
DeepgramZodSchemas,
|
|
40132
|
+
ElevenLabsAdapter,
|
|
40133
|
+
ElevenLabsCapabilities,
|
|
40134
|
+
ElevenLabsLanguageCodes,
|
|
40135
|
+
ElevenLabsLanguageLabels,
|
|
40136
|
+
ElevenLabsLanguages,
|
|
40137
|
+
ElevenLabsTypes,
|
|
40138
|
+
ElevenLabsZodSchemas,
|
|
38716
40139
|
GladiaAdapter,
|
|
38717
40140
|
GladiaBitDepth,
|
|
38718
40141
|
GladiaCapabilities,
|
|
@@ -38799,6 +40222,7 @@ var getUsageResponse = import_zod12.z.object({
|
|
|
38799
40222
|
createAssemblyAIAdapter,
|
|
38800
40223
|
createAzureSTTAdapter,
|
|
38801
40224
|
createDeepgramAdapter,
|
|
40225
|
+
createElevenLabsAdapter,
|
|
38802
40226
|
createGladiaAdapter,
|
|
38803
40227
|
createOpenAIWhisperAdapter,
|
|
38804
40228
|
createSonioxAdapter,
|