voice-router-dev 0.7.9 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +202 -1
- package/README.md +21 -2
- package/dist/constants.d.mts +600 -12
- package/dist/constants.d.ts +600 -12
- package/dist/constants.js +548 -5
- package/dist/constants.mjs +537 -5
- package/dist/{field-configs-CaXYfrJg.d.mts → field-configs-CDVygOte.d.mts} +26 -20
- package/dist/{field-configs-CaXYfrJg.d.ts → field-configs-CDVygOte.d.ts} +26 -20
- package/dist/field-configs.d.mts +1 -1
- package/dist/field-configs.d.ts +1 -1
- package/dist/field-configs.js +7 -4
- package/dist/field-configs.mjs +7 -4
- package/dist/index.d.mts +3184 -1367
- package/dist/index.d.ts +3184 -1367
- package/dist/index.js +1529 -105
- package/dist/index.mjs +1521 -105
- package/dist/{provider-metadata-DVQcYIHe.d.mts → provider-metadata-BnkedpXm.d.mts} +34 -4
- package/dist/{provider-metadata-Derls1wa.d.ts → provider-metadata-DbsSGAO7.d.ts} +34 -4
- package/dist/provider-metadata.d.mts +2 -2
- package/dist/provider-metadata.d.ts +2 -2
- package/dist/provider-metadata.js +349 -9
- package/dist/provider-metadata.mjs +345 -9
- package/dist/{transcriptWebhookNotification-BTxv69ck.d.ts → transcriptWebhookNotification-BJk1CEF5.d.ts} +712 -9
- package/dist/{transcriptWebhookNotification-DCcbnAKP.d.mts → transcriptWebhookNotification-CNFpns9f.d.mts} +712 -9
- package/dist/webhooks.d.mts +102 -5
- package/dist/webhooks.d.ts +102 -5
- package/dist/webhooks.js +342 -39
- package/dist/webhooks.mjs +340 -39
- package/package.json +11 -5
package/dist/index.mjs
CHANGED
|
@@ -420,6 +420,22 @@ var DeepgramLanguage = {
|
|
|
420
420
|
yo: "yo",
|
|
421
421
|
zh: "zh",
|
|
422
422
|
// Regional variants
|
|
423
|
+
"ar-AE": "ar-AE",
|
|
424
|
+
"ar-DZ": "ar-DZ",
|
|
425
|
+
"ar-EG": "ar-EG",
|
|
426
|
+
"ar-IQ": "ar-IQ",
|
|
427
|
+
"ar-IR": "ar-IR",
|
|
428
|
+
"ar-JO": "ar-JO",
|
|
429
|
+
"ar-KW": "ar-KW",
|
|
430
|
+
"ar-LB": "ar-LB",
|
|
431
|
+
"ar-MA": "ar-MA",
|
|
432
|
+
"ar-PS": "ar-PS",
|
|
433
|
+
"ar-QA": "ar-QA",
|
|
434
|
+
"ar-SA": "ar-SA",
|
|
435
|
+
"ar-SD": "ar-SD",
|
|
436
|
+
"ar-SY": "ar-SY",
|
|
437
|
+
"ar-TD": "ar-TD",
|
|
438
|
+
"ar-TN": "ar-TN",
|
|
423
439
|
"be-BY": "be-BY",
|
|
424
440
|
"bn-IN": "bn-IN",
|
|
425
441
|
"bs-BA": "bs-BA",
|
|
@@ -734,7 +750,9 @@ var SonioxLanguage = {
|
|
|
734
750
|
|
|
735
751
|
// src/generated/soniox/models.ts
|
|
736
752
|
var SonioxModels = [
|
|
753
|
+
{ id: "stt-rt-v4", name: "Speech-to-Text Real-time v4", mode: "real_time" },
|
|
737
754
|
{ id: "stt-rt-v3", name: "Speech-to-Text Real-time v3", mode: "real_time" },
|
|
755
|
+
{ id: "stt-async-v4", name: "Speech-to-Text Async v4", mode: "async" },
|
|
738
756
|
{ id: "stt-async-v3", name: "Speech-to-Text Async v3", mode: "async" },
|
|
739
757
|
{ id: "stt-rt-preview", name: "Speech-to-Text Real-time Preview", mode: "real_time", aliasOf: "stt-rt-v3" },
|
|
740
758
|
{ id: "stt-async-preview", name: "Speech-to-Text Async Preview", mode: "async", aliasOf: "stt-async-v3" },
|
|
@@ -743,7 +761,9 @@ var SonioxModels = [
|
|
|
743
761
|
{ id: "stt-async-preview-v1", name: "Speech-to-Text Async Preview v1", mode: "async", aliasOf: "stt-async-v3" }
|
|
744
762
|
];
|
|
745
763
|
var SonioxModelCodes = [
|
|
764
|
+
"stt-rt-v4",
|
|
746
765
|
"stt-rt-v3",
|
|
766
|
+
"stt-async-v4",
|
|
747
767
|
"stt-async-v3",
|
|
748
768
|
"stt-rt-preview",
|
|
749
769
|
"stt-async-preview",
|
|
@@ -752,7 +772,9 @@ var SonioxModelCodes = [
|
|
|
752
772
|
"stt-async-preview-v1"
|
|
753
773
|
];
|
|
754
774
|
var SonioxModelLabels = {
|
|
775
|
+
"stt-rt-v4": "Speech-to-Text Real-time v4",
|
|
755
776
|
"stt-rt-v3": "Speech-to-Text Real-time v3",
|
|
777
|
+
"stt-async-v4": "Speech-to-Text Async v4",
|
|
756
778
|
"stt-async-v3": "Speech-to-Text Async v3",
|
|
757
779
|
"stt-rt-preview": "Speech-to-Text Real-time Preview",
|
|
758
780
|
"stt-async-preview": "Speech-to-Text Async Preview",
|
|
@@ -761,7 +783,9 @@ var SonioxModelLabels = {
|
|
|
761
783
|
"stt-async-preview-v1": "Speech-to-Text Async Preview v1"
|
|
762
784
|
};
|
|
763
785
|
var SonioxModel = {
|
|
786
|
+
stt_rt_v4: "stt-rt-v4",
|
|
764
787
|
stt_rt_v3: "stt-rt-v3",
|
|
788
|
+
stt_async_v4: "stt-async-v4",
|
|
765
789
|
stt_async_v3: "stt-async-v3",
|
|
766
790
|
stt_rt_preview: "stt-rt-preview",
|
|
767
791
|
stt_async_preview: "stt-async-preview",
|
|
@@ -770,12 +794,14 @@ var SonioxModel = {
|
|
|
770
794
|
stt_async_preview_v1: "stt-async-preview-v1"
|
|
771
795
|
};
|
|
772
796
|
var SonioxRealtimeModel = {
|
|
797
|
+
stt_rt_v4: "stt-rt-v4",
|
|
773
798
|
stt_rt_v3: "stt-rt-v3",
|
|
774
799
|
stt_rt_preview: "stt-rt-preview",
|
|
775
800
|
stt_rt_v3_preview: "stt-rt-v3-preview",
|
|
776
801
|
stt_rt_preview_v2: "stt-rt-preview-v2"
|
|
777
802
|
};
|
|
778
803
|
var SonioxAsyncModel = {
|
|
804
|
+
stt_async_v4: "stt-async-v4",
|
|
779
805
|
stt_async_v3: "stt-async-v3",
|
|
780
806
|
stt_async_preview: "stt-async-preview",
|
|
781
807
|
stt_async_preview_v1: "stt-async-preview-v1"
|
|
@@ -785,6 +811,7 @@ var SonioxAsyncModel = {
|
|
|
785
811
|
var SpeechmaticsLanguages = [
|
|
786
812
|
{ code: "auto", name: "Automatic Detection" },
|
|
787
813
|
{ code: "ar", name: "Arabic" },
|
|
814
|
+
{ code: "ar_en", name: "Arabic / English" },
|
|
788
815
|
{ code: "ba", name: "Bashkir" },
|
|
789
816
|
{ code: "be", name: "Belarusian" },
|
|
790
817
|
{ code: "bg", name: "Bulgarian" },
|
|
@@ -849,6 +876,7 @@ var SpeechmaticsLanguages = [
|
|
|
849
876
|
var SpeechmaticsLanguageCodes = [
|
|
850
877
|
"auto",
|
|
851
878
|
"ar",
|
|
879
|
+
"ar_en",
|
|
852
880
|
"ba",
|
|
853
881
|
"be",
|
|
854
882
|
"bg",
|
|
@@ -913,6 +941,7 @@ var SpeechmaticsLanguageCodes = [
|
|
|
913
941
|
var SpeechmaticsLanguageLabels = {
|
|
914
942
|
"auto": "Automatic Detection",
|
|
915
943
|
"ar": "Arabic",
|
|
944
|
+
"ar_en": "Arabic / English",
|
|
916
945
|
"ba": "Bashkir",
|
|
917
946
|
"be": "Belarusian",
|
|
918
947
|
"bg": "Bulgarian",
|
|
@@ -977,6 +1006,7 @@ var SpeechmaticsLanguageLabels = {
|
|
|
977
1006
|
var SpeechmaticsLanguage = {
|
|
978
1007
|
"auto": "auto",
|
|
979
1008
|
"ar": "ar",
|
|
1009
|
+
"ar_en": "ar_en",
|
|
980
1010
|
"ba": "ba",
|
|
981
1011
|
"be": "be",
|
|
982
1012
|
"bg": "bg",
|
|
@@ -1163,7 +1193,6 @@ var AzureLocales = [
|
|
|
1163
1193
|
{ code: "ne-NP", name: "Nepali (Nepal)" },
|
|
1164
1194
|
{ code: "nl-BE", name: "Dutch (Belgium)" },
|
|
1165
1195
|
{ code: "nl-NL", name: "Dutch (Netherlands)" },
|
|
1166
|
-
{ code: "non-HD", name: "Norse (Historical)" },
|
|
1167
1196
|
{ code: "or-IN", name: "Odia (India)" },
|
|
1168
1197
|
{ code: "pa-IN", name: "Punjabi (India)" },
|
|
1169
1198
|
{ code: "pl-PL", name: "Polish (Poland)" },
|
|
@@ -1177,7 +1206,9 @@ var AzureLocales = [
|
|
|
1177
1206
|
{ code: "sl-SI", name: "Slovenian (Slovenia)" },
|
|
1178
1207
|
{ code: "so-SO", name: "Somali (Somalia)" },
|
|
1179
1208
|
{ code: "sq-AL", name: "Albanian (Albania)" },
|
|
1209
|
+
{ code: "sr-ME", name: "Serbian (ME)" },
|
|
1180
1210
|
{ code: "sr-RS", name: "Serbian (Serbia)" },
|
|
1211
|
+
{ code: "sr-XK", name: "Serbian (XK)" },
|
|
1181
1212
|
{ code: "su-ID", name: "Sundanese (Indonesia)" },
|
|
1182
1213
|
{ code: "sv-SE", name: "Swedish (Sweden)" },
|
|
1183
1214
|
{ code: "sw-KE", name: "Swahili (Kenya)" },
|
|
@@ -1319,7 +1350,6 @@ var AzureLocaleCodes = [
|
|
|
1319
1350
|
"ne-NP",
|
|
1320
1351
|
"nl-BE",
|
|
1321
1352
|
"nl-NL",
|
|
1322
|
-
"non-HD",
|
|
1323
1353
|
"or-IN",
|
|
1324
1354
|
"pa-IN",
|
|
1325
1355
|
"pl-PL",
|
|
@@ -1333,7 +1363,9 @@ var AzureLocaleCodes = [
|
|
|
1333
1363
|
"sl-SI",
|
|
1334
1364
|
"so-SO",
|
|
1335
1365
|
"sq-AL",
|
|
1366
|
+
"sr-ME",
|
|
1336
1367
|
"sr-RS",
|
|
1368
|
+
"sr-XK",
|
|
1337
1369
|
"su-ID",
|
|
1338
1370
|
"sv-SE",
|
|
1339
1371
|
"sw-KE",
|
|
@@ -1475,7 +1507,6 @@ var AzureLocaleLabels = {
|
|
|
1475
1507
|
"ne-NP": "Nepali (Nepal)",
|
|
1476
1508
|
"nl-BE": "Dutch (Belgium)",
|
|
1477
1509
|
"nl-NL": "Dutch (Netherlands)",
|
|
1478
|
-
"non-HD": "Norse (Historical)",
|
|
1479
1510
|
"or-IN": "Odia (India)",
|
|
1480
1511
|
"pa-IN": "Punjabi (India)",
|
|
1481
1512
|
"pl-PL": "Polish (Poland)",
|
|
@@ -1489,7 +1520,9 @@ var AzureLocaleLabels = {
|
|
|
1489
1520
|
"sl-SI": "Slovenian (Slovenia)",
|
|
1490
1521
|
"so-SO": "Somali (Somalia)",
|
|
1491
1522
|
"sq-AL": "Albanian (Albania)",
|
|
1523
|
+
"sr-ME": "Serbian (ME)",
|
|
1492
1524
|
"sr-RS": "Serbian (Serbia)",
|
|
1525
|
+
"sr-XK": "Serbian (XK)",
|
|
1493
1526
|
"su-ID": "Sundanese (Indonesia)",
|
|
1494
1527
|
"sv-SE": "Swedish (Sweden)",
|
|
1495
1528
|
"sw-KE": "Swahili (Kenya)",
|
|
@@ -1631,7 +1664,6 @@ var AzureLocale = {
|
|
|
1631
1664
|
"ne-NP": "ne-NP",
|
|
1632
1665
|
"nl-BE": "nl-BE",
|
|
1633
1666
|
"nl-NL": "nl-NL",
|
|
1634
|
-
"non-HD": "non-HD",
|
|
1635
1667
|
"or-IN": "or-IN",
|
|
1636
1668
|
"pa-IN": "pa-IN",
|
|
1637
1669
|
"pl-PL": "pl-PL",
|
|
@@ -1645,7 +1677,9 @@ var AzureLocale = {
|
|
|
1645
1677
|
"sl-SI": "sl-SI",
|
|
1646
1678
|
"so-SO": "so-SO",
|
|
1647
1679
|
"sq-AL": "sq-AL",
|
|
1680
|
+
"sr-ME": "sr-ME",
|
|
1648
1681
|
"sr-RS": "sr-RS",
|
|
1682
|
+
"sr-XK": "sr-XK",
|
|
1649
1683
|
"su-ID": "su-ID",
|
|
1650
1684
|
"sv-SE": "sv-SE",
|
|
1651
1685
|
"sw-KE": "sw-KE",
|
|
@@ -1671,6 +1705,311 @@ var AzureLocale = {
|
|
|
1671
1705
|
"zu-ZA": "zu-ZA"
|
|
1672
1706
|
};
|
|
1673
1707
|
|
|
1708
|
+
// src/generated/elevenlabs/languages.ts
|
|
1709
|
+
var ElevenLabsLanguages = [
|
|
1710
|
+
{ code: "en", name: "English" },
|
|
1711
|
+
{ code: "zh", name: "Chinese" },
|
|
1712
|
+
{ code: "de", name: "German" },
|
|
1713
|
+
{ code: "es", name: "Spanish" },
|
|
1714
|
+
{ code: "ru", name: "Russian" },
|
|
1715
|
+
{ code: "ko", name: "Korean" },
|
|
1716
|
+
{ code: "fr", name: "French" },
|
|
1717
|
+
{ code: "ja", name: "Japanese" },
|
|
1718
|
+
{ code: "pt", name: "Portuguese" },
|
|
1719
|
+
{ code: "tr", name: "Turkish" },
|
|
1720
|
+
{ code: "pl", name: "Polish" },
|
|
1721
|
+
{ code: "ca", name: "Catalan" },
|
|
1722
|
+
{ code: "nl", name: "Dutch" },
|
|
1723
|
+
{ code: "ar", name: "Arabic" },
|
|
1724
|
+
{ code: "sv", name: "Swedish" },
|
|
1725
|
+
{ code: "it", name: "Italian" },
|
|
1726
|
+
{ code: "id", name: "Indonesian" },
|
|
1727
|
+
{ code: "hi", name: "Hindi" },
|
|
1728
|
+
{ code: "fi", name: "Finnish" },
|
|
1729
|
+
{ code: "vi", name: "Vietnamese" },
|
|
1730
|
+
{ code: "he", name: "Hebrew" },
|
|
1731
|
+
{ code: "uk", name: "Ukrainian" },
|
|
1732
|
+
{ code: "el", name: "Greek" },
|
|
1733
|
+
{ code: "ms", name: "Malay" },
|
|
1734
|
+
{ code: "cs", name: "Czech" },
|
|
1735
|
+
{ code: "ro", name: "Romanian" },
|
|
1736
|
+
{ code: "da", name: "Danish" },
|
|
1737
|
+
{ code: "hu", name: "Hungarian" },
|
|
1738
|
+
{ code: "ta", name: "Tamil" },
|
|
1739
|
+
{ code: "no", name: "Norwegian" },
|
|
1740
|
+
{ code: "th", name: "Thai" },
|
|
1741
|
+
{ code: "ur", name: "Urdu" },
|
|
1742
|
+
{ code: "hr", name: "Croatian" },
|
|
1743
|
+
{ code: "bg", name: "Bulgarian" },
|
|
1744
|
+
{ code: "lt", name: "Lithuanian" },
|
|
1745
|
+
{ code: "la", name: "Latin" },
|
|
1746
|
+
{ code: "mi", name: "Maori" },
|
|
1747
|
+
{ code: "ml", name: "Malayalam" },
|
|
1748
|
+
{ code: "cy", name: "Welsh" },
|
|
1749
|
+
{ code: "sk", name: "Slovak" },
|
|
1750
|
+
{ code: "te", name: "Telugu" },
|
|
1751
|
+
{ code: "fa", name: "Persian" },
|
|
1752
|
+
{ code: "lv", name: "Latvian" },
|
|
1753
|
+
{ code: "bn", name: "Bengali" },
|
|
1754
|
+
{ code: "sr", name: "Serbian" },
|
|
1755
|
+
{ code: "az", name: "Azerbaijani" },
|
|
1756
|
+
{ code: "sl", name: "Slovenian" },
|
|
1757
|
+
{ code: "kn", name: "Kannada" },
|
|
1758
|
+
{ code: "et", name: "Estonian" },
|
|
1759
|
+
{ code: "mk", name: "Macedonian" },
|
|
1760
|
+
{ code: "br", name: "Breton" },
|
|
1761
|
+
{ code: "eu", name: "Basque" },
|
|
1762
|
+
{ code: "is", name: "Icelandic" },
|
|
1763
|
+
{ code: "hy", name: "Armenian" },
|
|
1764
|
+
{ code: "ne", name: "Nepali" },
|
|
1765
|
+
{ code: "mn", name: "Mongolian" },
|
|
1766
|
+
{ code: "bs", name: "Bosnian" },
|
|
1767
|
+
{ code: "kk", name: "Kazakh" },
|
|
1768
|
+
{ code: "sq", name: "Albanian" },
|
|
1769
|
+
{ code: "sw", name: "Swahili" },
|
|
1770
|
+
{ code: "gl", name: "Galician" },
|
|
1771
|
+
{ code: "mr", name: "Marathi" },
|
|
1772
|
+
{ code: "pa", name: "Punjabi" },
|
|
1773
|
+
{ code: "si", name: "Sinhala" },
|
|
1774
|
+
{ code: "km", name: "Khmer" },
|
|
1775
|
+
{ code: "sn", name: "Shona" },
|
|
1776
|
+
{ code: "yo", name: "Yoruba" },
|
|
1777
|
+
{ code: "so", name: "Somali" },
|
|
1778
|
+
{ code: "af", name: "Afrikaans" },
|
|
1779
|
+
{ code: "oc", name: "Occitan" },
|
|
1780
|
+
{ code: "ka", name: "Georgian" },
|
|
1781
|
+
{ code: "be", name: "Belarusian" },
|
|
1782
|
+
{ code: "tg", name: "Tajik" },
|
|
1783
|
+
{ code: "sd", name: "Sindhi" },
|
|
1784
|
+
{ code: "gu", name: "Gujarati" },
|
|
1785
|
+
{ code: "am", name: "Amharic" },
|
|
1786
|
+
{ code: "yi", name: "Yiddish" },
|
|
1787
|
+
{ code: "lo", name: "Lao" },
|
|
1788
|
+
{ code: "uz", name: "Uzbek" },
|
|
1789
|
+
{ code: "fo", name: "Faroese" },
|
|
1790
|
+
{ code: "ht", name: "Haitian Creole" },
|
|
1791
|
+
{ code: "ps", name: "Pashto" },
|
|
1792
|
+
{ code: "tk", name: "Turkmen" },
|
|
1793
|
+
{ code: "nn", name: "Norwegian Nynorsk" },
|
|
1794
|
+
{ code: "mt", name: "Maltese" },
|
|
1795
|
+
{ code: "sa", name: "Sanskrit" },
|
|
1796
|
+
{ code: "lb", name: "Luxembourgish" },
|
|
1797
|
+
{ code: "my", name: "Burmese" },
|
|
1798
|
+
{ code: "bo", name: "Tibetan" },
|
|
1799
|
+
{ code: "tl", name: "Tagalog" },
|
|
1800
|
+
{ code: "mg", name: "Malagasy" },
|
|
1801
|
+
{ code: "as", name: "Assamese" },
|
|
1802
|
+
{ code: "tt", name: "Tatar" },
|
|
1803
|
+
{ code: "haw", name: "Hawaiian" },
|
|
1804
|
+
{ code: "ln", name: "Lingala" },
|
|
1805
|
+
{ code: "ha", name: "Hausa" },
|
|
1806
|
+
{ code: "ba", name: "Bashkir" },
|
|
1807
|
+
{ code: "jw", name: "Javanese" },
|
|
1808
|
+
{ code: "su", name: "Sundanese" }
|
|
1809
|
+
];
|
|
1810
|
+
var ElevenLabsLanguageCodes = [
|
|
1811
|
+
"en",
|
|
1812
|
+
"zh",
|
|
1813
|
+
"de",
|
|
1814
|
+
"es",
|
|
1815
|
+
"ru",
|
|
1816
|
+
"ko",
|
|
1817
|
+
"fr",
|
|
1818
|
+
"ja",
|
|
1819
|
+
"pt",
|
|
1820
|
+
"tr",
|
|
1821
|
+
"pl",
|
|
1822
|
+
"ca",
|
|
1823
|
+
"nl",
|
|
1824
|
+
"ar",
|
|
1825
|
+
"sv",
|
|
1826
|
+
"it",
|
|
1827
|
+
"id",
|
|
1828
|
+
"hi",
|
|
1829
|
+
"fi",
|
|
1830
|
+
"vi",
|
|
1831
|
+
"he",
|
|
1832
|
+
"uk",
|
|
1833
|
+
"el",
|
|
1834
|
+
"ms",
|
|
1835
|
+
"cs",
|
|
1836
|
+
"ro",
|
|
1837
|
+
"da",
|
|
1838
|
+
"hu",
|
|
1839
|
+
"ta",
|
|
1840
|
+
"no",
|
|
1841
|
+
"th",
|
|
1842
|
+
"ur",
|
|
1843
|
+
"hr",
|
|
1844
|
+
"bg",
|
|
1845
|
+
"lt",
|
|
1846
|
+
"la",
|
|
1847
|
+
"mi",
|
|
1848
|
+
"ml",
|
|
1849
|
+
"cy",
|
|
1850
|
+
"sk",
|
|
1851
|
+
"te",
|
|
1852
|
+
"fa",
|
|
1853
|
+
"lv",
|
|
1854
|
+
"bn",
|
|
1855
|
+
"sr",
|
|
1856
|
+
"az",
|
|
1857
|
+
"sl",
|
|
1858
|
+
"kn",
|
|
1859
|
+
"et",
|
|
1860
|
+
"mk",
|
|
1861
|
+
"br",
|
|
1862
|
+
"eu",
|
|
1863
|
+
"is",
|
|
1864
|
+
"hy",
|
|
1865
|
+
"ne",
|
|
1866
|
+
"mn",
|
|
1867
|
+
"bs",
|
|
1868
|
+
"kk",
|
|
1869
|
+
"sq",
|
|
1870
|
+
"sw",
|
|
1871
|
+
"gl",
|
|
1872
|
+
"mr",
|
|
1873
|
+
"pa",
|
|
1874
|
+
"si",
|
|
1875
|
+
"km",
|
|
1876
|
+
"sn",
|
|
1877
|
+
"yo",
|
|
1878
|
+
"so",
|
|
1879
|
+
"af",
|
|
1880
|
+
"oc",
|
|
1881
|
+
"ka",
|
|
1882
|
+
"be",
|
|
1883
|
+
"tg",
|
|
1884
|
+
"sd",
|
|
1885
|
+
"gu",
|
|
1886
|
+
"am",
|
|
1887
|
+
"yi",
|
|
1888
|
+
"lo",
|
|
1889
|
+
"uz",
|
|
1890
|
+
"fo",
|
|
1891
|
+
"ht",
|
|
1892
|
+
"ps",
|
|
1893
|
+
"tk",
|
|
1894
|
+
"nn",
|
|
1895
|
+
"mt",
|
|
1896
|
+
"sa",
|
|
1897
|
+
"lb",
|
|
1898
|
+
"my",
|
|
1899
|
+
"bo",
|
|
1900
|
+
"tl",
|
|
1901
|
+
"mg",
|
|
1902
|
+
"as",
|
|
1903
|
+
"tt",
|
|
1904
|
+
"haw",
|
|
1905
|
+
"ln",
|
|
1906
|
+
"ha",
|
|
1907
|
+
"ba",
|
|
1908
|
+
"jw",
|
|
1909
|
+
"su"
|
|
1910
|
+
];
|
|
1911
|
+
var ElevenLabsLanguageLabels = {
|
|
1912
|
+
en: "English",
|
|
1913
|
+
zh: "Chinese",
|
|
1914
|
+
de: "German",
|
|
1915
|
+
es: "Spanish",
|
|
1916
|
+
ru: "Russian",
|
|
1917
|
+
ko: "Korean",
|
|
1918
|
+
fr: "French",
|
|
1919
|
+
ja: "Japanese",
|
|
1920
|
+
pt: "Portuguese",
|
|
1921
|
+
tr: "Turkish",
|
|
1922
|
+
pl: "Polish",
|
|
1923
|
+
ca: "Catalan",
|
|
1924
|
+
nl: "Dutch",
|
|
1925
|
+
ar: "Arabic",
|
|
1926
|
+
sv: "Swedish",
|
|
1927
|
+
it: "Italian",
|
|
1928
|
+
id: "Indonesian",
|
|
1929
|
+
hi: "Hindi",
|
|
1930
|
+
fi: "Finnish",
|
|
1931
|
+
vi: "Vietnamese",
|
|
1932
|
+
he: "Hebrew",
|
|
1933
|
+
uk: "Ukrainian",
|
|
1934
|
+
el: "Greek",
|
|
1935
|
+
ms: "Malay",
|
|
1936
|
+
cs: "Czech",
|
|
1937
|
+
ro: "Romanian",
|
|
1938
|
+
da: "Danish",
|
|
1939
|
+
hu: "Hungarian",
|
|
1940
|
+
ta: "Tamil",
|
|
1941
|
+
no: "Norwegian",
|
|
1942
|
+
th: "Thai",
|
|
1943
|
+
ur: "Urdu",
|
|
1944
|
+
hr: "Croatian",
|
|
1945
|
+
bg: "Bulgarian",
|
|
1946
|
+
lt: "Lithuanian",
|
|
1947
|
+
la: "Latin",
|
|
1948
|
+
mi: "Maori",
|
|
1949
|
+
ml: "Malayalam",
|
|
1950
|
+
cy: "Welsh",
|
|
1951
|
+
sk: "Slovak",
|
|
1952
|
+
te: "Telugu",
|
|
1953
|
+
fa: "Persian",
|
|
1954
|
+
lv: "Latvian",
|
|
1955
|
+
bn: "Bengali",
|
|
1956
|
+
sr: "Serbian",
|
|
1957
|
+
az: "Azerbaijani",
|
|
1958
|
+
sl: "Slovenian",
|
|
1959
|
+
kn: "Kannada",
|
|
1960
|
+
et: "Estonian",
|
|
1961
|
+
mk: "Macedonian",
|
|
1962
|
+
br: "Breton",
|
|
1963
|
+
eu: "Basque",
|
|
1964
|
+
is: "Icelandic",
|
|
1965
|
+
hy: "Armenian",
|
|
1966
|
+
ne: "Nepali",
|
|
1967
|
+
mn: "Mongolian",
|
|
1968
|
+
bs: "Bosnian",
|
|
1969
|
+
kk: "Kazakh",
|
|
1970
|
+
sq: "Albanian",
|
|
1971
|
+
sw: "Swahili",
|
|
1972
|
+
gl: "Galician",
|
|
1973
|
+
mr: "Marathi",
|
|
1974
|
+
pa: "Punjabi",
|
|
1975
|
+
si: "Sinhala",
|
|
1976
|
+
km: "Khmer",
|
|
1977
|
+
sn: "Shona",
|
|
1978
|
+
yo: "Yoruba",
|
|
1979
|
+
so: "Somali",
|
|
1980
|
+
af: "Afrikaans",
|
|
1981
|
+
oc: "Occitan",
|
|
1982
|
+
ka: "Georgian",
|
|
1983
|
+
be: "Belarusian",
|
|
1984
|
+
tg: "Tajik",
|
|
1985
|
+
sd: "Sindhi",
|
|
1986
|
+
gu: "Gujarati",
|
|
1987
|
+
am: "Amharic",
|
|
1988
|
+
yi: "Yiddish",
|
|
1989
|
+
lo: "Lao",
|
|
1990
|
+
uz: "Uzbek",
|
|
1991
|
+
fo: "Faroese",
|
|
1992
|
+
ht: "Haitian Creole",
|
|
1993
|
+
ps: "Pashto",
|
|
1994
|
+
tk: "Turkmen",
|
|
1995
|
+
nn: "Norwegian Nynorsk",
|
|
1996
|
+
mt: "Maltese",
|
|
1997
|
+
sa: "Sanskrit",
|
|
1998
|
+
lb: "Luxembourgish",
|
|
1999
|
+
my: "Burmese",
|
|
2000
|
+
bo: "Tibetan",
|
|
2001
|
+
tl: "Tagalog",
|
|
2002
|
+
mg: "Malagasy",
|
|
2003
|
+
as: "Assamese",
|
|
2004
|
+
tt: "Tatar",
|
|
2005
|
+
haw: "Hawaiian",
|
|
2006
|
+
ln: "Lingala",
|
|
2007
|
+
ha: "Hausa",
|
|
2008
|
+
ba: "Bashkir",
|
|
2009
|
+
jw: "Javanese",
|
|
2010
|
+
su: "Sundanese"
|
|
2011
|
+
};
|
|
2012
|
+
|
|
1674
2013
|
// src/generated/gladia/schema/streamingSupportedBitDepthEnum.ts
|
|
1675
2014
|
var StreamingSupportedBitDepthEnum = {
|
|
1676
2015
|
NUMBER_8: 8,
|
|
@@ -2212,6 +2551,16 @@ var DeepgramSampleRate = {
|
|
|
2212
2551
|
NUMBER_44100: 44100,
|
|
2213
2552
|
NUMBER_48000: 48e3
|
|
2214
2553
|
};
|
|
2554
|
+
var ElevenLabsRegion = {
|
|
2555
|
+
/** Global endpoint (default) */
|
|
2556
|
+
global: "global",
|
|
2557
|
+
/** United States */
|
|
2558
|
+
us: "us",
|
|
2559
|
+
/** European Union */
|
|
2560
|
+
eu: "eu",
|
|
2561
|
+
/** India */
|
|
2562
|
+
in: "in"
|
|
2563
|
+
};
|
|
2215
2564
|
var GladiaEncoding = StreamingSupportedEncodingEnum;
|
|
2216
2565
|
var GladiaSampleRate = StreamingSupportedSampleRateEnum;
|
|
2217
2566
|
var GladiaBitDepth = StreamingSupportedBitDepthEnum;
|
|
@@ -2422,6 +2771,20 @@ var BaseAdapter = class {
|
|
|
2422
2771
|
throw new Error(`API key is required for ${this.name} provider`);
|
|
2423
2772
|
}
|
|
2424
2773
|
}
|
|
2774
|
+
/**
|
|
2775
|
+
* Derive a WebSocket URL from an HTTP base URL
|
|
2776
|
+
*
|
|
2777
|
+
* Converts `https://` → `wss://` and `http://` → `ws://`
|
|
2778
|
+
*/
|
|
2779
|
+
deriveWsUrl(httpUrl) {
|
|
2780
|
+
if (httpUrl.startsWith("https://")) {
|
|
2781
|
+
return httpUrl.replace(/^https:\/\//, "wss://");
|
|
2782
|
+
}
|
|
2783
|
+
if (httpUrl.startsWith("http://")) {
|
|
2784
|
+
return httpUrl.replace(/^http:\/\//, "ws://");
|
|
2785
|
+
}
|
|
2786
|
+
return httpUrl;
|
|
2787
|
+
}
|
|
2425
2788
|
/**
|
|
2426
2789
|
* Build axios config for generated API client functions
|
|
2427
2790
|
*
|
|
@@ -2616,6 +2979,70 @@ function extractWords(words, mapper) {
|
|
|
2616
2979
|
const normalizedWords = words.map(mapper);
|
|
2617
2980
|
return normalizedWords.length > 0 ? normalizedWords : void 0;
|
|
2618
2981
|
}
|
|
2982
|
+
function buildUtterancesFromWords(words) {
|
|
2983
|
+
const utterances = [];
|
|
2984
|
+
let currentSpeaker;
|
|
2985
|
+
let currentWords = [];
|
|
2986
|
+
let utteranceStart = 0;
|
|
2987
|
+
for (const word of words) {
|
|
2988
|
+
if (!word.speaker) continue;
|
|
2989
|
+
if (word.speaker !== currentSpeaker) {
|
|
2990
|
+
if (currentSpeaker && currentWords.length > 0) {
|
|
2991
|
+
utterances.push({
|
|
2992
|
+
text: currentWords.map((w) => w.word).join(" "),
|
|
2993
|
+
start: utteranceStart,
|
|
2994
|
+
end: currentWords[currentWords.length - 1].end,
|
|
2995
|
+
speaker: currentSpeaker,
|
|
2996
|
+
words: currentWords
|
|
2997
|
+
});
|
|
2998
|
+
}
|
|
2999
|
+
currentSpeaker = word.speaker;
|
|
3000
|
+
currentWords = [word];
|
|
3001
|
+
utteranceStart = word.start;
|
|
3002
|
+
} else {
|
|
3003
|
+
currentWords.push(word);
|
|
3004
|
+
}
|
|
3005
|
+
}
|
|
3006
|
+
if (currentSpeaker && currentWords.length > 0) {
|
|
3007
|
+
utterances.push({
|
|
3008
|
+
text: currentWords.map((w) => w.word).join(" "),
|
|
3009
|
+
start: utteranceStart,
|
|
3010
|
+
end: currentWords[currentWords.length - 1].end,
|
|
3011
|
+
speaker: currentSpeaker,
|
|
3012
|
+
words: currentWords
|
|
3013
|
+
});
|
|
3014
|
+
}
|
|
3015
|
+
return utterances;
|
|
3016
|
+
}
|
|
3017
|
+
function buildTextFromSpeechmaticsResults(results) {
|
|
3018
|
+
const parts = [];
|
|
3019
|
+
let attachNext = false;
|
|
3020
|
+
for (const result of results) {
|
|
3021
|
+
if (result.type !== "word" && result.type !== "punctuation") continue;
|
|
3022
|
+
const content = result.alternatives?.[0]?.content;
|
|
3023
|
+
if (!content) continue;
|
|
3024
|
+
if (result.type === "punctuation") {
|
|
3025
|
+
const attaches = result.attaches_to;
|
|
3026
|
+
if (attaches === "previous" || attaches === "both") {
|
|
3027
|
+
parts.push(content);
|
|
3028
|
+
attachNext = attaches === "both";
|
|
3029
|
+
} else if (attaches === "next") {
|
|
3030
|
+
if (parts.length > 0) parts.push(" ");
|
|
3031
|
+
parts.push(content);
|
|
3032
|
+
attachNext = true;
|
|
3033
|
+
} else {
|
|
3034
|
+
if (parts.length > 0 && !attachNext) parts.push(" ");
|
|
3035
|
+
parts.push(content);
|
|
3036
|
+
attachNext = false;
|
|
3037
|
+
}
|
|
3038
|
+
} else {
|
|
3039
|
+
if (parts.length > 0 && !attachNext) parts.push(" ");
|
|
3040
|
+
parts.push(content);
|
|
3041
|
+
attachNext = false;
|
|
3042
|
+
}
|
|
3043
|
+
}
|
|
3044
|
+
return parts.join("");
|
|
3045
|
+
}
|
|
2619
3046
|
var STATUS_MAPPINGS = {
|
|
2620
3047
|
gladia: {
|
|
2621
3048
|
queued: "queued",
|
|
@@ -4226,7 +4653,8 @@ var GladiaAdapter = class extends BaseAdapter {
|
|
|
4226
4653
|
options?.region ? { region: options.region } : void 0,
|
|
4227
4654
|
this.getAxiosConfig()
|
|
4228
4655
|
);
|
|
4229
|
-
const { id, url:
|
|
4656
|
+
const { id, url: apiWsUrl } = initResponse.data;
|
|
4657
|
+
const wsUrl = this.config?.wsBaseUrl || apiWsUrl;
|
|
4230
4658
|
const ws = new WebSocket2(wsUrl);
|
|
4231
4659
|
let sessionStatus = "connecting";
|
|
4232
4660
|
setupWebSocketHandlers(ws, callbacks, (status) => {
|
|
@@ -4956,6 +5384,14 @@ var AssemblyAIAdapter = class extends BaseAdapter {
|
|
|
4956
5384
|
this.wsBaseUrl = "wss://streaming.assemblyai.com/v3/ws";
|
|
4957
5385
|
}
|
|
4958
5386
|
// v3 Universal Streaming endpoint
|
|
5387
|
+
initialize(config) {
|
|
5388
|
+
super.initialize(config);
|
|
5389
|
+
if (config.wsBaseUrl) {
|
|
5390
|
+
this.wsBaseUrl = config.wsBaseUrl;
|
|
5391
|
+
} else if (config.baseUrl) {
|
|
5392
|
+
this.wsBaseUrl = `${this.deriveWsUrl(config.baseUrl)}/v3/ws`;
|
|
5393
|
+
}
|
|
5394
|
+
}
|
|
4959
5395
|
/**
|
|
4960
5396
|
* Get axios config for generated API client functions
|
|
4961
5397
|
* Configures headers and base URL using authorization header
|
|
@@ -5867,7 +6303,7 @@ var DeepgramAdapter = class extends BaseAdapter {
|
|
|
5867
6303
|
this.projectId = config.projectId;
|
|
5868
6304
|
const host = this.getRegionalHost(config.region);
|
|
5869
6305
|
this.baseUrl = config.baseUrl || `https://${host}/v1`;
|
|
5870
|
-
this.wsBaseUrl = `wss://${host}/v1/listen
|
|
6306
|
+
this.wsBaseUrl = config.wsBaseUrl || (config.baseUrl ? `${this.deriveWsUrl(config.baseUrl)}/listen` : `wss://${host}/v1/listen`);
|
|
5871
6307
|
this.client = axios3.create({
|
|
5872
6308
|
baseURL: this.baseUrl,
|
|
5873
6309
|
timeout: config.timeout || 6e4,
|
|
@@ -5902,9 +6338,13 @@ var DeepgramAdapter = class extends BaseAdapter {
|
|
|
5902
6338
|
*/
|
|
5903
6339
|
setRegion(region) {
|
|
5904
6340
|
this.validateConfig();
|
|
5905
|
-
|
|
5906
|
-
|
|
5907
|
-
|
|
6341
|
+
if (!this.config.baseUrl) {
|
|
6342
|
+
const host = this.getRegionalHost(region);
|
|
6343
|
+
this.baseUrl = `https://${host}/v1`;
|
|
6344
|
+
if (!this.config.wsBaseUrl) {
|
|
6345
|
+
this.wsBaseUrl = `wss://${host}/v1/listen`;
|
|
6346
|
+
}
|
|
6347
|
+
}
|
|
5908
6348
|
this.client = axios3.create({
|
|
5909
6349
|
baseURL: this.baseUrl,
|
|
5910
6350
|
timeout: this.config.timeout || 6e4,
|
|
@@ -6219,7 +6659,7 @@ var DeepgramAdapter = class extends BaseAdapter {
|
|
|
6219
6659
|
start: w.start || 0,
|
|
6220
6660
|
end: w.end || 0,
|
|
6221
6661
|
confidence: w.confidence
|
|
6222
|
-
}))
|
|
6662
|
+
})) ?? []
|
|
6223
6663
|
}));
|
|
6224
6664
|
}
|
|
6225
6665
|
/**
|
|
@@ -6628,7 +7068,7 @@ var DeepgramAdapter = class extends BaseAdapter {
|
|
|
6628
7068
|
start: w.start,
|
|
6629
7069
|
end: w.end,
|
|
6630
7070
|
confidence: w.confidence
|
|
6631
|
-
}))
|
|
7071
|
+
})) ?? []
|
|
6632
7072
|
});
|
|
6633
7073
|
}
|
|
6634
7074
|
break;
|
|
@@ -7867,7 +8307,8 @@ var OpenAIWhisperAdapter = class extends BaseAdapter {
|
|
|
7867
8307
|
callbacks?.onUtterance?.({
|
|
7868
8308
|
text: transcription.transcript,
|
|
7869
8309
|
start: 0,
|
|
7870
|
-
end: 0
|
|
8310
|
+
end: 0,
|
|
8311
|
+
words: []
|
|
7871
8312
|
});
|
|
7872
8313
|
break;
|
|
7873
8314
|
}
|
|
@@ -7930,7 +8371,8 @@ var OpenAIWhisperAdapter = class extends BaseAdapter {
|
|
|
7930
8371
|
text: segment.text,
|
|
7931
8372
|
start: segment.start,
|
|
7932
8373
|
end: segment.end,
|
|
7933
|
-
confidence: void 0
|
|
8374
|
+
confidence: void 0,
|
|
8375
|
+
words: []
|
|
7934
8376
|
}));
|
|
7935
8377
|
const requestId2 = `openai-${Date.now()}`;
|
|
7936
8378
|
return {
|
|
@@ -8296,7 +8738,7 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
|
|
|
8296
8738
|
* Normalize Speechmatics response to unified format
|
|
8297
8739
|
*/
|
|
8298
8740
|
normalizeResponse(response) {
|
|
8299
|
-
const text = response.results
|
|
8741
|
+
const text = buildTextFromSpeechmaticsResults(response.results);
|
|
8300
8742
|
const words = response.results.filter((r) => r.type === "word" && r.start_time !== void 0 && r.end_time !== void 0).map((result) => ({
|
|
8301
8743
|
word: result.alternatives?.[0]?.content || "",
|
|
8302
8744
|
start: result.start_time,
|
|
@@ -8305,51 +8747,14 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
|
|
|
8305
8747
|
speaker: result.alternatives?.[0]?.speaker
|
|
8306
8748
|
}));
|
|
8307
8749
|
const speakerSet = /* @__PURE__ */ new Set();
|
|
8308
|
-
|
|
8309
|
-
if (
|
|
8310
|
-
const speaker = r.alternatives[0]?.speaker;
|
|
8311
|
-
if (speaker) speakerSet.add(speaker);
|
|
8312
|
-
}
|
|
8750
|
+
words.forEach((w) => {
|
|
8751
|
+
if (w.speaker) speakerSet.add(w.speaker);
|
|
8313
8752
|
});
|
|
8314
8753
|
const speakers = speakerSet.size > 0 ? Array.from(speakerSet).map((id) => ({
|
|
8315
8754
|
id,
|
|
8316
8755
|
label: `Speaker ${id}`
|
|
8317
8756
|
})) : void 0;
|
|
8318
|
-
const utterances =
|
|
8319
|
-
if (speakers) {
|
|
8320
|
-
let currentSpeaker;
|
|
8321
|
-
let currentUtterance = [];
|
|
8322
|
-
let utteranceStart = 0;
|
|
8323
|
-
response.results.filter((r) => r.type === "word" && r.alternatives).forEach((result, idx) => {
|
|
8324
|
-
const speaker = result.alternatives[0]?.speaker;
|
|
8325
|
-
const word = result.alternatives[0]?.content || "";
|
|
8326
|
-
if (speaker !== currentSpeaker) {
|
|
8327
|
-
if (currentSpeaker && currentUtterance.length > 0) {
|
|
8328
|
-
const prevResult = response.results.filter((r) => r.type === "word")[idx - 1];
|
|
8329
|
-
utterances.push({
|
|
8330
|
-
speaker: currentSpeaker,
|
|
8331
|
-
text: currentUtterance.join(" "),
|
|
8332
|
-
start: utteranceStart || 0,
|
|
8333
|
-
end: prevResult?.end_time || result.start_time || 0
|
|
8334
|
-
});
|
|
8335
|
-
}
|
|
8336
|
-
currentSpeaker = speaker;
|
|
8337
|
-
currentUtterance = [word];
|
|
8338
|
-
utteranceStart = result.start_time || 0;
|
|
8339
|
-
} else {
|
|
8340
|
-
currentUtterance.push(word);
|
|
8341
|
-
}
|
|
8342
|
-
});
|
|
8343
|
-
if (currentSpeaker && currentUtterance.length > 0) {
|
|
8344
|
-
const lastWord = response.results.filter((r) => r.type === "word").pop();
|
|
8345
|
-
utterances.push({
|
|
8346
|
-
speaker: currentSpeaker,
|
|
8347
|
-
text: currentUtterance.join(" "),
|
|
8348
|
-
start: utteranceStart,
|
|
8349
|
-
end: lastWord?.end_time || utteranceStart
|
|
8350
|
-
});
|
|
8351
|
-
}
|
|
8352
|
-
}
|
|
8757
|
+
const utterances = buildUtterancesFromWords(words);
|
|
8353
8758
|
return {
|
|
8354
8759
|
success: true,
|
|
8355
8760
|
provider: this.name,
|
|
@@ -8447,6 +8852,7 @@ var SonioxAdapter = class extends BaseAdapter {
|
|
|
8447
8852
|
* Get the base URL for API requests
|
|
8448
8853
|
*/
|
|
8449
8854
|
get baseUrl() {
|
|
8855
|
+
if (this.config?.baseUrl) return this.config.baseUrl;
|
|
8450
8856
|
return `https://${this.getRegionalHost()}/v1`;
|
|
8451
8857
|
}
|
|
8452
8858
|
initialize(config) {
|
|
@@ -8610,7 +9016,8 @@ var SonioxAdapter = class extends BaseAdapter {
|
|
|
8610
9016
|
this.validateConfig();
|
|
8611
9017
|
const sessionId = `soniox_${Date.now()}_${Math.random().toString(36).substring(7)}`;
|
|
8612
9018
|
const createdAt = /* @__PURE__ */ new Date();
|
|
8613
|
-
const
|
|
9019
|
+
const wsBase = this.config?.wsBaseUrl || (this.config?.baseUrl ? this.deriveWsUrl(this.config.baseUrl) : `wss://${this.getRegionalWsHost()}`);
|
|
9020
|
+
const wsUrl = new URL(`${wsBase}/transcribe-websocket`);
|
|
8614
9021
|
wsUrl.searchParams.set("api_key", this.config.apiKey);
|
|
8615
9022
|
const modelId = options?.sonioxStreaming?.model || options?.model || "stt-rt-preview";
|
|
8616
9023
|
wsUrl.searchParams.set("model", modelId);
|
|
@@ -8869,45 +9276,14 @@ var SonioxAdapter = class extends BaseAdapter {
|
|
|
8869
9276
|
* Build utterances from tokens based on speaker changes
|
|
8870
9277
|
*/
|
|
8871
9278
|
buildUtterancesFromTokens(tokens) {
|
|
8872
|
-
const
|
|
8873
|
-
|
|
8874
|
-
|
|
8875
|
-
|
|
8876
|
-
|
|
8877
|
-
|
|
8878
|
-
|
|
8879
|
-
|
|
8880
|
-
end: token.end_ms ? token.end_ms / 1e3 : 0,
|
|
8881
|
-
confidence: token.confidence,
|
|
8882
|
-
speaker: token.speaker
|
|
8883
|
-
};
|
|
8884
|
-
if (token.speaker !== currentSpeaker) {
|
|
8885
|
-
if (currentSpeaker && currentWords.length > 0) {
|
|
8886
|
-
utterances.push({
|
|
8887
|
-
text: currentWords.map((w) => w.word).join(" "),
|
|
8888
|
-
start: utteranceStart,
|
|
8889
|
-
end: currentWords[currentWords.length - 1].end,
|
|
8890
|
-
speaker: currentSpeaker,
|
|
8891
|
-
words: currentWords
|
|
8892
|
-
});
|
|
8893
|
-
}
|
|
8894
|
-
currentSpeaker = token.speaker;
|
|
8895
|
-
currentWords = [word];
|
|
8896
|
-
utteranceStart = word.start;
|
|
8897
|
-
} else {
|
|
8898
|
-
currentWords.push(word);
|
|
8899
|
-
}
|
|
8900
|
-
}
|
|
8901
|
-
if (currentSpeaker && currentWords.length > 0) {
|
|
8902
|
-
utterances.push({
|
|
8903
|
-
text: currentWords.map((w) => w.word).join(" "),
|
|
8904
|
-
start: utteranceStart,
|
|
8905
|
-
end: currentWords[currentWords.length - 1].end,
|
|
8906
|
-
speaker: currentSpeaker,
|
|
8907
|
-
words: currentWords
|
|
8908
|
-
});
|
|
8909
|
-
}
|
|
8910
|
-
return utterances;
|
|
9279
|
+
const words = tokens.map((token) => ({
|
|
9280
|
+
word: token.text,
|
|
9281
|
+
start: token.start_ms ? token.start_ms / 1e3 : 0,
|
|
9282
|
+
end: token.end_ms ? token.end_ms / 1e3 : 0,
|
|
9283
|
+
confidence: token.confidence,
|
|
9284
|
+
speaker: token.speaker
|
|
9285
|
+
}));
|
|
9286
|
+
return buildUtterancesFromWords(words);
|
|
8911
9287
|
}
|
|
8912
9288
|
/**
|
|
8913
9289
|
* Normalize Soniox response to unified format
|
|
@@ -8931,7 +9307,7 @@ var SonioxAdapter = class extends BaseAdapter {
|
|
|
8931
9307
|
id,
|
|
8932
9308
|
label: `Speaker ${id}`
|
|
8933
9309
|
})) : void 0;
|
|
8934
|
-
const utterances = response.tokens ? this.buildUtterancesFromTokens(response.tokens) : [];
|
|
9310
|
+
const utterances = response.tokens ? this.buildUtterancesFromTokens(response.tokens.filter((t) => t.is_final)) : [];
|
|
8935
9311
|
const language = response.tokens?.find((t) => t.language)?.language;
|
|
8936
9312
|
return {
|
|
8937
9313
|
success: true,
|
|
@@ -8960,6 +9336,501 @@ function createSonioxAdapter(config) {
|
|
|
8960
9336
|
return adapter;
|
|
8961
9337
|
}
|
|
8962
9338
|
|
|
9339
|
+
// src/adapters/elevenlabs-adapter.ts
|
|
9340
|
+
import axios10 from "axios";
|
|
9341
|
+
var ElevenLabsAdapter = class extends BaseAdapter {
|
|
9342
|
+
constructor() {
|
|
9343
|
+
super(...arguments);
|
|
9344
|
+
this.name = "elevenlabs";
|
|
9345
|
+
this.capabilities = {
|
|
9346
|
+
streaming: true,
|
|
9347
|
+
diarization: true,
|
|
9348
|
+
wordTimestamps: true,
|
|
9349
|
+
languageDetection: true,
|
|
9350
|
+
customVocabulary: true,
|
|
9351
|
+
summarization: false,
|
|
9352
|
+
sentimentAnalysis: false,
|
|
9353
|
+
entityDetection: true,
|
|
9354
|
+
piiRedaction: true,
|
|
9355
|
+
listTranscripts: false,
|
|
9356
|
+
deleteTranscript: false
|
|
9357
|
+
};
|
|
9358
|
+
this.region = ElevenLabsRegion.global;
|
|
9359
|
+
this.defaultModel = "scribe_v2";
|
|
9360
|
+
}
|
|
9361
|
+
/**
|
|
9362
|
+
* Get regional API host based on configured region
|
|
9363
|
+
*/
|
|
9364
|
+
getRegionalHost() {
|
|
9365
|
+
switch (this.region) {
|
|
9366
|
+
case ElevenLabsRegion.us:
|
|
9367
|
+
return "api.us.elevenlabs.io";
|
|
9368
|
+
case ElevenLabsRegion.eu:
|
|
9369
|
+
return "api.eu.residency.elevenlabs.io";
|
|
9370
|
+
case ElevenLabsRegion.in:
|
|
9371
|
+
return "api.in.residency.elevenlabs.io";
|
|
9372
|
+
case ElevenLabsRegion.global:
|
|
9373
|
+
default:
|
|
9374
|
+
return "api.elevenlabs.io";
|
|
9375
|
+
}
|
|
9376
|
+
}
|
|
9377
|
+
/**
|
|
9378
|
+
* Get the base URL for API requests
|
|
9379
|
+
*/
|
|
9380
|
+
get baseUrl() {
|
|
9381
|
+
if (this.config?.baseUrl) return this.config.baseUrl;
|
|
9382
|
+
return `https://${this.getRegionalHost()}`;
|
|
9383
|
+
}
|
|
9384
|
+
initialize(config) {
|
|
9385
|
+
super.initialize(config);
|
|
9386
|
+
if (config.region) {
|
|
9387
|
+
this.region = config.region;
|
|
9388
|
+
}
|
|
9389
|
+
if (config.model) {
|
|
9390
|
+
this.defaultModel = config.model;
|
|
9391
|
+
}
|
|
9392
|
+
this.client = axios10.create({
|
|
9393
|
+
baseURL: this.baseUrl,
|
|
9394
|
+
timeout: config.timeout || 12e4,
|
|
9395
|
+
headers: {
|
|
9396
|
+
"xi-api-key": config.apiKey,
|
|
9397
|
+
...config.headers
|
|
9398
|
+
}
|
|
9399
|
+
});
|
|
9400
|
+
}
|
|
9401
|
+
/**
|
|
9402
|
+
* Get current region
|
|
9403
|
+
*/
|
|
9404
|
+
getRegion() {
|
|
9405
|
+
return this.region;
|
|
9406
|
+
}
|
|
9407
|
+
/**
|
|
9408
|
+
* Set regional endpoint
|
|
9409
|
+
*/
|
|
9410
|
+
setRegion(region) {
|
|
9411
|
+
this.region = region;
|
|
9412
|
+
if (this.config?.apiKey) {
|
|
9413
|
+
this.client = axios10.create({
|
|
9414
|
+
baseURL: this.baseUrl,
|
|
9415
|
+
timeout: this.config.timeout || 12e4,
|
|
9416
|
+
headers: {
|
|
9417
|
+
"xi-api-key": this.config.apiKey,
|
|
9418
|
+
...this.config.headers
|
|
9419
|
+
}
|
|
9420
|
+
});
|
|
9421
|
+
}
|
|
9422
|
+
}
|
|
9423
|
+
/**
|
|
9424
|
+
* Submit audio for transcription
|
|
9425
|
+
*
|
|
9426
|
+
* ElevenLabs batch is synchronous - the API returns the result directly.
|
|
9427
|
+
*/
|
|
9428
|
+
async transcribe(audio, options) {
|
|
9429
|
+
this.validateConfig();
|
|
9430
|
+
try {
|
|
9431
|
+
const formData = new FormData();
|
|
9432
|
+
const modelId = options?.model || this.defaultModel;
|
|
9433
|
+
formData.append("model_id", modelId);
|
|
9434
|
+
if (audio.type === "url") {
|
|
9435
|
+
formData.append("cloud_storage_url", audio.url);
|
|
9436
|
+
} else if (audio.type === "file") {
|
|
9437
|
+
const audioBlob = audio.file instanceof Blob ? audio.file : new Blob([audio.file], { type: audio.mimeType || "audio/wav" });
|
|
9438
|
+
formData.append("file", audioBlob, audio.filename || "audio.wav");
|
|
9439
|
+
} else {
|
|
9440
|
+
return {
|
|
9441
|
+
success: false,
|
|
9442
|
+
provider: this.name,
|
|
9443
|
+
error: {
|
|
9444
|
+
code: "INVALID_INPUT",
|
|
9445
|
+
message: "ElevenLabs only supports URL and File audio input"
|
|
9446
|
+
}
|
|
9447
|
+
};
|
|
9448
|
+
}
|
|
9449
|
+
if (options?.language) {
|
|
9450
|
+
formData.append("language_code", options.language);
|
|
9451
|
+
}
|
|
9452
|
+
if (options?.diarization) {
|
|
9453
|
+
formData.append("diarize", "true");
|
|
9454
|
+
}
|
|
9455
|
+
formData.append("timestamps_granularity", "word");
|
|
9456
|
+
if (options?.speakersExpected) {
|
|
9457
|
+
formData.append("num_speakers", String(options.speakersExpected));
|
|
9458
|
+
}
|
|
9459
|
+
if (options?.customVocabulary && options.customVocabulary.length > 0) {
|
|
9460
|
+
for (const term of options.customVocabulary) {
|
|
9461
|
+
formData.append("keyterms", term);
|
|
9462
|
+
}
|
|
9463
|
+
}
|
|
9464
|
+
if (options?.entityDetection) {
|
|
9465
|
+
formData.append("entity_detection", "all");
|
|
9466
|
+
}
|
|
9467
|
+
const elevenlabsOpts = options?.elevenlabs;
|
|
9468
|
+
if (elevenlabsOpts) {
|
|
9469
|
+
for (const [key, value] of Object.entries(elevenlabsOpts)) {
|
|
9470
|
+
if (value === void 0 || value === null) continue;
|
|
9471
|
+
if (formData.has(key)) continue;
|
|
9472
|
+
if (typeof value === "boolean") {
|
|
9473
|
+
formData.append(key, String(value));
|
|
9474
|
+
} else if (Array.isArray(value)) {
|
|
9475
|
+
for (const item of value) {
|
|
9476
|
+
formData.append(key, typeof item === "object" ? JSON.stringify(item) : String(item));
|
|
9477
|
+
}
|
|
9478
|
+
} else if (typeof value === "object") {
|
|
9479
|
+
formData.append(key, JSON.stringify(value));
|
|
9480
|
+
} else {
|
|
9481
|
+
formData.append(key, String(value));
|
|
9482
|
+
}
|
|
9483
|
+
}
|
|
9484
|
+
}
|
|
9485
|
+
const response = await this.client.post("/v1/speech-to-text", formData, {
|
|
9486
|
+
headers: {
|
|
9487
|
+
"Content-Type": "multipart/form-data"
|
|
9488
|
+
}
|
|
9489
|
+
});
|
|
9490
|
+
return this.normalizeResponse(response.data);
|
|
9491
|
+
} catch (error) {
|
|
9492
|
+
return this.createErrorResponse(error);
|
|
9493
|
+
}
|
|
9494
|
+
}
|
|
9495
|
+
/**
|
|
9496
|
+
* Get transcription result by ID
|
|
9497
|
+
*
|
|
9498
|
+
* ElevenLabs batch is synchronous, but supports transcript retrieval.
|
|
9499
|
+
*/
|
|
9500
|
+
async getTranscript(transcriptId) {
|
|
9501
|
+
this.validateConfig();
|
|
9502
|
+
try {
|
|
9503
|
+
const response = await this.client.get(`/v1/speech-to-text/transcripts/${transcriptId}`);
|
|
9504
|
+
return this.normalizeResponse(response.data);
|
|
9505
|
+
} catch (error) {
|
|
9506
|
+
return this.createErrorResponse(error);
|
|
9507
|
+
}
|
|
9508
|
+
}
|
|
9509
|
+
/**
|
|
9510
|
+
* Stream audio for real-time transcription
|
|
9511
|
+
*
|
|
9512
|
+
* Creates a WebSocket connection to ElevenLabs realtime STT endpoint.
|
|
9513
|
+
* Audio is sent as base64-encoded JSON messages.
|
|
9514
|
+
*/
|
|
9515
|
+
async transcribeStream(options, callbacks) {
|
|
9516
|
+
this.validateConfig();
|
|
9517
|
+
const sessionId = `elevenlabs_${Date.now()}_${Math.random().toString(36).substring(7)}`;
|
|
9518
|
+
const createdAt = /* @__PURE__ */ new Date();
|
|
9519
|
+
const wsBase = this.config?.wsBaseUrl || (this.config?.baseUrl ? this.deriveWsUrl(this.config.baseUrl) : `wss://${this.getRegionalHost()}`);
|
|
9520
|
+
const wsUrl = new URL(`${wsBase}/v1/speech-to-text/realtime`);
|
|
9521
|
+
const elOpts = options?.elevenlabsStreaming;
|
|
9522
|
+
const modelId = elOpts?.model || "scribe_v2_realtime";
|
|
9523
|
+
wsUrl.searchParams.set("model_id", modelId);
|
|
9524
|
+
const audioFormat = elOpts?.audioFormat || "pcm_16000";
|
|
9525
|
+
wsUrl.searchParams.set("audio_format", audioFormat);
|
|
9526
|
+
const langCode = elOpts?.languageCode || options?.language;
|
|
9527
|
+
if (langCode) {
|
|
9528
|
+
wsUrl.searchParams.set("language_code", langCode);
|
|
9529
|
+
}
|
|
9530
|
+
if (elOpts?.includeTimestamps !== void 0) {
|
|
9531
|
+
wsUrl.searchParams.set("include_timestamps", String(elOpts.includeTimestamps));
|
|
9532
|
+
}
|
|
9533
|
+
if (elOpts?.includeLanguageDetection || options?.languageDetection) {
|
|
9534
|
+
wsUrl.searchParams.set("include_language_detection", "true");
|
|
9535
|
+
}
|
|
9536
|
+
if (elOpts?.commitStrategy) {
|
|
9537
|
+
wsUrl.searchParams.set("commit_strategy", elOpts.commitStrategy);
|
|
9538
|
+
}
|
|
9539
|
+
if (elOpts?.vadSilenceThresholdSecs !== void 0) {
|
|
9540
|
+
wsUrl.searchParams.set("vad_silence_threshold_secs", String(elOpts.vadSilenceThresholdSecs));
|
|
9541
|
+
}
|
|
9542
|
+
if (elOpts?.vadThreshold !== void 0) {
|
|
9543
|
+
wsUrl.searchParams.set("vad_threshold", String(elOpts.vadThreshold));
|
|
9544
|
+
}
|
|
9545
|
+
if (elOpts?.minSpeechDurationMs !== void 0) {
|
|
9546
|
+
wsUrl.searchParams.set("min_speech_duration_ms", String(elOpts.minSpeechDurationMs));
|
|
9547
|
+
}
|
|
9548
|
+
if (elOpts?.minSilenceDurationMs !== void 0) {
|
|
9549
|
+
wsUrl.searchParams.set("min_silence_duration_ms", String(elOpts.minSilenceDurationMs));
|
|
9550
|
+
}
|
|
9551
|
+
if (elOpts?.previousText) {
|
|
9552
|
+
wsUrl.searchParams.set("previous_text", elOpts.previousText);
|
|
9553
|
+
}
|
|
9554
|
+
if (!elOpts?.audioFormat && options?.encoding) {
|
|
9555
|
+
const encodingMap = {
|
|
9556
|
+
linear16: "pcm_16000",
|
|
9557
|
+
pcm: "pcm_16000",
|
|
9558
|
+
mulaw: "ulaw_8000"
|
|
9559
|
+
};
|
|
9560
|
+
const mappedFormat = encodingMap[options.encoding];
|
|
9561
|
+
if (mappedFormat) {
|
|
9562
|
+
wsUrl.searchParams.set("audio_format", mappedFormat);
|
|
9563
|
+
}
|
|
9564
|
+
}
|
|
9565
|
+
let status = "connecting";
|
|
9566
|
+
let openedAt = null;
|
|
9567
|
+
let receivedData = false;
|
|
9568
|
+
const WebSocketImpl = typeof WebSocket !== "undefined" ? WebSocket : __require("ws");
|
|
9569
|
+
const ws = new WebSocketImpl(wsUrl.toString(), {
|
|
9570
|
+
headers: {
|
|
9571
|
+
"xi-api-key": this.config.apiKey
|
|
9572
|
+
}
|
|
9573
|
+
});
|
|
9574
|
+
ws.onopen = () => {
|
|
9575
|
+
status = "open";
|
|
9576
|
+
openedAt = Date.now();
|
|
9577
|
+
callbacks?.onOpen?.();
|
|
9578
|
+
};
|
|
9579
|
+
ws.onmessage = (event) => {
|
|
9580
|
+
receivedData = true;
|
|
9581
|
+
const rawPayload = typeof event.data === "string" ? event.data : event.data.toString();
|
|
9582
|
+
let messageType;
|
|
9583
|
+
try {
|
|
9584
|
+
const data = JSON.parse(rawPayload);
|
|
9585
|
+
if (data.error) {
|
|
9586
|
+
messageType = "error";
|
|
9587
|
+
} else if (data.message_type === "session_started") {
|
|
9588
|
+
messageType = "session_started";
|
|
9589
|
+
} else if (data.message_type === "partial_transcript") {
|
|
9590
|
+
messageType = "partial_transcript";
|
|
9591
|
+
} else if (data.message_type === "committed_transcript") {
|
|
9592
|
+
messageType = "committed_transcript";
|
|
9593
|
+
} else if (data.message_type === "committed_transcript_with_timestamps") {
|
|
9594
|
+
messageType = "committed_transcript_with_timestamps";
|
|
9595
|
+
}
|
|
9596
|
+
if (callbacks?.onRawMessage) {
|
|
9597
|
+
callbacks.onRawMessage({
|
|
9598
|
+
provider: this.name,
|
|
9599
|
+
direction: "incoming",
|
|
9600
|
+
timestamp: Date.now(),
|
|
9601
|
+
payload: rawPayload,
|
|
9602
|
+
messageType
|
|
9603
|
+
});
|
|
9604
|
+
}
|
|
9605
|
+
if (data.error) {
|
|
9606
|
+
callbacks?.onError?.({
|
|
9607
|
+
code: data.error_code?.toString() || "STREAM_ERROR",
|
|
9608
|
+
message: data.error
|
|
9609
|
+
});
|
|
9610
|
+
return;
|
|
9611
|
+
}
|
|
9612
|
+
if (data.message_type === "session_started") {
|
|
9613
|
+
return;
|
|
9614
|
+
}
|
|
9615
|
+
if (data.message_type === "partial_transcript") {
|
|
9616
|
+
const streamEvent = {
|
|
9617
|
+
type: "transcript",
|
|
9618
|
+
text: data.text || "",
|
|
9619
|
+
isFinal: false,
|
|
9620
|
+
confidence: void 0,
|
|
9621
|
+
language: data.language_code
|
|
9622
|
+
};
|
|
9623
|
+
callbacks?.onTranscript?.(streamEvent);
|
|
9624
|
+
return;
|
|
9625
|
+
}
|
|
9626
|
+
if (data.message_type === "committed_transcript" || data.message_type === "committed_transcript_with_timestamps") {
|
|
9627
|
+
const words = data.words ? data.words.map((w) => ({
|
|
9628
|
+
word: w.text || "",
|
|
9629
|
+
start: w.start || 0,
|
|
9630
|
+
end: w.end || 0,
|
|
9631
|
+
confidence: w.logprob !== void 0 ? Math.exp(w.logprob) : void 0,
|
|
9632
|
+
speaker: w.speaker_id
|
|
9633
|
+
})) : [];
|
|
9634
|
+
const streamEvent = {
|
|
9635
|
+
type: "transcript",
|
|
9636
|
+
text: data.text || "",
|
|
9637
|
+
isFinal: true,
|
|
9638
|
+
words: words.length > 0 ? words : void 0,
|
|
9639
|
+
speaker: words[0]?.speaker,
|
|
9640
|
+
language: data.language_code,
|
|
9641
|
+
confidence: void 0
|
|
9642
|
+
};
|
|
9643
|
+
callbacks?.onTranscript?.(streamEvent);
|
|
9644
|
+
if (options?.diarization && words.length > 0) {
|
|
9645
|
+
const utterances = buildUtterancesFromWords(words);
|
|
9646
|
+
for (const utterance of utterances) {
|
|
9647
|
+
callbacks?.onUtterance?.(utterance);
|
|
9648
|
+
}
|
|
9649
|
+
}
|
|
9650
|
+
}
|
|
9651
|
+
} catch (error) {
|
|
9652
|
+
callbacks?.onError?.({
|
|
9653
|
+
code: "PARSE_ERROR",
|
|
9654
|
+
message: `Failed to parse message: ${error}`
|
|
9655
|
+
});
|
|
9656
|
+
}
|
|
9657
|
+
};
|
|
9658
|
+
ws.onerror = () => {
|
|
9659
|
+
callbacks?.onError?.({
|
|
9660
|
+
code: "WEBSOCKET_ERROR",
|
|
9661
|
+
message: "WebSocket error occurred"
|
|
9662
|
+
});
|
|
9663
|
+
};
|
|
9664
|
+
ws.onclose = (event) => {
|
|
9665
|
+
status = "closed";
|
|
9666
|
+
const timeSinceOpen = openedAt ? Date.now() - openedAt : null;
|
|
9667
|
+
const isImmediateClose = timeSinceOpen !== null && timeSinceOpen < 1e3 && !receivedData;
|
|
9668
|
+
if (isImmediateClose && event.code === 1e3) {
|
|
9669
|
+
callbacks?.onError?.({
|
|
9670
|
+
code: "ELEVENLABS_CONFIG_REJECTED",
|
|
9671
|
+
message: [
|
|
9672
|
+
"ElevenLabs closed connection immediately after opening.",
|
|
9673
|
+
`Current config: region=${this.region}, model=${modelId}`,
|
|
9674
|
+
"Likely causes:",
|
|
9675
|
+
" - Invalid API key",
|
|
9676
|
+
" - Unsupported audio format or model",
|
|
9677
|
+
event.reason ? `Server reason: ${event.reason}` : null
|
|
9678
|
+
].filter(Boolean).join("\n")
|
|
9679
|
+
});
|
|
9680
|
+
}
|
|
9681
|
+
callbacks?.onClose?.(event.code, event.reason);
|
|
9682
|
+
};
|
|
9683
|
+
await new Promise((resolve, reject) => {
|
|
9684
|
+
const timeout = setTimeout(() => {
|
|
9685
|
+
reject(new Error("WebSocket connection timeout"));
|
|
9686
|
+
}, 1e4);
|
|
9687
|
+
const checkOpen = () => {
|
|
9688
|
+
if (status === "open") {
|
|
9689
|
+
clearTimeout(timeout);
|
|
9690
|
+
resolve();
|
|
9691
|
+
} else if (status === "closed") {
|
|
9692
|
+
clearTimeout(timeout);
|
|
9693
|
+
reject(new Error("WebSocket connection failed"));
|
|
9694
|
+
} else {
|
|
9695
|
+
setTimeout(checkOpen, 100);
|
|
9696
|
+
}
|
|
9697
|
+
};
|
|
9698
|
+
checkOpen();
|
|
9699
|
+
});
|
|
9700
|
+
return {
|
|
9701
|
+
id: sessionId,
|
|
9702
|
+
provider: this.name,
|
|
9703
|
+
createdAt,
|
|
9704
|
+
getStatus: () => status,
|
|
9705
|
+
sendAudio: async (chunk) => {
|
|
9706
|
+
if (status !== "open") {
|
|
9707
|
+
throw new Error("Session is not open");
|
|
9708
|
+
}
|
|
9709
|
+
let base64Audio;
|
|
9710
|
+
if (chunk.data instanceof ArrayBuffer) {
|
|
9711
|
+
base64Audio = Buffer.from(chunk.data).toString("base64");
|
|
9712
|
+
} else if (chunk.data instanceof Uint8Array) {
|
|
9713
|
+
base64Audio = Buffer.from(
|
|
9714
|
+
chunk.data.buffer,
|
|
9715
|
+
chunk.data.byteOffset,
|
|
9716
|
+
chunk.data.byteLength
|
|
9717
|
+
).toString("base64");
|
|
9718
|
+
} else {
|
|
9719
|
+
base64Audio = Buffer.from(chunk.data).toString("base64");
|
|
9720
|
+
}
|
|
9721
|
+
const message = JSON.stringify({
|
|
9722
|
+
message_type: "input_audio_chunk",
|
|
9723
|
+
audio_base_64: base64Audio
|
|
9724
|
+
});
|
|
9725
|
+
if (callbacks?.onRawMessage) {
|
|
9726
|
+
callbacks.onRawMessage({
|
|
9727
|
+
provider: this.name,
|
|
9728
|
+
direction: "outgoing",
|
|
9729
|
+
timestamp: Date.now(),
|
|
9730
|
+
payload: message,
|
|
9731
|
+
messageType: "audio"
|
|
9732
|
+
});
|
|
9733
|
+
}
|
|
9734
|
+
ws.send(message);
|
|
9735
|
+
},
|
|
9736
|
+
close: async () => {
|
|
9737
|
+
if (status === "open") {
|
|
9738
|
+
status = "closing";
|
|
9739
|
+
ws.send(JSON.stringify({ message_type: "end_of_stream" }));
|
|
9740
|
+
ws.close(1e3, "Client requested close");
|
|
9741
|
+
}
|
|
9742
|
+
}
|
|
9743
|
+
};
|
|
9744
|
+
}
|
|
9745
|
+
/**
|
|
9746
|
+
* Normalize ElevenLabs response to unified format
|
|
9747
|
+
*
|
|
9748
|
+
* ElevenLabs returns either:
|
|
9749
|
+
* - Single channel: `SpeechToTextChunkResponseModel` directly (text, words, etc.)
|
|
9750
|
+
* - Multi-channel: `MultichannelSpeechToTextResponseModel` with `transcripts[]`
|
|
9751
|
+
*/
|
|
9752
|
+
normalizeResponse(response) {
|
|
9753
|
+
const chunks = response.transcripts ? response.transcripts : [response];
|
|
9754
|
+
const text = chunks.map((c) => c.text).join(" ");
|
|
9755
|
+
const words = [];
|
|
9756
|
+
const speakerSet = /* @__PURE__ */ new Set();
|
|
9757
|
+
const audioEvents = [];
|
|
9758
|
+
for (const chunk of chunks) {
|
|
9759
|
+
if (!chunk.words) continue;
|
|
9760
|
+
for (const w of chunk.words) {
|
|
9761
|
+
if (w.type === "audio_event") {
|
|
9762
|
+
audioEvents.push({
|
|
9763
|
+
text: w.text,
|
|
9764
|
+
start: typeof w.start === "number" ? w.start : 0,
|
|
9765
|
+
end: typeof w.end === "number" ? w.end : 0
|
|
9766
|
+
});
|
|
9767
|
+
continue;
|
|
9768
|
+
}
|
|
9769
|
+
const speakerId = w.speaker_id ?? void 0;
|
|
9770
|
+
const word = {
|
|
9771
|
+
word: w.text,
|
|
9772
|
+
start: typeof w.start === "number" ? w.start : 0,
|
|
9773
|
+
end: typeof w.end === "number" ? w.end : 0,
|
|
9774
|
+
confidence: w.logprob !== void 0 ? Math.exp(w.logprob) : void 0,
|
|
9775
|
+
speaker: speakerId ?? void 0
|
|
9776
|
+
};
|
|
9777
|
+
words.push(word);
|
|
9778
|
+
if (speakerId) {
|
|
9779
|
+
speakerSet.add(speakerId);
|
|
9780
|
+
}
|
|
9781
|
+
}
|
|
9782
|
+
}
|
|
9783
|
+
const speakers = speakerSet.size > 0 ? Array.from(speakerSet).map((id) => ({
|
|
9784
|
+
id,
|
|
9785
|
+
label: `Speaker ${id}`
|
|
9786
|
+
})) : void 0;
|
|
9787
|
+
const utterances = words.length > 0 ? buildUtterancesFromWords(words) : [];
|
|
9788
|
+
const language = chunks[0]?.language_code;
|
|
9789
|
+
const languageProbability = chunks[0]?.language_probability;
|
|
9790
|
+
const entities = [];
|
|
9791
|
+
for (const chunk of chunks) {
|
|
9792
|
+
if (chunk.entities && Array.isArray(chunk.entities)) {
|
|
9793
|
+
for (const entity of chunk.entities) {
|
|
9794
|
+
entities.push({
|
|
9795
|
+
text: entity.text,
|
|
9796
|
+
entity_type: entity.entity_type,
|
|
9797
|
+
start_char: entity.start_char,
|
|
9798
|
+
end_char: entity.end_char
|
|
9799
|
+
});
|
|
9800
|
+
}
|
|
9801
|
+
}
|
|
9802
|
+
}
|
|
9803
|
+
const transcriptionId = response.transcription_id || chunks[0]?.transcription_id || `elevenlabs_${Date.now()}`;
|
|
9804
|
+
return {
|
|
9805
|
+
success: true,
|
|
9806
|
+
provider: this.name,
|
|
9807
|
+
data: {
|
|
9808
|
+
id: transcriptionId,
|
|
9809
|
+
text,
|
|
9810
|
+
status: "completed",
|
|
9811
|
+
language,
|
|
9812
|
+
speakers,
|
|
9813
|
+
words: words.length > 0 ? words : void 0,
|
|
9814
|
+
utterances: utterances.length > 0 ? utterances : void 0
|
|
9815
|
+
},
|
|
9816
|
+
extended: {
|
|
9817
|
+
entities: entities.length > 0 ? entities : void 0,
|
|
9818
|
+
audioEvents: audioEvents.length > 0 ? audioEvents : void 0,
|
|
9819
|
+
languageProbability
|
|
9820
|
+
},
|
|
9821
|
+
tracking: {
|
|
9822
|
+
requestId: transcriptionId
|
|
9823
|
+
},
|
|
9824
|
+
raw: response
|
|
9825
|
+
};
|
|
9826
|
+
}
|
|
9827
|
+
};
|
|
9828
|
+
function createElevenLabsAdapter(config) {
|
|
9829
|
+
const adapter = new ElevenLabsAdapter();
|
|
9830
|
+
adapter.initialize(config);
|
|
9831
|
+
return adapter;
|
|
9832
|
+
}
|
|
9833
|
+
|
|
8963
9834
|
// src/utils/zod-to-field-configs.ts
|
|
8964
9835
|
function unwrapZodType(schema) {
|
|
8965
9836
|
let inner = schema;
|
|
@@ -35559,7 +36430,8 @@ var speechmaticsTranscriptionConfigSchema = zod8.object({
|
|
|
35559
36430
|
enable_entities: zod8.boolean().optional(),
|
|
35560
36431
|
operating_point: zod8.enum(["standard", "enhanced"]).optional(),
|
|
35561
36432
|
punctuation_overrides: zod8.unknown().optional(),
|
|
35562
|
-
conversation_config: zod8.unknown().optional()
|
|
36433
|
+
conversation_config: zod8.unknown().optional(),
|
|
36434
|
+
channel_diarization_labels: zod8.array(zod8.string()).optional()
|
|
35563
36435
|
});
|
|
35564
36436
|
var speechmaticsMidSessionConfigSchema = zod8.object({
|
|
35565
36437
|
language: zod8.string().optional().describe(
|
|
@@ -35576,18 +36448,19 @@ var speechmaticsMidSessionConfigSchema = zod8.object({
|
|
|
35576
36448
|
conversation_config: zod8.unknown().optional()
|
|
35577
36449
|
});
|
|
35578
36450
|
var speechmaticsSpeakerDiarizationConfigSchema = zod8.object({
|
|
35579
|
-
max_speakers: zod8.number().min(2).
|
|
36451
|
+
max_speakers: zod8.number().min(2).optional().describe(
|
|
35580
36452
|
"Configure the maximum number of speakers to detect. See [Max Speakers](http://docs.speechmatics.com/speech-to-text/features/diarization#max-speakers)."
|
|
35581
36453
|
),
|
|
35582
36454
|
prefer_current_speaker: zod8.boolean().optional().describe(
|
|
35583
36455
|
"When set to `true`, reduces the likelihood of incorrectly switching between similar sounding speakers. See [Prefer Current Speaker](https://docs.speechmatics.com/speech-to-text/features/diarization#prefer-current-speaker)."
|
|
35584
36456
|
),
|
|
35585
36457
|
speaker_sensitivity: zod8.number().min(0).max(1).optional(),
|
|
36458
|
+
get_speakers: zod8.boolean().optional().describe("If true, speaker identifiers will be returned at the end of transcript."),
|
|
35586
36459
|
speakers: zod8.array(
|
|
35587
36460
|
zod8.unknown()
|
|
35588
36461
|
/* TODO: resolve SpeakersInputItem */
|
|
35589
36462
|
).optional().describe(
|
|
35590
|
-
"Use this option to provide speaker labels linked to their speaker identifiers. When passed, the transcription system will tag spoken words in the transcript with the provided speaker labels whenever any of the specified speakers is detected in the audio.
|
|
36463
|
+
"Use this option to provide speaker labels linked to their speaker identifiers. When passed, the transcription system will tag spoken words in the transcript with the provided speaker labels whenever any of the specified speakers is detected in the audio. A maximum of 50 speakers identifiers across all speakers can be provided."
|
|
35591
36464
|
)
|
|
35592
36465
|
});
|
|
35593
36466
|
var speechmaticsConversationConfigSchema = zod8.object({
|
|
@@ -35613,7 +36486,8 @@ var streamingTranscriberParams2 = zod8.object({
|
|
|
35613
36486
|
"Whether or not to send Partials (i.e. `AddPartialTranslation` messages) as well as Finals (i.e. `AddTranslation` messages) See [Partial transcripts](https://docs.speechmatics.com/speech-to-text/realtime/output#partial-transcripts)."
|
|
35614
36487
|
),
|
|
35615
36488
|
enable_entities: zod8.boolean().optional(),
|
|
35616
|
-
operating_point: zod8.enum(["standard", "enhanced"]).optional()
|
|
36489
|
+
operating_point: zod8.enum(["standard", "enhanced"]).optional(),
|
|
36490
|
+
channel_diarization_labels: zod8.array(zod8.string()).optional()
|
|
35617
36491
|
});
|
|
35618
36492
|
var streamingUpdateConfigParams2 = zod8.object({
|
|
35619
36493
|
language: zod8.string().optional().describe(
|
|
@@ -36357,6 +37231,21 @@ var SonioxCapabilities = {
|
|
|
36357
37231
|
listTranscripts: false,
|
|
36358
37232
|
deleteTranscript: false
|
|
36359
37233
|
};
|
|
37234
|
+
var ElevenLabsCapabilities = {
|
|
37235
|
+
streaming: true,
|
|
37236
|
+
diarization: true,
|
|
37237
|
+
wordTimestamps: true,
|
|
37238
|
+
languageDetection: true,
|
|
37239
|
+
customVocabulary: true,
|
|
37240
|
+
// Via keyterms parameter
|
|
37241
|
+
summarization: false,
|
|
37242
|
+
sentimentAnalysis: false,
|
|
37243
|
+
entityDetection: true,
|
|
37244
|
+
piiRedaction: true,
|
|
37245
|
+
// Via entity_detection with PII categories
|
|
37246
|
+
listTranscripts: false,
|
|
37247
|
+
deleteTranscript: false
|
|
37248
|
+
};
|
|
36360
37249
|
var ProviderCapabilitiesMap = {
|
|
36361
37250
|
gladia: GladiaCapabilities,
|
|
36362
37251
|
assemblyai: AssemblyAICapabilities,
|
|
@@ -36364,7 +37253,8 @@ var ProviderCapabilitiesMap = {
|
|
|
36364
37253
|
"openai-whisper": OpenAICapabilities,
|
|
36365
37254
|
"azure-stt": AzureCapabilities,
|
|
36366
37255
|
speechmatics: SpeechmaticsCapabilities,
|
|
36367
|
-
soniox: SonioxCapabilities
|
|
37256
|
+
soniox: SonioxCapabilities,
|
|
37257
|
+
elevenlabs: ElevenLabsCapabilities
|
|
36368
37258
|
};
|
|
36369
37259
|
var CapabilityKeys = [
|
|
36370
37260
|
"streaming",
|
|
@@ -36566,7 +37456,8 @@ var AllLanguageCodes = {
|
|
|
36566
37456
|
// BCP-47 locale codes (e.g., "en-US")
|
|
36567
37457
|
speechmatics: SpeechmaticsLanguageCodes,
|
|
36568
37458
|
// ISO 639-1 codes with multilingual packs
|
|
36569
|
-
soniox: SonioxLanguageCodes
|
|
37459
|
+
soniox: SonioxLanguageCodes,
|
|
37460
|
+
elevenlabs: ElevenLabsLanguageCodes
|
|
36570
37461
|
};
|
|
36571
37462
|
var ProviderDisplayNames = {
|
|
36572
37463
|
gladia: "Gladia",
|
|
@@ -36575,7 +37466,8 @@ var ProviderDisplayNames = {
|
|
|
36575
37466
|
"openai-whisper": "OpenAI Whisper",
|
|
36576
37467
|
"azure-stt": "Azure Speech",
|
|
36577
37468
|
speechmatics: "Speechmatics",
|
|
36578
|
-
soniox: "Soniox"
|
|
37469
|
+
soniox: "Soniox",
|
|
37470
|
+
elevenlabs: "ElevenLabs"
|
|
36579
37471
|
};
|
|
36580
37472
|
var ProviderWebsites = {
|
|
36581
37473
|
gladia: "https://gladia.io",
|
|
@@ -36584,7 +37476,8 @@ var ProviderWebsites = {
|
|
|
36584
37476
|
"openai-whisper": "https://openai.com",
|
|
36585
37477
|
"azure-stt": "https://azure.microsoft.com/services/cognitive-services/speech-to-text/",
|
|
36586
37478
|
speechmatics: "https://speechmatics.com",
|
|
36587
|
-
soniox: "https://soniox.com"
|
|
37479
|
+
soniox: "https://soniox.com",
|
|
37480
|
+
elevenlabs: "https://elevenlabs.io"
|
|
36588
37481
|
};
|
|
36589
37482
|
var ProviderDocs = {
|
|
36590
37483
|
gladia: "https://docs.gladia.io",
|
|
@@ -36593,7 +37486,8 @@ var ProviderDocs = {
|
|
|
36593
37486
|
"openai-whisper": "https://platform.openai.com/docs/guides/speech-to-text",
|
|
36594
37487
|
"azure-stt": "https://learn.microsoft.com/azure/cognitive-services/speech-service/",
|
|
36595
37488
|
speechmatics: "https://docs.speechmatics.com",
|
|
36596
|
-
soniox: "https://soniox.com/docs/stt/"
|
|
37489
|
+
soniox: "https://soniox.com/docs/stt/",
|
|
37490
|
+
elevenlabs: "https://elevenlabs.io/docs/capabilities/speech-to-text"
|
|
36597
37491
|
};
|
|
36598
37492
|
var AllProviders = [
|
|
36599
37493
|
"gladia",
|
|
@@ -36602,7 +37496,8 @@ var AllProviders = [
|
|
|
36602
37496
|
"openai-whisper",
|
|
36603
37497
|
"azure-stt",
|
|
36604
37498
|
"speechmatics",
|
|
36605
|
-
"soniox"
|
|
37499
|
+
"soniox",
|
|
37500
|
+
"elevenlabs"
|
|
36606
37501
|
];
|
|
36607
37502
|
var StreamingProviders = AllProviders.filter(
|
|
36608
37503
|
(p) => ProviderCapabilitiesMap[p].streaming
|
|
@@ -37327,6 +38222,77 @@ var TranslationConfigType = {
|
|
|
37327
38222
|
two_way: "two_way"
|
|
37328
38223
|
};
|
|
37329
38224
|
|
|
38225
|
+
// src/generated/elevenlabs/schema/index.ts
|
|
38226
|
+
var schema_exports8 = {};
|
|
38227
|
+
__export(schema_exports8, {
|
|
38228
|
+
BodySpeechToTextV1SpeechToTextPostFileFormat: () => BodySpeechToTextV1SpeechToTextPostFileFormat,
|
|
38229
|
+
BodySpeechToTextV1SpeechToTextPostModelId: () => BodySpeechToTextV1SpeechToTextPostModelId,
|
|
38230
|
+
BodySpeechToTextV1SpeechToTextPostTimestampsGranularity: () => BodySpeechToTextV1SpeechToTextPostTimestampsGranularity,
|
|
38231
|
+
DocxExportOptionsFormat: () => DocxExportOptionsFormat,
|
|
38232
|
+
HtmlExportOptionsFormat: () => HtmlExportOptionsFormat,
|
|
38233
|
+
PdfExportOptionsFormat: () => PdfExportOptionsFormat,
|
|
38234
|
+
SegmentedJsonExportOptionsFormat: () => SegmentedJsonExportOptionsFormat,
|
|
38235
|
+
SpeechToTextWordResponseModelType: () => SpeechToTextWordResponseModelType,
|
|
38236
|
+
SrtExportOptionsFormat: () => SrtExportOptionsFormat,
|
|
38237
|
+
TxtExportOptionsFormat: () => TxtExportOptionsFormat
|
|
38238
|
+
});
|
|
38239
|
+
|
|
38240
|
+
// src/generated/elevenlabs/schema/bodySpeechToTextV1SpeechToTextPostFileFormat.ts
|
|
38241
|
+
var BodySpeechToTextV1SpeechToTextPostFileFormat = {
|
|
38242
|
+
pcm_s16le_16: "pcm_s16le_16",
|
|
38243
|
+
other: "other"
|
|
38244
|
+
};
|
|
38245
|
+
|
|
38246
|
+
// src/generated/elevenlabs/schema/bodySpeechToTextV1SpeechToTextPostModelId.ts
|
|
38247
|
+
var BodySpeechToTextV1SpeechToTextPostModelId = {
|
|
38248
|
+
scribe_v1: "scribe_v1",
|
|
38249
|
+
scribe_v2: "scribe_v2"
|
|
38250
|
+
};
|
|
38251
|
+
|
|
38252
|
+
// src/generated/elevenlabs/schema/bodySpeechToTextV1SpeechToTextPostTimestampsGranularity.ts
|
|
38253
|
+
var BodySpeechToTextV1SpeechToTextPostTimestampsGranularity = {
|
|
38254
|
+
none: "none",
|
|
38255
|
+
word: "word",
|
|
38256
|
+
character: "character"
|
|
38257
|
+
};
|
|
38258
|
+
|
|
38259
|
+
// src/generated/elevenlabs/schema/docxExportOptionsFormat.ts
|
|
38260
|
+
var DocxExportOptionsFormat = {
|
|
38261
|
+
docx: "docx"
|
|
38262
|
+
};
|
|
38263
|
+
|
|
38264
|
+
// src/generated/elevenlabs/schema/htmlExportOptionsFormat.ts
|
|
38265
|
+
var HtmlExportOptionsFormat = {
|
|
38266
|
+
html: "html"
|
|
38267
|
+
};
|
|
38268
|
+
|
|
38269
|
+
// src/generated/elevenlabs/schema/pdfExportOptionsFormat.ts
|
|
38270
|
+
var PdfExportOptionsFormat = {
|
|
38271
|
+
pdf: "pdf"
|
|
38272
|
+
};
|
|
38273
|
+
|
|
38274
|
+
// src/generated/elevenlabs/schema/segmentedJsonExportOptionsFormat.ts
|
|
38275
|
+
var SegmentedJsonExportOptionsFormat = {
|
|
38276
|
+
segmented_json: "segmented_json"
|
|
38277
|
+
};
|
|
38278
|
+
|
|
38279
|
+
// src/generated/elevenlabs/schema/speechToTextWordResponseModelType.ts
|
|
38280
|
+
var SpeechToTextWordResponseModelType = {
|
|
38281
|
+
word: "word",
|
|
38282
|
+
spacing: "spacing",
|
|
38283
|
+
audio_event: "audio_event"
|
|
38284
|
+
};
|
|
38285
|
+
|
|
38286
|
+
// src/generated/elevenlabs/schema/srtExportOptionsFormat.ts
|
|
38287
|
+
var SrtExportOptionsFormat = {
|
|
38288
|
+
srt: "srt"
|
|
38289
|
+
};
|
|
38290
|
+
|
|
38291
|
+
// src/generated/elevenlabs/schema/txtExportOptionsFormat.ts
|
|
38292
|
+
var TxtExportOptionsFormat = {
|
|
38293
|
+
txt: "txt"
|
|
38294
|
+
};
|
|
38295
|
+
|
|
37330
38296
|
// src/generated/speechmatics/api/speechmaticsASRRESTAPI.zod.ts
|
|
37331
38297
|
var speechmaticsASRRESTAPI_zod_exports = {};
|
|
37332
38298
|
__export(speechmaticsASRRESTAPI_zod_exports, {
|
|
@@ -38439,6 +39405,448 @@ var getUsageResponse = zod12.object({
|
|
|
38439
39405
|
})
|
|
38440
39406
|
)
|
|
38441
39407
|
});
|
|
39408
|
+
|
|
39409
|
+
// src/generated/elevenlabs/api/elevenLabsSpeechToTextAPI.zod.ts
|
|
39410
|
+
var elevenLabsSpeechToTextAPI_zod_exports = {};
|
|
39411
|
+
__export(elevenLabsSpeechToTextAPI_zod_exports, {
|
|
39412
|
+
deleteTranscriptByIdHeader: () => deleteTranscriptByIdHeader,
|
|
39413
|
+
deleteTranscriptByIdParams: () => deleteTranscriptByIdParams,
|
|
39414
|
+
deleteTranscriptByIdResponse: () => deleteTranscriptByIdResponse,
|
|
39415
|
+
getTranscriptByIdHeader: () => getTranscriptByIdHeader,
|
|
39416
|
+
getTranscriptByIdParams: () => getTranscriptByIdParams,
|
|
39417
|
+
getTranscriptByIdResponse: () => getTranscriptByIdResponse,
|
|
39418
|
+
speechToTextBody: () => speechToTextBody,
|
|
39419
|
+
speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefault: () => speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefault,
|
|
39420
|
+
speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFive: () => speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFive,
|
|
39421
|
+
speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFour: () => speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFour,
|
|
39422
|
+
speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultOne: () => speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultOne,
|
|
39423
|
+
speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultThree: () => speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultThree,
|
|
39424
|
+
speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultTwo: () => speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultTwo,
|
|
39425
|
+
speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefault: () => speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefault,
|
|
39426
|
+
speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFive: () => speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFive,
|
|
39427
|
+
speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFour: () => speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFour,
|
|
39428
|
+
speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultOne: () => speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultOne,
|
|
39429
|
+
speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultThree: () => speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultThree,
|
|
39430
|
+
speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultTwo: () => speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultTwo,
|
|
39431
|
+
speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefault: () => speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefault,
|
|
39432
|
+
speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefaultThree: () => speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefaultThree,
|
|
39433
|
+
speechToTextBodyAdditionalFormatsItemMaxSegmentCharsDefaultOnefive: () => speechToTextBodyAdditionalFormatsItemMaxSegmentCharsDefaultOnefive,
|
|
39434
|
+
speechToTextBodyAdditionalFormatsItemMaxSegmentDurationSDefaultOnefive: () => speechToTextBodyAdditionalFormatsItemMaxSegmentDurationSDefaultOnefive,
|
|
39435
|
+
speechToTextBodyAdditionalFormatsItemSegmentOnSilenceLongerThanSDefaultOnefive: () => speechToTextBodyAdditionalFormatsItemSegmentOnSilenceLongerThanSDefaultOnefive,
|
|
39436
|
+
speechToTextBodyAdditionalFormatsMax: () => speechToTextBodyAdditionalFormatsMax,
|
|
39437
|
+
speechToTextBodyDiarizationThresholdMaxOne: () => speechToTextBodyDiarizationThresholdMaxOne,
|
|
39438
|
+
speechToTextBodyDiarizationThresholdMinOne: () => speechToTextBodyDiarizationThresholdMinOne,
|
|
39439
|
+
speechToTextBodyDiarizeDefault: () => speechToTextBodyDiarizeDefault,
|
|
39440
|
+
speechToTextBodyFileFormatDefault: () => speechToTextBodyFileFormatDefault,
|
|
39441
|
+
speechToTextBodyKeytermsDefault: () => speechToTextBodyKeytermsDefault,
|
|
39442
|
+
speechToTextBodyNoVerbatimDefault: () => speechToTextBodyNoVerbatimDefault,
|
|
39443
|
+
speechToTextBodyNumSpeakersMaxOne: () => speechToTextBodyNumSpeakersMaxOne,
|
|
39444
|
+
speechToTextBodySeedMaxOne: () => speechToTextBodySeedMaxOne,
|
|
39445
|
+
speechToTextBodySeedMinOne: () => speechToTextBodySeedMinOne,
|
|
39446
|
+
speechToTextBodyTagAudioEventsDefault: () => speechToTextBodyTagAudioEventsDefault,
|
|
39447
|
+
speechToTextBodyTemperatureMaxOne: () => speechToTextBodyTemperatureMaxOne,
|
|
39448
|
+
speechToTextBodyTemperatureMinOne: () => speechToTextBodyTemperatureMinOne,
|
|
39449
|
+
speechToTextBodyTimestampsGranularityDefault: () => speechToTextBodyTimestampsGranularityDefault,
|
|
39450
|
+
speechToTextBodyUseMultiChannelDefault: () => speechToTextBodyUseMultiChannelDefault,
|
|
39451
|
+
speechToTextBodyWebhookDefault: () => speechToTextBodyWebhookDefault,
|
|
39452
|
+
speechToTextHeader: () => speechToTextHeader,
|
|
39453
|
+
speechToTextQueryEnableLoggingDefault: () => speechToTextQueryEnableLoggingDefault,
|
|
39454
|
+
speechToTextQueryParams: () => speechToTextQueryParams,
|
|
39455
|
+
speechToTextResponse: () => speechToTextResponse
|
|
39456
|
+
});
|
|
39457
|
+
import { z as zod13 } from "zod";
|
|
39458
|
+
var speechToTextQueryEnableLoggingDefault = true;
|
|
39459
|
+
var speechToTextQueryParams = zod13.object({
|
|
39460
|
+
enable_logging: zod13.boolean().default(speechToTextQueryEnableLoggingDefault).describe(
|
|
39461
|
+
"When enable_logging is set to false zero retention mode will be used for the request. This will mean log and transcript storage features are unavailable for this request. Zero retention mode may only be used by enterprise customers."
|
|
39462
|
+
)
|
|
39463
|
+
});
|
|
39464
|
+
var speechToTextHeader = zod13.object({
|
|
39465
|
+
"xi-api-key": zod13.string().or(zod13.null()).optional().describe(
|
|
39466
|
+
"Your API key. This is required by most endpoints to access our API programmatically. You can view your xi-api-key using the 'Profile' tab on the website."
|
|
39467
|
+
)
|
|
39468
|
+
});
|
|
39469
|
+
var speechToTextBodyTagAudioEventsDefault = true;
|
|
39470
|
+
var speechToTextBodyNumSpeakersMaxOne = 32;
|
|
39471
|
+
var speechToTextBodyTimestampsGranularityDefault = "word";
|
|
39472
|
+
var speechToTextBodyDiarizeDefault = false;
|
|
39473
|
+
var speechToTextBodyDiarizationThresholdMinOne = 0.1;
|
|
39474
|
+
var speechToTextBodyDiarizationThresholdMaxOne = 0.4;
|
|
39475
|
+
var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefault = true;
|
|
39476
|
+
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefault = true;
|
|
39477
|
+
var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultOne = true;
|
|
39478
|
+
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultOne = true;
|
|
39479
|
+
var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultTwo = true;
|
|
39480
|
+
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultTwo = true;
|
|
39481
|
+
var speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefault = 100;
|
|
39482
|
+
var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultThree = true;
|
|
39483
|
+
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultThree = true;
|
|
39484
|
+
var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFour = true;
|
|
39485
|
+
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFour = true;
|
|
39486
|
+
var speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefaultThree = 42;
|
|
39487
|
+
var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFive = false;
|
|
39488
|
+
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFive = true;
|
|
39489
|
+
var speechToTextBodyAdditionalFormatsItemSegmentOnSilenceLongerThanSDefaultOnefive = 0.8;
|
|
39490
|
+
var speechToTextBodyAdditionalFormatsItemMaxSegmentDurationSDefaultOnefive = 4;
|
|
39491
|
+
var speechToTextBodyAdditionalFormatsItemMaxSegmentCharsDefaultOnefive = 84;
|
|
39492
|
+
var speechToTextBodyAdditionalFormatsMax = 10;
|
|
39493
|
+
var speechToTextBodyFileFormatDefault = "other";
|
|
39494
|
+
var speechToTextBodyWebhookDefault = false;
|
|
39495
|
+
var speechToTextBodyTemperatureMinOne = 0;
|
|
39496
|
+
var speechToTextBodyTemperatureMaxOne = 2;
|
|
39497
|
+
var speechToTextBodySeedMinOne = 0;
|
|
39498
|
+
var speechToTextBodySeedMaxOne = 2147483647;
|
|
39499
|
+
var speechToTextBodyUseMultiChannelDefault = false;
|
|
39500
|
+
var speechToTextBodyNoVerbatimDefault = false;
|
|
39501
|
+
var speechToTextBodyKeytermsDefault = [];
|
|
39502
|
+
var speechToTextBody = zod13.object({
|
|
39503
|
+
model_id: zod13.enum(["scribe_v1", "scribe_v2"]).describe("The ID of the model to use for transcription."),
|
|
39504
|
+
file: zod13.instanceof(File).or(zod13.null()).optional().describe(
|
|
39505
|
+
"The file to transcribe. All major audio and video formats are supported. Exactly one of the file or cloud_storage_url parameters must be provided. The file size must be less than 3.0GB."
|
|
39506
|
+
),
|
|
39507
|
+
language_code: zod13.string().or(zod13.null()).optional().describe(
|
|
39508
|
+
"An ISO-639-1 or ISO-639-3 language_code corresponding to the language of the audio file. Can sometimes improve transcription performance if known beforehand. Defaults to null, in this case the language is predicted automatically."
|
|
39509
|
+
),
|
|
39510
|
+
tag_audio_events: zod13.boolean().default(speechToTextBodyTagAudioEventsDefault).describe(
|
|
39511
|
+
"Whether to tag audio events like (laughter), (footsteps), etc. in the transcription."
|
|
39512
|
+
),
|
|
39513
|
+
num_speakers: zod13.number().min(1).max(speechToTextBodyNumSpeakersMaxOne).or(zod13.null()).optional().describe(
|
|
39514
|
+
"The maximum amount of speakers talking in the uploaded file. Can help with predicting who speaks when. The maximum amount of speakers that can be predicted is 32. Defaults to null, in this case the amount of speakers is set to the maximum value the model supports."
|
|
39515
|
+
),
|
|
39516
|
+
timestamps_granularity: zod13.enum(["none", "word", "character"]).default(speechToTextBodyTimestampsGranularityDefault).describe(
|
|
39517
|
+
"The granularity of the timestamps in the transcription. 'word' provides word-level timestamps and 'character' provides character-level timestamps per word."
|
|
39518
|
+
),
|
|
39519
|
+
diarize: zod13.boolean().optional().describe("Whether to annotate which speaker is currently talking in the uploaded file."),
|
|
39520
|
+
diarization_threshold: zod13.number().min(speechToTextBodyDiarizationThresholdMinOne).max(speechToTextBodyDiarizationThresholdMaxOne).or(zod13.null()).optional().describe(
|
|
39521
|
+
"Diarization threshold to apply during speaker diarization. A higher value means there will be a lower chance of one speaker being diarized as two different speakers but also a higher chance of two different speakers being diarized as one speaker (less total speakers predicted). A low value means there will be a higher chance of one speaker being diarized as two different speakers but also a lower chance of two different speakers being diarized as one speaker (more total speakers predicted). Can only be set when diarize=True and num_speakers=None. Defaults to None, in which case we will choose a threshold based on the model_id (0.22 usually)."
|
|
39522
|
+
),
|
|
39523
|
+
additional_formats: zod13.array(
|
|
39524
|
+
zod13.discriminatedUnion("format", [
|
|
39525
|
+
zod13.object({
|
|
39526
|
+
include_speakers: zod13.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefault),
|
|
39527
|
+
include_timestamps: zod13.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefault),
|
|
39528
|
+
format: zod13.enum(["segmented_json"]),
|
|
39529
|
+
segment_on_silence_longer_than_s: zod13.number().or(zod13.null()).optional(),
|
|
39530
|
+
max_segment_duration_s: zod13.number().or(zod13.null()).optional(),
|
|
39531
|
+
max_segment_chars: zod13.number().or(zod13.null()).optional()
|
|
39532
|
+
}),
|
|
39533
|
+
zod13.object({
|
|
39534
|
+
include_speakers: zod13.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultOne),
|
|
39535
|
+
include_timestamps: zod13.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultOne),
|
|
39536
|
+
format: zod13.enum(["docx"]),
|
|
39537
|
+
segment_on_silence_longer_than_s: zod13.number().or(zod13.null()).optional(),
|
|
39538
|
+
max_segment_duration_s: zod13.number().or(zod13.null()).optional(),
|
|
39539
|
+
max_segment_chars: zod13.number().or(zod13.null()).optional()
|
|
39540
|
+
}),
|
|
39541
|
+
zod13.object({
|
|
39542
|
+
include_speakers: zod13.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultTwo),
|
|
39543
|
+
include_timestamps: zod13.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultTwo),
|
|
39544
|
+
format: zod13.enum(["pdf"]),
|
|
39545
|
+
segment_on_silence_longer_than_s: zod13.number().or(zod13.null()).optional(),
|
|
39546
|
+
max_segment_duration_s: zod13.number().or(zod13.null()).optional(),
|
|
39547
|
+
max_segment_chars: zod13.number().or(zod13.null()).optional()
|
|
39548
|
+
}),
|
|
39549
|
+
zod13.object({
|
|
39550
|
+
max_characters_per_line: zod13.number().or(zod13.null()).default(speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefault),
|
|
39551
|
+
include_speakers: zod13.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultThree),
|
|
39552
|
+
include_timestamps: zod13.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultThree),
|
|
39553
|
+
format: zod13.enum(["txt"]),
|
|
39554
|
+
segment_on_silence_longer_than_s: zod13.number().or(zod13.null()).optional(),
|
|
39555
|
+
max_segment_duration_s: zod13.number().or(zod13.null()).optional(),
|
|
39556
|
+
max_segment_chars: zod13.number().or(zod13.null()).optional()
|
|
39557
|
+
}),
|
|
39558
|
+
zod13.object({
|
|
39559
|
+
include_speakers: zod13.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFour),
|
|
39560
|
+
include_timestamps: zod13.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFour),
|
|
39561
|
+
format: zod13.enum(["html"]),
|
|
39562
|
+
segment_on_silence_longer_than_s: zod13.number().or(zod13.null()).optional(),
|
|
39563
|
+
max_segment_duration_s: zod13.number().or(zod13.null()).optional(),
|
|
39564
|
+
max_segment_chars: zod13.number().or(zod13.null()).optional()
|
|
39565
|
+
}),
|
|
39566
|
+
zod13.object({
|
|
39567
|
+
max_characters_per_line: zod13.number().or(zod13.null()).default(speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefaultThree),
|
|
39568
|
+
include_speakers: zod13.boolean().optional(),
|
|
39569
|
+
include_timestamps: zod13.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFive),
|
|
39570
|
+
format: zod13.enum(["srt"]),
|
|
39571
|
+
segment_on_silence_longer_than_s: zod13.number().or(zod13.null()).default(
|
|
39572
|
+
speechToTextBodyAdditionalFormatsItemSegmentOnSilenceLongerThanSDefaultOnefive
|
|
39573
|
+
),
|
|
39574
|
+
max_segment_duration_s: zod13.number().or(zod13.null()).default(speechToTextBodyAdditionalFormatsItemMaxSegmentDurationSDefaultOnefive),
|
|
39575
|
+
max_segment_chars: zod13.number().or(zod13.null()).default(speechToTextBodyAdditionalFormatsItemMaxSegmentCharsDefaultOnefive)
|
|
39576
|
+
})
|
|
39577
|
+
])
|
|
39578
|
+
).max(speechToTextBodyAdditionalFormatsMax).optional(),
|
|
39579
|
+
file_format: zod13.enum(["pcm_s16le_16", "other"]).default(speechToTextBodyFileFormatDefault).describe(
|
|
39580
|
+
"The format of input audio. Options are 'pcm_s16le_16' or 'other' For `pcm_s16le_16`, the input audio must be 16-bit PCM at a 16kHz sample rate, single channel (mono), and little-endian byte order. Latency will be lower than with passing an encoded waveform."
|
|
39581
|
+
),
|
|
39582
|
+
cloud_storage_url: zod13.string().or(zod13.null()).optional().describe(
|
|
39583
|
+
"The HTTPS URL of the file to transcribe. Exactly one of the file or cloud_storage_url parameters must be provided. The file must be accessible via HTTPS and the file size must be less than 2GB. Any valid HTTPS URL is accepted, including URLs from cloud storage providers (AWS S3, Google Cloud Storage, Cloudflare R2, etc.), CDNs, or any other HTTPS source. URLs can be pre-signed or include authentication tokens in query parameters."
|
|
39584
|
+
),
|
|
39585
|
+
webhook: zod13.boolean().optional().describe(
|
|
39586
|
+
"Whether to send the transcription result to configured speech-to-text webhooks. If set the request will return early without the transcription, which will be delivered later via webhook."
|
|
39587
|
+
),
|
|
39588
|
+
webhook_id: zod13.string().or(zod13.null()).optional().describe(
|
|
39589
|
+
"Optional specific webhook ID to send the transcription result to. Only valid when webhook is set to true. If not provided, transcription will be sent to all configured speech-to-text webhooks."
|
|
39590
|
+
),
|
|
39591
|
+
temperature: zod13.number().min(speechToTextBodyTemperatureMinOne).max(speechToTextBodyTemperatureMaxOne).or(zod13.null()).optional().describe(
|
|
39592
|
+
"Controls the randomness of the transcription output. Accepts values between 0.0 and 2.0, where higher values result in more diverse and less deterministic results. If omitted, we will use a temperature based on the model you selected which is usually 0."
|
|
39593
|
+
),
|
|
39594
|
+
seed: zod13.number().min(speechToTextBodySeedMinOne).max(speechToTextBodySeedMaxOne).or(zod13.null()).optional().describe(
|
|
39595
|
+
"If specified, our system will make a best effort to sample deterministically, such that repeated requests with the same seed and parameters should return the same result. Determinism is not guaranteed. Must be an integer between 0 and 2147483647."
|
|
39596
|
+
),
|
|
39597
|
+
use_multi_channel: zod13.boolean().optional().describe(
|
|
39598
|
+
"Whether the audio file contains multiple channels where each channel contains a single speaker. When enabled, each channel will be transcribed independently and the results will be combined. Each word in the response will include a 'channel_index' field indicating which channel it was spoken on. A maximum of 5 channels is supported."
|
|
39599
|
+
),
|
|
39600
|
+
webhook_metadata: zod13.string().or(zod13.record(zod13.string(), zod13.any())).or(zod13.null()).optional().describe(
|
|
39601
|
+
"Optional metadata to be included in the webhook response. This should be a JSON string representing an object with a maximum depth of 2 levels and maximum size of 16KB. Useful for tracking internal IDs, job references, or other contextual information."
|
|
39602
|
+
),
|
|
39603
|
+
entity_detection: zod13.string().or(zod13.array(zod13.string())).or(zod13.null()).optional().describe(
|
|
39604
|
+
"Detect entities in the transcript. Can be 'all' to detect all entities, a single entity type or category string, or a list of entity types/categories. Categories include 'pii', 'phi', 'pci', 'other', 'offensive_language'. When enabled, detected entities will be returned in the 'entities' field with their text, type, and character positions. Usage of this parameter will incur additional costs."
|
|
39605
|
+
),
|
|
39606
|
+
no_verbatim: zod13.boolean().optional().describe(
|
|
39607
|
+
"If true, the transcription will not have any filler words, false starts and non-speech sounds. Only supported with scribe_v2 model."
|
|
39608
|
+
),
|
|
39609
|
+
keyterms: zod13.array(zod13.string()).default(speechToTextBodyKeytermsDefault).describe(
|
|
39610
|
+
'A list of keyterms to bias the transcription towards. The keyterms are words or phrases you want the model to recognise more accurately. The number of keyterms cannot exceed 100. The length of each keyterm must be less than 50 characters. Keyterms can contain at most 5 words (after normalisation). For example ["hello", "world", "technical term"]. Usage of this parameter will incur additional costs. '
|
|
39611
|
+
)
|
|
39612
|
+
});
|
|
39613
|
+
var speechToTextResponse = zod13.object({
|
|
39614
|
+
language_code: zod13.string().describe("The detected language code (e.g. 'eng' for English)."),
|
|
39615
|
+
language_probability: zod13.number().describe("The confidence score of the language detection (0 to 1)."),
|
|
39616
|
+
text: zod13.string().describe("The raw text of the transcription."),
|
|
39617
|
+
words: zod13.array(
|
|
39618
|
+
zod13.object({
|
|
39619
|
+
text: zod13.string().describe("The word or sound that was transcribed."),
|
|
39620
|
+
start: zod13.number().or(zod13.null()).optional().describe("The start time of the word or sound in seconds."),
|
|
39621
|
+
end: zod13.number().or(zod13.null()).optional().describe("The end time of the word or sound in seconds."),
|
|
39622
|
+
type: zod13.enum(["word", "spacing", "audio_event"]).describe(
|
|
39623
|
+
"The type of the word or sound. 'audio_event' is used for non-word sounds like laughter or footsteps."
|
|
39624
|
+
),
|
|
39625
|
+
speaker_id: zod13.string().or(zod13.null()).optional().describe("Unique identifier for the speaker of this word."),
|
|
39626
|
+
logprob: zod13.number().describe(
|
|
39627
|
+
"The log of the probability with which this word was predicted. Logprobs are in range [-infinity, 0], higher logprobs indicate a higher confidence the model has in its predictions."
|
|
39628
|
+
),
|
|
39629
|
+
characters: zod13.array(
|
|
39630
|
+
zod13.object({
|
|
39631
|
+
text: zod13.string().describe("The character that was transcribed."),
|
|
39632
|
+
start: zod13.number().or(zod13.null()).optional().describe("The start time of the character in seconds."),
|
|
39633
|
+
end: zod13.number().or(zod13.null()).optional().describe("The end time of the character in seconds.")
|
|
39634
|
+
})
|
|
39635
|
+
).or(zod13.null()).optional().describe("The characters that make up the word and their timing information.")
|
|
39636
|
+
}).describe("Word-level detail of the transcription with timing information.")
|
|
39637
|
+
).describe("List of words with their timing information."),
|
|
39638
|
+
channel_index: zod13.number().or(zod13.null()).optional().describe("The channel index this transcript belongs to (for multichannel audio)."),
|
|
39639
|
+
additional_formats: zod13.array(
|
|
39640
|
+
zod13.object({
|
|
39641
|
+
requested_format: zod13.string().describe("The requested format."),
|
|
39642
|
+
file_extension: zod13.string().describe("The file extension of the additional format."),
|
|
39643
|
+
content_type: zod13.string().describe("The content type of the additional format."),
|
|
39644
|
+
is_base64_encoded: zod13.boolean().describe("Whether the content is base64 encoded."),
|
|
39645
|
+
content: zod13.string().describe("The content of the additional format.")
|
|
39646
|
+
}).or(zod13.null())
|
|
39647
|
+
).or(zod13.null()).optional().describe("Requested additional formats of the transcript."),
|
|
39648
|
+
transcription_id: zod13.string().or(zod13.null()).optional().describe("The transcription ID of the response."),
|
|
39649
|
+
entities: zod13.array(
|
|
39650
|
+
zod13.object({
|
|
39651
|
+
text: zod13.string().describe("The text that was identified as an entity."),
|
|
39652
|
+
entity_type: zod13.string().describe(
|
|
39653
|
+
"The type of entity detected (e.g., 'credit_card', 'email_address', 'person_name')."
|
|
39654
|
+
),
|
|
39655
|
+
start_char: zod13.number().describe("Start character position in the transcript text."),
|
|
39656
|
+
end_char: zod13.number().describe("End character position in the transcript text.")
|
|
39657
|
+
})
|
|
39658
|
+
).or(zod13.null()).optional().describe(
|
|
39659
|
+
"List of detected entities with their text, type, and character positions in the transcript."
|
|
39660
|
+
)
|
|
39661
|
+
}).describe("Chunk-level detail of the transcription with timing information.").or(
|
|
39662
|
+
zod13.object({
|
|
39663
|
+
transcripts: zod13.array(
|
|
39664
|
+
zod13.object({
|
|
39665
|
+
language_code: zod13.string().describe("The detected language code (e.g. 'eng' for English)."),
|
|
39666
|
+
language_probability: zod13.number().describe("The confidence score of the language detection (0 to 1)."),
|
|
39667
|
+
text: zod13.string().describe("The raw text of the transcription."),
|
|
39668
|
+
words: zod13.array(
|
|
39669
|
+
zod13.object({
|
|
39670
|
+
text: zod13.string().describe("The word or sound that was transcribed."),
|
|
39671
|
+
start: zod13.number().or(zod13.null()).optional().describe("The start time of the word or sound in seconds."),
|
|
39672
|
+
end: zod13.number().or(zod13.null()).optional().describe("The end time of the word or sound in seconds."),
|
|
39673
|
+
type: zod13.enum(["word", "spacing", "audio_event"]).describe(
|
|
39674
|
+
"The type of the word or sound. 'audio_event' is used for non-word sounds like laughter or footsteps."
|
|
39675
|
+
),
|
|
39676
|
+
speaker_id: zod13.string().or(zod13.null()).optional().describe("Unique identifier for the speaker of this word."),
|
|
39677
|
+
logprob: zod13.number().describe(
|
|
39678
|
+
"The log of the probability with which this word was predicted. Logprobs are in range [-infinity, 0], higher logprobs indicate a higher confidence the model has in its predictions."
|
|
39679
|
+
),
|
|
39680
|
+
characters: zod13.array(
|
|
39681
|
+
zod13.object({
|
|
39682
|
+
text: zod13.string().describe("The character that was transcribed."),
|
|
39683
|
+
start: zod13.number().or(zod13.null()).optional().describe("The start time of the character in seconds."),
|
|
39684
|
+
end: zod13.number().or(zod13.null()).optional().describe("The end time of the character in seconds.")
|
|
39685
|
+
})
|
|
39686
|
+
).or(zod13.null()).optional().describe(
|
|
39687
|
+
"The characters that make up the word and their timing information."
|
|
39688
|
+
)
|
|
39689
|
+
}).describe("Word-level detail of the transcription with timing information.")
|
|
39690
|
+
).describe("List of words with their timing information."),
|
|
39691
|
+
channel_index: zod13.number().or(zod13.null()).optional().describe(
|
|
39692
|
+
"The channel index this transcript belongs to (for multichannel audio)."
|
|
39693
|
+
),
|
|
39694
|
+
additional_formats: zod13.array(
|
|
39695
|
+
zod13.object({
|
|
39696
|
+
requested_format: zod13.string().describe("The requested format."),
|
|
39697
|
+
file_extension: zod13.string().describe("The file extension of the additional format."),
|
|
39698
|
+
content_type: zod13.string().describe("The content type of the additional format."),
|
|
39699
|
+
is_base64_encoded: zod13.boolean().describe("Whether the content is base64 encoded."),
|
|
39700
|
+
content: zod13.string().describe("The content of the additional format.")
|
|
39701
|
+
}).or(zod13.null())
|
|
39702
|
+
).or(zod13.null()).optional().describe("Requested additional formats of the transcript."),
|
|
39703
|
+
transcription_id: zod13.string().or(zod13.null()).optional().describe("The transcription ID of the response."),
|
|
39704
|
+
entities: zod13.array(
|
|
39705
|
+
zod13.object({
|
|
39706
|
+
text: zod13.string().describe("The text that was identified as an entity."),
|
|
39707
|
+
entity_type: zod13.string().describe(
|
|
39708
|
+
"The type of entity detected (e.g., 'credit_card', 'email_address', 'person_name')."
|
|
39709
|
+
),
|
|
39710
|
+
start_char: zod13.number().describe("Start character position in the transcript text."),
|
|
39711
|
+
end_char: zod13.number().describe("End character position in the transcript text.")
|
|
39712
|
+
})
|
|
39713
|
+
).or(zod13.null()).optional().describe(
|
|
39714
|
+
"List of detected entities with their text, type, and character positions in the transcript."
|
|
39715
|
+
)
|
|
39716
|
+
}).describe("Chunk-level detail of the transcription with timing information.")
|
|
39717
|
+
).describe(
|
|
39718
|
+
"List of transcripts, one for each audio channel. Each transcript contains the text and word-level details for its respective channel."
|
|
39719
|
+
),
|
|
39720
|
+
transcription_id: zod13.string().or(zod13.null()).optional().describe("The transcription ID of the response.")
|
|
39721
|
+
}).describe("Response model for multichannel speech-to-text transcription.")
|
|
39722
|
+
);
|
|
39723
|
+
var getTranscriptByIdParams = zod13.object({
|
|
39724
|
+
transcription_id: zod13.string().describe("The unique ID of the transcript to retrieve")
|
|
39725
|
+
});
|
|
39726
|
+
var getTranscriptByIdHeader = zod13.object({
|
|
39727
|
+
"xi-api-key": zod13.string().or(zod13.null()).optional().describe(
|
|
39728
|
+
"Your API key. This is required by most endpoints to access our API programmatically. You can view your xi-api-key using the 'Profile' tab on the website."
|
|
39729
|
+
)
|
|
39730
|
+
});
|
|
39731
|
+
// A single transcribed token (word, spacing, or audio event) with timing,
// speaker, confidence, and optional per-character detail.
var getTranscriptByIdWordSchema = zod13.object({
  text: zod13.string().describe("The word or sound that was transcribed."),
  start: zod13.number().or(zod13.null()).optional().describe("The start time of the word or sound in seconds."),
  end: zod13.number().or(zod13.null()).optional().describe("The end time of the word or sound in seconds."),
  type: zod13.enum(["word", "spacing", "audio_event"]).describe(
    "The type of the word or sound. 'audio_event' is used for non-word sounds like laughter or footsteps."
  ),
  speaker_id: zod13.string().or(zod13.null()).optional().describe("Unique identifier for the speaker of this word."),
  logprob: zod13.number().describe(
    "The log of the probability with which this word was predicted. Logprobs are in range [-infinity, 0], higher logprobs indicate a higher confidence the model has in its predictions."
  ),
  characters: zod13.array(
    zod13.object({
      text: zod13.string().describe("The character that was transcribed."),
      start: zod13.number().or(zod13.null()).optional().describe("The start time of the character in seconds."),
      end: zod13.number().or(zod13.null()).optional().describe("The end time of the character in seconds.")
    })
  ).or(zod13.null()).optional().describe("The characters that make up the word and their timing information.")
}).describe("Word-level detail of the transcription with timing information.");

// One transcript "chunk": the full single-channel transcription payload.
// Reused both as the single-channel response shape and as the per-channel
// element of the multichannel response, so it is defined once here.
var getTranscriptByIdChunkSchema = zod13.object({
  language_code: zod13.string().describe("The detected language code (e.g. 'eng' for English)."),
  language_probability: zod13.number().describe("The confidence score of the language detection (0 to 1)."),
  text: zod13.string().describe("The raw text of the transcription."),
  words: zod13.array(getTranscriptByIdWordSchema).describe("List of words with their timing information."),
  channel_index: zod13.number().or(zod13.null()).optional().describe("The channel index this transcript belongs to (for multichannel audio)."),
  additional_formats: zod13.array(
    zod13.object({
      requested_format: zod13.string().describe("The requested format."),
      file_extension: zod13.string().describe("The file extension of the additional format."),
      content_type: zod13.string().describe("The content type of the additional format."),
      is_base64_encoded: zod13.boolean().describe("Whether the content is base64 encoded."),
      content: zod13.string().describe("The content of the additional format.")
    }).or(zod13.null())
  ).or(zod13.null()).optional().describe("Requested additional formats of the transcript."),
  transcription_id: zod13.string().or(zod13.null()).optional().describe("The transcription ID of the response."),
  entities: zod13.array(
    zod13.object({
      text: zod13.string().describe("The text that was identified as an entity."),
      entity_type: zod13.string().describe(
        "The type of entity detected (e.g., 'credit_card', 'email_address', 'person_name')."
      ),
      start_char: zod13.number().describe("Start character position in the transcript text."),
      end_char: zod13.number().describe("End character position in the transcript text.")
    })
  ).or(zod13.null()).optional().describe(
    "List of detected entities with their text, type, and character positions in the transcript."
  )
}).describe("Chunk-level detail of the transcription with timing information.");

// Response for "get transcript by ID": either a single-channel transcript
// chunk, or a multichannel wrapper carrying one chunk per audio channel.
var getTranscriptByIdResponse = getTranscriptByIdChunkSchema.or(
  zod13.object({
    transcripts: zod13.array(getTranscriptByIdChunkSchema).describe(
      "List of transcripts, one for each audio channel. Each transcript contains the text and word-level details for its respective channel."
    ),
    transcription_id: zod13.string().or(zod13.null()).optional().describe("The transcription ID of the response.")
  }).describe("Response model for multichannel speech-to-text transcription.")
);
|
|
39841
|
+
// Path parameters for the ElevenLabs "delete transcript by ID" endpoint.
var deleteTranscriptByIdParams = zod13.object({
  transcription_id: zod13
    .string()
    .describe("The unique ID of the transcript to delete")
});
|
|
39844
|
+
// Request headers for the ElevenLabs "delete transcript by ID" endpoint.
// The API key is optional here because it may also be supplied by the client.
var deleteTranscriptByIdHeader = zod13.object({
  "xi-api-key": zod13
    .string()
    .or(zod13.null())
    .optional()
    .describe(
      "Your API key. This is required by most endpoints to access our API programmatically. You can view your xi-api-key using the 'Profile' tab on the website."
    )
});
|
|
39849
|
+
var deleteTranscriptByIdResponse = zod13.any();
|
|
38442
39850
|
export {
|
|
38443
39851
|
AllLanguageCodes,
|
|
38444
39852
|
AllProviders,
|
|
@@ -38493,6 +39901,13 @@ export {
|
|
|
38493
39901
|
DeepgramTranscriptionSchema,
|
|
38494
39902
|
schema_exports4 as DeepgramTypes,
|
|
38495
39903
|
deepgramAPISpecification_zod_exports as DeepgramZodSchemas,
|
|
39904
|
+
ElevenLabsAdapter,
|
|
39905
|
+
ElevenLabsCapabilities,
|
|
39906
|
+
ElevenLabsLanguageCodes,
|
|
39907
|
+
ElevenLabsLanguageLabels,
|
|
39908
|
+
ElevenLabsLanguages,
|
|
39909
|
+
schema_exports8 as ElevenLabsTypes,
|
|
39910
|
+
elevenLabsSpeechToTextAPI_zod_exports as ElevenLabsZodSchemas,
|
|
38496
39911
|
GladiaAdapter,
|
|
38497
39912
|
GladiaBitDepth,
|
|
38498
39913
|
GladiaCapabilities,
|
|
@@ -38579,6 +39994,7 @@ export {
|
|
|
38579
39994
|
createAssemblyAIAdapter,
|
|
38580
39995
|
createAzureSTTAdapter,
|
|
38581
39996
|
createDeepgramAdapter,
|
|
39997
|
+
createElevenLabsAdapter,
|
|
38582
39998
|
createGladiaAdapter,
|
|
38583
39999
|
createOpenAIWhisperAdapter,
|
|
38584
40000
|
createSonioxAdapter,
|