voice-router-dev 0.8.0 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +162 -0
- package/README.md +21 -2
- package/dist/constants.d.mts +577 -7
- package/dist/constants.d.ts +577 -7
- package/dist/constants.js +493 -1
- package/dist/constants.mjs +482 -1
- package/dist/{field-configs-CDeDcDz_.d.mts → field-configs-CDVygOte.d.mts} +5817 -5817
- package/dist/{field-configs-CDeDcDz_.d.ts → field-configs-CDVygOte.d.ts} +5817 -5817
- package/dist/field-configs.d.mts +1 -1
- package/dist/field-configs.d.ts +1 -1
- package/dist/index.d.mts +3558 -1757
- package/dist/index.d.ts +3558 -1757
- package/dist/index.js +1466 -91
- package/dist/index.mjs +1458 -91
- package/dist/{provider-metadata-BHbouRC9.d.mts → provider-metadata-BnkedpXm.d.mts} +34 -4
- package/dist/{provider-metadata-Dsk2PVud.d.ts → provider-metadata-DbsSGAO7.d.ts} +34 -4
- package/dist/provider-metadata.d.mts +2 -2
- package/dist/provider-metadata.d.ts +2 -2
- package/dist/provider-metadata.js +349 -6
- package/dist/provider-metadata.mjs +345 -6
- package/dist/{transcriptWebhookNotification-D1iE2_a4.d.ts → transcriptWebhookNotification-BJk1CEF5.d.ts} +712 -9
- package/dist/{transcriptWebhookNotification-Cz9RsK5D.d.mts → transcriptWebhookNotification-CNFpns9f.d.mts} +712 -9
- package/dist/webhooks.d.mts +102 -5
- package/dist/webhooks.d.ts +102 -5
- package/dist/webhooks.js +342 -39
- package/dist/webhooks.mjs +340 -39
- package/package.json +11 -5
package/dist/index.mjs
CHANGED
|
@@ -750,6 +750,7 @@ var SonioxLanguage = {
|
|
|
750
750
|
|
|
751
751
|
// src/generated/soniox/models.ts
|
|
752
752
|
var SonioxModels = [
|
|
753
|
+
{ id: "stt-rt-v4", name: "Speech-to-Text Real-time v4", mode: "real_time" },
|
|
753
754
|
{ id: "stt-rt-v3", name: "Speech-to-Text Real-time v3", mode: "real_time" },
|
|
754
755
|
{ id: "stt-async-v4", name: "Speech-to-Text Async v4", mode: "async" },
|
|
755
756
|
{ id: "stt-async-v3", name: "Speech-to-Text Async v3", mode: "async" },
|
|
@@ -760,6 +761,7 @@ var SonioxModels = [
|
|
|
760
761
|
{ id: "stt-async-preview-v1", name: "Speech-to-Text Async Preview v1", mode: "async", aliasOf: "stt-async-v3" }
|
|
761
762
|
];
|
|
762
763
|
var SonioxModelCodes = [
|
|
764
|
+
"stt-rt-v4",
|
|
763
765
|
"stt-rt-v3",
|
|
764
766
|
"stt-async-v4",
|
|
765
767
|
"stt-async-v3",
|
|
@@ -770,6 +772,7 @@ var SonioxModelCodes = [
|
|
|
770
772
|
"stt-async-preview-v1"
|
|
771
773
|
];
|
|
772
774
|
var SonioxModelLabels = {
|
|
775
|
+
"stt-rt-v4": "Speech-to-Text Real-time v4",
|
|
773
776
|
"stt-rt-v3": "Speech-to-Text Real-time v3",
|
|
774
777
|
"stt-async-v4": "Speech-to-Text Async v4",
|
|
775
778
|
"stt-async-v3": "Speech-to-Text Async v3",
|
|
@@ -780,6 +783,7 @@ var SonioxModelLabels = {
|
|
|
780
783
|
"stt-async-preview-v1": "Speech-to-Text Async Preview v1"
|
|
781
784
|
};
|
|
782
785
|
var SonioxModel = {
|
|
786
|
+
stt_rt_v4: "stt-rt-v4",
|
|
783
787
|
stt_rt_v3: "stt-rt-v3",
|
|
784
788
|
stt_async_v4: "stt-async-v4",
|
|
785
789
|
stt_async_v3: "stt-async-v3",
|
|
@@ -790,6 +794,7 @@ var SonioxModel = {
|
|
|
790
794
|
stt_async_preview_v1: "stt-async-preview-v1"
|
|
791
795
|
};
|
|
792
796
|
var SonioxRealtimeModel = {
|
|
797
|
+
stt_rt_v4: "stt-rt-v4",
|
|
793
798
|
stt_rt_v3: "stt-rt-v3",
|
|
794
799
|
stt_rt_preview: "stt-rt-preview",
|
|
795
800
|
stt_rt_v3_preview: "stt-rt-v3-preview",
|
|
@@ -806,6 +811,7 @@ var SonioxAsyncModel = {
|
|
|
806
811
|
var SpeechmaticsLanguages = [
|
|
807
812
|
{ code: "auto", name: "Automatic Detection" },
|
|
808
813
|
{ code: "ar", name: "Arabic" },
|
|
814
|
+
{ code: "ar_en", name: "Arabic / English" },
|
|
809
815
|
{ code: "ba", name: "Bashkir" },
|
|
810
816
|
{ code: "be", name: "Belarusian" },
|
|
811
817
|
{ code: "bg", name: "Bulgarian" },
|
|
@@ -870,6 +876,7 @@ var SpeechmaticsLanguages = [
|
|
|
870
876
|
var SpeechmaticsLanguageCodes = [
|
|
871
877
|
"auto",
|
|
872
878
|
"ar",
|
|
879
|
+
"ar_en",
|
|
873
880
|
"ba",
|
|
874
881
|
"be",
|
|
875
882
|
"bg",
|
|
@@ -934,6 +941,7 @@ var SpeechmaticsLanguageCodes = [
|
|
|
934
941
|
var SpeechmaticsLanguageLabels = {
|
|
935
942
|
"auto": "Automatic Detection",
|
|
936
943
|
"ar": "Arabic",
|
|
944
|
+
"ar_en": "Arabic / English",
|
|
937
945
|
"ba": "Bashkir",
|
|
938
946
|
"be": "Belarusian",
|
|
939
947
|
"bg": "Bulgarian",
|
|
@@ -998,6 +1006,7 @@ var SpeechmaticsLanguageLabels = {
|
|
|
998
1006
|
var SpeechmaticsLanguage = {
|
|
999
1007
|
"auto": "auto",
|
|
1000
1008
|
"ar": "ar",
|
|
1009
|
+
"ar_en": "ar_en",
|
|
1001
1010
|
"ba": "ba",
|
|
1002
1011
|
"be": "be",
|
|
1003
1012
|
"bg": "bg",
|
|
@@ -1197,7 +1206,9 @@ var AzureLocales = [
|
|
|
1197
1206
|
{ code: "sl-SI", name: "Slovenian (Slovenia)" },
|
|
1198
1207
|
{ code: "so-SO", name: "Somali (Somalia)" },
|
|
1199
1208
|
{ code: "sq-AL", name: "Albanian (Albania)" },
|
|
1209
|
+
{ code: "sr-ME", name: "Serbian (ME)" },
|
|
1200
1210
|
{ code: "sr-RS", name: "Serbian (Serbia)" },
|
|
1211
|
+
{ code: "sr-XK", name: "Serbian (XK)" },
|
|
1201
1212
|
{ code: "su-ID", name: "Sundanese (Indonesia)" },
|
|
1202
1213
|
{ code: "sv-SE", name: "Swedish (Sweden)" },
|
|
1203
1214
|
{ code: "sw-KE", name: "Swahili (Kenya)" },
|
|
@@ -1352,7 +1363,9 @@ var AzureLocaleCodes = [
|
|
|
1352
1363
|
"sl-SI",
|
|
1353
1364
|
"so-SO",
|
|
1354
1365
|
"sq-AL",
|
|
1366
|
+
"sr-ME",
|
|
1355
1367
|
"sr-RS",
|
|
1368
|
+
"sr-XK",
|
|
1356
1369
|
"su-ID",
|
|
1357
1370
|
"sv-SE",
|
|
1358
1371
|
"sw-KE",
|
|
@@ -1507,7 +1520,9 @@ var AzureLocaleLabels = {
|
|
|
1507
1520
|
"sl-SI": "Slovenian (Slovenia)",
|
|
1508
1521
|
"so-SO": "Somali (Somalia)",
|
|
1509
1522
|
"sq-AL": "Albanian (Albania)",
|
|
1523
|
+
"sr-ME": "Serbian (ME)",
|
|
1510
1524
|
"sr-RS": "Serbian (Serbia)",
|
|
1525
|
+
"sr-XK": "Serbian (XK)",
|
|
1511
1526
|
"su-ID": "Sundanese (Indonesia)",
|
|
1512
1527
|
"sv-SE": "Swedish (Sweden)",
|
|
1513
1528
|
"sw-KE": "Swahili (Kenya)",
|
|
@@ -1662,7 +1677,9 @@ var AzureLocale = {
|
|
|
1662
1677
|
"sl-SI": "sl-SI",
|
|
1663
1678
|
"so-SO": "so-SO",
|
|
1664
1679
|
"sq-AL": "sq-AL",
|
|
1680
|
+
"sr-ME": "sr-ME",
|
|
1665
1681
|
"sr-RS": "sr-RS",
|
|
1682
|
+
"sr-XK": "sr-XK",
|
|
1666
1683
|
"su-ID": "su-ID",
|
|
1667
1684
|
"sv-SE": "sv-SE",
|
|
1668
1685
|
"sw-KE": "sw-KE",
|
|
@@ -1688,6 +1705,311 @@ var AzureLocale = {
|
|
|
1688
1705
|
"zu-ZA": "zu-ZA"
|
|
1689
1706
|
};
|
|
1690
1707
|
|
|
1708
|
+
// src/generated/elevenlabs/languages.ts
|
|
1709
|
+
var ElevenLabsLanguages = [
|
|
1710
|
+
{ code: "en", name: "English" },
|
|
1711
|
+
{ code: "zh", name: "Chinese" },
|
|
1712
|
+
{ code: "de", name: "German" },
|
|
1713
|
+
{ code: "es", name: "Spanish" },
|
|
1714
|
+
{ code: "ru", name: "Russian" },
|
|
1715
|
+
{ code: "ko", name: "Korean" },
|
|
1716
|
+
{ code: "fr", name: "French" },
|
|
1717
|
+
{ code: "ja", name: "Japanese" },
|
|
1718
|
+
{ code: "pt", name: "Portuguese" },
|
|
1719
|
+
{ code: "tr", name: "Turkish" },
|
|
1720
|
+
{ code: "pl", name: "Polish" },
|
|
1721
|
+
{ code: "ca", name: "Catalan" },
|
|
1722
|
+
{ code: "nl", name: "Dutch" },
|
|
1723
|
+
{ code: "ar", name: "Arabic" },
|
|
1724
|
+
{ code: "sv", name: "Swedish" },
|
|
1725
|
+
{ code: "it", name: "Italian" },
|
|
1726
|
+
{ code: "id", name: "Indonesian" },
|
|
1727
|
+
{ code: "hi", name: "Hindi" },
|
|
1728
|
+
{ code: "fi", name: "Finnish" },
|
|
1729
|
+
{ code: "vi", name: "Vietnamese" },
|
|
1730
|
+
{ code: "he", name: "Hebrew" },
|
|
1731
|
+
{ code: "uk", name: "Ukrainian" },
|
|
1732
|
+
{ code: "el", name: "Greek" },
|
|
1733
|
+
{ code: "ms", name: "Malay" },
|
|
1734
|
+
{ code: "cs", name: "Czech" },
|
|
1735
|
+
{ code: "ro", name: "Romanian" },
|
|
1736
|
+
{ code: "da", name: "Danish" },
|
|
1737
|
+
{ code: "hu", name: "Hungarian" },
|
|
1738
|
+
{ code: "ta", name: "Tamil" },
|
|
1739
|
+
{ code: "no", name: "Norwegian" },
|
|
1740
|
+
{ code: "th", name: "Thai" },
|
|
1741
|
+
{ code: "ur", name: "Urdu" },
|
|
1742
|
+
{ code: "hr", name: "Croatian" },
|
|
1743
|
+
{ code: "bg", name: "Bulgarian" },
|
|
1744
|
+
{ code: "lt", name: "Lithuanian" },
|
|
1745
|
+
{ code: "la", name: "Latin" },
|
|
1746
|
+
{ code: "mi", name: "Maori" },
|
|
1747
|
+
{ code: "ml", name: "Malayalam" },
|
|
1748
|
+
{ code: "cy", name: "Welsh" },
|
|
1749
|
+
{ code: "sk", name: "Slovak" },
|
|
1750
|
+
{ code: "te", name: "Telugu" },
|
|
1751
|
+
{ code: "fa", name: "Persian" },
|
|
1752
|
+
{ code: "lv", name: "Latvian" },
|
|
1753
|
+
{ code: "bn", name: "Bengali" },
|
|
1754
|
+
{ code: "sr", name: "Serbian" },
|
|
1755
|
+
{ code: "az", name: "Azerbaijani" },
|
|
1756
|
+
{ code: "sl", name: "Slovenian" },
|
|
1757
|
+
{ code: "kn", name: "Kannada" },
|
|
1758
|
+
{ code: "et", name: "Estonian" },
|
|
1759
|
+
{ code: "mk", name: "Macedonian" },
|
|
1760
|
+
{ code: "br", name: "Breton" },
|
|
1761
|
+
{ code: "eu", name: "Basque" },
|
|
1762
|
+
{ code: "is", name: "Icelandic" },
|
|
1763
|
+
{ code: "hy", name: "Armenian" },
|
|
1764
|
+
{ code: "ne", name: "Nepali" },
|
|
1765
|
+
{ code: "mn", name: "Mongolian" },
|
|
1766
|
+
{ code: "bs", name: "Bosnian" },
|
|
1767
|
+
{ code: "kk", name: "Kazakh" },
|
|
1768
|
+
{ code: "sq", name: "Albanian" },
|
|
1769
|
+
{ code: "sw", name: "Swahili" },
|
|
1770
|
+
{ code: "gl", name: "Galician" },
|
|
1771
|
+
{ code: "mr", name: "Marathi" },
|
|
1772
|
+
{ code: "pa", name: "Punjabi" },
|
|
1773
|
+
{ code: "si", name: "Sinhala" },
|
|
1774
|
+
{ code: "km", name: "Khmer" },
|
|
1775
|
+
{ code: "sn", name: "Shona" },
|
|
1776
|
+
{ code: "yo", name: "Yoruba" },
|
|
1777
|
+
{ code: "so", name: "Somali" },
|
|
1778
|
+
{ code: "af", name: "Afrikaans" },
|
|
1779
|
+
{ code: "oc", name: "Occitan" },
|
|
1780
|
+
{ code: "ka", name: "Georgian" },
|
|
1781
|
+
{ code: "be", name: "Belarusian" },
|
|
1782
|
+
{ code: "tg", name: "Tajik" },
|
|
1783
|
+
{ code: "sd", name: "Sindhi" },
|
|
1784
|
+
{ code: "gu", name: "Gujarati" },
|
|
1785
|
+
{ code: "am", name: "Amharic" },
|
|
1786
|
+
{ code: "yi", name: "Yiddish" },
|
|
1787
|
+
{ code: "lo", name: "Lao" },
|
|
1788
|
+
{ code: "uz", name: "Uzbek" },
|
|
1789
|
+
{ code: "fo", name: "Faroese" },
|
|
1790
|
+
{ code: "ht", name: "Haitian Creole" },
|
|
1791
|
+
{ code: "ps", name: "Pashto" },
|
|
1792
|
+
{ code: "tk", name: "Turkmen" },
|
|
1793
|
+
{ code: "nn", name: "Norwegian Nynorsk" },
|
|
1794
|
+
{ code: "mt", name: "Maltese" },
|
|
1795
|
+
{ code: "sa", name: "Sanskrit" },
|
|
1796
|
+
{ code: "lb", name: "Luxembourgish" },
|
|
1797
|
+
{ code: "my", name: "Burmese" },
|
|
1798
|
+
{ code: "bo", name: "Tibetan" },
|
|
1799
|
+
{ code: "tl", name: "Tagalog" },
|
|
1800
|
+
{ code: "mg", name: "Malagasy" },
|
|
1801
|
+
{ code: "as", name: "Assamese" },
|
|
1802
|
+
{ code: "tt", name: "Tatar" },
|
|
1803
|
+
{ code: "haw", name: "Hawaiian" },
|
|
1804
|
+
{ code: "ln", name: "Lingala" },
|
|
1805
|
+
{ code: "ha", name: "Hausa" },
|
|
1806
|
+
{ code: "ba", name: "Bashkir" },
|
|
1807
|
+
{ code: "jw", name: "Javanese" },
|
|
1808
|
+
{ code: "su", name: "Sundanese" }
|
|
1809
|
+
];
|
|
1810
|
+
var ElevenLabsLanguageCodes = [
|
|
1811
|
+
"en",
|
|
1812
|
+
"zh",
|
|
1813
|
+
"de",
|
|
1814
|
+
"es",
|
|
1815
|
+
"ru",
|
|
1816
|
+
"ko",
|
|
1817
|
+
"fr",
|
|
1818
|
+
"ja",
|
|
1819
|
+
"pt",
|
|
1820
|
+
"tr",
|
|
1821
|
+
"pl",
|
|
1822
|
+
"ca",
|
|
1823
|
+
"nl",
|
|
1824
|
+
"ar",
|
|
1825
|
+
"sv",
|
|
1826
|
+
"it",
|
|
1827
|
+
"id",
|
|
1828
|
+
"hi",
|
|
1829
|
+
"fi",
|
|
1830
|
+
"vi",
|
|
1831
|
+
"he",
|
|
1832
|
+
"uk",
|
|
1833
|
+
"el",
|
|
1834
|
+
"ms",
|
|
1835
|
+
"cs",
|
|
1836
|
+
"ro",
|
|
1837
|
+
"da",
|
|
1838
|
+
"hu",
|
|
1839
|
+
"ta",
|
|
1840
|
+
"no",
|
|
1841
|
+
"th",
|
|
1842
|
+
"ur",
|
|
1843
|
+
"hr",
|
|
1844
|
+
"bg",
|
|
1845
|
+
"lt",
|
|
1846
|
+
"la",
|
|
1847
|
+
"mi",
|
|
1848
|
+
"ml",
|
|
1849
|
+
"cy",
|
|
1850
|
+
"sk",
|
|
1851
|
+
"te",
|
|
1852
|
+
"fa",
|
|
1853
|
+
"lv",
|
|
1854
|
+
"bn",
|
|
1855
|
+
"sr",
|
|
1856
|
+
"az",
|
|
1857
|
+
"sl",
|
|
1858
|
+
"kn",
|
|
1859
|
+
"et",
|
|
1860
|
+
"mk",
|
|
1861
|
+
"br",
|
|
1862
|
+
"eu",
|
|
1863
|
+
"is",
|
|
1864
|
+
"hy",
|
|
1865
|
+
"ne",
|
|
1866
|
+
"mn",
|
|
1867
|
+
"bs",
|
|
1868
|
+
"kk",
|
|
1869
|
+
"sq",
|
|
1870
|
+
"sw",
|
|
1871
|
+
"gl",
|
|
1872
|
+
"mr",
|
|
1873
|
+
"pa",
|
|
1874
|
+
"si",
|
|
1875
|
+
"km",
|
|
1876
|
+
"sn",
|
|
1877
|
+
"yo",
|
|
1878
|
+
"so",
|
|
1879
|
+
"af",
|
|
1880
|
+
"oc",
|
|
1881
|
+
"ka",
|
|
1882
|
+
"be",
|
|
1883
|
+
"tg",
|
|
1884
|
+
"sd",
|
|
1885
|
+
"gu",
|
|
1886
|
+
"am",
|
|
1887
|
+
"yi",
|
|
1888
|
+
"lo",
|
|
1889
|
+
"uz",
|
|
1890
|
+
"fo",
|
|
1891
|
+
"ht",
|
|
1892
|
+
"ps",
|
|
1893
|
+
"tk",
|
|
1894
|
+
"nn",
|
|
1895
|
+
"mt",
|
|
1896
|
+
"sa",
|
|
1897
|
+
"lb",
|
|
1898
|
+
"my",
|
|
1899
|
+
"bo",
|
|
1900
|
+
"tl",
|
|
1901
|
+
"mg",
|
|
1902
|
+
"as",
|
|
1903
|
+
"tt",
|
|
1904
|
+
"haw",
|
|
1905
|
+
"ln",
|
|
1906
|
+
"ha",
|
|
1907
|
+
"ba",
|
|
1908
|
+
"jw",
|
|
1909
|
+
"su"
|
|
1910
|
+
];
|
|
1911
|
+
var ElevenLabsLanguageLabels = {
|
|
1912
|
+
en: "English",
|
|
1913
|
+
zh: "Chinese",
|
|
1914
|
+
de: "German",
|
|
1915
|
+
es: "Spanish",
|
|
1916
|
+
ru: "Russian",
|
|
1917
|
+
ko: "Korean",
|
|
1918
|
+
fr: "French",
|
|
1919
|
+
ja: "Japanese",
|
|
1920
|
+
pt: "Portuguese",
|
|
1921
|
+
tr: "Turkish",
|
|
1922
|
+
pl: "Polish",
|
|
1923
|
+
ca: "Catalan",
|
|
1924
|
+
nl: "Dutch",
|
|
1925
|
+
ar: "Arabic",
|
|
1926
|
+
sv: "Swedish",
|
|
1927
|
+
it: "Italian",
|
|
1928
|
+
id: "Indonesian",
|
|
1929
|
+
hi: "Hindi",
|
|
1930
|
+
fi: "Finnish",
|
|
1931
|
+
vi: "Vietnamese",
|
|
1932
|
+
he: "Hebrew",
|
|
1933
|
+
uk: "Ukrainian",
|
|
1934
|
+
el: "Greek",
|
|
1935
|
+
ms: "Malay",
|
|
1936
|
+
cs: "Czech",
|
|
1937
|
+
ro: "Romanian",
|
|
1938
|
+
da: "Danish",
|
|
1939
|
+
hu: "Hungarian",
|
|
1940
|
+
ta: "Tamil",
|
|
1941
|
+
no: "Norwegian",
|
|
1942
|
+
th: "Thai",
|
|
1943
|
+
ur: "Urdu",
|
|
1944
|
+
hr: "Croatian",
|
|
1945
|
+
bg: "Bulgarian",
|
|
1946
|
+
lt: "Lithuanian",
|
|
1947
|
+
la: "Latin",
|
|
1948
|
+
mi: "Maori",
|
|
1949
|
+
ml: "Malayalam",
|
|
1950
|
+
cy: "Welsh",
|
|
1951
|
+
sk: "Slovak",
|
|
1952
|
+
te: "Telugu",
|
|
1953
|
+
fa: "Persian",
|
|
1954
|
+
lv: "Latvian",
|
|
1955
|
+
bn: "Bengali",
|
|
1956
|
+
sr: "Serbian",
|
|
1957
|
+
az: "Azerbaijani",
|
|
1958
|
+
sl: "Slovenian",
|
|
1959
|
+
kn: "Kannada",
|
|
1960
|
+
et: "Estonian",
|
|
1961
|
+
mk: "Macedonian",
|
|
1962
|
+
br: "Breton",
|
|
1963
|
+
eu: "Basque",
|
|
1964
|
+
is: "Icelandic",
|
|
1965
|
+
hy: "Armenian",
|
|
1966
|
+
ne: "Nepali",
|
|
1967
|
+
mn: "Mongolian",
|
|
1968
|
+
bs: "Bosnian",
|
|
1969
|
+
kk: "Kazakh",
|
|
1970
|
+
sq: "Albanian",
|
|
1971
|
+
sw: "Swahili",
|
|
1972
|
+
gl: "Galician",
|
|
1973
|
+
mr: "Marathi",
|
|
1974
|
+
pa: "Punjabi",
|
|
1975
|
+
si: "Sinhala",
|
|
1976
|
+
km: "Khmer",
|
|
1977
|
+
sn: "Shona",
|
|
1978
|
+
yo: "Yoruba",
|
|
1979
|
+
so: "Somali",
|
|
1980
|
+
af: "Afrikaans",
|
|
1981
|
+
oc: "Occitan",
|
|
1982
|
+
ka: "Georgian",
|
|
1983
|
+
be: "Belarusian",
|
|
1984
|
+
tg: "Tajik",
|
|
1985
|
+
sd: "Sindhi",
|
|
1986
|
+
gu: "Gujarati",
|
|
1987
|
+
am: "Amharic",
|
|
1988
|
+
yi: "Yiddish",
|
|
1989
|
+
lo: "Lao",
|
|
1990
|
+
uz: "Uzbek",
|
|
1991
|
+
fo: "Faroese",
|
|
1992
|
+
ht: "Haitian Creole",
|
|
1993
|
+
ps: "Pashto",
|
|
1994
|
+
tk: "Turkmen",
|
|
1995
|
+
nn: "Norwegian Nynorsk",
|
|
1996
|
+
mt: "Maltese",
|
|
1997
|
+
sa: "Sanskrit",
|
|
1998
|
+
lb: "Luxembourgish",
|
|
1999
|
+
my: "Burmese",
|
|
2000
|
+
bo: "Tibetan",
|
|
2001
|
+
tl: "Tagalog",
|
|
2002
|
+
mg: "Malagasy",
|
|
2003
|
+
as: "Assamese",
|
|
2004
|
+
tt: "Tatar",
|
|
2005
|
+
haw: "Hawaiian",
|
|
2006
|
+
ln: "Lingala",
|
|
2007
|
+
ha: "Hausa",
|
|
2008
|
+
ba: "Bashkir",
|
|
2009
|
+
jw: "Javanese",
|
|
2010
|
+
su: "Sundanese"
|
|
2011
|
+
};
|
|
2012
|
+
|
|
1691
2013
|
// src/generated/gladia/schema/streamingSupportedBitDepthEnum.ts
|
|
1692
2014
|
var StreamingSupportedBitDepthEnum = {
|
|
1693
2015
|
NUMBER_8: 8,
|
|
@@ -2229,6 +2551,16 @@ var DeepgramSampleRate = {
|
|
|
2229
2551
|
NUMBER_44100: 44100,
|
|
2230
2552
|
NUMBER_48000: 48e3
|
|
2231
2553
|
};
|
|
2554
|
+
/**
 * Region identifiers understood by the ElevenLabs adapter.
 * Every key maps to the identical string value.
 */
var ElevenLabsRegion = {
  // Global endpoint (default)
  "global": "global",
  // United States
  "us": "us",
  // European Union
  "eu": "eu",
  // India
  "in": "in"
};
|
|
2232
2564
|
var GladiaEncoding = StreamingSupportedEncodingEnum;
|
|
2233
2565
|
var GladiaSampleRate = StreamingSupportedSampleRateEnum;
|
|
2234
2566
|
var GladiaBitDepth = StreamingSupportedBitDepthEnum;
|
|
@@ -2647,6 +2979,70 @@ function extractWords(words, mapper) {
|
|
|
2647
2979
|
const normalizedWords = words.map(mapper);
|
|
2648
2980
|
return normalizedWords.length > 0 ? normalizedWords : void 0;
|
|
2649
2981
|
}
|
|
2982
|
+
/**
 * Group consecutive words that share the same speaker into utterances.
 *
 * Words whose `speaker` is falsy are skipped entirely: they are not added to
 * any utterance and they do not interrupt the run of the current speaker.
 *
 * @param {Array<object>} words - Word objects exposing `word`, `start`, `end`
 *   and `speaker`. A null/undefined list is treated as empty.
 * @returns {Array<object>} One `{ text, start, end, speaker, words }` entry per
 *   speaker run, in input order. `text` is the run's `word` values joined by a
 *   single space, `start` is the first word's `start`, `end` is the last
 *   word's `end`, and `words` holds the original word objects of the run.
 */
function buildUtterancesFromWords(words) {
  const utterances = [];
  let currentSpeaker;
  let currentWords = [];

  // Emit the run collected so far; a no-op before the first speaker is seen.
  const flushRun = () => {
    if (!currentSpeaker || currentWords.length === 0) return;
    utterances.push({
      text: currentWords.map((w) => w.word).join(" "),
      start: currentWords[0].start,
      end: currentWords[currentWords.length - 1].end,
      speaker: currentSpeaker,
      words: currentWords
    });
  };

  for (const word of words ?? []) {
    if (!word.speaker) continue;
    if (word.speaker !== currentSpeaker) {
      // Speaker changed: close the previous run and start a new one.
      flushRun();
      currentSpeaker = word.speaker;
      currentWords = [word];
    } else {
      currentWords.push(word);
    }
  }

  // The final run has no following speaker change to trigger its emission.
  flushRun();
  return utterances;
}
|
|
3017
|
+
/**
 * Assemble a plain-text transcript from Speechmatics result items.
 *
 * Only items of type "word" or "punctuation" that carry non-empty content in
 * their first alternative contribute to the output. Items are separated by a
 * single space unless punctuation attachment says otherwise:
 *   - `attaches_to` "previous" or "both": no space before the mark;
 *   - `attaches_to` "next" or "both": no space after the mark.
 * Punctuation with any other `attaches_to` value is spaced like a word.
 *
 * @param {Array<object>} results - Result items; each is read for `type`,
 *   `attaches_to` and `alternatives[0].content`. A null/undefined list is
 *   treated as empty.
 * @returns {string} The joined transcript text ("" when nothing contributes).
 */
function buildTextFromSpeechmaticsResults(results) {
  let text = "";
  // True when the previously emitted mark glues onto whatever comes next.
  let attachNext = false;

  for (const result of results ?? []) {
    const isPunctuation = result.type === "punctuation";
    if (!isPunctuation && result.type !== "word") continue;

    const content = result.alternatives?.[0]?.content;
    if (!content) continue;

    // Words never carry attachment semantics; only punctuation is consulted.
    const attaches = isPunctuation ? result.attaches_to : void 0;
    const gluesToPrevious = attaches === "previous" || attaches === "both";

    // One rule for every item: separate with a space unless this item glues
    // backwards or the previous mark glued forwards. This also covers a
    // "next"-attaching mark that directly follows another forward-gluing mark.
    if (text.length > 0 && !gluesToPrevious && !attachNext) text += " ";
    text += content;

    attachNext = attaches === "next" || attaches === "both";
  }

  return text;
}
|
|
2650
3046
|
var STATUS_MAPPINGS = {
|
|
2651
3047
|
gladia: {
|
|
2652
3048
|
queued: "queued",
|
|
@@ -6263,7 +6659,7 @@ var DeepgramAdapter = class extends BaseAdapter {
|
|
|
6263
6659
|
start: w.start || 0,
|
|
6264
6660
|
end: w.end || 0,
|
|
6265
6661
|
confidence: w.confidence
|
|
6266
|
-
}))
|
|
6662
|
+
})) ?? []
|
|
6267
6663
|
}));
|
|
6268
6664
|
}
|
|
6269
6665
|
/**
|
|
@@ -6672,7 +7068,7 @@ var DeepgramAdapter = class extends BaseAdapter {
|
|
|
6672
7068
|
start: w.start,
|
|
6673
7069
|
end: w.end,
|
|
6674
7070
|
confidence: w.confidence
|
|
6675
|
-
}))
|
|
7071
|
+
})) ?? []
|
|
6676
7072
|
});
|
|
6677
7073
|
}
|
|
6678
7074
|
break;
|
|
@@ -7911,7 +8307,8 @@ var OpenAIWhisperAdapter = class extends BaseAdapter {
|
|
|
7911
8307
|
callbacks?.onUtterance?.({
|
|
7912
8308
|
text: transcription.transcript,
|
|
7913
8309
|
start: 0,
|
|
7914
|
-
end: 0
|
|
8310
|
+
end: 0,
|
|
8311
|
+
words: []
|
|
7915
8312
|
});
|
|
7916
8313
|
break;
|
|
7917
8314
|
}
|
|
@@ -7974,7 +8371,8 @@ var OpenAIWhisperAdapter = class extends BaseAdapter {
|
|
|
7974
8371
|
text: segment.text,
|
|
7975
8372
|
start: segment.start,
|
|
7976
8373
|
end: segment.end,
|
|
7977
|
-
confidence: void 0
|
|
8374
|
+
confidence: void 0,
|
|
8375
|
+
words: []
|
|
7978
8376
|
}));
|
|
7979
8377
|
const requestId2 = `openai-${Date.now()}`;
|
|
7980
8378
|
return {
|
|
@@ -8340,7 +8738,7 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
|
|
|
8340
8738
|
* Normalize Speechmatics response to unified format
|
|
8341
8739
|
*/
|
|
8342
8740
|
normalizeResponse(response) {
|
|
8343
|
-
const text = response.results
|
|
8741
|
+
const text = buildTextFromSpeechmaticsResults(response.results);
|
|
8344
8742
|
const words = response.results.filter((r) => r.type === "word" && r.start_time !== void 0 && r.end_time !== void 0).map((result) => ({
|
|
8345
8743
|
word: result.alternatives?.[0]?.content || "",
|
|
8346
8744
|
start: result.start_time,
|
|
@@ -8349,51 +8747,14 @@ var SpeechmaticsAdapter = class extends BaseAdapter {
|
|
|
8349
8747
|
speaker: result.alternatives?.[0]?.speaker
|
|
8350
8748
|
}));
|
|
8351
8749
|
const speakerSet = /* @__PURE__ */ new Set();
|
|
8352
|
-
|
|
8353
|
-
if (
|
|
8354
|
-
const speaker = r.alternatives[0]?.speaker;
|
|
8355
|
-
if (speaker) speakerSet.add(speaker);
|
|
8356
|
-
}
|
|
8750
|
+
words.forEach((w) => {
|
|
8751
|
+
if (w.speaker) speakerSet.add(w.speaker);
|
|
8357
8752
|
});
|
|
8358
8753
|
const speakers = speakerSet.size > 0 ? Array.from(speakerSet).map((id) => ({
|
|
8359
8754
|
id,
|
|
8360
8755
|
label: `Speaker ${id}`
|
|
8361
8756
|
})) : void 0;
|
|
8362
|
-
const utterances =
|
|
8363
|
-
if (speakers) {
|
|
8364
|
-
let currentSpeaker;
|
|
8365
|
-
let currentUtterance = [];
|
|
8366
|
-
let utteranceStart = 0;
|
|
8367
|
-
response.results.filter((r) => r.type === "word" && r.alternatives).forEach((result, idx) => {
|
|
8368
|
-
const speaker = result.alternatives[0]?.speaker;
|
|
8369
|
-
const word = result.alternatives[0]?.content || "";
|
|
8370
|
-
if (speaker !== currentSpeaker) {
|
|
8371
|
-
if (currentSpeaker && currentUtterance.length > 0) {
|
|
8372
|
-
const prevResult = response.results.filter((r) => r.type === "word")[idx - 1];
|
|
8373
|
-
utterances.push({
|
|
8374
|
-
speaker: currentSpeaker,
|
|
8375
|
-
text: currentUtterance.join(" "),
|
|
8376
|
-
start: utteranceStart || 0,
|
|
8377
|
-
end: prevResult?.end_time || result.start_time || 0
|
|
8378
|
-
});
|
|
8379
|
-
}
|
|
8380
|
-
currentSpeaker = speaker;
|
|
8381
|
-
currentUtterance = [word];
|
|
8382
|
-
utteranceStart = result.start_time || 0;
|
|
8383
|
-
} else {
|
|
8384
|
-
currentUtterance.push(word);
|
|
8385
|
-
}
|
|
8386
|
-
});
|
|
8387
|
-
if (currentSpeaker && currentUtterance.length > 0) {
|
|
8388
|
-
const lastWord = response.results.filter((r) => r.type === "word").pop();
|
|
8389
|
-
utterances.push({
|
|
8390
|
-
speaker: currentSpeaker,
|
|
8391
|
-
text: currentUtterance.join(" "),
|
|
8392
|
-
start: utteranceStart,
|
|
8393
|
-
end: lastWord?.end_time || utteranceStart
|
|
8394
|
-
});
|
|
8395
|
-
}
|
|
8396
|
-
}
|
|
8757
|
+
const utterances = buildUtterancesFromWords(words);
|
|
8397
8758
|
return {
|
|
8398
8759
|
success: true,
|
|
8399
8760
|
provider: this.name,
|
|
@@ -8915,45 +9276,14 @@ var SonioxAdapter = class extends BaseAdapter {
|
|
|
8915
9276
|
* Build utterances from tokens based on speaker changes
|
|
8916
9277
|
*/
|
|
8917
9278
|
buildUtterancesFromTokens(tokens) {
|
|
8918
|
-
const
|
|
8919
|
-
|
|
8920
|
-
|
|
8921
|
-
|
|
8922
|
-
|
|
8923
|
-
|
|
8924
|
-
|
|
8925
|
-
|
|
8926
|
-
end: token.end_ms ? token.end_ms / 1e3 : 0,
|
|
8927
|
-
confidence: token.confidence,
|
|
8928
|
-
speaker: token.speaker
|
|
8929
|
-
};
|
|
8930
|
-
if (token.speaker !== currentSpeaker) {
|
|
8931
|
-
if (currentSpeaker && currentWords.length > 0) {
|
|
8932
|
-
utterances.push({
|
|
8933
|
-
text: currentWords.map((w) => w.word).join(" "),
|
|
8934
|
-
start: utteranceStart,
|
|
8935
|
-
end: currentWords[currentWords.length - 1].end,
|
|
8936
|
-
speaker: currentSpeaker,
|
|
8937
|
-
words: currentWords
|
|
8938
|
-
});
|
|
8939
|
-
}
|
|
8940
|
-
currentSpeaker = token.speaker;
|
|
8941
|
-
currentWords = [word];
|
|
8942
|
-
utteranceStart = word.start;
|
|
8943
|
-
} else {
|
|
8944
|
-
currentWords.push(word);
|
|
8945
|
-
}
|
|
8946
|
-
}
|
|
8947
|
-
if (currentSpeaker && currentWords.length > 0) {
|
|
8948
|
-
utterances.push({
|
|
8949
|
-
text: currentWords.map((w) => w.word).join(" "),
|
|
8950
|
-
start: utteranceStart,
|
|
8951
|
-
end: currentWords[currentWords.length - 1].end,
|
|
8952
|
-
speaker: currentSpeaker,
|
|
8953
|
-
words: currentWords
|
|
8954
|
-
});
|
|
8955
|
-
}
|
|
8956
|
-
return utterances;
|
|
9279
|
+
const words = tokens.map((token) => ({
|
|
9280
|
+
word: token.text,
|
|
9281
|
+
start: token.start_ms ? token.start_ms / 1e3 : 0,
|
|
9282
|
+
end: token.end_ms ? token.end_ms / 1e3 : 0,
|
|
9283
|
+
confidence: token.confidence,
|
|
9284
|
+
speaker: token.speaker
|
|
9285
|
+
}));
|
|
9286
|
+
return buildUtterancesFromWords(words);
|
|
8957
9287
|
}
|
|
8958
9288
|
/**
|
|
8959
9289
|
* Normalize Soniox response to unified format
|
|
@@ -8977,7 +9307,7 @@ var SonioxAdapter = class extends BaseAdapter {
|
|
|
8977
9307
|
id,
|
|
8978
9308
|
label: `Speaker ${id}`
|
|
8979
9309
|
})) : void 0;
|
|
8980
|
-
const utterances = response.tokens ? this.buildUtterancesFromTokens(response.tokens) : [];
|
|
9310
|
+
const utterances = response.tokens ? this.buildUtterancesFromTokens(response.tokens.filter((t) => t.is_final)) : [];
|
|
8981
9311
|
const language = response.tokens?.find((t) => t.language)?.language;
|
|
8982
9312
|
return {
|
|
8983
9313
|
success: true,
|
|
@@ -9006,6 +9336,501 @@ function createSonioxAdapter(config) {
|
|
|
9006
9336
|
return adapter;
|
|
9007
9337
|
}
|
|
9008
9338
|
|
|
9339
|
+
// src/adapters/elevenlabs-adapter.ts
|
|
9340
|
+
import axios10 from "axios";
|
|
9341
|
+
var ElevenLabsAdapter = class extends BaseAdapter {
|
|
9342
|
+
constructor() {
|
|
9343
|
+
super(...arguments);
|
|
9344
|
+
this.name = "elevenlabs";
|
|
9345
|
+
this.capabilities = {
|
|
9346
|
+
streaming: true,
|
|
9347
|
+
diarization: true,
|
|
9348
|
+
wordTimestamps: true,
|
|
9349
|
+
languageDetection: true,
|
|
9350
|
+
customVocabulary: true,
|
|
9351
|
+
summarization: false,
|
|
9352
|
+
sentimentAnalysis: false,
|
|
9353
|
+
entityDetection: true,
|
|
9354
|
+
piiRedaction: true,
|
|
9355
|
+
listTranscripts: false,
|
|
9356
|
+
deleteTranscript: false
|
|
9357
|
+
};
|
|
9358
|
+
this.region = ElevenLabsRegion.global;
|
|
9359
|
+
this.defaultModel = "scribe_v2";
|
|
9360
|
+
}
|
|
9361
|
+
/**
|
|
9362
|
+
* Get regional API host based on configured region
|
|
9363
|
+
*/
|
|
9364
|
+
getRegionalHost() {
|
|
9365
|
+
switch (this.region) {
|
|
9366
|
+
case ElevenLabsRegion.us:
|
|
9367
|
+
return "api.us.elevenlabs.io";
|
|
9368
|
+
case ElevenLabsRegion.eu:
|
|
9369
|
+
return "api.eu.residency.elevenlabs.io";
|
|
9370
|
+
case ElevenLabsRegion.in:
|
|
9371
|
+
return "api.in.residency.elevenlabs.io";
|
|
9372
|
+
case ElevenLabsRegion.global:
|
|
9373
|
+
default:
|
|
9374
|
+
return "api.elevenlabs.io";
|
|
9375
|
+
}
|
|
9376
|
+
}
|
|
9377
|
+
/**
|
|
9378
|
+
* Get the base URL for API requests
|
|
9379
|
+
*/
|
|
9380
|
+
get baseUrl() {
|
|
9381
|
+
if (this.config?.baseUrl) return this.config.baseUrl;
|
|
9382
|
+
return `https://${this.getRegionalHost()}`;
|
|
9383
|
+
}
|
|
9384
|
+
initialize(config) {
|
|
9385
|
+
super.initialize(config);
|
|
9386
|
+
if (config.region) {
|
|
9387
|
+
this.region = config.region;
|
|
9388
|
+
}
|
|
9389
|
+
if (config.model) {
|
|
9390
|
+
this.defaultModel = config.model;
|
|
9391
|
+
}
|
|
9392
|
+
this.client = axios10.create({
|
|
9393
|
+
baseURL: this.baseUrl,
|
|
9394
|
+
timeout: config.timeout || 12e4,
|
|
9395
|
+
headers: {
|
|
9396
|
+
"xi-api-key": config.apiKey,
|
|
9397
|
+
...config.headers
|
|
9398
|
+
}
|
|
9399
|
+
});
|
|
9400
|
+
}
|
|
9401
|
+
/**
|
|
9402
|
+
* Get current region
|
|
9403
|
+
*/
|
|
9404
|
+
getRegion() {
|
|
9405
|
+
return this.region;
|
|
9406
|
+
}
|
|
9407
|
+
/**
|
|
9408
|
+
* Set regional endpoint
|
|
9409
|
+
*/
|
|
9410
|
+
setRegion(region) {
|
|
9411
|
+
this.region = region;
|
|
9412
|
+
if (this.config?.apiKey) {
|
|
9413
|
+
this.client = axios10.create({
|
|
9414
|
+
baseURL: this.baseUrl,
|
|
9415
|
+
timeout: this.config.timeout || 12e4,
|
|
9416
|
+
headers: {
|
|
9417
|
+
"xi-api-key": this.config.apiKey,
|
|
9418
|
+
...this.config.headers
|
|
9419
|
+
}
|
|
9420
|
+
});
|
|
9421
|
+
}
|
|
9422
|
+
}
|
|
9423
|
+
/**
|
|
9424
|
+
* Submit audio for transcription
|
|
9425
|
+
*
|
|
9426
|
+
* ElevenLabs batch is synchronous - the API returns the result directly.
|
|
9427
|
+
*/
|
|
9428
|
+
async transcribe(audio, options) {
|
|
9429
|
+
this.validateConfig();
|
|
9430
|
+
try {
|
|
9431
|
+
const formData = new FormData();
|
|
9432
|
+
const modelId = options?.model || this.defaultModel;
|
|
9433
|
+
formData.append("model_id", modelId);
|
|
9434
|
+
if (audio.type === "url") {
|
|
9435
|
+
formData.append("cloud_storage_url", audio.url);
|
|
9436
|
+
} else if (audio.type === "file") {
|
|
9437
|
+
const audioBlob = audio.file instanceof Blob ? audio.file : new Blob([audio.file], { type: audio.mimeType || "audio/wav" });
|
|
9438
|
+
formData.append("file", audioBlob, audio.filename || "audio.wav");
|
|
9439
|
+
} else {
|
|
9440
|
+
return {
|
|
9441
|
+
success: false,
|
|
9442
|
+
provider: this.name,
|
|
9443
|
+
error: {
|
|
9444
|
+
code: "INVALID_INPUT",
|
|
9445
|
+
message: "ElevenLabs only supports URL and File audio input"
|
|
9446
|
+
}
|
|
9447
|
+
};
|
|
9448
|
+
}
|
|
9449
|
+
if (options?.language) {
|
|
9450
|
+
formData.append("language_code", options.language);
|
|
9451
|
+
}
|
|
9452
|
+
if (options?.diarization) {
|
|
9453
|
+
formData.append("diarize", "true");
|
|
9454
|
+
}
|
|
9455
|
+
formData.append("timestamps_granularity", "word");
|
|
9456
|
+
if (options?.speakersExpected) {
|
|
9457
|
+
formData.append("num_speakers", String(options.speakersExpected));
|
|
9458
|
+
}
|
|
9459
|
+
if (options?.customVocabulary && options.customVocabulary.length > 0) {
|
|
9460
|
+
for (const term of options.customVocabulary) {
|
|
9461
|
+
formData.append("keyterms", term);
|
|
9462
|
+
}
|
|
9463
|
+
}
|
|
9464
|
+
if (options?.entityDetection) {
|
|
9465
|
+
formData.append("entity_detection", "all");
|
|
9466
|
+
}
|
|
9467
|
+
const elevenlabsOpts = options?.elevenlabs;
|
|
9468
|
+
if (elevenlabsOpts) {
|
|
9469
|
+
for (const [key, value] of Object.entries(elevenlabsOpts)) {
|
|
9470
|
+
if (value === void 0 || value === null) continue;
|
|
9471
|
+
if (formData.has(key)) continue;
|
|
9472
|
+
if (typeof value === "boolean") {
|
|
9473
|
+
formData.append(key, String(value));
|
|
9474
|
+
} else if (Array.isArray(value)) {
|
|
9475
|
+
for (const item of value) {
|
|
9476
|
+
formData.append(key, typeof item === "object" ? JSON.stringify(item) : String(item));
|
|
9477
|
+
}
|
|
9478
|
+
} else if (typeof value === "object") {
|
|
9479
|
+
formData.append(key, JSON.stringify(value));
|
|
9480
|
+
} else {
|
|
9481
|
+
formData.append(key, String(value));
|
|
9482
|
+
}
|
|
9483
|
+
}
|
|
9484
|
+
}
|
|
9485
|
+
const response = await this.client.post("/v1/speech-to-text", formData, {
|
|
9486
|
+
headers: {
|
|
9487
|
+
"Content-Type": "multipart/form-data"
|
|
9488
|
+
}
|
|
9489
|
+
});
|
|
9490
|
+
return this.normalizeResponse(response.data);
|
|
9491
|
+
} catch (error) {
|
|
9492
|
+
return this.createErrorResponse(error);
|
|
9493
|
+
}
|
|
9494
|
+
}
|
|
9495
|
+
/**
|
|
9496
|
+
* Get transcription result by ID
|
|
9497
|
+
*
|
|
9498
|
+
* ElevenLabs batch is synchronous, but supports transcript retrieval.
|
|
9499
|
+
*/
|
|
9500
|
+
async getTranscript(transcriptId) {
|
|
9501
|
+
this.validateConfig();
|
|
9502
|
+
try {
|
|
9503
|
+
const response = await this.client.get(`/v1/speech-to-text/transcripts/${transcriptId}`);
|
|
9504
|
+
return this.normalizeResponse(response.data);
|
|
9505
|
+
} catch (error) {
|
|
9506
|
+
return this.createErrorResponse(error);
|
|
9507
|
+
}
|
|
9508
|
+
}
|
|
9509
|
+
/**
|
|
9510
|
+
* Stream audio for real-time transcription
|
|
9511
|
+
*
|
|
9512
|
+
* Creates a WebSocket connection to ElevenLabs realtime STT endpoint.
|
|
9513
|
+
* Audio is sent as base64-encoded JSON messages.
|
|
9514
|
+
*/
|
|
9515
|
+
async transcribeStream(options, callbacks) {
|
|
9516
|
+
this.validateConfig();
|
|
9517
|
+
const sessionId = `elevenlabs_${Date.now()}_${Math.random().toString(36).substring(7)}`;
|
|
9518
|
+
const createdAt = /* @__PURE__ */ new Date();
|
|
9519
|
+
const wsBase = this.config?.wsBaseUrl || (this.config?.baseUrl ? this.deriveWsUrl(this.config.baseUrl) : `wss://${this.getRegionalHost()}`);
|
|
9520
|
+
const wsUrl = new URL(`${wsBase}/v1/speech-to-text/realtime`);
|
|
9521
|
+
const elOpts = options?.elevenlabsStreaming;
|
|
9522
|
+
const modelId = elOpts?.model || "scribe_v2_realtime";
|
|
9523
|
+
wsUrl.searchParams.set("model_id", modelId);
|
|
9524
|
+
const audioFormat = elOpts?.audioFormat || "pcm_16000";
|
|
9525
|
+
wsUrl.searchParams.set("audio_format", audioFormat);
|
|
9526
|
+
const langCode = elOpts?.languageCode || options?.language;
|
|
9527
|
+
if (langCode) {
|
|
9528
|
+
wsUrl.searchParams.set("language_code", langCode);
|
|
9529
|
+
}
|
|
9530
|
+
if (elOpts?.includeTimestamps !== void 0) {
|
|
9531
|
+
wsUrl.searchParams.set("include_timestamps", String(elOpts.includeTimestamps));
|
|
9532
|
+
}
|
|
9533
|
+
if (elOpts?.includeLanguageDetection || options?.languageDetection) {
|
|
9534
|
+
wsUrl.searchParams.set("include_language_detection", "true");
|
|
9535
|
+
}
|
|
9536
|
+
if (elOpts?.commitStrategy) {
|
|
9537
|
+
wsUrl.searchParams.set("commit_strategy", elOpts.commitStrategy);
|
|
9538
|
+
}
|
|
9539
|
+
if (elOpts?.vadSilenceThresholdSecs !== void 0) {
|
|
9540
|
+
wsUrl.searchParams.set("vad_silence_threshold_secs", String(elOpts.vadSilenceThresholdSecs));
|
|
9541
|
+
}
|
|
9542
|
+
if (elOpts?.vadThreshold !== void 0) {
|
|
9543
|
+
wsUrl.searchParams.set("vad_threshold", String(elOpts.vadThreshold));
|
|
9544
|
+
}
|
|
9545
|
+
if (elOpts?.minSpeechDurationMs !== void 0) {
|
|
9546
|
+
wsUrl.searchParams.set("min_speech_duration_ms", String(elOpts.minSpeechDurationMs));
|
|
9547
|
+
}
|
|
9548
|
+
if (elOpts?.minSilenceDurationMs !== void 0) {
|
|
9549
|
+
wsUrl.searchParams.set("min_silence_duration_ms", String(elOpts.minSilenceDurationMs));
|
|
9550
|
+
}
|
|
9551
|
+
if (elOpts?.previousText) {
|
|
9552
|
+
wsUrl.searchParams.set("previous_text", elOpts.previousText);
|
|
9553
|
+
}
|
|
9554
|
+
if (!elOpts?.audioFormat && options?.encoding) {
|
|
9555
|
+
const encodingMap = {
|
|
9556
|
+
linear16: "pcm_16000",
|
|
9557
|
+
pcm: "pcm_16000",
|
|
9558
|
+
mulaw: "ulaw_8000"
|
|
9559
|
+
};
|
|
9560
|
+
const mappedFormat = encodingMap[options.encoding];
|
|
9561
|
+
if (mappedFormat) {
|
|
9562
|
+
wsUrl.searchParams.set("audio_format", mappedFormat);
|
|
9563
|
+
}
|
|
9564
|
+
}
|
|
9565
|
+
let status = "connecting";
|
|
9566
|
+
let openedAt = null;
|
|
9567
|
+
let receivedData = false;
|
|
9568
|
+
const WebSocketImpl = typeof WebSocket !== "undefined" ? WebSocket : __require("ws");
|
|
9569
|
+
const ws = new WebSocketImpl(wsUrl.toString(), {
|
|
9570
|
+
headers: {
|
|
9571
|
+
"xi-api-key": this.config.apiKey
|
|
9572
|
+
}
|
|
9573
|
+
});
|
|
9574
|
+
ws.onopen = () => {
|
|
9575
|
+
status = "open";
|
|
9576
|
+
openedAt = Date.now();
|
|
9577
|
+
callbacks?.onOpen?.();
|
|
9578
|
+
};
|
|
9579
|
+
ws.onmessage = (event) => {
|
|
9580
|
+
receivedData = true;
|
|
9581
|
+
const rawPayload = typeof event.data === "string" ? event.data : event.data.toString();
|
|
9582
|
+
let messageType;
|
|
9583
|
+
try {
|
|
9584
|
+
const data = JSON.parse(rawPayload);
|
|
9585
|
+
if (data.error) {
|
|
9586
|
+
messageType = "error";
|
|
9587
|
+
} else if (data.message_type === "session_started") {
|
|
9588
|
+
messageType = "session_started";
|
|
9589
|
+
} else if (data.message_type === "partial_transcript") {
|
|
9590
|
+
messageType = "partial_transcript";
|
|
9591
|
+
} else if (data.message_type === "committed_transcript") {
|
|
9592
|
+
messageType = "committed_transcript";
|
|
9593
|
+
} else if (data.message_type === "committed_transcript_with_timestamps") {
|
|
9594
|
+
messageType = "committed_transcript_with_timestamps";
|
|
9595
|
+
}
|
|
9596
|
+
if (callbacks?.onRawMessage) {
|
|
9597
|
+
callbacks.onRawMessage({
|
|
9598
|
+
provider: this.name,
|
|
9599
|
+
direction: "incoming",
|
|
9600
|
+
timestamp: Date.now(),
|
|
9601
|
+
payload: rawPayload,
|
|
9602
|
+
messageType
|
|
9603
|
+
});
|
|
9604
|
+
}
|
|
9605
|
+
if (data.error) {
|
|
9606
|
+
callbacks?.onError?.({
|
|
9607
|
+
code: data.error_code?.toString() || "STREAM_ERROR",
|
|
9608
|
+
message: data.error
|
|
9609
|
+
});
|
|
9610
|
+
return;
|
|
9611
|
+
}
|
|
9612
|
+
if (data.message_type === "session_started") {
|
|
9613
|
+
return;
|
|
9614
|
+
}
|
|
9615
|
+
if (data.message_type === "partial_transcript") {
|
|
9616
|
+
const streamEvent = {
|
|
9617
|
+
type: "transcript",
|
|
9618
|
+
text: data.text || "",
|
|
9619
|
+
isFinal: false,
|
|
9620
|
+
confidence: void 0,
|
|
9621
|
+
language: data.language_code
|
|
9622
|
+
};
|
|
9623
|
+
callbacks?.onTranscript?.(streamEvent);
|
|
9624
|
+
return;
|
|
9625
|
+
}
|
|
9626
|
+
if (data.message_type === "committed_transcript" || data.message_type === "committed_transcript_with_timestamps") {
|
|
9627
|
+
const words = data.words ? data.words.map((w) => ({
|
|
9628
|
+
word: w.text || "",
|
|
9629
|
+
start: w.start || 0,
|
|
9630
|
+
end: w.end || 0,
|
|
9631
|
+
confidence: w.logprob !== void 0 ? Math.exp(w.logprob) : void 0,
|
|
9632
|
+
speaker: w.speaker_id
|
|
9633
|
+
})) : [];
|
|
9634
|
+
const streamEvent = {
|
|
9635
|
+
type: "transcript",
|
|
9636
|
+
text: data.text || "",
|
|
9637
|
+
isFinal: true,
|
|
9638
|
+
words: words.length > 0 ? words : void 0,
|
|
9639
|
+
speaker: words[0]?.speaker,
|
|
9640
|
+
language: data.language_code,
|
|
9641
|
+
confidence: void 0
|
|
9642
|
+
};
|
|
9643
|
+
callbacks?.onTranscript?.(streamEvent);
|
|
9644
|
+
if (options?.diarization && words.length > 0) {
|
|
9645
|
+
const utterances = buildUtterancesFromWords(words);
|
|
9646
|
+
for (const utterance of utterances) {
|
|
9647
|
+
callbacks?.onUtterance?.(utterance);
|
|
9648
|
+
}
|
|
9649
|
+
}
|
|
9650
|
+
}
|
|
9651
|
+
} catch (error) {
|
|
9652
|
+
callbacks?.onError?.({
|
|
9653
|
+
code: "PARSE_ERROR",
|
|
9654
|
+
message: `Failed to parse message: ${error}`
|
|
9655
|
+
});
|
|
9656
|
+
}
|
|
9657
|
+
};
|
|
9658
|
+
ws.onerror = () => {
|
|
9659
|
+
callbacks?.onError?.({
|
|
9660
|
+
code: "WEBSOCKET_ERROR",
|
|
9661
|
+
message: "WebSocket error occurred"
|
|
9662
|
+
});
|
|
9663
|
+
};
|
|
9664
|
+
ws.onclose = (event) => {
|
|
9665
|
+
status = "closed";
|
|
9666
|
+
const timeSinceOpen = openedAt ? Date.now() - openedAt : null;
|
|
9667
|
+
const isImmediateClose = timeSinceOpen !== null && timeSinceOpen < 1e3 && !receivedData;
|
|
9668
|
+
if (isImmediateClose && event.code === 1e3) {
|
|
9669
|
+
callbacks?.onError?.({
|
|
9670
|
+
code: "ELEVENLABS_CONFIG_REJECTED",
|
|
9671
|
+
message: [
|
|
9672
|
+
"ElevenLabs closed connection immediately after opening.",
|
|
9673
|
+
`Current config: region=${this.region}, model=${modelId}`,
|
|
9674
|
+
"Likely causes:",
|
|
9675
|
+
" - Invalid API key",
|
|
9676
|
+
" - Unsupported audio format or model",
|
|
9677
|
+
event.reason ? `Server reason: ${event.reason}` : null
|
|
9678
|
+
].filter(Boolean).join("\n")
|
|
9679
|
+
});
|
|
9680
|
+
}
|
|
9681
|
+
callbacks?.onClose?.(event.code, event.reason);
|
|
9682
|
+
};
|
|
9683
|
+
await new Promise((resolve, reject) => {
|
|
9684
|
+
const timeout = setTimeout(() => {
|
|
9685
|
+
reject(new Error("WebSocket connection timeout"));
|
|
9686
|
+
}, 1e4);
|
|
9687
|
+
const checkOpen = () => {
|
|
9688
|
+
if (status === "open") {
|
|
9689
|
+
clearTimeout(timeout);
|
|
9690
|
+
resolve();
|
|
9691
|
+
} else if (status === "closed") {
|
|
9692
|
+
clearTimeout(timeout);
|
|
9693
|
+
reject(new Error("WebSocket connection failed"));
|
|
9694
|
+
} else {
|
|
9695
|
+
setTimeout(checkOpen, 100);
|
|
9696
|
+
}
|
|
9697
|
+
};
|
|
9698
|
+
checkOpen();
|
|
9699
|
+
});
|
|
9700
|
+
return {
|
|
9701
|
+
id: sessionId,
|
|
9702
|
+
provider: this.name,
|
|
9703
|
+
createdAt,
|
|
9704
|
+
getStatus: () => status,
|
|
9705
|
+
sendAudio: async (chunk) => {
|
|
9706
|
+
if (status !== "open") {
|
|
9707
|
+
throw new Error("Session is not open");
|
|
9708
|
+
}
|
|
9709
|
+
let base64Audio;
|
|
9710
|
+
if (chunk.data instanceof ArrayBuffer) {
|
|
9711
|
+
base64Audio = Buffer.from(chunk.data).toString("base64");
|
|
9712
|
+
} else if (chunk.data instanceof Uint8Array) {
|
|
9713
|
+
base64Audio = Buffer.from(
|
|
9714
|
+
chunk.data.buffer,
|
|
9715
|
+
chunk.data.byteOffset,
|
|
9716
|
+
chunk.data.byteLength
|
|
9717
|
+
).toString("base64");
|
|
9718
|
+
} else {
|
|
9719
|
+
base64Audio = Buffer.from(chunk.data).toString("base64");
|
|
9720
|
+
}
|
|
9721
|
+
const message = JSON.stringify({
|
|
9722
|
+
message_type: "input_audio_chunk",
|
|
9723
|
+
audio_base_64: base64Audio
|
|
9724
|
+
});
|
|
9725
|
+
if (callbacks?.onRawMessage) {
|
|
9726
|
+
callbacks.onRawMessage({
|
|
9727
|
+
provider: this.name,
|
|
9728
|
+
direction: "outgoing",
|
|
9729
|
+
timestamp: Date.now(),
|
|
9730
|
+
payload: message,
|
|
9731
|
+
messageType: "audio"
|
|
9732
|
+
});
|
|
9733
|
+
}
|
|
9734
|
+
ws.send(message);
|
|
9735
|
+
},
|
|
9736
|
+
close: async () => {
|
|
9737
|
+
if (status === "open") {
|
|
9738
|
+
status = "closing";
|
|
9739
|
+
ws.send(JSON.stringify({ message_type: "end_of_stream" }));
|
|
9740
|
+
ws.close(1e3, "Client requested close");
|
|
9741
|
+
}
|
|
9742
|
+
}
|
|
9743
|
+
};
|
|
9744
|
+
}
|
|
9745
|
+
/**
|
|
9746
|
+
* Normalize ElevenLabs response to unified format
|
|
9747
|
+
*
|
|
9748
|
+
* ElevenLabs returns either:
|
|
9749
|
+
* - Single channel: `SpeechToTextChunkResponseModel` directly (text, words, etc.)
|
|
9750
|
+
* - Multi-channel: `MultichannelSpeechToTextResponseModel` with `transcripts[]`
|
|
9751
|
+
*/
|
|
9752
|
+
normalizeResponse(response) {
|
|
9753
|
+
const chunks = response.transcripts ? response.transcripts : [response];
|
|
9754
|
+
const text = chunks.map((c) => c.text).join(" ");
|
|
9755
|
+
const words = [];
|
|
9756
|
+
const speakerSet = /* @__PURE__ */ new Set();
|
|
9757
|
+
const audioEvents = [];
|
|
9758
|
+
for (const chunk of chunks) {
|
|
9759
|
+
if (!chunk.words) continue;
|
|
9760
|
+
for (const w of chunk.words) {
|
|
9761
|
+
if (w.type === "audio_event") {
|
|
9762
|
+
audioEvents.push({
|
|
9763
|
+
text: w.text,
|
|
9764
|
+
start: typeof w.start === "number" ? w.start : 0,
|
|
9765
|
+
end: typeof w.end === "number" ? w.end : 0
|
|
9766
|
+
});
|
|
9767
|
+
continue;
|
|
9768
|
+
}
|
|
9769
|
+
const speakerId = w.speaker_id ?? void 0;
|
|
9770
|
+
const word = {
|
|
9771
|
+
word: w.text,
|
|
9772
|
+
start: typeof w.start === "number" ? w.start : 0,
|
|
9773
|
+
end: typeof w.end === "number" ? w.end : 0,
|
|
9774
|
+
confidence: w.logprob !== void 0 ? Math.exp(w.logprob) : void 0,
|
|
9775
|
+
speaker: speakerId ?? void 0
|
|
9776
|
+
};
|
|
9777
|
+
words.push(word);
|
|
9778
|
+
if (speakerId) {
|
|
9779
|
+
speakerSet.add(speakerId);
|
|
9780
|
+
}
|
|
9781
|
+
}
|
|
9782
|
+
}
|
|
9783
|
+
const speakers = speakerSet.size > 0 ? Array.from(speakerSet).map((id) => ({
|
|
9784
|
+
id,
|
|
9785
|
+
label: `Speaker ${id}`
|
|
9786
|
+
})) : void 0;
|
|
9787
|
+
const utterances = words.length > 0 ? buildUtterancesFromWords(words) : [];
|
|
9788
|
+
const language = chunks[0]?.language_code;
|
|
9789
|
+
const languageProbability = chunks[0]?.language_probability;
|
|
9790
|
+
const entities = [];
|
|
9791
|
+
for (const chunk of chunks) {
|
|
9792
|
+
if (chunk.entities && Array.isArray(chunk.entities)) {
|
|
9793
|
+
for (const entity of chunk.entities) {
|
|
9794
|
+
entities.push({
|
|
9795
|
+
text: entity.text,
|
|
9796
|
+
entity_type: entity.entity_type,
|
|
9797
|
+
start_char: entity.start_char,
|
|
9798
|
+
end_char: entity.end_char
|
|
9799
|
+
});
|
|
9800
|
+
}
|
|
9801
|
+
}
|
|
9802
|
+
}
|
|
9803
|
+
const transcriptionId = response.transcription_id || chunks[0]?.transcription_id || `elevenlabs_${Date.now()}`;
|
|
9804
|
+
return {
|
|
9805
|
+
success: true,
|
|
9806
|
+
provider: this.name,
|
|
9807
|
+
data: {
|
|
9808
|
+
id: transcriptionId,
|
|
9809
|
+
text,
|
|
9810
|
+
status: "completed",
|
|
9811
|
+
language,
|
|
9812
|
+
speakers,
|
|
9813
|
+
words: words.length > 0 ? words : void 0,
|
|
9814
|
+
utterances: utterances.length > 0 ? utterances : void 0
|
|
9815
|
+
},
|
|
9816
|
+
extended: {
|
|
9817
|
+
entities: entities.length > 0 ? entities : void 0,
|
|
9818
|
+
audioEvents: audioEvents.length > 0 ? audioEvents : void 0,
|
|
9819
|
+
languageProbability
|
|
9820
|
+
},
|
|
9821
|
+
tracking: {
|
|
9822
|
+
requestId: transcriptionId
|
|
9823
|
+
},
|
|
9824
|
+
raw: response
|
|
9825
|
+
};
|
|
9826
|
+
}
|
|
9827
|
+
};
|
|
9828
|
+
/**
 * Build an ElevenLabs adapter and initialize it with the given configuration.
 *
 * @param config - Passed unchanged to the adapter's `initialize()`.
 * @returns The initialized adapter instance.
 */
function createElevenLabsAdapter(config) {
  const instance = new ElevenLabsAdapter();
  instance.initialize(config);
  return instance;
}
|
|
9833
|
+
|
|
9009
9834
|
// src/utils/zod-to-field-configs.ts
|
|
9010
9835
|
function unwrapZodType(schema) {
|
|
9011
9836
|
let inner = schema;
|
|
@@ -36406,6 +37231,21 @@ var SonioxCapabilities = {
|
|
|
36406
37231
|
listTranscripts: false,
|
|
36407
37232
|
deleteTranscript: false
|
|
36408
37233
|
};
|
|
37234
|
+
/**
 * Feature flags advertised for the ElevenLabs provider.
 * Key order is kept as-is because consumers may enumerate the object.
 */
var ElevenLabsCapabilities = {
  streaming: true,
  diarization: true,
  wordTimestamps: true,
  languageDetection: true,
  // Via keyterms parameter
  customVocabulary: true,
  summarization: false,
  sentimentAnalysis: false,
  entityDetection: true,
  // Via entity_detection with PII categories
  piiRedaction: true,
  listTranscripts: false,
  deleteTranscript: false
};
|
|
36409
37249
|
var ProviderCapabilitiesMap = {
|
|
36410
37250
|
gladia: GladiaCapabilities,
|
|
36411
37251
|
assemblyai: AssemblyAICapabilities,
|
|
@@ -36413,7 +37253,8 @@ var ProviderCapabilitiesMap = {
|
|
|
36413
37253
|
"openai-whisper": OpenAICapabilities,
|
|
36414
37254
|
"azure-stt": AzureCapabilities,
|
|
36415
37255
|
speechmatics: SpeechmaticsCapabilities,
|
|
36416
|
-
soniox: SonioxCapabilities
|
|
37256
|
+
soniox: SonioxCapabilities,
|
|
37257
|
+
elevenlabs: ElevenLabsCapabilities
|
|
36417
37258
|
};
|
|
36418
37259
|
var CapabilityKeys = [
|
|
36419
37260
|
"streaming",
|
|
@@ -36615,7 +37456,8 @@ var AllLanguageCodes = {
|
|
|
36615
37456
|
// BCP-47 locale codes (e.g., "en-US")
|
|
36616
37457
|
speechmatics: SpeechmaticsLanguageCodes,
|
|
36617
37458
|
// ISO 639-1 codes with multilingual packs
|
|
36618
|
-
soniox: SonioxLanguageCodes
|
|
37459
|
+
soniox: SonioxLanguageCodes,
|
|
37460
|
+
elevenlabs: ElevenLabsLanguageCodes
|
|
36619
37461
|
};
|
|
36620
37462
|
var ProviderDisplayNames = {
|
|
36621
37463
|
gladia: "Gladia",
|
|
@@ -36624,7 +37466,8 @@ var ProviderDisplayNames = {
|
|
|
36624
37466
|
"openai-whisper": "OpenAI Whisper",
|
|
36625
37467
|
"azure-stt": "Azure Speech",
|
|
36626
37468
|
speechmatics: "Speechmatics",
|
|
36627
|
-
soniox: "Soniox"
|
|
37469
|
+
soniox: "Soniox",
|
|
37470
|
+
elevenlabs: "ElevenLabs"
|
|
36628
37471
|
};
|
|
36629
37472
|
var ProviderWebsites = {
|
|
36630
37473
|
gladia: "https://gladia.io",
|
|
@@ -36633,7 +37476,8 @@ var ProviderWebsites = {
|
|
|
36633
37476
|
"openai-whisper": "https://openai.com",
|
|
36634
37477
|
"azure-stt": "https://azure.microsoft.com/services/cognitive-services/speech-to-text/",
|
|
36635
37478
|
speechmatics: "https://speechmatics.com",
|
|
36636
|
-
soniox: "https://soniox.com"
|
|
37479
|
+
soniox: "https://soniox.com",
|
|
37480
|
+
elevenlabs: "https://elevenlabs.io"
|
|
36637
37481
|
};
|
|
36638
37482
|
var ProviderDocs = {
|
|
36639
37483
|
gladia: "https://docs.gladia.io",
|
|
@@ -36642,7 +37486,8 @@ var ProviderDocs = {
|
|
|
36642
37486
|
"openai-whisper": "https://platform.openai.com/docs/guides/speech-to-text",
|
|
36643
37487
|
"azure-stt": "https://learn.microsoft.com/azure/cognitive-services/speech-service/",
|
|
36644
37488
|
speechmatics: "https://docs.speechmatics.com",
|
|
36645
|
-
soniox: "https://soniox.com/docs/stt/"
|
|
37489
|
+
soniox: "https://soniox.com/docs/stt/",
|
|
37490
|
+
elevenlabs: "https://elevenlabs.io/docs/capabilities/speech-to-text"
|
|
36646
37491
|
};
|
|
36647
37492
|
var AllProviders = [
|
|
36648
37493
|
"gladia",
|
|
@@ -36651,7 +37496,8 @@ var AllProviders = [
|
|
|
36651
37496
|
"openai-whisper",
|
|
36652
37497
|
"azure-stt",
|
|
36653
37498
|
"speechmatics",
|
|
36654
|
-
"soniox"
|
|
37499
|
+
"soniox",
|
|
37500
|
+
"elevenlabs"
|
|
36655
37501
|
];
|
|
36656
37502
|
var StreamingProviders = AllProviders.filter(
|
|
36657
37503
|
(p) => ProviderCapabilitiesMap[p].streaming
|
|
@@ -37376,6 +38222,77 @@ var TranslationConfigType = {
|
|
|
37376
38222
|
two_way: "two_way"
|
|
37377
38223
|
};
|
|
37378
38224
|
|
|
38225
|
+
// src/generated/elevenlabs/schema/index.ts
|
|
38226
|
+
// Namespace object for the generated ElevenLabs schema enums. Each entry is
// registered as a getter thunk, so it resolves to the enum object declared
// further down when it is read.
var schema_exports8 = {};
__export(schema_exports8, {
  BodySpeechToTextV1SpeechToTextPostFileFormat: () =>
    BodySpeechToTextV1SpeechToTextPostFileFormat,
  BodySpeechToTextV1SpeechToTextPostModelId: () =>
    BodySpeechToTextV1SpeechToTextPostModelId,
  BodySpeechToTextV1SpeechToTextPostTimestampsGranularity: () =>
    BodySpeechToTextV1SpeechToTextPostTimestampsGranularity,
  DocxExportOptionsFormat: () => DocxExportOptionsFormat,
  HtmlExportOptionsFormat: () => HtmlExportOptionsFormat,
  PdfExportOptionsFormat: () => PdfExportOptionsFormat,
  SegmentedJsonExportOptionsFormat: () => SegmentedJsonExportOptionsFormat,
  SpeechToTextWordResponseModelType: () => SpeechToTextWordResponseModelType,
  SrtExportOptionsFormat: () => SrtExportOptionsFormat,
  TxtExportOptionsFormat: () => TxtExportOptionsFormat
});
|
|
38239
|
+
|
|
38240
|
+
// src/generated/elevenlabs/schema/bodySpeechToTextV1SpeechToTextPostFileFormat.ts
|
|
38241
|
+
// Generated ElevenLabs enum objects. Every key maps to the identical string
// value; the path comment above each one names its generated source module.

// src/generated/elevenlabs/schema/bodySpeechToTextV1SpeechToTextPostFileFormat.ts
var BodySpeechToTextV1SpeechToTextPostFileFormat = {
  pcm_s16le_16: "pcm_s16le_16",
  other: "other"
};

// src/generated/elevenlabs/schema/bodySpeechToTextV1SpeechToTextPostModelId.ts
var BodySpeechToTextV1SpeechToTextPostModelId = {
  scribe_v1: "scribe_v1",
  scribe_v2: "scribe_v2"
};

// src/generated/elevenlabs/schema/bodySpeechToTextV1SpeechToTextPostTimestampsGranularity.ts
var BodySpeechToTextV1SpeechToTextPostTimestampsGranularity = {
  none: "none",
  word: "word",
  character: "character"
};

// src/generated/elevenlabs/schema/docxExportOptionsFormat.ts
var DocxExportOptionsFormat = { docx: "docx" };

// src/generated/elevenlabs/schema/htmlExportOptionsFormat.ts
var HtmlExportOptionsFormat = { html: "html" };

// src/generated/elevenlabs/schema/pdfExportOptionsFormat.ts
var PdfExportOptionsFormat = { pdf: "pdf" };

// src/generated/elevenlabs/schema/segmentedJsonExportOptionsFormat.ts
var SegmentedJsonExportOptionsFormat = { segmented_json: "segmented_json" };

// src/generated/elevenlabs/schema/speechToTextWordResponseModelType.ts
var SpeechToTextWordResponseModelType = {
  word: "word",
  spacing: "spacing",
  audio_event: "audio_event"
};

// src/generated/elevenlabs/schema/srtExportOptionsFormat.ts
var SrtExportOptionsFormat = { srt: "srt" };

// src/generated/elevenlabs/schema/txtExportOptionsFormat.ts
var TxtExportOptionsFormat = { txt: "txt" };
|
|
38295
|
+
|
|
37379
38296
|
// src/generated/speechmatics/api/speechmaticsASRRESTAPI.zod.ts
|
|
37380
38297
|
var speechmaticsASRRESTAPI_zod_exports = {};
|
|
37381
38298
|
__export(speechmaticsASRRESTAPI_zod_exports, {
|
|
@@ -38488,6 +39405,448 @@ var getUsageResponse = zod12.object({
|
|
|
38488
39405
|
})
|
|
38489
39406
|
)
|
|
38490
39407
|
});
|
|
39408
|
+
|
|
39409
|
+
// src/generated/elevenlabs/api/elevenLabsSpeechToTextAPI.zod.ts
|
|
39410
|
+
// Namespace object for the generated ElevenLabs speech-to-text zod module.
// Entries are getter thunks, so each name resolves to its declaration when
// it is read. Registration order is kept unchanged.
var elevenLabsSpeechToTextAPI_zod_exports = {};
__export(elevenLabsSpeechToTextAPI_zod_exports, {
  // Transcript deletion
  deleteTranscriptByIdHeader: () => deleteTranscriptByIdHeader,
  deleteTranscriptByIdParams: () => deleteTranscriptByIdParams,
  deleteTranscriptByIdResponse: () => deleteTranscriptByIdResponse,
  // Transcript retrieval
  getTranscriptByIdHeader: () => getTranscriptByIdHeader,
  getTranscriptByIdParams: () => getTranscriptByIdParams,
  getTranscriptByIdResponse: () => getTranscriptByIdResponse,
  // Transcription request body and its defaults/bounds
  speechToTextBody: () => speechToTextBody,
  speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefault: () =>
    speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefault,
  speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFive: () =>
    speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFive,
  speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFour: () =>
    speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFour,
  speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultOne: () =>
    speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultOne,
  speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultThree: () =>
    speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultThree,
  speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultTwo: () =>
    speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultTwo,
  speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefault: () =>
    speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefault,
  speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFive: () =>
    speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFive,
  speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFour: () =>
    speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFour,
  speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultOne: () =>
    speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultOne,
  speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultThree: () =>
    speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultThree,
  speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultTwo: () =>
    speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultTwo,
  speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefault: () =>
    speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefault,
  speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefaultThree: () =>
    speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefaultThree,
  speechToTextBodyAdditionalFormatsItemMaxSegmentCharsDefaultOnefive: () =>
    speechToTextBodyAdditionalFormatsItemMaxSegmentCharsDefaultOnefive,
  speechToTextBodyAdditionalFormatsItemMaxSegmentDurationSDefaultOnefive: () =>
    speechToTextBodyAdditionalFormatsItemMaxSegmentDurationSDefaultOnefive,
  speechToTextBodyAdditionalFormatsItemSegmentOnSilenceLongerThanSDefaultOnefive: () =>
    speechToTextBodyAdditionalFormatsItemSegmentOnSilenceLongerThanSDefaultOnefive,
  speechToTextBodyAdditionalFormatsMax: () => speechToTextBodyAdditionalFormatsMax,
  speechToTextBodyDiarizationThresholdMaxOne: () => speechToTextBodyDiarizationThresholdMaxOne,
  speechToTextBodyDiarizationThresholdMinOne: () => speechToTextBodyDiarizationThresholdMinOne,
  speechToTextBodyDiarizeDefault: () => speechToTextBodyDiarizeDefault,
  speechToTextBodyFileFormatDefault: () => speechToTextBodyFileFormatDefault,
  speechToTextBodyKeytermsDefault: () => speechToTextBodyKeytermsDefault,
  speechToTextBodyNoVerbatimDefault: () => speechToTextBodyNoVerbatimDefault,
  speechToTextBodyNumSpeakersMaxOne: () => speechToTextBodyNumSpeakersMaxOne,
  speechToTextBodySeedMaxOne: () => speechToTextBodySeedMaxOne,
  speechToTextBodySeedMinOne: () => speechToTextBodySeedMinOne,
  speechToTextBodyTagAudioEventsDefault: () => speechToTextBodyTagAudioEventsDefault,
  speechToTextBodyTemperatureMaxOne: () => speechToTextBodyTemperatureMaxOne,
  speechToTextBodyTemperatureMinOne: () => speechToTextBodyTemperatureMinOne,
  speechToTextBodyTimestampsGranularityDefault: () => speechToTextBodyTimestampsGranularityDefault,
  speechToTextBodyUseMultiChannelDefault: () => speechToTextBodyUseMultiChannelDefault,
  speechToTextBodyWebhookDefault: () => speechToTextBodyWebhookDefault,
  // Transcription request header, query parameters and response
  speechToTextHeader: () => speechToTextHeader,
  speechToTextQueryEnableLoggingDefault: () => speechToTextQueryEnableLoggingDefault,
  speechToTextQueryParams: () => speechToTextQueryParams,
  speechToTextResponse: () => speechToTextResponse
});
|
|
39457
|
+
import { z as zod13 } from "zod";
|
|
39458
|
+
// Default applied to `enable_logging` when the caller omits it.
var speechToTextQueryEnableLoggingDefault = true;

// Query-string parameters of the speech-to-text request.
var speechToTextQueryParams = zod13.object({
  enable_logging: zod13
    .boolean()
    .default(speechToTextQueryEnableLoggingDefault)
    .describe(
      "When enable_logging is set to false zero retention mode will be used for the request. This will mean log and transcript storage features are unavailable for this request. Zero retention mode may only be used by enterprise customers."
    )
});

// Request headers of the speech-to-text request; the key may be a string,
// null, or absent.
var speechToTextHeader = zod13.object({
  "xi-api-key": zod13
    .string()
    .or(zod13.null())
    .optional()
    .describe(
      "Your API key. This is required by most endpoints to access our API programmatically. You can view your xi-api-key using the 'Profile' tab on the website."
    )
});
|
|
39469
|
+
// Generated defaults and numeric bounds for the speech-to-text request body.
// Names ending in "Default" are default values; "Min"/"Max" names are bounds.
// The grouping comments below are for readability only.

// Core transcription options
var speechToTextBodyTagAudioEventsDefault = true;
var speechToTextBodyNumSpeakersMaxOne = 32;
var speechToTextBodyTimestampsGranularityDefault = "word";
var speechToTextBodyDiarizeDefault = false;
var speechToTextBodyDiarizationThresholdMinOne = 0.1;
var speechToTextBodyDiarizationThresholdMaxOne = 0.4;

// Per-item defaults for `additional_formats` entries (one set per generated
// item variant)
var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefault = true;
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefault = true;
var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultOne = true;
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultOne = true;
var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultTwo = true;
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultTwo = true;
var speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefault = 100;
var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultThree = true;
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultThree = true;
var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFour = true;
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFour = true;
var speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefaultThree = 42;
var speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFive = false;
var speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFive = true;
var speechToTextBodyAdditionalFormatsItemSegmentOnSilenceLongerThanSDefaultOnefive = 0.8;
var speechToTextBodyAdditionalFormatsItemMaxSegmentDurationSDefaultOnefive = 4;
var speechToTextBodyAdditionalFormatsItemMaxSegmentCharsDefaultOnefive = 84;
var speechToTextBodyAdditionalFormatsMax = 10;

// Remaining request-level defaults and bounds
var speechToTextBodyFileFormatDefault = "other";
var speechToTextBodyWebhookDefault = false;
var speechToTextBodyTemperatureMinOne = 0;
var speechToTextBodyTemperatureMaxOne = 2;
var speechToTextBodySeedMinOne = 0;
var speechToTextBodySeedMaxOne = 2147483647;
var speechToTextBodyUseMultiChannelDefault = false;
var speechToTextBodyNoVerbatimDefault = false;
var speechToTextBodyKeytermsDefault = [];
|
|
39502
|
+
// Request-body schema for a speech-to-text transcription call.
//
// The `additional_formats` entries form a union discriminated on `format`.
// Five of the six variants share one layout and differ only in the format
// name and in which default constants they use, so they are produced by a
// local factory. Every helper call builds fresh schema instances, so nothing
// is shared between variants.
//
// NOTE(review): speechToTextBodyWebhookDefault,
// speechToTextBodyUseMultiChannelDefault, speechToTextBodyNoVerbatimDefault and
// speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFive are declared
// next to this schema, yet the matching fields here are plain `.optional()`.
// Left unchanged on purpose; confirm whether those defaults were meant to be applied.
var speechToTextBody = (() => {
  const orNull = (schema) => schema.or(zod13.null());

  // The three segmentation knobs, each an optional nullable number.
  const segmentationOverrides = () => ({
    segment_on_silence_longer_than_s: orNull(zod13.number()).optional(),
    max_segment_duration_s: orNull(zod13.number()).optional(),
    max_segment_chars: orNull(zod13.number()).optional()
  });

  // One union member; `leadingFields` lets a variant put extra keys first.
  const exportFormat = (formatName, speakersDefault, timestampsDefault, leadingFields = {}) => zod13.object({
    ...leadingFields,
    include_speakers: zod13.boolean().default(speakersDefault),
    include_timestamps: zod13.boolean().default(timestampsDefault),
    format: zod13.enum([formatName]),
    ...segmentationOverrides()
  });

  const formatVariants = [
    exportFormat(
      "segmented_json",
      speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefault,
      speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefault
    ),
    exportFormat(
      "docx",
      speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultOne,
      speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultOne
    ),
    exportFormat(
      "pdf",
      speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultTwo,
      speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultTwo
    ),
    exportFormat(
      "txt",
      speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultThree,
      speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultThree,
      {
        max_characters_per_line: orNull(zod13.number()).default(
          speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefault
        )
      }
    ),
    exportFormat(
      "html",
      speechToTextBodyAdditionalFormatsItemIncludeSpeakersDefaultFour,
      speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFour
    ),
    // srt is spelled out: include_speakers has no default and the
    // segmentation knobs carry defaults instead of being optional.
    zod13.object({
      max_characters_per_line: orNull(zod13.number()).default(
        speechToTextBodyAdditionalFormatsItemMaxCharactersPerLineDefaultThree
      ),
      include_speakers: zod13.boolean().optional(),
      include_timestamps: zod13.boolean().default(speechToTextBodyAdditionalFormatsItemIncludeTimestampsDefaultFive),
      format: zod13.enum(["srt"]),
      segment_on_silence_longer_than_s: orNull(zod13.number()).default(
        speechToTextBodyAdditionalFormatsItemSegmentOnSilenceLongerThanSDefaultOnefive
      ),
      max_segment_duration_s: orNull(zod13.number()).default(
        speechToTextBodyAdditionalFormatsItemMaxSegmentDurationSDefaultOnefive
      ),
      max_segment_chars: orNull(zod13.number()).default(
        speechToTextBodyAdditionalFormatsItemMaxSegmentCharsDefaultOnefive
      )
    })
  ];

  return zod13.object({
    model_id: zod13.enum(["scribe_v1", "scribe_v2"]).describe("The ID of the model to use for transcription."),
    file: orNull(zod13.instanceof(File)).optional().describe(
      "The file to transcribe. All major audio and video formats are supported. Exactly one of the file or cloud_storage_url parameters must be provided. The file size must be less than 3.0GB."
    ),
    language_code: orNull(zod13.string()).optional().describe(
      "An ISO-639-1 or ISO-639-3 language_code corresponding to the language of the audio file. Can sometimes improve transcription performance if known beforehand. Defaults to null, in this case the language is predicted automatically."
    ),
    tag_audio_events: zod13.boolean().default(speechToTextBodyTagAudioEventsDefault).describe(
      "Whether to tag audio events like (laughter), (footsteps), etc. in the transcription."
    ),
    num_speakers: orNull(zod13.number().min(1).max(speechToTextBodyNumSpeakersMaxOne)).optional().describe(
      "The maximum amount of speakers talking in the uploaded file. Can help with predicting who speaks when. The maximum amount of speakers that can be predicted is 32. Defaults to null, in this case the amount of speakers is set to the maximum value the model supports."
    ),
    timestamps_granularity: zod13.enum(["none", "word", "character"]).default(speechToTextBodyTimestampsGranularityDefault).describe(
      "The granularity of the timestamps in the transcription. 'word' provides word-level timestamps and 'character' provides character-level timestamps per word."
    ),
    diarize: zod13.boolean().optional().describe("Whether to annotate which speaker is currently talking in the uploaded file."),
    diarization_threshold: orNull(
      zod13.number().min(speechToTextBodyDiarizationThresholdMinOne).max(speechToTextBodyDiarizationThresholdMaxOne)
    ).optional().describe(
      "Diarization threshold to apply during speaker diarization. A higher value means there will be a lower chance of one speaker being diarized as two different speakers but also a higher chance of two different speakers being diarized as one speaker (less total speakers predicted). A low value means there will be a higher chance of one speaker being diarized as two different speakers but also a lower chance of two different speakers being diarized as one speaker (more total speakers predicted). Can only be set when diarize=True and num_speakers=None. Defaults to None, in which case we will choose a threshold based on the model_id (0.22 usually)."
    ),
    additional_formats: zod13.array(
      zod13.discriminatedUnion("format", formatVariants)
    ).max(speechToTextBodyAdditionalFormatsMax).optional(),
    file_format: zod13.enum(["pcm_s16le_16", "other"]).default(speechToTextBodyFileFormatDefault).describe(
      "The format of input audio. Options are 'pcm_s16le_16' or 'other' For `pcm_s16le_16`, the input audio must be 16-bit PCM at a 16kHz sample rate, single channel (mono), and little-endian byte order. Latency will be lower than with passing an encoded waveform."
    ),
    cloud_storage_url: orNull(zod13.string()).optional().describe(
      "The HTTPS URL of the file to transcribe. Exactly one of the file or cloud_storage_url parameters must be provided. The file must be accessible via HTTPS and the file size must be less than 2GB. Any valid HTTPS URL is accepted, including URLs from cloud storage providers (AWS S3, Google Cloud Storage, Cloudflare R2, etc.), CDNs, or any other HTTPS source. URLs can be pre-signed or include authentication tokens in query parameters."
    ),
    webhook: zod13.boolean().optional().describe(
      "Whether to send the transcription result to configured speech-to-text webhooks. If set the request will return early without the transcription, which will be delivered later via webhook."
    ),
    webhook_id: orNull(zod13.string()).optional().describe(
      "Optional specific webhook ID to send the transcription result to. Only valid when webhook is set to true. If not provided, transcription will be sent to all configured speech-to-text webhooks."
    ),
    temperature: orNull(
      zod13.number().min(speechToTextBodyTemperatureMinOne).max(speechToTextBodyTemperatureMaxOne)
    ).optional().describe(
      "Controls the randomness of the transcription output. Accepts values between 0.0 and 2.0, where higher values result in more diverse and less deterministic results. If omitted, we will use a temperature based on the model you selected which is usually 0."
    ),
    seed: orNull(
      zod13.number().min(speechToTextBodySeedMinOne).max(speechToTextBodySeedMaxOne)
    ).optional().describe(
      "If specified, our system will make a best effort to sample deterministically, such that repeated requests with the same seed and parameters should return the same result. Determinism is not guaranteed. Must be an integer between 0 and 2147483647."
    ),
    use_multi_channel: zod13.boolean().optional().describe(
      "Whether the audio file contains multiple channels where each channel contains a single speaker. When enabled, each channel will be transcribed independently and the results will be combined. Each word in the response will include a 'channel_index' field indicating which channel it was spoken on. A maximum of 5 channels is supported."
    ),
    webhook_metadata: orNull(
      zod13.string().or(zod13.record(zod13.string(), zod13.any()))
    ).optional().describe(
      "Optional metadata to be included in the webhook response. This should be a JSON string representing an object with a maximum depth of 2 levels and maximum size of 16KB. Useful for tracking internal IDs, job references, or other contextual information."
    ),
    entity_detection: orNull(
      zod13.string().or(zod13.array(zod13.string()))
    ).optional().describe(
      "Detect entities in the transcript. Can be 'all' to detect all entities, a single entity type or category string, or a list of entity types/categories. Categories include 'pii', 'phi', 'pci', 'other', 'offensive_language'. When enabled, detected entities will be returned in the 'entities' field with their text, type, and character positions. Usage of this parameter will incur additional costs."
    ),
    no_verbatim: zod13.boolean().optional().describe(
      "If true, the transcription will not have any filler words, false starts and non-speech sounds. Only supported with scribe_v2 model."
    ),
    keyterms: zod13.array(zod13.string()).default(speechToTextBodyKeytermsDefault).describe(
      'A list of keyterms to bias the transcription towards. The keyterms are words or phrases you want the model to recognise more accurately. The number of keyterms cannot exceed 100. The length of each keyterm must be less than 50 characters. Keyterms can contain at most 5 words (after normalisation). For example ["hello", "world", "technical term"]. Usage of this parameter will incur additional costs. '
    )
  });
})();
|
|
39613
|
+
// Response schema for a speech-to-text call. The result is a union of:
//   - a single transcript chunk, or
//   - a multichannel wrapper holding an array of the same chunk shape.
// The chunk shape appears twice, so it is built by local factories. Each
// factory call returns brand-new schema instances; nothing is shared between
// the two occurrences.
var speechToTextResponse = (() => {
  const optionalNullable = (schema) => schema.or(zod13.null()).optional();

  const characterTiming = () => zod13.object({
    text: zod13.string().describe("The character that was transcribed."),
    start: optionalNullable(zod13.number()).describe("The start time of the character in seconds."),
    end: optionalNullable(zod13.number()).describe("The end time of the character in seconds.")
  });

  const wordDetail = () => zod13.object({
    text: zod13.string().describe("The word or sound that was transcribed."),
    start: optionalNullable(zod13.number()).describe("The start time of the word or sound in seconds."),
    end: optionalNullable(zod13.number()).describe("The end time of the word or sound in seconds."),
    type: zod13.enum(["word", "spacing", "audio_event"]).describe(
      "The type of the word or sound. 'audio_event' is used for non-word sounds like laughter or footsteps."
    ),
    speaker_id: optionalNullable(zod13.string()).describe("Unique identifier for the speaker of this word."),
    logprob: zod13.number().describe(
      "The log of the probability with which this word was predicted. Logprobs are in range [-infinity, 0], higher logprobs indicate a higher confidence the model has in its predictions."
    ),
    characters: optionalNullable(zod13.array(characterTiming())).describe(
      "The characters that make up the word and their timing information."
    )
  }).describe("Word-level detail of the transcription with timing information.");

  // Array items may themselves be null.
  const additionalFormat = () => zod13.object({
    requested_format: zod13.string().describe("The requested format."),
    file_extension: zod13.string().describe("The file extension of the additional format."),
    content_type: zod13.string().describe("The content type of the additional format."),
    is_base64_encoded: zod13.boolean().describe("Whether the content is base64 encoded."),
    content: zod13.string().describe("The content of the additional format.")
  }).or(zod13.null());

  const detectedEntity = () => zod13.object({
    text: zod13.string().describe("The text that was identified as an entity."),
    entity_type: zod13.string().describe(
      "The type of entity detected (e.g., 'credit_card', 'email_address', 'person_name')."
    ),
    start_char: zod13.number().describe("Start character position in the transcript text."),
    end_char: zod13.number().describe("End character position in the transcript text.")
  });

  const transcriptChunk = () => zod13.object({
    language_code: zod13.string().describe("The detected language code (e.g. 'eng' for English)."),
    language_probability: zod13.number().describe("The confidence score of the language detection (0 to 1)."),
    text: zod13.string().describe("The raw text of the transcription."),
    words: zod13.array(wordDetail()).describe("List of words with their timing information."),
    channel_index: optionalNullable(zod13.number()).describe(
      "The channel index this transcript belongs to (for multichannel audio)."
    ),
    additional_formats: optionalNullable(zod13.array(additionalFormat())).describe(
      "Requested additional formats of the transcript."
    ),
    transcription_id: optionalNullable(zod13.string()).describe("The transcription ID of the response."),
    entities: optionalNullable(zod13.array(detectedEntity())).describe(
      "List of detected entities with their text, type, and character positions in the transcript."
    )
  }).describe("Chunk-level detail of the transcription with timing information.");

  const singleChannel = transcriptChunk();
  const multiChannel = zod13.object({
    transcripts: zod13.array(transcriptChunk()).describe(
      "List of transcripts, one for each audio channel. Each transcript contains the text and word-level details for its respective channel."
    ),
    transcription_id: optionalNullable(zod13.string()).describe("The transcription ID of the response.")
  }).describe("Response model for multichannel speech-to-text transcription.");

  return singleChannel.or(multiChannel);
})();
|
|
39723
|
+
// Path-parameter schema for fetching a transcript: one required string id.
var getTranscriptByIdParams = (() => {
  const transcriptionId = zod13.string().describe("The unique ID of the transcript to retrieve");
  return zod13.object({ transcription_id: transcriptionId });
})();
|
|
39726
|
+
// Header schema for fetching a transcript. The "xi-api-key" header is
// declared as an optional string that may also be null.
var getTranscriptByIdHeader = (() => {
  const apiKeyHelp = "Your API key. This is required by most endpoints to access our API programmatically. You can view your xi-api-key using the 'Profile' tab on the website.";
  const apiKey = zod13.string().or(zod13.null()).optional().describe(apiKeyHelp);
  return zod13.object({ "xi-api-key": apiKey });
})();
|
|
39731
|
+
// Response schema for fetching a transcript by id. The result is a union of:
//   - a single transcript chunk, or
//   - a multichannel wrapper holding an array of the same chunk shape.
// The chunk shape appears twice, so it is built by local factories. Each
// factory call returns brand-new schema instances; nothing is shared between
// the two occurrences.
var getTranscriptByIdResponse = (() => {
  const optionalNullable = (schema) => schema.or(zod13.null()).optional();

  const characterTiming = () => zod13.object({
    text: zod13.string().describe("The character that was transcribed."),
    start: optionalNullable(zod13.number()).describe("The start time of the character in seconds."),
    end: optionalNullable(zod13.number()).describe("The end time of the character in seconds.")
  });

  const wordDetail = () => zod13.object({
    text: zod13.string().describe("The word or sound that was transcribed."),
    start: optionalNullable(zod13.number()).describe("The start time of the word or sound in seconds."),
    end: optionalNullable(zod13.number()).describe("The end time of the word or sound in seconds."),
    type: zod13.enum(["word", "spacing", "audio_event"]).describe(
      "The type of the word or sound. 'audio_event' is used for non-word sounds like laughter or footsteps."
    ),
    speaker_id: optionalNullable(zod13.string()).describe("Unique identifier for the speaker of this word."),
    logprob: zod13.number().describe(
      "The log of the probability with which this word was predicted. Logprobs are in range [-infinity, 0], higher logprobs indicate a higher confidence the model has in its predictions."
    ),
    characters: optionalNullable(zod13.array(characterTiming())).describe(
      "The characters that make up the word and their timing information."
    )
  }).describe("Word-level detail of the transcription with timing information.");

  // Array items may themselves be null.
  const additionalFormat = () => zod13.object({
    requested_format: zod13.string().describe("The requested format."),
    file_extension: zod13.string().describe("The file extension of the additional format."),
    content_type: zod13.string().describe("The content type of the additional format."),
    is_base64_encoded: zod13.boolean().describe("Whether the content is base64 encoded."),
    content: zod13.string().describe("The content of the additional format.")
  }).or(zod13.null());

  const detectedEntity = () => zod13.object({
    text: zod13.string().describe("The text that was identified as an entity."),
    entity_type: zod13.string().describe(
      "The type of entity detected (e.g., 'credit_card', 'email_address', 'person_name')."
    ),
    start_char: zod13.number().describe("Start character position in the transcript text."),
    end_char: zod13.number().describe("End character position in the transcript text.")
  });

  const transcriptChunk = () => zod13.object({
    language_code: zod13.string().describe("The detected language code (e.g. 'eng' for English)."),
    language_probability: zod13.number().describe("The confidence score of the language detection (0 to 1)."),
    text: zod13.string().describe("The raw text of the transcription."),
    words: zod13.array(wordDetail()).describe("List of words with their timing information."),
    channel_index: optionalNullable(zod13.number()).describe(
      "The channel index this transcript belongs to (for multichannel audio)."
    ),
    additional_formats: optionalNullable(zod13.array(additionalFormat())).describe(
      "Requested additional formats of the transcript."
    ),
    transcription_id: optionalNullable(zod13.string()).describe("The transcription ID of the response."),
    entities: optionalNullable(zod13.array(detectedEntity())).describe(
      "List of detected entities with their text, type, and character positions in the transcript."
    )
  }).describe("Chunk-level detail of the transcription with timing information.");

  const singleChannel = transcriptChunk();
  const multiChannel = zod13.object({
    transcripts: zod13.array(transcriptChunk()).describe(
      "List of transcripts, one for each audio channel. Each transcript contains the text and word-level details for its respective channel."
    ),
    transcription_id: optionalNullable(zod13.string()).describe("The transcription ID of the response.")
  }).describe("Response model for multichannel speech-to-text transcription.");

  return singleChannel.or(multiChannel);
})();
|
|
39841
|
+
// Path-parameter schema for deleting a transcript: one required string id.
var deleteTranscriptByIdParams = (() => {
  const transcriptionId = zod13.string().describe("The unique ID of the transcript to delete");
  return zod13.object({ transcription_id: transcriptionId });
})();
|
|
39844
|
+
// Header schema for deleting a transcript. The "xi-api-key" header is
// declared as an optional string that may also be null.
var deleteTranscriptByIdHeader = (() => {
  const apiKeyHelp = "Your API key. This is required by most endpoints to access our API programmatically. You can view your xi-api-key using the 'Profile' tab on the website.";
  const apiKey = zod13.string().or(zod13.null()).optional().describe(apiKeyHelp);
  return zod13.object({ "xi-api-key": apiKey });
})();
|
|
39849
|
+
// Schema named for the delete-transcript response; `zod13.any()` places no
// constraint on the value, so no response shape is validated here.
var deleteTranscriptByIdResponse = zod13.any();
|
|
38491
39850
|
export {
|
|
38492
39851
|
AllLanguageCodes,
|
|
38493
39852
|
AllProviders,
|
|
@@ -38542,6 +39901,13 @@ export {
|
|
|
38542
39901
|
DeepgramTranscriptionSchema,
|
|
38543
39902
|
schema_exports4 as DeepgramTypes,
|
|
38544
39903
|
deepgramAPISpecification_zod_exports as DeepgramZodSchemas,
|
|
39904
|
+
ElevenLabsAdapter,
|
|
39905
|
+
ElevenLabsCapabilities,
|
|
39906
|
+
ElevenLabsLanguageCodes,
|
|
39907
|
+
ElevenLabsLanguageLabels,
|
|
39908
|
+
ElevenLabsLanguages,
|
|
39909
|
+
schema_exports8 as ElevenLabsTypes,
|
|
39910
|
+
elevenLabsSpeechToTextAPI_zod_exports as ElevenLabsZodSchemas,
|
|
38545
39911
|
GladiaAdapter,
|
|
38546
39912
|
GladiaBitDepth,
|
|
38547
39913
|
GladiaCapabilities,
|
|
@@ -38628,6 +39994,7 @@ export {
|
|
|
38628
39994
|
createAssemblyAIAdapter,
|
|
38629
39995
|
createAzureSTTAdapter,
|
|
38630
39996
|
createDeepgramAdapter,
|
|
39997
|
+
createElevenLabsAdapter,
|
|
38631
39998
|
createGladiaAdapter,
|
|
38632
39999
|
createOpenAIWhisperAdapter,
|
|
38633
40000
|
createSonioxAdapter,
|