dv-pipecat-ai 0.0.85.dev5__py3-none-any.whl → 0.0.85.dev698__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of dv-pipecat-ai might be problematic.

Files changed (157)
  1. {dv_pipecat_ai-0.0.85.dev5.dist-info → dv_pipecat_ai-0.0.85.dev698.dist-info}/METADATA +78 -117
  2. {dv_pipecat_ai-0.0.85.dev5.dist-info → dv_pipecat_ai-0.0.85.dev698.dist-info}/RECORD +157 -123
  3. pipecat/adapters/base_llm_adapter.py +38 -1
  4. pipecat/adapters/services/anthropic_adapter.py +9 -14
  5. pipecat/adapters/services/aws_nova_sonic_adapter.py +5 -0
  6. pipecat/adapters/services/bedrock_adapter.py +236 -13
  7. pipecat/adapters/services/gemini_adapter.py +12 -8
  8. pipecat/adapters/services/open_ai_adapter.py +19 -7
  9. pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
  10. pipecat/audio/filters/krisp_viva_filter.py +193 -0
  11. pipecat/audio/filters/noisereduce_filter.py +15 -0
  12. pipecat/audio/turn/base_turn_analyzer.py +9 -1
  13. pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
  14. pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
  15. pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
  16. pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
  17. pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
  18. pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
  19. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
  20. pipecat/audio/vad/data/README.md +10 -0
  21. pipecat/audio/vad/vad_analyzer.py +13 -1
  22. pipecat/extensions/voicemail/voicemail_detector.py +5 -5
  23. pipecat/frames/frames.py +120 -87
  24. pipecat/observers/loggers/debug_log_observer.py +3 -3
  25. pipecat/observers/loggers/llm_log_observer.py +7 -3
  26. pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
  27. pipecat/pipeline/runner.py +12 -4
  28. pipecat/pipeline/service_switcher.py +64 -36
  29. pipecat/pipeline/task.py +85 -24
  30. pipecat/processors/aggregators/dtmf_aggregator.py +28 -22
  31. pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
  32. pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
  33. pipecat/processors/aggregators/llm_response.py +6 -7
  34. pipecat/processors/aggregators/llm_response_universal.py +19 -15
  35. pipecat/processors/aggregators/user_response.py +6 -6
  36. pipecat/processors/aggregators/vision_image_frame.py +24 -2
  37. pipecat/processors/audio/audio_buffer_processor.py +43 -8
  38. pipecat/processors/filters/stt_mute_filter.py +2 -0
  39. pipecat/processors/frame_processor.py +103 -17
  40. pipecat/processors/frameworks/langchain.py +8 -2
  41. pipecat/processors/frameworks/rtvi.py +209 -68
  42. pipecat/processors/frameworks/strands_agents.py +170 -0
  43. pipecat/processors/logger.py +2 -2
  44. pipecat/processors/transcript_processor.py +4 -4
  45. pipecat/processors/user_idle_processor.py +3 -6
  46. pipecat/runner/run.py +270 -50
  47. pipecat/runner/types.py +2 -0
  48. pipecat/runner/utils.py +51 -10
  49. pipecat/serializers/exotel.py +5 -5
  50. pipecat/serializers/livekit.py +20 -0
  51. pipecat/serializers/plivo.py +6 -9
  52. pipecat/serializers/protobuf.py +6 -5
  53. pipecat/serializers/telnyx.py +2 -2
  54. pipecat/serializers/twilio.py +43 -23
  55. pipecat/services/ai_service.py +2 -6
  56. pipecat/services/anthropic/llm.py +2 -25
  57. pipecat/services/asyncai/tts.py +2 -3
  58. pipecat/services/aws/__init__.py +1 -0
  59. pipecat/services/aws/llm.py +122 -97
  60. pipecat/services/aws/nova_sonic/__init__.py +0 -0
  61. pipecat/services/aws/nova_sonic/context.py +367 -0
  62. pipecat/services/aws/nova_sonic/frames.py +25 -0
  63. pipecat/services/aws/nova_sonic/llm.py +1155 -0
  64. pipecat/services/aws/stt.py +1 -3
  65. pipecat/services/aws_nova_sonic/__init__.py +19 -1
  66. pipecat/services/aws_nova_sonic/aws.py +11 -1151
  67. pipecat/services/aws_nova_sonic/context.py +13 -355
  68. pipecat/services/aws_nova_sonic/frames.py +13 -17
  69. pipecat/services/azure/realtime/__init__.py +0 -0
  70. pipecat/services/azure/realtime/llm.py +65 -0
  71. pipecat/services/azure/stt.py +15 -0
  72. pipecat/services/cartesia/tts.py +2 -2
  73. pipecat/services/deepgram/__init__.py +1 -0
  74. pipecat/services/deepgram/flux/__init__.py +0 -0
  75. pipecat/services/deepgram/flux/stt.py +636 -0
  76. pipecat/services/elevenlabs/__init__.py +2 -1
  77. pipecat/services/elevenlabs/stt.py +254 -276
  78. pipecat/services/elevenlabs/tts.py +5 -5
  79. pipecat/services/fish/tts.py +2 -2
  80. pipecat/services/gemini_multimodal_live/events.py +38 -524
  81. pipecat/services/gemini_multimodal_live/file_api.py +23 -173
  82. pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
  83. pipecat/services/gladia/stt.py +56 -72
  84. pipecat/services/google/__init__.py +1 -0
  85. pipecat/services/google/gemini_live/__init__.py +3 -0
  86. pipecat/services/google/gemini_live/file_api.py +189 -0
  87. pipecat/services/google/gemini_live/llm.py +1582 -0
  88. pipecat/services/google/gemini_live/llm_vertex.py +184 -0
  89. pipecat/services/google/llm.py +15 -11
  90. pipecat/services/google/llm_openai.py +3 -3
  91. pipecat/services/google/llm_vertex.py +86 -16
  92. pipecat/services/google/tts.py +7 -3
  93. pipecat/services/heygen/api.py +2 -0
  94. pipecat/services/heygen/client.py +8 -4
  95. pipecat/services/heygen/video.py +2 -0
  96. pipecat/services/hume/__init__.py +5 -0
  97. pipecat/services/hume/tts.py +220 -0
  98. pipecat/services/inworld/tts.py +6 -6
  99. pipecat/services/llm_service.py +15 -5
  100. pipecat/services/lmnt/tts.py +2 -2
  101. pipecat/services/mcp_service.py +4 -2
  102. pipecat/services/mem0/memory.py +6 -5
  103. pipecat/services/mistral/llm.py +29 -8
  104. pipecat/services/moondream/vision.py +42 -16
  105. pipecat/services/neuphonic/tts.py +2 -2
  106. pipecat/services/openai/__init__.py +1 -0
  107. pipecat/services/openai/base_llm.py +27 -20
  108. pipecat/services/openai/realtime/__init__.py +0 -0
  109. pipecat/services/openai/realtime/context.py +272 -0
  110. pipecat/services/openai/realtime/events.py +1106 -0
  111. pipecat/services/openai/realtime/frames.py +37 -0
  112. pipecat/services/openai/realtime/llm.py +829 -0
  113. pipecat/services/openai/tts.py +16 -8
  114. pipecat/services/openai_realtime/__init__.py +27 -0
  115. pipecat/services/openai_realtime/azure.py +21 -0
  116. pipecat/services/openai_realtime/context.py +21 -0
  117. pipecat/services/openai_realtime/events.py +21 -0
  118. pipecat/services/openai_realtime/frames.py +21 -0
  119. pipecat/services/openai_realtime_beta/azure.py +16 -0
  120. pipecat/services/openai_realtime_beta/openai.py +17 -5
  121. pipecat/services/playht/tts.py +31 -4
  122. pipecat/services/rime/tts.py +3 -4
  123. pipecat/services/sarvam/tts.py +2 -6
  124. pipecat/services/simli/video.py +2 -2
  125. pipecat/services/speechmatics/stt.py +1 -7
  126. pipecat/services/stt_service.py +34 -0
  127. pipecat/services/tavus/video.py +2 -2
  128. pipecat/services/tts_service.py +9 -9
  129. pipecat/services/vision_service.py +7 -6
  130. pipecat/services/vistaar/llm.py +4 -0
  131. pipecat/tests/utils.py +4 -4
  132. pipecat/transcriptions/language.py +41 -1
  133. pipecat/transports/base_input.py +17 -42
  134. pipecat/transports/base_output.py +42 -26
  135. pipecat/transports/daily/transport.py +199 -26
  136. pipecat/transports/heygen/__init__.py +0 -0
  137. pipecat/transports/heygen/transport.py +381 -0
  138. pipecat/transports/livekit/transport.py +228 -63
  139. pipecat/transports/local/audio.py +6 -1
  140. pipecat/transports/local/tk.py +11 -2
  141. pipecat/transports/network/fastapi_websocket.py +1 -1
  142. pipecat/transports/smallwebrtc/connection.py +98 -19
  143. pipecat/transports/smallwebrtc/request_handler.py +204 -0
  144. pipecat/transports/smallwebrtc/transport.py +65 -23
  145. pipecat/transports/tavus/transport.py +23 -12
  146. pipecat/transports/websocket/client.py +41 -5
  147. pipecat/transports/websocket/fastapi.py +21 -11
  148. pipecat/transports/websocket/server.py +14 -7
  149. pipecat/transports/whatsapp/api.py +8 -0
  150. pipecat/transports/whatsapp/client.py +47 -0
  151. pipecat/utils/base_object.py +54 -22
  152. pipecat/utils/string.py +12 -1
  153. pipecat/utils/tracing/service_decorators.py +21 -21
  154. {dv_pipecat_ai-0.0.85.dev5.dist-info → dv_pipecat_ai-0.0.85.dev698.dist-info}/WHEEL +0 -0
  155. {dv_pipecat_ai-0.0.85.dev5.dist-info → dv_pipecat_ai-0.0.85.dev698.dist-info}/licenses/LICENSE +0 -0
  156. {dv_pipecat_ai-0.0.85.dev5.dist-info → dv_pipecat_ai-0.0.85.dev698.dist-info}/top_level.txt +0 -0
  157. /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
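
The diffs shown below cover the ElevenLabs services: pipecat/services/elevenlabs/stt.py (rewritten from the SDK-based implementation to direct HTTP calls against the Speech-to-Text API) and pipecat/services/elevenlabs/tts.py (updated for the StartInterruptionFrame → InterruptionFrame rename).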
pipecat/services/elevenlabs/stt.py

@@ -4,12 +4,19 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-"""ElevenLabs speech-to-text service implementation."""
+"""ElevenLabs speech-to-text service implementation.
 
-import asyncio
+This module provides integration with ElevenLabs' Speech-to-Text API for transcription
+using segmented audio processing. The service uploads audio files and receives
+transcription results directly.
+"""
+
+import io
 from typing import AsyncGenerator, Optional
 
+import aiohttp
 from loguru import logger
+from pydantic import BaseModel
 
 from pipecat.frames.frames import ErrorFrame, Frame, TranscriptionFrame
 from pipecat.services.stt_service import SegmentedSTTService
@@ -17,345 +24,316 @@ from pipecat.transcriptions.language import Language
 from pipecat.utils.time import time_now_iso8601
 from pipecat.utils.tracing.service_decorators import traced_stt
 
-try:
-    from elevenlabs.client import ElevenLabs
-except ModuleNotFoundError as e:
-    logger.error(f"Exception: {e}")
-    logger.error("In order to use ElevenLabs, you need to `pip install pipecat-ai[elevenlabs]`.")
-    raise Exception(f"Missing module: {e}")
-
 
 def language_to_elevenlabs_language(language: Language) -> Optional[str]:
-    """Maps pipecat Language enum to ElevenLabs language codes.
+    """Convert a Language enum to ElevenLabs language code.
+
+    Source:
+        https://elevenlabs.io/docs/capabilities/speech-to-text
 
     Args:
-        language: A Language enum value representing the input language.
+        language: The Language enum value to convert.
 
     Returns:
-        str or None: The corresponding ElevenLabs language code, or None if not supported.
+        The corresponding ElevenLabs language code, or None if not supported.
     """
-    language_map = {
-        # English
-        Language.EN: "eng",
-        Language.EN_US: "eng",
-        Language.EN_GB: "eng",
-        Language.EN_AU: "eng",
-        Language.EN_CA: "eng",
-        Language.EN_IN: "eng",
-        Language.EN_IE: "eng",
-        Language.EN_NZ: "eng",
-        Language.EN_ZA: "eng",
-        Language.EN_SG: "eng",
-        Language.EN_HK: "eng",
-        Language.EN_PH: "eng",
-        Language.EN_KE: "eng",
-        Language.EN_NG: "eng",
-        Language.EN_TZ: "eng",
-        # Spanish
-        Language.ES: "spa",
-        Language.ES_ES: "spa",
-        Language.ES_MX: "spa",
-        Language.ES_AR: "spa",
-        Language.ES_CO: "spa",
-        Language.ES_CL: "spa",
-        Language.ES_VE: "spa",
-        Language.ES_PE: "spa",
-        Language.ES_EC: "spa",
-        Language.ES_GT: "spa",
-        Language.ES_CU: "spa",
-        Language.ES_BO: "spa",
-        Language.ES_DO: "spa",
-        Language.ES_HN: "spa",
-        Language.ES_PY: "spa",
-        Language.ES_SV: "spa",
-        Language.ES_NI: "spa",
-        Language.ES_CR: "spa",
-        Language.ES_PA: "spa",
-        Language.ES_UY: "spa",
-        Language.ES_PR: "spa",
-        Language.ES_US: "spa",
-        Language.ES_GQ: "spa",
-        # French
-        Language.FR: "fra",
-        Language.FR_FR: "fra",
-        Language.FR_CA: "fra",
-        Language.FR_BE: "fra",
-        Language.FR_CH: "fra",
-        # German
-        Language.DE: "deu",
-        Language.DE_DE: "deu",
-        Language.DE_AT: "deu",
-        Language.DE_CH: "deu",
-        # Italian
-        Language.IT: "ita",
-        Language.IT_IT: "ita",
-        # Portuguese
-        Language.PT: "por",
-        Language.PT_PT: "por",
-        Language.PT_BR: "por",
-        # Hindi
-        Language.HI: "hin",
-        Language.HI_IN: "hin",
-        # Arabic
-        Language.AR: "ara",
-        Language.AR_SA: "ara",
-        Language.AR_EG: "ara",
-        Language.AR_AE: "ara",
-        Language.AR_BH: "ara",
-        Language.AR_DZ: "ara",
-        Language.AR_IQ: "ara",
-        Language.AR_JO: "ara",
-        Language.AR_KW: "ara",
-        Language.AR_LB: "ara",
-        Language.AR_LY: "ara",
-        Language.AR_MA: "ara",
-        Language.AR_OM: "ara",
-        Language.AR_QA: "ara",
-        Language.AR_SY: "ara",
-        Language.AR_TN: "ara",
-        Language.AR_YE: "ara",
-        # Japanese
-        Language.JA: "jpn",
-        Language.JA_JP: "jpn",
-        # Korean
-        Language.KO: "kor",
-        Language.KO_KR: "kor",
-        # Chinese
-        Language.ZH: "cmn",
-        Language.ZH_CN: "cmn",
-        Language.ZH_TW: "cmn",
-        Language.ZH_HK: "cmn",
-        # Russian
-        Language.RU: "rus",
-        Language.RU_RU: "rus",
-        # Dutch
-        Language.NL: "nld",
-        Language.NL_NL: "nld",
-        Language.NL_BE: "nld",
-        # Polish
-        Language.PL: "pol",
-        Language.PL_PL: "pol",
-        # Turkish
-        Language.TR: "tur",
-        Language.TR_TR: "tur",
-        # Swedish
-        Language.SV: "swe",
-        Language.SV_SE: "swe",
-        # Norwegian
-        Language.NO: "nor",
-        Language.NB: "nor",
-        Language.NN: "nor",
-        # Danish
-        Language.DA: "dan",
-        Language.DA_DK: "dan",
-        # Finnish
-        Language.FI: "fin",
-        Language.FI_FI: "fin",
-        # Czech
-        Language.CS: "ces",
-        Language.CS_CZ: "ces",
-        # Hungarian
-        Language.HU: "hun",
-        Language.HU_HU: "hun",
-        # Greek
-        Language.EL: "ell",
-        Language.EL_GR: "ell",
-        # Hebrew
-        Language.HE: "heb",
-        Language.HE_IL: "heb",
-        # Thai
-        Language.TH: "tha",
-        Language.TH_TH: "tha",
-        # Vietnamese
-        Language.VI: "vie",
-        Language.VI_VN: "vie",
-        # Indonesian
-        Language.ID: "ind",
-        Language.ID_ID: "ind",
-        # Malay
-        Language.MS: "msa",
-        Language.MS_MY: "msa",
-        # Ukrainian
-        Language.UK: "ukr",
-        Language.UK_UA: "ukr",
-        # Bulgarian
-        Language.BG: "bul",
-        Language.BG_BG: "bul",
-        # Croatian
-        Language.HR: "hrv",
-        Language.HR_HR: "hrv",
-        # Slovak
-        Language.SK: "slk",
-        Language.SK_SK: "slk",
-        # Slovenian
-        Language.SL: "slv",
-        Language.SL_SI: "slv",
-        # Estonian
-        Language.ET: "est",
-        Language.ET_EE: "est",
-        # Latvian
-        Language.LV: "lav",
-        Language.LV_LV: "lav",
-        # Lithuanian
-        Language.LT: "lit",
-        Language.LT_LT: "lit",
-        Language.TA: "tam",  # Tamil
-        Language.TA_IN: "tam",  # Tamil
-        Language.TE: "tel",  # Telugu
-        Language.TE_IN: "tel",  # Telugu
+    BASE_LANGUAGES = {
+        Language.AF: "afr",  # Afrikaans
+        Language.AM: "amh",  # Amharic
+        Language.AR: "ara",  # Arabic
+        Language.HY: "hye",  # Armenian
+        Language.AS: "asm",  # Assamese
+        Language.AST: "ast",  # Asturian
+        Language.AZ: "aze",  # Azerbaijani
+        Language.BE: "bel",  # Belarusian
+        Language.BN: "ben",  # Bengali
+        Language.BS: "bos",  # Bosnian
+        Language.BG: "bul",  # Bulgarian
+        Language.MY: "mya",  # Burmese
+        Language.YUE: "yue",  # Cantonese
+        Language.CA: "cat",  # Catalan
+        Language.CEB: "ceb",  # Cebuano
+        Language.NY: "nya",  # Chichewa
+        Language.HR: "hrv",  # Croatian
+        Language.CS: "ces",  # Czech
+        Language.DA: "dan",  # Danish
+        Language.NL: "nld",  # Dutch
+        Language.EN: "eng",  # English
+        Language.ET: "est",  # Estonian
+        Language.FIL: "fil",  # Filipino
+        Language.FI: "fin",  # Finnish
+        Language.FR: "fra",  # French
+        Language.FF: "ful",  # Fulah
+        Language.GL: "glg",  # Galician
+        Language.LG: "lug",  # Ganda
+        Language.KA: "kat",  # Georgian
+        Language.DE: "deu",  # German
+        Language.EL: "ell",  # Greek
+        Language.GU: "guj",  # Gujarati
+        Language.HA: "hau",  # Hausa
+        Language.HE: "heb",  # Hebrew
+        Language.HI: "hin",  # Hindi
+        Language.HU: "hun",  # Hungarian
+        Language.IS: "isl",  # Icelandic
+        Language.IG: "ibo",  # Igbo
+        Language.ID: "ind",  # Indonesian
+        Language.GA: "gle",  # Irish
+        Language.IT: "ita",  # Italian
+        Language.JA: "jpn",  # Japanese
+        Language.JV: "jav",  # Javanese
+        Language.KEA: "kea",  # Kabuverdianu
         Language.KN: "kan",  # Kannada
-        Language.KN_IN: "kan",  # Kannada
+        Language.KK: "kaz",  # Kazakh
+        Language.KM: "khm",  # Khmer
+        Language.KO: "kor",  # Korean
+        Language.KU: "kur",  # Kurdish
+        Language.KY: "kir",  # Kyrgyz
+        Language.LO: "lao",  # Lao
+        Language.LV: "lav",  # Latvian
+        Language.LN: "lin",  # Lingala
+        Language.LT: "lit",  # Lithuanian
+        Language.LUO: "luo",  # Luo
+        Language.LB: "ltz",  # Luxembourgish
+        Language.MK: "mkd",  # Macedonian
+        Language.MS: "msa",  # Malay
         Language.ML: "mal",  # Malayalam
-        Language.ML_IN: "mal",  # Malayalam
+        Language.MT: "mlt",  # Maltese
+        Language.ZH: "zho",  # Mandarin Chinese
+        Language.MI: "mri",  # Māori
         Language.MR: "mar",  # Marathi
-        Language.MR_IN: "mar",  # Marathi
+        Language.MN: "mon",  # Mongolian
+        Language.NE: "nep",  # Nepali
+        Language.NSO: "nso",  # Northern Sotho
+        Language.NO: "nor",  # Norwegian
+        Language.OC: "oci",  # Occitan
+        Language.OR: "ori",  # Odia
+        Language.PS: "pus",  # Pashto
+        Language.FA: "fas",  # Persian
+        Language.PL: "pol",  # Polish
+        Language.PT: "por",  # Portuguese
+        Language.PA: "pan",  # Punjabi
+        Language.RO: "ron",  # Romanian
+        Language.RU: "rus",  # Russian
+        Language.SR: "srp",  # Serbian
+        Language.SN: "sna",  # Shona
+        Language.SD: "snd",  # Sindhi
+        Language.SK: "slk",  # Slovak
+        Language.SL: "slv",  # Slovenian
+        Language.SO: "som",  # Somali
+        Language.ES: "spa",  # Spanish
+        Language.SW: "swa",  # Swahili
+        Language.SV: "swe",  # Swedish
+        Language.TA: "tam",  # Tamil
+        Language.TG: "tgk",  # Tajik
+        Language.TE: "tel",  # Telugu
+        Language.TH: "tha",  # Thai
+        Language.TR: "tur",  # Turkish
+        Language.UK: "ukr",  # Ukrainian
+        Language.UMB: "umb",  # Umbundu
+        Language.UR: "urd",  # Urdu
+        Language.UZ: "uzb",  # Uzbek
+        Language.VI: "vie",  # Vietnamese
+        Language.CY: "cym",  # Welsh
+        Language.WO: "wol",  # Wolof
+        Language.XH: "xho",  # Xhosa
+        Language.ZU: "zul",  # Zulu
     }
-    return language_map.get(language)
 
+    result = BASE_LANGUAGES.get(language)
 
-class ElevenlabsSTTService(SegmentedSTTService):
-    """ElevenLabs speech-to-text service using Scribe v1 model.
+    # If not found in base languages, try to find the base language from a variant
+    if not result:
+        lang_str = str(language.value)
+        base_code = lang_str.split("-")[0].lower()
+        result = base_code if base_code in BASE_LANGUAGES.values() else None
 
-    This service uses ElevenLabs' batch STT API to transcribe audio segments.
-    It extends SegmentedSTTService to handle VAD-based audio segmentation.
+    return result
 
-    Args:
-        api_key: ElevenLabs API key for authentication.
-        model_id: Model to use for transcription (default: "scribe_v1").
-        language: Default language for transcription.
-        tag_audio_events: Whether to tag audio events like laughter (default: False).
-        diarize: Whether to enable speaker diarization (default: False).
-        **kwargs: Additional arguments passed to SegmentedSTTService.
+
+class ElevenLabsSTTService(SegmentedSTTService):
+    """Speech-to-text service using ElevenLabs' file-based API.
+
+    This service uses ElevenLabs' Speech-to-Text API to perform transcription on audio
+    segments. It inherits from SegmentedSTTService to handle audio buffering and speech detection.
+    The service uploads audio files to ElevenLabs and receives transcription results directly.
     """
 
+    class InputParams(BaseModel):
+        """Configuration parameters for ElevenLabs STT API.
+
+        Parameters:
+            language: Target language for transcription.
+            tag_audio_events: Whether to include audio events like (laughter), (coughing), in the transcription.
+        """
+
+        language: Optional[Language] = None
+        tag_audio_events: bool = True
+
     def __init__(
         self,
         *,
         api_key: str,
-        model_id: str = "scribe_v1",
-        language: Optional[Language] = None,
-        tag_audio_events: bool = False,
+        aiohttp_session: aiohttp.ClientSession,
+        base_url: str = "https://api.elevenlabs.io",
+        model: str = "scribe_v1",
         sample_rate: Optional[int] = None,
-        diarize: bool = False,
+        params: Optional[InputParams] = None,
        **kwargs,
     ):
-        super().__init__(**kwargs)
+        """Initialize the ElevenLabs STT service.
+
+        Args:
+            api_key: ElevenLabs API key for authentication.
+            aiohttp_session: aiohttp ClientSession for HTTP requests.
+            base_url: Base URL for ElevenLabs API.
+            model: Model ID for transcription. Defaults to "scribe_v1".
+            sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
+            params: Configuration parameters for the STT service.
+            **kwargs: Additional arguments passed to SegmentedSTTService.
+        """
+        super().__init__(
+            sample_rate=sample_rate,
+            **kwargs,
+        )
 
-        self._client = ElevenLabs(api_key=api_key)
-        self._model_id = model_id
-        self._tag_audio_events = tag_audio_events
-        self._diarize = diarize
+        params = params or ElevenLabsSTTService.InputParams()
+
+        self._api_key = api_key
+        self._base_url = base_url
+        self._session = aiohttp_session
+        self._model_id = model
+        self._tag_audio_events = params.tag_audio_events
 
         self._settings = {
-            "language": language,
-            "model_id": self._model_id,
-            "tag_audio_events": self._tag_audio_events,
-            "diarize": self._diarize,
+            "language": self.language_to_service_language(params.language)
+            if params.language
+            else "eng",
         }
-        self.set_model_name(model_id)
 
     def can_generate_metrics(self) -> bool:
-        """Check if this service can generate processing metrics.
+        """Check if the service can generate processing metrics.
 
         Returns:
-            True, as ElevenLabs service supports metrics generation.
+            True, as ElevenLabs STT service supports metrics generation.
         """
         return True
 
     def language_to_service_language(self, language: Language) -> Optional[str]:
-        """Convert from pipecat Language to ElevenLabs language code.
+        """Convert a Language enum to ElevenLabs service-specific language code.
 
         Args:
-            language: The Language enum value to convert.
+            language: The language to convert.
 
         Returns:
-            str or None: The corresponding ElevenLabs language code, or None if not supported.
+            The ElevenLabs-specific language code, or None if not supported.
         """
         return language_to_elevenlabs_language(language)
 
     async def set_language(self, language: Language):
-        """Set the language for transcription.
+        """Set the transcription language.
 
         Args:
-            language: The Language enum value to use for transcription.
+            language: The language to use for speech-to-text transcription.
         """
         self.logger.info(f"Switching STT language to: [{language}]")
-        self._settings["language"] = language
+        self._settings["language"] = self.language_to_service_language(language)
+
+    async def set_model(self, model: str):
+        """Set the STT model.
+
+        Args:
+            model: The model name to use for transcription.
+
+        Note:
+            ElevenLabs STT API does not currently support model selection.
+            This method is provided for interface compatibility.
+        """
+        await super().set_model(model)
+        self.logger.info(f"Model setting [{model}] noted, but ElevenLabs STT uses default model")
+
+    async def _transcribe_audio(self, audio_data: bytes) -> dict:
+        """Upload audio data to ElevenLabs and get transcription result.
+
+        Args:
+            audio_data: Raw audio bytes in WAV format.
+
+        Returns:
+            The transcription result data.
+
+        Raises:
+            Exception: If transcription fails or returns an error.
+        """
+        url = f"{self._base_url}/v1/speech-to-text"
+        headers = {"xi-api-key": self._api_key}
+
+        # Create form data with the audio file
+        data = aiohttp.FormData()
+        data.add_field(
+            "file",
+            io.BytesIO(audio_data),
+            filename="audio.wav",
+            content_type="audio/x-wav",
+        )
+
+        # Add required model_id, language_code, and tag_audio_events
+        data.add_field("model_id", self._model_id)
+        data.add_field("language_code", self._settings["language"])
+        data.add_field("tag_audio_events", str(self._tag_audio_events).lower())
+
+        async with self._session.post(url, data=data, headers=headers) as response:
+            if response.status != 200:
+                error_text = await response.text()
+                self.logger.error(f"ElevenLabs transcription error: {error_text}")
+                raise Exception(f"Transcription failed with status {response.status}: {error_text}")
+
+            result = await response.json()
+            return result
 
     @traced_stt
     async def _handle_transcription(
-        self, transcript: str, is_final: bool, language: Optional[Language] = None
+        self, transcript: str, is_final: bool, language: Optional[str] = None
     ):
         """Handle a transcription result with tracing."""
-        pass
+        await self.stop_ttfb_metrics()
+        await self.stop_processing_metrics()
 
     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
-        """Transcribe the provided audio using ElevenLabs STT.
+        """Transcribe an audio segment using ElevenLabs' STT API.
 
         Args:
-            audio: Audio data (WAV format) to transcribe.
+            audio: Raw audio bytes in WAV format (already converted by base class).
 
         Yields:
-            Frame: TranscriptionFrame containing the transcribed text or ErrorFrame on failure.
+            Frame: TranscriptionFrame containing the transcribed text, or ErrorFrame on failure.
+
+        Note:
+            The audio is already in WAV format from the SegmentedSTTService.
+            Only non-empty transcriptions are yielded.
         """
         try:
             await self.start_processing_metrics()
             await self.start_ttfb_metrics()
 
-            # Get language code for ElevenLabs API
-            params = {
-                "file": audio,
-                "model_id": self._model_id,
-                "tag_audio_events": self._tag_audio_events,
-                "diarize": self._diarize,
-            }
-
-            language = self._settings["language"]
-            if language is not None:
-                elevenlabs_lang = self.language_to_service_language(language)
-                if elevenlabs_lang:
-                    params["language_code"] = elevenlabs_lang
-                else:
-                    params["language_code"] = None
-
-            # Call ElevenLabs STT API in thread pool to avoid blocking
-            transcription = await asyncio.to_thread(self._client.speech_to_text.convert, **params)
-
-            await self.stop_ttfb_metrics()
-
-            # Process transcription result
-            if transcription and hasattr(transcription, "text") and transcription.text:
-                transcript_text = transcription.text.strip()
-
-                if transcript_text:
-                    # Determine language if available from response
-                    response_language = language
-                    if hasattr(transcription, "language_code") and transcription.language_code:
-                        # Try to map back from ElevenLabs language code to pipecat Language
-                        try:
-                            # This is a simplified mapping - you might want to create a reverse map
-                            response_language = language  # For now, keep the original
-                        except ValueError:
-                            self.logger.warning(
-                                f"Unknown language detected: {transcription.language_code}"
-                            )
-
-                    # Handle transcription with tracing
-                    await self._handle_transcription(transcript_text, True, response_language)
-
-                    self.logger.debug(f"ElevenLabs transcription: [{transcript_text}]")
-
-                    yield TranscriptionFrame(
-                        text=transcript_text,
-                        user_id="",
-                        timestamp=time_now_iso8601(),
-                        language=response_language,
-                        result=transcription,
-                    )
-
-            await self.stop_processing_metrics()
+            # Upload audio and get transcription result directly
+            result = await self._transcribe_audio(audio)
+
+            # Extract transcription text
+            text = result.get("text", "").strip()
+            if text:
+                # Use the language_code returned by the API
+                detected_language = result.get("language_code", "eng")
+
+                await self._handle_transcription(text, True, detected_language)
+                self.logger.debug(f"Transcription: [{text}]")
+
+                yield TranscriptionFrame(
+                    text,
+                    self._user_id,
+                    time_now_iso8601(),
+                    detected_language,
+                    result=result,
+                )
 
         except Exception as e:
             self.logger.error(f"ElevenLabs STT error: {e}")
-            await self.stop_all_metrics()
             yield ErrorFrame(f"ElevenLabs STT error: {str(e)}")
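
The net effect for callers of this rewrite: the class is renamed from ElevenlabsSTTService to ElevenLabsSTTService, an aiohttp.ClientSession becomes a required constructor argument, language and tag_audio_events move into a nested InputParams model, and the model_id/diarize keyword arguments are gone. One observation on the new variant fallback in language_to_elevenlabs_language: it compares the lowercased base segment of the enum value (e.g. "en" from "en-US") against the three-letter values of BASE_LANGUAGES, so most regional variants will resolve to None rather than to their base language. Below is a minimal, hypothetical usage sketch of the new constructor; the key value and session lifecycle are illustrative, not taken from the package:

import aiohttp

from pipecat.services.elevenlabs.stt import ElevenLabsSTTService
from pipecat.transcriptions.language import Language


async def create_stt() -> ElevenLabsSTTService:
    # The service no longer creates its own ElevenLabs SDK client; the
    # caller owns (and must eventually close) the aiohttp session.
    session = aiohttp.ClientSession()
    stt = ElevenLabsSTTService(
        api_key="ELEVENLABS_API_KEY",  # placeholder, not a real key
        aiohttp_session=session,
        params=ElevenLabsSTTService.InputParams(
            language=Language.EN,   # mapped to "eng" via language_to_elevenlabs_language
            tag_audio_events=True,  # include events such as (laughter) in transcripts
        ),
    )
    # Each VAD-segmented WAV buffer is then POSTed as multipart form data
    # to {base_url}/v1/speech-to-text by _transcribe_audio().
    return stt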
pipecat/services/elevenlabs/tts.py

@@ -25,9 +25,9 @@ from pipecat.frames.frames import (
     EndFrame,
     ErrorFrame,
     Frame,
+    InterruptionFrame,
     LLMFullResponseEndFrame,
     StartFrame,
-    StartInterruptionFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
@@ -465,7 +465,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
             direction: The direction to push the frame.
         """
         await super().push_frame(frame, direction)
-        if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
+        if isinstance(frame, (TTSStoppedFrame, InterruptionFrame)):
             self._started = False
             if isinstance(frame, TTSStoppedFrame):
                 await self.add_word_timestamps([("Reset", 0)])
@@ -550,7 +550,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
             return self._websocket
         raise Exception("Websocket not connected")
 
-    async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
+    async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
         """Handle interruption by closing the current context."""
         await super()._handle_interruption(frame, direction)
 
@@ -559,7 +559,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
             logger.trace(f"Closing context {self._context_id} due to interruption")
             try:
                 # ElevenLabs requires that Pipecat manages the contexts and closes them
-                # when they're not longer in use. Since a StartInterruptionFrame is pushed
+                # when they're not longer in use. Since an InterruptionFrame is pushed
                 # every time the user speaks, we'll use this as a trigger to close the context
                 # and reset the state.
                 # Note: We do not need to call remove_audio_context here, as the context is
@@ -858,7 +858,7 @@ class ElevenLabsHttpTTSService(WordTTSService):
             direction: The direction to push the frame.
         """
         await super().push_frame(frame, direction)
-        if isinstance(frame, (StartInterruptionFrame, TTSStoppedFrame)):
+        if isinstance(frame, (InterruptionFrame, TTSStoppedFrame)):
             # Reset timing on interruption or stop
             self._reset_state()
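
The tts.py changes are a mechanical rename: StartInterruptionFrame becomes InterruptionFrame in the import block, both push_frame overrides, the _handle_interruption signature, and a comment. Downstream processors built against the old frame name need the same one-line update; a hedged sketch, where the processor class is hypothetical:

from pipecat.frames.frames import InterruptionFrame  # previously StartInterruptionFrame


class MyResettingProcessor:  # hypothetical downstream processor
    def __init__(self) -> None:
        self._started = False

    def on_frame(self, frame) -> None:
        # Mirrors what the updated ElevenLabs services do in push_frame():
        # reset state whenever an interruption comes through.
        if isinstance(frame, InterruptionFrame):
            self._started = False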