dv-pipecat-ai 0.0.82.dev866__py3-none-any.whl → 0.0.82.dev870__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dv-pipecat-ai
3
- Version: 0.0.82.dev866
3
+ Version: 0.0.82.dev870
4
4
  Summary: An open source framework for voice (and multimodal) assistants
5
5
  License-Expression: BSD-2-Clause
6
6
  Project-URL: Source, https://github.com/pipecat-ai/pipecat
@@ -1,4 +1,4 @@
1
- dv_pipecat_ai-0.0.82.dev866.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
1
+ dv_pipecat_ai-0.0.82.dev870.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
2
2
  pipecat/__init__.py,sha256=j0Xm6adxHhd7D06dIyyPV_GlBYLlBnTAERVvD_jAARQ,861
3
3
  pipecat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  pipecat/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -199,7 +199,8 @@ pipecat/services/deepgram/stt.py,sha256=IvdKvo23PxhKoWTJDxuK4Uoo0wCtkFGAE_QrMUoG
199
199
  pipecat/services/deepgram/tts.py,sha256=H_2WCJEx3_L4ytrHHRNkA-6GKTd1coou_vvTfiEodpQ,3745
200
200
  pipecat/services/deepseek/__init__.py,sha256=bU5z_oNGzgrF_YpsD9pYIMtEibeZFaUobbRjJ9WcYyE,259
201
201
  pipecat/services/deepseek/llm.py,sha256=5KjpU2blmhUTM3LcRE1ymdsk6OmoFkIzeQgyNOGwQh8,3112
202
- pipecat/services/elevenlabs/__init__.py,sha256=FgA--iiHyoart9xZZGWTTrBaEjmFxJuugESjPXihI7A,263
202
+ pipecat/services/elevenlabs/__init__.py,sha256=fl_Z0Ua-IgONoFGn4O-pMafwufjZ9C6cCVJkR1dN5lI,288
203
+ pipecat/services/elevenlabs/stt.py,sha256=yekKTu-ymsvP79LXYaXIFBF2uCBNfqqYmFoimlcSUIg,11291
203
204
  pipecat/services/elevenlabs/tts.py,sha256=RZ9thg_kFWk1xhfmW047Pk0FjtWdkgnRMiGYFv3_cWk,42777
204
205
  pipecat/services/fal/__init__.py,sha256=z_kfZETvUcKy68Lyvni4B-RtdkOvz3J3eh6sFDVKq6M,278
205
206
  pipecat/services/fal/image.py,sha256=vArKLKrIGoZfw_xeZY_E7zbUzfzVsScj-R7mOmVqjRQ,4585
@@ -376,7 +377,7 @@ pipecat/utils/tracing/service_decorators.py,sha256=HwDCqLGijhYD3F8nxDuQmEw-YkRw0
376
377
  pipecat/utils/tracing/setup.py,sha256=7TEgPNpq6M8lww8OQvf0P9FzYc5A30xICGklVA-fua0,2892
377
378
  pipecat/utils/tracing/turn_context_provider.py,sha256=ikon3plFOx0XbMrH6DdeHttNpb-U0gzMZIm3bWLc9eI,2485
378
379
  pipecat/utils/tracing/turn_trace_observer.py,sha256=dma16SBJpYSOE58YDWy89QzHyQFc_9gQZszKeWixuwc,9725
379
- dv_pipecat_ai-0.0.82.dev866.dist-info/METADATA,sha256=wgLiEeKpjSJV79t78ihfeeBxIJMtwP465XGZpJaR6JU,32639
380
- dv_pipecat_ai-0.0.82.dev866.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
381
- dv_pipecat_ai-0.0.82.dev866.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
382
- dv_pipecat_ai-0.0.82.dev866.dist-info/RECORD,,
380
+ dv_pipecat_ai-0.0.82.dev870.dist-info/METADATA,sha256=NXNrjj2RnrOdSGAX8R-L2ZlZWqTtQ-S0YblaIuHZd5Y,32639
381
+ dv_pipecat_ai-0.0.82.dev870.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
382
+ dv_pipecat_ai-0.0.82.dev870.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
383
+ dv_pipecat_ai-0.0.82.dev870.dist-info/RECORD,,
@@ -9,5 +9,7 @@ import sys
9
9
  from pipecat.services import DeprecatedModuleProxy
10
10
 
11
11
  from .tts import *
12
+ from .stt import *
13
+ # Old import path: this package module is swapped for a DeprecatedModuleProxy below.
12
14
 
13
15
  sys.modules[__name__] = DeprecatedModuleProxy(globals(), "elevenlabs", "elevenlabs.tts")
@@ -0,0 +1,351 @@
1
+ #
2
+ # Copyright (c) 2024–2025, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ """ElevenLabs speech-to-text service implementation."""
8
+
9
+ import asyncio
10
+ from typing import AsyncGenerator, Optional
11
+
12
+ from loguru import logger
13
+
14
+ from pipecat.frames.frames import ErrorFrame, Frame, TranscriptionFrame
15
+ from pipecat.services.stt_service import SegmentedSTTService
16
+ from pipecat.transcriptions.language import Language
17
+ from pipecat.utils.time import time_now_iso8601
18
+ from pipecat.utils.tracing.service_decorators import traced_stt
19
+
20
+ try:
21
+ from elevenlabs.client import ElevenLabs
22
+ except ModuleNotFoundError as e:
23
+ logger.error(f"Exception: {e}")
24
+ logger.error("In order to use ElevenLabs, you need to `pip install pipecat-ai[elevenlabs]`.")
25
+ raise Exception(f"Missing module: {e}")
26
+
27
+
28
def language_to_elevenlabs_language(language: Language) -> Optional[str]:
    """Translate a pipecat ``Language`` into the code ElevenLabs expects.

    Every regional variant of a language collapses onto a single
    three-letter code (for example all ``EN_*`` members map to ``"eng"``).

    Args:
        language: The Language enum value to translate.

    Returns:
        The matching ElevenLabs language code, or None when the language
        has no entry in the table.
    """
    # Each ElevenLabs code is listed once, followed by every pipecat
    # Language member that should resolve to it.
    variants_by_code = (
        (
            "eng",
            (
                Language.EN,
                Language.EN_US,
                Language.EN_GB,
                Language.EN_AU,
                Language.EN_CA,
                Language.EN_IN,
                Language.EN_IE,
                Language.EN_NZ,
                Language.EN_ZA,
                Language.EN_SG,
                Language.EN_HK,
                Language.EN_PH,
                Language.EN_KE,
                Language.EN_NG,
                Language.EN_TZ,
            ),
        ),
        (
            "spa",
            (
                Language.ES,
                Language.ES_ES,
                Language.ES_MX,
                Language.ES_AR,
                Language.ES_CO,
                Language.ES_CL,
                Language.ES_VE,
                Language.ES_PE,
                Language.ES_EC,
                Language.ES_GT,
                Language.ES_CU,
                Language.ES_BO,
                Language.ES_DO,
                Language.ES_HN,
                Language.ES_PY,
                Language.ES_SV,
                Language.ES_NI,
                Language.ES_CR,
                Language.ES_PA,
                Language.ES_UY,
                Language.ES_PR,
                Language.ES_US,
                Language.ES_GQ,
            ),
        ),
        ("fra", (Language.FR, Language.FR_FR, Language.FR_CA, Language.FR_BE, Language.FR_CH)),
        ("deu", (Language.DE, Language.DE_DE, Language.DE_AT, Language.DE_CH)),
        ("ita", (Language.IT, Language.IT_IT)),
        ("por", (Language.PT, Language.PT_PT, Language.PT_BR)),
        ("hin", (Language.HI, Language.HI_IN)),
        (
            "ara",
            (
                Language.AR,
                Language.AR_SA,
                Language.AR_EG,
                Language.AR_AE,
                Language.AR_BH,
                Language.AR_DZ,
                Language.AR_IQ,
                Language.AR_JO,
                Language.AR_KW,
                Language.AR_LB,
                Language.AR_LY,
                Language.AR_MA,
                Language.AR_OM,
                Language.AR_QA,
                Language.AR_SY,
                Language.AR_TN,
                Language.AR_YE,
            ),
        ),
        ("jpn", (Language.JA, Language.JA_JP)),
        ("kor", (Language.KO, Language.KO_KR)),
        ("cmn", (Language.ZH, Language.ZH_CN, Language.ZH_TW, Language.ZH_HK)),
        ("rus", (Language.RU, Language.RU_RU)),
        ("nld", (Language.NL, Language.NL_NL, Language.NL_BE)),
        ("pol", (Language.PL, Language.PL_PL)),
        ("tur", (Language.TR, Language.TR_TR)),
        ("swe", (Language.SV, Language.SV_SE)),
        ("nor", (Language.NO, Language.NB, Language.NN)),
        ("dan", (Language.DA, Language.DA_DK)),
        ("fin", (Language.FI, Language.FI_FI)),
        ("ces", (Language.CS, Language.CS_CZ)),
        ("hun", (Language.HU, Language.HU_HU)),
        ("ell", (Language.EL, Language.EL_GR)),
        ("heb", (Language.HE, Language.HE_IL)),
        ("tha", (Language.TH, Language.TH_TH)),
        ("vie", (Language.VI, Language.VI_VN)),
        ("ind", (Language.ID, Language.ID_ID)),
        ("msa", (Language.MS, Language.MS_MY)),
        ("ukr", (Language.UK, Language.UK_UA)),
        ("bul", (Language.BG, Language.BG_BG)),
        ("hrv", (Language.HR, Language.HR_HR)),
        ("slk", (Language.SK, Language.SK_SK)),
        ("slv", (Language.SL, Language.SL_SI)),
        ("est", (Language.ET, Language.ET_EE)),
        ("lav", (Language.LV, Language.LV_LV)),
        ("lit", (Language.LT, Language.LT_LT)),
    )

    # Flatten the grouped table into a direct Language -> code lookup.
    code_for_language = {
        member: code for code, members in variants_by_code for member in members
    }
    return code_for_language.get(language)
204
+
205
+
206
class ElevenlabsSTTService(SegmentedSTTService):
    """ElevenLabs speech-to-text service (Scribe v1 model by default).

    Each audio segment handed to ``run_stt`` is sent to the ElevenLabs batch
    speech-to-text endpoint through the synchronous ``ElevenLabs`` client,
    executed on a worker thread so the event loop is not blocked.

    Args:
        api_key: ElevenLabs API key for authentication.
        model_id: Model to use for transcription (default: "scribe_v1").
        language: Default language for transcription.
        tag_audio_events: Whether to tag audio events like laughter (default: False).
        sample_rate: Audio sample rate, forwarded to SegmentedSTTService.
            None leaves the choice to the base class.
        diarize: Whether to enable speaker diarization (default: False).
        **kwargs: Additional arguments passed to SegmentedSTTService.
    """

    def __init__(
        self,
        *,
        api_key: str,
        model_id: str = "scribe_v1",
        language: Language = Language.EN,
        tag_audio_events: bool = False,
        sample_rate: Optional[int] = None,
        diarize: bool = False,
        **kwargs,
    ):
        # ``sample_rate`` is a named parameter here, so it is not part of
        # ``**kwargs``; it must be forwarded explicitly or the caller's value
        # would be silently discarded.
        super().__init__(sample_rate=sample_rate, **kwargs)

        self._client = ElevenLabs(api_key=api_key)
        self._model_id = model_id
        self._tag_audio_events = tag_audio_events
        self._diarize = diarize

        self._settings = {
            "language": language,
            "model_id": self._model_id,
            "tag_audio_events": self._tag_audio_events,
            "diarize": self._diarize,
        }
        self.set_model_name(model_id)

    def can_generate_metrics(self) -> bool:
        """Check if this service can generate processing metrics.

        Returns:
            True, as ElevenLabs service supports metrics generation.
        """
        return True

    def language_to_service_language(self, language: Language) -> Optional[str]:
        """Convert from pipecat Language to ElevenLabs language code.

        Args:
            language: The Language enum value to convert.

        Returns:
            str or None: The corresponding ElevenLabs language code, or None if not supported.
        """
        return language_to_elevenlabs_language(language)

    async def set_language(self, language: Language):
        """Set the language for transcription.

        The new value is read by ``run_stt`` on its next invocation.

        Args:
            language: The Language enum value to use for transcription.
        """
        self.logger.info(f"Switching STT language to: [{language}]")
        self._settings["language"] = language

    @traced_stt
    async def _handle_transcription(
        self, transcript: str, is_final: bool, language: Optional[Language] = None
    ):
        """Handle a transcription result with tracing.

        The body is intentionally empty: the method exists so that the
        ``traced_stt`` decorator is invoked with the transcription details.
        """
        pass

    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
        """Transcribe the provided audio using ElevenLabs STT.

        Args:
            audio: Audio data (WAV format) to transcribe.

        Yields:
            Frame: TranscriptionFrame containing the transcribed text or ErrorFrame on failure.
        """
        try:
            await self.start_processing_metrics()
            await self.start_ttfb_metrics()

            # Get language code for ElevenLabs API
            language = self._settings["language"]
            elevenlabs_lang = self.language_to_service_language(language)

            # Prepare API parameters
            params = {
                "file": audio,
                "model_id": self._model_id,
                "tag_audio_events": self._tag_audio_events,
                "diarize": self._diarize,
            }

            # Only send a language hint when the configured language has a
            # known ElevenLabs code; otherwise the parameter is omitted.
            if elevenlabs_lang:
                params["language_code"] = elevenlabs_lang

            # The ElevenLabs client call is synchronous; run it in a worker
            # thread to avoid blocking the event loop.
            transcription = await asyncio.to_thread(self._client.speech_to_text.convert, **params)

            await self.stop_ttfb_metrics()

            # Process transcription result
            raw_text = getattr(transcription, "text", None) if transcription else None
            if raw_text:
                transcript_text = raw_text.strip()

                if transcript_text:
                    # The response's ``language_code`` is not mapped back to a
                    # pipecat Language (no reverse table exists), so the
                    # configured language is reported on the frame.
                    response_language = language

                    # Handle transcription with tracing
                    await self._handle_transcription(transcript_text, True, response_language)

                    self.logger.debug(f"ElevenLabs transcription: [{transcript_text}]")

                    yield TranscriptionFrame(
                        text=transcript_text,
                        user_id="",
                        timestamp=time_now_iso8601(),
                        language=response_language,
                        result=transcription,
                    )

            await self.stop_processing_metrics()

        except Exception as e:
            # Surface any failure (API, network, mapping) as an ErrorFrame
            # instead of raising, and make sure no metric is left running.
            self.logger.error(f"ElevenLabs STT error: {e}")
            await self.stop_all_metrics()
            yield ErrorFrame(f"ElevenLabs STT error: {str(e)}")