dv-pipecat-ai 0.0.85.dev7__py3-none-any.whl → 0.0.85.dev699__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dv-pipecat-ai might be problematic. Click here for more details.
- {dv_pipecat_ai-0.0.85.dev7.dist-info → dv_pipecat_ai-0.0.85.dev699.dist-info}/METADATA +78 -117
- {dv_pipecat_ai-0.0.85.dev7.dist-info → dv_pipecat_ai-0.0.85.dev699.dist-info}/RECORD +158 -122
- pipecat/adapters/base_llm_adapter.py +38 -1
- pipecat/adapters/services/anthropic_adapter.py +9 -14
- pipecat/adapters/services/aws_nova_sonic_adapter.py +5 -0
- pipecat/adapters/services/bedrock_adapter.py +236 -13
- pipecat/adapters/services/gemini_adapter.py +12 -8
- pipecat/adapters/services/open_ai_adapter.py +19 -7
- pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
- pipecat/audio/filters/krisp_viva_filter.py +193 -0
- pipecat/audio/filters/noisereduce_filter.py +15 -0
- pipecat/audio/turn/base_turn_analyzer.py +9 -1
- pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
- pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
- pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
- pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
- pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
- pipecat/audio/vad/data/README.md +10 -0
- pipecat/audio/vad/vad_analyzer.py +13 -1
- pipecat/extensions/voicemail/voicemail_detector.py +5 -5
- pipecat/frames/frames.py +120 -87
- pipecat/observers/loggers/debug_log_observer.py +3 -3
- pipecat/observers/loggers/llm_log_observer.py +7 -3
- pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
- pipecat/pipeline/runner.py +12 -4
- pipecat/pipeline/service_switcher.py +64 -36
- pipecat/pipeline/task.py +85 -24
- pipecat/processors/aggregators/dtmf_aggregator.py +28 -22
- pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
- pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
- pipecat/processors/aggregators/llm_response.py +6 -7
- pipecat/processors/aggregators/llm_response_universal.py +19 -15
- pipecat/processors/aggregators/user_response.py +6 -6
- pipecat/processors/aggregators/vision_image_frame.py +24 -2
- pipecat/processors/audio/audio_buffer_processor.py +43 -8
- pipecat/processors/filters/stt_mute_filter.py +2 -0
- pipecat/processors/frame_processor.py +103 -17
- pipecat/processors/frameworks/langchain.py +8 -2
- pipecat/processors/frameworks/rtvi.py +209 -68
- pipecat/processors/frameworks/strands_agents.py +170 -0
- pipecat/processors/logger.py +2 -2
- pipecat/processors/transcript_processor.py +4 -4
- pipecat/processors/user_idle_processor.py +3 -6
- pipecat/runner/run.py +270 -50
- pipecat/runner/types.py +2 -0
- pipecat/runner/utils.py +51 -10
- pipecat/serializers/exotel.py +5 -5
- pipecat/serializers/livekit.py +20 -0
- pipecat/serializers/plivo.py +6 -9
- pipecat/serializers/protobuf.py +6 -5
- pipecat/serializers/telnyx.py +2 -2
- pipecat/serializers/twilio.py +43 -23
- pipecat/services/ai_service.py +2 -6
- pipecat/services/anthropic/llm.py +2 -25
- pipecat/services/asyncai/tts.py +2 -3
- pipecat/services/aws/__init__.py +1 -0
- pipecat/services/aws/llm.py +122 -97
- pipecat/services/aws/nova_sonic/__init__.py +0 -0
- pipecat/services/aws/nova_sonic/context.py +367 -0
- pipecat/services/aws/nova_sonic/frames.py +25 -0
- pipecat/services/aws/nova_sonic/llm.py +1155 -0
- pipecat/services/aws/stt.py +1 -3
- pipecat/services/aws_nova_sonic/__init__.py +19 -1
- pipecat/services/aws_nova_sonic/aws.py +11 -1151
- pipecat/services/aws_nova_sonic/context.py +13 -355
- pipecat/services/aws_nova_sonic/frames.py +13 -17
- pipecat/services/azure/realtime/__init__.py +0 -0
- pipecat/services/azure/realtime/llm.py +65 -0
- pipecat/services/azure/stt.py +15 -0
- pipecat/services/cartesia/tts.py +2 -2
- pipecat/services/deepgram/__init__.py +1 -0
- pipecat/services/deepgram/flux/__init__.py +0 -0
- pipecat/services/deepgram/flux/stt.py +636 -0
- pipecat/services/elevenlabs/__init__.py +2 -1
- pipecat/services/elevenlabs/stt.py +254 -276
- pipecat/services/elevenlabs/tts.py +5 -5
- pipecat/services/fish/tts.py +2 -2
- pipecat/services/gemini_multimodal_live/events.py +38 -524
- pipecat/services/gemini_multimodal_live/file_api.py +23 -173
- pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
- pipecat/services/gladia/stt.py +56 -72
- pipecat/services/google/__init__.py +1 -0
- pipecat/services/google/gemini_live/__init__.py +3 -0
- pipecat/services/google/gemini_live/file_api.py +189 -0
- pipecat/services/google/gemini_live/llm.py +1582 -0
- pipecat/services/google/gemini_live/llm_vertex.py +184 -0
- pipecat/services/google/llm.py +15 -11
- pipecat/services/google/llm_openai.py +3 -3
- pipecat/services/google/llm_vertex.py +86 -16
- pipecat/services/google/tts.py +7 -3
- pipecat/services/heygen/api.py +2 -0
- pipecat/services/heygen/client.py +8 -4
- pipecat/services/heygen/video.py +2 -0
- pipecat/services/hume/__init__.py +5 -0
- pipecat/services/hume/tts.py +220 -0
- pipecat/services/inworld/tts.py +6 -6
- pipecat/services/llm_service.py +15 -5
- pipecat/services/lmnt/tts.py +2 -2
- pipecat/services/mcp_service.py +4 -2
- pipecat/services/mem0/memory.py +6 -5
- pipecat/services/mistral/llm.py +29 -8
- pipecat/services/moondream/vision.py +42 -16
- pipecat/services/neuphonic/tts.py +2 -2
- pipecat/services/openai/__init__.py +1 -0
- pipecat/services/openai/base_llm.py +27 -20
- pipecat/services/openai/realtime/__init__.py +0 -0
- pipecat/services/openai/realtime/context.py +272 -0
- pipecat/services/openai/realtime/events.py +1106 -0
- pipecat/services/openai/realtime/frames.py +37 -0
- pipecat/services/openai/realtime/llm.py +829 -0
- pipecat/services/openai/tts.py +16 -8
- pipecat/services/openai_realtime/__init__.py +27 -0
- pipecat/services/openai_realtime/azure.py +21 -0
- pipecat/services/openai_realtime/context.py +21 -0
- pipecat/services/openai_realtime/events.py +21 -0
- pipecat/services/openai_realtime/frames.py +21 -0
- pipecat/services/openai_realtime_beta/azure.py +16 -0
- pipecat/services/openai_realtime_beta/openai.py +17 -5
- pipecat/services/playht/tts.py +31 -4
- pipecat/services/rime/tts.py +3 -4
- pipecat/services/salesforce/__init__.py +9 -0
- pipecat/services/salesforce/llm.py +465 -0
- pipecat/services/sarvam/tts.py +2 -6
- pipecat/services/simli/video.py +2 -2
- pipecat/services/speechmatics/stt.py +1 -7
- pipecat/services/stt_service.py +34 -0
- pipecat/services/tavus/video.py +2 -2
- pipecat/services/tts_service.py +9 -9
- pipecat/services/vision_service.py +7 -6
- pipecat/tests/utils.py +4 -4
- pipecat/transcriptions/language.py +41 -1
- pipecat/transports/base_input.py +17 -42
- pipecat/transports/base_output.py +42 -26
- pipecat/transports/daily/transport.py +199 -26
- pipecat/transports/heygen/__init__.py +0 -0
- pipecat/transports/heygen/transport.py +381 -0
- pipecat/transports/livekit/transport.py +228 -63
- pipecat/transports/local/audio.py +6 -1
- pipecat/transports/local/tk.py +11 -2
- pipecat/transports/network/fastapi_websocket.py +1 -1
- pipecat/transports/smallwebrtc/connection.py +98 -19
- pipecat/transports/smallwebrtc/request_handler.py +204 -0
- pipecat/transports/smallwebrtc/transport.py +65 -23
- pipecat/transports/tavus/transport.py +23 -12
- pipecat/transports/websocket/client.py +41 -5
- pipecat/transports/websocket/fastapi.py +21 -11
- pipecat/transports/websocket/server.py +14 -7
- pipecat/transports/whatsapp/api.py +8 -0
- pipecat/transports/whatsapp/client.py +47 -0
- pipecat/utils/base_object.py +54 -22
- pipecat/utils/string.py +12 -1
- pipecat/utils/tracing/service_decorators.py +21 -21
- {dv_pipecat_ai-0.0.85.dev7.dist-info → dv_pipecat_ai-0.0.85.dev699.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.85.dev7.dist-info → dv_pipecat_ai-0.0.85.dev699.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.85.dev7.dist-info → dv_pipecat_ai-0.0.85.dev699.dist-info}/top_level.txt +0 -0
- /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
|
@@ -4,12 +4,19 @@
|
|
|
4
4
|
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
5
|
#
|
|
6
6
|
|
|
7
|
-
"""ElevenLabs speech-to-text service implementation.
|
|
7
|
+
"""ElevenLabs speech-to-text service implementation.
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
This module provides integration with ElevenLabs' Speech-to-Text API for transcription
|
|
10
|
+
using segmented audio processing. The service uploads audio files and receives
|
|
11
|
+
transcription results directly.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import io
|
|
10
15
|
from typing import AsyncGenerator, Optional
|
|
11
16
|
|
|
17
|
+
import aiohttp
|
|
12
18
|
from loguru import logger
|
|
19
|
+
from pydantic import BaseModel
|
|
13
20
|
|
|
14
21
|
from pipecat.frames.frames import ErrorFrame, Frame, TranscriptionFrame
|
|
15
22
|
from pipecat.services.stt_service import SegmentedSTTService
|
|
@@ -17,345 +24,316 @@ from pipecat.transcriptions.language import Language
|
|
|
17
24
|
from pipecat.utils.time import time_now_iso8601
|
|
18
25
|
from pipecat.utils.tracing.service_decorators import traced_stt
|
|
19
26
|
|
|
20
|
-
try:
|
|
21
|
-
from elevenlabs.client import ElevenLabs
|
|
22
|
-
except ModuleNotFoundError as e:
|
|
23
|
-
logger.error(f"Exception: {e}")
|
|
24
|
-
logger.error("In order to use ElevenLabs, you need to `pip install pipecat-ai[elevenlabs]`.")
|
|
25
|
-
raise Exception(f"Missing module: {e}")
|
|
26
|
-
|
|
27
27
|
|
|
28
28
|
def language_to_elevenlabs_language(language: Language) -> Optional[str]:
|
|
29
|
-
"""
|
|
29
|
+
"""Convert a Language enum to ElevenLabs language code.
|
|
30
|
+
|
|
31
|
+
Source:
|
|
32
|
+
https://elevenlabs.io/docs/capabilities/speech-to-text
|
|
30
33
|
|
|
31
34
|
Args:
|
|
32
|
-
language:
|
|
35
|
+
language: The Language enum value to convert.
|
|
33
36
|
|
|
34
37
|
Returns:
|
|
35
|
-
|
|
38
|
+
The corresponding ElevenLabs language code, or None if not supported.
|
|
36
39
|
"""
|
|
37
|
-
|
|
38
|
-
#
|
|
39
|
-
Language.
|
|
40
|
-
Language.
|
|
41
|
-
Language.
|
|
42
|
-
Language.
|
|
43
|
-
Language.
|
|
44
|
-
Language.
|
|
45
|
-
Language.
|
|
46
|
-
Language.
|
|
47
|
-
Language.
|
|
48
|
-
Language.
|
|
49
|
-
Language.
|
|
50
|
-
Language.
|
|
51
|
-
Language.
|
|
52
|
-
Language.
|
|
53
|
-
Language.
|
|
54
|
-
#
|
|
55
|
-
Language.
|
|
56
|
-
Language.
|
|
57
|
-
Language.
|
|
58
|
-
Language.
|
|
59
|
-
Language.
|
|
60
|
-
Language.
|
|
61
|
-
Language.
|
|
62
|
-
Language.
|
|
63
|
-
Language.
|
|
64
|
-
Language.
|
|
65
|
-
Language.
|
|
66
|
-
Language.
|
|
67
|
-
Language.
|
|
68
|
-
Language.
|
|
69
|
-
Language.
|
|
70
|
-
Language.
|
|
71
|
-
Language.
|
|
72
|
-
Language.
|
|
73
|
-
Language.
|
|
74
|
-
Language.
|
|
75
|
-
Language.
|
|
76
|
-
Language.
|
|
77
|
-
Language.
|
|
78
|
-
#
|
|
79
|
-
Language.
|
|
80
|
-
Language.
|
|
81
|
-
Language.
|
|
82
|
-
Language.FR_BE: "fra",
|
|
83
|
-
Language.FR_CH: "fra",
|
|
84
|
-
# German
|
|
85
|
-
Language.DE: "deu",
|
|
86
|
-
Language.DE_DE: "deu",
|
|
87
|
-
Language.DE_AT: "deu",
|
|
88
|
-
Language.DE_CH: "deu",
|
|
89
|
-
# Italian
|
|
90
|
-
Language.IT: "ita",
|
|
91
|
-
Language.IT_IT: "ita",
|
|
92
|
-
# Portuguese
|
|
93
|
-
Language.PT: "por",
|
|
94
|
-
Language.PT_PT: "por",
|
|
95
|
-
Language.PT_BR: "por",
|
|
96
|
-
# Hindi
|
|
97
|
-
Language.HI: "hin",
|
|
98
|
-
Language.HI_IN: "hin",
|
|
99
|
-
# Arabic
|
|
100
|
-
Language.AR: "ara",
|
|
101
|
-
Language.AR_SA: "ara",
|
|
102
|
-
Language.AR_EG: "ara",
|
|
103
|
-
Language.AR_AE: "ara",
|
|
104
|
-
Language.AR_BH: "ara",
|
|
105
|
-
Language.AR_DZ: "ara",
|
|
106
|
-
Language.AR_IQ: "ara",
|
|
107
|
-
Language.AR_JO: "ara",
|
|
108
|
-
Language.AR_KW: "ara",
|
|
109
|
-
Language.AR_LB: "ara",
|
|
110
|
-
Language.AR_LY: "ara",
|
|
111
|
-
Language.AR_MA: "ara",
|
|
112
|
-
Language.AR_OM: "ara",
|
|
113
|
-
Language.AR_QA: "ara",
|
|
114
|
-
Language.AR_SY: "ara",
|
|
115
|
-
Language.AR_TN: "ara",
|
|
116
|
-
Language.AR_YE: "ara",
|
|
117
|
-
# Japanese
|
|
118
|
-
Language.JA: "jpn",
|
|
119
|
-
Language.JA_JP: "jpn",
|
|
120
|
-
# Korean
|
|
121
|
-
Language.KO: "kor",
|
|
122
|
-
Language.KO_KR: "kor",
|
|
123
|
-
# Chinese
|
|
124
|
-
Language.ZH: "cmn",
|
|
125
|
-
Language.ZH_CN: "cmn",
|
|
126
|
-
Language.ZH_TW: "cmn",
|
|
127
|
-
Language.ZH_HK: "cmn",
|
|
128
|
-
# Russian
|
|
129
|
-
Language.RU: "rus",
|
|
130
|
-
Language.RU_RU: "rus",
|
|
131
|
-
# Dutch
|
|
132
|
-
Language.NL: "nld",
|
|
133
|
-
Language.NL_NL: "nld",
|
|
134
|
-
Language.NL_BE: "nld",
|
|
135
|
-
# Polish
|
|
136
|
-
Language.PL: "pol",
|
|
137
|
-
Language.PL_PL: "pol",
|
|
138
|
-
# Turkish
|
|
139
|
-
Language.TR: "tur",
|
|
140
|
-
Language.TR_TR: "tur",
|
|
141
|
-
# Swedish
|
|
142
|
-
Language.SV: "swe",
|
|
143
|
-
Language.SV_SE: "swe",
|
|
144
|
-
# Norwegian
|
|
145
|
-
Language.NO: "nor",
|
|
146
|
-
Language.NB: "nor",
|
|
147
|
-
Language.NN: "nor",
|
|
148
|
-
# Danish
|
|
149
|
-
Language.DA: "dan",
|
|
150
|
-
Language.DA_DK: "dan",
|
|
151
|
-
# Finnish
|
|
152
|
-
Language.FI: "fin",
|
|
153
|
-
Language.FI_FI: "fin",
|
|
154
|
-
# Czech
|
|
155
|
-
Language.CS: "ces",
|
|
156
|
-
Language.CS_CZ: "ces",
|
|
157
|
-
# Hungarian
|
|
158
|
-
Language.HU: "hun",
|
|
159
|
-
Language.HU_HU: "hun",
|
|
160
|
-
# Greek
|
|
161
|
-
Language.EL: "ell",
|
|
162
|
-
Language.EL_GR: "ell",
|
|
163
|
-
# Hebrew
|
|
164
|
-
Language.HE: "heb",
|
|
165
|
-
Language.HE_IL: "heb",
|
|
166
|
-
# Thai
|
|
167
|
-
Language.TH: "tha",
|
|
168
|
-
Language.TH_TH: "tha",
|
|
169
|
-
# Vietnamese
|
|
170
|
-
Language.VI: "vie",
|
|
171
|
-
Language.VI_VN: "vie",
|
|
172
|
-
# Indonesian
|
|
173
|
-
Language.ID: "ind",
|
|
174
|
-
Language.ID_ID: "ind",
|
|
175
|
-
# Malay
|
|
176
|
-
Language.MS: "msa",
|
|
177
|
-
Language.MS_MY: "msa",
|
|
178
|
-
# Ukrainian
|
|
179
|
-
Language.UK: "ukr",
|
|
180
|
-
Language.UK_UA: "ukr",
|
|
181
|
-
# Bulgarian
|
|
182
|
-
Language.BG: "bul",
|
|
183
|
-
Language.BG_BG: "bul",
|
|
184
|
-
# Croatian
|
|
185
|
-
Language.HR: "hrv",
|
|
186
|
-
Language.HR_HR: "hrv",
|
|
187
|
-
# Slovak
|
|
188
|
-
Language.SK: "slk",
|
|
189
|
-
Language.SK_SK: "slk",
|
|
190
|
-
# Slovenian
|
|
191
|
-
Language.SL: "slv",
|
|
192
|
-
Language.SL_SI: "slv",
|
|
193
|
-
# Estonian
|
|
194
|
-
Language.ET: "est",
|
|
195
|
-
Language.ET_EE: "est",
|
|
196
|
-
# Latvian
|
|
197
|
-
Language.LV: "lav",
|
|
198
|
-
Language.LV_LV: "lav",
|
|
199
|
-
# Lithuanian
|
|
200
|
-
Language.LT: "lit",
|
|
201
|
-
Language.LT_LT: "lit",
|
|
202
|
-
Language.TA: "tam", # Tamil
|
|
203
|
-
Language.TA_IN: "tam", # Tamil
|
|
204
|
-
Language.TE: "tel", # Telugu
|
|
205
|
-
Language.TE_IN: "tel", # Telugu
|
|
40
|
+
BASE_LANGUAGES = {
|
|
41
|
+
Language.AF: "afr", # Afrikaans
|
|
42
|
+
Language.AM: "amh", # Amharic
|
|
43
|
+
Language.AR: "ara", # Arabic
|
|
44
|
+
Language.HY: "hye", # Armenian
|
|
45
|
+
Language.AS: "asm", # Assamese
|
|
46
|
+
Language.AST: "ast", # Asturian
|
|
47
|
+
Language.AZ: "aze", # Azerbaijani
|
|
48
|
+
Language.BE: "bel", # Belarusian
|
|
49
|
+
Language.BN: "ben", # Bengali
|
|
50
|
+
Language.BS: "bos", # Bosnian
|
|
51
|
+
Language.BG: "bul", # Bulgarian
|
|
52
|
+
Language.MY: "mya", # Burmese
|
|
53
|
+
Language.YUE: "yue", # Cantonese
|
|
54
|
+
Language.CA: "cat", # Catalan
|
|
55
|
+
Language.CEB: "ceb", # Cebuano
|
|
56
|
+
Language.NY: "nya", # Chichewa
|
|
57
|
+
Language.HR: "hrv", # Croatian
|
|
58
|
+
Language.CS: "ces", # Czech
|
|
59
|
+
Language.DA: "dan", # Danish
|
|
60
|
+
Language.NL: "nld", # Dutch
|
|
61
|
+
Language.EN: "eng", # English
|
|
62
|
+
Language.ET: "est", # Estonian
|
|
63
|
+
Language.FIL: "fil", # Filipino
|
|
64
|
+
Language.FI: "fin", # Finnish
|
|
65
|
+
Language.FR: "fra", # French
|
|
66
|
+
Language.FF: "ful", # Fulah
|
|
67
|
+
Language.GL: "glg", # Galician
|
|
68
|
+
Language.LG: "lug", # Ganda
|
|
69
|
+
Language.KA: "kat", # Georgian
|
|
70
|
+
Language.DE: "deu", # German
|
|
71
|
+
Language.EL: "ell", # Greek
|
|
72
|
+
Language.GU: "guj", # Gujarati
|
|
73
|
+
Language.HA: "hau", # Hausa
|
|
74
|
+
Language.HE: "heb", # Hebrew
|
|
75
|
+
Language.HI: "hin", # Hindi
|
|
76
|
+
Language.HU: "hun", # Hungarian
|
|
77
|
+
Language.IS: "isl", # Icelandic
|
|
78
|
+
Language.IG: "ibo", # Igbo
|
|
79
|
+
Language.ID: "ind", # Indonesian
|
|
80
|
+
Language.GA: "gle", # Irish
|
|
81
|
+
Language.IT: "ita", # Italian
|
|
82
|
+
Language.JA: "jpn", # Japanese
|
|
83
|
+
Language.JV: "jav", # Javanese
|
|
84
|
+
Language.KEA: "kea", # Kabuverdianu
|
|
206
85
|
Language.KN: "kan", # Kannada
|
|
207
|
-
Language.
|
|
86
|
+
Language.KK: "kaz", # Kazakh
|
|
87
|
+
Language.KM: "khm", # Khmer
|
|
88
|
+
Language.KO: "kor", # Korean
|
|
89
|
+
Language.KU: "kur", # Kurdish
|
|
90
|
+
Language.KY: "kir", # Kyrgyz
|
|
91
|
+
Language.LO: "lao", # Lao
|
|
92
|
+
Language.LV: "lav", # Latvian
|
|
93
|
+
Language.LN: "lin", # Lingala
|
|
94
|
+
Language.LT: "lit", # Lithuanian
|
|
95
|
+
Language.LUO: "luo", # Luo
|
|
96
|
+
Language.LB: "ltz", # Luxembourgish
|
|
97
|
+
Language.MK: "mkd", # Macedonian
|
|
98
|
+
Language.MS: "msa", # Malay
|
|
208
99
|
Language.ML: "mal", # Malayalam
|
|
209
|
-
Language.
|
|
100
|
+
Language.MT: "mlt", # Maltese
|
|
101
|
+
Language.ZH: "zho", # Mandarin Chinese
|
|
102
|
+
Language.MI: "mri", # Māori
|
|
210
103
|
Language.MR: "mar", # Marathi
|
|
211
|
-
Language.
|
|
104
|
+
Language.MN: "mon", # Mongolian
|
|
105
|
+
Language.NE: "nep", # Nepali
|
|
106
|
+
Language.NSO: "nso", # Northern Sotho
|
|
107
|
+
Language.NO: "nor", # Norwegian
|
|
108
|
+
Language.OC: "oci", # Occitan
|
|
109
|
+
Language.OR: "ori", # Odia
|
|
110
|
+
Language.PS: "pus", # Pashto
|
|
111
|
+
Language.FA: "fas", # Persian
|
|
112
|
+
Language.PL: "pol", # Polish
|
|
113
|
+
Language.PT: "por", # Portuguese
|
|
114
|
+
Language.PA: "pan", # Punjabi
|
|
115
|
+
Language.RO: "ron", # Romanian
|
|
116
|
+
Language.RU: "rus", # Russian
|
|
117
|
+
Language.SR: "srp", # Serbian
|
|
118
|
+
Language.SN: "sna", # Shona
|
|
119
|
+
Language.SD: "snd", # Sindhi
|
|
120
|
+
Language.SK: "slk", # Slovak
|
|
121
|
+
Language.SL: "slv", # Slovenian
|
|
122
|
+
Language.SO: "som", # Somali
|
|
123
|
+
Language.ES: "spa", # Spanish
|
|
124
|
+
Language.SW: "swa", # Swahili
|
|
125
|
+
Language.SV: "swe", # Swedish
|
|
126
|
+
Language.TA: "tam", # Tamil
|
|
127
|
+
Language.TG: "tgk", # Tajik
|
|
128
|
+
Language.TE: "tel", # Telugu
|
|
129
|
+
Language.TH: "tha", # Thai
|
|
130
|
+
Language.TR: "tur", # Turkish
|
|
131
|
+
Language.UK: "ukr", # Ukrainian
|
|
132
|
+
Language.UMB: "umb", # Umbundu
|
|
133
|
+
Language.UR: "urd", # Urdu
|
|
134
|
+
Language.UZ: "uzb", # Uzbek
|
|
135
|
+
Language.VI: "vie", # Vietnamese
|
|
136
|
+
Language.CY: "cym", # Welsh
|
|
137
|
+
Language.WO: "wol", # Wolof
|
|
138
|
+
Language.XH: "xho", # Xhosa
|
|
139
|
+
Language.ZU: "zul", # Zulu
|
|
212
140
|
}
|
|
213
|
-
return language_map.get(language)
|
|
214
141
|
|
|
142
|
+
result = BASE_LANGUAGES.get(language)
|
|
215
143
|
|
|
216
|
-
|
|
217
|
-
|
|
144
|
+
# If not found in base languages, try to find the base language from a variant
|
|
145
|
+
if not result:
|
|
146
|
+
lang_str = str(language.value)
|
|
147
|
+
base_code = lang_str.split("-")[0].lower()
|
|
148
|
+
result = base_code if base_code in BASE_LANGUAGES.values() else None
|
|
218
149
|
|
|
219
|
-
|
|
220
|
-
It extends SegmentedSTTService to handle VAD-based audio segmentation.
|
|
150
|
+
return result
|
|
221
151
|
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
152
|
+
|
|
153
|
+
class ElevenLabsSTTService(SegmentedSTTService):
|
|
154
|
+
"""Speech-to-text service using ElevenLabs' file-based API.
|
|
155
|
+
|
|
156
|
+
This service uses ElevenLabs' Speech-to-Text API to perform transcription on audio
|
|
157
|
+
segments. It inherits from SegmentedSTTService to handle audio buffering and speech detection.
|
|
158
|
+
The service uploads audio files to ElevenLabs and receives transcription results directly.
|
|
229
159
|
"""
|
|
230
160
|
|
|
161
|
+
class InputParams(BaseModel):
|
|
162
|
+
"""Configuration parameters for ElevenLabs STT API.
|
|
163
|
+
|
|
164
|
+
Parameters:
|
|
165
|
+
language: Target language for transcription.
|
|
166
|
+
tag_audio_events: Whether to include audio events such as (laughter) or (coughing) in the transcription.
|
|
167
|
+
"""
|
|
168
|
+
|
|
169
|
+
language: Optional[Language] = None
|
|
170
|
+
tag_audio_events: bool = True
|
|
171
|
+
|
|
231
172
|
def __init__(
|
|
232
173
|
self,
|
|
233
174
|
*,
|
|
234
175
|
api_key: str,
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
176
|
+
aiohttp_session: aiohttp.ClientSession,
|
|
177
|
+
base_url: str = "https://api.elevenlabs.io",
|
|
178
|
+
model: str = "scribe_v1",
|
|
238
179
|
sample_rate: Optional[int] = None,
|
|
239
|
-
|
|
180
|
+
params: Optional[InputParams] = None,
|
|
240
181
|
**kwargs,
|
|
241
182
|
):
|
|
242
|
-
|
|
183
|
+
"""Initialize the ElevenLabs STT service.
|
|
184
|
+
|
|
185
|
+
Args:
|
|
186
|
+
api_key: ElevenLabs API key for authentication.
|
|
187
|
+
aiohttp_session: aiohttp ClientSession for HTTP requests.
|
|
188
|
+
base_url: Base URL for ElevenLabs API.
|
|
189
|
+
model: Model ID for transcription. Defaults to "scribe_v1".
|
|
190
|
+
sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
|
|
191
|
+
params: Configuration parameters for the STT service.
|
|
192
|
+
**kwargs: Additional arguments passed to SegmentedSTTService.
|
|
193
|
+
"""
|
|
194
|
+
super().__init__(
|
|
195
|
+
sample_rate=sample_rate,
|
|
196
|
+
**kwargs,
|
|
197
|
+
)
|
|
243
198
|
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
self.
|
|
247
|
-
self.
|
|
199
|
+
params = params or ElevenLabsSTTService.InputParams()
|
|
200
|
+
|
|
201
|
+
self._api_key = api_key
|
|
202
|
+
self._base_url = base_url
|
|
203
|
+
self._session = aiohttp_session
|
|
204
|
+
self._model_id = model
|
|
205
|
+
self._tag_audio_events = params.tag_audio_events
|
|
248
206
|
|
|
249
207
|
self._settings = {
|
|
250
|
-
"language": language
|
|
251
|
-
|
|
252
|
-
"
|
|
253
|
-
"diarize": self._diarize,
|
|
208
|
+
"language": self.language_to_service_language(params.language)
|
|
209
|
+
if params.language
|
|
210
|
+
else "eng",
|
|
254
211
|
}
|
|
255
|
-
self.set_model_name(model_id)
|
|
256
212
|
|
|
257
213
|
def can_generate_metrics(self) -> bool:
|
|
258
|
-
"""Check if
|
|
214
|
+
"""Check if the service can generate processing metrics.
|
|
259
215
|
|
|
260
216
|
Returns:
|
|
261
|
-
True, as ElevenLabs service supports metrics generation.
|
|
217
|
+
True, as ElevenLabs STT service supports metrics generation.
|
|
262
218
|
"""
|
|
263
219
|
return True
|
|
264
220
|
|
|
265
221
|
def language_to_service_language(self, language: Language) -> Optional[str]:
|
|
266
|
-
"""Convert
|
|
222
|
+
"""Convert a Language enum to ElevenLabs service-specific language code.
|
|
267
223
|
|
|
268
224
|
Args:
|
|
269
|
-
language: The
|
|
225
|
+
language: The language to convert.
|
|
270
226
|
|
|
271
227
|
Returns:
|
|
272
|
-
|
|
228
|
+
The ElevenLabs-specific language code, or None if not supported.
|
|
273
229
|
"""
|
|
274
230
|
return language_to_elevenlabs_language(language)
|
|
275
231
|
|
|
276
232
|
async def set_language(self, language: Language):
|
|
277
|
-
"""Set the language
|
|
233
|
+
"""Set the transcription language.
|
|
278
234
|
|
|
279
235
|
Args:
|
|
280
|
-
language: The
|
|
236
|
+
language: The language to use for speech-to-text transcription.
|
|
281
237
|
"""
|
|
282
238
|
self.logger.info(f"Switching STT language to: [{language}]")
|
|
283
|
-
self._settings["language"] = language
|
|
239
|
+
self._settings["language"] = self.language_to_service_language(language)
|
|
240
|
+
|
|
241
|
+
async def set_model(self, model: str):
|
|
242
|
+
"""Set the STT model.
|
|
243
|
+
|
|
244
|
+
Args:
|
|
245
|
+
model: The model name to use for transcription.
|
|
246
|
+
|
|
247
|
+
Note:
|
|
248
|
+
ElevenLabs STT API does not currently support model selection.
|
|
249
|
+
This method is provided for interface compatibility.
|
|
250
|
+
"""
|
|
251
|
+
await super().set_model(model)
|
|
252
|
+
self.logger.info(f"Model setting [{model}] noted, but ElevenLabs STT uses default model")
|
|
253
|
+
|
|
254
|
+
async def _transcribe_audio(self, audio_data: bytes) -> dict:
|
|
255
|
+
"""Upload audio data to ElevenLabs and get transcription result.
|
|
256
|
+
|
|
257
|
+
Args:
|
|
258
|
+
audio_data: Raw audio bytes in WAV format.
|
|
259
|
+
|
|
260
|
+
Returns:
|
|
261
|
+
The transcription result data.
|
|
262
|
+
|
|
263
|
+
Raises:
|
|
264
|
+
Exception: If transcription fails or returns an error.
|
|
265
|
+
"""
|
|
266
|
+
url = f"{self._base_url}/v1/speech-to-text"
|
|
267
|
+
headers = {"xi-api-key": self._api_key}
|
|
268
|
+
|
|
269
|
+
# Create form data with the audio file
|
|
270
|
+
data = aiohttp.FormData()
|
|
271
|
+
data.add_field(
|
|
272
|
+
"file",
|
|
273
|
+
io.BytesIO(audio_data),
|
|
274
|
+
filename="audio.wav",
|
|
275
|
+
content_type="audio/x-wav",
|
|
276
|
+
)
|
|
277
|
+
|
|
278
|
+
# Add required model_id, language_code, and tag_audio_events
|
|
279
|
+
data.add_field("model_id", self._model_id)
|
|
280
|
+
data.add_field("language_code", self._settings["language"])
|
|
281
|
+
data.add_field("tag_audio_events", str(self._tag_audio_events).lower())
|
|
282
|
+
|
|
283
|
+
async with self._session.post(url, data=data, headers=headers) as response:
|
|
284
|
+
if response.status != 200:
|
|
285
|
+
error_text = await response.text()
|
|
286
|
+
self.logger.error(f"ElevenLabs transcription error: {error_text}")
|
|
287
|
+
raise Exception(f"Transcription failed with status {response.status}: {error_text}")
|
|
288
|
+
|
|
289
|
+
result = await response.json()
|
|
290
|
+
return result
|
|
284
291
|
|
|
285
292
|
@traced_stt
|
|
286
293
|
async def _handle_transcription(
|
|
287
|
-
self, transcript: str, is_final: bool, language: Optional[
|
|
294
|
+
self, transcript: str, is_final: bool, language: Optional[str] = None
|
|
288
295
|
):
|
|
289
296
|
"""Handle a transcription result with tracing."""
|
|
290
|
-
|
|
297
|
+
await self.stop_ttfb_metrics()
|
|
298
|
+
await self.stop_processing_metrics()
|
|
291
299
|
|
|
292
300
|
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
|
|
293
|
-
"""Transcribe
|
|
301
|
+
"""Transcribe an audio segment using ElevenLabs' STT API.
|
|
294
302
|
|
|
295
303
|
Args:
|
|
296
|
-
audio:
|
|
304
|
+
audio: Raw audio bytes in WAV format (already converted by base class).
|
|
297
305
|
|
|
298
306
|
Yields:
|
|
299
|
-
Frame: TranscriptionFrame containing the transcribed text or ErrorFrame on failure.
|
|
307
|
+
Frame: TranscriptionFrame containing the transcribed text, or ErrorFrame on failure.
|
|
308
|
+
|
|
309
|
+
Note:
|
|
310
|
+
The audio is already in WAV format from the SegmentedSTTService.
|
|
311
|
+
Only non-empty transcriptions are yielded.
|
|
300
312
|
"""
|
|
301
313
|
try:
|
|
302
314
|
await self.start_processing_metrics()
|
|
303
315
|
await self.start_ttfb_metrics()
|
|
304
316
|
|
|
305
|
-
#
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
await self.stop_ttfb_metrics()
|
|
325
|
-
|
|
326
|
-
# Process transcription result
|
|
327
|
-
if transcription and hasattr(transcription, "text") and transcription.text:
|
|
328
|
-
transcript_text = transcription.text.strip()
|
|
329
|
-
|
|
330
|
-
if transcript_text:
|
|
331
|
-
# Determine language if available from response
|
|
332
|
-
response_language = language
|
|
333
|
-
if hasattr(transcription, "language_code") and transcription.language_code:
|
|
334
|
-
# Try to map back from ElevenLabs language code to pipecat Language
|
|
335
|
-
try:
|
|
336
|
-
# This is a simplified mapping - you might want to create a reverse map
|
|
337
|
-
response_language = language # For now, keep the original
|
|
338
|
-
except ValueError:
|
|
339
|
-
self.logger.warning(
|
|
340
|
-
f"Unknown language detected: {transcription.language_code}"
|
|
341
|
-
)
|
|
342
|
-
|
|
343
|
-
# Handle transcription with tracing
|
|
344
|
-
await self._handle_transcription(transcript_text, True, response_language)
|
|
345
|
-
|
|
346
|
-
self.logger.debug(f"ElevenLabs transcription: [{transcript_text}]")
|
|
347
|
-
|
|
348
|
-
yield TranscriptionFrame(
|
|
349
|
-
text=transcript_text,
|
|
350
|
-
user_id="",
|
|
351
|
-
timestamp=time_now_iso8601(),
|
|
352
|
-
language=response_language,
|
|
353
|
-
result=transcription,
|
|
354
|
-
)
|
|
355
|
-
|
|
356
|
-
await self.stop_processing_metrics()
|
|
317
|
+
# Upload audio and get transcription result directly
|
|
318
|
+
result = await self._transcribe_audio(audio)
|
|
319
|
+
|
|
320
|
+
# Extract transcription text
|
|
321
|
+
text = result.get("text", "").strip()
|
|
322
|
+
if text:
|
|
323
|
+
# Use the language_code returned by the API
|
|
324
|
+
detected_language = result.get("language_code", "eng")
|
|
325
|
+
|
|
326
|
+
await self._handle_transcription(text, True, detected_language)
|
|
327
|
+
self.logger.debug(f"Transcription: [{text}]")
|
|
328
|
+
|
|
329
|
+
yield TranscriptionFrame(
|
|
330
|
+
text,
|
|
331
|
+
self._user_id,
|
|
332
|
+
time_now_iso8601(),
|
|
333
|
+
detected_language,
|
|
334
|
+
result=result,
|
|
335
|
+
)
|
|
357
336
|
|
|
358
337
|
except Exception as e:
|
|
359
338
|
self.logger.error(f"ElevenLabs STT error: {e}")
|
|
360
|
-
await self.stop_all_metrics()
|
|
361
339
|
yield ErrorFrame(f"ElevenLabs STT error: {str(e)}")
|
|
@@ -25,9 +25,9 @@ from pipecat.frames.frames import (
|
|
|
25
25
|
EndFrame,
|
|
26
26
|
ErrorFrame,
|
|
27
27
|
Frame,
|
|
28
|
+
InterruptionFrame,
|
|
28
29
|
LLMFullResponseEndFrame,
|
|
29
30
|
StartFrame,
|
|
30
|
-
StartInterruptionFrame,
|
|
31
31
|
TTSAudioRawFrame,
|
|
32
32
|
TTSStartedFrame,
|
|
33
33
|
TTSStoppedFrame,
|
|
@@ -465,7 +465,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
|
|
465
465
|
direction: The direction to push the frame.
|
|
466
466
|
"""
|
|
467
467
|
await super().push_frame(frame, direction)
|
|
468
|
-
if isinstance(frame, (TTSStoppedFrame,
|
|
468
|
+
if isinstance(frame, (TTSStoppedFrame, InterruptionFrame)):
|
|
469
469
|
self._started = False
|
|
470
470
|
if isinstance(frame, TTSStoppedFrame):
|
|
471
471
|
await self.add_word_timestamps([("Reset", 0)])
|
|
@@ -550,7 +550,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
|
|
550
550
|
return self._websocket
|
|
551
551
|
raise Exception("Websocket not connected")
|
|
552
552
|
|
|
553
|
-
async def _handle_interruption(self, frame:
|
|
553
|
+
async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
|
|
554
554
|
"""Handle interruption by closing the current context."""
|
|
555
555
|
await super()._handle_interruption(frame, direction)
|
|
556
556
|
|
|
@@ -559,7 +559,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
|
|
|
559
559
|
logger.trace(f"Closing context {self._context_id} due to interruption")
|
|
560
560
|
try:
|
|
561
561
|
# ElevenLabs requires that Pipecat manages the contexts and closes them
|
|
562
|
-
# when they're no longer in use. Since
|
|
562
|
+
# when they're no longer in use. Since an InterruptionFrame is pushed
|
|
563
563
|
# every time the user speaks, we'll use this as a trigger to close the context
|
|
564
564
|
# and reset the state.
|
|
565
565
|
# Note: We do not need to call remove_audio_context here, as the context is
|
|
@@ -858,7 +858,7 @@ class ElevenLabsHttpTTSService(WordTTSService):
|
|
|
858
858
|
direction: The direction to push the frame.
|
|
859
859
|
"""
|
|
860
860
|
await super().push_frame(frame, direction)
|
|
861
|
-
if isinstance(frame, (
|
|
861
|
+
if isinstance(frame, (InterruptionFrame, TTSStoppedFrame)):
|
|
862
862
|
# Reset timing on interruption or stop
|
|
863
863
|
self._reset_state()
|
|
864
864
|
|