dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
- pipecat/adapters/base_llm_adapter.py +38 -1
- pipecat/adapters/services/anthropic_adapter.py +9 -14
- pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
- pipecat/adapters/services/bedrock_adapter.py +236 -13
- pipecat/adapters/services/gemini_adapter.py +12 -8
- pipecat/adapters/services/open_ai_adapter.py +19 -7
- pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
- pipecat/audio/dtmf/dtmf-0.wav +0 -0
- pipecat/audio/dtmf/dtmf-1.wav +0 -0
- pipecat/audio/dtmf/dtmf-2.wav +0 -0
- pipecat/audio/dtmf/dtmf-3.wav +0 -0
- pipecat/audio/dtmf/dtmf-4.wav +0 -0
- pipecat/audio/dtmf/dtmf-5.wav +0 -0
- pipecat/audio/dtmf/dtmf-6.wav +0 -0
- pipecat/audio/dtmf/dtmf-7.wav +0 -0
- pipecat/audio/dtmf/dtmf-8.wav +0 -0
- pipecat/audio/dtmf/dtmf-9.wav +0 -0
- pipecat/audio/dtmf/dtmf-pound.wav +0 -0
- pipecat/audio/dtmf/dtmf-star.wav +0 -0
- pipecat/audio/filters/krisp_viva_filter.py +193 -0
- pipecat/audio/filters/noisereduce_filter.py +15 -0
- pipecat/audio/turn/base_turn_analyzer.py +9 -1
- pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
- pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
- pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
- pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
- pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
- pipecat/audio/vad/data/README.md +10 -0
- pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
- pipecat/audio/vad/silero.py +9 -3
- pipecat/audio/vad/vad_analyzer.py +13 -1
- pipecat/extensions/voicemail/voicemail_detector.py +5 -5
- pipecat/frames/frames.py +277 -86
- pipecat/observers/loggers/debug_log_observer.py +3 -3
- pipecat/observers/loggers/llm_log_observer.py +7 -3
- pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
- pipecat/pipeline/runner.py +18 -6
- pipecat/pipeline/service_switcher.py +64 -36
- pipecat/pipeline/task.py +125 -79
- pipecat/pipeline/tts_switcher.py +30 -0
- pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
- pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
- pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
- pipecat/processors/aggregators/llm_context.py +40 -2
- pipecat/processors/aggregators/llm_response.py +32 -15
- pipecat/processors/aggregators/llm_response_universal.py +19 -15
- pipecat/processors/aggregators/user_response.py +6 -6
- pipecat/processors/aggregators/vision_image_frame.py +24 -2
- pipecat/processors/audio/audio_buffer_processor.py +43 -8
- pipecat/processors/dtmf_aggregator.py +174 -77
- pipecat/processors/filters/stt_mute_filter.py +17 -0
- pipecat/processors/frame_processor.py +110 -24
- pipecat/processors/frameworks/langchain.py +8 -2
- pipecat/processors/frameworks/rtvi.py +210 -68
- pipecat/processors/frameworks/strands_agents.py +170 -0
- pipecat/processors/logger.py +2 -2
- pipecat/processors/transcript_processor.py +26 -5
- pipecat/processors/user_idle_processor.py +35 -11
- pipecat/runner/daily.py +59 -20
- pipecat/runner/run.py +395 -93
- pipecat/runner/types.py +6 -4
- pipecat/runner/utils.py +51 -10
- pipecat/serializers/__init__.py +5 -1
- pipecat/serializers/asterisk.py +16 -2
- pipecat/serializers/convox.py +41 -4
- pipecat/serializers/custom.py +257 -0
- pipecat/serializers/exotel.py +5 -5
- pipecat/serializers/livekit.py +20 -0
- pipecat/serializers/plivo.py +5 -5
- pipecat/serializers/protobuf.py +6 -5
- pipecat/serializers/telnyx.py +2 -2
- pipecat/serializers/twilio.py +43 -23
- pipecat/serializers/vi.py +324 -0
- pipecat/services/ai_service.py +2 -6
- pipecat/services/anthropic/llm.py +2 -25
- pipecat/services/assemblyai/models.py +6 -0
- pipecat/services/assemblyai/stt.py +13 -5
- pipecat/services/asyncai/tts.py +5 -3
- pipecat/services/aws/__init__.py +1 -0
- pipecat/services/aws/llm.py +147 -105
- pipecat/services/aws/nova_sonic/__init__.py +0 -0
- pipecat/services/aws/nova_sonic/context.py +436 -0
- pipecat/services/aws/nova_sonic/frames.py +25 -0
- pipecat/services/aws/nova_sonic/llm.py +1265 -0
- pipecat/services/aws/stt.py +3 -3
- pipecat/services/aws_nova_sonic/__init__.py +19 -1
- pipecat/services/aws_nova_sonic/aws.py +11 -1151
- pipecat/services/aws_nova_sonic/context.py +8 -354
- pipecat/services/aws_nova_sonic/frames.py +13 -17
- pipecat/services/azure/llm.py +51 -1
- pipecat/services/azure/realtime/__init__.py +0 -0
- pipecat/services/azure/realtime/llm.py +65 -0
- pipecat/services/azure/stt.py +15 -0
- pipecat/services/cartesia/stt.py +77 -70
- pipecat/services/cartesia/tts.py +80 -13
- pipecat/services/deepgram/__init__.py +1 -0
- pipecat/services/deepgram/flux/__init__.py +0 -0
- pipecat/services/deepgram/flux/stt.py +640 -0
- pipecat/services/elevenlabs/__init__.py +4 -1
- pipecat/services/elevenlabs/stt.py +339 -0
- pipecat/services/elevenlabs/tts.py +87 -46
- pipecat/services/fish/tts.py +5 -2
- pipecat/services/gemini_multimodal_live/events.py +38 -524
- pipecat/services/gemini_multimodal_live/file_api.py +23 -173
- pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
- pipecat/services/gladia/stt.py +56 -72
- pipecat/services/google/__init__.py +1 -0
- pipecat/services/google/gemini_live/__init__.py +3 -0
- pipecat/services/google/gemini_live/file_api.py +189 -0
- pipecat/services/google/gemini_live/llm.py +1582 -0
- pipecat/services/google/gemini_live/llm_vertex.py +184 -0
- pipecat/services/google/llm.py +15 -11
- pipecat/services/google/llm_openai.py +3 -3
- pipecat/services/google/llm_vertex.py +86 -16
- pipecat/services/google/stt.py +4 -0
- pipecat/services/google/tts.py +7 -3
- pipecat/services/heygen/api.py +2 -0
- pipecat/services/heygen/client.py +8 -4
- pipecat/services/heygen/video.py +2 -0
- pipecat/services/hume/__init__.py +5 -0
- pipecat/services/hume/tts.py +220 -0
- pipecat/services/inworld/tts.py +6 -6
- pipecat/services/llm_service.py +15 -5
- pipecat/services/lmnt/tts.py +4 -2
- pipecat/services/mcp_service.py +4 -2
- pipecat/services/mem0/memory.py +6 -5
- pipecat/services/mistral/llm.py +29 -8
- pipecat/services/moondream/vision.py +42 -16
- pipecat/services/neuphonic/tts.py +5 -2
- pipecat/services/openai/__init__.py +1 -0
- pipecat/services/openai/base_llm.py +27 -20
- pipecat/services/openai/realtime/__init__.py +0 -0
- pipecat/services/openai/realtime/context.py +272 -0
- pipecat/services/openai/realtime/events.py +1106 -0
- pipecat/services/openai/realtime/frames.py +37 -0
- pipecat/services/openai/realtime/llm.py +829 -0
- pipecat/services/openai/tts.py +49 -10
- pipecat/services/openai_realtime/__init__.py +27 -0
- pipecat/services/openai_realtime/azure.py +21 -0
- pipecat/services/openai_realtime/context.py +21 -0
- pipecat/services/openai_realtime/events.py +21 -0
- pipecat/services/openai_realtime/frames.py +21 -0
- pipecat/services/openai_realtime_beta/azure.py +16 -0
- pipecat/services/openai_realtime_beta/openai.py +17 -5
- pipecat/services/piper/tts.py +7 -9
- pipecat/services/playht/tts.py +34 -4
- pipecat/services/rime/tts.py +12 -12
- pipecat/services/riva/stt.py +3 -1
- pipecat/services/salesforce/__init__.py +9 -0
- pipecat/services/salesforce/llm.py +700 -0
- pipecat/services/sarvam/__init__.py +7 -0
- pipecat/services/sarvam/stt.py +540 -0
- pipecat/services/sarvam/tts.py +97 -13
- pipecat/services/simli/video.py +2 -2
- pipecat/services/speechmatics/stt.py +22 -10
- pipecat/services/stt_service.py +47 -0
- pipecat/services/tavus/video.py +2 -2
- pipecat/services/tts_service.py +75 -22
- pipecat/services/vision_service.py +7 -6
- pipecat/services/vistaar/llm.py +51 -9
- pipecat/tests/utils.py +4 -4
- pipecat/transcriptions/language.py +41 -1
- pipecat/transports/base_input.py +13 -34
- pipecat/transports/base_output.py +140 -104
- pipecat/transports/daily/transport.py +199 -26
- pipecat/transports/heygen/__init__.py +0 -0
- pipecat/transports/heygen/transport.py +381 -0
- pipecat/transports/livekit/transport.py +228 -63
- pipecat/transports/local/audio.py +6 -1
- pipecat/transports/local/tk.py +11 -2
- pipecat/transports/network/fastapi_websocket.py +1 -1
- pipecat/transports/smallwebrtc/connection.py +103 -19
- pipecat/transports/smallwebrtc/request_handler.py +246 -0
- pipecat/transports/smallwebrtc/transport.py +65 -23
- pipecat/transports/tavus/transport.py +23 -12
- pipecat/transports/websocket/client.py +41 -5
- pipecat/transports/websocket/fastapi.py +21 -11
- pipecat/transports/websocket/server.py +14 -7
- pipecat/transports/whatsapp/api.py +8 -0
- pipecat/transports/whatsapp/client.py +47 -0
- pipecat/utils/base_object.py +54 -22
- pipecat/utils/redis.py +58 -0
- pipecat/utils/string.py +13 -1
- pipecat/utils/tracing/service_decorators.py +21 -21
- pipecat/serializers/genesys.py +0 -95
- pipecat/services/google/test-google-chirp.py +0 -45
- pipecat/services/openai.py +0 -698
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
- /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
pipecat/services/elevenlabs/stt.py
@@ -0,0 +1,339 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+"""ElevenLabs speech-to-text service implementation.
+
+This module provides integration with ElevenLabs' Speech-to-Text API for transcription
+using segmented audio processing. The service uploads audio files and receives
+transcription results directly.
+"""
+
+import io
+from typing import AsyncGenerator, Optional
+
+import aiohttp
+from loguru import logger
+from pydantic import BaseModel
+
+from pipecat.frames.frames import ErrorFrame, Frame, TranscriptionFrame
+from pipecat.services.stt_service import SegmentedSTTService
+from pipecat.transcriptions.language import Language
+from pipecat.utils.time import time_now_iso8601
+from pipecat.utils.tracing.service_decorators import traced_stt
+
+
+def language_to_elevenlabs_language(language: Language) -> Optional[str]:
+    """Convert a Language enum to ElevenLabs language code.
+
+    Source:
+        https://elevenlabs.io/docs/capabilities/speech-to-text
+
+    Args:
+        language: The Language enum value to convert.
+
+    Returns:
+        The corresponding ElevenLabs language code, or None if not supported.
+    """
+    BASE_LANGUAGES = {
+        Language.AF: "afr",  # Afrikaans
+        Language.AM: "amh",  # Amharic
+        Language.AR: "ara",  # Arabic
+        Language.HY: "hye",  # Armenian
+        Language.AS: "asm",  # Assamese
+        Language.AST: "ast",  # Asturian
+        Language.AZ: "aze",  # Azerbaijani
+        Language.BE: "bel",  # Belarusian
+        Language.BN: "ben",  # Bengali
+        Language.BS: "bos",  # Bosnian
+        Language.BG: "bul",  # Bulgarian
+        Language.MY: "mya",  # Burmese
+        Language.YUE: "yue",  # Cantonese
+        Language.CA: "cat",  # Catalan
+        Language.CEB: "ceb",  # Cebuano
+        Language.NY: "nya",  # Chichewa
+        Language.HR: "hrv",  # Croatian
+        Language.CS: "ces",  # Czech
+        Language.DA: "dan",  # Danish
+        Language.NL: "nld",  # Dutch
+        Language.EN: "eng",  # English
+        Language.ET: "est",  # Estonian
+        Language.FIL: "fil",  # Filipino
+        Language.FI: "fin",  # Finnish
+        Language.FR: "fra",  # French
+        Language.FF: "ful",  # Fulah
+        Language.GL: "glg",  # Galician
+        Language.LG: "lug",  # Ganda
+        Language.KA: "kat",  # Georgian
+        Language.DE: "deu",  # German
+        Language.EL: "ell",  # Greek
+        Language.GU: "guj",  # Gujarati
+        Language.HA: "hau",  # Hausa
+        Language.HE: "heb",  # Hebrew
+        Language.HI: "hin",  # Hindi
+        Language.HU: "hun",  # Hungarian
+        Language.IS: "isl",  # Icelandic
+        Language.IG: "ibo",  # Igbo
+        Language.ID: "ind",  # Indonesian
+        Language.GA: "gle",  # Irish
+        Language.IT: "ita",  # Italian
+        Language.JA: "jpn",  # Japanese
+        Language.JV: "jav",  # Javanese
+        Language.KEA: "kea",  # Kabuverdianu
+        Language.KN: "kan",  # Kannada
+        Language.KK: "kaz",  # Kazakh
+        Language.KM: "khm",  # Khmer
+        Language.KO: "kor",  # Korean
+        Language.KU: "kur",  # Kurdish
+        Language.KY: "kir",  # Kyrgyz
+        Language.LO: "lao",  # Lao
+        Language.LV: "lav",  # Latvian
+        Language.LN: "lin",  # Lingala
+        Language.LT: "lit",  # Lithuanian
+        Language.LUO: "luo",  # Luo
+        Language.LB: "ltz",  # Luxembourgish
+        Language.MK: "mkd",  # Macedonian
+        Language.MS: "msa",  # Malay
+        Language.ML: "mal",  # Malayalam
+        Language.MT: "mlt",  # Maltese
+        Language.ZH: "zho",  # Mandarin Chinese
+        Language.MI: "mri",  # Māori
+        Language.MR: "mar",  # Marathi
+        Language.MN: "mon",  # Mongolian
+        Language.NE: "nep",  # Nepali
+        Language.NSO: "nso",  # Northern Sotho
+        Language.NO: "nor",  # Norwegian
+        Language.OC: "oci",  # Occitan
+        Language.OR: "ori",  # Odia
+        Language.PS: "pus",  # Pashto
+        Language.FA: "fas",  # Persian
+        Language.PL: "pol",  # Polish
+        Language.PT: "por",  # Portuguese
+        Language.PA: "pan",  # Punjabi
+        Language.RO: "ron",  # Romanian
+        Language.RU: "rus",  # Russian
+        Language.SR: "srp",  # Serbian
+        Language.SN: "sna",  # Shona
+        Language.SD: "snd",  # Sindhi
+        Language.SK: "slk",  # Slovak
+        Language.SL: "slv",  # Slovenian
+        Language.SO: "som",  # Somali
+        Language.ES: "spa",  # Spanish
+        Language.SW: "swa",  # Swahili
+        Language.SV: "swe",  # Swedish
+        Language.TA: "tam",  # Tamil
+        Language.TG: "tgk",  # Tajik
+        Language.TE: "tel",  # Telugu
+        Language.TH: "tha",  # Thai
+        Language.TR: "tur",  # Turkish
+        Language.UK: "ukr",  # Ukrainian
+        Language.UMB: "umb",  # Umbundu
+        Language.UR: "urd",  # Urdu
+        Language.UZ: "uzb",  # Uzbek
+        Language.VI: "vie",  # Vietnamese
+        Language.CY: "cym",  # Welsh
+        Language.WO: "wol",  # Wolof
+        Language.XH: "xho",  # Xhosa
+        Language.ZU: "zul",  # Zulu
+    }
+
+    result = BASE_LANGUAGES.get(language)
+
+    # If not found in base languages, try to find the base language from a variant
+    if not result:
+        lang_str = str(language.value)
+        base_code = lang_str.split("-")[0].lower()
+        result = base_code if base_code in BASE_LANGUAGES.values() else None
+
+    return result
+
+
+class ElevenLabsSTTService(SegmentedSTTService):
+    """Speech-to-text service using ElevenLabs' file-based API.
+
+    This service uses ElevenLabs' Speech-to-Text API to perform transcription on audio
+    segments. It inherits from SegmentedSTTService to handle audio buffering and speech detection.
+    The service uploads audio files to ElevenLabs and receives transcription results directly.
+    """
+
+    class InputParams(BaseModel):
+        """Configuration parameters for ElevenLabs STT API.
+
+        Parameters:
+            language: Target language for transcription.
+            tag_audio_events: Whether to include audio events like (laughter), (coughing), in the transcription.
+        """
+
+        language: Optional[Language] = None
+        tag_audio_events: bool = True
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        aiohttp_session: aiohttp.ClientSession,
+        base_url: str = "https://api.elevenlabs.io",
+        model: str = "scribe_v1",
+        sample_rate: Optional[int] = None,
+        params: Optional[InputParams] = None,
+        **kwargs,
+    ):
+        """Initialize the ElevenLabs STT service.
+
+        Args:
+            api_key: ElevenLabs API key for authentication.
+            aiohttp_session: aiohttp ClientSession for HTTP requests.
+            base_url: Base URL for ElevenLabs API.
+            model: Model ID for transcription. Defaults to "scribe_v1".
+            sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate.
+            params: Configuration parameters for the STT service.
+            **kwargs: Additional arguments passed to SegmentedSTTService.
+        """
+        super().__init__(
+            sample_rate=sample_rate,
+            **kwargs,
+        )
+
+        params = params or ElevenLabsSTTService.InputParams()
+
+        self._api_key = api_key
+        self._base_url = base_url
+        self._session = aiohttp_session
+        self._model_id = model
+        self._tag_audio_events = params.tag_audio_events
+
+        self._settings = {
+            "language": self.language_to_service_language(params.language)
+            if params.language
+            else "eng",
+        }
+
+    def can_generate_metrics(self) -> bool:
+        """Check if the service can generate processing metrics.
+
+        Returns:
+            True, as ElevenLabs STT service supports metrics generation.
+        """
+        return True
+
+    def language_to_service_language(self, language: Language) -> Optional[str]:
+        """Convert a Language enum to ElevenLabs service-specific language code.
+
+        Args:
+            language: The language to convert.
+
+        Returns:
+            The ElevenLabs-specific language code, or None if not supported.
+        """
+        return language_to_elevenlabs_language(language)
+
+    async def set_language(self, language: Language):
+        """Set the transcription language.
+
+        Args:
+            language: The language to use for speech-to-text transcription.
+        """
+        self.logger.info(f"Switching STT language to: [{language}]")
+        self._settings["language"] = self.language_to_service_language(language)
+
+    async def set_model(self, model: str):
+        """Set the STT model.
+
+        Args:
+            model: The model name to use for transcription.
+
+        Note:
+            ElevenLabs STT API does not currently support model selection.
+            This method is provided for interface compatibility.
+        """
+        await super().set_model(model)
+        self.logger.info(f"Model setting [{model}] noted, but ElevenLabs STT uses default model")
+
+    async def _transcribe_audio(self, audio_data: bytes) -> dict:
+        """Upload audio data to ElevenLabs and get transcription result.
+
+        Args:
+            audio_data: Raw audio bytes in WAV format.
+
+        Returns:
+            The transcription result data.
+
+        Raises:
+            Exception: If transcription fails or returns an error.
+        """
+        url = f"{self._base_url}/v1/speech-to-text"
+        headers = {"xi-api-key": self._api_key}
+
+        # Create form data with the audio file
+        data = aiohttp.FormData()
+        data.add_field(
+            "file",
+            io.BytesIO(audio_data),
+            filename="audio.wav",
+            content_type="audio/x-wav",
+        )
+
+        # Add required model_id, language_code, and tag_audio_events
+        data.add_field("model_id", self._model_id)
+        data.add_field("language_code", self._settings["language"])
+        data.add_field("tag_audio_events", str(self._tag_audio_events).lower())
+
+        async with self._session.post(url, data=data, headers=headers) as response:
+            if response.status != 200:
+                error_text = await response.text()
+                self.logger.error(f"ElevenLabs transcription error: {error_text}")
+                raise Exception(f"Transcription failed with status {response.status}: {error_text}")
+
+            result = await response.json()
+            return result
+
+    @traced_stt
+    async def _handle_transcription(
+        self, transcript: str, is_final: bool, language: Optional[str] = None
+    ):
+        """Handle a transcription result with tracing."""
+        await self.stop_ttfb_metrics()
+        await self.stop_processing_metrics()
+
+    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
+        """Transcribe an audio segment using ElevenLabs' STT API.
+
+        Args:
+            audio: Raw audio bytes in WAV format (already converted by base class).
+
+        Yields:
+            Frame: TranscriptionFrame containing the transcribed text, or ErrorFrame on failure.
+
+        Note:
+            The audio is already in WAV format from the SegmentedSTTService.
+            Only non-empty transcriptions are yielded.
+        """
+        try:
+            await self.start_processing_metrics()
+            await self.start_ttfb_metrics()
+
+            # Upload audio and get transcription result directly
+            result = await self._transcribe_audio(audio)
+
+            # Extract transcription text
+            text = result.get("text", "").strip()
+            if text:
+                # Use the language_code returned by the API
+                detected_language = result.get("language_code", "eng")
+
+                await self._handle_transcription(text, True, detected_language)
+                self.logger.debug(f"Transcription: [{text}]")
+
+                yield TranscriptionFrame(
+                    text,
+                    self._user_id,
+                    time_now_iso8601(),
+                    detected_language,
+                    result=result,
+                )
+
+        except Exception as e:
+            self.logger.error(f"ElevenLabs STT error: {e}")
+            yield ErrorFrame(f"ElevenLabs STT error: {str(e)}")
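The file above is the new segmented STT integration: each detected speech segment is posted as one WAV upload, so transcription arrives per utterance rather than as a live stream. A minimal wiring sketch (not from the package; only the constructor arguments shown in the diff are real, the session handling and placeholder key are assumptions):

    import aiohttp

    from pipecat.services.elevenlabs.stt import ElevenLabsSTTService
    from pipecat.transcriptions.language import Language

    async def build_stt(session: aiohttp.ClientSession) -> ElevenLabsSTTService:
        # The caller owns the aiohttp session lifecycle; the service only borrows it.
        return ElevenLabsSTTService(
            api_key="YOUR_ELEVENLABS_API_KEY",  # placeholder
            aiohttp_session=session,
            params=ElevenLabsSTTService.InputParams(
                language=Language.FR,  # mapped to "fra" by language_to_elevenlabs_language()
                tag_audio_events=False,  # omit (laughter)/(coughing) markers
            ),
        )

One behavior worth noting in language_to_elevenlabs_language(): a regional variant such as "fr-CA" is reduced with split("-")[0] to "fr", and that base code is only accepted if it already appears among the three-letter values of BASE_LANGUAGES, so most variants resolve to None rather than to their base language.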
pipecat/services/elevenlabs/tts.py
@@ -25,9 +25,9 @@ from pipecat.frames.frames import (
     EndFrame,
     ErrorFrame,
     Frame,
+    InterruptionFrame,
     LLMFullResponseEndFrame,
     StartFrame,
-    StartInterruptionFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
@@ -172,16 +172,24 @@ def build_elevenlabs_voice_settings(


 def calculate_word_times(
-    alignment_info: Mapping[str, Any],
-
+    alignment_info: Mapping[str, Any],
+    cumulative_time: float,
+    partial_word: str = "",
+    partial_word_start_time: float = 0.0,
+) -> tuple[List[Tuple[str, float]], str, float]:
     """Calculate word timestamps from character alignment information.

     Args:
         alignment_info: Character alignment data from ElevenLabs API.
         cumulative_time: Base time offset for this chunk.
+        partial_word: Partial word carried over from previous chunk.
+        partial_word_start_time: Start time of the partial word.

     Returns:
-
+        Tuple of (word_times, new_partial_word, new_partial_word_start_time):
+        - word_times: List of (word, timestamp) tuples for complete words
+        - new_partial_word: Incomplete word at end of chunk (empty if chunk ends with space)
+        - new_partial_word_start_time: Start time of the incomplete word
     """
     chars = alignment_info["chars"]
     char_start_times_ms = alignment_info["charStartTimesMs"]
@@ -190,41 +198,37 @@ def calculate_word_times(
         logger.error(
             f"calculate_word_times: length mismatch - chars={len(chars)}, times={len(char_start_times_ms)}"
         )
-        return []
+        return ([], partial_word, partial_word_start_time)

     # Build words and track their start positions
     words = []
-
-    current_word =
-
+    word_start_times = []
+    current_word = partial_word  # Start with any partial word from previous chunk
+    word_start_time = partial_word_start_time if partial_word else None

     for i, char in enumerate(chars):
         if char == " ":
             # End of current word
             if current_word:  # Only add non-empty words
                 words.append(current_word)
-
+                word_start_times.append(word_start_time)
                 current_word = ""
-
+                word_start_time = None
         else:
             # Building a word
-            if
-
+            if word_start_time is None:  # First character of new word
+                # Convert from milliseconds to seconds and add cumulative offset
+                word_start_time = cumulative_time + (char_start_times_ms[i] / 1000.0)
             current_word += char

-    #
-
-        words.append(current_word)
-        word_start_indices.append(word_start_index)
+    # Build result for complete words
+    word_times = list(zip(words, word_start_times))

-    #
-
-
-        # Convert from milliseconds to seconds and add cumulative offset
-        start_time_seconds = cumulative_time + (char_start_times_ms[start_idx] / 1000.0)
-        word_times.append((word, start_time_seconds))
+    # Return any incomplete word at the end of this chunk
+    new_partial_word = current_word if current_word else ""
+    new_partial_word_start_time = word_start_time if word_start_time is not None else 0.0

-    return word_times
+    return (word_times, new_partial_word, new_partial_word_start_time)


 class ElevenLabsTTSService(AudioContextWordTTSService):
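The rewritten calculate_word_times threads a partial word and its start time between alignment chunks, because ElevenLabs streams alignment in arbitrary slices and a word can straddle two of them. A small illustrative trace (alignment values invented for the example; the function is the one in the hunks above):

    chunk1 = {"chars": list("hello wor"), "charStartTimesMs": [0, 40, 80, 120, 160, 200, 240, 280, 320]}
    chunk2 = {"chars": list("ld there"), "charStartTimesMs": [0, 30, 60, 90, 120, 150, 180, 210]}

    # First chunk: "hello" completes at 0.0 s, "wor" is held back as the partial word.
    words, partial, partial_t0 = calculate_word_times(chunk1, cumulative_time=0.0)
    # words == [("hello", 0.0)], partial == "wor", partial_t0 == 0.24

    # Second chunk: "ld" finishes the carried word, which keeps its original start time.
    words2, partial2, _ = calculate_word_times(
        chunk2, cumulative_time=1.0, partial_word=partial, partial_word_start_time=partial_t0
    )
    # words2 == [("world", 0.24)], partial2 == "there" (start 1.0 + 0.090 = 1.09 s)

Under the removed code, "wor" and "ld" would have surfaced as two separate words; the tuple return is what lets the websocket receive loop (further down) persist the carry in self._partial_word between messages.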
@@ -336,6 +340,9 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
         # there's an interruption or TTSStoppedFrame.
         self._started = False
         self._cumulative_time = 0
+        # Track partial words that span across alignment chunks
+        self._partial_word = ""
+        self._partial_word_start_time = 0.0

         # Context management for v1 multi API
         self._context_id = None
@@ -465,7 +472,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
             direction: The direction to push the frame.
         """
         await super().push_frame(frame, direction)
-        if isinstance(frame, (TTSStoppedFrame,
+        if isinstance(frame, (TTSStoppedFrame, InterruptionFrame)):
             self._started = False
         if isinstance(frame, TTSStoppedFrame):
             await self.add_word_timestamps([("Reset", 0)])
@@ -526,6 +533,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
                 url, max_size=16 * 1024 * 1024, additional_headers={"xi-api-key": self._api_key}
             )

+            await self._call_event_handler("on_connected")
         except Exception as e:
             self.logger.error(f"{self} initialization error: {e}")
             self._websocket = None
@@ -544,13 +552,18 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
             logger.debug("Disconnected from ElevenLabs")
         except Exception as e:
             self.logger.error(f"{self} error closing websocket: {e}")
+        finally:
+            self._started = False
+            self._context_id = None
+            self._websocket = None
+            await self._call_event_handler("on_disconnected")

     def _get_websocket(self):
         if self._websocket:
             return self._websocket
         raise Exception("Websocket not connected")

-    async def _handle_interruption(self, frame:
+    async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
         """Handle interruption by closing the current context."""
         await super()._handle_interruption(frame, direction)

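The two connection hunks above also introduce lifecycle events: on_connected fires after the websocket handshake, and on_disconnected fires from the new finally block, which additionally guarantees _started, _context_id, and _websocket are cleared even when closing the socket raises. A hedged sketch of subscribing to them, assuming Pipecat's usual event-handler registration applies to these event names (constructor arguments are placeholders):

    tts = ElevenLabsTTSService(api_key="YOUR_KEY", voice_id="YOUR_VOICE_ID")

    @tts.event_handler("on_connected")
    async def on_connected(service):
        logger.info("ElevenLabs TTS websocket connected")

    @tts.event_handler("on_disconnected")
    async def on_disconnected(service):
        logger.warning("ElevenLabs TTS websocket closed")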
@@ -559,7 +572,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
         logger.trace(f"Closing context {self._context_id} due to interruption")
         try:
             # ElevenLabs requires that Pipecat manages the contexts and closes them
-            # when they're not longer in use. Since
+            # when they're not longer in use. Since an InterruptionFrame is pushed
             # every time the user speaks, we'll use this as a trigger to close the context
             # and reset the state.
             # Note: We do not need to call remove_audio_context here, as the context is
@@ -571,6 +584,8 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
             logger.error(f"Error closing context on interruption: {e}")
         self._context_id = None
         self._started = False
+        self._partial_word = ""
+        self._partial_word_start_time = 0.0

     async def _receive_messages(self):
         """Handle incoming WebSocket messages from ElevenLabs."""
@@ -610,7 +625,14 @@ class ElevenLabsTTSService(AudioContextWordTTSService):

             if msg.get("alignment"):
                 alignment = msg["alignment"]
-                word_times
+                word_times, self._partial_word, self._partial_word_start_time = (
+                    calculate_word_times(
+                        alignment,
+                        self._cumulative_time,
+                        self._partial_word,
+                        self._partial_word_start_time,
+                    )
+                )

                 if word_times:
                     await self.add_word_timestamps(word_times)
@@ -685,6 +707,8 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
                 yield TTSStartedFrame()
                 self._started = True
                 self._cumulative_time = 0
+                self._partial_word = ""
+                self._partial_word_start_time = 0.0
                 # If a context ID does not exist, create a new one and
                 # register it. If an ID exists, that means the Pipeline is
                 # configured for allow_interruptions=False, so continue
@@ -758,6 +782,7 @@ class ElevenLabsHttpTTSService(WordTTSService):
         base_url: str = "https://api.elevenlabs.io",
         sample_rate: Optional[int] = None,
         params: Optional[InputParams] = None,
+        aggregate_sentences: Optional[bool] = True,
         **kwargs,
     ):
         """Initialize the ElevenLabs HTTP TTS service.
@@ -770,10 +795,11 @@ class ElevenLabsHttpTTSService(WordTTSService):
             base_url: Base URL for ElevenLabs HTTP API.
             sample_rate: Audio sample rate. If None, uses default.
             params: Additional input parameters for voice customization.
+            aggregate_sentences: Whether to aggregate sentences within the TTSService.
             **kwargs: Additional arguments passed to the parent service.
         """
         super().__init__(
-            aggregate_sentences=
+            aggregate_sentences=aggregate_sentences,
             push_text_frames=False,
             push_stop_frames=True,
             sample_rate=sample_rate,
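aggregate_sentences was previously hard-coded in the super().__init__() call; the two hunks above surface it as a constructor argument defaulting to True. A sketch of the opt-out, under the assumption that an upstream processor already emits sentence-sized text chunks (voice_id and aiohttp_session are the service's usual arguments, not shown in these hunks):

    tts = ElevenLabsHttpTTSService(
        api_key="YOUR_KEY",  # placeholder
        voice_id="YOUR_VOICE_ID",  # placeholder
        aiohttp_session=session,  # an existing aiohttp.ClientSession
        aggregate_sentences=False,  # forward text as-is instead of buffering into sentences
    )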
@@ -811,6 +837,10 @@ class ElevenLabsHttpTTSService(WordTTSService):
         # Store previous text for context within a turn
         self._previous_text = ""

+        # Track partial words that span across alignment chunks
+        self._partial_word = ""
+        self._partial_word_start_time = 0.0
+
     def language_to_service_language(self, language: Language) -> Optional[str]:
         """Convert pipecat Language to ElevenLabs language code.

@@ -838,6 +868,8 @@ class ElevenLabsHttpTTSService(WordTTSService):
         self._cumulative_time = 0
         self._started = False
         self._previous_text = ""
+        self._partial_word = ""
+        self._partial_word_start_time = 0.0
         logger.debug(f"{self}: Reset internal state")

     async def start(self, frame: StartFrame):
@@ -858,7 +890,7 @@ class ElevenLabsHttpTTSService(WordTTSService):
             direction: The direction to push the frame.
         """
         await super().push_frame(frame, direction)
-        if isinstance(frame, (
+        if isinstance(frame, (InterruptionFrame, TTSStoppedFrame)):
             # Reset timing on interruption or stop
             self._reset_state()

@@ -872,11 +904,13 @@ class ElevenLabsHttpTTSService(WordTTSService):
     def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[str, float]]:
         """Calculate word timing from character alignment data.

+        This method handles partial words that may span across multiple alignment chunks.
+
         Args:
             alignment_info: Character timing data from ElevenLabs.

         Returns:
-            List of (word, timestamp) pairs.
+            List of (word, timestamp) pairs for complete words in this chunk.

         Example input data::

@@ -902,30 +936,28 @@ class ElevenLabsHttpTTSService(WordTTSService):
         # Build the words and find their start times
         words = []
         word_start_times = []
-
-
+        # Start with any partial word from previous chunk
+        current_word = self._partial_word
+        word_start_time = self._partial_word_start_time if self._partial_word else None

         for i, char in enumerate(chars):
             if char == " ":
                 if current_word:  # Only add non-empty words
                     words.append(current_word)
-
-                    word_start_times.append(
-                        self._cumulative_time + char_start_times[first_char_idx]
-                    )
+                    word_start_times.append(word_start_time)
                     current_word = ""
-
+                    word_start_time = None
             else:
-                if
-
+                if word_start_time is None:  # First character of a new word
+                    # Use time of the first character of the word, offset by cumulative time
+                    word_start_time = self._cumulative_time + char_start_times[i]
                 current_word += char

-        #
-
-
-        word_start_times.append(self._cumulative_time + char_start_times[first_char_idx])
+        # Store any incomplete word at the end of this chunk
+        self._partial_word = current_word if current_word else ""
+        self._partial_word_start_time = word_start_time if word_start_time is not None else 0.0

-        # Create word-time pairs
+        # Create word-time pairs for complete words only
         word_times = list(zip(words, word_start_times))

         return word_times
@@ -961,6 +993,9 @@ class ElevenLabsHttpTTSService(WordTTSService):
         if self._voice_settings:
             payload["voice_settings"] = self._voice_settings

+        if self._settings["apply_text_normalization"] is not None:
+            payload["apply_text_normalization"] = self._settings["apply_text_normalization"]
+
         language = self._settings["language"]
         if self._model_name in ELEVENLABS_MULTILINGUAL_MODELS and language:
             payload["language_code"] = language
@@ -981,8 +1016,6 @@ class ElevenLabsHttpTTSService(WordTTSService):
         }
         if self._settings["optimize_streaming_latency"] is not None:
             params["optimize_streaming_latency"] = self._settings["optimize_streaming_latency"]
-        if self._settings["apply_text_normalization"] is not None:
-            params["apply_text_normalization"] = self._settings["apply_text_normalization"]

         self.logger.debug(f"ElevenLabs request - payload: {payload}, params: {params}")

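The last two hunks relocate apply_text_normalization from the query string (params) to the JSON body (payload) of the HTTP TTS request; the setting and its None-check are otherwise unchanged. A sketch of the resulting request body (text and model values are illustrative only):

    payload = {
        "text": "Dr. Smith paid $42 on 3/14.",
        "model_id": "eleven_flash_v2_5",  # illustrative model name
        "apply_text_normalization": "on",  # previously sent as a query parameter
    }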
@@ -1045,6 +1078,14 @@ class ElevenLabsHttpTTSService(WordTTSService):
                 logger.error(f"Error processing response: {e}", exc_info=True)
                 continue

+        # After processing all chunks, emit any remaining partial word
+        # since this is the end of the utterance
+        if self._partial_word:
+            final_word_time = [(self._partial_word, self._partial_word_start_time)]
+            await self.add_word_timestamps(final_word_time)
+            self._partial_word = ""
+            self._partial_word_start_time = 0.0
+
         # After processing all chunks, add the total utterance duration
         # to the cumulative time to ensure next utterance starts after this one
         if utterance_duration > 0: