dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release.
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
- pipecat/adapters/base_llm_adapter.py +38 -1
- pipecat/adapters/services/anthropic_adapter.py +9 -14
- pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
- pipecat/adapters/services/bedrock_adapter.py +236 -13
- pipecat/adapters/services/gemini_adapter.py +12 -8
- pipecat/adapters/services/open_ai_adapter.py +19 -7
- pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
- pipecat/audio/dtmf/dtmf-0.wav +0 -0
- pipecat/audio/dtmf/dtmf-1.wav +0 -0
- pipecat/audio/dtmf/dtmf-2.wav +0 -0
- pipecat/audio/dtmf/dtmf-3.wav +0 -0
- pipecat/audio/dtmf/dtmf-4.wav +0 -0
- pipecat/audio/dtmf/dtmf-5.wav +0 -0
- pipecat/audio/dtmf/dtmf-6.wav +0 -0
- pipecat/audio/dtmf/dtmf-7.wav +0 -0
- pipecat/audio/dtmf/dtmf-8.wav +0 -0
- pipecat/audio/dtmf/dtmf-9.wav +0 -0
- pipecat/audio/dtmf/dtmf-pound.wav +0 -0
- pipecat/audio/dtmf/dtmf-star.wav +0 -0
- pipecat/audio/filters/krisp_viva_filter.py +193 -0
- pipecat/audio/filters/noisereduce_filter.py +15 -0
- pipecat/audio/turn/base_turn_analyzer.py +9 -1
- pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
- pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
- pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
- pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
- pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
- pipecat/audio/vad/data/README.md +10 -0
- pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
- pipecat/audio/vad/silero.py +9 -3
- pipecat/audio/vad/vad_analyzer.py +13 -1
- pipecat/extensions/voicemail/voicemail_detector.py +5 -5
- pipecat/frames/frames.py +277 -86
- pipecat/observers/loggers/debug_log_observer.py +3 -3
- pipecat/observers/loggers/llm_log_observer.py +7 -3
- pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
- pipecat/pipeline/runner.py +18 -6
- pipecat/pipeline/service_switcher.py +64 -36
- pipecat/pipeline/task.py +125 -79
- pipecat/pipeline/tts_switcher.py +30 -0
- pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
- pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
- pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
- pipecat/processors/aggregators/llm_context.py +40 -2
- pipecat/processors/aggregators/llm_response.py +32 -15
- pipecat/processors/aggregators/llm_response_universal.py +19 -15
- pipecat/processors/aggregators/user_response.py +6 -6
- pipecat/processors/aggregators/vision_image_frame.py +24 -2
- pipecat/processors/audio/audio_buffer_processor.py +43 -8
- pipecat/processors/dtmf_aggregator.py +174 -77
- pipecat/processors/filters/stt_mute_filter.py +17 -0
- pipecat/processors/frame_processor.py +110 -24
- pipecat/processors/frameworks/langchain.py +8 -2
- pipecat/processors/frameworks/rtvi.py +210 -68
- pipecat/processors/frameworks/strands_agents.py +170 -0
- pipecat/processors/logger.py +2 -2
- pipecat/processors/transcript_processor.py +26 -5
- pipecat/processors/user_idle_processor.py +35 -11
- pipecat/runner/daily.py +59 -20
- pipecat/runner/run.py +395 -93
- pipecat/runner/types.py +6 -4
- pipecat/runner/utils.py +51 -10
- pipecat/serializers/__init__.py +5 -1
- pipecat/serializers/asterisk.py +16 -2
- pipecat/serializers/convox.py +41 -4
- pipecat/serializers/custom.py +257 -0
- pipecat/serializers/exotel.py +5 -5
- pipecat/serializers/livekit.py +20 -0
- pipecat/serializers/plivo.py +5 -5
- pipecat/serializers/protobuf.py +6 -5
- pipecat/serializers/telnyx.py +2 -2
- pipecat/serializers/twilio.py +43 -23
- pipecat/serializers/vi.py +324 -0
- pipecat/services/ai_service.py +2 -6
- pipecat/services/anthropic/llm.py +2 -25
- pipecat/services/assemblyai/models.py +6 -0
- pipecat/services/assemblyai/stt.py +13 -5
- pipecat/services/asyncai/tts.py +5 -3
- pipecat/services/aws/__init__.py +1 -0
- pipecat/services/aws/llm.py +147 -105
- pipecat/services/aws/nova_sonic/__init__.py +0 -0
- pipecat/services/aws/nova_sonic/context.py +436 -0
- pipecat/services/aws/nova_sonic/frames.py +25 -0
- pipecat/services/aws/nova_sonic/llm.py +1265 -0
- pipecat/services/aws/stt.py +3 -3
- pipecat/services/aws_nova_sonic/__init__.py +19 -1
- pipecat/services/aws_nova_sonic/aws.py +11 -1151
- pipecat/services/aws_nova_sonic/context.py +8 -354
- pipecat/services/aws_nova_sonic/frames.py +13 -17
- pipecat/services/azure/llm.py +51 -1
- pipecat/services/azure/realtime/__init__.py +0 -0
- pipecat/services/azure/realtime/llm.py +65 -0
- pipecat/services/azure/stt.py +15 -0
- pipecat/services/cartesia/stt.py +77 -70
- pipecat/services/cartesia/tts.py +80 -13
- pipecat/services/deepgram/__init__.py +1 -0
- pipecat/services/deepgram/flux/__init__.py +0 -0
- pipecat/services/deepgram/flux/stt.py +640 -0
- pipecat/services/elevenlabs/__init__.py +4 -1
- pipecat/services/elevenlabs/stt.py +339 -0
- pipecat/services/elevenlabs/tts.py +87 -46
- pipecat/services/fish/tts.py +5 -2
- pipecat/services/gemini_multimodal_live/events.py +38 -524
- pipecat/services/gemini_multimodal_live/file_api.py +23 -173
- pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
- pipecat/services/gladia/stt.py +56 -72
- pipecat/services/google/__init__.py +1 -0
- pipecat/services/google/gemini_live/__init__.py +3 -0
- pipecat/services/google/gemini_live/file_api.py +189 -0
- pipecat/services/google/gemini_live/llm.py +1582 -0
- pipecat/services/google/gemini_live/llm_vertex.py +184 -0
- pipecat/services/google/llm.py +15 -11
- pipecat/services/google/llm_openai.py +3 -3
- pipecat/services/google/llm_vertex.py +86 -16
- pipecat/services/google/stt.py +4 -0
- pipecat/services/google/tts.py +7 -3
- pipecat/services/heygen/api.py +2 -0
- pipecat/services/heygen/client.py +8 -4
- pipecat/services/heygen/video.py +2 -0
- pipecat/services/hume/__init__.py +5 -0
- pipecat/services/hume/tts.py +220 -0
- pipecat/services/inworld/tts.py +6 -6
- pipecat/services/llm_service.py +15 -5
- pipecat/services/lmnt/tts.py +4 -2
- pipecat/services/mcp_service.py +4 -2
- pipecat/services/mem0/memory.py +6 -5
- pipecat/services/mistral/llm.py +29 -8
- pipecat/services/moondream/vision.py +42 -16
- pipecat/services/neuphonic/tts.py +5 -2
- pipecat/services/openai/__init__.py +1 -0
- pipecat/services/openai/base_llm.py +27 -20
- pipecat/services/openai/realtime/__init__.py +0 -0
- pipecat/services/openai/realtime/context.py +272 -0
- pipecat/services/openai/realtime/events.py +1106 -0
- pipecat/services/openai/realtime/frames.py +37 -0
- pipecat/services/openai/realtime/llm.py +829 -0
- pipecat/services/openai/tts.py +49 -10
- pipecat/services/openai_realtime/__init__.py +27 -0
- pipecat/services/openai_realtime/azure.py +21 -0
- pipecat/services/openai_realtime/context.py +21 -0
- pipecat/services/openai_realtime/events.py +21 -0
- pipecat/services/openai_realtime/frames.py +21 -0
- pipecat/services/openai_realtime_beta/azure.py +16 -0
- pipecat/services/openai_realtime_beta/openai.py +17 -5
- pipecat/services/piper/tts.py +7 -9
- pipecat/services/playht/tts.py +34 -4
- pipecat/services/rime/tts.py +12 -12
- pipecat/services/riva/stt.py +3 -1
- pipecat/services/salesforce/__init__.py +9 -0
- pipecat/services/salesforce/llm.py +700 -0
- pipecat/services/sarvam/__init__.py +7 -0
- pipecat/services/sarvam/stt.py +540 -0
- pipecat/services/sarvam/tts.py +97 -13
- pipecat/services/simli/video.py +2 -2
- pipecat/services/speechmatics/stt.py +22 -10
- pipecat/services/stt_service.py +47 -0
- pipecat/services/tavus/video.py +2 -2
- pipecat/services/tts_service.py +75 -22
- pipecat/services/vision_service.py +7 -6
- pipecat/services/vistaar/llm.py +51 -9
- pipecat/tests/utils.py +4 -4
- pipecat/transcriptions/language.py +41 -1
- pipecat/transports/base_input.py +13 -34
- pipecat/transports/base_output.py +140 -104
- pipecat/transports/daily/transport.py +199 -26
- pipecat/transports/heygen/__init__.py +0 -0
- pipecat/transports/heygen/transport.py +381 -0
- pipecat/transports/livekit/transport.py +228 -63
- pipecat/transports/local/audio.py +6 -1
- pipecat/transports/local/tk.py +11 -2
- pipecat/transports/network/fastapi_websocket.py +1 -1
- pipecat/transports/smallwebrtc/connection.py +103 -19
- pipecat/transports/smallwebrtc/request_handler.py +246 -0
- pipecat/transports/smallwebrtc/transport.py +65 -23
- pipecat/transports/tavus/transport.py +23 -12
- pipecat/transports/websocket/client.py +41 -5
- pipecat/transports/websocket/fastapi.py +21 -11
- pipecat/transports/websocket/server.py +14 -7
- pipecat/transports/whatsapp/api.py +8 -0
- pipecat/transports/whatsapp/client.py +47 -0
- pipecat/utils/base_object.py +54 -22
- pipecat/utils/redis.py +58 -0
- pipecat/utils/string.py +13 -1
- pipecat/utils/tracing/service_decorators.py +21 -21
- pipecat/serializers/genesys.py +0 -95
- pipecat/services/google/test-google-chirp.py +0 -45
- pipecat/services/openai.py +0 -698
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
- /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
pipecat/services/sarvam/__init__.py
@@ -4,5 +4,12 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+import sys
 
+from pipecat.services import DeprecatedModuleProxy
+
+from .stt import *
 from .tts import *
+
+# Old
+sys.modules[__name__] = DeprecatedModuleProxy(globals(), "sarvam", "sarvam.tts")
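These seven added lines (matching the `+7 -0` entry for `pipecat/services/sarvam/__init__.py` in the file list) turn the package init into a deprecation shim: both submodules are star-re-exported, and the module object in `sys.modules` is swapped for a `DeprecatedModuleProxy`. A minimal caller-side sketch, assuming the proxy forwards attribute access to the re-exported globals and emits a deprecation warning (the warning text lives in `pipecat.services` and is not shown in this diff):

# Caller-side sketch (hypothetical; not part of the diff).
# The old flat import path keeps working through the proxy's attribute forwarding...
from pipecat.services.sarvam import SarvamTTSService  # expected to emit a deprecation warning

# ...while importing from the explicit submodule is the non-deprecated path:
from pipecat.services.sarvam.tts import SarvamTTSService

The next hunk adds the new `stt` submodule that the shim re-exports.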
pipecat/services/sarvam/stt.py
@@ -0,0 +1,540 @@
+"""Sarvam AI Speech-to-Text service implementation.
+
+This module provides a streaming Speech-to-Text service using Sarvam AI's WebSocket-based
+API. It supports real-time transcription with Voice Activity Detection (VAD) and
+can handle multiple audio formats for Indian language speech recognition.
+"""
+
+import asyncio
+import base64
+import json
+from enum import StrEnum
+from typing import Literal, Optional
+from urllib.parse import urlencode
+
+from loguru import logger
+from pydantic import BaseModel
+
+from pipecat.audio.resamplers.resampy_resampler import ResampyResampler
+from pipecat.frames.frames import (
+    CancelFrame,
+    EndFrame,
+    ErrorFrame,
+    StartFrame,
+    TranscriptionFrame,
+)
+from pipecat.services.stt_service import STTService
+from pipecat.transcriptions.language import Language
+from pipecat.utils.time import time_now_iso8601
+from pipecat.utils.tracing.service_decorators import traced_stt
+
+try:
+    import websockets
+    from sarvamai import AsyncSarvamAI
+    from sarvamai.speech_to_text_streaming.socket_client import (
+        AsyncSpeechToTextStreamingSocketClient,
+    )
+    from sarvamai.speech_to_text_translate_streaming.socket_client import (
+        AsyncSpeechToTextTranslateStreamingSocketClient,
+    )
+    from websockets.protocol import State
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error("In order to use Sarvam, you need to `pip install pipecat-ai[sarvam]`.")
+    raise Exception(f"Missing module: {e}")
+
+
+def language_to_sarvam_language(language) -> str:
+    """Convert Language enum or string to Sarvam language code.
+
+    Args:
+        language: The Language enum or language code string to convert.
+
+    Returns:
+        The corresponding Sarvam language code string.
+
+    Raises:
+        ValueError: If the language is not supported by Sarvam.
+    """
+    # If already a string in the right format, return it
+    if isinstance(language, str):
+        if "-" in language:  # Already in format like "hi-IN"
+            return language
+        # Convert short codes to full format
+        lang_map = {
+            "hi": "hi-IN",
+            "bn": "bn-IN",
+            "gu": "gu-IN",
+            "kn": "kn-IN",
+            "ml": "ml-IN",
+            "mr": "mr-IN",
+            "ta": "ta-IN",
+            "te": "te-IN",
+            "pa": "pa-IN",
+            "or": "od-IN",
+            "as": "as-IN",
+            "en": "en-IN",
+        }
+        if language.lower() in lang_map:
+            return lang_map[language.lower()]
+        raise ValueError(f"Unsupported language string: {language}")
+
+    # Handle Language enum
+    match language:
+        case Language.BN_IN:
+            return "bn-IN"
+        case Language.GU_IN:
+            return "gu-IN"
+        case Language.HI_IN:
+            return "hi-IN"
+        case Language.KN_IN:
+            return "kn-IN"
+        case Language.ML_IN:
+            return "ml-IN"
+        case Language.MR_IN:
+            return "mr-IN"
+        case Language.TA_IN:
+            return "ta-IN"
+        case Language.TE_IN:
+            return "te-IN"
+        case Language.PA_IN:
+            return "pa-IN"
+        case Language.OR_IN:
+            return "od-IN"
+        case Language.EN_US:
+            return "en-US"
+        case Language.EN_IN:
+            return "en-IN"
+        case Language.AS_IN:
+            return "as-IN"
+        case _:
+            raise ValueError(f"Unsupported language: {language}")
+
+
+class TranscriptionMetrics(BaseModel):
+    """Metrics for transcription performance."""
+
+    audio_duration: float
+    processing_latency: float
+
+
+class TranscriptionData(BaseModel):
+    """Data structure for transcription results."""
+
+    request_id: str
+    transcript: str
+    language_code: Optional[str]
+    metrics: Optional[TranscriptionMetrics] = None
+    is_final: Optional[bool] = None
+
+
+class TranscriptionResponse(BaseModel):
+    """Response structure for transcription data."""
+
+    type: Literal["data"]
+    data: TranscriptionData
+
+
+class VADSignal(StrEnum):
+    """Voice Activity Detection signal types."""
+
+    START = "START_SPEECH"
+    END = "END_SPEECH"
+
+
+class EventData(BaseModel):
+    """Data structure for VAD events."""
+
+    signal_type: VADSignal
+    occured_at: float
+
+
+class EventResponse(BaseModel):
+    """Response structure for VAD events."""
+
+    type: Literal["events"]
+    data: EventData
+
+
+class SarvamSTTService(STTService):
+    """Sarvam speech-to-text service.
+
+    Provides real-time speech recognition using Sarvam's WebSocket API.
+    Supports both Saarika (transcription) and Saaras (translation) models.
+
+    Models:
+    - Saarika (saarika:v2.5): Transcription in a single language
+    - Saaras (saaras:v2.5): Translation from source language to target language
+
+    The service automatically selects the correct endpoint based on the model name.
+    """
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        model: str = "saaras:v2.5",
+        language: Language = Language.HI_IN,
+        **kwargs,
+    ):
+        """Initialize the Sarvam STT service.
+
+        Args:
+            api_key: Sarvam API key for authentication.
+            model: Sarvam model to use for transcription.
+            language: Language for transcription. Defaults to Hindi (India).
+            **kwargs: Additional arguments passed to the parent STTService.
+                Note: Sarvam requires 16kHz audio. If your input is a different
+                sample rate, it will be automatically resampled to 16kHz.
+        """
+        super().__init__(**kwargs)
+
+        self.set_model_name(model)
+        self._api_key = api_key
+        self._model = model
+        self._language = language
+        self._target_sample_rate = 16000  # Sarvam requires 16kHz
+
+        self._client = AsyncSarvamAI(api_subscription_key=api_key)
+        self._websocket = None
+        self._websocket_connection = None
+        self._listening_task = None
+        self._resampler = ResampyResampler()
+
+        # Register VAD event handlers
+        self._register_event_handler("on_speech_started")
+        self._register_event_handler("on_speech_ended")
+
+    def can_generate_metrics(self) -> bool:
+        """Check if this service can generate processing metrics.
+
+        Returns:
+            True, as Sarvam service supports metrics generation.
+        """
+        return True
+
+    async def set_model(self, model: str):
+        """Set the Sarvam model and reconnect.
+
+        Args:
+            model: The Sarvam model name to use.
+        """
+        await super().set_model(model)
+        logger.info(f"Switching STT model to: [{model}]")
+        self._model = model
+        await self._disconnect()
+        await self._connect()
+
+    async def set_language(self, language: Language):
+        """Set the language and reconnect.
+
+        Args:
+            language: The Language enum to use.
+        """
+        logger.info(f"Switching STT language to: [{language}]")
+        self._language = language
+        await self._disconnect()
+        await self._connect()
+
+    async def start(self, frame: StartFrame):
+        """Start the Sarvam STT service.
+
+        Args:
+            frame: The start frame containing initialization parameters.
+        """
+        await super().start(frame)
+        await self._connect()
+
+    async def stop(self, frame: EndFrame):
+        """Stop the Sarvam STT service.
+
+        Args:
+            frame: The end frame.
+        """
+        await super().stop(frame)
+        await self._disconnect()
+
+    async def cancel(self, frame: CancelFrame):
+        """Cancel the Sarvam STT service.
+
+        Args:
+            frame: The cancel frame.
+        """
+        await super().cancel(frame)
+        await self._disconnect()
+
+    async def run_stt(self, audio: bytes):
+        """Send audio data to Sarvam for transcription.
+
+        Args:
+            audio: Raw audio bytes to transcribe.
+
+        Yields:
+            Frame: None (transcription results come via WebSocket callbacks).
+        """
+        if not self._websocket_connection or self._websocket_connection.state != State.OPEN:
+            logger.warning("WebSocket not connected, cannot process audio")
+            yield None
+            return
+
+        try:
+            # Resample audio to 16kHz if needed
+            if self.sample_rate != self._target_sample_rate:
+                audio = await self._resampler.resample(
+                    audio, self.sample_rate, self._target_sample_rate
+                )
+
+            # Convert audio bytes to base64 for Sarvam API
+            audio_base64 = base64.b64encode(audio).decode("utf-8")
+
+            # Sarvam requires 'audio/wav' encoding (even for raw PCM data)
+            message = {
+                "audio": {
+                    "data": audio_base64,
+                    "encoding": "audio/wav",
+                    "sample_rate": self._target_sample_rate,
+                }
+            }
+            await self._websocket_connection.send(json.dumps(message))
+
+        except websockets.exceptions.ConnectionClosed:
+            logger.error("WebSocket connection closed")
+            await self.push_error(ErrorFrame("WebSocket connection closed"))
+        except Exception as e:
+            logger.error(f"Error sending audio to Sarvam: {e}")
+            await self.push_error(ErrorFrame(f"Failed to send audio: {e}"))
+
+        yield None
+
+    async def _connect(self):
+        """Connect to Sarvam WebSocket API directly."""
+        logger.debug(f"Connecting to Sarvam with model: {self._model}")
+
+        try:
+            base_url = self._client._client_wrapper.get_environment().production
+
+            # Choose endpoint and socket class based on model
+            if self._model.startswith("saarika"):
+                # Saarika = Transcription endpoint
+                path = "/speech-to-text/ws"
+                query_params = {
+                    "language-code": language_to_sarvam_language(self._language),
+                    "model": self._model,
+                    "vad_signals": "true",
+                }
+                socket_cls = AsyncSpeechToTextStreamingSocketClient
+                logger.debug(
+                    f"Using Saarika transcription endpoint with language: {self._language}"
+                )
+            else:
+                # Saaras = Translation endpoint
+                path = "/speech-to-text-translate/ws"
+                query_params = {
+                    "model": self._model,
+                    "vad_signals": "true",
+                }
+                socket_cls = AsyncSpeechToTextTranslateStreamingSocketClient
+                logger.debug("Using Saaras translation endpoint")
+
+            ws_url = f"{base_url}{path}?{urlencode(query_params)}"
+
+            # Get headers
+            headers = self._client._client_wrapper.get_headers()
+            headers["Api-Subscription-Key"] = self._api_key
+
+            # Connect to WebSocket
+            self._websocket_connection = await websockets.connect(
+                ws_url, additional_headers=headers
+            )
+
+            # Create the socket client wrapper
+            self._websocket = socket_cls(websocket=self._websocket_connection)
+
+            # Start listening for messages
+            self._listening_task = asyncio.create_task(self._listen_for_messages())
+
+            logger.info(f"Connected to Sarvam successfully with model: {self._model}")
+
+        except websockets.exceptions.InvalidStatusCode as e:
+            error_msg = f"Failed to connect to Sarvam: HTTP {e.status_code}"
+            if e.status_code == 403:
+                if self._model.startswith("saarika"):
+                    error_msg += f" - Access denied. Check: 1) API key has Saarika access, 2) Model '{self._model}' exists (try saarika:v2.5), 3) Using correct endpoint (transcription)"
+                else:
+                    error_msg += f" - Access denied. Check: 1) API key has Saaras access, 2) Model '{self._model}' exists (try saaras:v2.5), 3) Using correct endpoint (translation)"
+            elif e.status_code == 401:
+                error_msg += " - Invalid API key"
+            logger.error(error_msg)
+            self._websocket = None
+            self._websocket_connection = None
+            await self.push_error(ErrorFrame(error_msg))
+        except Exception as e:
+            logger.error(f"Failed to connect to Sarvam: {e}")
+            self._websocket = None
+            self._websocket_connection = None
+            await self.push_error(ErrorFrame(f"Failed to connect to Sarvam: {e}"))
+
+    async def _disconnect(self):
+        """Disconnect from Sarvam WebSocket API."""
+        if self._listening_task:
+            self._listening_task.cancel()
+            try:
+                await self._listening_task
+            except asyncio.CancelledError:
+                pass
+            self._listening_task = None
+
+        if self._websocket_connection and self._websocket_connection.state == State.OPEN:
+            try:
+                await self._websocket_connection.close()
+            except Exception as e:
+                logger.error(f"Error closing WebSocket connection: {e}")
+            finally:
+                logger.debug("Disconnected from Sarvam WebSocket")
+                self._websocket_connection = None
+                self._websocket = None
+
+    async def _listen_for_messages(self):
+        """Listen for messages from Sarvam WebSocket."""
+        try:
+            while self._websocket_connection and self._websocket_connection.state == State.OPEN:
+                try:
+                    message = await self._websocket_connection.recv()
+                    response = json.loads(message)
+                    await self._handle_response(response)
+
+                except websockets.exceptions.ConnectionClosed:
+                    logger.warning("WebSocket connection closed")
+                    break
+                except json.JSONDecodeError as e:
+                    logger.error(f"Failed to parse JSON response: {e}")
+                    continue
+                except Exception as e:
+                    logger.error(f"Error receiving message from Sarvam: {e}")
+                    break
+
+        except asyncio.CancelledError:
+            logger.debug("Message listening cancelled")
+        except Exception as e:
+            logger.error(f"Unexpected error in message listener: {e}")
+            await self.push_error(ErrorFrame(f"Message listener error: {e}"))
+
+    async def _handle_response(self, response):
+        """Handle transcription response from Sarvam.
+
+        Handles both Saarika (transcription) and Saaras (translation) message formats.
+
+        Args:
+            response: The response object from Sarvam WebSocket.
+        """
+        logger.debug(f"Received response: {response}")
+
+        try:
+            msg_type = response.get("type")
+
+            # Error handling
+            if msg_type == "error":
+                error_msg = response.get("data", {}).get("message", "Unknown error")
+                logger.error(f"Sarvam API error: {error_msg}")
+                await self.push_error(ErrorFrame(f"Sarvam API error: {error_msg}"))
+                await self._disconnect()
+                return
+
+            # Modern Saarika/Saaras message format
+            if msg_type == "speech_start":
+                await self.start_metrics()
+                logger.debug("User started speaking")
+                await self._call_event_handler("on_speech_started")
+                return
+
+            if msg_type == "speech_end":
+                logger.debug("User stopped speaking")
+                await self._call_event_handler("on_speech_ended")
+                return
+
+            if msg_type == "transcript":
+                await self.stop_ttfb_metrics()
+                # Handle both Saarika (text) and Saaras (text + text_translated)
+                transcript = response.get("text") or response.get("text_translated") or ""
+                language_code = (
+                    response.get("source_language_code") or response.get("language_code") or "hi-IN"
+                )
+                language = self._map_language_code_to_enum(language_code)
+
+                if transcript.strip():
+                    await self.push_frame(
+                        TranscriptionFrame(
+                            transcript,
+                            self._user_id,
+                            time_now_iso8601(),
+                            language,
+                            result=response,
+                        )
+                    )
+                await self.stop_processing_metrics()
+                return
+
+            # Legacy format (backward compatibility)
+            if msg_type == "events":
+                parsed = EventResponse(**response)
+                signal = parsed.data.signal_type
+                timestamp = parsed.data.occured_at
+                logger.debug(f"VAD Signal: {signal}, Occurred at: {timestamp}")
+
+                if signal == VADSignal.START:
+                    await self.start_metrics()
+                    logger.debug("User started speaking")
+                    await self._call_event_handler("on_speech_started")
+                elif signal == VADSignal.END:
+                    logger.debug("User stopped speaking")
+                    await self._call_event_handler("on_speech_ended")
+                return
+
+            if msg_type == "data":
+                await self.stop_ttfb_metrics()
+                parsed = TranscriptionResponse(**response)
+                transcript = parsed.data.transcript
+                language_code = parsed.data.language_code or "hi-IN"
+                language = self._map_language_code_to_enum(language_code)
+
+                if transcript and transcript.strip():
+                    await self.push_frame(
+                        TranscriptionFrame(
+                            transcript,
+                            self._user_id,
+                            time_now_iso8601(),
+                            language,
+                            result=response,
+                        )
+                    )
+                await self.stop_processing_metrics()
+                return
+
+        except Exception as e:
+            logger.error(f"Error handling Sarvam response: {e}")
+            await self.push_error(ErrorFrame(f"Failed to handle response: {e}"))
+
+    def _map_language_code_to_enum(self, language_code: str) -> Language:
+        """Map Sarvam language code (e.g., "hi-IN") to pipecat Language enum."""
+        logger.debug(f"Audio language detected as: {language_code}")
+        mapping = {
+            "bn-IN": Language.BN_IN,
+            "gu-IN": Language.GU_IN,
+            "hi-IN": Language.HI_IN,
+            "kn-IN": Language.KN_IN,
+            "ml-IN": Language.ML_IN,
+            "mr-IN": Language.MR_IN,
+            "ta-IN": Language.TA_IN,
+            "te-IN": Language.TE_IN,
+            "pa-IN": Language.PA_IN,
+            "od-IN": Language.OR_IN,
+            "en-US": Language.EN_US,
+            "en-IN": Language.EN_IN,
+            "as-IN": Language.AS_IN,
+        }
+        return mapping.get(language_code, Language.HI_IN)
+
+    async def start_metrics(self):
+        """Start TTFB and processing metrics collection."""
+        await self.start_ttfb_metrics()
+        await self.start_processing_metrics()