dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
- dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
- pipecat/__init__.py +17 -0
- pipecat/adapters/base_llm_adapter.py +36 -1
- pipecat/adapters/schemas/direct_function.py +296 -0
- pipecat/adapters/schemas/function_schema.py +15 -6
- pipecat/adapters/schemas/tools_schema.py +55 -7
- pipecat/adapters/services/anthropic_adapter.py +22 -3
- pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
- pipecat/adapters/services/bedrock_adapter.py +22 -3
- pipecat/adapters/services/gemini_adapter.py +16 -3
- pipecat/adapters/services/open_ai_adapter.py +17 -2
- pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
- pipecat/audio/filters/base_audio_filter.py +30 -6
- pipecat/audio/filters/koala_filter.py +37 -2
- pipecat/audio/filters/krisp_filter.py +59 -6
- pipecat/audio/filters/noisereduce_filter.py +37 -0
- pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
- pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
- pipecat/audio/mixers/base_audio_mixer.py +30 -7
- pipecat/audio/mixers/soundfile_mixer.py +53 -6
- pipecat/audio/resamplers/base_audio_resampler.py +17 -9
- pipecat/audio/resamplers/resampy_resampler.py +26 -1
- pipecat/audio/resamplers/soxr_resampler.py +32 -1
- pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
- pipecat/audio/utils.py +194 -1
- pipecat/audio/vad/silero.py +60 -3
- pipecat/audio/vad/vad_analyzer.py +114 -30
- pipecat/clocks/base_clock.py +19 -0
- pipecat/clocks/system_clock.py +25 -0
- pipecat/extensions/voicemail/__init__.py +0 -0
- pipecat/extensions/voicemail/voicemail_detector.py +707 -0
- pipecat/frames/frames.py +590 -156
- pipecat/metrics/metrics.py +64 -1
- pipecat/observers/base_observer.py +58 -19
- pipecat/observers/loggers/debug_log_observer.py +56 -64
- pipecat/observers/loggers/llm_log_observer.py +8 -1
- pipecat/observers/loggers/transcription_log_observer.py +19 -7
- pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
- pipecat/observers/turn_tracking_observer.py +26 -1
- pipecat/pipeline/base_pipeline.py +5 -7
- pipecat/pipeline/base_task.py +52 -9
- pipecat/pipeline/parallel_pipeline.py +121 -177
- pipecat/pipeline/pipeline.py +129 -20
- pipecat/pipeline/runner.py +50 -1
- pipecat/pipeline/sync_parallel_pipeline.py +132 -32
- pipecat/pipeline/task.py +263 -280
- pipecat/pipeline/task_observer.py +85 -34
- pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
- pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
- pipecat/processors/aggregators/gated.py +25 -24
- pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
- pipecat/processors/aggregators/llm_response.py +398 -89
- pipecat/processors/aggregators/openai_llm_context.py +161 -13
- pipecat/processors/aggregators/sentence.py +25 -14
- pipecat/processors/aggregators/user_response.py +28 -3
- pipecat/processors/aggregators/vision_image_frame.py +24 -14
- pipecat/processors/async_generator.py +28 -0
- pipecat/processors/audio/audio_buffer_processor.py +78 -37
- pipecat/processors/consumer_processor.py +25 -6
- pipecat/processors/filters/frame_filter.py +23 -0
- pipecat/processors/filters/function_filter.py +30 -0
- pipecat/processors/filters/identity_filter.py +17 -2
- pipecat/processors/filters/null_filter.py +24 -1
- pipecat/processors/filters/stt_mute_filter.py +56 -21
- pipecat/processors/filters/wake_check_filter.py +46 -3
- pipecat/processors/filters/wake_notifier_filter.py +21 -3
- pipecat/processors/frame_processor.py +488 -131
- pipecat/processors/frameworks/langchain.py +38 -3
- pipecat/processors/frameworks/rtvi.py +719 -34
- pipecat/processors/gstreamer/pipeline_source.py +41 -0
- pipecat/processors/idle_frame_processor.py +26 -3
- pipecat/processors/logger.py +23 -0
- pipecat/processors/metrics/frame_processor_metrics.py +77 -4
- pipecat/processors/metrics/sentry.py +42 -4
- pipecat/processors/producer_processor.py +34 -14
- pipecat/processors/text_transformer.py +22 -10
- pipecat/processors/transcript_processor.py +48 -29
- pipecat/processors/user_idle_processor.py +31 -21
- pipecat/runner/__init__.py +1 -0
- pipecat/runner/daily.py +132 -0
- pipecat/runner/livekit.py +148 -0
- pipecat/runner/run.py +543 -0
- pipecat/runner/types.py +67 -0
- pipecat/runner/utils.py +515 -0
- pipecat/serializers/base_serializer.py +42 -0
- pipecat/serializers/exotel.py +17 -6
- pipecat/serializers/genesys.py +95 -0
- pipecat/serializers/livekit.py +33 -0
- pipecat/serializers/plivo.py +16 -15
- pipecat/serializers/protobuf.py +37 -1
- pipecat/serializers/telnyx.py +18 -17
- pipecat/serializers/twilio.py +32 -16
- pipecat/services/ai_service.py +5 -3
- pipecat/services/anthropic/llm.py +113 -43
- pipecat/services/assemblyai/models.py +63 -5
- pipecat/services/assemblyai/stt.py +64 -11
- pipecat/services/asyncai/__init__.py +0 -0
- pipecat/services/asyncai/tts.py +501 -0
- pipecat/services/aws/llm.py +185 -111
- pipecat/services/aws/stt.py +217 -23
- pipecat/services/aws/tts.py +118 -52
- pipecat/services/aws/utils.py +101 -5
- pipecat/services/aws_nova_sonic/aws.py +82 -64
- pipecat/services/aws_nova_sonic/context.py +15 -6
- pipecat/services/azure/common.py +10 -2
- pipecat/services/azure/image.py +32 -0
- pipecat/services/azure/llm.py +9 -7
- pipecat/services/azure/stt.py +65 -2
- pipecat/services/azure/tts.py +154 -23
- pipecat/services/cartesia/stt.py +125 -8
- pipecat/services/cartesia/tts.py +102 -38
- pipecat/services/cerebras/llm.py +15 -23
- pipecat/services/deepgram/stt.py +19 -11
- pipecat/services/deepgram/tts.py +36 -0
- pipecat/services/deepseek/llm.py +14 -23
- pipecat/services/elevenlabs/tts.py +330 -64
- pipecat/services/fal/image.py +43 -0
- pipecat/services/fal/stt.py +48 -10
- pipecat/services/fireworks/llm.py +14 -21
- pipecat/services/fish/tts.py +109 -9
- pipecat/services/gemini_multimodal_live/__init__.py +1 -0
- pipecat/services/gemini_multimodal_live/events.py +83 -2
- pipecat/services/gemini_multimodal_live/file_api.py +189 -0
- pipecat/services/gemini_multimodal_live/gemini.py +218 -21
- pipecat/services/gladia/config.py +17 -10
- pipecat/services/gladia/stt.py +82 -36
- pipecat/services/google/frames.py +40 -0
- pipecat/services/google/google.py +2 -0
- pipecat/services/google/image.py +39 -2
- pipecat/services/google/llm.py +176 -58
- pipecat/services/google/llm_openai.py +26 -4
- pipecat/services/google/llm_vertex.py +37 -15
- pipecat/services/google/rtvi.py +41 -0
- pipecat/services/google/stt.py +65 -17
- pipecat/services/google/test-google-chirp.py +45 -0
- pipecat/services/google/tts.py +390 -19
- pipecat/services/grok/llm.py +8 -6
- pipecat/services/groq/llm.py +8 -6
- pipecat/services/groq/stt.py +13 -9
- pipecat/services/groq/tts.py +40 -0
- pipecat/services/hamsa/__init__.py +9 -0
- pipecat/services/hamsa/stt.py +241 -0
- pipecat/services/heygen/__init__.py +5 -0
- pipecat/services/heygen/api.py +281 -0
- pipecat/services/heygen/client.py +620 -0
- pipecat/services/heygen/video.py +338 -0
- pipecat/services/image_service.py +5 -3
- pipecat/services/inworld/__init__.py +1 -0
- pipecat/services/inworld/tts.py +592 -0
- pipecat/services/llm_service.py +127 -45
- pipecat/services/lmnt/tts.py +80 -7
- pipecat/services/mcp_service.py +85 -44
- pipecat/services/mem0/memory.py +42 -13
- pipecat/services/minimax/tts.py +74 -15
- pipecat/services/mistral/__init__.py +0 -0
- pipecat/services/mistral/llm.py +185 -0
- pipecat/services/moondream/vision.py +55 -10
- pipecat/services/neuphonic/tts.py +275 -48
- pipecat/services/nim/llm.py +8 -6
- pipecat/services/ollama/llm.py +27 -7
- pipecat/services/openai/base_llm.py +54 -16
- pipecat/services/openai/image.py +30 -0
- pipecat/services/openai/llm.py +7 -5
- pipecat/services/openai/stt.py +13 -9
- pipecat/services/openai/tts.py +42 -10
- pipecat/services/openai_realtime_beta/azure.py +11 -9
- pipecat/services/openai_realtime_beta/context.py +7 -5
- pipecat/services/openai_realtime_beta/events.py +10 -7
- pipecat/services/openai_realtime_beta/openai.py +37 -18
- pipecat/services/openpipe/llm.py +30 -24
- pipecat/services/openrouter/llm.py +9 -7
- pipecat/services/perplexity/llm.py +15 -19
- pipecat/services/piper/tts.py +26 -12
- pipecat/services/playht/tts.py +227 -65
- pipecat/services/qwen/llm.py +8 -6
- pipecat/services/rime/tts.py +128 -17
- pipecat/services/riva/stt.py +160 -22
- pipecat/services/riva/tts.py +67 -2
- pipecat/services/sambanova/llm.py +19 -17
- pipecat/services/sambanova/stt.py +14 -8
- pipecat/services/sarvam/tts.py +60 -13
- pipecat/services/simli/video.py +82 -21
- pipecat/services/soniox/__init__.py +0 -0
- pipecat/services/soniox/stt.py +398 -0
- pipecat/services/speechmatics/stt.py +29 -17
- pipecat/services/stt_service.py +47 -11
- pipecat/services/tavus/video.py +94 -25
- pipecat/services/together/llm.py +8 -6
- pipecat/services/tts_service.py +77 -53
- pipecat/services/ultravox/stt.py +46 -43
- pipecat/services/vision_service.py +5 -3
- pipecat/services/websocket_service.py +12 -11
- pipecat/services/whisper/base_stt.py +58 -12
- pipecat/services/whisper/stt.py +69 -58
- pipecat/services/xtts/tts.py +59 -2
- pipecat/sync/base_notifier.py +19 -0
- pipecat/sync/event_notifier.py +24 -0
- pipecat/tests/utils.py +73 -5
- pipecat/transcriptions/language.py +24 -0
- pipecat/transports/base_input.py +112 -8
- pipecat/transports/base_output.py +235 -13
- pipecat/transports/base_transport.py +119 -0
- pipecat/transports/local/audio.py +76 -0
- pipecat/transports/local/tk.py +84 -0
- pipecat/transports/network/fastapi_websocket.py +174 -15
- pipecat/transports/network/small_webrtc.py +383 -39
- pipecat/transports/network/webrtc_connection.py +214 -8
- pipecat/transports/network/websocket_client.py +171 -1
- pipecat/transports/network/websocket_server.py +147 -9
- pipecat/transports/services/daily.py +792 -70
- pipecat/transports/services/helpers/daily_rest.py +122 -129
- pipecat/transports/services/livekit.py +339 -4
- pipecat/transports/services/tavus.py +273 -38
- pipecat/utils/asyncio/task_manager.py +92 -186
- pipecat/utils/base_object.py +83 -1
- pipecat/utils/network.py +2 -0
- pipecat/utils/string.py +114 -58
- pipecat/utils/text/base_text_aggregator.py +44 -13
- pipecat/utils/text/base_text_filter.py +46 -0
- pipecat/utils/text/markdown_text_filter.py +70 -14
- pipecat/utils/text/pattern_pair_aggregator.py +18 -14
- pipecat/utils/text/simple_text_aggregator.py +43 -2
- pipecat/utils/text/skip_tags_aggregator.py +21 -13
- pipecat/utils/time.py +36 -0
- pipecat/utils/tracing/class_decorators.py +32 -7
- pipecat/utils/tracing/conversation_context_provider.py +12 -2
- pipecat/utils/tracing/service_attributes.py +80 -64
- pipecat/utils/tracing/service_decorators.py +48 -21
- pipecat/utils/tracing/setup.py +13 -7
- pipecat/utils/tracing/turn_context_provider.py +12 -2
- pipecat/utils/tracing/turn_trace_observer.py +27 -0
- pipecat/utils/utils.py +14 -14
- dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
- pipecat/examples/daily_runner.py +0 -64
- pipecat/examples/run.py +0 -265
- pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
- pipecat/utils/asyncio/watchdog_event.py +0 -42
- pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
- pipecat/utils/asyncio/watchdog_queue.py +0 -48
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
- /pipecat/{examples → extensions}/__init__.py +0 -0
pipecat/services/soniox/stt.py
ADDED
@@ -0,0 +1,398 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+"""Soniox speech-to-text service implementation."""
+
+import asyncio
+import json
+import time
+from typing import AsyncGenerator, List, Optional
+
+from loguru import logger
+from pydantic import BaseModel
+
+from pipecat.frames.frames import (
+    CancelFrame,
+    EndFrame,
+    ErrorFrame,
+    Frame,
+    InterimTranscriptionFrame,
+    StartFrame,
+    TranscriptionFrame,
+    UserStoppedSpeakingFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.stt_service import STTService
+from pipecat.transcriptions.language import Language
+from pipecat.utils.time import time_now_iso8601
+from pipecat.utils.tracing.service_decorators import traced_stt
+
+try:
+    import websockets
+    from websockets.asyncio.client import connect as websocket_connect
+    from websockets.protocol import State
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error("In order to use Soniox, you need to `pip install pipecat-ai[soniox]`.")
+    raise Exception(f"Missing module: {e}")
+
+
+KEEPALIVE_MESSAGE = '{"type": "keepalive"}'
+
+FINALIZE_MESSAGE = '{"type": "finalize"}'
+
+END_TOKEN = "<end>"
+
+FINALIZED_TOKEN = "<fin>"
+
+
+class SonioxInputParams(BaseModel):
+    """Real-time transcription settings.
+
+    See Soniox WebSocket API documentation for more details:
+    https://soniox.com/docs/speech-to-text/api-reference/websocket-api#configuration-parameters
+
+    Parameters:
+        model: Model to use for transcription.
+        audio_format: Audio format to use for transcription.
+        num_channels: Number of channels to use for transcription.
+        language_hints: List of language hints to use for transcription.
+        context: Customization for transcription.
+        enable_non_final_tokens: Whether to enable non-final tokens. If false, only final tokens will be returned.
+        max_non_final_tokens_duration_ms: Maximum duration of non-final tokens.
+        client_reference_id: Client reference ID to use for transcription.
+    """
+
+    model: str = "stt-rt-preview"
+
+    audio_format: Optional[str] = "pcm_s16le"
+    num_channels: Optional[int] = 1
+
+    language_hints: Optional[List[Language]] = None
+    context: Optional[str] = None
+
+    enable_non_final_tokens: Optional[bool] = True
+    max_non_final_tokens_duration_ms: Optional[int] = None
+
+    client_reference_id: Optional[str] = None
+
+
+def is_end_token(token: dict) -> bool:
+    """Determine if a token is an end token."""
+    return token["text"] == END_TOKEN or token["text"] == FINALIZED_TOKEN
+
+
+def language_to_soniox_language(language: Language) -> str:
+    """Pipecat Language enum uses the same ISO 2-letter codes as Soniox, except with added regional variants.
+
+    For a list of all supported languages, see: https://soniox.com/docs/speech-to-text/core-concepts/supported-languages
+    """
+    lang_str = str(language.value).lower()
+    if "-" in lang_str:
+        return lang_str.split("-")[0]
+    return lang_str
+
+
+def _prepare_language_hints(
+    language_hints: Optional[List[Language]],
+) -> Optional[List[str]]:
+    if language_hints is None:
+        return None
+
+    prepared_languages = [language_to_soniox_language(lang) for lang in language_hints]
+    # Remove duplicates (in case of language_hints with multiple regions).
+    return list(set(prepared_languages))
+
+
+class SonioxSTTService(STTService):
+    """Speech-to-Text service using Soniox's WebSocket API.
+
+    This service connects to Soniox's WebSocket API for real-time transcription
+    with support for multiple languages, custom context, speaker diarization,
+    and more.
+
+    For complete API documentation, see: https://soniox.com/docs/speech-to-text/api-reference/websocket-api
+    """
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        url: str = "wss://stt-rt.soniox.com/transcribe-websocket",
+        sample_rate: Optional[int] = None,
+        params: Optional[SonioxInputParams] = None,
+        vad_force_turn_endpoint: bool = False,
+        **kwargs,
+    ):
+        """Initialize the Soniox STT service.
+
+        Args:
+            api_key: Soniox API key.
+            url: Soniox WebSocket API URL.
+            sample_rate: Audio sample rate.
+            params: Additional configuration parameters, such as language hints, context and
+                speaker diarization.
+            vad_force_turn_endpoint: Listen to `UserStoppedSpeakingFrame` to send a finalize
+                message to Soniox. If disabled, Soniox will detect the end of speech.
+            **kwargs: Additional arguments passed to the STTService.
+        """
+        super().__init__(sample_rate=sample_rate, **kwargs)
+        params = params or SonioxInputParams()
+
+        self._api_key = api_key
+        self._url = url
+        self.set_model_name(params.model)
+        self._params = params
+        self._vad_force_turn_endpoint = vad_force_turn_endpoint
+        self._websocket = None
+
+        self._final_transcription_buffer = []
+        self._last_tokens_received: Optional[float] = None
+
+        self._receive_task = None
+        self._keepalive_task = None
+
+    async def start(self, frame: StartFrame):
+        """Start the Soniox STT websocket connection.
+
+        Args:
+            frame: The start frame containing initialization parameters.
+        """
+        await super().start(frame)
+        if self._websocket:
+            return
+
+        self._websocket = await websocket_connect(self._url)
+
+        if not self._websocket:
+            logger.error(f"Unable to connect to Soniox API at {self._url}")
+
+        # If vad_force_turn_endpoint is not enabled, we need to enable endpoint detection.
+        # Either one or the other is required.
+        enable_endpoint_detection = not self._vad_force_turn_endpoint
+
+        # Send the initial configuration message.
+        config = {
+            "api_key": self._api_key,
+            "model": self._model_name,
+            "audio_format": self._params.audio_format,
+            "num_channels": self._params.num_channels or 1,
+            "enable_endpoint_detection": enable_endpoint_detection,
+            "sample_rate": self.sample_rate,
+            "language_hints": _prepare_language_hints(self._params.language_hints),
+            "context": self._params.context,
+            "enable_non_final_tokens": self._params.enable_non_final_tokens,
+            "max_non_final_tokens_duration_ms": self._params.max_non_final_tokens_duration_ms,
+            "client_reference_id": self._params.client_reference_id,
+        }
+
+        # Send the configuration message.
+        await self._websocket.send(json.dumps(config))
+
+        if self._websocket and not self._receive_task:
+            self._receive_task = self.create_task(self._receive_task_handler())
+        if self._websocket and not self._keepalive_task:
+            self._keepalive_task = self.create_task(self._keepalive_task_handler())
+
+    async def _cleanup(self):
+        if self._keepalive_task:
+            await self.cancel_task(self._keepalive_task)
+            self._keepalive_task = None
+
+        if self._websocket:
+            await self._websocket.close()
+            self._websocket = None
+
+        if self._receive_task:
+            # Task cannot cancel itself. If the task called _cleanup() we expect it to cancel itself.
+            if self._receive_task != asyncio.current_task():
+                await self._receive_task
+            self._receive_task = None
+
+    async def stop(self, frame: EndFrame):
+        """Stop the Soniox STT websocket connection.
+
+        Stopping waits for the server to close the connection as we might receive
+        additional final tokens after sending the stop recording message.
+
+        Args:
+            frame: The end frame.
+        """
+        await super().stop(frame)
+        await self._send_stop_recording()
+
+    async def cancel(self, frame: CancelFrame):
+        """Cancel the Soniox STT websocket connection.
+
+        Compared to stop, this method closes the connection immediately without waiting
+        for the server to close it. This is useful when we want to stop the connection
+        immediately without waiting for the server to send any final tokens.
+
+        Args:
+            frame: The cancel frame.
+        """
+        await super().cancel(frame)
+        await self._cleanup()
+
+    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
+        """Send audio data to the Soniox STT service.
+
+        Args:
+            audio: Raw audio bytes to transcribe.
+
+        Yields:
+            Frame: None (transcription results come via WebSocket callbacks).
+        """
+        await self.start_processing_metrics()
+        if self._websocket and self._websocket.state is State.OPEN:
+            await self._websocket.send(audio)
+        await self.stop_processing_metrics()
+
+        yield None
+
+    @traced_stt
+    async def _handle_transcription(
+        self, transcript: str, is_final: bool, language: Optional[Language] = None
+    ):
+        """Handle a transcription result with tracing."""
+        pass
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process a frame of audio data, either buffering or transcribing it.
+
+        Args:
+            frame: The frame to process.
+            direction: The direction of frame processing.
+        """
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, UserStoppedSpeakingFrame) and self._vad_force_turn_endpoint:
+            # Send a finalize message to Soniox so we get the final tokens ASAP.
+            if self._websocket and self._websocket.state is State.OPEN:
+                await self._websocket.send(FINALIZE_MESSAGE)
+                logger.debug(f"Triggered finalize event on: {frame.name=}, {direction=}")
+
+    async def _send_stop_recording(self):
+        """Send a stop recording message to Soniox."""
+        if self._websocket and self._websocket.state is State.OPEN:
+            # An empty message tells Soniox to stop recording.
+            await self._websocket.send("")
+
+    async def _keepalive_task_handler(self):
+        """Connection has to be open all the time."""
+        try:
+            while True:
+                logger.trace("Sending keepalive message")
+                if self._websocket and self._websocket.state is State.OPEN:
+                    await self._websocket.send(KEEPALIVE_MESSAGE)
+                else:
+                    logger.debug("WebSocket connection closed.")
+                    break
+                await asyncio.sleep(5)
+
+        except websockets.exceptions.ConnectionClosed:
+            # Expected when closing the connection.
+            logger.debug("WebSocket connection closed, keepalive task stopped.")
+        except Exception as e:
+            logger.error(f"{self} error (_keepalive_task_handler): {e}")
+            await self.push_error(ErrorFrame(f"{self} error (_keepalive_task_handler): {e}"))
+
+    async def _receive_task_handler(self):
+        if not self._websocket:
+            return
+
+        # A transcription frame will only be sent after we get the "endpoint" event.
+        self._final_transcription_buffer = []
+
+        async def send_endpoint_transcript():
+            if self._final_transcription_buffer:
+                text = "".join(map(lambda token: token["text"], self._final_transcription_buffer))
+                await self.push_frame(
+                    TranscriptionFrame(
+                        text=text,
+                        user_id=self._user_id,
+                        timestamp=time_now_iso8601(),
+                        result=self._final_transcription_buffer,
+                    )
+                )
+                await self._handle_transcription(text, is_final=True)
+                await self.stop_processing_metrics()
+                self._final_transcription_buffer = []
+
+        try:
+            async for message in self._websocket:
+                content = json.loads(message)
+
+                tokens = content["tokens"]
+
+                if tokens:
+                    if len(tokens) == 1 and tokens[0]["text"] == FINALIZED_TOKEN:
+                        # Ignore the finalized token to prevent auto-finalize cycling.
+                        pass
+                    else:
+                        # Got at least one token, so we can reset the auto-finalize delay.
+                        self._last_tokens_received = time.time()
+
+                # We will only send the final tokens after we get the "endpoint" event.
+                non_final_transcription = []
+
+                for token in tokens:
+                    if token["is_final"]:
+                        if is_end_token(token):
+                            # Found an endpoint: tokens until here will be sent as a transcript,
+                            # the rest will be sent as interim tokens (even final tokens).
+                            await send_endpoint_transcript()
+                        else:
+                            self._final_transcription_buffer.append(token)
+                    else:
+                        non_final_transcription.append(token)
+
+                if self._final_transcription_buffer or non_final_transcription:
+                    final_text = "".join(
+                        map(lambda token: token["text"], self._final_transcription_buffer)
+                    )
+                    non_final_text = "".join(
+                        map(lambda token: token["text"], non_final_transcription)
+                    )
+
+                    await self.push_frame(
+                        InterimTranscriptionFrame(
+                            # Even final tokens are sent as interim tokens as we want to send
+                            # nicely formatted messages - therefore waiting for the endpoint.
+                            text=final_text + non_final_text,
+                            user_id=self._user_id,
+                            timestamp=time_now_iso8601(),
+                            result=self._final_transcription_buffer + non_final_transcription,
+                        )
+                    )
+
+                error_code = content.get("error_code")
+                error_message = content.get("error_message")
+                if error_code or error_message:
+                    # In case of an error, still send the final transcript (if any remains in the buffer).
+                    await send_endpoint_transcript()
+                    logger.error(
+                        f"{self} error: {error_code} (_receive_task_handler) - {error_message}"
+                    )
+                    await self.push_error(
+                        ErrorFrame(
+                            f"{self} error: {error_code} (_receive_task_handler) - {error_message}"
+                        )
+                    )
+
+                finished = content.get("finished")
+                if finished:
+                    # When finished, still send the final transcript (if any remains in the buffer).
+                    await send_endpoint_transcript()
+                    logger.debug("Transcription finished.")
+                    await self._cleanup()
+                    return
+
+        except websockets.exceptions.ConnectionClosed:
+            # Expected when closing the connection.
+            pass
+        except Exception as e:
+            logger.error(f"{self} error: {e}")
+            await self.push_error(ErrorFrame(f"{self} error: {e}"))
pipecat/services/speechmatics/stt.py
CHANGED
@@ -23,6 +23,7 @@ from pipecat.frames.frames import (
     BotInterruptionFrame,
     CancelFrame,
     EndFrame,
+    ErrorFrame,
     Frame,
     InterimTranscriptionFrame,
     StartFrame,
@@ -463,8 +464,14 @@ class SpeechmaticsSTTService(STTService):
 
     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
         """Adds audio to the audio buffer and yields None."""
-
-
+        try:
+            if self._client:
+                await self._client.send_audio(audio)
+            yield None
+        except Exception as e:
+            logger.error(f"Speechmatics error: {e}")
+            yield ErrorFrame(f"Speechmatics error: {e}", fatal=False)
+            await self._disconnect()
 
     def update_params(
         self,
@@ -520,7 +527,7 @@ class SpeechmaticsSTTService(STTService):
         )
 
         # Log the event
-        logger.debug("
+        logger.debug(f"{self} Connecting to Speechmatics STT service")
 
         # Recognition started event
         @self._client.on(ServerMessageType.RECOGNITION_STARTED)
@@ -562,31 +569,36 @@ class SpeechmaticsSTTService(STTService):
         )
 
         # Start session
-
-
-
-
-
-
-
-
+        try:
+            await self._client.start_session(
+                transcription_config=self._transcription_config,
+                audio_format=AudioFormat(
+                    encoding=self._params.audio_encoding,
+                    sample_rate=self.sample_rate,
+                    chunk_size=self._params.chunk_size,
+                ),
+            )
+            logger.debug(f"{self} Connected to Speechmatics STT service")
+        except Exception as e:
+            logger.error(f"{self} Error connecting to Speechmatics: {e}")
+        finally:
+            self._client = None
 
     async def _disconnect(self) -> None:
         """Disconnect from the STT service."""
         # Disconnect the client
+        self.logger.debug(f"{self} Disconnecting from Speechmatics STT service")
         try:
             if self._client:
-                await asyncio.wait_for(self._client.close(), timeout=
+                await asyncio.wait_for(self._client.close(), timeout=5.0)
+                self.logger.debug(f"{self} Disconnected from Speechmatics STT service")
         except asyncio.TimeoutError:
-            logger.warning("Timeout while closing Speechmatics client connection")
+            logger.warning(f"{self} Timeout while closing Speechmatics client connection")
         except Exception as e:
-            logger.error(f"Error closing Speechmatics client: {e}")
+            logger.error(f"{self} Error closing Speechmatics client: {e}")
         finally:
             self._client = None
 
-        # Log the event
-        logger.debug("Disconnected from Speechmatics STT service")
-
     def _process_config(self) -> None:
         """Create a formatted STT transcription config.
 
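The run_stt change above reflects a convention worth noting: STT services report send failures by yielding a non-fatal ErrorFrame from the generator instead of raising, so the pipeline keeps running while the service tears down its connection. A self-contained toy model of that contract (the frame class here is a stand-in, not Pipecat's):

import asyncio
from dataclasses import dataclass
from typing import AsyncGenerator, Optional

@dataclass
class ErrorFrame:  # stand-in for pipecat.frames.frames.ErrorFrame
    error: str
    fatal: bool = False

async def run_stt(audio: bytes, fail: bool) -> AsyncGenerator[Optional[ErrorFrame], None]:
    # Mirror the pattern above: catch the failure and yield a non-fatal
    # error frame downstream, letting the caller keep consuming frames.
    try:
        if fail:
            raise ConnectionError("socket closed")
        yield None
    except Exception as e:
        yield ErrorFrame(f"Speechmatics error: {e}", fatal=False)

async def main():
    async for frame in run_stt(b"\x00\x01", fail=True):
        if isinstance(frame, ErrorFrame):
            print("pushed downstream:", frame)  # the pipeline loop continues

asyncio.run(main())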
pipecat/services/stt_service.py
CHANGED
@@ -34,13 +34,6 @@ class STTService(AIService):
     Provides common functionality for STT services including audio passthrough,
     muting, settings management, and audio processing. Subclasses must implement
     the run_stt method to provide actual speech recognition.
-
-    Args:
-        audio_passthrough: Whether to pass audio frames downstream after processing.
-            Defaults to True.
-        sample_rate: The sample rate for audio input. If None, will be determined
-            from the start frame.
-        **kwargs: Additional arguments passed to the parent AIService.
     """
 
     def __init__(
@@ -50,15 +43,26 @@ class STTService(AIService):
         sample_rate: Optional[int] = None,
         **kwargs,
     ):
+        """Initialize the STT service.
+
+        Args:
+            audio_passthrough: Whether to pass audio frames downstream after processing.
+                Defaults to True.
+            sample_rate: The sample rate for audio input. If None, will be determined
+                from the start frame.
+            **kwargs: Additional arguments passed to the parent AIService.
+        """
         super().__init__(**kwargs)
         self._audio_passthrough = audio_passthrough
         self._init_sample_rate = sample_rate
         self._sample_rate = 0
         self._settings: Dict[str, Any] = {}
+        self._tracing_enabled: bool = False
         self._muted: bool = False
         # Custom fields from ai_services.py for voicemail and first speech handling
         self._first_speech_handled: bool = False
         self._voicemail_detect: bool = False
+        self._user_id: str = ""
 
     @property
     def is_muted(self) -> bool:
@@ -119,6 +123,7 @@ class STTService(AIService):
         self._sample_rate = self._init_sample_rate or frame.audio_in_sample_rate
         if hasattr(frame, "metadata") and "voicemail_detect" in frame.metadata:
             self._voicemail_detect = frame.metadata["voicemail_detect"]
+        self._tracing_enabled = frame.enable_tracing
 
     async def _update_settings(self, settings: Mapping[str, Any]):
         self.logger.info(f"Updating STT settings: {self._settings}")
@@ -138,6 +143,11 @@ class STTService(AIService):
     async def process_audio_frame(self, frame: AudioRawFrame, direction: FrameDirection):
         """Process an audio frame for speech recognition.
 
+        If the service is muted, this method does nothing. Otherwise, it
+        processes the audio frame and runs speech-to-text on it, yielding
+        transcription results. If the frame has a user_id, it is stored
+        for later use in transcription.
+
         Args:
             frame: The audio frame to process.
             direction: The direction of frame processing.
@@ -146,6 +156,21 @@ class STTService(AIService):
         # If first speech is handled, we dont need to worry anymore.
         if self._muted and ((not self._voicemail_detect) or self._first_speech_handled):
             return
+
+        # UserAudioRawFrame contains a user_id (e.g. Daily, Livekit)
+        if hasattr(frame, "user_id"):
+            self._user_id = frame.user_id
+        # AudioRawFrame does not have a user_id (e.g. SmallWebRTCTransport, websockets)
+        else:
+            self._user_id = ""
+
+        if not frame.audio:
+            # Ignoring in case we don't have audio to transcribe.
+            logger.warning(
+                f"Empty audio frame received for STT service: {self.name} {frame.num_frames}"
+            )
+            return
+
         await self.process_generator(self.run_stt(frame.audio))
 
     async def process_frame(self, frame: Frame, direction: FrameDirection):
@@ -187,14 +212,16 @@ class SegmentedSTTService(STTService):
     Requires VAD to be enabled in the pipeline to function properly. Maintains a
     small audio buffer to account for the delay between actual speech start and
     VAD detection.
+    """
+
+    def __init__(self, *, sample_rate: Optional[int] = None, **kwargs):
+        """Initialize the segmented STT service.
 
-
+        Args:
             sample_rate: The sample rate for audio input. If None, will be determined
                 from the start frame.
             **kwargs: Additional arguments passed to the parent STTService.
-
-
-    def __init__(self, *, sample_rate: Optional[int] = None, **kwargs):
+        """
         super().__init__(sample_rate=sample_rate, **kwargs)
         self._content = None
         self._wave = None
@@ -251,10 +278,19 @@ class SegmentedSTTService(STTService):
         Continuously buffers audio, growing the buffer while user is speaking and
         maintaining a small buffer when not speaking to account for VAD delay.
 
+        If the frame has a user_id, it is stored for later use in transcription.
+
         Args:
             frame: The audio frame to process.
             direction: The direction of frame processing.
         """
+        # UserAudioRawFrame contains a user_id (e.g. Daily, Livekit)
+        if hasattr(frame, "user_id"):
+            self._user_id = frame.user_id
+        # AudioRawFrame does not have a user_id (e.g. SmallWebRTCTransport, websockets)
+        else:
+            self._user_id = ""
+
         # If the user is speaking the audio buffer will keep growing.
         self._audio_buffer += frame.audio
 
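The _user_id plumbing added above is what lets services such as the Soniox implementation earlier in this diff stamp TranscriptionFrame(user_id=...) with the right participant on multi-user transports. A self-contained toy model of the dispatch rule (the frame classes here are stand-ins, not Pipecat's):

# Toy model of the user_id rule added above: UserAudioRawFrame-style frames
# carry a user_id attribute, plain AudioRawFrame-style frames do not.
class AudioRawFrame:
    def __init__(self, audio: bytes):
        self.audio = audio

class UserAudioRawFrame(AudioRawFrame):
    def __init__(self, audio: bytes, user_id: str):
        super().__init__(audio)
        self.user_id = user_id

def resolve_user_id(frame: AudioRawFrame) -> str:
    # Same fallback as the diff: transports with no notion of users
    # (e.g. SmallWebRTCTransport, plain websockets) yield an empty id.
    return frame.user_id if hasattr(frame, "user_id") else ""

assert resolve_user_id(UserAudioRawFrame(b"\x00", "daily-participant-42")) == "daily-participant-42"
assert resolve_user_id(AudioRawFrame(b"\x00")) == ""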