dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
- pipecat/adapters/base_llm_adapter.py +38 -1
- pipecat/adapters/services/anthropic_adapter.py +9 -14
- pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
- pipecat/adapters/services/bedrock_adapter.py +236 -13
- pipecat/adapters/services/gemini_adapter.py +12 -8
- pipecat/adapters/services/open_ai_adapter.py +19 -7
- pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
- pipecat/audio/dtmf/dtmf-0.wav +0 -0
- pipecat/audio/dtmf/dtmf-1.wav +0 -0
- pipecat/audio/dtmf/dtmf-2.wav +0 -0
- pipecat/audio/dtmf/dtmf-3.wav +0 -0
- pipecat/audio/dtmf/dtmf-4.wav +0 -0
- pipecat/audio/dtmf/dtmf-5.wav +0 -0
- pipecat/audio/dtmf/dtmf-6.wav +0 -0
- pipecat/audio/dtmf/dtmf-7.wav +0 -0
- pipecat/audio/dtmf/dtmf-8.wav +0 -0
- pipecat/audio/dtmf/dtmf-9.wav +0 -0
- pipecat/audio/dtmf/dtmf-pound.wav +0 -0
- pipecat/audio/dtmf/dtmf-star.wav +0 -0
- pipecat/audio/filters/krisp_viva_filter.py +193 -0
- pipecat/audio/filters/noisereduce_filter.py +15 -0
- pipecat/audio/turn/base_turn_analyzer.py +9 -1
- pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
- pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
- pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
- pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
- pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
- pipecat/audio/vad/data/README.md +10 -0
- pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
- pipecat/audio/vad/silero.py +9 -3
- pipecat/audio/vad/vad_analyzer.py +13 -1
- pipecat/extensions/voicemail/voicemail_detector.py +5 -5
- pipecat/frames/frames.py +277 -86
- pipecat/observers/loggers/debug_log_observer.py +3 -3
- pipecat/observers/loggers/llm_log_observer.py +7 -3
- pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
- pipecat/pipeline/runner.py +18 -6
- pipecat/pipeline/service_switcher.py +64 -36
- pipecat/pipeline/task.py +125 -79
- pipecat/pipeline/tts_switcher.py +30 -0
- pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
- pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
- pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
- pipecat/processors/aggregators/llm_context.py +40 -2
- pipecat/processors/aggregators/llm_response.py +32 -15
- pipecat/processors/aggregators/llm_response_universal.py +19 -15
- pipecat/processors/aggregators/user_response.py +6 -6
- pipecat/processors/aggregators/vision_image_frame.py +24 -2
- pipecat/processors/audio/audio_buffer_processor.py +43 -8
- pipecat/processors/dtmf_aggregator.py +174 -77
- pipecat/processors/filters/stt_mute_filter.py +17 -0
- pipecat/processors/frame_processor.py +110 -24
- pipecat/processors/frameworks/langchain.py +8 -2
- pipecat/processors/frameworks/rtvi.py +210 -68
- pipecat/processors/frameworks/strands_agents.py +170 -0
- pipecat/processors/logger.py +2 -2
- pipecat/processors/transcript_processor.py +26 -5
- pipecat/processors/user_idle_processor.py +35 -11
- pipecat/runner/daily.py +59 -20
- pipecat/runner/run.py +395 -93
- pipecat/runner/types.py +6 -4
- pipecat/runner/utils.py +51 -10
- pipecat/serializers/__init__.py +5 -1
- pipecat/serializers/asterisk.py +16 -2
- pipecat/serializers/convox.py +41 -4
- pipecat/serializers/custom.py +257 -0
- pipecat/serializers/exotel.py +5 -5
- pipecat/serializers/livekit.py +20 -0
- pipecat/serializers/plivo.py +5 -5
- pipecat/serializers/protobuf.py +6 -5
- pipecat/serializers/telnyx.py +2 -2
- pipecat/serializers/twilio.py +43 -23
- pipecat/serializers/vi.py +324 -0
- pipecat/services/ai_service.py +2 -6
- pipecat/services/anthropic/llm.py +2 -25
- pipecat/services/assemblyai/models.py +6 -0
- pipecat/services/assemblyai/stt.py +13 -5
- pipecat/services/asyncai/tts.py +5 -3
- pipecat/services/aws/__init__.py +1 -0
- pipecat/services/aws/llm.py +147 -105
- pipecat/services/aws/nova_sonic/__init__.py +0 -0
- pipecat/services/aws/nova_sonic/context.py +436 -0
- pipecat/services/aws/nova_sonic/frames.py +25 -0
- pipecat/services/aws/nova_sonic/llm.py +1265 -0
- pipecat/services/aws/stt.py +3 -3
- pipecat/services/aws_nova_sonic/__init__.py +19 -1
- pipecat/services/aws_nova_sonic/aws.py +11 -1151
- pipecat/services/aws_nova_sonic/context.py +8 -354
- pipecat/services/aws_nova_sonic/frames.py +13 -17
- pipecat/services/azure/llm.py +51 -1
- pipecat/services/azure/realtime/__init__.py +0 -0
- pipecat/services/azure/realtime/llm.py +65 -0
- pipecat/services/azure/stt.py +15 -0
- pipecat/services/cartesia/stt.py +77 -70
- pipecat/services/cartesia/tts.py +80 -13
- pipecat/services/deepgram/__init__.py +1 -0
- pipecat/services/deepgram/flux/__init__.py +0 -0
- pipecat/services/deepgram/flux/stt.py +640 -0
- pipecat/services/elevenlabs/__init__.py +4 -1
- pipecat/services/elevenlabs/stt.py +339 -0
- pipecat/services/elevenlabs/tts.py +87 -46
- pipecat/services/fish/tts.py +5 -2
- pipecat/services/gemini_multimodal_live/events.py +38 -524
- pipecat/services/gemini_multimodal_live/file_api.py +23 -173
- pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
- pipecat/services/gladia/stt.py +56 -72
- pipecat/services/google/__init__.py +1 -0
- pipecat/services/google/gemini_live/__init__.py +3 -0
- pipecat/services/google/gemini_live/file_api.py +189 -0
- pipecat/services/google/gemini_live/llm.py +1582 -0
- pipecat/services/google/gemini_live/llm_vertex.py +184 -0
- pipecat/services/google/llm.py +15 -11
- pipecat/services/google/llm_openai.py +3 -3
- pipecat/services/google/llm_vertex.py +86 -16
- pipecat/services/google/stt.py +4 -0
- pipecat/services/google/tts.py +7 -3
- pipecat/services/heygen/api.py +2 -0
- pipecat/services/heygen/client.py +8 -4
- pipecat/services/heygen/video.py +2 -0
- pipecat/services/hume/__init__.py +5 -0
- pipecat/services/hume/tts.py +220 -0
- pipecat/services/inworld/tts.py +6 -6
- pipecat/services/llm_service.py +15 -5
- pipecat/services/lmnt/tts.py +4 -2
- pipecat/services/mcp_service.py +4 -2
- pipecat/services/mem0/memory.py +6 -5
- pipecat/services/mistral/llm.py +29 -8
- pipecat/services/moondream/vision.py +42 -16
- pipecat/services/neuphonic/tts.py +5 -2
- pipecat/services/openai/__init__.py +1 -0
- pipecat/services/openai/base_llm.py +27 -20
- pipecat/services/openai/realtime/__init__.py +0 -0
- pipecat/services/openai/realtime/context.py +272 -0
- pipecat/services/openai/realtime/events.py +1106 -0
- pipecat/services/openai/realtime/frames.py +37 -0
- pipecat/services/openai/realtime/llm.py +829 -0
- pipecat/services/openai/tts.py +49 -10
- pipecat/services/openai_realtime/__init__.py +27 -0
- pipecat/services/openai_realtime/azure.py +21 -0
- pipecat/services/openai_realtime/context.py +21 -0
- pipecat/services/openai_realtime/events.py +21 -0
- pipecat/services/openai_realtime/frames.py +21 -0
- pipecat/services/openai_realtime_beta/azure.py +16 -0
- pipecat/services/openai_realtime_beta/openai.py +17 -5
- pipecat/services/piper/tts.py +7 -9
- pipecat/services/playht/tts.py +34 -4
- pipecat/services/rime/tts.py +12 -12
- pipecat/services/riva/stt.py +3 -1
- pipecat/services/salesforce/__init__.py +9 -0
- pipecat/services/salesforce/llm.py +700 -0
- pipecat/services/sarvam/__init__.py +7 -0
- pipecat/services/sarvam/stt.py +540 -0
- pipecat/services/sarvam/tts.py +97 -13
- pipecat/services/simli/video.py +2 -2
- pipecat/services/speechmatics/stt.py +22 -10
- pipecat/services/stt_service.py +47 -0
- pipecat/services/tavus/video.py +2 -2
- pipecat/services/tts_service.py +75 -22
- pipecat/services/vision_service.py +7 -6
- pipecat/services/vistaar/llm.py +51 -9
- pipecat/tests/utils.py +4 -4
- pipecat/transcriptions/language.py +41 -1
- pipecat/transports/base_input.py +13 -34
- pipecat/transports/base_output.py +140 -104
- pipecat/transports/daily/transport.py +199 -26
- pipecat/transports/heygen/__init__.py +0 -0
- pipecat/transports/heygen/transport.py +381 -0
- pipecat/transports/livekit/transport.py +228 -63
- pipecat/transports/local/audio.py +6 -1
- pipecat/transports/local/tk.py +11 -2
- pipecat/transports/network/fastapi_websocket.py +1 -1
- pipecat/transports/smallwebrtc/connection.py +103 -19
- pipecat/transports/smallwebrtc/request_handler.py +246 -0
- pipecat/transports/smallwebrtc/transport.py +65 -23
- pipecat/transports/tavus/transport.py +23 -12
- pipecat/transports/websocket/client.py +41 -5
- pipecat/transports/websocket/fastapi.py +21 -11
- pipecat/transports/websocket/server.py +14 -7
- pipecat/transports/whatsapp/api.py +8 -0
- pipecat/transports/whatsapp/client.py +47 -0
- pipecat/utils/base_object.py +54 -22
- pipecat/utils/redis.py +58 -0
- pipecat/utils/string.py +13 -1
- pipecat/utils/tracing/service_decorators.py +21 -21
- pipecat/serializers/genesys.py +0 -95
- pipecat/services/google/test-google-chirp.py +0 -45
- pipecat/services/openai.py +0 -698
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
- /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
pipecat/services/hume/tts.py
ADDED

```diff
@@ -0,0 +1,220 @@
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+
+"""Hume Text-to-Speech service implementation."""
+
+import base64
+import os
+from typing import Any, AsyncGenerator, Optional
+
+from loguru import logger
+from pydantic import BaseModel
+
+from pipecat.frames.frames import (
+    ErrorFrame,
+    Frame,
+    StartFrame,
+    TTSAudioRawFrame,
+    TTSStartedFrame,
+    TTSStoppedFrame,
+)
+from pipecat.services.tts_service import TTSService
+from pipecat.utils.tracing.service_decorators import traced_tts
+
+try:
+    from hume import AsyncHumeClient
+    from hume.tts import (
+        FormatPcm,
+        PostedUtterance,
+        PostedUtteranceVoiceWithId,
+    )
+except ModuleNotFoundError as e:  # pragma: no cover - import-time guidance
+    logger.error(f"Exception: {e}")
+    logger.error("In order to use Hume, you need to `pip install pipecat-ai[hume]`.")
+    raise Exception(f"Missing module: {e}")
+
+
+HUME_SAMPLE_RATE = 48_000  # Hume TTS streams at 48 kHz
+
+
+class HumeTTSService(TTSService):
+    """Hume Octave Text-to-Speech service.
+
+    Streams PCM audio via Hume's HTTP output streaming (JSON chunks) endpoint
+    using the Python SDK and emits ``TTSAudioRawFrame`` frames suitable for Pipecat transports.
+
+    Supported features:
+
+    - Generates speech from text using Hume TTS.
+    - Streams PCM audio.
+    - Supports dynamic updates of voice and synthesis parameters at runtime.
+    - Provides metrics for Time To First Byte (TTFB) and TTS usage.
+    """
+
+    class InputParams(BaseModel):
+        """Optional synthesis parameters for Hume TTS.
+
+        Parameters:
+            description: Natural-language acting directions (up to 100 characters).
+            speed: Speaking-rate multiplier (0.5-2.0).
+            trailing_silence: Seconds of silence to append at the end (0-5).
+        """
+
+        description: Optional[str] = None
+        speed: Optional[float] = None
+        trailing_silence: Optional[float] = None
+
+    def __init__(
+        self,
+        *,
+        api_key: Optional[str] = None,
+        voice_id: str,
+        params: Optional[InputParams] = None,
+        sample_rate: Optional[int] = HUME_SAMPLE_RATE,
+        **kwargs,
+    ) -> None:
+        """Initialize the HumeTTSService.
+
+        Args:
+            api_key: Hume API key. If omitted, reads the ``HUME_API_KEY`` environment variable.
+            voice_id: ID of the voice to use. Only voice IDs are supported; voice names are not.
+            params: Optional synthesis controls (acting instructions, speed, trailing silence).
+            sample_rate: Output sample rate for emitted PCM frames. Defaults to 48_000 (Hume).
+            **kwargs: Additional arguments passed to the parent class.
+        """
+        api_key = api_key or os.getenv("HUME_API_KEY")
+        if not api_key:
+            raise ValueError("HumeTTSService requires an API key (env HUME_API_KEY or api_key=)")
+
+        if sample_rate != HUME_SAMPLE_RATE:
+            logger.warning(
+                f"Hume TTS streams at {HUME_SAMPLE_RATE} Hz; configured sample_rate={sample_rate}"
+            )
+
+        super().__init__(sample_rate=sample_rate, **kwargs)
+
+        self._client = AsyncHumeClient(api_key=api_key)
+        self._params = params or HumeTTSService.InputParams()
+
+        # Store voice in the base class (mirrors other services)
+        self.set_voice(voice_id)
+
+        self._audio_bytes = b""
+
+    def can_generate_metrics(self) -> bool:
+        """Can generate metrics.
+
+        Returns:
+            True if metrics can be generated, False otherwise.
+        """
+        return True
+
+    async def start(self, frame: StartFrame) -> None:
+        """Start the service.
+
+        Args:
+            frame: The start frame.
+        """
+        await super().start(frame)
+
+    async def update_setting(self, key: str, value: Any) -> None:
+        """Runtime updates via `TTSUpdateSettingsFrame`.
+
+        Args:
+            key: The name of the setting to update. Recognized keys are:
+                - "voice_id"
+                - "description"
+                - "speed"
+                - "trailing_silence"
+            value: The new value for the setting.
+        """
+        key_l = (key or "").lower()
+
+        if key_l == "voice_id":
+            self.set_voice(str(value))
+            logger.info(f"HumeTTSService voice_id set to: {self.voice}")
+        elif key_l == "description":
+            self._params.description = None if value is None else str(value)
+        elif key_l == "speed":
+            self._params.speed = None if value is None else float(value)
+        elif key_l == "trailing_silence":
+            self._params.trailing_silence = None if value is None else float(value)
+        else:
+            # Defer unknown keys to the base class
+            await super().update_setting(key, value)
+
+    @traced_tts
+    async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate speech from text using Hume TTS.
+
+        Args:
+            text: The text to be synthesized.
+
+        Returns:
+            An async generator that yields `Frame` objects, including
+            `TTSStartedFrame`, `TTSAudioRawFrame`, `ErrorFrame`, and
+            `TTSStoppedFrame`.
+        """
+        logger.debug(f"{self}: Generating Hume TTS: [{text}]")
+
+        # Build the request payload
+        utterance_kwargs: dict[str, Any] = {
+            "text": text,
+            "voice": PostedUtteranceVoiceWithId(id=self._voice_id),
+        }
+        if self._params.description is not None:
+            utterance_kwargs["description"] = self._params.description
+        if self._params.speed is not None:
+            utterance_kwargs["speed"] = self._params.speed
+        if self._params.trailing_silence is not None:
+            utterance_kwargs["trailing_silence"] = self._params.trailing_silence
+
+        utterance = PostedUtterance(**utterance_kwargs)
+
+        # Request raw PCM chunks in the streaming JSON
+        pcm_fmt = FormatPcm(type="pcm")
+
+        await self.start_ttfb_metrics()
+        await self.start_tts_usage_metrics(text)
+        yield TTSStartedFrame()
+
+        try:
+            # Instant mode is always enabled here (not user-configurable)
+            # Hume emits mono PCM at 48 kHz; downstream can resample if needed.
+            # We buffer audio bytes before sending to prevent glitches.
+            self._audio_bytes = b""
+            async for chunk in self._client.tts.synthesize_json_streaming(
+                utterances=[utterance],
+                format=pcm_fmt,
+                instant_mode=True,
+                version="2",
+            ):
+                audio_b64 = getattr(chunk, "audio", None)
+                if not audio_b64:
+                    continue
+
+                pcm_bytes = base64.b64decode(audio_b64)
+                self._audio_bytes += pcm_bytes
+
+                # Buffer audio until we have enough to avoid glitches
+                if len(self._audio_bytes) < self.chunk_size:
+                    continue
+
+                frame = TTSAudioRawFrame(
+                    audio=self._audio_bytes,
+                    sample_rate=self.sample_rate,
+                    num_channels=1,
+                )
+
+                yield frame
+
+                self._audio_bytes = b""
+
+        except Exception as e:
+            logger.exception(f"{self} error generating TTS: {e}")
+            await self.push_error(ErrorFrame(f"Error generating TTS: {e}"))
+        finally:
+            # Ensure TTFB timer is stopped even on early failures
+            await self.stop_ttfb_metrics()
+            yield TTSStoppedFrame()
```
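A minimal usage sketch of the new service, based on the constructor shown in the diff above; the voice ID is a placeholder and the pipeline wiring is left out:

```python
# Sketch only. Assumes HUME_API_KEY is set in the environment;
# the voice ID below is a placeholder, not a real Hume voice.
from pipecat.services.hume.tts import HumeTTSService

tts = HumeTTSService(
    voice_id="YOUR-HUME-VOICE-ID",  # only voice IDs are supported, not names
    params=HumeTTSService.InputParams(
        description="calm, measured narrator",  # acting directions (<= 100 chars)
        speed=1.0,             # speaking-rate multiplier (0.5-2.0)
        trailing_silence=0.5,  # seconds of silence appended (0-5)
    ),
)
# The service emits 48 kHz mono PCM as TTSAudioRawFrame frames; place it
# between the LLM and the transport output like any other Pipecat TTSService.
```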
pipecat/services/inworld/tts.py
CHANGED

```diff
@@ -38,7 +38,7 @@ Examples::
         model="inworld-tts-1",
         streaming=True,  # Default
         params=InworldTTSService.InputParams(
-            temperature=
+            temperature=1.1,  # Optional: control synthesis variability (range: [0, 2])
         ),
     )
 
@@ -50,7 +50,7 @@ Examples::
         model="inworld-tts-1",
         streaming=False,
         params=InworldTTSService.InputParams(
-            temperature=
+            temperature=1.1,
         ),
     )
 """
@@ -123,7 +123,7 @@ class InworldTTSService(TTSService):
             model="inworld-tts-1",
             streaming=True,  # Default behavior
             params=InworldTTSService.InputParams(
-                temperature=
+                temperature=1.1,  # Add variability to speech synthesis (range: [0, 2])
             ),
         )
 
@@ -135,7 +135,7 @@ class InworldTTSService(TTSService):
             model="inworld-tts-1-max",
             streaming=False,
            params=InworldTTSService.InputParams(
-                temperature=
+                temperature=1.1,
             ),
         )
     """
@@ -144,7 +144,7 @@ class InworldTTSService(TTSService):
        """Optional input parameters for Inworld TTS configuration.
 
         Parameters:
-            temperature: Voice temperature control for synthesis variability (e.g.,
+            temperature: Voice temperature control for synthesis variability (e.g., 1.1).
                 Valid range: [0, 2]. Higher values increase variability.
 
         Note:
@@ -197,7 +197,7 @@ class InworldTTSService(TTSService):
                 - "LINEAR16" (default) - Uncompressed PCM, best quality
                 - Other formats as supported by Inworld API
             params: Optional input parameters for additional configuration. Use this to specify:
-                - temperature: Voice temperature control for variability (range: [0, 2], e.g.,
+                - temperature: Voice temperature control for variability (range: [0, 2], e.g., 1.1, optional)
             Language is automatically inferred from input text.
             **kwargs: Additional arguments passed to the parent TTSService class.
```
pipecat/services/llm_service.py
CHANGED

```diff
@@ -36,15 +36,15 @@ from pipecat.frames.frames import (
     FunctionCallResultFrame,
     FunctionCallResultProperties,
     FunctionCallsStartedFrame,
+    InterruptionFrame,
     LLMConfigureOutputFrame,
     LLMFullResponseEndFrame,
     LLMFullResponseStartFrame,
     LLMTextFrame,
     StartFrame,
-    StartInterruptionFrame,
     UserImageRequestFrame,
 )
-from pipecat.processors.aggregators.llm_context import LLMContext
+from pipecat.processors.aggregators.llm_context import LLMContext, LLMSpecificMessage
 from pipecat.processors.aggregators.llm_response import (
     LLMAssistantAggregatorParams,
     LLMUserAggregatorParams,
@@ -195,6 +195,17 @@ class LLMService(AIService):
         """
         return self._adapter
 
+    def create_llm_specific_message(self, message: Any) -> LLMSpecificMessage:
+        """Create an LLM-specific message (as opposed to a standard message) for use in an LLMContext.
+
+        Args:
+            message: The message content.
+
+        Returns:
+            A LLMSpecificMessage instance.
+        """
+        return self.get_llm_adapter().create_llm_specific_message(message)
+
     async def run_inference(self, context: LLMContext | OpenAILLMContext) -> Optional[str]:
         """Run a one-shot, out-of-band (i.e. out-of-pipeline) inference with the given LLM context.
 
@@ -269,7 +280,7 @@ class LLMService(AIService):
         """
         await super().process_frame(frame, direction)
 
-        if isinstance(frame, StartInterruptionFrame):
+        if isinstance(frame, InterruptionFrame):
             await self._handle_interruptions(frame)
         elif isinstance(frame, LLMConfigureOutputFrame):
             self._skip_tts = frame.skip_tts
@@ -286,8 +297,7 @@ class LLMService(AIService):
 
         await super().push_frame(frame, direction)
 
-    async def _handle_interruptions(self, _: StartInterruptionFrame):
-        # logger.info("In LLM Handling interruptions")
+    async def _handle_interruptions(self, _: InterruptionFrame):
         for function_name, entry in self._functions.items():
             if entry.cancel_on_interruption:
                 await self._cancel_function_call(function_name)
```
pipecat/services/lmnt/tts.py
CHANGED

```diff
@@ -16,8 +16,8 @@ from pipecat.frames.frames import (
     EndFrame,
     ErrorFrame,
     Frame,
+    InterruptionFrame,
     StartFrame,
-    StartInterruptionFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
@@ -180,7 +180,7 @@ class LmntTTSService(InterruptibleTTSService):
             direction: The direction to push the frame.
         """
         await super().push_frame(frame, direction)
-        if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
+        if isinstance(frame, (TTSStoppedFrame, InterruptionFrame)):
             self._started = False
 
     async def _connect(self):
@@ -222,6 +222,7 @@ class LmntTTSService(InterruptibleTTSService):
             # Send initialization message
             await self._websocket.send(json.dumps(init_msg))
 
+            await self._call_event_handler("on_connected")
         except Exception as e:
             logger.error(f"{self} initialization error: {e}")
             self._websocket = None
@@ -243,6 +244,7 @@ class LmntTTSService(InterruptibleTTSService):
         finally:
             self._started = False
             self._websocket = None
+            await self._call_event_handler("on_disconnected")
 
     def _get_websocket(self):
         """Get the WebSocket connection if available."""
```
pipecat/services/mcp_service.py
CHANGED

```diff
@@ -7,7 +7,7 @@
 """MCP (Model Context Protocol) client for integrating external tools with LLMs."""
 
 import json
-from typing import Any, Dict, List,
+from typing import Any, Dict, List, TypeAlias
 
 from loguru import logger
 
@@ -28,6 +28,8 @@ except ModuleNotFoundError as e:
     logger.error("In order to use an MCP client, you need to `pip install pipecat-ai[mcp]`.")
     raise Exception(f"Missing module: {e}")
 
+ServerParameters: TypeAlias = StdioServerParameters | SseServerParameters | StreamableHttpParameters
+
 
 class MCPClient(BaseObject):
     """Client for Model Context Protocol (MCP) servers.
@@ -42,7 +44,7 @@ class MCPClient(BaseObject):
 
     def __init__(
         self,
-        server_params:
+        server_params: ServerParameters,
         **kwargs,
     ):
         """Initialize the MCP client with server parameters.
```
pipecat/services/mem0/memory.py
CHANGED

```diff
@@ -16,7 +16,8 @@ from typing import Any, Dict, List, Optional
 from loguru import logger
 from pydantic import BaseModel, Field
 
-from pipecat.frames.frames import ErrorFrame, Frame, LLMMessagesFrame
+from pipecat.frames.frames import ErrorFrame, Frame, LLMContextFrame, LLMMessagesFrame
+from pipecat.processors.aggregators.llm_context import LLMContext
 from pipecat.processors.aggregators.openai_llm_context import (
     OpenAILLMContext,
     OpenAILLMContextFrame,
@@ -180,11 +181,11 @@ class Mem0MemoryService(FrameProcessor):
             logger.error(f"Error retrieving memories from Mem0: {e}")
             return []
 
-    def _enhance_context_with_memories(self, context: OpenAILLMContext, query: str):
+    def _enhance_context_with_memories(self, context: LLMContext | OpenAILLMContext, query: str):
         """Enhance the LLM context with relevant memories.
 
         Args:
-            context: The
+            context: The LLM context to enhance with memory information.
             query: The query to search for relevant memories.
         """
         # Skip if this is the same query we just processed
@@ -222,11 +223,11 @@ class Mem0MemoryService(FrameProcessor):
         context = None
         messages = None
 
-        if isinstance(frame, OpenAILLMContextFrame):
+        if isinstance(frame, (LLMContextFrame, OpenAILLMContextFrame)):
             context = frame.context
         elif isinstance(frame, LLMMessagesFrame):
             messages = frame.messages
-            context =
+            context = LLMContext(messages)
 
         if context:
             try:
```
pipecat/services/mistral/llm.py
CHANGED

```diff
@@ -57,16 +57,18 @@ class MistralLLMService(OpenAILLMService):
         logger.debug(f"Creating Mistral client with api {base_url}")
         return super().create_client(api_key, base_url, **kwargs)
 
-    def
+    def _apply_mistral_fixups(
         self, messages: List[ChatCompletionMessageParam]
     ) -> List[ChatCompletionMessageParam]:
-        """Apply
+        """Apply fixups to messages to meet Mistral-specific requirements.
 
-
-
-        -
-
-
+        1. A "tool"-role message must be followed by an assistant message.
+
+        2. "system"-role messages must only appear at the start of a
+           conversation.
+
+        3. Assistant messages must have prefix=True when they are the final
+           message in a conversation (but at no other point).
 
         Args:
             messages: The original list of messages.
@@ -80,6 +82,25 @@ class MistralLLMService(OpenAILLMService):
         # Create a copy to avoid modifying the original
         fixed_messages = [dict(msg) for msg in messages]
 
+        # Ensure all tool responses are followed by an assistant message
+        assistant_insert_indices = []
+        for i, msg in enumerate(fixed_messages):
+            if msg.get("role") == "tool":
+                # If this is the last message or the next message is not assistant
+                if i == len(fixed_messages) - 1 or fixed_messages[i + 1].get("role") != "assistant":
+                    assistant_insert_indices.append(i + 1)
+        for idx in reversed(assistant_insert_indices):
+            fixed_messages.insert(idx, {"role": "assistant", "content": " "})
+
+        # Convert any "system" messages that aren't at the start (i.e., after the initial contiguous block) to "user"
+        first_non_system_idx = next(
+            (i for i, msg in enumerate(fixed_messages) if msg.get("role") != "system"),
+            len(fixed_messages),
+        )
+        for i, msg in enumerate(fixed_messages):
+            if msg.get("role") == "system" and i >= first_non_system_idx:
+                msg["role"] = "user"
+
         # Get the last message
         last_message = fixed_messages[-1]
 
@@ -158,7 +179,7 @@ class MistralLLMService(OpenAILLMService):
         - Core completion settings
         """
         # Apply Mistral's assistant prefix requirement for API compatibility
-        fixed_messages = self.
+        fixed_messages = self._apply_mistral_fixups(params_from_context["messages"])
 
         params = {
             "model": self.model_name,
```
pipecat/services/moondream/vision.py
CHANGED

```diff
@@ -11,17 +11,20 @@ for image analysis and description generation.
 """
 
 import asyncio
-
+import base64
+from io import BytesIO
+from typing import AsyncGenerator, Optional
 
 from loguru import logger
 from PIL import Image
 
-from pipecat.frames.frames import ErrorFrame, Frame, TextFrame
+from pipecat.frames.frames import ErrorFrame, Frame, TextFrame
+from pipecat.processors.aggregators.llm_context import LLMContext
 from pipecat.services.vision_service import VisionService
 
 try:
     import torch
-    from transformers import AutoModelForCausalLM
+    from transformers import AutoModelForCausalLM
 except ModuleNotFoundError as e:
     logger.error(f"Exception: {e}")
     logger.error("In order to use Moondream, you need to `pip install pipecat-ai[moondream]`.")
@@ -94,11 +97,11 @@ class MoondreamService(VisionService):
 
         logger.debug("Loaded Moondream model")
 
-    async def run_vision(self,
+    async def run_vision(self, context: LLMContext) -> AsyncGenerator[Frame, None]:
         """Analyze an image and generate a description.
 
         Args:
-
+            context: The context to process, containing image data.
 
         Yields:
             Frame: TextFrame containing the generated image description, or ErrorFrame
@@ -109,22 +112,45 @@ class MoondreamService(VisionService):
             yield ErrorFrame("Moondream model not available")
             return
 
-
+        image_bytes = None
+        text = None
+        try:
+            messages = context.get_messages()
+            last_message = messages[-1]
+            last_message_content = last_message.get("content")
+
+            for item in last_message_content:
+                if isinstance(item, dict):
+                    if (
+                        "image_url" in item
+                        and isinstance(item["image_url"], dict)
+                        and item["image_url"].get("url")
+                    ):
+                        image_bytes = base64.b64decode(item["image_url"]["url"].split(",")[1])
+                    elif "text" in item and isinstance(item["text"], str):
+                        text = item["text"]
+
+        except Exception as e:
+            logger.error(f"Exception during image extraction: {e}")
+            yield ErrorFrame("Failed to extract image from context")
+            return
 
-
-        "
+        if not image_bytes:
+            logger.error("No image found in context")
+            yield ErrorFrame("No image found in context")
+            return
 
-
-
+        logger.debug(
+            f"Analyzing image (bytes length: {len(image_bytes) if image_bytes else 'None'})"
+        )
 
-
-
-
-        image = Image.frombytes(frame.format, frame.size, frame.image)
+        def get_image_description(bytes: bytes, text: Optional[str]) -> str:
+            image_buffer = BytesIO(bytes)
+            image = Image.open(image_buffer)
             image_embeds = self._model.encode_image(image)
-            description = self._model.query(image_embeds,
+            description = self._model.query(image_embeds, text)["answer"]
             return description
 
-        description = await asyncio.to_thread(get_image_description,
+        description = await asyncio.to_thread(get_image_description, image_bytes, text)
 
         yield TextFrame(text=description)
```
pipecat/services/neuphonic/tts.py
CHANGED

```diff
@@ -25,9 +25,9 @@ from pipecat.frames.frames import (
     EndFrame,
     ErrorFrame,
     Frame,
+    InterruptionFrame,
     LLMFullResponseEndFrame,
     StartFrame,
-    StartInterruptionFrame,
     TTSAudioRawFrame,
     TTSSpeakFrame,
     TTSStartedFrame,
@@ -224,7 +224,7 @@ class NeuphonicTTSService(InterruptibleTTSService):
             direction: The direction to push the frame.
         """
         await super().push_frame(frame, direction)
-        if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
+        if isinstance(frame, (TTSStoppedFrame, InterruptionFrame)):
             self._started = False
 
     async def process_frame(self, frame: Frame, direction: FrameDirection):
@@ -293,6 +293,8 @@ class NeuphonicTTSService(InterruptibleTTSService):
             headers = {"x-api-key": self._api_key}
 
             self._websocket = await websocket_connect(url, additional_headers=headers)
+
+            await self._call_event_handler("on_connected")
         except Exception as e:
             logger.error(f"{self} initialization error: {e}")
             self._websocket = None
@@ -311,6 +313,7 @@ class NeuphonicTTSService(InterruptibleTTSService):
         finally:
             self._started = False
             self._websocket = None
+            await self._call_event_handler("on_disconnected")
 
     async def _receive_messages(self):
         """Receive and process messages from Neuphonic WebSocket."""
```