dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
- pipecat/adapters/base_llm_adapter.py +38 -1
- pipecat/adapters/services/anthropic_adapter.py +9 -14
- pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
- pipecat/adapters/services/bedrock_adapter.py +236 -13
- pipecat/adapters/services/gemini_adapter.py +12 -8
- pipecat/adapters/services/open_ai_adapter.py +19 -7
- pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
- pipecat/audio/dtmf/dtmf-0.wav +0 -0
- pipecat/audio/dtmf/dtmf-1.wav +0 -0
- pipecat/audio/dtmf/dtmf-2.wav +0 -0
- pipecat/audio/dtmf/dtmf-3.wav +0 -0
- pipecat/audio/dtmf/dtmf-4.wav +0 -0
- pipecat/audio/dtmf/dtmf-5.wav +0 -0
- pipecat/audio/dtmf/dtmf-6.wav +0 -0
- pipecat/audio/dtmf/dtmf-7.wav +0 -0
- pipecat/audio/dtmf/dtmf-8.wav +0 -0
- pipecat/audio/dtmf/dtmf-9.wav +0 -0
- pipecat/audio/dtmf/dtmf-pound.wav +0 -0
- pipecat/audio/dtmf/dtmf-star.wav +0 -0
- pipecat/audio/filters/krisp_viva_filter.py +193 -0
- pipecat/audio/filters/noisereduce_filter.py +15 -0
- pipecat/audio/turn/base_turn_analyzer.py +9 -1
- pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
- pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
- pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
- pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
- pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
- pipecat/audio/vad/data/README.md +10 -0
- pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
- pipecat/audio/vad/silero.py +9 -3
- pipecat/audio/vad/vad_analyzer.py +13 -1
- pipecat/extensions/voicemail/voicemail_detector.py +5 -5
- pipecat/frames/frames.py +277 -86
- pipecat/observers/loggers/debug_log_observer.py +3 -3
- pipecat/observers/loggers/llm_log_observer.py +7 -3
- pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
- pipecat/pipeline/runner.py +18 -6
- pipecat/pipeline/service_switcher.py +64 -36
- pipecat/pipeline/task.py +125 -79
- pipecat/pipeline/tts_switcher.py +30 -0
- pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
- pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
- pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
- pipecat/processors/aggregators/llm_context.py +40 -2
- pipecat/processors/aggregators/llm_response.py +32 -15
- pipecat/processors/aggregators/llm_response_universal.py +19 -15
- pipecat/processors/aggregators/user_response.py +6 -6
- pipecat/processors/aggregators/vision_image_frame.py +24 -2
- pipecat/processors/audio/audio_buffer_processor.py +43 -8
- pipecat/processors/dtmf_aggregator.py +174 -77
- pipecat/processors/filters/stt_mute_filter.py +17 -0
- pipecat/processors/frame_processor.py +110 -24
- pipecat/processors/frameworks/langchain.py +8 -2
- pipecat/processors/frameworks/rtvi.py +210 -68
- pipecat/processors/frameworks/strands_agents.py +170 -0
- pipecat/processors/logger.py +2 -2
- pipecat/processors/transcript_processor.py +26 -5
- pipecat/processors/user_idle_processor.py +35 -11
- pipecat/runner/daily.py +59 -20
- pipecat/runner/run.py +395 -93
- pipecat/runner/types.py +6 -4
- pipecat/runner/utils.py +51 -10
- pipecat/serializers/__init__.py +5 -1
- pipecat/serializers/asterisk.py +16 -2
- pipecat/serializers/convox.py +41 -4
- pipecat/serializers/custom.py +257 -0
- pipecat/serializers/exotel.py +5 -5
- pipecat/serializers/livekit.py +20 -0
- pipecat/serializers/plivo.py +5 -5
- pipecat/serializers/protobuf.py +6 -5
- pipecat/serializers/telnyx.py +2 -2
- pipecat/serializers/twilio.py +43 -23
- pipecat/serializers/vi.py +324 -0
- pipecat/services/ai_service.py +2 -6
- pipecat/services/anthropic/llm.py +2 -25
- pipecat/services/assemblyai/models.py +6 -0
- pipecat/services/assemblyai/stt.py +13 -5
- pipecat/services/asyncai/tts.py +5 -3
- pipecat/services/aws/__init__.py +1 -0
- pipecat/services/aws/llm.py +147 -105
- pipecat/services/aws/nova_sonic/__init__.py +0 -0
- pipecat/services/aws/nova_sonic/context.py +436 -0
- pipecat/services/aws/nova_sonic/frames.py +25 -0
- pipecat/services/aws/nova_sonic/llm.py +1265 -0
- pipecat/services/aws/stt.py +3 -3
- pipecat/services/aws_nova_sonic/__init__.py +19 -1
- pipecat/services/aws_nova_sonic/aws.py +11 -1151
- pipecat/services/aws_nova_sonic/context.py +8 -354
- pipecat/services/aws_nova_sonic/frames.py +13 -17
- pipecat/services/azure/llm.py +51 -1
- pipecat/services/azure/realtime/__init__.py +0 -0
- pipecat/services/azure/realtime/llm.py +65 -0
- pipecat/services/azure/stt.py +15 -0
- pipecat/services/cartesia/stt.py +77 -70
- pipecat/services/cartesia/tts.py +80 -13
- pipecat/services/deepgram/__init__.py +1 -0
- pipecat/services/deepgram/flux/__init__.py +0 -0
- pipecat/services/deepgram/flux/stt.py +640 -0
- pipecat/services/elevenlabs/__init__.py +4 -1
- pipecat/services/elevenlabs/stt.py +339 -0
- pipecat/services/elevenlabs/tts.py +87 -46
- pipecat/services/fish/tts.py +5 -2
- pipecat/services/gemini_multimodal_live/events.py +38 -524
- pipecat/services/gemini_multimodal_live/file_api.py +23 -173
- pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
- pipecat/services/gladia/stt.py +56 -72
- pipecat/services/google/__init__.py +1 -0
- pipecat/services/google/gemini_live/__init__.py +3 -0
- pipecat/services/google/gemini_live/file_api.py +189 -0
- pipecat/services/google/gemini_live/llm.py +1582 -0
- pipecat/services/google/gemini_live/llm_vertex.py +184 -0
- pipecat/services/google/llm.py +15 -11
- pipecat/services/google/llm_openai.py +3 -3
- pipecat/services/google/llm_vertex.py +86 -16
- pipecat/services/google/stt.py +4 -0
- pipecat/services/google/tts.py +7 -3
- pipecat/services/heygen/api.py +2 -0
- pipecat/services/heygen/client.py +8 -4
- pipecat/services/heygen/video.py +2 -0
- pipecat/services/hume/__init__.py +5 -0
- pipecat/services/hume/tts.py +220 -0
- pipecat/services/inworld/tts.py +6 -6
- pipecat/services/llm_service.py +15 -5
- pipecat/services/lmnt/tts.py +4 -2
- pipecat/services/mcp_service.py +4 -2
- pipecat/services/mem0/memory.py +6 -5
- pipecat/services/mistral/llm.py +29 -8
- pipecat/services/moondream/vision.py +42 -16
- pipecat/services/neuphonic/tts.py +5 -2
- pipecat/services/openai/__init__.py +1 -0
- pipecat/services/openai/base_llm.py +27 -20
- pipecat/services/openai/realtime/__init__.py +0 -0
- pipecat/services/openai/realtime/context.py +272 -0
- pipecat/services/openai/realtime/events.py +1106 -0
- pipecat/services/openai/realtime/frames.py +37 -0
- pipecat/services/openai/realtime/llm.py +829 -0
- pipecat/services/openai/tts.py +49 -10
- pipecat/services/openai_realtime/__init__.py +27 -0
- pipecat/services/openai_realtime/azure.py +21 -0
- pipecat/services/openai_realtime/context.py +21 -0
- pipecat/services/openai_realtime/events.py +21 -0
- pipecat/services/openai_realtime/frames.py +21 -0
- pipecat/services/openai_realtime_beta/azure.py +16 -0
- pipecat/services/openai_realtime_beta/openai.py +17 -5
- pipecat/services/piper/tts.py +7 -9
- pipecat/services/playht/tts.py +34 -4
- pipecat/services/rime/tts.py +12 -12
- pipecat/services/riva/stt.py +3 -1
- pipecat/services/salesforce/__init__.py +9 -0
- pipecat/services/salesforce/llm.py +700 -0
- pipecat/services/sarvam/__init__.py +7 -0
- pipecat/services/sarvam/stt.py +540 -0
- pipecat/services/sarvam/tts.py +97 -13
- pipecat/services/simli/video.py +2 -2
- pipecat/services/speechmatics/stt.py +22 -10
- pipecat/services/stt_service.py +47 -0
- pipecat/services/tavus/video.py +2 -2
- pipecat/services/tts_service.py +75 -22
- pipecat/services/vision_service.py +7 -6
- pipecat/services/vistaar/llm.py +51 -9
- pipecat/tests/utils.py +4 -4
- pipecat/transcriptions/language.py +41 -1
- pipecat/transports/base_input.py +13 -34
- pipecat/transports/base_output.py +140 -104
- pipecat/transports/daily/transport.py +199 -26
- pipecat/transports/heygen/__init__.py +0 -0
- pipecat/transports/heygen/transport.py +381 -0
- pipecat/transports/livekit/transport.py +228 -63
- pipecat/transports/local/audio.py +6 -1
- pipecat/transports/local/tk.py +11 -2
- pipecat/transports/network/fastapi_websocket.py +1 -1
- pipecat/transports/smallwebrtc/connection.py +103 -19
- pipecat/transports/smallwebrtc/request_handler.py +246 -0
- pipecat/transports/smallwebrtc/transport.py +65 -23
- pipecat/transports/tavus/transport.py +23 -12
- pipecat/transports/websocket/client.py +41 -5
- pipecat/transports/websocket/fastapi.py +21 -11
- pipecat/transports/websocket/server.py +14 -7
- pipecat/transports/whatsapp/api.py +8 -0
- pipecat/transports/whatsapp/client.py +47 -0
- pipecat/utils/base_object.py +54 -22
- pipecat/utils/redis.py +58 -0
- pipecat/utils/string.py +13 -1
- pipecat/utils/tracing/service_decorators.py +21 -21
- pipecat/serializers/genesys.py +0 -95
- pipecat/services/google/test-google-chirp.py +0 -45
- pipecat/services/openai.py +0 -698
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
- /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
pipecat/services/openai/base_llm.py

@@ -32,7 +32,6 @@ from pipecat.frames.frames import (
     LLMMessagesFrame,
     LLMTextFrame,
     LLMUpdateSettingsFrame,
-    VisionImageRawFrame,
 )
 from pipecat.metrics.metrics import LLMTokenUsage
 from pipecat.processors.aggregators.llm_context import LLMContext
@@ -67,6 +66,7 @@ class BaseOpenAILLMService(LLMService):
             top_p: Top-p (nucleus) sampling parameter (0.0 to 1.0).
             max_tokens: Maximum tokens in response (deprecated, use max_completion_tokens).
             max_completion_tokens: Maximum completion tokens to generate.
+            service_tier: Service tier to use (e.g., "auto", "flex", "priority").
             extra: Additional model-specific parameters.
         """

@@ -84,6 +84,7 @@ class BaseOpenAILLMService(LLMService):
         top_p: Optional[float] = Field(default_factory=lambda: NOT_GIVEN, ge=0.0, le=1.0)
         max_tokens: Optional[int] = Field(default_factory=lambda: NOT_GIVEN, ge=1)
         max_completion_tokens: Optional[int] = Field(default_factory=lambda: NOT_GIVEN, ge=1)
+        service_tier: Optional[str] = Field(default_factory=lambda: NOT_GIVEN)
         extra: Optional[Dict[str, Any]] = Field(default_factory=dict)

     def __init__(
@@ -126,6 +127,7 @@ class BaseOpenAILLMService(LLMService):
             "top_p": params.top_p,
             "max_tokens": params.max_tokens,
             "max_completion_tokens": params.max_completion_tokens,
+            "service_tier": params.service_tier,
             "extra": params.extra if isinstance(params.extra, dict) else {},
         }
         self._retry_timeout_secs = retry_timeout_secs
@@ -237,6 +239,7 @@ class BaseOpenAILLMService(LLMService):
             "top_p": self._settings["top_p"],
             "max_tokens": self._settings["max_tokens"],
             "max_completion_tokens": self._settings["max_completion_tokens"],
+            "service_tier": self._settings["service_tier"],
         }

         # Messages, tools, tool_choice
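The `service_tier` value threads from `InputParams` through `self._settings` into the completion request payload. A hedged usage sketch: `OpenAILLMService` and its `api_key`/`model` keywords come from the broader pipecat API rather than this diff.

# Hedged usage sketch for the new service_tier parameter.
# OpenAILLMService and its constructor keywords are assumed from the
# surrounding pipecat API; only InputParams.service_tier is shown in this diff.
from pipecat.services.openai.llm import OpenAILLMService

llm = OpenAILLMService(
    api_key="sk-...",
    model="gpt-4o",
    params=OpenAILLMService.InputParams(
        service_tier="flex",  # e.g., "auto", "flex", or "priority"
    ),
)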
@@ -282,8 +285,10 @@ class BaseOpenAILLMService(LLMService):
         # base64 encode any images
         for message in messages:
             if message.get("mime_type") == "image/jpeg":
-                encoded_image = base64.b64encode(message["data"].getvalue()).decode("utf-8")
-                text = message["content"]
+                # Avoid .getvalue() which makes a full copy of BytesIO
+                raw_bytes = message["data"].read()
+                encoded_image = base64.b64encode(raw_bytes).decode("utf-8")
+                text = message.get("content", "")
                 message["content"] = [
                     {"type": "text", "text": text},
                     {
@@ -291,6 +296,7 @@ class BaseOpenAILLMService(LLMService):
                         "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
                     },
                 ]
+                # Explicit cleanup
                 del message["data"]
                 del message["mime_type"]
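For context on the new comment: `BytesIO.getvalue()` always returns the full buffer contents regardless of read position, while `read()` consumes from the current position onward. A tiny standalone illustration:

from io import BytesIO

buf = BytesIO(b"abcdef")
assert buf.getvalue() == b"abcdef"   # whole buffer, position unchanged
assert buf.read(3) == b"abc"         # consumes from the current position
assert buf.read() == b"def"          # remainder only
assert buf.getvalue() == b"abcdef"   # getvalue() still sees everything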
@@ -335,10 +341,16 @@ class BaseOpenAILLMService(LLMService):

         async for chunk in chunk_stream:
             if chunk.usage:
+                cached_tokens = (
+                    chunk.usage.prompt_tokens_details.cached_tokens
+                    if chunk.usage.prompt_tokens_details
+                    else None
+                )
                 tokens = LLMTokenUsage(
                     prompt_tokens=chunk.usage.prompt_tokens,
                     completion_tokens=chunk.usage.completion_tokens,
                     total_tokens=chunk.usage.total_tokens,
+                    cache_read_input_tokens=cached_tokens,
                 )
                 await self.start_llm_usage_metrics(tokens)
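Since `LLMTokenUsage` now carries `cache_read_input_tokens` (populated from `prompt_tokens_details.cached_tokens` when OpenAI reports it), metrics consumers can derive a prompt-cache hit rate. A minimal sketch; the constructor keywords are the ones used in the hunk above, and `cache_read_input_tokens` may be `None` when the provider omits the detail:

from pipecat.metrics.metrics import LLMTokenUsage

def prompt_cache_hit_rate(usage: LLMTokenUsage) -> float:
    # Fraction of prompt tokens served from the provider's prompt cache.
    cached = usage.cache_read_input_tokens or 0
    return cached / usage.prompt_tokens if usage.prompt_tokens else 0.0

usage = LLMTokenUsage(
    prompt_tokens=1200,
    completion_tokens=80,
    total_tokens=1280,
    cache_read_input_tokens=1024,
)
print(f"{prompt_cache_hit_rate(usage):.0%} cached")  # -> 85% cached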
@@ -417,13 +429,18 @@ class BaseOpenAILLMService(LLMService):
     async def process_frame(self, frame: Frame, direction: FrameDirection):
         """Process frames for LLM completion requests.

-        Handles OpenAILLMContextFrame, LLMContextFrame, LLMMessagesFrame,
-        VisionImageRawFrame, and LLMUpdateSettingsFrame to trigger LLM
-        completions and manage settings.
-
-        Args:
-            frame: The frame to process.
-            direction: The direction of frame processing.
+        Handles OpenAILLMContextFrame, LLMContextFrame, LLMMessagesFrame,
+        <<<<<<< HEAD
+        and LLMUpdateSettingsFrame to trigger LLM completions and manage
+        settings.
+        =======
+        VisionImageRawFrame, and LLMUpdateSettingsFrame to trigger LLM
+        completions and manage settings.
+        >>>>>>> dv-stage
+
+        Args:
+            frame: The frame to process.
+            direction: The direction of frame processing.
         """
         await super().process_frame(frame, direction)
@@ -438,16 +455,6 @@ class BaseOpenAILLMService(LLMService):
             # NOTE: LLMMessagesFrame is deprecated, so we don't support the newer universal
             # LLMContext with it
             context = OpenAILLMContext.from_messages(frame.messages)
-        elif isinstance(frame, VisionImageRawFrame):
-            # This is only useful in very simple pipelines because it creates
-            # a new context. Generally we want a context manager to catch
-            # UserImageRawFrames coming through the pipeline and add them
-            # to the context.
-            # TODO: support the newer universal LLMContext with a VisionImageRawFrame equivalent?
-            context = OpenAILLMContext()
-            context.add_image_frame_message(
-                format=frame.format, size=frame.size, image=frame.image, text=frame.text
-            )
         elif isinstance(frame, LLMUpdateSettingsFrame):
             await self._update_settings(frame.settings)
         else:
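With this branch gone, a pipeline that previously pushed a bare `VisionImageRawFrame` at the service needs to build the context itself. A hedged migration sketch using only the calls visible in the removed code; `_VisionFrameLike` is a hypothetical stand-in for the frame fields the removed branch accessed:

from dataclasses import dataclass

from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext

@dataclass
class _VisionFrameLike:
    # Hypothetical stand-in for VisionImageRawFrame's fields (format, size, image, text).
    format: str
    size: tuple
    image: bytes
    text: str

frame = _VisionFrameLike(format="RGB", size=(640, 480), image=b"...", text="What is in this image?")

# Replicate the removed shortcut outside the service, then send the
# resulting context through the pipeline instead of the raw frame.
context = OpenAILLMContext()
context.add_image_frame_message(
    format=frame.format, size=frame.size, image=frame.image, text=frame.text
)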
pipecat/services/openai/realtime/context.py (new file)

@@ -0,0 +1,272 @@
#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

"""OpenAI Realtime LLM context and aggregator implementations."""

import copy
import json

from loguru import logger

from pipecat.frames.frames import (
    Frame,
    FunctionCallResultFrame,
    InterimTranscriptionFrame,
    LLMMessagesUpdateFrame,
    LLMSetToolsFrame,
    LLMTextFrame,
    TranscriptionFrame,
)
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.openai.llm import (
    OpenAIAssistantContextAggregator,
    OpenAIUserContextAggregator,
)

from . import events
from .frames import RealtimeFunctionCallResultFrame, RealtimeMessagesUpdateFrame


class OpenAIRealtimeLLMContext(OpenAILLMContext):
    """OpenAI Realtime LLM context with session management and message conversion.

    Extends the standard OpenAI LLM context to support real-time session properties,
    instruction management, and conversion between standard message formats and
    realtime conversation items.
    """

    def __init__(self, messages=None, tools=None, **kwargs):
        """Initialize the OpenAIRealtimeLLMContext.

        Args:
            messages: Initial conversation messages. Defaults to None.
            tools: Available function tools. Defaults to None.
            **kwargs: Additional arguments passed to parent OpenAILLMContext.
        """
        super().__init__(messages=messages, tools=tools, **kwargs)
        self.__setup_local()

    def __setup_local(self):
        self.llm_needs_settings_update = True
        self.llm_needs_initial_messages = True
        self._session_instructions = ""

        return

    @staticmethod
    def upgrade_to_realtime(obj: OpenAILLMContext) -> "OpenAIRealtimeLLMContext":
        """Upgrade a standard OpenAI LLM context to a realtime context.

        Args:
            obj: The OpenAILLMContext instance to upgrade.

        Returns:
            The upgraded OpenAIRealtimeLLMContext instance.
        """
        if isinstance(obj, OpenAILLMContext) and not isinstance(obj, OpenAIRealtimeLLMContext):
            obj.__class__ = OpenAIRealtimeLLMContext
            obj.__setup_local()
        return obj

    # todo
    # - finish implementing all frames

    def from_standard_message(self, message):
        """Convert a standard message format to a realtime conversation item.

        Args:
            message: The standard message dictionary to convert.

        Returns:
            A ConversationItem instance for the realtime API.
        """
        if message.get("role") == "user":
            content = message.get("content")
            if isinstance(message.get("content"), list):
                content = ""
                for c in message.get("content"):
                    if c.get("type") == "text":
                        content += " " + c.get("text")
                    else:
                        logger.error(
                            f"Unhandled content type in context message: {c.get('type')} - {message}"
                        )
            return events.ConversationItem(
                role="user",
                type="message",
                content=[events.ItemContent(type="input_text", text=content)],
            )
        if message.get("role") == "assistant" and message.get("tool_calls"):
            tc = message.get("tool_calls")[0]
            return events.ConversationItem(
                type="function_call",
                call_id=tc["id"],
                name=tc["function"]["name"],
                arguments=tc["function"]["arguments"],
            )
        logger.error(f"Unhandled message type in from_standard_message: {message}")

    def get_messages_for_initializing_history(self):
        """Get conversation items for initializing the realtime session history.

        Converts the context's messages to a format suitable for the realtime API,
        handling system instructions and conversation history packaging.

        Returns:
            List of conversation items for session initialization.
        """
        # We can't load a long conversation history into the openai realtime api yet. (The API/model
        # forgets that it can do audio, if you do a series of `conversation.item.create` calls.) So
        # our general strategy until this is fixed is just to put everything into a first "user"
        # message as a single input.
        if not self.messages:
            return []

        messages = copy.deepcopy(self.messages)

        # If we have a "system" message as our first message, let's pull that out into session
        # "instructions"
        if messages[0].get("role") == "system":
            self.llm_needs_settings_update = True
            system = messages.pop(0)
            content = system.get("content")
            if isinstance(content, str):
                self._session_instructions = content
            elif isinstance(content, list):
                self._session_instructions = content[0].get("text")
            if not messages:
                return []

        # If we have just a single "user" item, we can just send it normally
        if len(messages) == 1 and messages[0].get("role") == "user":
            return [self.from_standard_message(messages[0])]

        # Otherwise, let's pack everything into a single "user" message with a bit of
        # explanation for the LLM
        intro_text = """
This is a previously saved conversation. Please treat this conversation history as a
starting point for the current conversation."""

        trailing_text = """
This is the end of the previously saved conversation. Please continue the conversation
from here. If the last message is a user instruction or question, act on that instruction
or answer the question. If the last message is an assistant response, simply say that you
are ready to continue the conversation."""

        return [
            {
                "role": "user",
                "type": "message",
                "content": [
                    {
                        "type": "input_text",
                        "text": "\n\n".join(
                            [intro_text, json.dumps(messages, indent=2), trailing_text]
                        ),
                    }
                ],
            }
        ]

    def add_user_content_item_as_message(self, item):
        """Add a user content item as a standard message to the context.

        Args:
            item: The conversation item to add as a user message.
        """
        message = {
            "role": "user",
            "content": [{"type": "text", "text": item.content[0].transcript}],
        }
        self.add_message(message)


class OpenAIRealtimeUserContextAggregator(OpenAIUserContextAggregator):
    """User context aggregator for OpenAI Realtime API.

    Handles user input frames and generates appropriate context updates
    for the realtime conversation, including message updates and tool settings.

    Args:
        context: The OpenAI realtime LLM context.
        **kwargs: Additional arguments passed to parent aggregator.
    """

    async def process_frame(
        self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM
    ):
        """Process incoming frames and handle realtime-specific frame types.

        Args:
            frame: The frame to process.
            direction: The direction of frame flow in the pipeline.
        """
        await super().process_frame(frame, direction)
        # Parent does not push LLMMessagesUpdateFrame. This ensures that in a typical pipeline,
        # messages are only processed by the user context aggregator, which is generally what we want. But
        # we also need to send new messages over the websocket, so the openai realtime API has them
        # in its context.
        if isinstance(frame, LLMMessagesUpdateFrame):
            await self.push_frame(RealtimeMessagesUpdateFrame(context=self._context))

        # Parent also doesn't push the LLMSetToolsFrame.
        if isinstance(frame, LLMSetToolsFrame):
            await self.push_frame(frame, direction)

    async def push_aggregation(self):
        """Push user input aggregation.

        Currently ignores all user input coming into the pipeline as realtime
        audio input is handled directly by the service.
        """
        # for the moment, ignore all user input coming into the pipeline.
        # todo: think about whether/how to fix this to allow for text input from
        # upstream (transport/transcription, or other sources)
        pass


class OpenAIRealtimeAssistantContextAggregator(OpenAIAssistantContextAggregator):
    """Assistant context aggregator for OpenAI Realtime API.

    Handles assistant output frames from the realtime service, filtering
    out duplicate text frames and managing function call results.

    Args:
        context: The OpenAI realtime LLM context.
        **kwargs: Additional arguments passed to parent aggregator.
    """

    # The LLMAssistantContextAggregator uses TextFrames to aggregate the LLM output,
    # but the OpenAIRealtimeLLMService pushes LLMTextFrames and TTSTextFrames. We
    # need to override process_frame for LLMTextFrame, so that only the TTSTextFrames
    # are processed. This ensures that the context gets only one set of messages.
    # OpenAIRealtimeLLMService also pushes TranscriptionFrames and InterimTranscriptionFrames,
    # so we need to ignore pushing those as well, as they're also TextFrames.
    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Process assistant frames, filtering out duplicate text content.

        Args:
            frame: The frame to process.
            direction: The direction of frame flow in the pipeline.
        """
        if not isinstance(frame, (LLMTextFrame, TranscriptionFrame, InterimTranscriptionFrame)):
            await super().process_frame(frame, direction)

    async def handle_function_call_result(self, frame: FunctionCallResultFrame):
        """Handle function call result and notify the realtime service.

        Args:
            frame: The function call result frame to handle.
        """
        await super().handle_function_call_result(frame)

        # The standard function callback code path pushes the FunctionCallResultFrame from the llm itself,
        # so we didn't have a chance to add the result to the openai realtime api context. Let's push a
        # special frame to do that.
        await self.push_frame(
            RealtimeFunctionCallResultFrame(result_frame=frame), FrameDirection.UPSTREAM
        )