dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic.

Files changed (195)
  1. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
  2. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
  3. pipecat/adapters/base_llm_adapter.py +38 -1
  4. pipecat/adapters/services/anthropic_adapter.py +9 -14
  5. pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
  6. pipecat/adapters/services/bedrock_adapter.py +236 -13
  7. pipecat/adapters/services/gemini_adapter.py +12 -8
  8. pipecat/adapters/services/open_ai_adapter.py +19 -7
  9. pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
  10. pipecat/audio/dtmf/dtmf-0.wav +0 -0
  11. pipecat/audio/dtmf/dtmf-1.wav +0 -0
  12. pipecat/audio/dtmf/dtmf-2.wav +0 -0
  13. pipecat/audio/dtmf/dtmf-3.wav +0 -0
  14. pipecat/audio/dtmf/dtmf-4.wav +0 -0
  15. pipecat/audio/dtmf/dtmf-5.wav +0 -0
  16. pipecat/audio/dtmf/dtmf-6.wav +0 -0
  17. pipecat/audio/dtmf/dtmf-7.wav +0 -0
  18. pipecat/audio/dtmf/dtmf-8.wav +0 -0
  19. pipecat/audio/dtmf/dtmf-9.wav +0 -0
  20. pipecat/audio/dtmf/dtmf-pound.wav +0 -0
  21. pipecat/audio/dtmf/dtmf-star.wav +0 -0
  22. pipecat/audio/filters/krisp_viva_filter.py +193 -0
  23. pipecat/audio/filters/noisereduce_filter.py +15 -0
  24. pipecat/audio/turn/base_turn_analyzer.py +9 -1
  25. pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
  26. pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
  27. pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
  28. pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
  29. pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
  30. pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
  31. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
  32. pipecat/audio/vad/data/README.md +10 -0
  33. pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
  34. pipecat/audio/vad/silero.py +9 -3
  35. pipecat/audio/vad/vad_analyzer.py +13 -1
  36. pipecat/extensions/voicemail/voicemail_detector.py +5 -5
  37. pipecat/frames/frames.py +277 -86
  38. pipecat/observers/loggers/debug_log_observer.py +3 -3
  39. pipecat/observers/loggers/llm_log_observer.py +7 -3
  40. pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
  41. pipecat/pipeline/runner.py +18 -6
  42. pipecat/pipeline/service_switcher.py +64 -36
  43. pipecat/pipeline/task.py +125 -79
  44. pipecat/pipeline/tts_switcher.py +30 -0
  45. pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
  46. pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
  47. pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
  48. pipecat/processors/aggregators/llm_context.py +40 -2
  49. pipecat/processors/aggregators/llm_response.py +32 -15
  50. pipecat/processors/aggregators/llm_response_universal.py +19 -15
  51. pipecat/processors/aggregators/user_response.py +6 -6
  52. pipecat/processors/aggregators/vision_image_frame.py +24 -2
  53. pipecat/processors/audio/audio_buffer_processor.py +43 -8
  54. pipecat/processors/dtmf_aggregator.py +174 -77
  55. pipecat/processors/filters/stt_mute_filter.py +17 -0
  56. pipecat/processors/frame_processor.py +110 -24
  57. pipecat/processors/frameworks/langchain.py +8 -2
  58. pipecat/processors/frameworks/rtvi.py +210 -68
  59. pipecat/processors/frameworks/strands_agents.py +170 -0
  60. pipecat/processors/logger.py +2 -2
  61. pipecat/processors/transcript_processor.py +26 -5
  62. pipecat/processors/user_idle_processor.py +35 -11
  63. pipecat/runner/daily.py +59 -20
  64. pipecat/runner/run.py +395 -93
  65. pipecat/runner/types.py +6 -4
  66. pipecat/runner/utils.py +51 -10
  67. pipecat/serializers/__init__.py +5 -1
  68. pipecat/serializers/asterisk.py +16 -2
  69. pipecat/serializers/convox.py +41 -4
  70. pipecat/serializers/custom.py +257 -0
  71. pipecat/serializers/exotel.py +5 -5
  72. pipecat/serializers/livekit.py +20 -0
  73. pipecat/serializers/plivo.py +5 -5
  74. pipecat/serializers/protobuf.py +6 -5
  75. pipecat/serializers/telnyx.py +2 -2
  76. pipecat/serializers/twilio.py +43 -23
  77. pipecat/serializers/vi.py +324 -0
  78. pipecat/services/ai_service.py +2 -6
  79. pipecat/services/anthropic/llm.py +2 -25
  80. pipecat/services/assemblyai/models.py +6 -0
  81. pipecat/services/assemblyai/stt.py +13 -5
  82. pipecat/services/asyncai/tts.py +5 -3
  83. pipecat/services/aws/__init__.py +1 -0
  84. pipecat/services/aws/llm.py +147 -105
  85. pipecat/services/aws/nova_sonic/__init__.py +0 -0
  86. pipecat/services/aws/nova_sonic/context.py +436 -0
  87. pipecat/services/aws/nova_sonic/frames.py +25 -0
  88. pipecat/services/aws/nova_sonic/llm.py +1265 -0
  89. pipecat/services/aws/stt.py +3 -3
  90. pipecat/services/aws_nova_sonic/__init__.py +19 -1
  91. pipecat/services/aws_nova_sonic/aws.py +11 -1151
  92. pipecat/services/aws_nova_sonic/context.py +8 -354
  93. pipecat/services/aws_nova_sonic/frames.py +13 -17
  94. pipecat/services/azure/llm.py +51 -1
  95. pipecat/services/azure/realtime/__init__.py +0 -0
  96. pipecat/services/azure/realtime/llm.py +65 -0
  97. pipecat/services/azure/stt.py +15 -0
  98. pipecat/services/cartesia/stt.py +77 -70
  99. pipecat/services/cartesia/tts.py +80 -13
  100. pipecat/services/deepgram/__init__.py +1 -0
  101. pipecat/services/deepgram/flux/__init__.py +0 -0
  102. pipecat/services/deepgram/flux/stt.py +640 -0
  103. pipecat/services/elevenlabs/__init__.py +4 -1
  104. pipecat/services/elevenlabs/stt.py +339 -0
  105. pipecat/services/elevenlabs/tts.py +87 -46
  106. pipecat/services/fish/tts.py +5 -2
  107. pipecat/services/gemini_multimodal_live/events.py +38 -524
  108. pipecat/services/gemini_multimodal_live/file_api.py +23 -173
  109. pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
  110. pipecat/services/gladia/stt.py +56 -72
  111. pipecat/services/google/__init__.py +1 -0
  112. pipecat/services/google/gemini_live/__init__.py +3 -0
  113. pipecat/services/google/gemini_live/file_api.py +189 -0
  114. pipecat/services/google/gemini_live/llm.py +1582 -0
  115. pipecat/services/google/gemini_live/llm_vertex.py +184 -0
  116. pipecat/services/google/llm.py +15 -11
  117. pipecat/services/google/llm_openai.py +3 -3
  118. pipecat/services/google/llm_vertex.py +86 -16
  119. pipecat/services/google/stt.py +4 -0
  120. pipecat/services/google/tts.py +7 -3
  121. pipecat/services/heygen/api.py +2 -0
  122. pipecat/services/heygen/client.py +8 -4
  123. pipecat/services/heygen/video.py +2 -0
  124. pipecat/services/hume/__init__.py +5 -0
  125. pipecat/services/hume/tts.py +220 -0
  126. pipecat/services/inworld/tts.py +6 -6
  127. pipecat/services/llm_service.py +15 -5
  128. pipecat/services/lmnt/tts.py +4 -2
  129. pipecat/services/mcp_service.py +4 -2
  130. pipecat/services/mem0/memory.py +6 -5
  131. pipecat/services/mistral/llm.py +29 -8
  132. pipecat/services/moondream/vision.py +42 -16
  133. pipecat/services/neuphonic/tts.py +5 -2
  134. pipecat/services/openai/__init__.py +1 -0
  135. pipecat/services/openai/base_llm.py +27 -20
  136. pipecat/services/openai/realtime/__init__.py +0 -0
  137. pipecat/services/openai/realtime/context.py +272 -0
  138. pipecat/services/openai/realtime/events.py +1106 -0
  139. pipecat/services/openai/realtime/frames.py +37 -0
  140. pipecat/services/openai/realtime/llm.py +829 -0
  141. pipecat/services/openai/tts.py +49 -10
  142. pipecat/services/openai_realtime/__init__.py +27 -0
  143. pipecat/services/openai_realtime/azure.py +21 -0
  144. pipecat/services/openai_realtime/context.py +21 -0
  145. pipecat/services/openai_realtime/events.py +21 -0
  146. pipecat/services/openai_realtime/frames.py +21 -0
  147. pipecat/services/openai_realtime_beta/azure.py +16 -0
  148. pipecat/services/openai_realtime_beta/openai.py +17 -5
  149. pipecat/services/piper/tts.py +7 -9
  150. pipecat/services/playht/tts.py +34 -4
  151. pipecat/services/rime/tts.py +12 -12
  152. pipecat/services/riva/stt.py +3 -1
  153. pipecat/services/salesforce/__init__.py +9 -0
  154. pipecat/services/salesforce/llm.py +700 -0
  155. pipecat/services/sarvam/__init__.py +7 -0
  156. pipecat/services/sarvam/stt.py +540 -0
  157. pipecat/services/sarvam/tts.py +97 -13
  158. pipecat/services/simli/video.py +2 -2
  159. pipecat/services/speechmatics/stt.py +22 -10
  160. pipecat/services/stt_service.py +47 -0
  161. pipecat/services/tavus/video.py +2 -2
  162. pipecat/services/tts_service.py +75 -22
  163. pipecat/services/vision_service.py +7 -6
  164. pipecat/services/vistaar/llm.py +51 -9
  165. pipecat/tests/utils.py +4 -4
  166. pipecat/transcriptions/language.py +41 -1
  167. pipecat/transports/base_input.py +13 -34
  168. pipecat/transports/base_output.py +140 -104
  169. pipecat/transports/daily/transport.py +199 -26
  170. pipecat/transports/heygen/__init__.py +0 -0
  171. pipecat/transports/heygen/transport.py +381 -0
  172. pipecat/transports/livekit/transport.py +228 -63
  173. pipecat/transports/local/audio.py +6 -1
  174. pipecat/transports/local/tk.py +11 -2
  175. pipecat/transports/network/fastapi_websocket.py +1 -1
  176. pipecat/transports/smallwebrtc/connection.py +103 -19
  177. pipecat/transports/smallwebrtc/request_handler.py +246 -0
  178. pipecat/transports/smallwebrtc/transport.py +65 -23
  179. pipecat/transports/tavus/transport.py +23 -12
  180. pipecat/transports/websocket/client.py +41 -5
  181. pipecat/transports/websocket/fastapi.py +21 -11
  182. pipecat/transports/websocket/server.py +14 -7
  183. pipecat/transports/whatsapp/api.py +8 -0
  184. pipecat/transports/whatsapp/client.py +47 -0
  185. pipecat/utils/base_object.py +54 -22
  186. pipecat/utils/redis.py +58 -0
  187. pipecat/utils/string.py +13 -1
  188. pipecat/utils/tracing/service_decorators.py +21 -21
  189. pipecat/serializers/genesys.py +0 -95
  190. pipecat/services/google/test-google-chirp.py +0 -45
  191. pipecat/services/openai.py +0 -698 (full removal shown in the hunk below)
  192. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
  193. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
  194. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
  195. /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
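The single largest removal in this release is pipecat/services/openai.py (698 lines, file 191 above; the full deletion appears in the hunk below), while the pipecat/services/openai/ package gains or extends base_llm.py, tts.py and a new realtime/ subpackage. The snippet below is an illustrative migration sketch only: it assumes the deleted module's classes (for example OpenAITTSService) kept their names and now live in submodules of that package; the exact new import paths are not part of this diff.

    # Hypothetical compatibility sketch -- module paths are assumptions, not taken
    # from this diff; verify them against the 0.0.85 package layout before use.
    try:
        # assumed new location inside the pipecat.services.openai package
        from pipecat.services.openai.tts import OpenAITTSService
    except ImportError:
        # pre-0.0.85 layout, where pipecat/services/openai.py provided the class
        from pipecat.services.openai import OpenAITTSService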
@@ -1,698 +0,0 @@
- #
- # Copyright (c) 2024–2025, Daily
- #
- # SPDX-License-Identifier: BSD 2-Clause License
- #
-
- import base64
- import io
- import json
- from dataclasses import dataclass
- from typing import Any, AsyncGenerator, Dict, List, Literal, Optional
-
- import aiohttp
- import httpx
- from loguru import logger
- from openai import (
-     NOT_GIVEN,
-     AsyncOpenAI,
-     AsyncStream,
-     BadRequestError,
-     DefaultAsyncHttpxClient,
- )
- from openai.types.chat import ChatCompletionChunk, ChatCompletionMessageParam
- from PIL import Image
- from pydantic import BaseModel, Field
-
- from pipecat.frames.frames import (
-     ErrorFrame,
-     Frame,
-     FunctionCallInProgressFrame,
-     FunctionCallResultFrame,
-     FunctionCallResultProperties,
-     LLMFullResponseEndFrame,
-     LLMFullResponseStartFrame,
-     LLMMessagesFrame,
-     LLMTextFrame,
-     LLMUpdateSettingsFrame,
-     OpenAILLMContextAssistantTimestampFrame,
-     StartFrame,
-     StartInterruptionFrame,
-     TTSAudioRawFrame,
-     TTSStartedFrame,
-     TTSStoppedFrame,
-     URLImageRawFrame,
-     UserImageRawFrame,
-     UserImageRequestFrame,
-     VisionImageRawFrame,
- )
- from pipecat.metrics.metrics import LLMTokenUsage
- from pipecat.processors.aggregators.llm_response import (
-     LLMAssistantContextAggregator,
-     LLMUserContextAggregator,
- )
- from pipecat.processors.aggregators.openai_llm_context import (
-     OpenAILLMContext,
-     OpenAILLMContextFrame,
- )
- from pipecat.processors.frame_processor import FrameDirection
- from pipecat.services.ai_services import (
-     ImageGenService,
-     LLMService,
-     TTSService,
- )
- from pipecat.services.base_whisper import BaseWhisperSTTService, Transcription
- from pipecat.transcriptions.language import Language
- from pipecat.utils.time import time_now_iso8601
-
- ValidVoice = Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
-
- VALID_VOICES: Dict[str, ValidVoice] = {
-     "alloy": "alloy",
-     "echo": "echo",
-     "fable": "fable",
-     "onyx": "onyx",
-     "nova": "nova",
-     "shimmer": "shimmer",
- }
-
-
- class OpenAIUnhandledFunctionException(Exception):
-     pass
-
-
- class BaseOpenAILLMService(LLMService):
-     """This is the base for all services that use the AsyncOpenAI client.
-
-     This service consumes OpenAILLMContextFrame frames, which contain a reference
-     to an OpenAILLMContext frame. The OpenAILLMContext object defines the context
-     sent to the LLM for a completion. This includes user, assistant and system messages
-     as well as tool choices and the tool, which is used if requesting function
-     calls from the LLM.
-     """
-
-     class InputParams(BaseModel):
-         frequency_penalty: Optional[float] = Field(
-             default_factory=lambda: NOT_GIVEN, ge=-2.0, le=2.0
-         )
-         presence_penalty: Optional[float] = Field(
-             default_factory=lambda: NOT_GIVEN, ge=-2.0, le=2.0
-         )
-         seed: Optional[int] = Field(default_factory=lambda: NOT_GIVEN, ge=0)
-         temperature: Optional[float] = Field(default_factory=lambda: NOT_GIVEN, ge=0.0, le=2.0)
-         # Note: top_k is currently not supported by the OpenAI client library,
-         # so top_k is ignored right now.
-         top_k: Optional[int] = Field(default=None, ge=0)
-         top_p: Optional[float] = Field(default_factory=lambda: NOT_GIVEN, ge=0.0, le=1.0)
-         max_tokens: Optional[int] = Field(default_factory=lambda: NOT_GIVEN, ge=1)
-         max_completion_tokens: Optional[int] = Field(default_factory=lambda: NOT_GIVEN, ge=1)
-         extra: Optional[Dict[str, Any]] = Field(default_factory=dict)
-
-     def __init__(
-         self,
-         *,
-         model: str,
-         api_key=None,
-         base_url=None,
-         organization=None,
-         project=None,
-         params: InputParams = InputParams(),
-         **kwargs,
-     ):
-         super().__init__(**kwargs)
-         self._settings = {
-             "frequency_penalty": params.frequency_penalty,
-             "presence_penalty": params.presence_penalty,
-             "seed": params.seed,
-             "temperature": params.temperature,
-             "top_p": params.top_p,
-             "max_tokens": params.max_tokens,
-             "max_completion_tokens": params.max_completion_tokens,
-             "extra": params.extra if isinstance(params.extra, dict) else {},
-         }
-         self.set_model_name(model)
-         self._client = self.create_client(
-             api_key=api_key, base_url=base_url, organization=organization, project=project, **kwargs
-         )
-
-     def create_client(self, api_key=None, base_url=None, organization=None, project=None, **kwargs):
-         return AsyncOpenAI(
-             api_key=api_key,
-             base_url=base_url,
-             organization=organization,
-             project=project,
-             http_client=DefaultAsyncHttpxClient(
-                 limits=httpx.Limits(
-                     max_keepalive_connections=100, max_connections=1000, keepalive_expiry=None
-                 )
-             ),
-         )
-
-     def can_generate_metrics(self) -> bool:
-         return True
-
-     async def get_chat_completions(
-         self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]
-     ) -> AsyncStream[ChatCompletionChunk]:
-         params = {
-             "model": self.model_name,
-             "stream": True,
-             "messages": messages,
-             "tools": context.tools,
-             "tool_choice": context.tool_choice,
-             "stream_options": {"include_usage": True},
-             "frequency_penalty": self._settings["frequency_penalty"],
-             "presence_penalty": self._settings["presence_penalty"],
-             "seed": self._settings["seed"],
-             "temperature": self._settings["temperature"],
-             "top_p": self._settings["top_p"],
-             "max_tokens": self._settings["max_tokens"],
-             "max_completion_tokens": self._settings["max_completion_tokens"],
-         }
-
-         params.update(self._settings["extra"])
-
-         chunks = await self._client.chat.completions.create(**params)
-         return chunks
-
-     async def _stream_chat_completions(
-         self, context: OpenAILLMContext
-     ) -> AsyncStream[ChatCompletionChunk]:
-         self.logger.debug(f"Generating chat: {context.get_messages_for_logging()}")
-
-         messages: List[ChatCompletionMessageParam] = context.get_messages()
-
-         # base64 encode any images
-         for message in messages:
-             if message.get("mime_type") == "image/jpeg":
-                 encoded_image = base64.b64encode(message["data"].getvalue()).decode("utf-8")
-                 text = message["content"]
-                 message["content"] = [
-                     {"type": "text", "text": text},
-                     {
-                         "type": "image_url",
-                         "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
-                     },
-                 ]
-                 del message["data"]
-                 del message["mime_type"]
-
-         chunks = await self.get_chat_completions(context, messages)
-
-         return chunks
-
-     async def _process_context(self, context: OpenAILLMContext):
-         functions_list = []
-         arguments_list = []
-         tool_id_list = []
-         func_idx = 0
-         function_name = ""
-         arguments = ""
-         tool_call_id = ""
-
-         await self.start_ttfb_metrics()
-
-         chunk_stream: AsyncStream[ChatCompletionChunk] = await self._stream_chat_completions(
-             context
-         )
-
-         async for chunk in chunk_stream:
-             if chunk.usage:
-                 tokens = LLMTokenUsage(
-                     prompt_tokens=chunk.usage.prompt_tokens,
-                     completion_tokens=chunk.usage.completion_tokens,
-                     total_tokens=chunk.usage.total_tokens,
-                 )
-                 await self.start_llm_usage_metrics(tokens)
-
-             if chunk.choices is None or len(chunk.choices) == 0:
-                 continue
-
-             await self.stop_ttfb_metrics()
-
-             if not chunk.choices[0].delta:
-                 continue
-
-             if chunk.choices[0].delta.tool_calls:
-                 # We're streaming the LLM response to enable the fastest response times.
-                 # For text, we just yield each chunk as we receive it and count on consumers
-                 # to do whatever coalescing they need (eg. to pass full sentences to TTS)
-                 #
-                 # If the LLM is a function call, we'll do some coalescing here.
-                 # If the response contains a function name, we'll yield a frame to tell consumers
-                 # that they can start preparing to call the function with that name.
-                 # We accumulate all the arguments for the rest of the streamed response, then when
-                 # the response is done, we package up all the arguments and the function name and
-                 # yield a frame containing the function name and the arguments.
-
-                 tool_call = chunk.choices[0].delta.tool_calls[0]
-                 if tool_call.index != func_idx:
-                     functions_list.append(function_name)
-                     arguments_list.append(arguments)
-                     tool_id_list.append(tool_call_id)
-                     function_name = ""
-                     arguments = ""
-                     tool_call_id = ""
-                     func_idx += 1
-                 if tool_call.function and tool_call.function.name:
-                     function_name += tool_call.function.name
-                     tool_call_id = tool_call.id
-                 if tool_call.function and tool_call.function.arguments:
-                     # Keep iterating through the response to collect all the argument fragments
-                     arguments += tool_call.function.arguments
-             elif chunk.choices[0].delta.content:
-                 await self.push_frame(LLMTextFrame(chunk.choices[0].delta.content))
-
-         # if we got a function name and arguments, check to see if it's a function with
-         # a registered handler. If so, run the registered callback, save the result to
-         # the context, and re-prompt to get a chat answer. If we don't have a registered
-         # handler, raise an exception.
-         if function_name and arguments:
-             # added to the list as last function name and arguments not added to the list
-             functions_list.append(function_name)
-             arguments_list.append(arguments)
-             tool_id_list.append(tool_call_id)
-
-             for index, (function_name, arguments, tool_id) in enumerate(
-                 zip(functions_list, arguments_list, tool_id_list), start=1
-             ):
-                 if self.has_function(function_name):
-                     run_llm = False
-                     arguments = json.loads(arguments)
-                     await self.call_function(
-                         context=context,
-                         function_name=function_name,
-                         arguments=arguments,
-                         tool_call_id=tool_id,
-                         run_llm=run_llm,
-                     )
-                 else:
-                     raise OpenAIUnhandledFunctionException(
-                         f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function."
-                     )
-
-     async def process_frame(self, frame: Frame, direction: FrameDirection):
-         await super().process_frame(frame, direction)
-
-         context = None
-         if isinstance(frame, OpenAILLMContextFrame):
-             context: OpenAILLMContext = frame.context
-         elif isinstance(frame, LLMMessagesFrame):
-             context = OpenAILLMContext.from_messages(frame.messages)
-         elif isinstance(frame, VisionImageRawFrame):
-             context = OpenAILLMContext()
-             context.add_image_frame_message(
-                 format=frame.format, size=frame.size, image=frame.image, text=frame.text
-             )
-         elif isinstance(frame, LLMUpdateSettingsFrame):
-             await self._update_settings(frame.settings)
-         else:
-             await self.push_frame(frame, direction)
-
-         if context:
-             try:
-                 await self.push_frame(LLMFullResponseStartFrame())
-                 await self.start_processing_metrics()
-                 await self._process_context(context)
-             except httpx.TimeoutException:
-                 await self._call_event_handler("on_completion_timeout")
-             finally:
-                 await self.stop_processing_metrics()
-                 await self.push_frame(LLMFullResponseEndFrame())
-
-
- @dataclass
- class OpenAIContextAggregatorPair:
-     _user: "OpenAIUserContextAggregator"
-     _assistant: "OpenAIAssistantContextAggregator"
-
-     def user(self) -> "OpenAIUserContextAggregator":
-         return self._user
-
-     def assistant(self) -> "OpenAIAssistantContextAggregator":
-         return self._assistant
-
-
- class OpenAILLMService(BaseOpenAILLMService):
-     def __init__(
-         self,
-         *,
-         model: str = "gpt-4o",
-         params: BaseOpenAILLMService.InputParams = BaseOpenAILLMService.InputParams(),
-         **kwargs,
-     ):
-         super().__init__(model=model, params=params, **kwargs)
-
-     @staticmethod
-     def create_context_aggregator(
-         context: OpenAILLMContext, *, assistant_expect_stripped_words: bool = True
-     ) -> OpenAIContextAggregatorPair:
-         user = OpenAIUserContextAggregator(context)
-         assistant = OpenAIAssistantContextAggregator(
-             context, expect_stripped_words=assistant_expect_stripped_words
-         )
-         return OpenAIContextAggregatorPair(_user=user, _assistant=assistant)
-
-
- class OpenAIImageGenService(ImageGenService):
-     def __init__(
-         self,
-         *,
-         api_key: str,
-         aiohttp_session: aiohttp.ClientSession,
-         image_size: Literal["256x256", "512x512", "1024x1024", "1792x1024", "1024x1792"],
-         model: str = "dall-e-3",
-     ):
-         super().__init__()
-         self.set_model_name(model)
-         self._image_size = image_size
-         self._client = AsyncOpenAI(api_key=api_key)
-         self._aiohttp_session = aiohttp_session
-
-     async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]:
-         logger.debug(f"Generating image from prompt: {prompt}")
-
-         image = await self._client.images.generate(
-             prompt=prompt, model=self.model_name, n=1, size=self._image_size
-         )
-
-         image_url = image.data[0].url
-
-         if not image_url:
-             logger.error(f"{self} No image provided in response: {image}")
-             yield ErrorFrame("Image generation failed")
-             return
-
-         # Load the image from the url
-         async with self._aiohttp_session.get(image_url) as response:
-             image_stream = io.BytesIO(await response.content.read())
-             image = Image.open(image_stream)
-             frame = URLImageRawFrame(image_url, image.tobytes(), image.size, image.format)
-             yield frame
-
-
- class OpenAISTTService(BaseWhisperSTTService):
-     """OpenAI Whisper speech-to-text service.
-
-     Uses OpenAI's Whisper API to convert audio to text. Requires an OpenAI API key
-     set via the api_key parameter or OPENAI_API_KEY environment variable.
-
-     Args:
-         model: Whisper model to use. Defaults to "whisper-1".
-         api_key: OpenAI API key. Defaults to None.
-         base_url: API base URL. Defaults to None.
-         language: Language of the audio input. Defaults to English.
-         prompt: Optional text to guide the model's style or continue a previous segment.
-         temperature: Optional sampling temperature between 0 and 1. Defaults to 0.0.
-         **kwargs: Additional arguments passed to BaseWhisperSTTService.
-     """
-
-     def __init__(
-         self,
-         *,
-         model: str = "whisper-1",
-         api_key: Optional[str] = None,
-         base_url: Optional[str] = None,
-         language: Optional[Language] = Language.EN,
-         prompt: Optional[str] = None,
-         temperature: Optional[float] = None,
-         **kwargs,
-     ):
-         super().__init__(
-             model=model,
-             api_key=api_key,
-             base_url=base_url,
-             language=language,
-             prompt=prompt,
-             temperature=temperature,
-             **kwargs,
-         )
-
-     async def _transcribe(self, audio: bytes) -> Transcription:
-         assert self._language is not None  # Assigned in the BaseWhisperSTTService class
-
-         # Build kwargs dict with only set parameters
-         kwargs = {
-             "file": ("audio.wav", audio, "audio/wav"),
-             "model": self.model_name,
-             "language": self._language,
-         }
-
-         if self._prompt is not None:
-             kwargs["prompt"] = self._prompt
-
-         if self._temperature is not None:
-             kwargs["temperature"] = self._temperature
-
-         return await self._client.audio.transcriptions.create(**kwargs)
-
-
- class OpenAITTSService(TTSService):
-     """OpenAI Text-to-Speech service that generates audio from text.
-
-     This service uses the OpenAI TTS API to generate PCM-encoded audio at 24kHz.
-     When using with DailyTransport, configure the sample rate in DailyParams
-     as shown below:
-
-     DailyParams(
-         audio_out_enabled=True,
-         audio_out_sample_rate=24_000,
-     )
-
-     Args:
-         api_key: OpenAI API key. Defaults to None.
-         voice: Voice ID to use. Defaults to "alloy".
-         model: TTS model to use ("tts-1" or "tts-1-hd"). Defaults to "tts-1".
-         sample_rate: Output audio sample rate in Hz. Defaults to 24000.
-         **kwargs: Additional keyword arguments passed to TTSService.
-
-     The service returns PCM-encoded audio at the specified sample rate.
-     """
-
-     OPENAI_SAMPLE_RATE = 24000  # OpenAI TTS always outputs at 24kHz
-
-     def __init__(
-         self,
-         *,
-         api_key: Optional[str] = None,
-         voice: str = "alloy",
-         model: Literal["tts-1", "tts-1-hd"] = "tts-1",
-         sample_rate: Optional[int] = None,
-         **kwargs,
-     ):
-         if sample_rate and sample_rate != self.OPENAI_SAMPLE_RATE:
-             logger.warning(
-                 f"OpenAI TTS only supports {self.OPENAI_SAMPLE_RATE}Hz sample rate. "
-                 f"Current rate of {self.sample_rate}Hz may cause issues."
-             )
-         super().__init__(sample_rate=sample_rate, **kwargs)
-
-         self.set_model_name(model)
-         self.set_voice(voice)
-
-         self._client = AsyncOpenAI(api_key=api_key)
-
-     def can_generate_metrics(self) -> bool:
-         return True
-
-     async def set_model(self, model: str):
-         logger.info(f"Switching TTS model to: [{model}]")
-         self.set_model_name(model)
-
-     async def start(self, frame: StartFrame):
-         await super().start(frame)
-         if self.sample_rate != self.OPENAI_SAMPLE_RATE:
-             logger.warning(
-                 f"OpenAI TTS requires {self.OPENAI_SAMPLE_RATE}Hz sample rate. "
-                 f"Current rate of {self.sample_rate}Hz may cause issues."
-             )
-
-     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
-         logger.debug(f"Generating TTS: [{text}]")
-         try:
-             await self.start_ttfb_metrics()
-
-             async with self._client.audio.speech.with_streaming_response.create(
-                 input=text or " ",  # Text must contain at least one character
-                 model=self.model_name,
-                 voice=VALID_VOICES[self._voice_id],
-                 response_format="pcm",
-             ) as r:
-                 if r.status_code != 200:
-                     error = await r.text()
-                     logger.error(
-                         f"{self} error getting audio (status: {r.status_code}, error: {error})"
-                     )
-                     yield ErrorFrame(
-                         f"Error getting audio (status: {r.status_code}, error: {error})"
-                     )
-                     return
-
-                 await self.start_tts_usage_metrics(text)
-
-                 CHUNK_SIZE = 1024
-
-                 yield TTSStartedFrame()
-                 async for chunk in r.iter_bytes(CHUNK_SIZE):
-                     if len(chunk) > 0:
-                         await self.stop_ttfb_metrics()
-                         frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
-                         yield frame
-                 yield TTSStoppedFrame()
-         except BadRequestError as e:
-             logger.exception(f"{self} error generating TTS: {e}")
-
-
- # internal use only -- todo: refactor
- @dataclass
- class OpenAIImageMessageFrame(Frame):
-     user_image_raw_frame: UserImageRawFrame
-     text: Optional[str] = None
-
-
- class OpenAIUserContextAggregator(LLMUserContextAggregator):
-     def __init__(self, context: OpenAILLMContext, **kwargs):
-         super().__init__(context=context, **kwargs)
-
-     async def process_frame(self, frame, direction):
-         await super().process_frame(frame, direction)
-         # Our parent method has already called push_frame(). So we can't interrupt the
-         # flow here and we don't need to call push_frame() ourselves.
-         try:
-             if isinstance(frame, UserImageRequestFrame):
-                 # The LLM sends a UserImageRequestFrame upstream. Cache any context provided with
-                 # that frame so we can use it when we assemble the image message in the assistant
-                 # context aggregator.
-                 if frame.context:
-                     if isinstance(frame.context, str):
-                         self._context._user_image_request_context[frame.user_id] = frame.context
-                     else:
-                         self.logger.error(
-                             f"Unexpected UserImageRequestFrame context type: {type(frame.context)}"
-                         )
-                         del self._context._user_image_request_context[frame.user_id]
-                 else:
-                     if frame.user_id in self._context._user_image_request_context:
-                         del self._context._user_image_request_context[frame.user_id]
-             elif isinstance(frame, UserImageRawFrame):
-                 # Push a new OpenAIImageMessageFrame with the text context we cached
-                 # downstream to be handled by our assistant context aggregator. This is
-                 # necessary so that we add the message to the context in the right order.
-                 text = self._context._user_image_request_context.get(frame.user_id) or ""
-                 if text:
-                     del self._context._user_image_request_context[frame.user_id]
-                 frame = OpenAIImageMessageFrame(user_image_raw_frame=frame, text=text)
-                 await self.push_frame(frame)
-         except Exception as e:
-             self.logger.error(f"Error processing frame: {e}")
-
-
- class OpenAIAssistantContextAggregator(LLMAssistantContextAggregator):
-     def __init__(self, context: OpenAILLMContext, **kwargs):
-         super().__init__(context=context, **kwargs)
-         self._function_calls_in_progress = {}
-         self._function_call_result = None
-         self._pending_image_frame_message = None
-
-     async def process_frame(self, frame, direction):
-         await super().process_frame(frame, direction)
-         # See note above about not calling push_frame() here.
-         if isinstance(frame, StartInterruptionFrame):
-             self._function_calls_in_progress.clear()
-             self._function_call_finished = None
-         elif isinstance(frame, FunctionCallInProgressFrame):
-             self.logger.debug(f"FunctionCallInProgressFrame: {frame}")
-             self._function_calls_in_progress[frame.tool_call_id] = frame
-         elif isinstance(frame, FunctionCallResultFrame):
-             self.logger.debug(f"FunctionCallResultFrame: {frame}")
-             if frame.tool_call_id in self._function_calls_in_progress:
-                 del self._function_calls_in_progress[frame.tool_call_id]
-                 self._function_call_result = frame
-                 # TODO-CB: Kwin wants us to refactor this out of here but I REFUSE
-                 await self.push_aggregation()
-             else:
-                 self.logger.warning(
-                     "FunctionCallResultFrame tool_call_id does not match any function call in progress"
-                 )
-                 self._function_call_result = None
-         elif isinstance(frame, OpenAIImageMessageFrame):
-             self._pending_image_frame_message = frame
-             await self.push_aggregation()
-
-     async def push_aggregation(self):
-         if not (
-             self._aggregation or self._function_call_result or self._pending_image_frame_message
-         ):
-             return
-
-         run_llm = False
-         properties: Optional[FunctionCallResultProperties] = None
-
-         aggregation = self._aggregation.strip()
-         self.reset()
-
-         try:
-             if aggregation:
-                 self._context.add_message({"role": "assistant", "content": aggregation})
-
-             if self._function_call_result:
-                 frame = self._function_call_result
-                 properties = frame.properties
-                 self._function_call_result = None
-                 if frame.result:
-                     self._context.add_message(
-                         {
-                             "role": "assistant",
-                             "tool_calls": [
-                                 {
-                                     "id": frame.tool_call_id,
-                                     "function": {
-                                         "name": frame.function_name,
-                                         "arguments": json.dumps(frame.arguments),
-                                     },
-                                     "type": "function",
-                                 }
-                             ],
-                         }
-                     )
-                     self._context.add_message(
-                         {
-                             "role": "tool",
-                             "content": json.dumps(frame.result),
-                             "tool_call_id": frame.tool_call_id,
-                         }
-                     )
-                     if properties and properties.run_llm is not None:
-                         # If the tool call result has a run_llm property, use it
-                         run_llm = properties.run_llm
-                     else:
-                         # Default behavior is to run the LLM if there are no function calls in progress
-                         run_llm = not bool(self._function_calls_in_progress)
-
-             if self._pending_image_frame_message:
-                 frame = self._pending_image_frame_message
-                 self._pending_image_frame_message = None
-                 self._context.add_image_frame_message(
-                     format=frame.user_image_raw_frame.format,
-                     size=frame.user_image_raw_frame.size,
-                     image=frame.user_image_raw_frame.image,
-                     text=frame.text,
-                 )
-                 run_llm = True
-
-             if run_llm:
-                 await self.push_context_frame(FrameDirection.UPSTREAM)
-
-             # Emit the on_context_updated callback once the function call result is added to the context
-             if properties and properties.on_context_updated is not None:
-                 await properties.on_context_updated()
-
-             # Push context frame
-             await self.push_context_frame()
-
-             # Push timestamp frame with current time
-             timestamp_frame = OpenAILLMContextAssistantTimestampFrame(timestamp=time_now_iso8601())
-             await self.push_frame(timestamp_frame)
-
-         except Exception as e:
-             self.logger.error(f"Error processing frame: {e}")