dv-pipecat-ai 0.0.82.dev878__py3-none-any.whl → 0.0.82.dev884__py3-none-any.whl

This diff compares the contents of two publicly available versions of the package as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dv-pipecat-ai
- Version: 0.0.82.dev878
+ Version: 0.0.82.dev884
  Summary: An open source framework for voice (and multimodal) assistants
  License-Expression: BSD-2-Clause
  Project-URL: Source, https://github.com/pipecat-ai/pipecat
@@ -1,4 +1,4 @@
- dv_pipecat_ai-0.0.82.dev878.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
+ dv_pipecat_ai-0.0.82.dev884.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
  pipecat/__init__.py,sha256=j0Xm6adxHhd7D06dIyyPV_GlBYLlBnTAERVvD_jAARQ,861
  pipecat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  pipecat/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -59,10 +59,11 @@ pipecat/audio/turn/smart_turn/local_coreml_smart_turn.py,sha256=50kiBeZhnq7FZWZn
  pipecat/audio/turn/smart_turn/local_smart_turn.py,sha256=KVodqUTu8onfmfeOywgH98vBCNvBb-B3pvsQlTKyP_4,3570
  pipecat/audio/turn/smart_turn/local_smart_turn_v2.py,sha256=aYLMDURpmYycQgKsxbNEENtUe5oujeQ9H3Lbi0GYmZA,7160
  pipecat/audio/vad/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- pipecat/audio/vad/silero.py,sha256=r9UL8aEe-QoRMNDGWLUlgUYew93-QFojE9sIqLO0VYE,7792
+ pipecat/audio/vad/silero.py,sha256=FM3Qj7X13-5vL5lPYBE-irFDeVGHO5HULXyDUVLEiAU,7887
  pipecat/audio/vad/vad_analyzer.py,sha256=XkZLEe4z7Ja0lGoYZst1HNYqt5qOwG-vjsk_w8chiNA,7430
  pipecat/audio/vad/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  pipecat/audio/vad/data/silero_vad.onnx,sha256=WX0ws-wHZgjQWUd7sUz-_9-VG_XK43DTj2XTO7_oIAQ,2327524
+ pipecat/audio/vad/data/silero_vad_v2.onnx,sha256=JiOilT9v89LB5hdAxs23FoEzR5smff7xFKSjzFvdeI8,2327524
  pipecat/clocks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  pipecat/clocks/base_clock.py,sha256=PuTmCtPKz5VG0VxhN5cyhbvduEBnfNPgA6GLAu1eSns,929
  pipecat/clocks/system_clock.py,sha256=ht6TdDAn0JVXEmhLdt5igcHMQOkKO4YHNuOjuKcxkUU,1315
@@ -161,7 +162,6 @@ pipecat/services/ai_services.py,sha256=_RrDWfM8adV17atzY9RxK0nXRVM5kbUkKrvN90GAW
  pipecat/services/image_service.py,sha256=tqJun4nYeyN_PaWqTdF_CFsOiqBf3XX7R4et5Y07mEU,2357
  pipecat/services/llm_service.py,sha256=KapuWbSUceKmNUOYmItBjVUbpwb3byegu6FmnQ2wuV0,25041
  pipecat/services/mcp_service.py,sha256=OYftGfdfGlDmjsWbF2b3CuMhPw8B1jcgaZUUYZPIA_o,14298
- pipecat/services/openai.py,sha256=fg5-MIvwqgKTN6i5Kp7GD6XUvMRo3nlughuNt9QqLGA,27546
  pipecat/services/stt_service.py,sha256=tShjVEl374j1Sc3qsdhTuWaT-8NJsAn-3yFw0XLRm4A,11163
  pipecat/services/tts_service.py,sha256=7OYTKrMLxsdfGGb4g8WfgeXEiZWxObFScLwhED2dVmM,34734
  pipecat/services/vision_service.py,sha256=dtI3U5RX30R6i97d6Rh7bVMqeh5ogWuwnM9j6djeXQ8,2519
@@ -186,7 +186,7 @@ pipecat/services/aws_nova_sonic/ready.wav,sha256=pxdKxZtYRV2IVv63v7K1EPkxyV_Oxoc
  pipecat/services/azure/__init__.py,sha256=mgnoJYeqKqwRvr18UZJhFZ2FTkGyob7r6IdtEiOeT3k,301
  pipecat/services/azure/common.py,sha256=JKGDYYW1zpRaWy_l_5ZL6viHj2Ch-mKMoVx2gdCKpeo,9893
  pipecat/services/azure/image.py,sha256=yP7_Uelz9gq2-nhRbjTNOJ6s-DrsjsGaqXPq-8Ud4q4,4191
- pipecat/services/azure/llm.py,sha256=rsysuFtC3oL2ozYaP0SrF1QKqAzK9b3MCwwzzMAmAbk,5489
+ pipecat/services/azure/llm.py,sha256=jmpKLU5T6fh4Y9nrgVvI-A60bUzptZbf3uEfJd0qUbI,4292
  pipecat/services/azure/stt.py,sha256=POhS5XTS-Z0SlKJDdGf18eR_5Nvbq0SnjG3R2xRcykg,12772
  pipecat/services/azure/tts.py,sha256=ytgXcYvdVkshC30K88ZGbYFtK8SmSV22h9jQEYKf9ew,19233
  pipecat/services/cartesia/__init__.py,sha256=vzh0jBnfPwWdxFfV-tu0x1HFoOTgr9s91GYmD-CJUtY,284
@@ -377,7 +377,7 @@ pipecat/utils/tracing/service_decorators.py,sha256=HwDCqLGijhYD3F8nxDuQmEw-YkRw0
  pipecat/utils/tracing/setup.py,sha256=7TEgPNpq6M8lww8OQvf0P9FzYc5A30xICGklVA-fua0,2892
  pipecat/utils/tracing/turn_context_provider.py,sha256=ikon3plFOx0XbMrH6DdeHttNpb-U0gzMZIm3bWLc9eI,2485
  pipecat/utils/tracing/turn_trace_observer.py,sha256=dma16SBJpYSOE58YDWy89QzHyQFc_9gQZszKeWixuwc,9725
- dv_pipecat_ai-0.0.82.dev878.dist-info/METADATA,sha256=79zfMdkh5ZK6UwXYUi8LCnQ0PAOzLIHmhweMTd3F0AE,32639
- dv_pipecat_ai-0.0.82.dev878.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- dv_pipecat_ai-0.0.82.dev878.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
- dv_pipecat_ai-0.0.82.dev878.dist-info/RECORD,,
+ dv_pipecat_ai-0.0.82.dev884.dist-info/METADATA,sha256=Y5QzIXGi-M4w82gZ8JBv9XMh92836vwjSWGKK1oisp0,32639
+ dv_pipecat_ai-0.0.82.dev884.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dv_pipecat_ai-0.0.82.dev884.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
+ dv_pipecat_ai-0.0.82.dev884.dist-info/RECORD,,
@@ -135,7 +135,13 @@ class SileroVADAnalyzer(VADAnalyzer):
  with automatic model state management and periodic resets.
  """

- def __init__(self, *, sample_rate: Optional[int] = None, params: Optional[VADParams] = None):
+ def __init__(
+ self,
+ *,
+ sample_rate: Optional[int] = None,
+ params: Optional[VADParams] = None,
+ model_name: Optional[str] = None,
+ ):
  """Initialize the Silero VAD analyzer.

  Args:
@@ -146,7 +152,7 @@ class SileroVADAnalyzer(VADAnalyzer):

  logger.debug("Loading Silero VAD model...")

- model_name = "silero_vad.onnx"
+ model_name = model_name or "silero_vad.onnx"
  package_path = "pipecat.audio.vad.data"

  try:
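
The two silero.py hunks above add an optional model_name argument and fall back to the original file name when it is omitted. Combined with the silero_vad_v2.onnx file added to pipecat/audio/vad/data in the RECORD hunk earlier, the parameter appears to let callers choose which bundled ONNX model the analyzer loads. A minimal usage sketch, assuming the v2 file name from RECORD is the intended value (not confirmed by this diff):

from pipecat.audio.vad.silero import SileroVADAnalyzer

# Default behavior is unchanged: the bundled silero_vad.onnx is loaded.
vad = SileroVADAnalyzer()

# Assumed usage of the new parameter: load the newly packaged v2 model instead.
vad_v2 = SileroVADAnalyzer(model_name="silero_vad_v2.onnx")
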
@@ -5,14 +5,12 @@

  """Azure OpenAI service implementation for the Pipecat AI framework."""

- from typing import Any, Dict, List, Optional
+ from typing import Optional

  from loguru import logger
  from openai import AsyncAzureOpenAI
- from openai._streaming import AsyncStream
- from openai.types.chat import ChatCompletionChunk, ChatCompletionMessageParam

- from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+ from pipecat.adapters.services.open_ai_adapter import OpenAILLMInvocationParams
  from pipecat.services.openai.llm import OpenAILLMService


@@ -95,46 +93,22 @@ class AzureLLMService(OpenAILLMService):
  reasoning_models = {"gpt-5-nano", "gpt-5", "gpt-5-mini"}
  return model_name_lower in reasoning_models

- async def get_chat_completions(
- self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]
- ) -> AsyncStream[ChatCompletionChunk]:
- """Get streaming chat completions from Azure OpenAI API.
-
- Handles both reasoning and standard models according to Azure AI Foundry documentation.
- Reasoning models use automatic chain of thought and have parameter limitations.
- """
- params = {
- "model": self.model_name,
- "stream": True,
- "messages": messages,
- "tools": context.tools,
- "tool_choice": context.tool_choice,
- "stream_options": {"include_usage": True},
- "max_tokens": self._settings["max_tokens"],
- "max_completion_tokens": self._settings["max_completion_tokens"],
- }
+ def build_chat_completion_params(self, params_from_context: OpenAILLMInvocationParams) -> dict:
+ # include base params
+ params = super().build_chat_completion_params(params_from_context)

  if self._is_reasoning_model():
- # Reasoning models generally do NOT support temperature, presence_penalty, top_p
+ # not required for reasoning models
+ for k in ("frequency_penalty", "presence_penalty", "temperature", "top_p"):
+ if k in params:
+ params.pop(k, None)
  if self._reasoning_effort:
  params["reasoning_effort"] = self._reasoning_effort
- if self._settings.get("seed"):
- params["seed"] = self._settings["seed"]
+ seed = self._settings.get("seed")
+ if seed is not None:
+ params["seed"] = seed
  else:
- # Standard models support all parameters
- params.update(
- {
- "frequency_penalty": self._settings["frequency_penalty"],
- "presence_penalty": self._settings["presence_penalty"],
- "seed": self._settings["seed"],
- "temperature": self._settings["temperature"],
- "top_p": self._settings["top_p"],
- }
- )
-
- # Add any extra parameters from settings
- extra_params = self._settings.get("extra", {})
- params.update(extra_params)
-
- chunks = await self._client.chat.completions.create(**params)
- return chunks
+ # Standard models are fine with the defaults from the base class
+ pass
+
+ return params
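
The hunk above replaces Azure's copy of get_chat_completions with an override of build_chat_completion_params, so the base OpenAILLMService assembles the request and the Azure service only strips parameters that reasoning models reject. The same hook can be reused by other subclasses; a hedged sketch with a hypothetical subclass (the class name and the dropped key are illustrative, not part of this diff):

from pipecat.adapters.services.open_ai_adapter import OpenAILLMInvocationParams
from pipecat.services.openai.llm import OpenAILLMService


class TrimmedLLMService(OpenAILLMService):  # hypothetical subclass, for illustration only
    def build_chat_completion_params(self, params_from_context: OpenAILLMInvocationParams) -> dict:
        # Start from the parameters the base class would send to the API.
        params = super().build_chat_completion_params(params_from_context)
        # Drop anything the target deployment does not accept.
        params.pop("presence_penalty", None)
        return params
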
@@ -1,698 +0,0 @@
- #
- # Copyright (c) 2024–2025, Daily
- #
- # SPDX-License-Identifier: BSD 2-Clause License
- #
-
- import base64
- import io
- import json
- from dataclasses import dataclass
- from typing import Any, AsyncGenerator, Dict, List, Literal, Optional
-
- import aiohttp
- import httpx
- from loguru import logger
- from openai import (
- NOT_GIVEN,
- AsyncOpenAI,
- AsyncStream,
- BadRequestError,
- DefaultAsyncHttpxClient,
- )
- from openai.types.chat import ChatCompletionChunk, ChatCompletionMessageParam
- from PIL import Image
- from pydantic import BaseModel, Field
-
- from pipecat.frames.frames import (
- ErrorFrame,
- Frame,
- FunctionCallInProgressFrame,
- FunctionCallResultFrame,
- FunctionCallResultProperties,
- LLMFullResponseEndFrame,
- LLMFullResponseStartFrame,
- LLMMessagesFrame,
- LLMTextFrame,
- LLMUpdateSettingsFrame,
- OpenAILLMContextAssistantTimestampFrame,
- StartFrame,
- StartInterruptionFrame,
- TTSAudioRawFrame,
- TTSStartedFrame,
- TTSStoppedFrame,
- URLImageRawFrame,
- UserImageRawFrame,
- UserImageRequestFrame,
- VisionImageRawFrame,
- )
- from pipecat.metrics.metrics import LLMTokenUsage
- from pipecat.processors.aggregators.llm_response import (
- LLMAssistantContextAggregator,
- LLMUserContextAggregator,
- )
- from pipecat.processors.aggregators.openai_llm_context import (
- OpenAILLMContext,
- OpenAILLMContextFrame,
- )
- from pipecat.processors.frame_processor import FrameDirection
- from pipecat.services.ai_services import (
- ImageGenService,
- LLMService,
- TTSService,
- )
- from pipecat.services.base_whisper import BaseWhisperSTTService, Transcription
- from pipecat.transcriptions.language import Language
- from pipecat.utils.time import time_now_iso8601
-
- ValidVoice = Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
-
- VALID_VOICES: Dict[str, ValidVoice] = {
- "alloy": "alloy",
- "echo": "echo",
- "fable": "fable",
- "onyx": "onyx",
- "nova": "nova",
- "shimmer": "shimmer",
- }
-
-
- class OpenAIUnhandledFunctionException(Exception):
- pass
-
-
- class BaseOpenAILLMService(LLMService):
- """This is the base for all services that use the AsyncOpenAI client.
-
- This service consumes OpenAILLMContextFrame frames, which contain a reference
- to an OpenAILLMContext frame. The OpenAILLMContext object defines the context
- sent to the LLM for a completion. This includes user, assistant and system messages
- as well as tool choices and the tool, which is used if requesting function
- calls from the LLM.
- """
-
- class InputParams(BaseModel):
- frequency_penalty: Optional[float] = Field(
- default_factory=lambda: NOT_GIVEN, ge=-2.0, le=2.0
- )
- presence_penalty: Optional[float] = Field(
- default_factory=lambda: NOT_GIVEN, ge=-2.0, le=2.0
- )
- seed: Optional[int] = Field(default_factory=lambda: NOT_GIVEN, ge=0)
- temperature: Optional[float] = Field(default_factory=lambda: NOT_GIVEN, ge=0.0, le=2.0)
- # Note: top_k is currently not supported by the OpenAI client library,
- # so top_k is ignored right now.
- top_k: Optional[int] = Field(default=None, ge=0)
- top_p: Optional[float] = Field(default_factory=lambda: NOT_GIVEN, ge=0.0, le=1.0)
- max_tokens: Optional[int] = Field(default_factory=lambda: NOT_GIVEN, ge=1)
- max_completion_tokens: Optional[int] = Field(default_factory=lambda: NOT_GIVEN, ge=1)
- extra: Optional[Dict[str, Any]] = Field(default_factory=dict)
-
- def __init__(
- self,
- *,
- model: str,
- api_key=None,
- base_url=None,
- organization=None,
- project=None,
- params: InputParams = InputParams(),
- **kwargs,
- ):
- super().__init__(**kwargs)
- self._settings = {
- "frequency_penalty": params.frequency_penalty,
- "presence_penalty": params.presence_penalty,
- "seed": params.seed,
- "temperature": params.temperature,
- "top_p": params.top_p,
- "max_tokens": params.max_tokens,
- "max_completion_tokens": params.max_completion_tokens,
- "extra": params.extra if isinstance(params.extra, dict) else {},
- }
- self.set_model_name(model)
- self._client = self.create_client(
- api_key=api_key, base_url=base_url, organization=organization, project=project, **kwargs
- )
-
- def create_client(self, api_key=None, base_url=None, organization=None, project=None, **kwargs):
- return AsyncOpenAI(
- api_key=api_key,
- base_url=base_url,
- organization=organization,
- project=project,
- http_client=DefaultAsyncHttpxClient(
- limits=httpx.Limits(
- max_keepalive_connections=100, max_connections=1000, keepalive_expiry=None
- )
- ),
- )
-
- def can_generate_metrics(self) -> bool:
- return True
-
- async def get_chat_completions(
- self, context: OpenAILLMContext, messages: List[ChatCompletionMessageParam]
- ) -> AsyncStream[ChatCompletionChunk]:
- params = {
- "model": self.model_name,
- "stream": True,
- "messages": messages,
- "tools": context.tools,
- "tool_choice": context.tool_choice,
- "stream_options": {"include_usage": True},
- "frequency_penalty": self._settings["frequency_penalty"],
- "presence_penalty": self._settings["presence_penalty"],
- "seed": self._settings["seed"],
- "temperature": self._settings["temperature"],
- "top_p": self._settings["top_p"],
- "max_tokens": self._settings["max_tokens"],
- "max_completion_tokens": self._settings["max_completion_tokens"],
- }
-
- params.update(self._settings["extra"])
-
- chunks = await self._client.chat.completions.create(**params)
- return chunks
-
- async def _stream_chat_completions(
- self, context: OpenAILLMContext
- ) -> AsyncStream[ChatCompletionChunk]:
- self.logger.debug(f"Generating chat: {context.get_messages_for_logging()}")
-
- messages: List[ChatCompletionMessageParam] = context.get_messages()
-
- # base64 encode any images
- for message in messages:
- if message.get("mime_type") == "image/jpeg":
- encoded_image = base64.b64encode(message["data"].getvalue()).decode("utf-8")
- text = message["content"]
- message["content"] = [
- {"type": "text", "text": text},
- {
- "type": "image_url",
- "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
- },
- ]
- del message["data"]
- del message["mime_type"]
-
- chunks = await self.get_chat_completions(context, messages)
-
- return chunks
-
- async def _process_context(self, context: OpenAILLMContext):
- functions_list = []
- arguments_list = []
- tool_id_list = []
- func_idx = 0
- function_name = ""
- arguments = ""
- tool_call_id = ""
-
- await self.start_ttfb_metrics()
-
- chunk_stream: AsyncStream[ChatCompletionChunk] = await self._stream_chat_completions(
- context
- )
-
- async for chunk in chunk_stream:
- if chunk.usage:
- tokens = LLMTokenUsage(
- prompt_tokens=chunk.usage.prompt_tokens,
- completion_tokens=chunk.usage.completion_tokens,
- total_tokens=chunk.usage.total_tokens,
- )
- await self.start_llm_usage_metrics(tokens)
-
- if chunk.choices is None or len(chunk.choices) == 0:
- continue
-
- await self.stop_ttfb_metrics()
-
- if not chunk.choices[0].delta:
- continue
-
- if chunk.choices[0].delta.tool_calls:
- # We're streaming the LLM response to enable the fastest response times.
- # For text, we just yield each chunk as we receive it and count on consumers
- # to do whatever coalescing they need (eg. to pass full sentences to TTS)
- #
- # If the LLM is a function call, we'll do some coalescing here.
- # If the response contains a function name, we'll yield a frame to tell consumers
- # that they can start preparing to call the function with that name.
- # We accumulate all the arguments for the rest of the streamed response, then when
- # the response is done, we package up all the arguments and the function name and
- # yield a frame containing the function name and the arguments.
-
- tool_call = chunk.choices[0].delta.tool_calls[0]
- if tool_call.index != func_idx:
- functions_list.append(function_name)
- arguments_list.append(arguments)
- tool_id_list.append(tool_call_id)
- function_name = ""
- arguments = ""
- tool_call_id = ""
- func_idx += 1
- if tool_call.function and tool_call.function.name:
- function_name += tool_call.function.name
- tool_call_id = tool_call.id
- if tool_call.function and tool_call.function.arguments:
- # Keep iterating through the response to collect all the argument fragments
- arguments += tool_call.function.arguments
- elif chunk.choices[0].delta.content:
- await self.push_frame(LLMTextFrame(chunk.choices[0].delta.content))
-
- # if we got a function name and arguments, check to see if it's a function with
- # a registered handler. If so, run the registered callback, save the result to
- # the context, and re-prompt to get a chat answer. If we don't have a registered
- # handler, raise an exception.
- if function_name and arguments:
- # added to the list as last function name and arguments not added to the list
- functions_list.append(function_name)
- arguments_list.append(arguments)
- tool_id_list.append(tool_call_id)
-
- for index, (function_name, arguments, tool_id) in enumerate(
- zip(functions_list, arguments_list, tool_id_list), start=1
- ):
- if self.has_function(function_name):
- run_llm = False
- arguments = json.loads(arguments)
- await self.call_function(
- context=context,
- function_name=function_name,
- arguments=arguments,
- tool_call_id=tool_id,
- run_llm=run_llm,
- )
- else:
- raise OpenAIUnhandledFunctionException(
- f"The LLM tried to call a function named '{function_name}', but there isn't a callback registered for that function."
- )
-
- async def process_frame(self, frame: Frame, direction: FrameDirection):
- await super().process_frame(frame, direction)
-
- context = None
- if isinstance(frame, OpenAILLMContextFrame):
- context: OpenAILLMContext = frame.context
- elif isinstance(frame, LLMMessagesFrame):
- context = OpenAILLMContext.from_messages(frame.messages)
- elif isinstance(frame, VisionImageRawFrame):
- context = OpenAILLMContext()
- context.add_image_frame_message(
- format=frame.format, size=frame.size, image=frame.image, text=frame.text
- )
- elif isinstance(frame, LLMUpdateSettingsFrame):
- await self._update_settings(frame.settings)
- else:
- await self.push_frame(frame, direction)
-
- if context:
- try:
- await self.push_frame(LLMFullResponseStartFrame())
- await self.start_processing_metrics()
- await self._process_context(context)
- except httpx.TimeoutException:
- await self._call_event_handler("on_completion_timeout")
- finally:
- await self.stop_processing_metrics()
- await self.push_frame(LLMFullResponseEndFrame())
-
-
- @dataclass
- class OpenAIContextAggregatorPair:
- _user: "OpenAIUserContextAggregator"
- _assistant: "OpenAIAssistantContextAggregator"
-
- def user(self) -> "OpenAIUserContextAggregator":
- return self._user
-
- def assistant(self) -> "OpenAIAssistantContextAggregator":
- return self._assistant
-
-
- class OpenAILLMService(BaseOpenAILLMService):
- def __init__(
- self,
- *,
- model: str = "gpt-4o",
- params: BaseOpenAILLMService.InputParams = BaseOpenAILLMService.InputParams(),
- **kwargs,
- ):
- super().__init__(model=model, params=params, **kwargs)
-
- @staticmethod
- def create_context_aggregator(
- context: OpenAILLMContext, *, assistant_expect_stripped_words: bool = True
- ) -> OpenAIContextAggregatorPair:
- user = OpenAIUserContextAggregator(context)
- assistant = OpenAIAssistantContextAggregator(
- context, expect_stripped_words=assistant_expect_stripped_words
- )
- return OpenAIContextAggregatorPair(_user=user, _assistant=assistant)
-
-
- class OpenAIImageGenService(ImageGenService):
- def __init__(
- self,
- *,
- api_key: str,
- aiohttp_session: aiohttp.ClientSession,
- image_size: Literal["256x256", "512x512", "1024x1024", "1792x1024", "1024x1792"],
- model: str = "dall-e-3",
- ):
- super().__init__()
- self.set_model_name(model)
- self._image_size = image_size
- self._client = AsyncOpenAI(api_key=api_key)
- self._aiohttp_session = aiohttp_session
-
- async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]:
- logger.debug(f"Generating image from prompt: {prompt}")
-
- image = await self._client.images.generate(
- prompt=prompt, model=self.model_name, n=1, size=self._image_size
- )
-
- image_url = image.data[0].url
-
- if not image_url:
- logger.error(f"{self} No image provided in response: {image}")
- yield ErrorFrame("Image generation failed")
- return
-
- # Load the image from the url
- async with self._aiohttp_session.get(image_url) as response:
- image_stream = io.BytesIO(await response.content.read())
- image = Image.open(image_stream)
- frame = URLImageRawFrame(image_url, image.tobytes(), image.size, image.format)
- yield frame
-
-
- class OpenAISTTService(BaseWhisperSTTService):
- """OpenAI Whisper speech-to-text service.
-
- Uses OpenAI's Whisper API to convert audio to text. Requires an OpenAI API key
- set via the api_key parameter or OPENAI_API_KEY environment variable.
-
- Args:
- model: Whisper model to use. Defaults to "whisper-1".
- api_key: OpenAI API key. Defaults to None.
- base_url: API base URL. Defaults to None.
- language: Language of the audio input. Defaults to English.
- prompt: Optional text to guide the model's style or continue a previous segment.
- temperature: Optional sampling temperature between 0 and 1. Defaults to 0.0.
- **kwargs: Additional arguments passed to BaseWhisperSTTService.
- """
-
- def __init__(
- self,
- *,
- model: str = "whisper-1",
- api_key: Optional[str] = None,
- base_url: Optional[str] = None,
- language: Optional[Language] = Language.EN,
- prompt: Optional[str] = None,
- temperature: Optional[float] = None,
- **kwargs,
- ):
- super().__init__(
- model=model,
- api_key=api_key,
- base_url=base_url,
- language=language,
- prompt=prompt,
- temperature=temperature,
- **kwargs,
- )
-
- async def _transcribe(self, audio: bytes) -> Transcription:
- assert self._language is not None # Assigned in the BaseWhisperSTTService class
-
- # Build kwargs dict with only set parameters
- kwargs = {
- "file": ("audio.wav", audio, "audio/wav"),
- "model": self.model_name,
- "language": self._language,
- }
-
- if self._prompt is not None:
- kwargs["prompt"] = self._prompt
-
- if self._temperature is not None:
- kwargs["temperature"] = self._temperature
-
- return await self._client.audio.transcriptions.create(**kwargs)
-
-
- class OpenAITTSService(TTSService):
- """OpenAI Text-to-Speech service that generates audio from text.
-
- This service uses the OpenAI TTS API to generate PCM-encoded audio at 24kHz.
- When using with DailyTransport, configure the sample rate in DailyParams
- as shown below:
-
- DailyParams(
- audio_out_enabled=True,
- audio_out_sample_rate=24_000,
- )
-
- Args:
- api_key: OpenAI API key. Defaults to None.
- voice: Voice ID to use. Defaults to "alloy".
- model: TTS model to use ("tts-1" or "tts-1-hd"). Defaults to "tts-1".
- sample_rate: Output audio sample rate in Hz. Defaults to 24000.
- **kwargs: Additional keyword arguments passed to TTSService.
-
- The service returns PCM-encoded audio at the specified sample rate.
- """
-
- OPENAI_SAMPLE_RATE = 24000 # OpenAI TTS always outputs at 24kHz
-
- def __init__(
- self,
- *,
- api_key: Optional[str] = None,
- voice: str = "alloy",
- model: Literal["tts-1", "tts-1-hd"] = "tts-1",
- sample_rate: Optional[int] = None,
- **kwargs,
- ):
- if sample_rate and sample_rate != self.OPENAI_SAMPLE_RATE:
- logger.warning(
- f"OpenAI TTS only supports {self.OPENAI_SAMPLE_RATE}Hz sample rate. "
- f"Current rate of {self.sample_rate}Hz may cause issues."
- )
- super().__init__(sample_rate=sample_rate, **kwargs)
-
- self.set_model_name(model)
- self.set_voice(voice)
-
- self._client = AsyncOpenAI(api_key=api_key)
-
- def can_generate_metrics(self) -> bool:
- return True
-
- async def set_model(self, model: str):
- logger.info(f"Switching TTS model to: [{model}]")
- self.set_model_name(model)
-
- async def start(self, frame: StartFrame):
- await super().start(frame)
- if self.sample_rate != self.OPENAI_SAMPLE_RATE:
- logger.warning(
- f"OpenAI TTS requires {self.OPENAI_SAMPLE_RATE}Hz sample rate. "
- f"Current rate of {self.sample_rate}Hz may cause issues."
- )
-
- async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
- logger.debug(f"Generating TTS: [{text}]")
- try:
- await self.start_ttfb_metrics()
-
- async with self._client.audio.speech.with_streaming_response.create(
- input=text or " ", # Text must contain at least one character
- model=self.model_name,
- voice=VALID_VOICES[self._voice_id],
- response_format="pcm",
- ) as r:
- if r.status_code != 200:
- error = await r.text()
- logger.error(
- f"{self} error getting audio (status: {r.status_code}, error: {error})"
- )
- yield ErrorFrame(
- f"Error getting audio (status: {r.status_code}, error: {error})"
- )
- return
-
- await self.start_tts_usage_metrics(text)
-
- CHUNK_SIZE = 1024
-
- yield TTSStartedFrame()
- async for chunk in r.iter_bytes(CHUNK_SIZE):
- if len(chunk) > 0:
- await self.stop_ttfb_metrics()
- frame = TTSAudioRawFrame(chunk, self.sample_rate, 1)
- yield frame
- yield TTSStoppedFrame()
- except BadRequestError as e:
- logger.exception(f"{self} error generating TTS: {e}")
-
-
- # internal use only -- todo: refactor
- @dataclass
- class OpenAIImageMessageFrame(Frame):
- user_image_raw_frame: UserImageRawFrame
- text: Optional[str] = None
-
-
- class OpenAIUserContextAggregator(LLMUserContextAggregator):
- def __init__(self, context: OpenAILLMContext, **kwargs):
- super().__init__(context=context, **kwargs)
-
- async def process_frame(self, frame, direction):
- await super().process_frame(frame, direction)
- # Our parent method has already called push_frame(). So we can't interrupt the
- # flow here and we don't need to call push_frame() ourselves.
- try:
- if isinstance(frame, UserImageRequestFrame):
- # The LLM sends a UserImageRequestFrame upstream. Cache any context provided with
- # that frame so we can use it when we assemble the image message in the assistant
- # context aggregator.
- if frame.context:
- if isinstance(frame.context, str):
- self._context._user_image_request_context[frame.user_id] = frame.context
- else:
- self.logger.error(
- f"Unexpected UserImageRequestFrame context type: {type(frame.context)}"
- )
- del self._context._user_image_request_context[frame.user_id]
- else:
- if frame.user_id in self._context._user_image_request_context:
- del self._context._user_image_request_context[frame.user_id]
- elif isinstance(frame, UserImageRawFrame):
- # Push a new OpenAIImageMessageFrame with the text context we cached
- # downstream to be handled by our assistant context aggregator. This is
- # necessary so that we add the message to the context in the right order.
- text = self._context._user_image_request_context.get(frame.user_id) or ""
- if text:
- del self._context._user_image_request_context[frame.user_id]
- frame = OpenAIImageMessageFrame(user_image_raw_frame=frame, text=text)
- await self.push_frame(frame)
- except Exception as e:
- self.logger.error(f"Error processing frame: {e}")
-
-
- class OpenAIAssistantContextAggregator(LLMAssistantContextAggregator):
- def __init__(self, context: OpenAILLMContext, **kwargs):
- super().__init__(context=context, **kwargs)
- self._function_calls_in_progress = {}
- self._function_call_result = None
- self._pending_image_frame_message = None
-
- async def process_frame(self, frame, direction):
- await super().process_frame(frame, direction)
- # See note above about not calling push_frame() here.
- if isinstance(frame, StartInterruptionFrame):
- self._function_calls_in_progress.clear()
- self._function_call_finished = None
- elif isinstance(frame, FunctionCallInProgressFrame):
- self.logger.debug(f"FunctionCallInProgressFrame: {frame}")
- self._function_calls_in_progress[frame.tool_call_id] = frame
- elif isinstance(frame, FunctionCallResultFrame):
- self.logger.debug(f"FunctionCallResultFrame: {frame}")
- if frame.tool_call_id in self._function_calls_in_progress:
- del self._function_calls_in_progress[frame.tool_call_id]
- self._function_call_result = frame
- # TODO-CB: Kwin wants us to refactor this out of here but I REFUSE
- await self.push_aggregation()
- else:
- self.logger.warning(
- "FunctionCallResultFrame tool_call_id does not match any function call in progress"
- )
- self._function_call_result = None
- elif isinstance(frame, OpenAIImageMessageFrame):
- self._pending_image_frame_message = frame
- await self.push_aggregation()
-
- async def push_aggregation(self):
- if not (
- self._aggregation or self._function_call_result or self._pending_image_frame_message
- ):
- return
-
- run_llm = False
- properties: Optional[FunctionCallResultProperties] = None
-
- aggregation = self._aggregation.strip()
- self.reset()
-
- try:
- if aggregation:
- self._context.add_message({"role": "assistant", "content": aggregation})
-
- if self._function_call_result:
- frame = self._function_call_result
- properties = frame.properties
- self._function_call_result = None
- if frame.result:
- self._context.add_message(
- {
- "role": "assistant",
- "tool_calls": [
- {
- "id": frame.tool_call_id,
- "function": {
- "name": frame.function_name,
- "arguments": json.dumps(frame.arguments),
- },
- "type": "function",
- }
- ],
- }
- )
- self._context.add_message(
- {
- "role": "tool",
- "content": json.dumps(frame.result),
- "tool_call_id": frame.tool_call_id,
- }
- )
- if properties and properties.run_llm is not None:
- # If the tool call result has a run_llm property, use it
- run_llm = properties.run_llm
- else:
- # Default behavior is to run the LLM if there are no function calls in progress
- run_llm = not bool(self._function_calls_in_progress)
-
- if self._pending_image_frame_message:
- frame = self._pending_image_frame_message
- self._pending_image_frame_message = None
- self._context.add_image_frame_message(
- format=frame.user_image_raw_frame.format,
- size=frame.user_image_raw_frame.size,
- image=frame.user_image_raw_frame.image,
- text=frame.text,
- )
- run_llm = True
-
- if run_llm:
- await self.push_context_frame(FrameDirection.UPSTREAM)
-
- # Emit the on_context_updated callback once the function call result is added to the context
- if properties and properties.on_context_updated is not None:
- await properties.on_context_updated()
-
- # Push context frame
- await self.push_context_frame()
-
- # Push timestamp frame with current time
- timestamp_frame = OpenAILLMContextAssistantTimestampFrame(timestamp=time_now_iso8601())
- await self.push_frame(timestamp_frame)
-
- except Exception as e:
- self.logger.error(f"Error processing frame: {e}")
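
The deleted pipecat/services/openai.py above was a flat module whose classes duplicate functionality that the package already ships under the pipecat.services.openai package: the Azure hunk earlier keeps importing OpenAILLMService from pipecat.services.openai.llm as an unchanged context line, so that import path is unaffected by this removal. Imports presumably should target the package submodule; only the llm import below is confirmed by this diff:

# Unaffected by the removal of the flat pipecat/services/openai.py module:
# pipecat/services/azure/llm.py continues to import the class from the
# package's llm submodule, as shown in the hunk above.
from pipecat.services.openai.llm import OpenAILLMService
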