letta-nightly 0.11.4.dev20250825104222__py3-none-any.whl → 0.11.5__py3-none-any.whl

This diff reflects the changes between two publicly released versions of the package as published to a supported public registry, and is provided for informational purposes only.
Files changed (68)
  1. letta/__init__.py +1 -1
  2. letta/agent.py +9 -3
  3. letta/agents/base_agent.py +2 -2
  4. letta/agents/letta_agent.py +56 -45
  5. letta/agents/voice_agent.py +2 -2
  6. letta/data_sources/redis_client.py +146 -1
  7. letta/errors.py +4 -0
  8. letta/functions/function_sets/files.py +2 -2
  9. letta/functions/mcp_client/types.py +30 -6
  10. letta/functions/schema_generator.py +46 -1
  11. letta/functions/schema_validator.py +17 -2
  12. letta/functions/types.py +1 -1
  13. letta/helpers/tool_execution_helper.py +0 -2
  14. letta/llm_api/anthropic_client.py +27 -5
  15. letta/llm_api/deepseek_client.py +97 -0
  16. letta/llm_api/groq_client.py +79 -0
  17. letta/llm_api/helpers.py +0 -1
  18. letta/llm_api/llm_api_tools.py +2 -113
  19. letta/llm_api/llm_client.py +21 -0
  20. letta/llm_api/llm_client_base.py +11 -9
  21. letta/llm_api/openai_client.py +3 -0
  22. letta/llm_api/xai_client.py +85 -0
  23. letta/prompts/prompt_generator.py +190 -0
  24. letta/schemas/agent_file.py +17 -2
  25. letta/schemas/file.py +24 -1
  26. letta/schemas/job.py +2 -0
  27. letta/schemas/letta_message.py +2 -0
  28. letta/schemas/letta_request.py +22 -0
  29. letta/schemas/message.py +10 -1
  30. letta/schemas/providers/bedrock.py +1 -0
  31. letta/server/rest_api/redis_stream_manager.py +300 -0
  32. letta/server/rest_api/routers/v1/agents.py +129 -7
  33. letta/server/rest_api/routers/v1/folders.py +15 -5
  34. letta/server/rest_api/routers/v1/runs.py +101 -11
  35. letta/server/rest_api/routers/v1/sources.py +21 -53
  36. letta/server/rest_api/routers/v1/telemetry.py +14 -4
  37. letta/server/rest_api/routers/v1/tools.py +2 -2
  38. letta/server/rest_api/streaming_response.py +3 -24
  39. letta/server/server.py +0 -1
  40. letta/services/agent_manager.py +2 -2
  41. letta/services/agent_serialization_manager.py +129 -32
  42. letta/services/file_manager.py +111 -6
  43. letta/services/file_processor/file_processor.py +5 -2
  44. letta/services/files_agents_manager.py +60 -0
  45. letta/services/helpers/agent_manager_helper.py +4 -205
  46. letta/services/helpers/tool_parser_helper.py +6 -3
  47. letta/services/mcp/base_client.py +7 -1
  48. letta/services/mcp/sse_client.py +7 -2
  49. letta/services/mcp/stdio_client.py +5 -0
  50. letta/services/mcp/streamable_http_client.py +11 -2
  51. letta/services/mcp_manager.py +31 -30
  52. letta/services/source_manager.py +26 -1
  53. letta/services/summarizer/summarizer.py +21 -10
  54. letta/services/tool_executor/files_tool_executor.py +13 -9
  55. letta/services/tool_executor/mcp_tool_executor.py +3 -0
  56. letta/services/tool_executor/tool_execution_manager.py +13 -0
  57. letta/services/tool_manager.py +43 -20
  58. letta/settings.py +1 -0
  59. letta/utils.py +37 -0
  60. {letta_nightly-0.11.4.dev20250825104222.dist-info → letta_nightly-0.11.5.dist-info}/METADATA +2 -2
  61. {letta_nightly-0.11.4.dev20250825104222.dist-info → letta_nightly-0.11.5.dist-info}/RECORD +64 -63
  62. letta/functions/mcp_client/__init__.py +0 -0
  63. letta/functions/mcp_client/base_client.py +0 -156
  64. letta/functions/mcp_client/sse_client.py +0 -51
  65. letta/functions/mcp_client/stdio_client.py +0 -109
  66. {letta_nightly-0.11.4.dev20250825104222.dist-info → letta_nightly-0.11.5.dist-info}/LICENSE +0 -0
  67. {letta_nightly-0.11.4.dev20250825104222.dist-info → letta_nightly-0.11.5.dist-info}/WHEEL +0 -0
  68. {letta_nightly-0.11.4.dev20250825104222.dist-info → letta_nightly-0.11.5.dist-info}/entry_points.txt +0 -0
letta/llm_api/anthropic_client.py CHANGED
@@ -287,12 +287,34 @@ class AnthropicClient(LLMClientBase):
         else:
             anthropic_tools = None
 
+        thinking_enabled = False
+        if messages and len(messages) > 0:
+            # Check if the last assistant message starts with a thinking block
+            # Find the last assistant message
+            last_assistant_message = None
+            for message in reversed(messages):
+                if message.get("role") == "assistant":
+                    last_assistant_message = message
+                    break
+
+            if (
+                last_assistant_message
+                and isinstance(last_assistant_message.get("content"), list)
+                and len(last_assistant_message["content"]) > 0
+                and last_assistant_message["content"][0].get("type") == "thinking"
+            ):
+                thinking_enabled = True
+
         try:
-            result = await client.beta.messages.count_tokens(
-                model=model or "claude-3-7-sonnet-20250219",
-                messages=messages or [{"role": "user", "content": "hi"}],
-                tools=anthropic_tools or [],
-            )
+            count_params = {
+                "model": model or "claude-3-7-sonnet-20250219",
+                "messages": messages or [{"role": "user", "content": "hi"}],
+                "tools": anthropic_tools or [],
+            }
+
+            if thinking_enabled:
+                count_params["thinking"] = {"type": "enabled", "budget_tokens": 16000}
+            result = await client.beta.messages.count_tokens(**count_params)
         except:
             raise
 
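A note on the hunk above: the `thinking` parameter is only added to the token-count request when the most recent assistant message begins with a thinking content block, and the 16000-token budget is the value hard-coded in the diff. A minimal, self-contained sketch of that detection, assuming plain Anthropic-format message dicts:

def thinking_params_if_needed(messages: list) -> dict | None:
    """Return a `thinking` params dict when the last assistant message starts with a thinking block."""
    last_assistant = next((m for m in reversed(messages) if m.get("role") == "assistant"), None)
    content = last_assistant.get("content") if last_assistant else None
    if isinstance(content, list) and content and content[0].get("type") == "thinking":
        return {"type": "enabled", "budget_tokens": 16000}
    return None

# Hypothetical usage mirroring count_tokens() above:
messages = [
    {"role": "user", "content": "hi"},
    {"role": "assistant", "content": [{"type": "thinking", "thinking": "...", "signature": "..."}]},
]
assert thinking_params_if_needed(messages) == {"type": "enabled", "budget_tokens": 16000}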
letta/llm_api/deepseek_client.py ADDED
@@ -0,0 +1,97 @@
+import os
+from typing import List, Optional
+
+from openai import AsyncOpenAI, AsyncStream, OpenAI
+from openai.types.chat.chat_completion import ChatCompletion
+from openai.types.chat.chat_completion_chunk import ChatCompletionChunk
+
+from letta.llm_api.deepseek import convert_deepseek_response_to_chatcompletion, map_messages_to_deepseek_format
+from letta.llm_api.openai_client import OpenAIClient
+from letta.otel.tracing import trace_method
+from letta.schemas.llm_config import LLMConfig
+from letta.schemas.message import Message as PydanticMessage
+from letta.schemas.openai.chat_completion_response import ChatCompletionResponse
+from letta.settings import model_settings
+
+
+class DeepseekClient(OpenAIClient):
+
+    def requires_auto_tool_choice(self, llm_config: LLMConfig) -> bool:
+        return False
+
+    def supports_structured_output(self, llm_config: LLMConfig) -> bool:
+        return False
+
+    @trace_method
+    def build_request_data(
+        self,
+        messages: List[PydanticMessage],
+        llm_config: LLMConfig,
+        tools: Optional[List[dict]] = None,
+        force_tool_call: Optional[str] = None,
+    ) -> dict:
+        # Override put_inner_thoughts_in_kwargs to False for DeepSeek
+        llm_config.put_inner_thoughts_in_kwargs = False
+
+        data = super().build_request_data(messages, llm_config, tools, force_tool_call)
+
+        def add_functions_to_system_message(system_message: ChatMessage):
+            system_message.content += f"<available functions> {''.join(json.dumps(f) for f in functions)} </available functions>"
+            system_message.content += 'Select best function to call simply respond with a single json block with the fields "name" and "arguments". Use double quotes around the arguments.'
+
+        if llm_config.model == "deepseek-reasoner":  # R1 currently doesn't support function calling natively
+            add_functions_to_system_message(
+                data["messages"][0]
+            )  # Inject additional instructions to the system prompt with the available functions
+
+        data["messages"] = map_messages_to_deepseek_format(data["messages"])
+
+        return data
+
+    @trace_method
+    def request(self, request_data: dict, llm_config: LLMConfig) -> dict:
+        """
+        Performs underlying synchronous request to OpenAI API and returns raw response dict.
+        """
+        api_key = model_settings.deepseek_api_key or os.environ.get("DEEPSEEK_API_KEY")
+        client = OpenAI(api_key=api_key, base_url=llm_config.model_endpoint)
+
+        response: ChatCompletion = client.chat.completions.create(**request_data)
+        return response.model_dump()
+
+    @trace_method
+    async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict:
+        """
+        Performs underlying asynchronous request to OpenAI API and returns raw response dict.
+        """
+        api_key = model_settings.deepseek_api_key or os.environ.get("DEEPSEEK_API_KEY")
+        client = AsyncOpenAI(api_key=api_key, base_url=llm_config.model_endpoint)
+
+        response: ChatCompletion = await client.chat.completions.create(**request_data)
+        return response.model_dump()
+
+    @trace_method
+    async def stream_async(self, request_data: dict, llm_config: LLMConfig) -> AsyncStream[ChatCompletionChunk]:
+        """
+        Performs underlying asynchronous streaming request to OpenAI and returns the async stream iterator.
+        """
+        api_key = model_settings.deepseek_api_key or os.environ.get("DEEPSEEK_API_KEY")
+        client = AsyncOpenAI(api_key=api_key, base_url=llm_config.model_endpoint)
+        response_stream: AsyncStream[ChatCompletionChunk] = await client.chat.completions.create(
+            **request_data, stream=True, stream_options={"include_usage": True}
+        )
+        return response_stream
+
+    @trace_method
+    def convert_response_to_chat_completion(
+        self,
+        response_data: dict,
+        input_messages: List[PydanticMessage],  # Included for consistency, maybe used later
+        llm_config: LLMConfig,
+    ) -> ChatCompletionResponse:
+        """
+        Converts raw OpenAI response dict into the ChatCompletionResponse Pydantic model.
+        Handles potential extraction of inner thoughts if they were added via kwargs.
+        """
+        response = ChatCompletionResponse(**response_data)
+        return convert_deepseek_response_to_chatcompletion(response)
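One caveat on the file above: the inner `add_functions_to_system_message` helper references `ChatMessage`, `json`, and `functions`, none of which are imported or defined in the file as shown. A self-contained sketch of the same deepseek-reasoner prompt-injection idea, assuming plain OpenAI-style message dicts instead of letta's message objects:

import json

def inject_functions_into_system_prompt(system_message: dict, functions: list) -> None:
    """Since deepseek-reasoner (R1) lacks native function calling, append the tool schemas
    and calling instructions to the system prompt, as the diff above does."""
    system_message["content"] += f"<available functions> {''.join(json.dumps(f) for f in functions)} </available functions>"
    system_message["content"] += (
        'Select best function to call simply respond with a single json block with the fields '
        '"name" and "arguments". Use double quotes around the arguments.'
    )

# Hypothetical OpenAI-style payload:
data = {"messages": [{"role": "system", "content": "You are a helpful agent."}]}
inject_functions_into_system_prompt(data["messages"][0], [{"name": "send_message", "parameters": {"type": "object"}}])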
letta/llm_api/groq_client.py ADDED
@@ -0,0 +1,79 @@
+import os
+from typing import List, Optional
+
+from openai import AsyncOpenAI, AsyncStream, OpenAI
+from openai.types.chat.chat_completion import ChatCompletion
+from openai.types.chat.chat_completion_chunk import ChatCompletionChunk
+
+from letta.llm_api.openai_client import OpenAIClient
+from letta.otel.tracing import trace_method
+from letta.schemas.embedding_config import EmbeddingConfig
+from letta.schemas.llm_config import LLMConfig
+from letta.schemas.message import Message as PydanticMessage
+from letta.settings import model_settings
+
+
+class GroqClient(OpenAIClient):
+
+    def requires_auto_tool_choice(self, llm_config: LLMConfig) -> bool:
+        return False
+
+    def supports_structured_output(self, llm_config: LLMConfig) -> bool:
+        return True
+
+    @trace_method
+    def build_request_data(
+        self,
+        messages: List[PydanticMessage],
+        llm_config: LLMConfig,
+        tools: Optional[List[dict]] = None,
+        force_tool_call: Optional[str] = None,
+    ) -> dict:
+        data = super().build_request_data(messages, llm_config, tools, force_tool_call)
+
+        # Groq validation - these fields are not supported and will cause 400 errors
+        # https://console.groq.com/docs/openai
+        if "top_logprobs" in data:
+            del data["top_logprobs"]
+        if "logit_bias" in data:
+            del data["logit_bias"]
+        data["logprobs"] = False
+        data["n"] = 1
+
+        return data
+
+    @trace_method
+    def request(self, request_data: dict, llm_config: LLMConfig) -> dict:
+        """
+        Performs underlying synchronous request to Groq API and returns raw response dict.
+        """
+        api_key = model_settings.groq_api_key or os.environ.get("GROQ_API_KEY")
+        client = OpenAI(api_key=api_key, base_url=llm_config.model_endpoint)
+
+        response: ChatCompletion = client.chat.completions.create(**request_data)
+        return response.model_dump()
+
+    @trace_method
+    async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict:
+        """
+        Performs underlying asynchronous request to Groq API and returns raw response dict.
+        """
+        api_key = model_settings.groq_api_key or os.environ.get("GROQ_API_KEY")
+        client = AsyncOpenAI(api_key=api_key, base_url=llm_config.model_endpoint)
+
+        response: ChatCompletion = await client.chat.completions.create(**request_data)
+        return response.model_dump()
+
+    @trace_method
+    async def request_embeddings(self, inputs: List[str], embedding_config: EmbeddingConfig) -> List[List[float]]:
+        """Request embeddings given texts and embedding config"""
+        api_key = model_settings.groq_api_key or os.environ.get("GROQ_API_KEY")
+        client = AsyncOpenAI(api_key=api_key, base_url=embedding_config.embedding_endpoint)
+        response = await client.embeddings.create(model=embedding_config.embedding_model, input=inputs)
+
+        # TODO: add total usage
+        return [r.embedding for r in response.data]
+
+    @trace_method
+    async def stream_async(self, request_data: dict, llm_config: LLMConfig) -> AsyncStream[ChatCompletionChunk]:
+        raise NotImplementedError("Streaming not supported for Groq.")
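Where the removed llm_api_tools.py path asserted that unsupported fields were absent (see the deleted hunk further below), GroqClient now strips them before sending. A standalone sketch of that sanitization on a plain OpenAI-style request dict; the payload below is hypothetical:

def sanitize_for_groq(request_data: dict) -> dict:
    """Mirror of GroqClient.build_request_data above: drop fields the Groq OpenAI-compatible
    endpoint rejects with a 400, and pin logprobs/n to supported values."""
    request_data.pop("top_logprobs", None)
    request_data.pop("logit_bias", None)
    request_data["logprobs"] = False
    request_data["n"] = 1
    return request_data

# Hypothetical payload:
payload = {"model": "llama-3.3-70b-versatile", "messages": [{"role": "user", "content": "hi"}], "logit_bias": {"50256": -100}}
payload = sanitize_for_groq(payload)
assert "logit_bias" not in payload and payload["n"] == 1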
letta/llm_api/helpers.py CHANGED
@@ -133,7 +133,6 @@ def convert_to_structured_output(openai_function: dict, allow_optional: bool = F
         structured_output["parameters"]["required"] = list(structured_output["parameters"]["properties"].keys())
     else:
         raise NotImplementedError("Optional parameter handling is not implemented.")
-
     return structured_output
 
 
letta/llm_api/llm_api_tools.py CHANGED
@@ -8,7 +8,7 @@ import requests
 from letta.constants import CLI_WARNING_PREFIX
 from letta.errors import LettaConfigurationError, RateLimitExceededError
 from letta.llm_api.deepseek import build_deepseek_chat_completions_request, convert_deepseek_response_to_chatcompletion
-from letta.llm_api.helpers import add_inner_thoughts_to_functions, unpack_all_inner_thoughts_from_kwargs
+from letta.llm_api.helpers import unpack_all_inner_thoughts_from_kwargs
 from letta.llm_api.openai import (
     build_openai_chat_completions_request,
     openai_chat_completions_process_stream,
@@ -16,14 +16,13 @@ from letta.llm_api.openai import (
     prepare_openai_payload,
 )
 from letta.local_llm.chat_completion_proxy import get_chat_completion
-from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION
+from letta.local_llm.constants import INNER_THOUGHTS_KWARG
 from letta.local_llm.utils import num_tokens_from_functions, num_tokens_from_messages
 from letta.orm.user import User
 from letta.otel.tracing import log_event, trace_method
 from letta.schemas.enums import ProviderCategory
 from letta.schemas.llm_config import LLMConfig
 from letta.schemas.message import Message
-from letta.schemas.openai.chat_completion_request import ChatCompletionRequest
 from letta.schemas.openai.chat_completion_response import ChatCompletionResponse
 from letta.schemas.provider_trace import ProviderTraceCreate
 from letta.services.telemetry_manager import TelemetryManager
@@ -246,116 +245,6 @@ def create(
 
         return response
 
-    elif llm_config.model_endpoint_type == "xai":
-        api_key = model_settings.xai_api_key
-
-        if function_call is None and functions is not None and len(functions) > 0:
-            # force function calling for reliability, see https://platform.openai.com/docs/api-reference/chat/create#chat-create-tool_choice
-            function_call = "required"
-
-        data = build_openai_chat_completions_request(
-            llm_config,
-            messages,
-            user_id,
-            functions,
-            function_call,
-            use_tool_naming,
-            put_inner_thoughts_first=put_inner_thoughts_first,
-            use_structured_output=False,  # NOTE: not supported atm for xAI
-        )
-
-        # Specific bug for the mini models (as of Apr 14, 2025)
-        # 400 - {'code': 'Client specified an invalid argument', 'error': 'Argument not supported on this model: presencePenalty'}
-        # 400 - {'code': 'Client specified an invalid argument', 'error': 'Argument not supported on this model: frequencyPenalty'}
-        if "grok-3-mini-" in llm_config.model:
-            data.presence_penalty = None
-            data.frequency_penalty = None
-
-        if stream:  # Client requested token streaming
-            data.stream = True
-            assert isinstance(stream_interface, AgentChunkStreamingInterface) or isinstance(
-                stream_interface, AgentRefreshStreamingInterface
-            ), type(stream_interface)
-            response = openai_chat_completions_process_stream(
-                url=llm_config.model_endpoint,
-                api_key=api_key,
-                chat_completion_request=data,
-                stream_interface=stream_interface,
-                name=name,
-                # TODO turn on to support reasoning content from xAI reasoners:
-                # https://docs.x.ai/docs/guides/reasoning#reasoning
-                expect_reasoning_content=False,
-            )
-        else:  # Client did not request token streaming (expect a blocking backend response)
-            data.stream = False
-            if isinstance(stream_interface, AgentChunkStreamingInterface):
-                stream_interface.stream_start()
-            try:
-                response = openai_chat_completions_request(
-                    url=llm_config.model_endpoint,
-                    api_key=api_key,
-                    chat_completion_request=data,
-                )
-            finally:
-                if isinstance(stream_interface, AgentChunkStreamingInterface):
-                    stream_interface.stream_end()
-
-        if llm_config.put_inner_thoughts_in_kwargs:
-            response = unpack_all_inner_thoughts_from_kwargs(response=response, inner_thoughts_key=INNER_THOUGHTS_KWARG)
-
-        return response
-
-    elif llm_config.model_endpoint_type == "groq":
-        if stream:
-            raise NotImplementedError("Streaming not yet implemented for Groq.")
-
-        if model_settings.groq_api_key is None and llm_config.model_endpoint == "https://api.groq.com/openai/v1/chat/completions":
-            raise LettaConfigurationError(message="Groq key is missing from letta config file", missing_fields=["groq_api_key"])
-
-        # force to true for groq, since they don't support 'content' is non-null
-        if llm_config.put_inner_thoughts_in_kwargs:
-            functions = add_inner_thoughts_to_functions(
-                functions=functions,
-                inner_thoughts_key=INNER_THOUGHTS_KWARG,
-                inner_thoughts_description=INNER_THOUGHTS_KWARG_DESCRIPTION,
-            )
-
-        tools = [{"type": "function", "function": f} for f in functions] if functions is not None else None
-        data = ChatCompletionRequest(
-            model=llm_config.model,
-            messages=[m.to_openai_dict(put_inner_thoughts_in_kwargs=llm_config.put_inner_thoughts_in_kwargs) for m in messages],
-            tools=tools,
-            tool_choice=function_call,
-            user=str(user_id),
-        )
-
-        # https://console.groq.com/docs/openai
-        # "The following fields are currently not supported and will result in a 400 error (yikes) if they are supplied:"
-        assert data.top_logprobs is None
-        assert data.logit_bias is None
-        assert data.logprobs == False
-        assert data.n == 1
-        # They mention that none of the messages can have names, but it seems to not error out (for now)
-
-        data.stream = False
-        if isinstance(stream_interface, AgentChunkStreamingInterface):
-            stream_interface.stream_start()
-        try:
-            # groq uses the openai chat completions API, so this component should be reusable
-            response = openai_chat_completions_request(
-                url=llm_config.model_endpoint,
-                api_key=model_settings.groq_api_key,
-                chat_completion_request=data,
-            )
-        finally:
-            if isinstance(stream_interface, AgentChunkStreamingInterface):
-                stream_interface.stream_end()
-
-        if llm_config.put_inner_thoughts_in_kwargs:
-            response = unpack_all_inner_thoughts_from_kwargs(response=response, inner_thoughts_key=INNER_THOUGHTS_KWARG)
-
-        return response
-
     elif llm_config.model_endpoint_type == "deepseek":
         if model_settings.deepseek_api_key is None and llm_config.model_endpoint == "":
             # only is a problem if we are *not* using an openai proxy
letta/llm_api/llm_client.py CHANGED
@@ -79,5 +79,26 @@ class LLMClient:
                     put_inner_thoughts_first=put_inner_thoughts_first,
                     actor=actor,
                 )
+            case ProviderType.xai:
+                from letta.llm_api.xai_client import XAIClient
+
+                return XAIClient(
+                    put_inner_thoughts_first=put_inner_thoughts_first,
+                    actor=actor,
+                )
+            case ProviderType.groq:
+                from letta.llm_api.groq_client import GroqClient
+
+                return GroqClient(
+                    put_inner_thoughts_first=put_inner_thoughts_first,
+                    actor=actor,
+                )
+            case ProviderType.deepseek:
+                from letta.llm_api.deepseek_client import DeepseekClient
+
+                return DeepseekClient(
+                    put_inner_thoughts_first=put_inner_thoughts_first,
+                    actor=actor,
+                )
             case _:
                 return None
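With the dispatch above, xAI, Groq, and DeepSeek each get a dedicated OpenAI-compatible client class. A hedged sketch of driving one of them directly, using only the constructor kwargs and methods visible in this diff; the LLMConfig values and the hand-built payload are illustrative, and GROQ_API_KEY is assumed to be set in the environment:

import asyncio

from letta.llm_api.groq_client import GroqClient
from letta.schemas.llm_config import LLMConfig

async def main() -> None:
    # Constructor kwargs mirror the factory cases above; actor=None is an assumption.
    client = GroqClient(put_inner_thoughts_first=True, actor=None)
    llm_config = LLMConfig(
        model="llama-3.3-70b-versatile",                  # illustrative model name
        model_endpoint_type="groq",
        model_endpoint="https://api.groq.com/openai/v1",  # illustrative base URL
        context_window=8192,
    )
    # Hand-built OpenAI-style payload; in letta this would normally come from build_request_data().
    request_data = {"model": llm_config.model, "messages": [{"role": "user", "content": "hi"}]}
    response = await client.request_async(request_data, llm_config)
    print(response["choices"][0]["message"])

asyncio.run(main())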
letta/llm_api/llm_client_base.py CHANGED
@@ -15,6 +15,7 @@ from letta.schemas.message import Message
 from letta.schemas.openai.chat_completion_response import ChatCompletionResponse
 from letta.schemas.provider_trace import ProviderTraceCreate
 from letta.services.telemetry_manager import TelemetryManager
+from letta.settings import settings
 
 if TYPE_CHECKING:
     from letta.orm import User
@@ -90,15 +91,16 @@ class LLMClientBase:
         try:
             log_event(name="llm_request_sent", attributes=request_data)
             response_data = await self.request_async(request_data, llm_config)
-            await telemetry_manager.create_provider_trace_async(
-                actor=self.actor,
-                provider_trace_create=ProviderTraceCreate(
-                    request_json=request_data,
-                    response_json=response_data,
-                    step_id=step_id,
-                    organization_id=self.actor.organization_id,
-                ),
-            )
+            if settings.track_provider_trace and telemetry_manager:
+                await telemetry_manager.create_provider_trace_async(
+                    actor=self.actor,
+                    provider_trace_create=ProviderTraceCreate(
+                        request_json=request_data,
+                        response_json=response_data,
+                        step_id=step_id,
+                        organization_id=self.actor.organization_id,
+                    ),
+                )
 
             log_event(name="llm_response_received", attributes=response_data)
         except Exception as e:
letta/llm_api/openai_client.py CHANGED
@@ -146,6 +146,9 @@ class OpenAIClient(LLMClientBase):
     def requires_auto_tool_choice(self, llm_config: LLMConfig) -> bool:
         return requires_auto_tool_choice(llm_config)
 
+    def supports_structured_output(self, llm_config: LLMConfig) -> bool:
+        return supports_structured_output(llm_config)
+
     @trace_method
     def build_request_data(
         self,
letta/llm_api/xai_client.py ADDED
@@ -0,0 +1,85 @@
+import os
+from typing import List, Optional
+
+from openai import AsyncOpenAI, AsyncStream, OpenAI
+from openai.types.chat.chat_completion import ChatCompletion
+from openai.types.chat.chat_completion_chunk import ChatCompletionChunk
+
+from letta.llm_api.openai_client import OpenAIClient
+from letta.otel.tracing import trace_method
+from letta.schemas.embedding_config import EmbeddingConfig
+from letta.schemas.llm_config import LLMConfig
+from letta.schemas.message import Message as PydanticMessage
+from letta.settings import model_settings
+
+
+class XAIClient(OpenAIClient):
+
+    def requires_auto_tool_choice(self, llm_config: LLMConfig) -> bool:
+        return False
+
+    def supports_structured_output(self, llm_config: LLMConfig) -> bool:
+        return False
+
+    @trace_method
+    def build_request_data(
+        self,
+        messages: List[PydanticMessage],
+        llm_config: LLMConfig,
+        tools: Optional[List[dict]] = None,
+        force_tool_call: Optional[str] = None,
+    ) -> dict:
+        data = super().build_request_data(messages, llm_config, tools, force_tool_call)
+
+        # Specific bug for the mini models (as of Apr 14, 2025)
+        # 400 - {'code': 'Client specified an invalid argument', 'error': 'Argument not supported on this model: presencePenalty'}
+        # 400 - {'code': 'Client specified an invalid argument', 'error': 'Argument not supported on this model: frequencyPenalty'}
+        if "grok-3-mini-" in llm_config.model:
+            data.pop("presence_penalty", None)
+            data.pop("frequency_penalty", None)
+
+        return data
+
+    @trace_method
+    def request(self, request_data: dict, llm_config: LLMConfig) -> dict:
+        """
+        Performs underlying synchronous request to OpenAI API and returns raw response dict.
+        """
+        api_key = model_settings.xai_api_key or os.environ.get("XAI_API_KEY")
+        client = OpenAI(api_key=api_key, base_url=llm_config.model_endpoint)
+
+        response: ChatCompletion = client.chat.completions.create(**request_data)
+        return response.model_dump()
+
+    @trace_method
+    async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict:
+        """
+        Performs underlying asynchronous request to OpenAI API and returns raw response dict.
+        """
+        api_key = model_settings.xai_api_key or os.environ.get("XAI_API_KEY")
+        client = AsyncOpenAI(api_key=api_key, base_url=llm_config.model_endpoint)
+
+        response: ChatCompletion = await client.chat.completions.create(**request_data)
+        return response.model_dump()
+
+    @trace_method
+    async def stream_async(self, request_data: dict, llm_config: LLMConfig) -> AsyncStream[ChatCompletionChunk]:
+        """
+        Performs underlying asynchronous streaming request to OpenAI and returns the async stream iterator.
+        """
+        api_key = model_settings.xai_api_key or os.environ.get("XAI_API_KEY")
+        client = AsyncOpenAI(api_key=api_key, base_url=llm_config.model_endpoint)
+        response_stream: AsyncStream[ChatCompletionChunk] = await client.chat.completions.create(
+            **request_data, stream=True, stream_options={"include_usage": True}
+        )
+        return response_stream
+
+    @trace_method
+    async def request_embeddings(self, inputs: List[str], embedding_config: EmbeddingConfig) -> List[List[float]]:
+        """Request embeddings given texts and embedding config"""
+        api_key = model_settings.xai_api_key or os.environ.get("XAI_API_KEY")
+        client = AsyncOpenAI(api_key=api_key, base_url=embedding_config.embedding_endpoint)
+        response = await client.embeddings.create(model=embedding_config.embedding_model, input=inputs)
+
+        # TODO: add total usage
+        return [r.embedding for r in response.data]
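Compared with the deleted llm_api_tools.py path, which set `data.presence_penalty = None` on a request object, the new client builds a plain dict, so the unsupported grok-3-mini fields are removed outright. A small sketch of that behavior; the payload is hypothetical:

def strip_unsupported_grok_mini_params(request_data: dict, model: str) -> dict:
    """Mirror of XAIClient.build_request_data above: grok-3-mini models reject
    presence_penalty / frequency_penalty, so drop them from the request dict."""
    if "grok-3-mini-" in model:
        request_data.pop("presence_penalty", None)
        request_data.pop("frequency_penalty", None)
    return request_data

# Hypothetical payload:
payload = {"model": "grok-3-mini-beta", "messages": [{"role": "user", "content": "hi"}], "presence_penalty": 0.5}
payload = strip_unsupported_grok_mini_params(payload, payload["model"])
assert "presence_penalty" not in payload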