letta-nightly 0.11.3.dev20250820104219__py3-none-any.whl → 0.11.4.dev20250820213507__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- letta/__init__.py +1 -1
- letta/agents/helpers.py +4 -0
- letta/agents/letta_agent.py +142 -5
- letta/constants.py +10 -7
- letta/data_sources/connectors.py +70 -53
- letta/embeddings.py +3 -240
- letta/errors.py +28 -0
- letta/functions/function_sets/base.py +4 -4
- letta/functions/functions.py +287 -32
- letta/functions/mcp_client/types.py +11 -0
- letta/functions/schema_validator.py +187 -0
- letta/functions/typescript_parser.py +196 -0
- letta/helpers/datetime_helpers.py +8 -4
- letta/helpers/tool_execution_helper.py +25 -2
- letta/llm_api/anthropic_client.py +23 -18
- letta/llm_api/azure_client.py +73 -0
- letta/llm_api/bedrock_client.py +8 -4
- letta/llm_api/google_vertex_client.py +14 -5
- letta/llm_api/llm_api_tools.py +2 -217
- letta/llm_api/llm_client.py +15 -1
- letta/llm_api/llm_client_base.py +32 -1
- letta/llm_api/openai.py +1 -0
- letta/llm_api/openai_client.py +18 -28
- letta/llm_api/together_client.py +55 -0
- letta/orm/provider.py +1 -0
- letta/orm/step_metrics.py +40 -1
- letta/otel/db_pool_monitoring.py +1 -1
- letta/schemas/agent.py +3 -4
- letta/schemas/agent_file.py +2 -0
- letta/schemas/block.py +11 -5
- letta/schemas/embedding_config.py +4 -5
- letta/schemas/enums.py +1 -1
- letta/schemas/job.py +2 -3
- letta/schemas/llm_config.py +79 -7
- letta/schemas/mcp.py +0 -24
- letta/schemas/message.py +0 -108
- letta/schemas/openai/chat_completion_request.py +1 -0
- letta/schemas/providers/__init__.py +0 -2
- letta/schemas/providers/anthropic.py +106 -8
- letta/schemas/providers/azure.py +102 -8
- letta/schemas/providers/base.py +10 -3
- letta/schemas/providers/bedrock.py +28 -16
- letta/schemas/providers/letta.py +3 -3
- letta/schemas/providers/ollama.py +2 -12
- letta/schemas/providers/openai.py +4 -4
- letta/schemas/providers/together.py +14 -2
- letta/schemas/sandbox_config.py +2 -1
- letta/schemas/tool.py +46 -22
- letta/server/rest_api/routers/v1/agents.py +179 -38
- letta/server/rest_api/routers/v1/folders.py +13 -8
- letta/server/rest_api/routers/v1/providers.py +10 -3
- letta/server/rest_api/routers/v1/sources.py +14 -8
- letta/server/rest_api/routers/v1/steps.py +17 -1
- letta/server/rest_api/routers/v1/tools.py +96 -5
- letta/server/rest_api/streaming_response.py +91 -45
- letta/server/server.py +27 -38
- letta/services/agent_manager.py +92 -20
- letta/services/agent_serialization_manager.py +11 -7
- letta/services/context_window_calculator/context_window_calculator.py +40 -2
- letta/services/helpers/agent_manager_helper.py +73 -12
- letta/services/mcp_manager.py +109 -15
- letta/services/passage_manager.py +28 -109
- letta/services/provider_manager.py +24 -0
- letta/services/step_manager.py +68 -0
- letta/services/summarizer/summarizer.py +1 -4
- letta/services/tool_executor/core_tool_executor.py +1 -1
- letta/services/tool_executor/sandbox_tool_executor.py +26 -9
- letta/services/tool_manager.py +82 -5
- letta/services/tool_sandbox/base.py +3 -11
- letta/services/tool_sandbox/modal_constants.py +17 -0
- letta/services/tool_sandbox/modal_deployment_manager.py +242 -0
- letta/services/tool_sandbox/modal_sandbox.py +218 -3
- letta/services/tool_sandbox/modal_sandbox_v2.py +429 -0
- letta/services/tool_sandbox/modal_version_manager.py +273 -0
- letta/services/tool_sandbox/safe_pickle.py +193 -0
- letta/settings.py +5 -3
- letta/templates/sandbox_code_file.py.j2 +2 -4
- letta/templates/sandbox_code_file_async.py.j2 +2 -4
- letta/utils.py +1 -1
- {letta_nightly-0.11.3.dev20250820104219.dist-info → letta_nightly-0.11.4.dev20250820213507.dist-info}/METADATA +2 -2
- {letta_nightly-0.11.3.dev20250820104219.dist-info → letta_nightly-0.11.4.dev20250820213507.dist-info}/RECORD +84 -81
- letta/llm_api/anthropic.py +0 -1206
- letta/llm_api/aws_bedrock.py +0 -104
- letta/llm_api/azure_openai.py +0 -118
- letta/llm_api/azure_openai_constants.py +0 -11
- letta/llm_api/cohere.py +0 -391
- letta/schemas/providers/cohere.py +0 -18
- {letta_nightly-0.11.3.dev20250820104219.dist-info → letta_nightly-0.11.4.dev20250820213507.dist-info}/LICENSE +0 -0
- {letta_nightly-0.11.3.dev20250820104219.dist-info → letta_nightly-0.11.4.dev20250820213507.dist-info}/WHEEL +0 -0
- {letta_nightly-0.11.3.dev20250820104219.dist-info → letta_nightly-0.11.4.dev20250820213507.dist-info}/entry_points.txt +0 -0
@@ -239,7 +239,7 @@ class GoogleVertexClient(LLMClientBase):
             request_data["config"]["response_mime_type"] = "application/json"
             request_data["config"]["response_schema"] = self.get_function_call_response_schema(tools[0])
             del request_data["config"]["tools"]
-
+        elif tools:
             tool_config = ToolConfig(
                 function_calling_config=FunctionCallingConfig(
                     # ANY mode forces the model to predict only function calls
@@ -255,10 +255,13 @@ class GoogleVertexClient(LLMClientBase):
         # Otherwise, use the value from max_reasoning_tokens
         if "flash" in llm_config.model:
             # Gemini flash models may fail to call tools even with FunctionCallingConfigMode.ANY if thinking is fully disabled, set to minimum to prevent tool call failure
+            thinking_budget = llm_config.max_reasoning_tokens if llm_config.enable_reasoner else self.get_thinking_budget(llm_config.model)
+            if thinking_budget <= 0:
+                logger.error(
+                    f"Thinking budget of {thinking_budget} for Gemini reasoning model {llm_config.model}, this will likely cause tool call failures"
+                )
             thinking_config = ThinkingConfig(
-                thinking_budget=(
-                    llm_config.max_reasoning_tokens if llm_config.enable_reasoner else self.get_thinking_budget(llm_config.model)
-                ),
+                thinking_budget=(thinking_budget),
             )
             request_data["config"]["thinking_config"] = thinking_config.model_dump()
 
@@ -309,7 +312,7 @@ class GoogleVertexClient(LLMClientBase):
         if candidate.finish_reason == "MALFORMED_FUNCTION_CALL":
             raise ValueError(f"Error in response data from LLM: {candidate.finish_reason}...")
         else:
-            raise ValueError(f"Error in response data from LLM: {
+            raise ValueError(f"Error in response data from LLM: {candidate.model_dump()}")
 
         role = content.role
         assert role == "model", f"Unknown role in response: {role}"
@@ -496,6 +499,12 @@ class GoogleVertexClient(LLMClientBase):
             "required": ["name", "args"],
         }
 
+    # https://ai.google.dev/gemini-api/docs/thinking#set-budget
+    # | Model          | Default setting                                             | Range     | Disable thinking    | Turn on dynamic thinking |
+    # |----------------|-------------------------------------------------------------|-----------|---------------------|--------------------------|
+    # | 2.5 Pro        | Dynamic thinking: Model decides when and how much to think  | 128-32768 | N/A: Cannot disable | thinkingBudget = -1      |
+    # | 2.5 Flash      | Dynamic thinking: Model decides when and how much to think  | 0-24576   | thinkingBudget = 0  | thinkingBudget = -1      |
+    # | 2.5 Flash Lite | Model does not think                                         | 512-24576 | thinkingBudget = 0  | thinkingBudget = -1      |
     def get_thinking_budget(self, model: str) -> bool:
         if model_settings.gemini_force_minimum_thinking_budget:
             if all(substring in model for substring in ["2.5", "flash", "lite"]):
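The comment table added above records the documented thinking-budget ranges for Gemini 2.5 models. As a rough, hypothetical illustration of how those ranges could be applied, here is a minimal sketch in Python; the helper name and the clamping policy are assumptions for illustration, not Letta's get_thinking_budget:

    # Hypothetical helper: clamp a requested thinking budget into the documented range.
    # Ranges follow https://ai.google.dev/gemini-api/docs/thinking#set-budget; -1 means
    # dynamic thinking, 0 disables thinking where the model allows it.
    def pick_thinking_budget(model: str, requested: int | None) -> int:
        if "2.5" in model and "flash" in model and "lite" in model:
            low, high = 512, 24576   # 2.5 Flash Lite: does not think by default
        elif "2.5" in model and "flash" in model:
            low, high = 0, 24576     # 2.5 Flash: thinking can be disabled
        elif "2.5" in model and "pro" in model:
            low, high = 128, 32768   # 2.5 Pro: thinking cannot be disabled
        else:
            return requested if requested is not None else -1
        if requested is None or requested == -1:
            return -1                # let the model decide (dynamic thinking)
        return max(low, min(requested, high))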
letta/llm_api/llm_api_tools.py
CHANGED
@@ -7,13 +7,6 @@ import requests
 
 from letta.constants import CLI_WARNING_PREFIX
 from letta.errors import LettaConfigurationError, RateLimitExceededError
-from letta.llm_api.anthropic import (
-    anthropic_bedrock_chat_completions_request,
-    anthropic_chat_completions_process_stream,
-    anthropic_chat_completions_request,
-)
-from letta.llm_api.aws_bedrock import has_valid_aws_credentials
-from letta.llm_api.azure_openai import azure_openai_chat_completions_request
 from letta.llm_api.deepseek import build_deepseek_chat_completions_request, convert_deepseek_response_to_chatcompletion
 from letta.llm_api.helpers import add_inner_thoughts_to_functions, unpack_all_inner_thoughts_from_kwargs
 from letta.llm_api.openai import (
@@ -30,14 +23,14 @@ from letta.otel.tracing import log_event, trace_method
 from letta.schemas.enums import ProviderCategory
 from letta.schemas.llm_config import LLMConfig
 from letta.schemas.message import Message
-from letta.schemas.openai.chat_completion_request import ChatCompletionRequest
+from letta.schemas.openai.chat_completion_request import ChatCompletionRequest
 from letta.schemas.openai.chat_completion_response import ChatCompletionResponse
 from letta.schemas.provider_trace import ProviderTraceCreate
 from letta.services.telemetry_manager import TelemetryManager
 from letta.settings import ModelSettings
 from letta.streaming_interface import AgentChunkStreamingInterface, AgentRefreshStreamingInterface
 
-LLM_API_PROVIDER_OPTIONS = ["openai", "azure", "anthropic", "google_ai", "
+LLM_API_PROVIDER_OPTIONS = ["openai", "azure", "anthropic", "google_ai", "local", "groq", "deepseek"]
 
 
 def retry_with_exponential_backoff(
@@ -312,153 +305,6 @@ def create(
 
         return response
 
-    # azure
-    elif llm_config.model_endpoint_type == "azure":
-        if stream:
-            raise NotImplementedError(f"Streaming not yet implemented for {llm_config.model_endpoint_type}")
-
-        if model_settings.azure_api_key is None:
-            raise LettaConfigurationError(
-                message="Azure API key is missing. Did you set AZURE_API_KEY in your env?", missing_fields=["azure_api_key"]
-            )
-
-        if model_settings.azure_base_url is None:
-            raise LettaConfigurationError(
-                message="Azure base url is missing. Did you set AZURE_BASE_URL in your env?", missing_fields=["azure_base_url"]
-            )
-
-        if model_settings.azure_api_version is None:
-            raise LettaConfigurationError(
-                message="Azure API version is missing. Did you set AZURE_API_VERSION in your env?", missing_fields=["azure_api_version"]
-            )
-
-        # Set the llm config model_endpoint from model_settings
-        # For Azure, this model_endpoint is required to be configured via env variable, so users don't need to provide it in the LLM config
-        llm_config.model_endpoint = model_settings.azure_base_url
-        chat_completion_request = build_openai_chat_completions_request(
-            llm_config, messages, user_id, functions, function_call, use_tool_naming
-        )
-
-        response = azure_openai_chat_completions_request(
-            model_settings=model_settings,
-            llm_config=llm_config,
-            chat_completion_request=chat_completion_request,
-        )
-
-        if llm_config.put_inner_thoughts_in_kwargs:
-            response = unpack_all_inner_thoughts_from_kwargs(response=response, inner_thoughts_key=INNER_THOUGHTS_KWARG)
-
-        return response
-
-    elif llm_config.model_endpoint_type == "anthropic":
-        if not use_tool_naming:
-            raise NotImplementedError("Only tool calling supported on Anthropic API requests")
-
-        if llm_config.enable_reasoner:
-            llm_config.put_inner_thoughts_in_kwargs = False
-
-        # Force tool calling
-        tool_call = None
-        if functions is None:
-            # Special case for summarization path
-            tools = None
-            tool_choice = None
-        elif force_tool_call is not None:
-            # tool_call = {"type": "function", "function": {"name": force_tool_call}}
-            tool_choice = {"type": "tool", "name": force_tool_call}
-            tools = [{"type": "function", "function": f} for f in functions if f["name"] == force_tool_call]
-            assert functions is not None
-
-            # need to have this setting to be able to put inner thoughts in kwargs
-            llm_config.put_inner_thoughts_in_kwargs = True
-        else:
-            if llm_config.put_inner_thoughts_in_kwargs:
-                # tool_choice_type other than "auto" only plays nice if thinking goes inside the tool calls
-                tool_choice = {"type": "any", "disable_parallel_tool_use": True}
-            else:
-                tool_choice = {"type": "auto", "disable_parallel_tool_use": True}
-            tools = [{"type": "function", "function": f} for f in functions] if functions is not None else None
-
-        chat_completion_request = ChatCompletionRequest(
-            model=llm_config.model,
-            messages=[cast_message_to_subtype(m.to_openai_dict()) for m in messages],
-            tools=tools,
-            tool_choice=tool_choice,
-            max_tokens=llm_config.max_tokens,  # Note: max_tokens is required for Anthropic API
-            temperature=llm_config.temperature,
-            stream=stream,
-        )
-
-        # Handle streaming
-        if stream:  # Client requested token streaming
-            assert isinstance(stream_interface, (AgentChunkStreamingInterface, AgentRefreshStreamingInterface)), type(stream_interface)
-
-            stream_interface.inner_thoughts_in_kwargs = True
-            response = anthropic_chat_completions_process_stream(
-                chat_completion_request=chat_completion_request,
-                put_inner_thoughts_in_kwargs=llm_config.put_inner_thoughts_in_kwargs,
-                stream_interface=stream_interface,
-                extended_thinking=llm_config.enable_reasoner,
-                max_reasoning_tokens=llm_config.max_reasoning_tokens,
-                provider_name=llm_config.provider_name,
-                provider_category=llm_config.provider_category,
-                name=name,
-                user_id=user_id,
-            )
-
-        else:
-            # Client did not request token streaming (expect a blocking backend response)
-            response = anthropic_chat_completions_request(
-                data=chat_completion_request,
-                put_inner_thoughts_in_kwargs=llm_config.put_inner_thoughts_in_kwargs,
-                extended_thinking=llm_config.enable_reasoner,
-                max_reasoning_tokens=llm_config.max_reasoning_tokens,
-                provider_name=llm_config.provider_name,
-                provider_category=llm_config.provider_category,
-                user_id=user_id,
-            )
-
-        if llm_config.put_inner_thoughts_in_kwargs:
-            response = unpack_all_inner_thoughts_from_kwargs(response=response, inner_thoughts_key=INNER_THOUGHTS_KWARG)
-
-        telemetry_manager.create_provider_trace(
-            actor=actor,
-            provider_trace_create=ProviderTraceCreate(
-                request_json=chat_completion_request.model_json_schema(),
-                response_json=response.model_json_schema(),
-                step_id=step_id,
-                organization_id=actor.organization_id,
-            ),
-        )
-
-        return response
-
-    # elif llm_config.model_endpoint_type == "cohere":
-    #     if stream:
-    #         raise NotImplementedError(f"Streaming not yet implemented for {llm_config.model_endpoint_type}")
-    #     if not use_tool_naming:
-    #         raise NotImplementedError("Only tool calling supported on Cohere API requests")
-    #
-    #     if functions is not None:
-    #         tools = [{"type": "function", "function": f} for f in functions]
-    #         tools = [Tool(**t) for t in tools]
-    #     else:
-    #         tools = None
-    #
-    #     return cohere_chat_completions_request(
-    #         # url=llm_config.model_endpoint,
-    #         url="https://api.cohere.ai/v1",  # TODO
-    #         api_key=os.getenv("COHERE_API_KEY"),  # TODO remove
-    #         chat_completion_request=ChatCompletionRequest(
-    #             model="command-r-plus",  # TODO
-    #             messages=[cast_message_to_subtype(m.to_openai_dict()) for m in messages],
-    #             tools=tools,
-    #             tool_choice=function_call,
-    #             # user=str(user_id),
-    #             # NOTE: max_tokens is required for Anthropic API
-    #             # max_tokens=1024, # TODO make dynamic
-    #         ),
-    #     )
     elif llm_config.model_endpoint_type == "groq":
         if stream:
             raise NotImplementedError("Streaming not yet implemented for Groq.")
@@ -510,67 +356,6 @@ def create(
 
         return response
 
-    elif llm_config.model_endpoint_type == "together":
-        """TogetherAI endpoint that goes via /completions instead of /chat/completions"""
-
-        if stream:
-            raise NotImplementedError("Streaming not yet implemented for TogetherAI (via the /completions endpoint).")
-
-        if model_settings.together_api_key is None and (
-            llm_config.model_endpoint == "https://api.together.ai/v1/completions"
-            or llm_config.model_endpoint == "https://api.together.xyz/v1/completions"
-        ):
-            raise LettaConfigurationError(message="TogetherAI key is missing from letta config file", missing_fields=["together_api_key"])
-
-        return get_chat_completion(
-            model=llm_config.model,
-            messages=messages,
-            functions=functions,
-            functions_python=functions_python,
-            function_call=function_call,
-            context_window=llm_config.context_window,
-            endpoint=llm_config.model_endpoint,
-            endpoint_type="vllm",  # NOTE: use the vLLM path through /completions
-            wrapper=llm_config.model_wrapper,
-            user=str(user_id),
-            # hint
-            first_message=first_message,
-            # auth-related
-            auth_type="bearer_token",  # NOTE: Together expects bearer token auth
-            auth_key=model_settings.together_api_key,
-        )
-
-    elif llm_config.model_endpoint_type == "bedrock":
-        """Anthropic endpoint that goes via /embeddings instead of /chat/completions"""
-
-        if stream:
-            raise NotImplementedError("Streaming not yet implemented for Anthropic (via the /embeddings endpoint).")
-        if not use_tool_naming:
-            raise NotImplementedError("Only tool calling supported on Anthropic API requests")
-
-        if not has_valid_aws_credentials():
-            raise LettaConfigurationError(message="Invalid or missing AWS credentials. Please configure valid AWS credentials.")
-
-        tool_call = None
-        if force_tool_call is not None:
-            tool_call = {"type": "function", "function": {"name": force_tool_call}}
-            assert functions is not None
-
-        return anthropic_bedrock_chat_completions_request(
-            data=ChatCompletionRequest(
-                model=llm_config.model,
-                messages=[cast_message_to_subtype(m.to_openai_dict()) for m in messages],
-                tools=[{"type": "function", "function": f} for f in functions] if functions else None,
-                tool_choice=tool_call,
-                # user=str(user_id),
-                # NOTE: max_tokens is required for Anthropic API
-                max_tokens=llm_config.max_tokens,
-            ),
-            provider_name=llm_config.provider_name,
-            provider_category=llm_config.provider_category,
-            user_id=user_id,
-        )
-
     elif llm_config.model_endpoint_type == "deepseek":
         if model_settings.deepseek_api_key is None and llm_config.model_endpoint == "":
             # only is a problem if we are *not* using an openai proxy
letta/llm_api/llm_client.py
CHANGED
@@ -58,12 +58,26 @@ class LLMClient:
                     put_inner_thoughts_first=put_inner_thoughts_first,
                     actor=actor,
                 )
-            case ProviderType.openai | ProviderType.
+            case ProviderType.openai | ProviderType.ollama:
                 from letta.llm_api.openai_client import OpenAIClient
 
                 return OpenAIClient(
                     put_inner_thoughts_first=put_inner_thoughts_first,
                     actor=actor,
                 )
+            case ProviderType.together:
+                from letta.llm_api.together_client import TogetherClient
+
+                return TogetherClient(
+                    put_inner_thoughts_first=put_inner_thoughts_first,
+                    actor=actor,
+                )
+            case ProviderType.azure:
+                from letta.llm_api.azure_client import AzureClient
+
+                return AzureClient(
+                    put_inner_thoughts_first=put_inner_thoughts_first,
+                    actor=actor,
+                )
             case _:
                 return None
letta/llm_api/llm_client_base.py
CHANGED
@@ -1,6 +1,6 @@
 import json
 from abc import abstractmethod
-from typing import TYPE_CHECKING, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 from anthropic.types.beta.messages import BetaMessageBatch
 from openai import AsyncStream, Stream
@@ -9,6 +9,7 @@ from openai.types.chat.chat_completion_chunk import ChatCompletionChunk
 from letta.errors import LLMError
 from letta.otel.tracing import log_event, trace_method
 from letta.schemas.embedding_config import EmbeddingConfig
+from letta.schemas.enums import ProviderCategory
 from letta.schemas.llm_config import LLMConfig
 from letta.schemas.message import Message
 from letta.schemas.openai.chat_completion_response import ChatCompletionResponse
@@ -111,6 +112,9 @@ class LLMClientBase:
         agent_tools_mapping: Dict[str, List[dict]],
         agent_llm_config_mapping: Dict[str, LLMConfig],
     ) -> Union[BetaMessageBatch]:
+        """
+        Issues a batch request to the downstream model endpoint and parses response.
+        """
         raise NotImplementedError
 
     @abstractmethod
@@ -176,6 +180,9 @@ class LLMClientBase:
 
     @abstractmethod
     def is_reasoning_model(self, llm_config: LLMConfig) -> bool:
+        """
+        Returns True if the model is a native reasoning model.
+        """
        raise NotImplementedError
 
     @abstractmethod
@@ -192,6 +199,30 @@ class LLMClientBase:
         """
         return LLMError(f"Unhandled LLM error: {str(e)}")
 
+    def get_byok_overrides(self, llm_config: LLMConfig) -> Tuple[Optional[str], Optional[str], Optional[str]]:
+        """
+        Returns the override key for the given llm config.
+        """
+        api_key = None
+        if llm_config.provider_category == ProviderCategory.byok:
+            from letta.services.provider_manager import ProviderManager
+
+            api_key = ProviderManager().get_override_key(llm_config.provider_name, actor=self.actor)
+
+        return api_key, None, None
+
+    async def get_byok_overrides_async(self, llm_config: LLMConfig) -> Tuple[Optional[str], Optional[str], Optional[str]]:
+        """
+        Returns the override key for the given llm config.
+        """
+        api_key = None
+        if llm_config.provider_category == ProviderCategory.byok:
+            from letta.services.provider_manager import ProviderManager
+
+            api_key = await ProviderManager().get_override_key_async(llm_config.provider_name, actor=self.actor)
+
+        return api_key, None, None
+
     def _fix_truncated_json_response(self, response: ChatCompletionResponse) -> ChatCompletionResponse:
         """
         Fixes truncated JSON responses by ensuring the content is properly formatted.
letta/llm_api/openai.py
CHANGED
letta/llm_api/openai_client.py
CHANGED
@@ -26,7 +26,6 @@ from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG
 from letta.log import get_logger
 from letta.otel.tracing import trace_method
 from letta.schemas.embedding_config import EmbeddingConfig
-from letta.schemas.enums import ProviderCategory, ProviderType
 from letta.schemas.letta_message_content import MessageContentType
 from letta.schemas.llm_config import LLMConfig
 from letta.schemas.message import Message as PydanticMessage
@@ -54,6 +53,11 @@ def is_openai_5_model(model: str) -> bool:
     return model.startswith("gpt-5")
 
 
+def supports_verbosity_control(model: str) -> bool:
+    """Check if the model supports verbosity control, currently only GPT-5 models support this"""
+    return is_openai_5_model(model)
+
+
 def accepts_developer_role(model: str) -> bool:
     """Checks if the model accepts the 'developer' role. Note that not all reasoning models accept this role.
 
@@ -102,8 +106,6 @@ def requires_auto_tool_choice(llm_config: LLMConfig) -> bool:
     """Certain providers require the tool choice to be set to 'auto'."""
     if "nebius.com" in llm_config.model_endpoint:
         return True
-    if "together.ai" in llm_config.model_endpoint or "together.xyz" in llm_config.model_endpoint:
-        return True
     if llm_config.handle and "vllm" in llm_config.handle:
         return True
     if llm_config.compatibility_type == "mlx":
@@ -113,13 +115,7 @@ def requires_auto_tool_choice(llm_config: LLMConfig) -> bool:
 
 class OpenAIClient(LLMClientBase):
     def _prepare_client_kwargs(self, llm_config: LLMConfig) -> dict:
-        api_key =
-        if llm_config.provider_category == ProviderCategory.byok:
-            from letta.services.provider_manager import ProviderManager
-
-            api_key = ProviderManager().get_override_key(llm_config.provider_name, actor=self.actor)
-        if llm_config.model_endpoint_type == ProviderType.together:
-            api_key = model_settings.together_api_key or os.environ.get("TOGETHER_API_KEY")
+        api_key, _, _ = self.get_byok_overrides(llm_config)
 
         if not api_key:
             api_key = model_settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
@@ -130,25 +126,14 @@ class OpenAIClient(LLMClientBase):
         return kwargs
 
     def _prepare_client_kwargs_embedding(self, embedding_config: EmbeddingConfig) -> dict:
-        api_key =
-        if embedding_config.embedding_endpoint_type == ProviderType.together:
-            api_key = model_settings.together_api_key or os.environ.get("TOGETHER_API_KEY")
-
-        if not api_key:
-            api_key = model_settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
+        api_key = model_settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
         # supposedly the openai python client requires a dummy API key
         api_key = api_key or "DUMMY_API_KEY"
         kwargs = {"api_key": api_key, "base_url": embedding_config.embedding_endpoint}
         return kwargs
 
     async def _prepare_client_kwargs_async(self, llm_config: LLMConfig) -> dict:
-        api_key =
-        if llm_config.provider_category == ProviderCategory.byok:
-            from letta.services.provider_manager import ProviderManager
-
-            api_key = await ProviderManager().get_override_key_async(llm_config.provider_name, actor=self.actor)
-        if llm_config.model_endpoint_type == ProviderType.together:
-            api_key = model_settings.together_api_key or os.environ.get("TOGETHER_API_KEY")
+        api_key, _, _ = await self.get_byok_overrides_async(llm_config)
 
         if not api_key:
             api_key = model_settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
@@ -158,6 +143,9 @@ class OpenAIClient(LLMClientBase):
 
         return kwargs
 
+    def requires_auto_tool_choice(self, llm_config: LLMConfig) -> bool:
+        return requires_auto_tool_choice(llm_config)
+
     @trace_method
     def build_request_data(
         self,
@@ -204,7 +192,7 @@ class OpenAIClient(LLMClientBase):
         # TODO(matt) move into LLMConfig
         # TODO: This vllm checking is very brittle and is a patch at most
         tool_choice = None
-        if requires_auto_tool_choice(llm_config):
+        if self.requires_auto_tool_choice(llm_config):
             tool_choice = "auto"
         elif tools:
             # only set if tools is non-Null
@@ -224,6 +212,10 @@ class OpenAIClient(LLMClientBase):
             temperature=llm_config.temperature if supports_temperature_param(model) else 1.0,
         )
 
+        # Add verbosity control for GPT-5 models
+        if supports_verbosity_control(model) and llm_config.verbosity:
+            data.verbosity = llm_config.verbosity
+
         if llm_config.frequency_penalty is not None:
             data.frequency_penalty = llm_config.frequency_penalty
 
@@ -252,8 +244,8 @@ class OpenAIClient(LLMClientBase):
                     tool.function = FunctionSchema(**structured_output_version)
                 except ValueError as e:
                     logger.warning(f"Failed to convert tool function to structured output, tool={tool}, error={e}")
-
-        return
+        request_data = data.model_dump(exclude_unset=True)
+        return request_data
 
     @trace_method
     def request(self, request_data: dict, llm_config: LLMConfig) -> dict:
@@ -261,7 +253,6 @@ class OpenAIClient(LLMClientBase):
         Performs underlying synchronous request to OpenAI API and returns raw response dict.
         """
         client = OpenAI(**self._prepare_client_kwargs(llm_config))
-
         response: ChatCompletion = client.chat.completions.create(**request_data)
         return response.model_dump()
 
@@ -272,7 +263,6 @@ class OpenAIClient(LLMClientBase):
         """
         kwargs = await self._prepare_client_kwargs_async(llm_config)
         client = AsyncOpenAI(**kwargs)
-
         response: ChatCompletion = await client.chat.completions.create(**request_data)
         return response.model_dump()
 
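build_request_data now attaches a verbosity field only for GPT-5 models. A standalone sketch of that gate; apply_verbosity and the example values are illustrative and not part of the package:

    def is_openai_5_model(model: str) -> bool:
        return model.startswith("gpt-5")

    def supports_verbosity_control(model: str) -> bool:
        # Currently only GPT-5 models accept a verbosity setting.
        return is_openai_5_model(model)

    def apply_verbosity(request_payload: dict, model: str, verbosity: str | None) -> dict:
        # Mirrors the diff above: only set the field when the model supports it and a value is configured.
        if supports_verbosity_control(model) and verbosity:
            request_payload["verbosity"] = verbosity
        return request_payload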
letta/llm_api/together_client.py
ADDED
@@ -0,0 +1,55 @@
+import os
+from typing import List
+
+from openai import AsyncOpenAI, OpenAI
+from openai.types.chat.chat_completion import ChatCompletion
+
+from letta.llm_api.openai_client import OpenAIClient
+from letta.otel.tracing import trace_method
+from letta.schemas.embedding_config import EmbeddingConfig
+from letta.schemas.llm_config import LLMConfig
+from letta.settings import model_settings
+
+
+class TogetherClient(OpenAIClient):
+
+    def requires_auto_tool_choice(self, llm_config: LLMConfig) -> bool:
+        return True
+
+    @trace_method
+    def request(self, request_data: dict, llm_config: LLMConfig) -> dict:
+        """
+        Performs underlying synchronous request to OpenAI API and returns raw response dict.
+        """
+        api_key, _, _ = self.get_byok_overrides(llm_config)
+
+        if not api_key:
+            api_key = model_settings.together_api_key or os.environ.get("TOGETHER_API_KEY")
+        client = OpenAI(api_key=api_key, base_url=llm_config.model_endpoint)
+
+        response: ChatCompletion = client.chat.completions.create(**request_data)
+        return response.model_dump()
+
+    @trace_method
+    async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict:
+        """
+        Performs underlying asynchronous request to OpenAI API and returns raw response dict.
+        """
+        api_key, _, _ = await self.get_byok_overrides_async(llm_config)
+
+        if not api_key:
+            api_key = model_settings.together_api_key or os.environ.get("TOGETHER_API_KEY")
+        client = AsyncOpenAI(api_key=api_key, base_url=llm_config.model_endpoint)
+
+        response: ChatCompletion = await client.chat.completions.create(**request_data)
+        return response.model_dump()
+
+    @trace_method
+    async def request_embeddings(self, inputs: List[str], embedding_config: EmbeddingConfig) -> List[List[float]]:
+        """Request embeddings given texts and embedding config"""
+        api_key = model_settings.together_api_key or os.environ.get("TOGETHER_API_KEY")
+        client = AsyncOpenAI(api_key=api_key, base_url=embedding_config.embedding_endpoint)
+        response = await client.embeddings.create(model=embedding_config.embedding_model, input=inputs)
+
+        # TODO: add total usage
+        return [r.embedding for r in response.data]
letta/orm/provider.py
CHANGED
@@ -31,6 +31,7 @@ class Provider(SqlalchemyBase, OrganizationMixin):
     base_url: Mapped[str] = mapped_column(nullable=True, doc="Base URL for the provider.")
     access_key: Mapped[str] = mapped_column(nullable=True, doc="Access key used for requests to the provider.")
     region: Mapped[str] = mapped_column(nullable=True, doc="Region used for requests to the provider.")
+    api_version: Mapped[str] = mapped_column(nullable=True, doc="API version used for requests to the provider.")
 
     # relationships
     organization: Mapped["Organization"] = relationship("Organization", back_populates="providers")
letta/orm/step_metrics.py
CHANGED
@@ -1,11 +1,15 @@
+from datetime import datetime, timezone
 from typing import TYPE_CHECKING, Optional
 
 from sqlalchemy import BigInteger, ForeignKey, String
-from sqlalchemy.
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.orm import Mapped, Session, mapped_column, relationship
 
 from letta.orm.mixins import AgentMixin, ProjectMixin
 from letta.orm.sqlalchemy_base import SqlalchemyBase
 from letta.schemas.step_metrics import StepMetrics as PydanticStepMetrics
+from letta.schemas.user import User
+from letta.settings import DatabaseChoice, settings
 
 if TYPE_CHECKING:
     from letta.orm.agent import Agent
@@ -69,3 +73,38 @@ class StepMetrics(SqlalchemyBase, ProjectMixin, AgentMixin):
     step: Mapped["Step"] = relationship("Step", back_populates="metrics", uselist=False)
     job: Mapped[Optional["Job"]] = relationship("Job")
     agent: Mapped[Optional["Agent"]] = relationship("Agent")
+
+    def create(
+        self,
+        db_session: Session,
+        actor: Optional[User] = None,
+        no_commit: bool = False,
+    ) -> "StepMetrics":
+        """Override create to handle SQLite timestamp issues"""
+        # For SQLite, explicitly set timestamps as server_default may not work
+        if settings.database_engine == DatabaseChoice.SQLITE:
+            now = datetime.now(timezone.utc)
+            if not self.created_at:
+                self.created_at = now
+            if not self.updated_at:
+                self.updated_at = now
+
+        return super().create(db_session, actor=actor, no_commit=no_commit)
+
+    async def create_async(
+        self,
+        db_session: AsyncSession,
+        actor: Optional[User] = None,
+        no_commit: bool = False,
+        no_refresh: bool = False,
+    ) -> "StepMetrics":
+        """Override create_async to handle SQLite timestamp issues"""
+        # For SQLite, explicitly set timestamps as server_default may not work
+        if settings.database_engine == DatabaseChoice.SQLITE:
+            now = datetime.now(timezone.utc)
+            if not self.created_at:
+                self.created_at = now
+            if not self.updated_at:
+                self.updated_at = now
+
+        return await super().create_async(db_session, actor=actor, no_commit=no_commit, no_refresh=no_refresh)
letta/otel/db_pool_monitoring.py
CHANGED
@@ -252,7 +252,7 @@ class DatabasePoolMonitor:
             logger.info(f"Failed to record detach event metric: {e}")
 
         @event.listens_for(pool, "reset")
-        def on_reset(dbapi_connection: DBAPIConnection, connection_record: ConnectionPoolEntry):
+        def on_reset(dbapi_connection: DBAPIConnection, connection_record: ConnectionPoolEntry, reset_state):
             """Called when a connection is reset."""
             try:
                 from letta.otel.metric_registry import MetricRegistry
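The on_reset listener now takes a third reset_state argument, matching the three-argument "reset" pool event signature used by SQLAlchemy 2.0. A minimal standalone example of registering such a listener; the engine URL and print output are illustrative:

    from sqlalchemy import create_engine, event

    engine = create_engine("sqlite://")

    @event.listens_for(engine.pool, "reset")
    def on_reset(dbapi_connection, connection_record, reset_state):
        # reset_state is a PoolResetState describing why the reset happened
        # (e.g. normal return to the pool vs. termination).
        print("connection reset; terminate_only =", reset_state.terminate_only)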
|