letta-nightly 0.11.7.dev20250911104039__py3-none-any.whl → 0.11.7.dev20250913103940__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- letta/adapters/letta_llm_stream_adapter.py +1 -1
- letta/agents/letta_agent_v2.py +46 -10
- letta/helpers/tpuf_client.py +41 -9
- letta/interfaces/openai_streaming_interface.py +11 -74
- letta/llm_api/anthropic_client.py +2 -2
- letta/llm_api/azure_client.py +5 -2
- letta/llm_api/google_vertex_client.py +158 -16
- letta/llm_api/openai_client.py +14 -11
- letta/orm/job.py +5 -1
- letta/orm/organization.py +2 -0
- letta/otel/sqlalchemy_instrumentation.py +6 -1
- letta/schemas/letta_stop_reason.py +2 -0
- letta/server/rest_api/app.py +61 -1
- letta/server/rest_api/redis_stream_manager.py +15 -2
- letta/server/rest_api/routers/v1/agents.py +53 -15
- letta/server/rest_api/routers/v1/tools.py +23 -39
- letta/services/job_manager.py +15 -3
- letta/services/mcp_manager.py +64 -3
- letta/services/tool_executor/files_tool_executor.py +2 -2
- {letta_nightly-0.11.7.dev20250911104039.dist-info → letta_nightly-0.11.7.dev20250913103940.dist-info}/METADATA +3 -3
- {letta_nightly-0.11.7.dev20250911104039.dist-info → letta_nightly-0.11.7.dev20250913103940.dist-info}/RECORD +24 -24
- {letta_nightly-0.11.7.dev20250911104039.dist-info → letta_nightly-0.11.7.dev20250913103940.dist-info}/WHEEL +0 -0
- {letta_nightly-0.11.7.dev20250911104039.dist-info → letta_nightly-0.11.7.dev20250913103940.dist-info}/entry_points.txt +0 -0
- {letta_nightly-0.11.7.dev20250911104039.dist-info → letta_nightly-0.11.7.dev20250913103940.dist-info}/licenses/LICENSE +0 -0
letta/adapters/letta_llm_stream_adapter.py
CHANGED
@@ -149,7 +149,7 @@ class LettaLLMStreamAdapter(LettaLLMAdapter):
                 request_json=self.request_data,
                 response_json={
                     "content": {
-                        "tool_call": self.tool_call.model_dump_json(),
+                        "tool_call": self.tool_call.model_dump_json() if self.tool_call else None,
                         "reasoning": [content.model_dump_json() for content in self.reasoning_content],
                     },
                     "id": self.interface.message_id,
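Note: this change guards the request log against steps where no tool call was produced. A minimal sketch of the same pattern, assuming a Pydantic-style model with model_dump_json(); the names below are illustrative stand-ins, not the adapter's actual fields:

    from typing import Optional

    from pydantic import BaseModel

    class ToolCall(BaseModel):
        name: str
        arguments: str

    def build_log_payload(tool_call: Optional[ToolCall]) -> dict:
        # Serialize the tool call only when one exists; otherwise log None
        # instead of raising AttributeError on None.model_dump_json().
        return {"tool_call": tool_call.model_dump_json() if tool_call else None}
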
letta/agents/letta_agent_v2.py
CHANGED
@@ -19,7 +19,7 @@ from letta.agents.helpers import (
     generate_step_id,
 )
 from letta.constants import DEFAULT_MAX_STEPS, NON_USER_MSG_PREFIX
-from letta.errors import ContextWindowExceededError
+from letta.errors import ContextWindowExceededError, LLMError
 from letta.helpers import ToolRulesSolver
 from letta.helpers.datetime_helpers import get_utc_time, get_utc_timestamp_ns, ns_to_ms
 from letta.helpers.reasoning_helper import scrub_inner_thoughts_from_messages
@@ -213,8 +213,17 @@ class LettaAgentV2(BaseAgentV2):

         if self.stop_reason is None:
             self.stop_reason = LettaStopReason(stop_reason=StopReasonType.end_turn.value)
-
-
+
+        result = LettaResponse(messages=response_letta_messages, stop_reason=self.stop_reason, usage=self.usage)
+        if run_id:
+            if self.job_update_metadata is None:
+                self.job_update_metadata = {}
+            self.job_update_metadata["result"] = result.model_dump(mode="json")
+
+        await self._request_checkpoint_finish(
+            request_span=request_span, request_start_timestamp_ns=request_start_timestamp_ns, run_id=run_id
+        )
+        return result

     @trace_method
     async def stream(
@@ -297,11 +306,24 @@ class LettaAgentV2(BaseAgentV2):
             )

         except:
-            if self.stop_reason:
+            if self.stop_reason and not first_chunk:
                 yield f"data: {self.stop_reason.model_dump_json()}\n\n"
             raise

-
+        if run_id:
+            letta_messages = Message.to_letta_messages_from_list(
+                self.response_messages,
+                use_assistant_message=use_assistant_message,
+                reverse=False,
+            )
+            result = LettaResponse(messages=letta_messages, stop_reason=self.stop_reason, usage=self.usage)
+            if self.job_update_metadata is None:
+                self.job_update_metadata = {}
+            self.job_update_metadata["result"] = result.model_dump(mode="json")
+
+        await self._request_checkpoint_finish(
+            request_span=request_span, request_start_timestamp_ns=request_start_timestamp_ns, run_id=run_id
+        )
         for finish_chunk in self.get_finish_chunks_for_stream(self.usage, self.stop_reason):
             yield f"data: {finish_chunk}\n\n"

@@ -409,6 +431,9 @@ class LettaAgentV2(BaseAgentV2):
                 except ValueError as e:
                     self.stop_reason = LettaStopReason(stop_reason=StopReasonType.invalid_llm_response.value)
                     raise e
+                except LLMError as e:
+                    self.stop_reason = LettaStopReason(stop_reason=StopReasonType.llm_api_error.value)
+                    raise e
                 except Exception as e:
                     if isinstance(e, ContextWindowExceededError) and llm_request_attempt < summarizer_settings.max_summarizer_retries:
                         # Retry case
@@ -475,6 +500,17 @@ class LettaAgentV2(BaseAgentV2):
                 if include_return_message_types is None or message.message_type in include_return_message_types:
                     yield message

+            # Persist approval responses immediately to prevent agent from getting into a bad state
+            if (
+                len(input_messages_to_persist) == 1
+                and input_messages_to_persist[0].role == "approval"
+                and persisted_messages[0].role == "approval"
+                and persisted_messages[1].role == "tool"
+            ):
+                self.agent_state.message_ids = self.agent_state.message_ids + [m.id for m in persisted_messages[:2]]
+                await self.agent_manager.update_message_ids_async(
+                    agent_id=self.agent_state.id, message_ids=self.agent_state.message_ids, actor=self.actor
+                )
             step_progression, step_metrics = await self._step_checkpoint_finish(step_metrics, agent_step_span, logged_step)
         except Exception as e:
             self.logger.error(f"Error during step processing: {e}")
@@ -489,6 +525,7 @@ class LettaAgentV2(BaseAgentV2):
             StopReasonType.no_tool_call,
             StopReasonType.invalid_tool_call,
             StopReasonType.invalid_llm_response,
+            StopReasonType.llm_api_error,
         ):
             self.logger.error("Error occurred during step processing, with unexpected stop reason: %s", self.stop_reason.stop_reason)
             raise e
@@ -736,11 +773,10 @@ class LettaAgentV2(BaseAgentV2):
         return None

     @trace_method
-    def _request_checkpoint_finish(
-
-
-
-        request_span.end()
+    async def _request_checkpoint_finish(
+        self, request_span: Span | None, request_start_timestamp_ns: int | None, run_id: str | None
+    ) -> None:
+        await self._log_request(request_start_timestamp_ns, request_span, self.job_update_metadata, is_error=False, run_id=run_id)
         return None

     @trace_method
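Note: both the blocking and streaming paths in letta_agent_v2.py now build the final LettaResponse, store it under job_update_metadata["result"], and await the now-async _request_checkpoint_finish. A minimal sketch of that persistence step, using a stand-in response model rather than the real LettaResponse:

    from typing import Any, Optional

    from pydantic import BaseModel

    class StubResponse(BaseModel):
        messages: list[str]
        stop_reason: str

    def record_run_result(
        result: StubResponse,
        run_id: Optional[str],
        job_update_metadata: Optional[dict[str, Any]],
    ) -> Optional[dict[str, Any]]:
        # Only persist when the request is tied to a run; mode="json" keeps the
        # payload JSON-serializable for storage in the job's metadata.
        if run_id:
            job_update_metadata = job_update_metadata or {}
            job_update_metadata["result"] = result.model_dump(mode="json")
        return job_update_metadata
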
letta/helpers/tpuf_client.py
CHANGED
@@ -62,11 +62,18 @@ class TurbopufferClient:
         """
         from letta.llm_api.llm_client import LLMClient

+        # filter out empty strings after stripping
+        filtered_texts = [text for text in texts if text.strip()]
+
+        # skip embedding if no valid texts
+        if not filtered_texts:
+            return []
+
         embedding_client = LLMClient.create(
             provider_type=self.default_embedding_config.embedding_endpoint_type,
             actor=actor,
         )
-        embeddings = await embedding_client.request_embeddings(
+        embeddings = await embedding_client.request_embeddings(filtered_texts, self.default_embedding_config)
         return embeddings

     @trace_method
@@ -119,8 +126,16 @@ class TurbopufferClient:
         """
         from turbopuffer import AsyncTurbopuffer

+        # filter out empty text chunks
+        filtered_chunks = [(i, text) for i, text in enumerate(text_chunks) if text.strip()]
+
+        if not filtered_chunks:
+            logger.warning("All text chunks were empty, skipping insertion")
+            return []
+
         # generate embeddings using the default config
-
+        filtered_texts = [text for _, text in filtered_chunks]
+        embeddings = await self._generate_embeddings(filtered_texts, actor)

         namespace_name = await self._get_archive_namespace_name(archive_id)

@@ -152,8 +167,8 @@ class TurbopufferClient:
         tags_arrays = []  # Store tags as arrays
         passages = []

-        for
-        passage_id = passage_ids[
+        for (original_idx, text), embedding in zip(filtered_chunks, embeddings):
+            passage_id = passage_ids[original_idx]

             # append to columns
             ids.append(passage_id)
@@ -240,8 +255,16 @@ class TurbopufferClient:
         """
         from turbopuffer import AsyncTurbopuffer

+        # filter out empty message texts
+        filtered_messages = [(i, text) for i, text in enumerate(message_texts) if text.strip()]
+
+        if not filtered_messages:
+            logger.warning("All message texts were empty, skipping insertion")
+            return True
+
         # generate embeddings using the default config
-
+        filtered_texts = [text for _, text in filtered_messages]
+        embeddings = await self._generate_embeddings(filtered_texts, actor)

         namespace_name = await self._get_message_namespace_name(organization_id)

@@ -266,8 +289,10 @@ class TurbopufferClient:
         project_ids = []
         template_ids = []

-        for
-        message_id = message_ids[
+        for (original_idx, text), embedding in zip(filtered_messages, embeddings):
+            message_id = message_ids[original_idx]
+            role = roles[original_idx]
+            created_at = created_ats[original_idx]

             # ensure the provided timestamp is timezone-aware and in UTC
             if created_at.tzinfo is None:
@@ -1162,8 +1187,15 @@ class TurbopufferClient:
         if not text_chunks:
             return []

+        # filter out empty text chunks
+        filtered_chunks = [text for text in text_chunks if text.strip()]
+
+        if not filtered_chunks:
+            logger.warning("All text chunks were empty, skipping file passage insertion")
+            return []
+
         # generate embeddings using the default config
-        embeddings = await self._generate_embeddings(
+        embeddings = await self._generate_embeddings(filtered_chunks, actor)

         namespace_name = await self._get_file_passages_namespace_name(organization_id)

@@ -1189,7 +1221,7 @@ class TurbopufferClient:
         created_ats = []
         passages = []

-        for
+        for text, embedding in zip(filtered_chunks, embeddings):
             passage = PydanticPassage(
                 text=text,
                 file_id=file_id,
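Note: every tpuf_client.py change above follows the same shape: drop empty or whitespace-only texts before requesting embeddings, and keep each survivor's original index so ids, roles, and timestamps can still be looked up by position. A standalone sketch of that index-preserving filter; embed() is a hypothetical callable, not the client's API:

    from typing import Callable

    def embed_non_empty(
        texts: list[str],
        ids: list[str],
        embed: Callable[[list[str]], list[list[float]]],
    ) -> list[tuple[str, list[float]]]:
        # keep (original_index, text) pairs so parallel metadata lists stay addressable
        filtered = [(i, t) for i, t in enumerate(texts) if t.strip()]
        if not filtered:
            return []  # nothing worth embedding
        vectors = embed([t for _, t in filtered])
        # pair each surviving text's id back up with its embedding
        return [(ids[i], vec) for (i, _), vec in zip(filtered, vectors)]
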
letta/interfaces/openai_streaming_interface.py
CHANGED
@@ -24,7 +24,7 @@ from letta.schemas.letta_stop_reason import LettaStopReason, StopReasonType
 from letta.schemas.message import Message
 from letta.schemas.openai.chat_completion_response import FunctionCall, ToolCall
 from letta.server.rest_api.json_parser import OptimisticJSONParser
-from letta.streaming_utils import JSONInnerThoughtsExtractor
+from letta.streaming_utils import FunctionArgumentsStreamHandler, JSONInnerThoughtsExtractor
 from letta.utils import count_tokens

 logger = get_logger(__name__)
@@ -53,6 +53,8 @@ class OpenAIStreamingInterface:

         self.optimistic_json_parser: OptimisticJSONParser = OptimisticJSONParser()
         self.function_args_reader = JSONInnerThoughtsExtractor(wait_for_first_key=put_inner_thoughts_in_kwarg)
+        # Reader that extracts only the assistant message value from send_message args
+        self.assistant_message_json_reader = FunctionArgumentsStreamHandler(json_key=self.assistant_message_tool_kwarg)
         self.function_name_buffer = None
         self.function_args_buffer = None
         self.function_id_buffer = None
@@ -274,6 +276,10 @@ class OpenAIStreamingInterface:
                         # Store the ID of the tool call so allow skipping the corresponding response
                         if self.function_id_buffer:
                             self.prev_assistant_message_id = self.function_id_buffer
+                        # Reset message reader at the start of a new send_message stream
+                        self.assistant_message_json_reader.reset()
+                        self.assistant_message_json_reader.in_message = True
+                        self.assistant_message_json_reader.message_started = True

                     else:
                         if prev_message_type and prev_message_type != "tool_call_message":
@@ -328,39 +334,15 @@ class OpenAIStreamingInterface:
                         self.last_flushed_function_name is not None
                         and self.last_flushed_function_name == self.assistant_message_tool_name
                     ):
-                        #
-
-
-                        self.function_args_buffer = None
-
-                        # Pretty gross hardcoding that assumes that if we're toggling into the keywords, we have the full prefix
-                        match_str = '{"' + self.assistant_message_tool_kwarg + '":"'
-                        if updates_main_json == match_str:
-                            updates_main_json = None
-
-                        else:
-                            # Some hardcoding to strip off the trailing "}"
-                            if updates_main_json in ["}", '"}']:
-                                updates_main_json = None
-                            if updates_main_json and len(updates_main_json) > 0 and updates_main_json[-1:] == '"':
-                                updates_main_json = updates_main_json[:-1]
-
-                        if not updates_main_json:
-                            # early exit to turn into content mode
-                            pass
-
-                        # There may be a buffer from a previous chunk, for example
-                        # if the previous chunk had arguments but we needed to flush name
-                        if self.function_args_buffer:
-                            # In this case, we should release the buffer + new data at once
-                            combined_chunk = self.function_args_buffer + updates_main_json
-
+                        # Minimal, robust extraction: only emit the value of "message"
+                        extracted = self.assistant_message_json_reader.process_json_chunk(tool_call.function.arguments)
+                        if extracted:
                             if prev_message_type and prev_message_type != "assistant_message":
                                 message_index += 1
                             assistant_message = AssistantMessage(
                                 id=self.letta_message_id,
                                 date=datetime.now(timezone.utc),
-                                content=
+                                content=extracted,
                                 otid=Message.generate_otid_from_id(self.letta_message_id, message_index),
                             )
                             prev_message_type = assistant_message.message_type
@@ -368,51 +350,6 @@ class OpenAIStreamingInterface:
                             # Store the ID of the tool call so allow skipping the corresponding response
                             if self.function_id_buffer:
                                 self.prev_assistant_message_id = self.function_id_buffer
-                                # clear buffer
-                                self.function_args_buffer = None
-                                self.function_id_buffer = None
-
-                        else:
-                            # If there's no buffer to clear, just output a new chunk with new data
-                            # TODO: THIS IS HORRIBLE
-                            # TODO: WE USE THE OLD JSON PARSER EARLIER (WHICH DOES NOTHING) AND NOW THE NEW JSON PARSER
-                            # TODO: THIS IS TOTALLY WRONG AND BAD, BUT SAVING FOR A LARGER REWRITE IN THE NEAR FUTURE
-                            parsed_args = self.optimistic_json_parser.parse(self.current_function_arguments)
-
-                            if parsed_args.get(self.assistant_message_tool_kwarg) and parsed_args.get(
-                                self.assistant_message_tool_kwarg
-                            ) != self.current_json_parse_result.get(self.assistant_message_tool_kwarg):
-                                new_content = parsed_args.get(self.assistant_message_tool_kwarg)
-                                prev_content = self.current_json_parse_result.get(self.assistant_message_tool_kwarg, "")
-                                # TODO: Assumes consistent state and that prev_content is subset of new_content
-                                diff = new_content.replace(prev_content, "", 1)
-
-                                # quick patch to mitigate double message streaming error
-                                # TODO: root cause this issue and remove patch
-                                if diff != "" and "\\n" not in new_content:
-                                    converted_new_content = new_content.replace("\n", "\\n")
-                                    converted_content_diff = converted_new_content.replace(prev_content, "", 1)
-                                    if converted_content_diff == "":
-                                        diff = converted_content_diff
-
-                                self.current_json_parse_result = parsed_args
-                                if prev_message_type and prev_message_type != "assistant_message":
-                                    message_index += 1
-                                assistant_message = AssistantMessage(
-                                    id=self.letta_message_id,
-                                    date=datetime.now(timezone.utc),
-                                    content=diff,
-                                    # name=name,
-                                    otid=Message.generate_otid_from_id(self.letta_message_id, message_index),
-                                )
-                                prev_message_type = assistant_message.message_type
-                                yield assistant_message
-
-                                # Store the ID of the tool call so allow skipping the corresponding response
-                                if self.function_id_buffer:
-                                    self.prev_assistant_message_id = self.function_id_buffer
-                                # clear buffers
-                                self.function_id_buffer = None
                         else:
                             # There may be a buffer from a previous chunk, for example
                             # if the previous chunk had arguments but we needed to flush name
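Note: the streaming interface now hands send_message argument fragments to FunctionArgumentsStreamHandler, which emits only the value of the configured JSON key, replacing the buffer-trimming heuristics deleted above. The toy generator below illustrates the underlying idea; it is not Letta's implementation and assumes the value contains no escaped quotes:

    def stream_key_value(fragments, json_key="message"):
        """Yield new characters of json_key's string value as JSON fragments arrive."""
        marker = f'"{json_key}":'
        buffer, started, done, emitted = "", False, False, 0
        for fragment in fragments:
            buffer += fragment
            if not started:
                pos = buffer.find(marker)
                if pos == -1:
                    continue  # key not seen yet
                quote = buffer.find('"', pos + len(marker))
                if quote == -1:
                    continue  # opening quote not seen yet
                started, emitted = True, quote + 1
            if started and not done:
                end = buffer.find('"', emitted)
                stop = end if end != -1 else len(buffer)
                if stop > emitted:
                    yield buffer[emitted:stop]
                    emitted = stop
                if end != -1:
                    done = True

    # e.g. "".join(stream_key_value(['{"mess', 'age":"Hel', 'lo"}'])) == "Hello"
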
letta/llm_api/anthropic_client.py
CHANGED
@@ -497,7 +497,7 @@ class AnthropicClient(LLMClientBase):
                 try:
                     args_json = json.loads(arguments)
                     if not isinstance(args_json, dict):
-                        raise
+                        raise LLMServerError("Expected parseable json object for arguments")
                 except:
                     arguments = str(tool_input["function"]["arguments"])
             else:
@@ -854,7 +854,7 @@ def remap_finish_reason(stop_reason: str) -> str:
     elif stop_reason == "tool_use":
         return "function_call"
     else:
-        raise
+        raise LLMServerError(f"Unexpected stop_reason: {stop_reason}")


 def strip_xml_tags(string: str, tag: Optional[str]) -> str:
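Note: a bare `raise` with no active exception only surfaces as "RuntimeError: No active exception to re-raise", so replacing it with a typed LLMServerError gives callers something meaningful to catch. A small sketch with a stand-in exception class (only the tool_use branch comes from the diff; the rest is illustrative):

    class LLMServerError(Exception):
        """Stand-in for the package's typed LLM server error."""

    def remap_finish_reason_sketch(stop_reason: str) -> str:
        # Map provider-specific stop reasons onto a common vocabulary; unknown
        # values raise a typed, catchable error instead of a bare `raise`.
        if stop_reason == "tool_use":
            return "function_call"
        raise LLMServerError(f"Unexpected stop_reason: {stop_reason}")
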
letta/llm_api/azure_client.py
CHANGED
@@ -54,9 +54,12 @@ class AzureClient(OpenAIClient):
         api_key = model_settings.azure_api_key or os.environ.get("AZURE_API_KEY")
         base_url = model_settings.azure_base_url or os.environ.get("AZURE_BASE_URL")
         api_version = model_settings.azure_api_version or os.environ.get("AZURE_API_VERSION")
+        try:
+            client = AsyncAzureOpenAI(api_key=api_key, azure_endpoint=base_url, api_version=api_version)
+            response: ChatCompletion = await client.chat.completions.create(**request_data)
+        except Exception as e:
+            raise self.handle_llm_error(e)

-        client = AsyncAzureOpenAI(api_key=api_key, azure_endpoint=base_url, api_version=api_version)
-        response: ChatCompletion = await client.chat.completions.create(**request_data)
         return response.model_dump()

     @trace_method
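Note: the Azure client now routes SDK failures through handle_llm_error so callers see the package's typed LLM errors rather than raw provider exceptions. A minimal sketch of that wrap-and-normalize pattern, with simplified stand-ins for the handler and error class:

    class LLMConnectionError(Exception):
        """Stand-in for a typed connection failure."""

    def handle_llm_error(e: Exception) -> Exception:
        # Map low-level exceptions onto the error taxonomy callers expect.
        if isinstance(e, (ConnectionError, TimeoutError)):
            return LLMConnectionError(f"Failed to reach provider: {e}")
        return e

    async def request(call):
        try:
            return await call()
        except Exception as e:
            # normalize before re-raising so callers only ever catch typed errors
            raise handle_llm_error(e)
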
letta/llm_api/google_vertex_client.py
CHANGED
@@ -14,6 +14,19 @@ from google.genai.types import (
 )

 from letta.constants import NON_USER_MSG_PREFIX
+from letta.errors import (
+    ContextWindowExceededError,
+    ErrorCode,
+    LLMAuthenticationError,
+    LLMBadRequestError,
+    LLMConnectionError,
+    LLMNotFoundError,
+    LLMPermissionDeniedError,
+    LLMRateLimitError,
+    LLMServerError,
+    LLMTimeoutError,
+    LLMUnprocessableEntityError,
+)
 from letta.helpers.datetime_helpers import get_utc_time_int
 from letta.helpers.json_helpers import json_dumps, json_loads
 from letta.llm_api.llm_client_base import LLMClientBase
@@ -48,13 +61,16 @@ class GoogleVertexClient(LLMClientBase):
         """
         Performs underlying request to llm and returns raw response.
         """
-
-
-
-
-
-
-
+        try:
+            client = self._get_client()
+            response = client.models.generate_content(
+                model=llm_config.model,
+                contents=request_data["contents"],
+                config=request_data["config"],
+            )
+            return response.model_dump()
+        except Exception as e:
+            raise self.handle_llm_error(e)

     @trace_method
     async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict:
@@ -67,6 +83,7 @@ class GoogleVertexClient(LLMClientBase):
         # https://github.com/googleapis/python-aiplatform/issues/4472
         retry_count = 1
         should_retry = True
+        response_data = None
         while should_retry and retry_count <= self.MAX_RETRIES:
             try:
                 response = await client.aio.models.generate_content(
@@ -76,13 +93,15 @@ class GoogleVertexClient(LLMClientBase):
                 )
             except errors.APIError as e:
                 # Retry on 503 and 500 errors as well, usually ephemeral from Gemini
-                if e.code == 503 or e.code == 500:
+                if e.code == 503 or e.code == 500 or e.code == 504:
                     logger.warning(f"Received {e}, retrying {retry_count}/{self.MAX_RETRIES}")
                     retry_count += 1
+                    if retry_count > self.MAX_RETRIES:
+                        raise self.handle_llm_error(e)
                     continue
-                raise e
+                raise self.handle_llm_error(e)
             except Exception as e:
-                raise e
+                raise self.handle_llm_error(e)
             response_data = response.model_dump()
             is_malformed_function_call = self.is_malformed_function_call(response_data)
             if is_malformed_function_call:
@@ -114,6 +133,8 @@ class GoogleVertexClient(LLMClientBase):
             should_retry = is_malformed_function_call
             retry_count += 1

+        if response_data is None:
+            raise RuntimeError("Failed to get response data after all retries")
         return response_data

     @staticmethod
@@ -358,11 +379,10 @@ class GoogleVertexClient(LLMClientBase):

         if content is None or content.role is None or content.parts is None:
             # This means the response is malformed like MALFORMED_FUNCTION_CALL
-            # NOTE: must be a ValueError to trigger a retry
             if candidate.finish_reason == "MALFORMED_FUNCTION_CALL":
-                raise
+                raise LLMServerError(f"Malformed response from Google Vertex: {candidate.finish_reason}")
             else:
-                raise
+                raise LLMServerError(f"Invalid response data from Google Vertex: {candidate.model_dump()}")

         role = content.role
         assert role == "model", f"Unknown role in response: {role}"
@@ -456,7 +476,7 @@ class GoogleVertexClient(LLMClientBase):

                 except json.decoder.JSONDecodeError:
                     if candidate.finish_reason == "MAX_TOKENS":
-                        raise
+                        raise LLMServerError("Could not parse response data from LLM: exceeded max token limit")
                     # Inner thoughts are the content by default
                     inner_thoughts = response_message.text

@@ -485,7 +505,7 @@ class GoogleVertexClient(LLMClientBase):
             elif finish_reason == "RECITATION":
                 openai_finish_reason = "content_filter"
             else:
-                raise
+                raise LLMServerError(f"Unrecognized finish reason in Google AI response: {finish_reason}")

             choices.append(
                 Choice(
@@ -576,5 +596,127 @@ class GoogleVertexClient(LLMClientBase):

     @trace_method
     def handle_llm_error(self, e: Exception) -> Exception:
-        #
+        # Handle Google GenAI specific errors
+        if isinstance(e, errors.ClientError):
+            logger.warning(f"[Google Vertex] Client error ({e.code}): {e}")
+
+            # Handle specific error codes
+            if e.code == 400:
+                error_str = str(e).lower()
+                if "context" in error_str and ("exceed" in error_str or "limit" in error_str or "too long" in error_str):
+                    return ContextWindowExceededError(
+                        message=f"Bad request to Google Vertex (context window exceeded): {str(e)}",
+                    )
+                else:
+                    return LLMBadRequestError(
+                        message=f"Bad request to Google Vertex: {str(e)}",
+                        code=ErrorCode.INTERNAL_SERVER_ERROR,
+                    )
+            elif e.code == 401:
+                return LLMAuthenticationError(
+                    message=f"Authentication failed with Google Vertex: {str(e)}",
+                    code=ErrorCode.INTERNAL_SERVER_ERROR,
+                )
+            elif e.code == 403:
+                return LLMPermissionDeniedError(
+                    message=f"Permission denied by Google Vertex: {str(e)}",
+                    code=ErrorCode.INTERNAL_SERVER_ERROR,
+                )
+            elif e.code == 404:
+                return LLMNotFoundError(
+                    message=f"Resource not found in Google Vertex: {str(e)}",
+                    code=ErrorCode.INTERNAL_SERVER_ERROR,
+                )
+            elif e.code == 408:
+                return LLMTimeoutError(
+                    message=f"Request to Google Vertex timed out: {str(e)}",
+                    code=ErrorCode.TIMEOUT,
+                    details={"cause": str(e.__cause__) if e.__cause__ else None},
+                )
+            elif e.code == 422:
+                return LLMUnprocessableEntityError(
+                    message=f"Invalid request content for Google Vertex: {str(e)}",
+                    code=ErrorCode.INTERNAL_SERVER_ERROR,
+                )
+            elif e.code == 429:
+                logger.warning("[Google Vertex] Rate limited (429). Consider backoff.")
+                return LLMRateLimitError(
+                    message=f"Rate limited by Google Vertex: {str(e)}",
+                    code=ErrorCode.RATE_LIMIT_EXCEEDED,
+                )
+            else:
+                return LLMServerError(
+                    message=f"Google Vertex client error: {str(e)}",
+                    code=ErrorCode.INTERNAL_SERVER_ERROR,
+                    details={
+                        "status_code": e.code,
+                        "response_json": getattr(e, "response_json", None),
+                    },
+                )
+
+        if isinstance(e, errors.ServerError):
+            logger.warning(f"[Google Vertex] Server error ({e.code}): {e}")
+
+            # Handle specific server error codes
+            if e.code == 500:
+                return LLMServerError(
+                    message=f"Google Vertex internal server error: {str(e)}",
+                    code=ErrorCode.INTERNAL_SERVER_ERROR,
+                    details={
+                        "status_code": e.code,
+                        "response_json": getattr(e, "response_json", None),
+                    },
+                )
+            elif e.code == 502:
+                return LLMConnectionError(
+                    message=f"Bad gateway from Google Vertex: {str(e)}",
+                    code=ErrorCode.INTERNAL_SERVER_ERROR,
+                    details={"cause": str(e.__cause__) if e.__cause__ else None},
+                )
+            elif e.code == 503:
+                return LLMServerError(
+                    message=f"Google Vertex service unavailable: {str(e)}",
+                    code=ErrorCode.INTERNAL_SERVER_ERROR,
+                    details={
+                        "status_code": e.code,
+                        "response_json": getattr(e, "response_json", None),
+                    },
+                )
+            elif e.code == 504:
+                return LLMTimeoutError(
+                    message=f"Gateway timeout from Google Vertex: {str(e)}",
+                    code=ErrorCode.TIMEOUT,
+                    details={"cause": str(e.__cause__) if e.__cause__ else None},
+                )
+            else:
+                return LLMServerError(
+                    message=f"Google Vertex server error: {str(e)}",
+                    code=ErrorCode.INTERNAL_SERVER_ERROR,
+                    details={
+                        "status_code": e.code,
+                        "response_json": getattr(e, "response_json", None),
+                    },
+                )
+
+        if isinstance(e, errors.APIError):
+            logger.warning(f"[Google Vertex] API error ({e.code}): {e}")
+            return LLMServerError(
+                message=f"Google Vertex API error: {str(e)}",
+                code=ErrorCode.INTERNAL_SERVER_ERROR,
+                details={
+                    "status_code": e.code,
+                    "response_json": getattr(e, "response_json", None),
+                },
+            )
+
+        # Handle connection-related errors
+        if "connection" in str(e).lower() or "timeout" in str(e).lower():
+            logger.warning(f"[Google Vertex] Connection/timeout error: {e}")
+            return LLMConnectionError(
+                message=f"Failed to connect to Google Vertex: {str(e)}",
+                code=ErrorCode.INTERNAL_SERVER_ERROR,
+                details={"cause": str(e.__cause__) if e.__cause__ else None},
+            )
+
+        # Fallback to base implementation for other errors
         return super().handle_llm_error(e)
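Note: the new handle_llm_error maps google.genai ClientError/ServerError status codes onto the package's typed LLM exceptions, which is what lets the agent loop above translate failures into the llm_api_error stop reason. A condensed sketch of the same status-code-to-exception idea, using a lookup table and stand-in classes:

    class LLMError(Exception): ...
    class LLMAuthenticationError(LLMError): ...
    class LLMRateLimitError(LLMError): ...
    class LLMTimeoutError(LLMError): ...
    class LLMServerError(LLMError): ...

    # status code -> typed exception; anything unlisted falls back to LLMServerError
    _STATUS_MAP = {
        401: LLMAuthenticationError,
        408: LLMTimeoutError,
        429: LLMRateLimitError,
        504: LLMTimeoutError,
    }

    def map_status_to_error(status_code: int, detail: str) -> LLMError:
        exc_cls = _STATUS_MAP.get(status_code, LLMServerError)
        return exc_cls(f"provider returned {status_code}: {detail}")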