letta-nightly 0.11.3.dev20250819104229__py3-none-any.whl → 0.11.4.dev20250820213507__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (90)
  1. letta/__init__.py +1 -1
  2. letta/agents/helpers.py +4 -0
  3. letta/agents/letta_agent.py +142 -5
  4. letta/constants.py +10 -7
  5. letta/data_sources/connectors.py +70 -53
  6. letta/embeddings.py +3 -240
  7. letta/errors.py +28 -0
  8. letta/functions/function_sets/base.py +4 -4
  9. letta/functions/functions.py +287 -32
  10. letta/functions/mcp_client/types.py +11 -0
  11. letta/functions/schema_validator.py +187 -0
  12. letta/functions/typescript_parser.py +196 -0
  13. letta/helpers/datetime_helpers.py +8 -4
  14. letta/helpers/tool_execution_helper.py +25 -2
  15. letta/llm_api/anthropic_client.py +23 -18
  16. letta/llm_api/azure_client.py +73 -0
  17. letta/llm_api/bedrock_client.py +8 -4
  18. letta/llm_api/google_vertex_client.py +14 -5
  19. letta/llm_api/llm_api_tools.py +2 -217
  20. letta/llm_api/llm_client.py +15 -1
  21. letta/llm_api/llm_client_base.py +32 -1
  22. letta/llm_api/openai.py +1 -0
  23. letta/llm_api/openai_client.py +18 -28
  24. letta/llm_api/together_client.py +55 -0
  25. letta/orm/provider.py +1 -0
  26. letta/orm/step_metrics.py +40 -1
  27. letta/otel/db_pool_monitoring.py +1 -1
  28. letta/schemas/agent.py +3 -4
  29. letta/schemas/agent_file.py +2 -0
  30. letta/schemas/block.py +11 -5
  31. letta/schemas/embedding_config.py +4 -5
  32. letta/schemas/enums.py +1 -1
  33. letta/schemas/job.py +2 -3
  34. letta/schemas/llm_config.py +79 -7
  35. letta/schemas/mcp.py +0 -24
  36. letta/schemas/message.py +0 -108
  37. letta/schemas/openai/chat_completion_request.py +1 -0
  38. letta/schemas/providers/__init__.py +0 -2
  39. letta/schemas/providers/anthropic.py +106 -8
  40. letta/schemas/providers/azure.py +102 -8
  41. letta/schemas/providers/base.py +10 -3
  42. letta/schemas/providers/bedrock.py +28 -16
  43. letta/schemas/providers/letta.py +3 -3
  44. letta/schemas/providers/ollama.py +2 -12
  45. letta/schemas/providers/openai.py +4 -4
  46. letta/schemas/providers/together.py +14 -2
  47. letta/schemas/sandbox_config.py +2 -1
  48. letta/schemas/tool.py +46 -22
  49. letta/server/rest_api/routers/v1/agents.py +179 -38
  50. letta/server/rest_api/routers/v1/folders.py +13 -8
  51. letta/server/rest_api/routers/v1/providers.py +10 -3
  52. letta/server/rest_api/routers/v1/sources.py +14 -8
  53. letta/server/rest_api/routers/v1/steps.py +17 -1
  54. letta/server/rest_api/routers/v1/tools.py +96 -5
  55. letta/server/rest_api/streaming_response.py +91 -45
  56. letta/server/server.py +27 -38
  57. letta/services/agent_manager.py +92 -20
  58. letta/services/agent_serialization_manager.py +11 -7
  59. letta/services/context_window_calculator/context_window_calculator.py +40 -2
  60. letta/services/helpers/agent_manager_helper.py +73 -12
  61. letta/services/mcp_manager.py +109 -15
  62. letta/services/passage_manager.py +28 -109
  63. letta/services/provider_manager.py +24 -0
  64. letta/services/step_manager.py +68 -0
  65. letta/services/summarizer/summarizer.py +1 -4
  66. letta/services/tool_executor/core_tool_executor.py +1 -1
  67. letta/services/tool_executor/sandbox_tool_executor.py +26 -9
  68. letta/services/tool_manager.py +82 -5
  69. letta/services/tool_sandbox/base.py +3 -11
  70. letta/services/tool_sandbox/modal_constants.py +17 -0
  71. letta/services/tool_sandbox/modal_deployment_manager.py +242 -0
  72. letta/services/tool_sandbox/modal_sandbox.py +218 -3
  73. letta/services/tool_sandbox/modal_sandbox_v2.py +429 -0
  74. letta/services/tool_sandbox/modal_version_manager.py +273 -0
  75. letta/services/tool_sandbox/safe_pickle.py +193 -0
  76. letta/settings.py +5 -3
  77. letta/templates/sandbox_code_file.py.j2 +2 -4
  78. letta/templates/sandbox_code_file_async.py.j2 +2 -4
  79. letta/utils.py +1 -1
  80. {letta_nightly-0.11.3.dev20250819104229.dist-info → letta_nightly-0.11.4.dev20250820213507.dist-info}/METADATA +2 -2
  81. {letta_nightly-0.11.3.dev20250819104229.dist-info → letta_nightly-0.11.4.dev20250820213507.dist-info}/RECORD +84 -81
  82. letta/llm_api/anthropic.py +0 -1206
  83. letta/llm_api/aws_bedrock.py +0 -104
  84. letta/llm_api/azure_openai.py +0 -118
  85. letta/llm_api/azure_openai_constants.py +0 -11
  86. letta/llm_api/cohere.py +0 -391
  87. letta/schemas/providers/cohere.py +0 -18
  88. {letta_nightly-0.11.3.dev20250819104229.dist-info → letta_nightly-0.11.4.dev20250820213507.dist-info}/LICENSE +0 -0
  89. {letta_nightly-0.11.3.dev20250819104229.dist-info → letta_nightly-0.11.4.dev20250820213507.dist-info}/WHEEL +0 -0
  90. {letta_nightly-0.11.3.dev20250819104229.dist-info → letta_nightly-0.11.4.dev20250820213507.dist-info}/entry_points.txt +0 -0
letta/llm_api/google_vertex_client.py CHANGED
@@ -239,7 +239,7 @@ class GoogleVertexClient(LLMClientBase):
  request_data["config"]["response_mime_type"] = "application/json"
  request_data["config"]["response_schema"] = self.get_function_call_response_schema(tools[0])
  del request_data["config"]["tools"]
- else:
+ elif tools:
  tool_config = ToolConfig(
  function_calling_config=FunctionCallingConfig(
  # ANY mode forces the model to predict only function calls
@@ -255,10 +255,13 @@ class GoogleVertexClient(LLMClientBase):
  # Otherwise, use the value from max_reasoning_tokens
  if "flash" in llm_config.model:
  # Gemini flash models may fail to call tools even with FunctionCallingConfigMode.ANY if thinking is fully disabled, set to minimum to prevent tool call failure
+ thinking_budget = llm_config.max_reasoning_tokens if llm_config.enable_reasoner else self.get_thinking_budget(llm_config.model)
+ if thinking_budget <= 0:
+ logger.error(
+ f"Thinking budget of {thinking_budget} for Gemini reasoning model {llm_config.model}, this will likely cause tool call failures"
+ )
  thinking_config = ThinkingConfig(
- thinking_budget=(
- llm_config.max_reasoning_tokens if llm_config.enable_reasoner else self.get_thinking_budget(llm_config.model)
- ),
+ thinking_budget=(thinking_budget),
  )
  request_data["config"]["thinking_config"] = thinking_config.model_dump()

@@ -309,7 +312,7 @@ class GoogleVertexClient(LLMClientBase):
  if candidate.finish_reason == "MALFORMED_FUNCTION_CALL":
  raise ValueError(f"Error in response data from LLM: {candidate.finish_reason}...")
  else:
- raise ValueError(f"Error in response data from LLM: {response_data}")
+ raise ValueError(f"Error in response data from LLM: {candidate.model_dump()}")

  role = content.role
  assert role == "model", f"Unknown role in response: {role}"
@@ -496,6 +499,12 @@ class GoogleVertexClient(LLMClientBase):
  "required": ["name", "args"],
  }

+ # https://ai.google.dev/gemini-api/docs/thinking#set-budget
+ # | Model          | Default setting                                            | Range     | Disable thinking    | Turn on dynamic thinking |
+ # |----------------|------------------------------------------------------------|-----------|---------------------|--------------------------|
+ # | 2.5 Pro        | Dynamic thinking: Model decides when and how much to think | 128-32768 | N/A: Cannot disable | thinkingBudget = -1      |
+ # | 2.5 Flash      | Dynamic thinking: Model decides when and how much to think | 0-24576   | thinkingBudget = 0  | thinkingBudget = -1      |
+ # | 2.5 Flash Lite | Model does not think                                       | 512-24576 | thinkingBudget = 0  | thinkingBudget = -1      |
  def get_thinking_budget(self, model: str) -> bool:
  if model_settings.gemini_force_minimum_thinking_budget:
  if all(substring in model for substring in ["2.5", "flash", "lite"]):
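The thinking-budget change above factors the budget into a local variable so it can be validated before ThinkingConfig is built, and the new comment table documents the Gemini 2.5 budget ranges. Below is a minimal standalone sketch of that selection-and-guard logic, assuming an `llm_config` with the fields shown in the diff and a `get_thinking_budget` helper whose body is not visible here; it is not Letta's exact implementation.

```python
# Hedged sketch of the guard added in the hunk above. `llm_config` fields and
# the get_thinking_budget helper mirror the diff; their implementations are
# not shown in the diff and the logger name is an assumption.
import logging
from typing import Callable

logger = logging.getLogger(__name__)


def resolve_thinking_budget(llm_config, get_thinking_budget: Callable[[str], int]) -> int:
    # Prefer the explicit reasoning budget when the reasoner is enabled,
    # otherwise fall back to the model-specific budget (see the table above:
    # 2.5 Pro 128-32768, 2.5 Flash 0-24576, 2.5 Flash Lite 512-24576).
    budget = (
        llm_config.max_reasoning_tokens
        if llm_config.enable_reasoner
        else get_thinking_budget(llm_config.model)
    )
    if budget <= 0:
        # Flash models may fail to emit tool calls when thinking is fully
        # disabled, so a non-positive budget is logged as a likely problem.
        logger.error(
            "Thinking budget of %s for Gemini reasoning model %s, "
            "this will likely cause tool call failures",
            budget,
            llm_config.model,
        )
    return budget
```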
letta/llm_api/llm_api_tools.py CHANGED
@@ -7,13 +7,6 @@ import requests

  from letta.constants import CLI_WARNING_PREFIX
  from letta.errors import LettaConfigurationError, RateLimitExceededError
- from letta.llm_api.anthropic import (
- anthropic_bedrock_chat_completions_request,
- anthropic_chat_completions_process_stream,
- anthropic_chat_completions_request,
- )
- from letta.llm_api.aws_bedrock import has_valid_aws_credentials
- from letta.llm_api.azure_openai import azure_openai_chat_completions_request
  from letta.llm_api.deepseek import build_deepseek_chat_completions_request, convert_deepseek_response_to_chatcompletion
  from letta.llm_api.helpers import add_inner_thoughts_to_functions, unpack_all_inner_thoughts_from_kwargs
  from letta.llm_api.openai import (
@@ -30,14 +23,14 @@ from letta.otel.tracing import log_event, trace_method
  from letta.schemas.enums import ProviderCategory
  from letta.schemas.llm_config import LLMConfig
  from letta.schemas.message import Message
- from letta.schemas.openai.chat_completion_request import ChatCompletionRequest, cast_message_to_subtype
+ from letta.schemas.openai.chat_completion_request import ChatCompletionRequest
  from letta.schemas.openai.chat_completion_response import ChatCompletionResponse
  from letta.schemas.provider_trace import ProviderTraceCreate
  from letta.services.telemetry_manager import TelemetryManager
  from letta.settings import ModelSettings
  from letta.streaming_interface import AgentChunkStreamingInterface, AgentRefreshStreamingInterface

- LLM_API_PROVIDER_OPTIONS = ["openai", "azure", "anthropic", "google_ai", "cohere", "local", "groq", "deepseek"]
+ LLM_API_PROVIDER_OPTIONS = ["openai", "azure", "anthropic", "google_ai", "local", "groq", "deepseek"]


  def retry_with_exponential_backoff(
@@ -312,153 +305,6 @@ def create(

  return response

- # azure
- elif llm_config.model_endpoint_type == "azure":
- if stream:
- raise NotImplementedError(f"Streaming not yet implemented for {llm_config.model_endpoint_type}")
-
- if model_settings.azure_api_key is None:
- raise LettaConfigurationError(
- message="Azure API key is missing. Did you set AZURE_API_KEY in your env?", missing_fields=["azure_api_key"]
- )
-
- if model_settings.azure_base_url is None:
- raise LettaConfigurationError(
- message="Azure base url is missing. Did you set AZURE_BASE_URL in your env?", missing_fields=["azure_base_url"]
- )
-
- if model_settings.azure_api_version is None:
- raise LettaConfigurationError(
- message="Azure API version is missing. Did you set AZURE_API_VERSION in your env?", missing_fields=["azure_api_version"]
- )
-
- # Set the llm config model_endpoint from model_settings
- # For Azure, this model_endpoint is required to be configured via env variable, so users don't need to provide it in the LLM config
- llm_config.model_endpoint = model_settings.azure_base_url
- chat_completion_request = build_openai_chat_completions_request(
- llm_config, messages, user_id, functions, function_call, use_tool_naming
- )
-
- response = azure_openai_chat_completions_request(
- model_settings=model_settings,
- llm_config=llm_config,
- chat_completion_request=chat_completion_request,
- )
-
- if llm_config.put_inner_thoughts_in_kwargs:
- response = unpack_all_inner_thoughts_from_kwargs(response=response, inner_thoughts_key=INNER_THOUGHTS_KWARG)
-
- return response
-
- elif llm_config.model_endpoint_type == "anthropic":
- if not use_tool_naming:
- raise NotImplementedError("Only tool calling supported on Anthropic API requests")
-
- if llm_config.enable_reasoner:
- llm_config.put_inner_thoughts_in_kwargs = False
-
- # Force tool calling
- tool_call = None
- if functions is None:
- # Special case for summarization path
- tools = None
- tool_choice = None
- elif force_tool_call is not None:
- # tool_call = {"type": "function", "function": {"name": force_tool_call}}
- tool_choice = {"type": "tool", "name": force_tool_call}
- tools = [{"type": "function", "function": f} for f in functions if f["name"] == force_tool_call]
- assert functions is not None
-
- # need to have this setting to be able to put inner thoughts in kwargs
- llm_config.put_inner_thoughts_in_kwargs = True
- else:
- if llm_config.put_inner_thoughts_in_kwargs:
- # tool_choice_type other than "auto" only plays nice if thinking goes inside the tool calls
- tool_choice = {"type": "any", "disable_parallel_tool_use": True}
- else:
- tool_choice = {"type": "auto", "disable_parallel_tool_use": True}
- tools = [{"type": "function", "function": f} for f in functions] if functions is not None else None
-
- chat_completion_request = ChatCompletionRequest(
- model=llm_config.model,
- messages=[cast_message_to_subtype(m.to_openai_dict()) for m in messages],
- tools=tools,
- tool_choice=tool_choice,
- max_tokens=llm_config.max_tokens, # Note: max_tokens is required for Anthropic API
- temperature=llm_config.temperature,
- stream=stream,
- )
-
- # Handle streaming
- if stream: # Client requested token streaming
- assert isinstance(stream_interface, (AgentChunkStreamingInterface, AgentRefreshStreamingInterface)), type(stream_interface)
-
- stream_interface.inner_thoughts_in_kwargs = True
- response = anthropic_chat_completions_process_stream(
- chat_completion_request=chat_completion_request,
- put_inner_thoughts_in_kwargs=llm_config.put_inner_thoughts_in_kwargs,
- stream_interface=stream_interface,
- extended_thinking=llm_config.enable_reasoner,
- max_reasoning_tokens=llm_config.max_reasoning_tokens,
- provider_name=llm_config.provider_name,
- provider_category=llm_config.provider_category,
- name=name,
- user_id=user_id,
- )
-
- else:
- # Client did not request token streaming (expect a blocking backend response)
- response = anthropic_chat_completions_request(
- data=chat_completion_request,
- put_inner_thoughts_in_kwargs=llm_config.put_inner_thoughts_in_kwargs,
- extended_thinking=llm_config.enable_reasoner,
- max_reasoning_tokens=llm_config.max_reasoning_tokens,
- provider_name=llm_config.provider_name,
- provider_category=llm_config.provider_category,
- user_id=user_id,
- )
-
- if llm_config.put_inner_thoughts_in_kwargs:
- response = unpack_all_inner_thoughts_from_kwargs(response=response, inner_thoughts_key=INNER_THOUGHTS_KWARG)
-
- telemetry_manager.create_provider_trace(
- actor=actor,
- provider_trace_create=ProviderTraceCreate(
- request_json=chat_completion_request.model_json_schema(),
- response_json=response.model_json_schema(),
- step_id=step_id,
- organization_id=actor.organization_id,
- ),
- )
-
- return response
-
- # elif llm_config.model_endpoint_type == "cohere":
- # if stream:
- # raise NotImplementedError(f"Streaming not yet implemented for {llm_config.model_endpoint_type}")
- # if not use_tool_naming:
- # raise NotImplementedError("Only tool calling supported on Cohere API requests")
- #
- # if functions is not None:
- # tools = [{"type": "function", "function": f} for f in functions]
- # tools = [Tool(**t) for t in tools]
- # else:
- # tools = None
- #
- # return cohere_chat_completions_request(
- # # url=llm_config.model_endpoint,
- # url="https://api.cohere.ai/v1", # TODO
- # api_key=os.getenv("COHERE_API_KEY"), # TODO remove
- # chat_completion_request=ChatCompletionRequest(
- # model="command-r-plus", # TODO
- # messages=[cast_message_to_subtype(m.to_openai_dict()) for m in messages],
- # tools=tools,
- # tool_choice=function_call,
- # # user=str(user_id),
- # # NOTE: max_tokens is required for Anthropic API
- # # max_tokens=1024, # TODO make dynamic
- # ),
- # )
  elif llm_config.model_endpoint_type == "groq":
  if stream:
  raise NotImplementedError("Streaming not yet implemented for Groq.")
@@ -510,67 +356,6 @@ def create(

  return response

- elif llm_config.model_endpoint_type == "together":
- """TogetherAI endpoint that goes via /completions instead of /chat/completions"""
-
- if stream:
- raise NotImplementedError("Streaming not yet implemented for TogetherAI (via the /completions endpoint).")
-
- if model_settings.together_api_key is None and (
- llm_config.model_endpoint == "https://api.together.ai/v1/completions"
- or llm_config.model_endpoint == "https://api.together.xyz/v1/completions"
- ):
- raise LettaConfigurationError(message="TogetherAI key is missing from letta config file", missing_fields=["together_api_key"])
-
- return get_chat_completion(
- model=llm_config.model,
- messages=messages,
- functions=functions,
- functions_python=functions_python,
- function_call=function_call,
- context_window=llm_config.context_window,
- endpoint=llm_config.model_endpoint,
- endpoint_type="vllm", # NOTE: use the vLLM path through /completions
- wrapper=llm_config.model_wrapper,
- user=str(user_id),
- # hint
- first_message=first_message,
- # auth-related
- auth_type="bearer_token", # NOTE: Together expects bearer token auth
- auth_key=model_settings.together_api_key,
- )
-
- elif llm_config.model_endpoint_type == "bedrock":
- """Anthropic endpoint that goes via /embeddings instead of /chat/completions"""
-
- if stream:
- raise NotImplementedError("Streaming not yet implemented for Anthropic (via the /embeddings endpoint).")
- if not use_tool_naming:
- raise NotImplementedError("Only tool calling supported on Anthropic API requests")
-
- if not has_valid_aws_credentials():
- raise LettaConfigurationError(message="Invalid or missing AWS credentials. Please configure valid AWS credentials.")
-
- tool_call = None
- if force_tool_call is not None:
- tool_call = {"type": "function", "function": {"name": force_tool_call}}
- assert functions is not None
-
- return anthropic_bedrock_chat_completions_request(
- data=ChatCompletionRequest(
- model=llm_config.model,
- messages=[cast_message_to_subtype(m.to_openai_dict()) for m in messages],
- tools=[{"type": "function", "function": f} for f in functions] if functions else None,
- tool_choice=tool_call,
- # user=str(user_id),
- # NOTE: max_tokens is required for Anthropic API
- max_tokens=llm_config.max_tokens,
- ),
- provider_name=llm_config.provider_name,
- provider_category=llm_config.provider_category,
- user_id=user_id,
- )
-
  elif llm_config.model_endpoint_type == "deepseek":
  if model_settings.deepseek_api_key is None and llm_config.model_endpoint == "":
  # only is a problem if we are *not* using an openai proxy
letta/llm_api/llm_client.py CHANGED
@@ -58,12 +58,26 @@ class LLMClient:
  put_inner_thoughts_first=put_inner_thoughts_first,
  actor=actor,
  )
- case ProviderType.openai | ProviderType.together | ProviderType.ollama:
+ case ProviderType.openai | ProviderType.ollama:
  from letta.llm_api.openai_client import OpenAIClient

  return OpenAIClient(
  put_inner_thoughts_first=put_inner_thoughts_first,
  actor=actor,
  )
+ case ProviderType.together:
+ from letta.llm_api.together_client import TogetherClient
+
+ return TogetherClient(
+ put_inner_thoughts_first=put_inner_thoughts_first,
+ actor=actor,
+ )
+ case ProviderType.azure:
+ from letta.llm_api.azure_client import AzureClient
+
+ return AzureClient(
+ put_inner_thoughts_first=put_inner_thoughts_first,
+ actor=actor,
+ )
  case _:
  return None
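The factory hunk above routes Together and Azure provider types to dedicated clients instead of the generic OpenAIClient or the legacy create() path. A hedged usage sketch follows; the keyword arguments come from the diff, while the `provider_type` parameter name and the `actor` object are assumptions about the surrounding letta API.

```python
# Hedged usage sketch for the factory change above; not a definitive API.
from letta.llm_api.llm_client import LLMClient
from letta.schemas.enums import ProviderType


def make_together_client(actor):
    """`actor` is a letta User; the enum values come from the diff above."""
    client = LLMClient.create(
        provider_type=ProviderType.together,  # now returns TogetherClient rather than OpenAIClient
        put_inner_thoughts_first=True,
        actor=actor,
    )
    if client is None:
        # Provider types without a dedicated client fall through to the
        # legacy llm_api_tools.create() path (groq, deepseek, ...).
        raise RuntimeError("no dedicated client for this provider type")
    return client
```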
letta/llm_api/llm_client_base.py CHANGED
@@ -1,6 +1,6 @@
  import json
  from abc import abstractmethod
- from typing import TYPE_CHECKING, Dict, List, Optional, Union
+ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union

  from anthropic.types.beta.messages import BetaMessageBatch
  from openai import AsyncStream, Stream
@@ -9,6 +9,7 @@ from openai.types.chat.chat_completion_chunk import ChatCompletionChunk
  from letta.errors import LLMError
  from letta.otel.tracing import log_event, trace_method
  from letta.schemas.embedding_config import EmbeddingConfig
+ from letta.schemas.enums import ProviderCategory
  from letta.schemas.llm_config import LLMConfig
  from letta.schemas.message import Message
  from letta.schemas.openai.chat_completion_response import ChatCompletionResponse
@@ -111,6 +112,9 @@ class LLMClientBase:
  agent_tools_mapping: Dict[str, List[dict]],
  agent_llm_config_mapping: Dict[str, LLMConfig],
  ) -> Union[BetaMessageBatch]:
+ """
+ Issues a batch request to the downstream model endpoint and parses response.
+ """
  raise NotImplementedError

  @abstractmethod
@@ -176,6 +180,9 @@ class LLMClientBase:

  @abstractmethod
  def is_reasoning_model(self, llm_config: LLMConfig) -> bool:
+ """
+ Returns True if the model is a native reasoning model.
+ """
  raise NotImplementedError

  @abstractmethod
@@ -192,6 +199,30 @@ class LLMClientBase:
  """
  return LLMError(f"Unhandled LLM error: {str(e)}")

+ def get_byok_overrides(self, llm_config: LLMConfig) -> Tuple[Optional[str], Optional[str], Optional[str]]:
+ """
+ Returns the override key for the given llm config.
+ """
+ api_key = None
+ if llm_config.provider_category == ProviderCategory.byok:
+ from letta.services.provider_manager import ProviderManager
+
+ api_key = ProviderManager().get_override_key(llm_config.provider_name, actor=self.actor)
+
+ return api_key, None, None
+
+ async def get_byok_overrides_async(self, llm_config: LLMConfig) -> Tuple[Optional[str], Optional[str], Optional[str]]:
+ """
+ Returns the override key for the given llm config.
+ """
+ api_key = None
+ if llm_config.provider_category == ProviderCategory.byok:
+ from letta.services.provider_manager import ProviderManager
+
+ api_key = await ProviderManager().get_override_key_async(llm_config.provider_name, actor=self.actor)
+
+ return api_key, None, None
+
  def _fix_truncated_json_response(self, response: ChatCompletionResponse) -> ChatCompletionResponse:
  """
  Fixes truncated JSON responses by ensuring the content is properly formatted.
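The new get_byok_overrides helpers centralize the bring-your-own-key lookup that was previously duplicated inside OpenAIClient. A hedged sketch of how a provider-specific client can reuse them; the helper and its (api_key, None, None) return shape come from the diff, while the fallback environment variable name is an illustrative assumption.

```python
# Hedged sketch only; the env-var fallback name is assumed, not from the diff.
import os

from letta.llm_api.llm_client_base import LLMClientBase


class ExampleProviderClient(LLMClientBase):
    def _resolve_api_key(self, llm_config) -> str:
        # BYOK providers return an org-scoped override key via ProviderManager;
        # everything else falls back to server settings or the environment.
        api_key, _, _ = self.get_byok_overrides(llm_config)
        return api_key or os.environ.get("EXAMPLE_PROVIDER_API_KEY", "DUMMY_API_KEY")
```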
letta/llm_api/openai.py CHANGED
@@ -42,6 +42,7 @@ from letta.utils import get_tool_call_id, smart_urljoin
  logger = get_logger(__name__)


+ # TODO: MOVE THIS TO OPENAI_CLIENT
  def openai_check_valid_api_key(base_url: str, api_key: Union[str, None]) -> None:
  if api_key:
  try:
letta/llm_api/openai_client.py CHANGED
@@ -26,7 +26,6 @@ from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG
  from letta.log import get_logger
  from letta.otel.tracing import trace_method
  from letta.schemas.embedding_config import EmbeddingConfig
- from letta.schemas.enums import ProviderCategory, ProviderType
  from letta.schemas.letta_message_content import MessageContentType
  from letta.schemas.llm_config import LLMConfig
  from letta.schemas.message import Message as PydanticMessage
@@ -54,6 +53,11 @@ def is_openai_5_model(model: str) -> bool:
  return model.startswith("gpt-5")


+ def supports_verbosity_control(model: str) -> bool:
+ """Check if the model supports verbosity control, currently only GPT-5 models support this"""
+ return is_openai_5_model(model)
+
+
  def accepts_developer_role(model: str) -> bool:
  """Checks if the model accepts the 'developer' role. Note that not all reasoning models accept this role.

@@ -102,8 +106,6 @@ def requires_auto_tool_choice(llm_config: LLMConfig) -> bool:
  """Certain providers require the tool choice to be set to 'auto'."""
  if "nebius.com" in llm_config.model_endpoint:
  return True
- if "together.ai" in llm_config.model_endpoint or "together.xyz" in llm_config.model_endpoint:
- return True
  if llm_config.handle and "vllm" in llm_config.handle:
  return True
  if llm_config.compatibility_type == "mlx":
@@ -113,13 +115,7 @@

  class OpenAIClient(LLMClientBase):
  def _prepare_client_kwargs(self, llm_config: LLMConfig) -> dict:
- api_key = None
- if llm_config.provider_category == ProviderCategory.byok:
- from letta.services.provider_manager import ProviderManager
-
- api_key = ProviderManager().get_override_key(llm_config.provider_name, actor=self.actor)
- if llm_config.model_endpoint_type == ProviderType.together:
- api_key = model_settings.together_api_key or os.environ.get("TOGETHER_API_KEY")
+ api_key, _, _ = self.get_byok_overrides(llm_config)

  if not api_key:
  api_key = model_settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
@@ -130,25 +126,14 @@ class OpenAIClient(LLMClientBase):
  return kwargs

  def _prepare_client_kwargs_embedding(self, embedding_config: EmbeddingConfig) -> dict:
- api_key = None
- if embedding_config.embedding_endpoint_type == ProviderType.together:
- api_key = model_settings.together_api_key or os.environ.get("TOGETHER_API_KEY")
-
- if not api_key:
- api_key = model_settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
+ api_key = model_settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
  # supposedly the openai python client requires a dummy API key
  api_key = api_key or "DUMMY_API_KEY"
  kwargs = {"api_key": api_key, "base_url": embedding_config.embedding_endpoint}
  return kwargs

  async def _prepare_client_kwargs_async(self, llm_config: LLMConfig) -> dict:
- api_key = None
- if llm_config.provider_category == ProviderCategory.byok:
- from letta.services.provider_manager import ProviderManager
-
- api_key = await ProviderManager().get_override_key_async(llm_config.provider_name, actor=self.actor)
- if llm_config.model_endpoint_type == ProviderType.together:
- api_key = model_settings.together_api_key or os.environ.get("TOGETHER_API_KEY")
+ api_key, _, _ = await self.get_byok_overrides_async(llm_config)

  if not api_key:
  api_key = model_settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
@@ -158,6 +143,9 @@ class OpenAIClient(LLMClientBase):

  return kwargs

+ def requires_auto_tool_choice(self, llm_config: LLMConfig) -> bool:
+ return requires_auto_tool_choice(llm_config)
+
  @trace_method
  def build_request_data(
  self,
@@ -204,7 +192,7 @@ class OpenAIClient(LLMClientBase):
  # TODO(matt) move into LLMConfig
  # TODO: This vllm checking is very brittle and is a patch at most
  tool_choice = None
- if requires_auto_tool_choice(llm_config):
+ if self.requires_auto_tool_choice(llm_config):
  tool_choice = "auto"
  elif tools:
  # only set if tools is non-Null
@@ -224,6 +212,10 @@ class OpenAIClient(LLMClientBase):
  temperature=llm_config.temperature if supports_temperature_param(model) else 1.0,
  )

+ # Add verbosity control for GPT-5 models
+ if supports_verbosity_control(model) and llm_config.verbosity:
+ data.verbosity = llm_config.verbosity
+
  if llm_config.frequency_penalty is not None:
  data.frequency_penalty = llm_config.frequency_penalty

@@ -252,8 +244,8 @@ class OpenAIClient(LLMClientBase):
  tool.function = FunctionSchema(**structured_output_version)
  except ValueError as e:
  logger.warning(f"Failed to convert tool function to structured output, tool={tool}, error={e}")
-
- return data.model_dump(exclude_unset=True)
+ request_data = data.model_dump(exclude_unset=True)
+ return request_data

  @trace_method
  def request(self, request_data: dict, llm_config: LLMConfig) -> dict:
@@ -261,7 +253,6 @@ class OpenAIClient(LLMClientBase):
  Performs underlying synchronous request to OpenAI API and returns raw response dict.
  """
  client = OpenAI(**self._prepare_client_kwargs(llm_config))
-
  response: ChatCompletion = client.chat.completions.create(**request_data)
  return response.model_dump()

@@ -272,7 +263,6 @@ class OpenAIClient(LLMClientBase):
  """
  kwargs = await self._prepare_client_kwargs_async(llm_config)
  client = AsyncOpenAI(**kwargs)
-
  response: ChatCompletion = await client.chat.completions.create(**request_data)
  return response.model_dump()

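The verbosity hunk above only attaches the new field when the model is in the GPT-5 family and a value is configured. A hedged, self-contained sketch of that gate; the helper names and `llm_config.verbosity` come from the diff, while the SimpleNamespace objects stand in for letta's real ChatCompletionRequest and LLMConfig.

```python
# Hedged sketch of the GPT-5 verbosity gate added above; illustrative only.
from types import SimpleNamespace


def is_openai_5_model(model: str) -> bool:
    return model.startswith("gpt-5")


def supports_verbosity_control(model: str) -> bool:
    # Currently only GPT-5 family models accept a verbosity parameter.
    return is_openai_5_model(model)


def apply_verbosity(data, llm_config):
    # Attach the field only when the model supports it and a value is set,
    # so older models never receive an unknown parameter.
    if supports_verbosity_control(llm_config.model) and llm_config.verbosity:
        data.verbosity = llm_config.verbosity
    return data


request = apply_verbosity(
    SimpleNamespace(model="gpt-5-mini"),
    SimpleNamespace(model="gpt-5-mini", verbosity="low"),
)
assert request.verbosity == "low"
```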
letta/llm_api/together_client.py ADDED
@@ -0,0 +1,55 @@
+ import os
+ from typing import List
+
+ from openai import AsyncOpenAI, OpenAI
+ from openai.types.chat.chat_completion import ChatCompletion
+
+ from letta.llm_api.openai_client import OpenAIClient
+ from letta.otel.tracing import trace_method
+ from letta.schemas.embedding_config import EmbeddingConfig
+ from letta.schemas.llm_config import LLMConfig
+ from letta.settings import model_settings
+
+
+ class TogetherClient(OpenAIClient):
+
+ def requires_auto_tool_choice(self, llm_config: LLMConfig) -> bool:
+ return True
+
+ @trace_method
+ def request(self, request_data: dict, llm_config: LLMConfig) -> dict:
+ """
+ Performs underlying synchronous request to OpenAI API and returns raw response dict.
+ """
+ api_key, _, _ = self.get_byok_overrides(llm_config)
+
+ if not api_key:
+ api_key = model_settings.together_api_key or os.environ.get("TOGETHER_API_KEY")
+ client = OpenAI(api_key=api_key, base_url=llm_config.model_endpoint)
+
+ response: ChatCompletion = client.chat.completions.create(**request_data)
+ return response.model_dump()
+
+ @trace_method
+ async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict:
+ """
+ Performs underlying asynchronous request to OpenAI API and returns raw response dict.
+ """
+ api_key, _, _ = await self.get_byok_overrides_async(llm_config)
+
+ if not api_key:
+ api_key = model_settings.together_api_key or os.environ.get("TOGETHER_API_KEY")
+ client = AsyncOpenAI(api_key=api_key, base_url=llm_config.model_endpoint)
+
+ response: ChatCompletion = await client.chat.completions.create(**request_data)
+ return response.model_dump()
+
+ @trace_method
+ async def request_embeddings(self, inputs: List[str], embedding_config: EmbeddingConfig) -> List[List[float]]:
+ """Request embeddings given texts and embedding config"""
+ api_key = model_settings.together_api_key or os.environ.get("TOGETHER_API_KEY")
+ client = AsyncOpenAI(api_key=api_key, base_url=embedding_config.embedding_endpoint)
+ response = await client.embeddings.create(model=embedding_config.embedding_model, input=inputs)
+
+ # TODO: add total usage
+ return [r.embedding for r in response.data]
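A hedged usage sketch for the new TogetherClient above. The constructor keywords mirror the factory hunk earlier in this diff; the embedding model name, endpoint URL, and EmbeddingConfig field values are illustrative assumptions rather than values taken from the package.

```python
# Hedged sketch; config values below are assumptions, not letta defaults.
import asyncio

from letta.llm_api.together_client import TogetherClient
from letta.schemas.embedding_config import EmbeddingConfig


async def embed_with_together(actor, texts):
    client = TogetherClient(put_inner_thoughts_first=True, actor=actor)
    config = EmbeddingConfig(
        embedding_endpoint_type="openai",                              # Together speaks the OpenAI-compatible API
        embedding_endpoint="https://api.together.xyz/v1",              # assumed endpoint
        embedding_model="togethercomputer/m2-bert-80M-32k-retrieval",  # assumed model
        embedding_dim=768,
        embedding_chunk_size=300,
    )
    return await client.request_embeddings(texts, config)

# asyncio.run(embed_with_together(actor, ["hello", "world"]))  # actor: a letta User
```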
letta/orm/provider.py CHANGED
@@ -31,6 +31,7 @@ class Provider(SqlalchemyBase, OrganizationMixin):
  base_url: Mapped[str] = mapped_column(nullable=True, doc="Base URL for the provider.")
  access_key: Mapped[str] = mapped_column(nullable=True, doc="Access key used for requests to the provider.")
  region: Mapped[str] = mapped_column(nullable=True, doc="Region used for requests to the provider.")
+ api_version: Mapped[str] = mapped_column(nullable=True, doc="API version used for requests to the provider.")

  # relationships
  organization: Mapped["Organization"] = relationship("Organization", back_populates="providers")
letta/orm/step_metrics.py CHANGED
@@ -1,11 +1,15 @@
+ from datetime import datetime, timezone
  from typing import TYPE_CHECKING, Optional

  from sqlalchemy import BigInteger, ForeignKey, String
- from sqlalchemy.orm import Mapped, mapped_column, relationship
+ from sqlalchemy.ext.asyncio import AsyncSession
+ from sqlalchemy.orm import Mapped, Session, mapped_column, relationship

  from letta.orm.mixins import AgentMixin, ProjectMixin
  from letta.orm.sqlalchemy_base import SqlalchemyBase
  from letta.schemas.step_metrics import StepMetrics as PydanticStepMetrics
+ from letta.schemas.user import User
+ from letta.settings import DatabaseChoice, settings

  if TYPE_CHECKING:
  from letta.orm.agent import Agent
@@ -69,3 +73,38 @@ class StepMetrics(SqlalchemyBase, ProjectMixin, AgentMixin):
  step: Mapped["Step"] = relationship("Step", back_populates="metrics", uselist=False)
  job: Mapped[Optional["Job"]] = relationship("Job")
  agent: Mapped[Optional["Agent"]] = relationship("Agent")
+
+ def create(
+ self,
+ db_session: Session,
+ actor: Optional[User] = None,
+ no_commit: bool = False,
+ ) -> "StepMetrics":
+ """Override create to handle SQLite timestamp issues"""
+ # For SQLite, explicitly set timestamps as server_default may not work
+ if settings.database_engine == DatabaseChoice.SQLITE:
+ now = datetime.now(timezone.utc)
+ if not self.created_at:
+ self.created_at = now
+ if not self.updated_at:
+ self.updated_at = now
+
+ return super().create(db_session, actor=actor, no_commit=no_commit)
+
+ async def create_async(
+ self,
+ db_session: AsyncSession,
+ actor: Optional[User] = None,
+ no_commit: bool = False,
+ no_refresh: bool = False,
+ ) -> "StepMetrics":
+ """Override create_async to handle SQLite timestamp issues"""
+ # For SQLite, explicitly set timestamps as server_default may not work
+ if settings.database_engine == DatabaseChoice.SQLITE:
+ now = datetime.now(timezone.utc)
+ if not self.created_at:
+ self.created_at = now
+ if not self.updated_at:
+ self.updated_at = now
+
+ return await super().create_async(db_session, actor=actor, no_commit=no_commit, no_refresh=no_refresh)
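The create/create_async overrides above backfill created_at and updated_at in Python when the engine is SQLite, where the column server_default may not be applied as expected. Below is a minimal sketch of the same pattern outside letta's ORM; the table and column names are illustrative.

```python
# Hedged sketch of the SQLite-safe timestamp pattern used above: fill the
# timestamps in Python before INSERT instead of relying on a server_default.
from datetime import datetime, timezone

from sqlalchemy import DateTime, func
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column


class Base(DeclarativeBase):
    pass


class ExampleMetrics(Base):
    __tablename__ = "example_metrics"

    id: Mapped[int] = mapped_column(primary_key=True)
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), server_default=func.now(), nullable=True)
    updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), server_default=func.now(), nullable=True)

    def ensure_timestamps(self) -> None:
        # Mirror of the override above: set both fields explicitly so the row
        # is complete even if the backend ignores the server_default.
        now = datetime.now(timezone.utc)
        self.created_at = self.created_at or now
        self.updated_at = self.updated_at or now
```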
letta/otel/db_pool_monitoring.py CHANGED
@@ -252,7 +252,7 @@ class DatabasePoolMonitor:
  logger.info(f"Failed to record detach event metric: {e}")

  @event.listens_for(pool, "reset")
- def on_reset(dbapi_connection: DBAPIConnection, connection_record: ConnectionPoolEntry):
+ def on_reset(dbapi_connection: DBAPIConnection, connection_record: ConnectionPoolEntry, reset_state):
  """Called when a connection is reset."""
  try:
  from letta.otel.metric_registry import MetricRegistry
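The signature fix above matches SQLAlchemy 2.0's pool "reset" event, which passes a third reset_state argument describing why the connection is being reset. A hedged standalone sketch of the updated listener signature; the engine URL and the print call are illustrative.

```python
# Hedged sketch of the SQLAlchemy 2.0 "reset" pool event signature targeted by
# the fix above; the event passes a PoolResetState as the third argument.
from sqlalchemy import create_engine, event

engine = create_engine("sqlite:///:memory:")  # illustrative engine


@event.listens_for(engine.pool, "reset")
def on_reset(dbapi_connection, connection_record, reset_state):
    # reset_state describes the reset (e.g. normal return to pool vs. an
    # invalidation or termination); here we only report that a reset happened.
    print("connection reset:", reset_state)
```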