letta-nightly 0.11.3.dev20250820104219__py3-none-any.whl → 0.11.4.dev20250820213507__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- letta/__init__.py +1 -1
- letta/agents/helpers.py +4 -0
- letta/agents/letta_agent.py +142 -5
- letta/constants.py +10 -7
- letta/data_sources/connectors.py +70 -53
- letta/embeddings.py +3 -240
- letta/errors.py +28 -0
- letta/functions/function_sets/base.py +4 -4
- letta/functions/functions.py +287 -32
- letta/functions/mcp_client/types.py +11 -0
- letta/functions/schema_validator.py +187 -0
- letta/functions/typescript_parser.py +196 -0
- letta/helpers/datetime_helpers.py +8 -4
- letta/helpers/tool_execution_helper.py +25 -2
- letta/llm_api/anthropic_client.py +23 -18
- letta/llm_api/azure_client.py +73 -0
- letta/llm_api/bedrock_client.py +8 -4
- letta/llm_api/google_vertex_client.py +14 -5
- letta/llm_api/llm_api_tools.py +2 -217
- letta/llm_api/llm_client.py +15 -1
- letta/llm_api/llm_client_base.py +32 -1
- letta/llm_api/openai.py +1 -0
- letta/llm_api/openai_client.py +18 -28
- letta/llm_api/together_client.py +55 -0
- letta/orm/provider.py +1 -0
- letta/orm/step_metrics.py +40 -1
- letta/otel/db_pool_monitoring.py +1 -1
- letta/schemas/agent.py +3 -4
- letta/schemas/agent_file.py +2 -0
- letta/schemas/block.py +11 -5
- letta/schemas/embedding_config.py +4 -5
- letta/schemas/enums.py +1 -1
- letta/schemas/job.py +2 -3
- letta/schemas/llm_config.py +79 -7
- letta/schemas/mcp.py +0 -24
- letta/schemas/message.py +0 -108
- letta/schemas/openai/chat_completion_request.py +1 -0
- letta/schemas/providers/__init__.py +0 -2
- letta/schemas/providers/anthropic.py +106 -8
- letta/schemas/providers/azure.py +102 -8
- letta/schemas/providers/base.py +10 -3
- letta/schemas/providers/bedrock.py +28 -16
- letta/schemas/providers/letta.py +3 -3
- letta/schemas/providers/ollama.py +2 -12
- letta/schemas/providers/openai.py +4 -4
- letta/schemas/providers/together.py +14 -2
- letta/schemas/sandbox_config.py +2 -1
- letta/schemas/tool.py +46 -22
- letta/server/rest_api/routers/v1/agents.py +179 -38
- letta/server/rest_api/routers/v1/folders.py +13 -8
- letta/server/rest_api/routers/v1/providers.py +10 -3
- letta/server/rest_api/routers/v1/sources.py +14 -8
- letta/server/rest_api/routers/v1/steps.py +17 -1
- letta/server/rest_api/routers/v1/tools.py +96 -5
- letta/server/rest_api/streaming_response.py +91 -45
- letta/server/server.py +27 -38
- letta/services/agent_manager.py +92 -20
- letta/services/agent_serialization_manager.py +11 -7
- letta/services/context_window_calculator/context_window_calculator.py +40 -2
- letta/services/helpers/agent_manager_helper.py +73 -12
- letta/services/mcp_manager.py +109 -15
- letta/services/passage_manager.py +28 -109
- letta/services/provider_manager.py +24 -0
- letta/services/step_manager.py +68 -0
- letta/services/summarizer/summarizer.py +1 -4
- letta/services/tool_executor/core_tool_executor.py +1 -1
- letta/services/tool_executor/sandbox_tool_executor.py +26 -9
- letta/services/tool_manager.py +82 -5
- letta/services/tool_sandbox/base.py +3 -11
- letta/services/tool_sandbox/modal_constants.py +17 -0
- letta/services/tool_sandbox/modal_deployment_manager.py +242 -0
- letta/services/tool_sandbox/modal_sandbox.py +218 -3
- letta/services/tool_sandbox/modal_sandbox_v2.py +429 -0
- letta/services/tool_sandbox/modal_version_manager.py +273 -0
- letta/services/tool_sandbox/safe_pickle.py +193 -0
- letta/settings.py +5 -3
- letta/templates/sandbox_code_file.py.j2 +2 -4
- letta/templates/sandbox_code_file_async.py.j2 +2 -4
- letta/utils.py +1 -1
- {letta_nightly-0.11.3.dev20250820104219.dist-info → letta_nightly-0.11.4.dev20250820213507.dist-info}/METADATA +2 -2
- {letta_nightly-0.11.3.dev20250820104219.dist-info → letta_nightly-0.11.4.dev20250820213507.dist-info}/RECORD +84 -81
- letta/llm_api/anthropic.py +0 -1206
- letta/llm_api/aws_bedrock.py +0 -104
- letta/llm_api/azure_openai.py +0 -118
- letta/llm_api/azure_openai_constants.py +0 -11
- letta/llm_api/cohere.py +0 -391
- letta/schemas/providers/cohere.py +0 -18
- {letta_nightly-0.11.3.dev20250820104219.dist-info → letta_nightly-0.11.4.dev20250820213507.dist-info}/LICENSE +0 -0
- {letta_nightly-0.11.3.dev20250820104219.dist-info → letta_nightly-0.11.4.dev20250820213507.dist-info}/WHEEL +0 -0
- {letta_nightly-0.11.3.dev20250820104219.dist-info → letta_nightly-0.11.4.dev20250820213507.dist-info}/entry_points.txt +0 -0
@@ -239,7 +239,7 @@ class GoogleVertexClient(LLMClientBase):
             request_data["config"]["response_mime_type"] = "application/json"
             request_data["config"]["response_schema"] = self.get_function_call_response_schema(tools[0])
             del request_data["config"]["tools"]
-
+        elif tools:
             tool_config = ToolConfig(
                 function_calling_config=FunctionCallingConfig(
                     # ANY mode forces the model to predict only function calls
@@ -255,10 +255,13 @@ class GoogleVertexClient(LLMClientBase):
         # Otherwise, use the value from max_reasoning_tokens
         if "flash" in llm_config.model:
             # Gemini flash models may fail to call tools even with FunctionCallingConfigMode.ANY if thinking is fully disabled, set to minimum to prevent tool call failure
+            thinking_budget = llm_config.max_reasoning_tokens if llm_config.enable_reasoner else self.get_thinking_budget(llm_config.model)
+            if thinking_budget <= 0:
+                logger.error(
+                    f"Thinking budget of {thinking_budget} for Gemini reasoning model {llm_config.model}, this will likely cause tool call failures"
+                )
             thinking_config = ThinkingConfig(
-                thinking_budget=(
-                    llm_config.max_reasoning_tokens if llm_config.enable_reasoner else self.get_thinking_budget(llm_config.model)
-                ),
+                thinking_budget=(thinking_budget),
             )
             request_data["config"]["thinking_config"] = thinking_config.model_dump()
 
@@ -309,7 +312,7 @@ class GoogleVertexClient(LLMClientBase):
         if candidate.finish_reason == "MALFORMED_FUNCTION_CALL":
             raise ValueError(f"Error in response data from LLM: {candidate.finish_reason}...")
         else:
-            raise ValueError(f"Error in response data from LLM: {
+            raise ValueError(f"Error in response data from LLM: {candidate.model_dump()}")
 
         role = content.role
         assert role == "model", f"Unknown role in response: {role}"
@@ -496,6 +499,12 @@ class GoogleVertexClient(LLMClientBase):
             "required": ["name", "args"],
         }
 
+    # https://ai.google.dev/gemini-api/docs/thinking#set-budget
+    # | Model          | Default setting                                             | Range     | Disable thinking    | Turn on dynamic thinking |
+    # |----------------|-------------------------------------------------------------|-----------|---------------------|--------------------------|
+    # | 2.5 Pro        | Dynamic thinking: Model decides when and how much to think  | 128-32768 | N/A: Cannot disable | thinkingBudget = -1      |
+    # | 2.5 Flash      | Dynamic thinking: Model decides when and how much to think  | 0-24576   | thinkingBudget = 0  | thinkingBudget = -1      |
+    # | 2.5 Flash Lite | Model does not think                                         | 512-24576 | thinkingBudget = 0  | thinkingBudget = -1      |
     def get_thinking_budget(self, model: str) -> bool:
         if model_settings.gemini_force_minimum_thinking_budget:
             if all(substring in model for substring in ["2.5", "flash", "lite"]):
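The comment table added above records the documented thinking-budget ranges for Gemini 2.5 models. As a rough, hypothetical illustration of how those ranges could be applied, here is a minimal sketch in Python; the helper name and the clamping policy are assumptions for illustration, not Letta's get_thinking_budget:

    # Hypothetical helper: clamp a requested thinking budget into the documented range.
    # Ranges follow https://ai.google.dev/gemini-api/docs/thinking#set-budget; -1 means
    # dynamic thinking, 0 disables thinking where the model allows it.
    def pick_thinking_budget(model: str, requested: int | None) -> int:
        if "2.5" in model and "flash" in model and "lite" in model:
            low, high = 512, 24576   # 2.5 Flash Lite: does not think by default
        elif "2.5" in model and "flash" in model:
            low, high = 0, 24576     # 2.5 Flash: thinking can be disabled
        elif "2.5" in model and "pro" in model:
            low, high = 128, 32768   # 2.5 Pro: thinking cannot be disabled
        else:
            return requested if requested is not None else -1
        if requested is None or requested == -1:
            return -1                # let the model decide (dynamic thinking)
        return max(low, min(requested, high))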
letta/llm_api/llm_api_tools.py
CHANGED
@@ -7,13 +7,6 @@ import requests
 
 from letta.constants import CLI_WARNING_PREFIX
 from letta.errors import LettaConfigurationError, RateLimitExceededError
-from letta.llm_api.anthropic import (
-    anthropic_bedrock_chat_completions_request,
-    anthropic_chat_completions_process_stream,
-    anthropic_chat_completions_request,
-)
-from letta.llm_api.aws_bedrock import has_valid_aws_credentials
-from letta.llm_api.azure_openai import azure_openai_chat_completions_request
 from letta.llm_api.deepseek import build_deepseek_chat_completions_request, convert_deepseek_response_to_chatcompletion
 from letta.llm_api.helpers import add_inner_thoughts_to_functions, unpack_all_inner_thoughts_from_kwargs
 from letta.llm_api.openai import (
@@ -30,14 +23,14 @@ from letta.otel.tracing import log_event, trace_method
 from letta.schemas.enums import ProviderCategory
 from letta.schemas.llm_config import LLMConfig
 from letta.schemas.message import Message
-from letta.schemas.openai.chat_completion_request import ChatCompletionRequest
+from letta.schemas.openai.chat_completion_request import ChatCompletionRequest
 from letta.schemas.openai.chat_completion_response import ChatCompletionResponse
 from letta.schemas.provider_trace import ProviderTraceCreate
 from letta.services.telemetry_manager import TelemetryManager
 from letta.settings import ModelSettings
 from letta.streaming_interface import AgentChunkStreamingInterface, AgentRefreshStreamingInterface
 
-LLM_API_PROVIDER_OPTIONS = ["openai", "azure", "anthropic", "google_ai", "
+LLM_API_PROVIDER_OPTIONS = ["openai", "azure", "anthropic", "google_ai", "local", "groq", "deepseek"]
 
 
 def retry_with_exponential_backoff(
@@ -312,153 +305,6 @@ def create(
 
         return response
 
-    # azure
-    elif llm_config.model_endpoint_type == "azure":
-        if stream:
-            raise NotImplementedError(f"Streaming not yet implemented for {llm_config.model_endpoint_type}")
-
-        if model_settings.azure_api_key is None:
-            raise LettaConfigurationError(
-                message="Azure API key is missing. Did you set AZURE_API_KEY in your env?", missing_fields=["azure_api_key"]
-            )
-
-        if model_settings.azure_base_url is None:
-            raise LettaConfigurationError(
-                message="Azure base url is missing. Did you set AZURE_BASE_URL in your env?", missing_fields=["azure_base_url"]
-            )
-
-        if model_settings.azure_api_version is None:
-            raise LettaConfigurationError(
-                message="Azure API version is missing. Did you set AZURE_API_VERSION in your env?", missing_fields=["azure_api_version"]
-            )
-
-        # Set the llm config model_endpoint from model_settings
-        # For Azure, this model_endpoint is required to be configured via env variable, so users don't need to provide it in the LLM config
-        llm_config.model_endpoint = model_settings.azure_base_url
-        chat_completion_request = build_openai_chat_completions_request(
-            llm_config, messages, user_id, functions, function_call, use_tool_naming
-        )
-
-        response = azure_openai_chat_completions_request(
-            model_settings=model_settings,
-            llm_config=llm_config,
-            chat_completion_request=chat_completion_request,
-        )
-
-        if llm_config.put_inner_thoughts_in_kwargs:
-            response = unpack_all_inner_thoughts_from_kwargs(response=response, inner_thoughts_key=INNER_THOUGHTS_KWARG)
-
-        return response
-
-    elif llm_config.model_endpoint_type == "anthropic":
-        if not use_tool_naming:
-            raise NotImplementedError("Only tool calling supported on Anthropic API requests")
-
-        if llm_config.enable_reasoner:
-            llm_config.put_inner_thoughts_in_kwargs = False
-
-        # Force tool calling
-        tool_call = None
-        if functions is None:
-            # Special case for summarization path
-            tools = None
-            tool_choice = None
-        elif force_tool_call is not None:
-            # tool_call = {"type": "function", "function": {"name": force_tool_call}}
-            tool_choice = {"type": "tool", "name": force_tool_call}
-            tools = [{"type": "function", "function": f} for f in functions if f["name"] == force_tool_call]
-            assert functions is not None
-
-            # need to have this setting to be able to put inner thoughts in kwargs
-            llm_config.put_inner_thoughts_in_kwargs = True
-        else:
-            if llm_config.put_inner_thoughts_in_kwargs:
-                # tool_choice_type other than "auto" only plays nice if thinking goes inside the tool calls
-                tool_choice = {"type": "any", "disable_parallel_tool_use": True}
-            else:
-                tool_choice = {"type": "auto", "disable_parallel_tool_use": True}
-            tools = [{"type": "function", "function": f} for f in functions] if functions is not None else None
-
-        chat_completion_request = ChatCompletionRequest(
-            model=llm_config.model,
-            messages=[cast_message_to_subtype(m.to_openai_dict()) for m in messages],
-            tools=tools,
-            tool_choice=tool_choice,
-            max_tokens=llm_config.max_tokens,  # Note: max_tokens is required for Anthropic API
-            temperature=llm_config.temperature,
-            stream=stream,
-        )
-
-        # Handle streaming
-        if stream:  # Client requested token streaming
-            assert isinstance(stream_interface, (AgentChunkStreamingInterface, AgentRefreshStreamingInterface)), type(stream_interface)
-
-            stream_interface.inner_thoughts_in_kwargs = True
-            response = anthropic_chat_completions_process_stream(
-                chat_completion_request=chat_completion_request,
-                put_inner_thoughts_in_kwargs=llm_config.put_inner_thoughts_in_kwargs,
-                stream_interface=stream_interface,
-                extended_thinking=llm_config.enable_reasoner,
-                max_reasoning_tokens=llm_config.max_reasoning_tokens,
-                provider_name=llm_config.provider_name,
-                provider_category=llm_config.provider_category,
-                name=name,
-                user_id=user_id,
-            )
-
-        else:
-            # Client did not request token streaming (expect a blocking backend response)
-            response = anthropic_chat_completions_request(
-                data=chat_completion_request,
-                put_inner_thoughts_in_kwargs=llm_config.put_inner_thoughts_in_kwargs,
-                extended_thinking=llm_config.enable_reasoner,
-                max_reasoning_tokens=llm_config.max_reasoning_tokens,
-                provider_name=llm_config.provider_name,
-                provider_category=llm_config.provider_category,
-                user_id=user_id,
-            )
-
-        if llm_config.put_inner_thoughts_in_kwargs:
-            response = unpack_all_inner_thoughts_from_kwargs(response=response, inner_thoughts_key=INNER_THOUGHTS_KWARG)
-
-        telemetry_manager.create_provider_trace(
-            actor=actor,
-            provider_trace_create=ProviderTraceCreate(
-                request_json=chat_completion_request.model_json_schema(),
-                response_json=response.model_json_schema(),
-                step_id=step_id,
-                organization_id=actor.organization_id,
-            ),
-        )
-
-        return response
-
-    # elif llm_config.model_endpoint_type == "cohere":
-    #     if stream:
-    #         raise NotImplementedError(f"Streaming not yet implemented for {llm_config.model_endpoint_type}")
-    #     if not use_tool_naming:
-    #         raise NotImplementedError("Only tool calling supported on Cohere API requests")
-    #
-    #     if functions is not None:
-    #         tools = [{"type": "function", "function": f} for f in functions]
-    #         tools = [Tool(**t) for t in tools]
-    #     else:
-    #         tools = None
-    #
-    #     return cohere_chat_completions_request(
-    #         # url=llm_config.model_endpoint,
-    #         url="https://api.cohere.ai/v1",  # TODO
-    #         api_key=os.getenv("COHERE_API_KEY"),  # TODO remove
-    #         chat_completion_request=ChatCompletionRequest(
-    #             model="command-r-plus",  # TODO
-    #             messages=[cast_message_to_subtype(m.to_openai_dict()) for m in messages],
-    #             tools=tools,
-    #             tool_choice=function_call,
-    #             # user=str(user_id),
-    #             # NOTE: max_tokens is required for Anthropic API
-    #             # max_tokens=1024, # TODO make dynamic
-    #         ),
-    #     )
     elif llm_config.model_endpoint_type == "groq":
         if stream:
             raise NotImplementedError("Streaming not yet implemented for Groq.")
@@ -510,67 +356,6 @@ def create(
 
         return response
 
-    elif llm_config.model_endpoint_type == "together":
-        """TogetherAI endpoint that goes via /completions instead of /chat/completions"""
-
-        if stream:
-            raise NotImplementedError("Streaming not yet implemented for TogetherAI (via the /completions endpoint).")
-
-        if model_settings.together_api_key is None and (
-            llm_config.model_endpoint == "https://api.together.ai/v1/completions"
-            or llm_config.model_endpoint == "https://api.together.xyz/v1/completions"
-        ):
-            raise LettaConfigurationError(message="TogetherAI key is missing from letta config file", missing_fields=["together_api_key"])
-
-        return get_chat_completion(
-            model=llm_config.model,
-            messages=messages,
-            functions=functions,
-            functions_python=functions_python,
-            function_call=function_call,
-            context_window=llm_config.context_window,
-            endpoint=llm_config.model_endpoint,
-            endpoint_type="vllm",  # NOTE: use the vLLM path through /completions
-            wrapper=llm_config.model_wrapper,
-            user=str(user_id),
-            # hint
-            first_message=first_message,
-            # auth-related
-            auth_type="bearer_token",  # NOTE: Together expects bearer token auth
-            auth_key=model_settings.together_api_key,
-        )
-
-    elif llm_config.model_endpoint_type == "bedrock":
-        """Anthropic endpoint that goes via /embeddings instead of /chat/completions"""
-
-        if stream:
-            raise NotImplementedError("Streaming not yet implemented for Anthropic (via the /embeddings endpoint).")
-        if not use_tool_naming:
-            raise NotImplementedError("Only tool calling supported on Anthropic API requests")
-
-        if not has_valid_aws_credentials():
-            raise LettaConfigurationError(message="Invalid or missing AWS credentials. Please configure valid AWS credentials.")
-
-        tool_call = None
-        if force_tool_call is not None:
-            tool_call = {"type": "function", "function": {"name": force_tool_call}}
-            assert functions is not None
-
-        return anthropic_bedrock_chat_completions_request(
-            data=ChatCompletionRequest(
-                model=llm_config.model,
-                messages=[cast_message_to_subtype(m.to_openai_dict()) for m in messages],
-                tools=[{"type": "function", "function": f} for f in functions] if functions else None,
-                tool_choice=tool_call,
-                # user=str(user_id),
-                # NOTE: max_tokens is required for Anthropic API
-                max_tokens=llm_config.max_tokens,
-            ),
-            provider_name=llm_config.provider_name,
-            provider_category=llm_config.provider_category,
-            user_id=user_id,
-        )
-
     elif llm_config.model_endpoint_type == "deepseek":
         if model_settings.deepseek_api_key is None and llm_config.model_endpoint == "":
             # only is a problem if we are *not* using an openai proxy
letta/llm_api/llm_client.py
CHANGED
@@ -58,12 +58,26 @@ class LLMClient:
                     put_inner_thoughts_first=put_inner_thoughts_first,
                     actor=actor,
                 )
-            case ProviderType.openai | ProviderType.
+            case ProviderType.openai | ProviderType.ollama:
                 from letta.llm_api.openai_client import OpenAIClient
 
                 return OpenAIClient(
                     put_inner_thoughts_first=put_inner_thoughts_first,
                     actor=actor,
                 )
+            case ProviderType.together:
+                from letta.llm_api.together_client import TogetherClient
+
+                return TogetherClient(
+                    put_inner_thoughts_first=put_inner_thoughts_first,
+                    actor=actor,
+                )
+            case ProviderType.azure:
+                from letta.llm_api.azure_client import AzureClient
+
+                return AzureClient(
+                    put_inner_thoughts_first=put_inner_thoughts_first,
+                    actor=actor,
+                )
             case _:
                 return None
letta/llm_api/llm_client_base.py
CHANGED
@@ -1,6 +1,6 @@
 import json
 from abc import abstractmethod
-from typing import TYPE_CHECKING, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 from anthropic.types.beta.messages import BetaMessageBatch
 from openai import AsyncStream, Stream
@@ -9,6 +9,7 @@ from openai.types.chat.chat_completion_chunk import ChatCompletionChunk
 from letta.errors import LLMError
 from letta.otel.tracing import log_event, trace_method
 from letta.schemas.embedding_config import EmbeddingConfig
+from letta.schemas.enums import ProviderCategory
 from letta.schemas.llm_config import LLMConfig
 from letta.schemas.message import Message
 from letta.schemas.openai.chat_completion_response import ChatCompletionResponse
@@ -111,6 +112,9 @@ class LLMClientBase:
         agent_tools_mapping: Dict[str, List[dict]],
         agent_llm_config_mapping: Dict[str, LLMConfig],
     ) -> Union[BetaMessageBatch]:
+        """
+        Issues a batch request to the downstream model endpoint and parses response.
+        """
         raise NotImplementedError
 
     @abstractmethod
@@ -176,6 +180,9 @@ class LLMClientBase:
 
     @abstractmethod
     def is_reasoning_model(self, llm_config: LLMConfig) -> bool:
+        """
+        Returns True if the model is a native reasoning model.
+        """
        raise NotImplementedError
 
     @abstractmethod
@@ -192,6 +199,30 @@ class LLMClientBase:
         """
         return LLMError(f"Unhandled LLM error: {str(e)}")
 
+    def get_byok_overrides(self, llm_config: LLMConfig) -> Tuple[Optional[str], Optional[str], Optional[str]]:
+        """
+        Returns the override key for the given llm config.
+        """
+        api_key = None
+        if llm_config.provider_category == ProviderCategory.byok:
+            from letta.services.provider_manager import ProviderManager
+
+            api_key = ProviderManager().get_override_key(llm_config.provider_name, actor=self.actor)
+
+        return api_key, None, None
+
+    async def get_byok_overrides_async(self, llm_config: LLMConfig) -> Tuple[Optional[str], Optional[str], Optional[str]]:
+        """
+        Returns the override key for the given llm config.
+        """
+        api_key = None
+        if llm_config.provider_category == ProviderCategory.byok:
+            from letta.services.provider_manager import ProviderManager
+
+            api_key = await ProviderManager().get_override_key_async(llm_config.provider_name, actor=self.actor)
+
+        return api_key, None, None
+
     def _fix_truncated_json_response(self, response: ChatCompletionResponse) -> ChatCompletionResponse:
         """
         Fixes truncated JSON responses by ensuring the content is properly formatted.
letta/llm_api/openai.py
CHANGED
letta/llm_api/openai_client.py
CHANGED
@@ -26,7 +26,6 @@ from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG
 from letta.log import get_logger
 from letta.otel.tracing import trace_method
 from letta.schemas.embedding_config import EmbeddingConfig
-from letta.schemas.enums import ProviderCategory, ProviderType
 from letta.schemas.letta_message_content import MessageContentType
 from letta.schemas.llm_config import LLMConfig
 from letta.schemas.message import Message as PydanticMessage
@@ -54,6 +53,11 @@ def is_openai_5_model(model: str) -> bool:
     return model.startswith("gpt-5")
 
 
+def supports_verbosity_control(model: str) -> bool:
+    """Check if the model supports verbosity control, currently only GPT-5 models support this"""
+    return is_openai_5_model(model)
+
+
 def accepts_developer_role(model: str) -> bool:
     """Checks if the model accepts the 'developer' role. Note that not all reasoning models accept this role.
 
@@ -102,8 +106,6 @@ def requires_auto_tool_choice(llm_config: LLMConfig) -> bool:
     """Certain providers require the tool choice to be set to 'auto'."""
     if "nebius.com" in llm_config.model_endpoint:
         return True
-    if "together.ai" in llm_config.model_endpoint or "together.xyz" in llm_config.model_endpoint:
-        return True
     if llm_config.handle and "vllm" in llm_config.handle:
         return True
     if llm_config.compatibility_type == "mlx":
@@ -113,13 +115,7 @@ def requires_auto_tool_choice(llm_config: LLMConfig) -> bool:
 
 class OpenAIClient(LLMClientBase):
     def _prepare_client_kwargs(self, llm_config: LLMConfig) -> dict:
-        api_key =
-        if llm_config.provider_category == ProviderCategory.byok:
-            from letta.services.provider_manager import ProviderManager
-
-            api_key = ProviderManager().get_override_key(llm_config.provider_name, actor=self.actor)
-        if llm_config.model_endpoint_type == ProviderType.together:
-            api_key = model_settings.together_api_key or os.environ.get("TOGETHER_API_KEY")
+        api_key, _, _ = self.get_byok_overrides(llm_config)
 
         if not api_key:
             api_key = model_settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
@@ -130,25 +126,14 @@ class OpenAIClient(LLMClientBase):
         return kwargs
 
     def _prepare_client_kwargs_embedding(self, embedding_config: EmbeddingConfig) -> dict:
-        api_key =
-        if embedding_config.embedding_endpoint_type == ProviderType.together:
-            api_key = model_settings.together_api_key or os.environ.get("TOGETHER_API_KEY")
-
-        if not api_key:
-            api_key = model_settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
+        api_key = model_settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
         # supposedly the openai python client requires a dummy API key
         api_key = api_key or "DUMMY_API_KEY"
         kwargs = {"api_key": api_key, "base_url": embedding_config.embedding_endpoint}
         return kwargs
 
     async def _prepare_client_kwargs_async(self, llm_config: LLMConfig) -> dict:
-        api_key =
-        if llm_config.provider_category == ProviderCategory.byok:
-            from letta.services.provider_manager import ProviderManager
-
-            api_key = await ProviderManager().get_override_key_async(llm_config.provider_name, actor=self.actor)
-        if llm_config.model_endpoint_type == ProviderType.together:
-            api_key = model_settings.together_api_key or os.environ.get("TOGETHER_API_KEY")
+        api_key, _, _ = await self.get_byok_overrides_async(llm_config)
 
         if not api_key:
             api_key = model_settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
@@ -158,6 +143,9 @@ class OpenAIClient(LLMClientBase):
 
         return kwargs
 
+    def requires_auto_tool_choice(self, llm_config: LLMConfig) -> bool:
+        return requires_auto_tool_choice(llm_config)
+
     @trace_method
     def build_request_data(
         self,
@@ -204,7 +192,7 @@ class OpenAIClient(LLMClientBase):
         # TODO(matt) move into LLMConfig
         # TODO: This vllm checking is very brittle and is a patch at most
         tool_choice = None
-        if requires_auto_tool_choice(llm_config):
+        if self.requires_auto_tool_choice(llm_config):
             tool_choice = "auto"
         elif tools:
             # only set if tools is non-Null
@@ -224,6 +212,10 @@ class OpenAIClient(LLMClientBase):
             temperature=llm_config.temperature if supports_temperature_param(model) else 1.0,
         )
 
+        # Add verbosity control for GPT-5 models
+        if supports_verbosity_control(model) and llm_config.verbosity:
+            data.verbosity = llm_config.verbosity
+
         if llm_config.frequency_penalty is not None:
             data.frequency_penalty = llm_config.frequency_penalty
 
@@ -252,8 +244,8 @@ class OpenAIClient(LLMClientBase):
                     tool.function = FunctionSchema(**structured_output_version)
                 except ValueError as e:
                     logger.warning(f"Failed to convert tool function to structured output, tool={tool}, error={e}")
-
-        return
+        request_data = data.model_dump(exclude_unset=True)
+        return request_data
 
     @trace_method
     def request(self, request_data: dict, llm_config: LLMConfig) -> dict:
@@ -261,7 +253,6 @@ class OpenAIClient(LLMClientBase):
         Performs underlying synchronous request to OpenAI API and returns raw response dict.
         """
         client = OpenAI(**self._prepare_client_kwargs(llm_config))
-
         response: ChatCompletion = client.chat.completions.create(**request_data)
         return response.model_dump()
 
@@ -272,7 +263,6 @@ class OpenAIClient(LLMClientBase):
         """
         kwargs = await self._prepare_client_kwargs_async(llm_config)
         client = AsyncOpenAI(**kwargs)
-
         response: ChatCompletion = await client.chat.completions.create(**request_data)
         return response.model_dump()
 
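build_request_data now attaches a verbosity field only for GPT-5 models. A standalone sketch of that gate; apply_verbosity and the example values are illustrative and not part of the package:

    def is_openai_5_model(model: str) -> bool:
        return model.startswith("gpt-5")

    def supports_verbosity_control(model: str) -> bool:
        # Currently only GPT-5 models accept a verbosity setting.
        return is_openai_5_model(model)

    def apply_verbosity(request_payload: dict, model: str, verbosity: str | None) -> dict:
        # Mirrors the diff above: only set the field when the model supports it and a value is configured.
        if supports_verbosity_control(model) and verbosity:
            request_payload["verbosity"] = verbosity
        return request_payload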
letta/llm_api/together_client.py
ADDED
@@ -0,0 +1,55 @@
+import os
+from typing import List
+
+from openai import AsyncOpenAI, OpenAI
+from openai.types.chat.chat_completion import ChatCompletion
+
+from letta.llm_api.openai_client import OpenAIClient
+from letta.otel.tracing import trace_method
+from letta.schemas.embedding_config import EmbeddingConfig
+from letta.schemas.llm_config import LLMConfig
+from letta.settings import model_settings
+
+
+class TogetherClient(OpenAIClient):
+
+    def requires_auto_tool_choice(self, llm_config: LLMConfig) -> bool:
+        return True
+
+    @trace_method
+    def request(self, request_data: dict, llm_config: LLMConfig) -> dict:
+        """
+        Performs underlying synchronous request to OpenAI API and returns raw response dict.
+        """
+        api_key, _, _ = self.get_byok_overrides(llm_config)
+
+        if not api_key:
+            api_key = model_settings.together_api_key or os.environ.get("TOGETHER_API_KEY")
+        client = OpenAI(api_key=api_key, base_url=llm_config.model_endpoint)
+
+        response: ChatCompletion = client.chat.completions.create(**request_data)
+        return response.model_dump()
+
+    @trace_method
+    async def request_async(self, request_data: dict, llm_config: LLMConfig) -> dict:
+        """
+        Performs underlying asynchronous request to OpenAI API and returns raw response dict.
+        """
+        api_key, _, _ = await self.get_byok_overrides_async(llm_config)
+
+        if not api_key:
+            api_key = model_settings.together_api_key or os.environ.get("TOGETHER_API_KEY")
+        client = AsyncOpenAI(api_key=api_key, base_url=llm_config.model_endpoint)
+
+        response: ChatCompletion = await client.chat.completions.create(**request_data)
+        return response.model_dump()
+
+    @trace_method
+    async def request_embeddings(self, inputs: List[str], embedding_config: EmbeddingConfig) -> List[List[float]]:
+        """Request embeddings given texts and embedding config"""
+        api_key = model_settings.together_api_key or os.environ.get("TOGETHER_API_KEY")
+        client = AsyncOpenAI(api_key=api_key, base_url=embedding_config.embedding_endpoint)
+        response = await client.embeddings.create(model=embedding_config.embedding_model, input=inputs)
+
+        # TODO: add total usage
+        return [r.embedding for r in response.data]
letta/orm/provider.py
CHANGED
@@ -31,6 +31,7 @@ class Provider(SqlalchemyBase, OrganizationMixin):
     base_url: Mapped[str] = mapped_column(nullable=True, doc="Base URL for the provider.")
     access_key: Mapped[str] = mapped_column(nullable=True, doc="Access key used for requests to the provider.")
     region: Mapped[str] = mapped_column(nullable=True, doc="Region used for requests to the provider.")
+    api_version: Mapped[str] = mapped_column(nullable=True, doc="API version used for requests to the provider.")
 
     # relationships
     organization: Mapped["Organization"] = relationship("Organization", back_populates="providers")
letta/orm/step_metrics.py
CHANGED
@@ -1,11 +1,15 @@
+from datetime import datetime, timezone
 from typing import TYPE_CHECKING, Optional
 
 from sqlalchemy import BigInteger, ForeignKey, String
-from sqlalchemy.
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.orm import Mapped, Session, mapped_column, relationship
 
 from letta.orm.mixins import AgentMixin, ProjectMixin
 from letta.orm.sqlalchemy_base import SqlalchemyBase
 from letta.schemas.step_metrics import StepMetrics as PydanticStepMetrics
+from letta.schemas.user import User
+from letta.settings import DatabaseChoice, settings
 
 if TYPE_CHECKING:
     from letta.orm.agent import Agent
@@ -69,3 +73,38 @@ class StepMetrics(SqlalchemyBase, ProjectMixin, AgentMixin):
     step: Mapped["Step"] = relationship("Step", back_populates="metrics", uselist=False)
     job: Mapped[Optional["Job"]] = relationship("Job")
     agent: Mapped[Optional["Agent"]] = relationship("Agent")
+
+    def create(
+        self,
+        db_session: Session,
+        actor: Optional[User] = None,
+        no_commit: bool = False,
+    ) -> "StepMetrics":
+        """Override create to handle SQLite timestamp issues"""
+        # For SQLite, explicitly set timestamps as server_default may not work
+        if settings.database_engine == DatabaseChoice.SQLITE:
+            now = datetime.now(timezone.utc)
+            if not self.created_at:
+                self.created_at = now
+            if not self.updated_at:
+                self.updated_at = now
+
+        return super().create(db_session, actor=actor, no_commit=no_commit)
+
+    async def create_async(
+        self,
+        db_session: AsyncSession,
+        actor: Optional[User] = None,
+        no_commit: bool = False,
+        no_refresh: bool = False,
+    ) -> "StepMetrics":
+        """Override create_async to handle SQLite timestamp issues"""
+        # For SQLite, explicitly set timestamps as server_default may not work
+        if settings.database_engine == DatabaseChoice.SQLITE:
+            now = datetime.now(timezone.utc)
+            if not self.created_at:
+                self.created_at = now
+            if not self.updated_at:
+                self.updated_at = now
+
+        return await super().create_async(db_session, actor=actor, no_commit=no_commit, no_refresh=no_refresh)
letta/otel/db_pool_monitoring.py
CHANGED
@@ -252,7 +252,7 @@ class DatabasePoolMonitor:
             logger.info(f"Failed to record detach event metric: {e}")
 
         @event.listens_for(pool, "reset")
-        def on_reset(dbapi_connection: DBAPIConnection, connection_record: ConnectionPoolEntry):
+        def on_reset(dbapi_connection: DBAPIConnection, connection_record: ConnectionPoolEntry, reset_state):
             """Called when a connection is reset."""
             try:
                 from letta.otel.metric_registry import MetricRegistry
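The on_reset listener now takes a third reset_state argument, matching the three-argument "reset" pool event signature used by SQLAlchemy 2.0. A minimal standalone example of registering such a listener; the engine URL and print output are illustrative:

    from sqlalchemy import create_engine, event

    engine = create_engine("sqlite://")

    @event.listens_for(engine.pool, "reset")
    def on_reset(dbapi_connection, connection_record, reset_state):
        # reset_state is a PoolResetState describing why the reset happened
        # (e.g. normal return to the pool vs. termination).
        print("connection reset; terminate_only =", reset_state.terminate_only)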
|