letta-nightly 0.6.50.dev20250411104155__py3-none-any.whl → 0.6.52.dev20250412051016__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- letta/__init__.py +1 -1
- letta/agent.py +23 -32
- letta/agents/base_agent.py +17 -6
- letta/agents/ephemeral_agent.py +5 -6
- letta/agents/ephemeral_memory_agent.py +8 -10
- letta/agents/helpers.py +6 -6
- letta/agents/letta_agent.py +9 -10
- letta/agents/letta_agent_batch.py +164 -0
- letta/agents/voice_agent.py +8 -8
- letta/functions/function_sets/base.py +1 -1
- letta/helpers/converters.py +5 -2
- letta/helpers/tool_rule_solver.py +12 -2
- letta/jobs/scheduler.py +13 -11
- letta/llm_api/anthropic.py +0 -1
- letta/llm_api/anthropic_client.py +61 -23
- letta/llm_api/cohere.py +1 -1
- letta/llm_api/google_ai_client.py +48 -13
- letta/llm_api/google_vertex_client.py +19 -1
- letta/llm_api/llm_client_base.py +13 -5
- letta/llm_api/openai.py +4 -3
- letta/llm_api/openai_client.py +18 -10
- letta/orm/organization.py +4 -2
- letta/orm/sqlalchemy_base.py +3 -0
- letta/schemas/enums.py +1 -0
- letta/schemas/group.py +30 -1
- letta/schemas/identity.py +10 -0
- letta/schemas/letta_request.py +4 -0
- letta/schemas/letta_response.py +9 -1
- letta/schemas/llm_config.py +10 -0
- letta/schemas/message.py +21 -12
- letta/schemas/openai/chat_completion_request.py +1 -0
- letta/schemas/tool_rule.py +14 -1
- letta/server/rest_api/interface.py +5 -4
- letta/server/rest_api/routers/v1/agents.py +20 -13
- letta/server/rest_api/routers/v1/groups.py +1 -1
- letta/server/rest_api/routers/v1/identities.py +23 -2
- letta/server/rest_api/utils.py +20 -22
- letta/server/server.py +34 -21
- letta/services/agent_manager.py +13 -9
- letta/services/block_manager.py +2 -4
- letta/services/identity_manager.py +21 -5
- letta/services/llm_batch_manager.py +21 -1
- letta/services/summarizer/summarizer.py +11 -4
- letta/services/tool_manager.py +1 -1
- letta/settings.py +1 -0
- letta/utils.py +2 -2
- {letta_nightly-0.6.50.dev20250411104155.dist-info → letta_nightly-0.6.52.dev20250412051016.dist-info}/METADATA +3 -3
- {letta_nightly-0.6.50.dev20250411104155.dist-info → letta_nightly-0.6.52.dev20250412051016.dist-info}/RECORD +51 -50
- {letta_nightly-0.6.50.dev20250411104155.dist-info → letta_nightly-0.6.52.dev20250412051016.dist-info}/LICENSE +0 -0
- {letta_nightly-0.6.50.dev20250411104155.dist-info → letta_nightly-0.6.52.dev20250412051016.dist-info}/WHEEL +0 -0
- {letta_nightly-0.6.50.dev20250411104155.dist-info → letta_nightly-0.6.52.dev20250412051016.dist-info}/entry_points.txt +0 -0
letta/helpers/converters.py
CHANGED
@@ -28,6 +28,7 @@ from letta.schemas.tool_rule import (
     ContinueToolRule,
     InitToolRule,
     MaxCountPerStepToolRule,
+    ParentToolRule,
     TerminalToolRule,
     ToolRule,
 )
@@ -89,7 +90,7 @@ def serialize_tool_rules(tool_rules: Optional[List[ToolRule]]) -> List[Dict[str,
     return data


-def deserialize_tool_rules(data: Optional[List[Dict]]) -> List[
+def deserialize_tool_rules(data: Optional[List[Dict]]) -> List[ToolRule]:
     """Convert a list of dictionaries back into ToolRule objects."""
     if not data:
         return []
@@ -99,7 +100,7 @@ def deserialize_tool_rules(data: Optional[List[Dict]]) -> List[Union[ChildToolRu


 def deserialize_tool_rule(
     data: Dict,
-) ->
+) -> ToolRule:
     """Deserialize a dictionary to the appropriate ToolRule subclass based on 'type'."""
     rule_type = ToolRuleType(data.get("type"))

@@ -118,6 +119,8 @@ def deserialize_tool_rule(
         return ContinueToolRule(**data)
     elif rule_type == ToolRuleType.max_count_per_step:
         return MaxCountPerStepToolRule(**data)
+    elif rule_type == ToolRuleType.parent_last_tool:
+        return ParentToolRule(**data)
     raise ValueError(f"Unknown ToolRule type: {rule_type}")


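
For context, a minimal round-trip sketch of the new rule type through these helpers. It assumes ParentToolRule carries tool_name and children fields like the other child-based rules (its schema lives in letta/schemas/tool_rule.py and is not shown in this diff):

from letta.helpers.converters import deserialize_tool_rules, serialize_tool_rules
from letta.schemas.tool_rule import ParentToolRule

# Hypothetical rule: "execute_task" only becomes available after "plan_task" is called.
rules = [ParentToolRule(tool_name="plan_task", children=["execute_task"])]
payload = serialize_tool_rules(rules)       # list of dicts, each tagged with a "type" field
restored = deserialize_tool_rules(payload)  # dispatches on ToolRuleType.parent_last_tool
assert isinstance(restored[0], ParentToolRule)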
letta/helpers/tool_rule_solver.py
CHANGED
@@ -10,6 +10,7 @@ from letta.schemas.tool_rule import (
     ContinueToolRule,
     InitToolRule,
     MaxCountPerStepToolRule,
+    ParentToolRule,
     TerminalToolRule,
 )

@@ -33,6 +34,9 @@ class ToolRulesSolver(BaseModel):
     child_based_tool_rules: List[Union[ChildToolRule, ConditionalToolRule, MaxCountPerStepToolRule]] = Field(
         default_factory=list, description="Standard tool rules for controlling execution sequence and allowed transitions."
     )
+    parent_tool_rules: List[ParentToolRule] = Field(
+        default_factory=list, description="Filter tool rules to be used to filter out tools from the available set."
+    )
     terminal_tool_rules: List[TerminalToolRule] = Field(
         default_factory=list, description="Terminal tool rules that end the agent loop if called."
     )
@@ -44,6 +48,7 @@ class ToolRulesSolver(BaseModel):
         init_tool_rules: Optional[List[InitToolRule]] = None,
         continue_tool_rules: Optional[List[ContinueToolRule]] = None,
         child_based_tool_rules: Optional[List[Union[ChildToolRule, ConditionalToolRule, MaxCountPerStepToolRule]]] = None,
+        parent_tool_rules: Optional[List[ParentToolRule]] = None,
         terminal_tool_rules: Optional[List[TerminalToolRule]] = None,
         tool_call_history: Optional[List[str]] = None,
         **kwargs,
@@ -52,6 +57,7 @@ class ToolRulesSolver(BaseModel):
             init_tool_rules=init_tool_rules or [],
             continue_tool_rules=continue_tool_rules or [],
             child_based_tool_rules=child_based_tool_rules or [],
+            parent_tool_rules=parent_tool_rules or [],
             terminal_tool_rules=terminal_tool_rules or [],
             tool_call_history=tool_call_history or [],
             **kwargs,
@@ -78,6 +84,9 @@ class ToolRulesSolver(BaseModel):
             elif rule.type == ToolRuleType.max_count_per_step:
                 assert isinstance(rule, MaxCountPerStepToolRule)
                 self.child_based_tool_rules.append(rule)
+            elif rule.type == ToolRuleType.parent_last_tool:
+                assert isinstance(rule, ParentToolRule)
+                self.parent_tool_rules.append(rule)

     def register_tool_call(self, tool_name: str):
         """Update the internal state to track tool call history."""
@@ -102,13 +111,14 @@ class ToolRulesSolver(BaseModel):
                 # If there are init tool rules, only return those defined in the init tool rules
                 return [rule.tool_name for rule in self.init_tool_rules]
             else:
-                # Otherwise, return all
+                # Otherwise, return all tools besides those constrained by parent tool rules
+                available_tools = available_tools - set.union(set(), *(set(rule.children) for rule in self.parent_tool_rules))
                 return list(available_tools)
         else:
             # Collect valid tools from all child-based rules
             valid_tool_sets = [
                 rule.get_valid_tools(self.tool_call_history, available_tools, last_function_response)
-                for rule in self.child_based_tool_rules
+                for rule in self.child_based_tool_rules + self.parent_tool_rules
             ]

             # Compute intersection of all valid tool sets
letta/jobs/scheduler.py
CHANGED
@@ -12,17 +12,19 @@ scheduler = AsyncIOScheduler()

 def start_cron_jobs(server: SyncServer):
     """Initialize cron jobs"""
-
-
-
-
-
-
-
-
-
-
+    if settings.enable_batch_job_polling:
+        scheduler.add_job(
+            poll_running_llm_batches,
+            args=[server],
+            trigger=IntervalTrigger(seconds=settings.poll_running_llm_batches_interval_seconds),
+            next_run_time=datetime.datetime.now(datetime.timezone.utc),
+            id="poll_llm_batches",
+            name="Poll LLM API batch jobs and update status",
+            replace_existing=True,
+        )
+        scheduler.start()


 def shutdown_cron_scheduler():
-
+    if settings.enable_batch_job_polling:
+        scheduler.shutdown()
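
For reference, a standalone sketch of the polling pattern this enables, using apscheduler directly; the flag and interval here stand in for the new settings.enable_batch_job_polling and settings.poll_running_llm_batches_interval_seconds values, and the coroutine is a placeholder for poll_running_llm_batches(server):

import asyncio
import datetime

from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.interval import IntervalTrigger

ENABLE_BATCH_JOB_POLLING = True  # stand-in for settings.enable_batch_job_polling
POLL_INTERVAL_SECONDS = 300      # stand-in for settings.poll_running_llm_batches_interval_seconds


async def poll_running_llm_batches_stub():
    # Placeholder for the real poll coroutine that checks batch job status.
    print("polling LLM batch jobs...")


async def main():
    scheduler = AsyncIOScheduler()
    if ENABLE_BATCH_JOB_POLLING:
        scheduler.add_job(
            poll_running_llm_batches_stub,
            trigger=IntervalTrigger(seconds=POLL_INTERVAL_SECONDS),
            next_run_time=datetime.datetime.now(datetime.timezone.utc),  # fire once immediately
            id="poll_llm_batches",
            replace_existing=True,
        )
        scheduler.start()
    await asyncio.sleep(1)  # let the first run fire, then shut down
    if ENABLE_BATCH_JOB_POLLING:
        scheduler.shutdown()


asyncio.run(main())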
letta/llm_api/anthropic_client.py
CHANGED
@@ -27,6 +27,7 @@ from letta.llm_api.helpers import add_inner_thoughts_to_functions, unpack_all_in
 from letta.llm_api.llm_client_base import LLMClientBase
 from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION
 from letta.log import get_logger
+from letta.schemas.llm_config import LLMConfig
 from letta.schemas.message import Message as PydanticMessage
 from letta.schemas.openai.chat_completion_request import Tool
 from letta.schemas.openai.chat_completion_response import ChatCompletionResponse, Choice, FunctionCall
@@ -59,25 +60,55 @@ class AnthropicClient(LLMClientBase):
         return await client.beta.messages.create(**request_data, betas=["tools-2024-04-04"])

     @trace_method
-    async def
+    async def send_llm_batch_request_async(
+        self,
+        agent_messages_mapping: Dict[str, List[PydanticMessage]],
+        agent_tools_mapping: Dict[str, List[dict]],
+        agent_llm_config_mapping: Dict[str, LLMConfig],
+    ) -> BetaMessageBatch:
         """
-
+        Sends a batch request to the Anthropic API using the provided agent messages and tools mappings.

         Args:
-
+            agent_messages_mapping: A dict mapping agent_id to their list of PydanticMessages.
+            agent_tools_mapping: A dict mapping agent_id to their list of tool dicts.
+            agent_llm_config_mapping: A dict mapping agent_id to their LLM config

         Returns:
-
+            BetaMessageBatch: The batch response from the Anthropic API.
+
+        Raises:
+            ValueError: If the sets of agent_ids in the two mappings do not match.
+            Exception: Transformed errors from the underlying API call.
         """
-
+        # Validate that both mappings use the same set of agent_ids.
+        if set(agent_messages_mapping.keys()) != set(agent_tools_mapping.keys()):
+            raise ValueError("Agent mappings for messages and tools must use the same agent_ids.")
+
+        try:
+            requests = {
+                agent_id: self.build_request_data(
+                    messages=agent_messages_mapping[agent_id],
+                    llm_config=agent_llm_config_mapping[agent_id],
+                    tools=agent_tools_mapping[agent_id],
+                )
+                for agent_id in agent_messages_mapping
+            }

-
-
-
+            client = self._get_anthropic_client(async_client=True)
+
+            anthropic_requests = [
+                Request(custom_id=agent_id, params=MessageCreateParamsNonStreaming(**params)) for agent_id, params in requests.items()
+            ]
+
+            batch_response = await client.beta.messages.batches.create(requests=anthropic_requests)

-
+            return batch_response

-
+        except Exception as e:
+            # Enhance logging here if additional context is needed
+            logger.error("Error during send_llm_batch_request_async.", exc_info=True)
+            raise self.handle_llm_error(e)

     @trace_method
     def _get_anthropic_client(self, async_client: bool = False) -> Union[anthropic.AsyncAnthropic, anthropic.Anthropic]:
@@ -90,6 +121,7 @@ class AnthropicClient(LLMClientBase):
     def build_request_data(
         self,
         messages: List[PydanticMessage],
+        llm_config: LLMConfig,
         tools: Optional[List[dict]] = None,
         force_tool_call: Optional[str] = None,
     ) -> dict:
@@ -99,20 +131,20 @@ class AnthropicClient(LLMClientBase):
         if not self.use_tool_naming:
             raise NotImplementedError("Only tool calling supported on Anthropic API requests")

-        if not
+        if not llm_config.max_tokens:
             raise ValueError("Max tokens must be set for anthropic")

         data = {
-            "model":
-            "max_tokens":
-            "temperature":
+            "model": llm_config.model,
+            "max_tokens": llm_config.max_tokens,
+            "temperature": llm_config.temperature,
         }

         # Extended Thinking
-        if
+        if llm_config.enable_reasoner:
             data["thinking"] = {
                 "type": "enabled",
-                "budget_tokens":
+                "budget_tokens": llm_config.max_reasoning_tokens,
             }
             # `temperature` may only be set to 1 when thinking is enabled. Please consult our documentation at https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#important-considerations-when-using-extended-thinking'
             data["temperature"] = 1.0
@@ -132,13 +164,13 @@ class AnthropicClient(LLMClientBase):
             tools_for_request = [Tool(function=f) for f in tools if f["name"] == force_tool_call]

             # need to have this setting to be able to put inner thoughts in kwargs
-            if not
+            if not llm_config.put_inner_thoughts_in_kwargs:
                 logger.warning(
                     f"Force setting put_inner_thoughts_in_kwargs to True for Claude because there is a forced tool call: {force_tool_call}"
                 )
-
+                llm_config.put_inner_thoughts_in_kwargs = True
         else:
-            if
+            if llm_config.put_inner_thoughts_in_kwargs:
                 # tool_choice_type other than "auto" only plays nice if thinking goes inside the tool calls
                 tool_choice = {"type": "any", "disable_parallel_tool_use": True}
             else:
@@ -151,7 +183,7 @@ class AnthropicClient(LLMClientBase):

         # Add inner thoughts kwarg
         # TODO: Can probably make this more efficient
-        if tools_for_request and len(tools_for_request) > 0 and
+        if tools_for_request and len(tools_for_request) > 0 and llm_config.put_inner_thoughts_in_kwargs:
             tools_with_inner_thoughts = add_inner_thoughts_to_functions(
                 functions=[t.function.model_dump() for t in tools_for_request],
                 inner_thoughts_key=INNER_THOUGHTS_KWARG,
@@ -173,7 +205,7 @@ class AnthropicClient(LLMClientBase):
         data["messages"] = [
             m.to_anthropic_dict(
                 inner_thoughts_xml_tag=inner_thoughts_xml_tag,
-                put_inner_thoughts_in_kwargs=bool(
+                put_inner_thoughts_in_kwargs=bool(llm_config.put_inner_thoughts_in_kwargs),
             )
             for m in messages[1:]
         ]
@@ -189,7 +221,7 @@ class AnthropicClient(LLMClientBase):
         # https://docs.anthropic.com/en/api/messages#body-messages
         # NOTE: cannot prefill with tools for opus:
         # Your API request included an `assistant` message in the final position, which would pre-fill the `assistant` response. When using tools with "claude-3-opus-20240229"
-        if prefix_fill and not
+        if prefix_fill and not llm_config.put_inner_thoughts_in_kwargs and "opus" not in data["model"]:
             data["messages"].append(
                 # Start the thinking process for the assistant
                 {"role": "assistant", "content": f"<{inner_thoughts_xml_tag}>"},
@@ -323,13 +355,19 @@ class AnthropicClient(LLMClientBase):
                 if content_part.type == "text":
                     content = strip_xml_tags(string=content_part.text, tag="thinking")
                 if content_part.type == "tool_use":
+                    # hack for tool rules
+                    input = json.loads(json.dumps(content_part.input))
+                    if "id" in input and input["id"].startswith("toolu_") and "function" in input:
+                        arguments = str(input["function"]["arguments"])
+                    else:
+                        arguments = json.dumps(content_part.input, indent=2)
                     tool_calls = [
                         ToolCall(
                             id=content_part.id,
                             type="function",
                             function=FunctionCall(
                                 name=content_part.name,
-                                arguments=
+                                arguments=arguments,
                             ),
                         )
                     ]
letta/llm_api/cohere.py
CHANGED
@@ -315,7 +315,7 @@ def cohere_chat_completions_request(
     data.pop("tool_choice", None)  # extra safe, should exist always (default="auto")

     # Convert messages to Cohere format
-    msg_objs = [Message.dict_to_message(
+    msg_objs = [Message.dict_to_message(agent_id=uuid.uuid4(), openai_message_dict=m) for m in data["messages"]]

     # System message 0 should instead be a "preamble"
     # See: https://docs.cohere.com/reference/chat
letta/llm_api/google_ai_client.py
CHANGED
@@ -1,3 +1,4 @@
+import json
 import uuid
 from typing import List, Optional, Tuple

@@ -11,12 +12,16 @@ from letta.llm_api.helpers import make_post_request
 from letta.llm_api.llm_client_base import LLMClientBase
 from letta.local_llm.json_parser import clean_json_string_extra_backslash
 from letta.local_llm.utils import count_tokens
+from letta.log import get_logger
+from letta.schemas.llm_config import LLMConfig
 from letta.schemas.message import Message as PydanticMessage
 from letta.schemas.openai.chat_completion_request import Tool
 from letta.schemas.openai.chat_completion_response import ChatCompletionResponse, Choice, FunctionCall, Message, ToolCall, UsageStatistics
 from letta.settings import model_settings
 from letta.utils import get_tool_call_id

+logger = get_logger(__name__)
+

 class GoogleAIClient(LLMClientBase):

@@ -24,6 +29,8 @@ class GoogleAIClient(LLMClientBase):
         """
         Performs underlying request to llm and returns raw response.
         """
+        # print("[google_ai request]", json.dumps(request_data, indent=2))
+
         url, headers = get_gemini_endpoint_and_headers(
             base_url=str(self.llm_config.model_endpoint),
             model=self.llm_config.model,
@@ -36,6 +43,7 @@ class GoogleAIClient(LLMClientBase):
     def build_request_data(
         self,
         messages: List[PydanticMessage],
+        llm_config: LLMConfig,
         tools: List[dict],
         force_tool_call: Optional[str] = None,
     ) -> dict:
@@ -44,9 +52,10 @@ class GoogleAIClient(LLMClientBase):
         """
         if tools:
             tools = [{"type": "function", "function": f} for f in tools]
-
-
-
+            tool_objs = [Tool(**t) for t in tools]
+            tool_names = [t.function.name for t in tool_objs]
+            # Convert to the exact payload style Google expects
+            tools = self.convert_tools_to_google_ai_format(tool_objs)
         contents = self.add_dummy_model_messages(
             [m.to_google_ai_dict() for m in messages],
         )
@@ -55,8 +64,8 @@ class GoogleAIClient(LLMClientBase):
             "contents": contents,
             "tools": tools,
             "generation_config": {
-                "temperature":
-                "max_output_tokens":
+                "temperature": llm_config.temperature,
+                "max_output_tokens": llm_config.max_tokens,
             },
         }

@@ -65,6 +74,8 @@ class GoogleAIClient(LLMClientBase):
                 function_calling_config=FunctionCallingConfig(
                     # ANY mode forces the model to predict only function calls
                     mode=FunctionCallingConfigMode.ANY,
+                    # Provide the list of tools (though empty should also work, it seems not to)
+                    allowed_function_names=tool_names,
                 )
             )
             request_data["tool_config"] = tool_config.model_dump()
@@ -99,6 +110,8 @@ class GoogleAIClient(LLMClientBase):
             }
         }
         """
+        # print("[google_ai response]", json.dumps(response_data, indent=2))
+
         try:
             choices = []
             index = 0
@@ -109,6 +122,17 @@ class GoogleAIClient(LLMClientBase):
                 assert role == "model", f"Unknown role in response: {role}"

                 parts = content["parts"]
+
+                # NOTE: we aren't properly supported multi-parts here anyways (we're just appending choices),
+                # so let's disable it for now
+
+                # NOTE(Apr 9, 2025): there's a very strange bug on 2.5 where the response has a part with broken text
+                # {'candidates': [{'content': {'parts': [{'functionCall': {'name': 'send_message', 'args': {'request_heartbeat': False, 'message': 'Hello! How can I make your day better?', 'inner_thoughts': 'User has initiated contact. Sending a greeting.'}}}], 'role': 'model'}, 'finishReason': 'STOP', 'avgLogprobs': -0.25891534213362066}], 'usageMetadata': {'promptTokenCount': 2493, 'candidatesTokenCount': 29, 'totalTokenCount': 2522, 'promptTokensDetails': [{'modality': 'TEXT', 'tokenCount': 2493}], 'candidatesTokensDetails': [{'modality': 'TEXT', 'tokenCount': 29}]}, 'modelVersion': 'gemini-1.5-pro-002'}
+                # To patch this, if we have multiple parts we can take the last one
+                if len(parts) > 1:
+                    logger.warning(f"Unexpected multiple parts in response from Google AI: {parts}")
+                    parts = [parts[-1]]
+
                 # TODO support parts / multimodal
                 # TODO support parallel tool calling natively
                 # TODO Alternative here is to throw away everything else except for the first part
@@ -199,10 +223,22 @@ class GoogleAIClient(LLMClientBase):
             # "totalTokenCount": 36
             # }
             if "usageMetadata" in response_data:
+                usage_data = response_data["usageMetadata"]
+                if "promptTokenCount" not in usage_data:
+                    raise ValueError(f"promptTokenCount not found in usageMetadata:\n{json.dumps(usage_data, indent=2)}")
+                if "totalTokenCount" not in usage_data:
+                    raise ValueError(f"totalTokenCount not found in usageMetadata:\n{json.dumps(usage_data, indent=2)}")
+                if "candidatesTokenCount" not in usage_data:
+                    raise ValueError(f"candidatesTokenCount not found in usageMetadata:\n{json.dumps(usage_data, indent=2)}")
+
+                prompt_tokens = usage_data["promptTokenCount"]
+                completion_tokens = usage_data["candidatesTokenCount"]
+                total_tokens = usage_data["totalTokenCount"]
+
                 usage = UsageStatistics(
-                    prompt_tokens=
-                    completion_tokens=
-                    total_tokens=
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=total_tokens,
                 )
             else:
                 # Count it ourselves
@@ -282,17 +318,16 @@ class GoogleAIClient(LLMClientBase):
             for t in tools
         ]

-        #
+        # Add inner thoughts if needed
         for func in function_list:
-
-
-            param_fields["type"] = param_fields["type"].upper()
+            # Note: Google AI API used to have weird casing requirements, but not any more
+
             # Add inner thoughts
             if self.llm_config.put_inner_thoughts_in_kwargs:
                 from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION

                 func["parameters"]["properties"][INNER_THOUGHTS_KWARG] = {
-                    "type": "
+                    "type": "string",
                     "description": INNER_THOUGHTS_KWARG_DESCRIPTION,
                 }
                 func["parameters"]["required"].append(INNER_THOUGHTS_KWARG)
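
The resulting tool_config fragment looks roughly like the following (a sketch of the serialized payload with hypothetical tool names; with ANY mode the model is forced to call a function, and the diff notes that an empty allow-list does not behave as expected):

tool_names = ["send_message", "archival_memory_search"]  # hypothetical tools

request_fragment = {
    "tool_config": {
        "function_calling_config": {
            "mode": "ANY",                        # force a function call
            "allowed_function_names": tool_names,
        }
    }
}
print(request_fragment)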
letta/llm_api/google_vertex_client.py
CHANGED
@@ -9,6 +9,7 @@ from letta.helpers.json_helpers import json_dumps
 from letta.llm_api.google_ai_client import GoogleAIClient
 from letta.local_llm.json_parser import clean_json_string_extra_backslash
 from letta.local_llm.utils import count_tokens
+from letta.schemas.llm_config import LLMConfig
 from letta.schemas.message import Message as PydanticMessage
 from letta.schemas.openai.chat_completion_response import ChatCompletionResponse, Choice, FunctionCall, Message, ToolCall, UsageStatistics
 from letta.settings import model_settings
@@ -37,20 +38,24 @@ class GoogleVertexClient(GoogleAIClient):
     def build_request_data(
         self,
         messages: List[PydanticMessage],
+        llm_config: LLMConfig,
         tools: List[dict],
         force_tool_call: Optional[str] = None,
     ) -> dict:
         """
         Constructs a request object in the expected data format for this client.
         """
-        request_data = super().build_request_data(messages, tools, force_tool_call)
+        request_data = super().build_request_data(messages, self.llm_config, tools, force_tool_call)
         request_data["config"] = request_data.pop("generation_config")
         request_data["config"]["tools"] = request_data.pop("tools")

+        tool_names = [t["name"] for t in tools]
         tool_config = ToolConfig(
             function_calling_config=FunctionCallingConfig(
                 # ANY mode forces the model to predict only function calls
                 mode=FunctionCallingConfigMode.ANY,
+                # Provide the list of tools (though empty should also work, it seems not to)
+                allowed_function_names=tool_names,
             )
         )
         request_data["config"]["tool_config"] = tool_config.model_dump()
@@ -86,6 +91,8 @@ class GoogleVertexClient(GoogleAIClient):
             }
         }
         """
+        # print(response_data)
+
         response = GenerateContentResponse(**response_data)
         try:
             choices = []
@@ -97,6 +104,17 @@ class GoogleVertexClient(GoogleAIClient):
                 assert role == "model", f"Unknown role in response: {role}"

                 parts = content.parts
+
+                # NOTE: we aren't properly supported multi-parts here anyways (we're just appending choices),
+                # so let's disable it for now
+
+                # NOTE(Apr 9, 2025): there's a very strange bug on 2.5 where the response has a part with broken text
+                # {'candidates': [{'content': {'parts': [{'functionCall': {'name': 'send_message', 'args': {'request_heartbeat': False, 'message': 'Hello! How can I make your day better?', 'inner_thoughts': 'User has initiated contact. Sending a greeting.'}}}], 'role': 'model'}, 'finishReason': 'STOP', 'avgLogprobs': -0.25891534213362066}], 'usageMetadata': {'promptTokenCount': 2493, 'candidatesTokenCount': 29, 'totalTokenCount': 2522, 'promptTokensDetails': [{'modality': 'TEXT', 'tokenCount': 2493}], 'candidatesTokensDetails': [{'modality': 'TEXT', 'tokenCount': 29}]}, 'modelVersion': 'gemini-1.5-pro-002'}
+                # To patch this, if we have multiple parts we can take the last one
+                if len(parts) > 1:
+                    logger.warning(f"Unexpected multiple parts in response from Google AI: {parts}")
+                    parts = [parts[-1]]
+
                 # TODO support parts / multimodal
                 # TODO support parallel tool calling natively
                 # TODO Alternative here is to throw away everything else except for the first part
|
letta/llm_api/llm_client_base.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
from abc import abstractmethod
|
2
|
-
from typing import List, Optional, Union
|
2
|
+
from typing import Dict, List, Optional, Union
|
3
3
|
|
4
|
+
from anthropic.types.beta.messages import BetaMessageBatch
|
4
5
|
from openai import AsyncStream, Stream
|
5
6
|
from openai.types.chat.chat_completion_chunk import ChatCompletionChunk
|
6
7
|
|
@@ -21,7 +22,6 @@ class LLMClientBase:
|
|
21
22
|
self,
|
22
23
|
llm_config: LLMConfig,
|
23
24
|
put_inner_thoughts_first: Optional[bool] = True,
|
24
|
-
use_structured_output: Optional[bool] = True,
|
25
25
|
use_tool_naming: bool = True,
|
26
26
|
):
|
27
27
|
self.llm_config = llm_config
|
@@ -40,7 +40,7 @@ class LLMClientBase:
|
|
40
40
|
If stream=True, returns a Stream[ChatCompletionChunk] that can be iterated over.
|
41
41
|
Otherwise returns a ChatCompletionResponse.
|
42
42
|
"""
|
43
|
-
request_data = self.build_request_data(messages, tools, force_tool_call)
|
43
|
+
request_data = self.build_request_data(messages, self.llm_config, tools, force_tool_call)
|
44
44
|
|
45
45
|
try:
|
46
46
|
log_event(name="llm_request_sent", attributes=request_data)
|
@@ -66,8 +66,7 @@ class LLMClientBase:
|
|
66
66
|
If stream=True, returns an AsyncStream[ChatCompletionChunk] that can be async iterated over.
|
67
67
|
Otherwise returns a ChatCompletionResponse.
|
68
68
|
"""
|
69
|
-
request_data = self.build_request_data(messages, tools, force_tool_call)
|
70
|
-
response_data = {}
|
69
|
+
request_data = self.build_request_data(messages, self.llm_config, tools, force_tool_call)
|
71
70
|
|
72
71
|
try:
|
73
72
|
log_event(name="llm_request_sent", attributes=request_data)
|
@@ -81,10 +80,19 @@ class LLMClientBase:
|
|
81
80
|
|
82
81
|
return self.convert_response_to_chat_completion(response_data, messages)
|
83
82
|
|
83
|
+
async def send_llm_batch_request_async(
|
84
|
+
self,
|
85
|
+
agent_messages_mapping: Dict[str, List[Message]],
|
86
|
+
agent_tools_mapping: Dict[str, List[dict]],
|
87
|
+
agent_llm_config_mapping: Dict[str, LLMConfig],
|
88
|
+
) -> Union[BetaMessageBatch]:
|
89
|
+
raise NotImplementedError
|
90
|
+
|
84
91
|
@abstractmethod
|
85
92
|
def build_request_data(
|
86
93
|
self,
|
87
94
|
messages: List[Message],
|
95
|
+
llm_config: LLMConfig,
|
88
96
|
tools: List[dict],
|
89
97
|
force_tool_call: Optional[str] = None,
|
90
98
|
) -> dict:
|
letta/llm_api/openai.py
CHANGED
@@ -135,7 +135,7 @@ def build_openai_chat_completions_request(
             tool_choice=tool_choice,
             user=str(user_id),
             max_completion_tokens=llm_config.max_tokens,
-            temperature=llm_config.temperature,
+            temperature=1.0 if llm_config.enable_reasoner else llm_config.temperature,
         )
     else:
         data = ChatCompletionRequest(
@@ -145,7 +145,7 @@ def build_openai_chat_completions_request(
             function_call=function_call,
             user=str(user_id),
             max_completion_tokens=llm_config.max_tokens,
-            temperature=llm_config.temperature,
+            temperature=1.0 if llm_config.enable_reasoner else llm_config.temperature,
         )
     # https://platform.openai.com/docs/guides/text-generation/json-mode
     # only supported by gpt-4o, gpt-4-turbo, or gpt-3.5-turbo
@@ -168,7 +168,6 @@ def build_openai_chat_completions_request(
                 tool.function = FunctionSchema(**structured_output_version)
             except ValueError as e:
                 warnings.warn(f"Failed to convert tool function to structured output, tool={tool}, error={e}")
-
     return data


@@ -488,4 +487,6 @@ def prepare_openai_payload(chat_completion_request: ChatCompletionRequest):
     # except ValueError as e:
     #     warnings.warn(f"Failed to convert tool function to structured output, tool={tool}, error={e}")

+    if "o3-mini" in chat_completion_request.model or "o1" in chat_completion_request.model:
+        data.pop("parallel_tool_calls", None)
     return data