letta-nightly 0.6.15.dev20250126103925__py3-none-any.whl → 0.6.16.dev20250127104048__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of letta-nightly might be problematic.
- letta/__init__.py +1 -2
- letta/agent.py +5 -1
- letta/cli/cli_config.py +1 -1
- letta/client/client.py +4 -20
- letta/functions/schema_generator.py +24 -11
- letta/llm_api/anthropic.py +485 -7
- letta/llm_api/llm_api_tools.py +28 -13
- letta/llm_api/openai.py +8 -3
- letta/local_llm/constants.py +1 -0
- letta/schemas/message.py +6 -5
- letta/schemas/providers.py +125 -0
- letta/schemas/tool.py +0 -4
- letta/server/rest_api/interface.py +15 -3
- letta/server/rest_api/routers/v1/agents.py +2 -0
- letta/server/rest_api/routers/v1/tools.py +1 -1
- letta/server/server.py +23 -5
- letta/services/helpers/agent_manager_helper.py +22 -1
- letta/services/tool_manager.py +1 -0
- letta/settings.py +3 -0
- letta/streaming_utils.py +5 -1
- {letta_nightly-0.6.15.dev20250126103925.dist-info → letta_nightly-0.6.16.dev20250127104048.dist-info}/METADATA +1 -1
- {letta_nightly-0.6.15.dev20250126103925.dist-info → letta_nightly-0.6.16.dev20250127104048.dist-info}/RECORD +25 -25
- {letta_nightly-0.6.15.dev20250126103925.dist-info → letta_nightly-0.6.16.dev20250127104048.dist-info}/LICENSE +0 -0
- {letta_nightly-0.6.15.dev20250126103925.dist-info → letta_nightly-0.6.16.dev20250127104048.dist-info}/WHEEL +0 -0
- {letta_nightly-0.6.15.dev20250126103925.dist-info → letta_nightly-0.6.16.dev20250127104048.dist-info}/entry_points.txt +0 -0
letta/llm_api/anthropic.py
CHANGED
@@ -1,21 +1,41 @@
 import json
 import re
-
+import time
+from typing import Generator, List, Optional, Tuple, Union
 
 import anthropic
 from anthropic import PermissionDeniedError
+from anthropic.types.beta import (
+    BetaRawContentBlockDeltaEvent,
+    BetaRawContentBlockStartEvent,
+    BetaRawContentBlockStopEvent,
+    BetaRawMessageDeltaEvent,
+    BetaRawMessageStartEvent,
+    BetaRawMessageStopEvent,
+    BetaTextBlock,
+    BetaToolUseBlock,
+)
 
 from letta.errors import BedrockError, BedrockPermissionError
 from letta.llm_api.aws_bedrock import get_bedrock_client
-from letta.
+from letta.local_llm.utils import num_tokens_from_functions, num_tokens_from_messages
+from letta.schemas.message import Message as _Message
+from letta.schemas.message import MessageRole as _MessageRole
 from letta.schemas.openai.chat_completion_request import ChatCompletionRequest, Tool
-from letta.schemas.openai.chat_completion_response import ChatCompletionResponse, Choice, FunctionCall
 from letta.schemas.openai.chat_completion_response import (
-
+    ChatCompletionChunkResponse,
+    ChatCompletionResponse,
+    Choice,
+    ChunkChoice,
+    FunctionCall,
+    FunctionCallDelta,
 )
-from letta.schemas.openai.chat_completion_response import
+from letta.schemas.openai.chat_completion_response import Message
+from letta.schemas.openai.chat_completion_response import Message as ChoiceMessage
+from letta.schemas.openai.chat_completion_response import MessageDelta, ToolCall, ToolCallDelta, UsageStatistics
 from letta.services.provider_manager import ProviderManager
 from letta.settings import model_settings
+from letta.streaming_interface import AgentChunkStreamingInterface, AgentRefreshStreamingInterface
 from letta.utils import get_utc_time, smart_urljoin
 
 BASE_URL = "https://api.anthropic.com/v1"

@@ -200,6 +220,28 @@ def strip_xml_tags(string: str, tag: Optional[str]) -> str:
     return re.sub(tag_pattern, "", string)
 
 
+def strip_xml_tags_streaming(string: str, tag: Optional[str]) -> str:
+    if tag is None:
+        return string
+
+    # Handle common partial tag cases
+    parts_to_remove = [
+        "<",  # Leftover start bracket
+        f"<{tag}",  # Opening tag start
+        f"</{tag}",  # Closing tag start
+        f"/{tag}>",  # Closing tag end
+        f"{tag}>",  # Opening tag end
+        f"/{tag}",  # Partial closing tag without >
+        ">",  # Leftover end bracket
+    ]
+
+    result = string
+    for part in parts_to_remove:
+        result = result.replace(part, "")
+
+    return result
+
+
 def convert_anthropic_response_to_chatcompletion(
     response: anthropic.types.Message,
     inner_thoughts_xml_tag: Optional[str] = None,

@@ -307,6 +349,166 @@ def convert_anthropic_response_to_chatcompletion(
     )
 
 
+def convert_anthropic_stream_event_to_chatcompletion(
+    event: Union[
+        BetaRawMessageStartEvent,
+        BetaRawContentBlockStartEvent,
+        BetaRawContentBlockDeltaEvent,
+        BetaRawContentBlockStopEvent,
+        BetaRawMessageDeltaEvent,
+        BetaRawMessageStopEvent,
+    ],
+    message_id: str,
+    model: str,
+    inner_thoughts_xml_tag: Optional[str] = "thinking",
+) -> ChatCompletionChunkResponse:
+    """Convert Anthropic stream events to OpenAI ChatCompletionResponse format.
+
+    Args:
+        event: The event to convert
+        message_id: The ID of the message. Anthropic does not return this on every event, so we need to keep track of it
+        model: The model used. Anthropic does not return this on every event, so we need to keep track of it
+
+    Example response from OpenAI:
+
+    'id': 'MESSAGE_ID',
+    'choices': [
+        {
+            'finish_reason': None,
+            'index': 0,
+            'delta': {
+                'content': None,
+                'tool_calls': [
+                    {
+                        'index': 0,
+                        'id': None,
+                        'type': 'function',
+                        'function': {
+                            'name': None,
+                            'arguments': '_th'
+                        }
+                    }
+                ],
+                'function_call': None
+            },
+            'logprobs': None
+        }
+    ],
+    'created': datetime.datetime(2025, 1, 24, 0, 18, 55, tzinfo=TzInfo(UTC)),
+    'model': 'gpt-4o-mini-2024-07-18',
+    'system_fingerprint': 'fp_bd83329f63',
+    'object': 'chat.completion.chunk'
+    }
+    """
+    # Get finish reason
+    finish_reason = None
+    if isinstance(event, BetaRawMessageDeltaEvent):
+        """
+        BetaRawMessageDeltaEvent(
+            delta=Delta(
+                stop_reason='tool_use',
+                stop_sequence=None
+            ),
+            type='message_delta',
+            usage=BetaMessageDeltaUsage(output_tokens=45)
+        )
+        """
+        finish_reason = remap_finish_reason(event.delta.stop_reason)
+
+    # Get content and tool calls
+    content = None
+    tool_calls = None
+    if isinstance(event, BetaRawContentBlockDeltaEvent):
+        """
+        BetaRawContentBlockDeltaEvent(
+            delta=BetaInputJSONDelta(
+                partial_json='lo',
+                type='input_json_delta'
+            ),
+            index=0,
+            type='content_block_delta'
+        )
+
+        OR
+
+        BetaRawContentBlockDeltaEvent(
+            delta=BetaTextDelta(
+                text='👋 ',
+                type='text_delta'
+            ),
+            index=0,
+            type='content_block_delta'
+        )
+
+        """
+        if event.delta.type == "text_delta":
+            content = strip_xml_tags_streaming(string=event.delta.text, tag=inner_thoughts_xml_tag)
+
+        elif event.delta.type == "input_json_delta":
+            tool_calls = [
+                ToolCallDelta(
+                    index=0,
+                    function=FunctionCallDelta(
+                        name=None,
+                        arguments=event.delta.partial_json,
+                    ),
+                )
+            ]
+    elif isinstance(event, BetaRawContentBlockStartEvent):
+        """
+        BetaRawContentBlockStartEvent(
+            content_block=BetaToolUseBlock(
+                id='toolu_01LmpZhRhR3WdrRdUrfkKfFw',
+                input={},
+                name='get_weather',
+                type='tool_use'
+            ),
+            index=0,
+            type='content_block_start'
+        )
+
+        OR
+
+        BetaRawContentBlockStartEvent(
+            content_block=BetaTextBlock(
+                text='',
+                type='text'
+            ),
+            index=0,
+            type='content_block_start'
+        )
+        """
+        if isinstance(event.content_block, BetaToolUseBlock):
+            tool_calls = [
+                ToolCallDelta(
+                    index=0,
+                    id=event.content_block.id,
+                    function=FunctionCallDelta(
+                        name=event.content_block.name,
+                        arguments="",
+                    ),
+                )
+            ]
+        elif isinstance(event.content_block, BetaTextBlock):
+            content = event.content_block.text
+
+    # Initialize base response
+    choice = ChunkChoice(
+        index=0,
+        finish_reason=finish_reason,
+        delta=MessageDelta(
+            content=content,
+            tool_calls=tool_calls,
+        ),
+    )
+    return ChatCompletionChunkResponse(
+        id=message_id,
+        choices=[choice],
+        created=get_utc_time(),
+        model=model,
+    )
+
+
 def _prepare_anthropic_request(
     data: ChatCompletionRequest,
     inner_thoughts_xml_tag: Optional[str] = "thinking",

@@ -345,7 +547,7 @@ def _prepare_anthropic_request(
             message["content"] = None
 
     # Convert to Anthropic format
-    msg_objs = [
+    msg_objs = [_Message.dict_to_message(user_id=None, agent_id=None, openai_message_dict=m) for m in data["messages"]]
     data["messages"] = [m.to_anthropic_dict(inner_thoughts_xml_tag=inner_thoughts_xml_tag) for m in msg_objs]
 
     # Ensure first message is user

@@ -359,7 +561,7 @@ def _prepare_anthropic_request(
     assert "max_tokens" in data, data
 
     # Remove OpenAI-specific fields
-    for field in ["frequency_penalty", "logprobs", "n", "top_p", "presence_penalty", "user"]:
+    for field in ["frequency_penalty", "logprobs", "n", "top_p", "presence_penalty", "user", "stream"]:
        data.pop(field, None)
 
     return data

@@ -427,3 +629,279 @@ def anthropic_bedrock_chat_completions_request(
         raise BedrockPermissionError(f"User does not have access to the Bedrock model with the specified ID. {data['model']}")
     except Exception as e:
         raise BedrockError(f"Bedrock error: {e}")
+
+
+def anthropic_chat_completions_request_stream(
+    data: ChatCompletionRequest,
+    inner_thoughts_xml_tag: Optional[str] = "thinking",
+    betas: List[str] = ["tools-2024-04-04"],
+) -> Generator[ChatCompletionChunkResponse, None, None]:
+    """Stream chat completions from Anthropic API.
+
+    Similar to OpenAI's streaming, but using Anthropic's native streaming support.
+    See: https://docs.anthropic.com/claude/reference/messages-streaming
+    """
+    data = _prepare_anthropic_request(data, inner_thoughts_xml_tag)
+
+    anthropic_override_key = ProviderManager().get_anthropic_override_key()
+    if anthropic_override_key:
+        anthropic_client = anthropic.Anthropic(api_key=anthropic_override_key)
+    elif model_settings.anthropic_api_key:
+        anthropic_client = anthropic.Anthropic()
+
+    with anthropic_client.beta.messages.stream(
+        **data,
+        betas=betas,
+    ) as stream:
+        # Stream: https://github.com/anthropics/anthropic-sdk-python/blob/d212ec9f6d5e956f13bc0ddc3d86b5888a954383/src/anthropic/lib/streaming/_beta_messages.py#L22
+        message_id = None
+        model = None
+
+        for chunk in stream._raw_stream:
+            time.sleep(0.01)  # Anthropic is really fast, faster than frontend can upload.
+            if isinstance(chunk, BetaRawMessageStartEvent):
+                """
+                BetaRawMessageStartEvent(
+                    message=BetaMessage(
+                        id='MESSAGE ID HERE',
+                        content=[],
+                        model='claude-3-5-sonnet-20241022',
+                        role='assistant',
+                        stop_reason=None,
+                        stop_sequence=None,
+                        type='message',
+                        usage=BetaUsage(
+                            cache_creation_input_tokens=0,
+                            cache_read_input_tokens=0,
+                            input_tokens=30,
+                            output_tokens=4
+                        )
+                    ),
+                    type='message_start'
+                ),
+                """
+                message_id = chunk.message.id
+                model = chunk.message.model
+            yield convert_anthropic_stream_event_to_chatcompletion(chunk, message_id, model, inner_thoughts_xml_tag)
+
+
+def anthropic_chat_completions_process_stream(
+    chat_completion_request: ChatCompletionRequest,
+    stream_interface: Optional[Union[AgentChunkStreamingInterface, AgentRefreshStreamingInterface]] = None,
+    inner_thoughts_xml_tag: Optional[str] = "thinking",
+    create_message_id: bool = True,
+    create_message_datetime: bool = True,
+    betas: List[str] = ["tools-2024-04-04"],
+) -> ChatCompletionResponse:
+    """Process a streaming completion response from Anthropic, similar to OpenAI's streaming.
+
+    Args:
+        api_key: The Anthropic API key
+        chat_completion_request: The chat completion request
+        stream_interface: Interface for handling streaming chunks
+        inner_thoughts_xml_tag: Tag for inner thoughts in the response
+        create_message_id: Whether to create a message ID
+        create_message_datetime: Whether to create message datetime
+        betas: Beta features to enable
+
+    Returns:
+        The final ChatCompletionResponse
+    """
+    assert chat_completion_request.stream == True
+    assert stream_interface is not None, "Required"
+
+    # Count prompt tokens - we'll get completion tokens from the final response
+    chat_history = [m.model_dump(exclude_none=True) for m in chat_completion_request.messages]
+    prompt_tokens = num_tokens_from_messages(
+        messages=chat_history,
+        model=chat_completion_request.model,
+    )
+
+    # Add tokens for tools if present
+    if chat_completion_request.tools is not None:
+        assert chat_completion_request.functions is None
+        prompt_tokens += num_tokens_from_functions(
+            functions=[t.function.model_dump() for t in chat_completion_request.tools],
+            model=chat_completion_request.model,
+        )
+    elif chat_completion_request.functions is not None:
+        assert chat_completion_request.tools is None
+        prompt_tokens += num_tokens_from_functions(
+            functions=[f.model_dump() for f in chat_completion_request.functions],
+            model=chat_completion_request.model,
+        )
+
+    # Create a dummy message for ID/datetime if needed
+    dummy_message = _Message(
+        role=_MessageRole.assistant,
+        text="",
+        agent_id="",
+        model="",
+        name=None,
+        tool_calls=None,
+        tool_call_id=None,
+    )
+
+    TEMP_STREAM_RESPONSE_ID = "temp_id"
+    TEMP_STREAM_FINISH_REASON = "temp_null"
+    TEMP_STREAM_TOOL_CALL_ID = "temp_id"
+    chat_completion_response = ChatCompletionResponse(
+        id=dummy_message.id if create_message_id else TEMP_STREAM_RESPONSE_ID,
+        choices=[],
+        created=dummy_message.created_at,
+        model=chat_completion_request.model,
+        usage=UsageStatistics(
+            completion_tokens=0,
+            prompt_tokens=prompt_tokens,
+            total_tokens=prompt_tokens,
+        ),
+    )
+
+    if stream_interface:
+        stream_interface.stream_start()
+
+    n_chunks = 0
+    try:
+        for chunk_idx, chat_completion_chunk in enumerate(
+            anthropic_chat_completions_request_stream(
+                data=chat_completion_request,
+                inner_thoughts_xml_tag=inner_thoughts_xml_tag,
+                betas=betas,
+            )
+        ):
+            assert isinstance(chat_completion_chunk, ChatCompletionChunkResponse), type(chat_completion_chunk)
+
+            if stream_interface:
+                if isinstance(stream_interface, AgentChunkStreamingInterface):
+                    stream_interface.process_chunk(
+                        chat_completion_chunk,
+                        message_id=chat_completion_response.id if create_message_id else chat_completion_chunk.id,
+                        message_date=chat_completion_response.created if create_message_datetime else chat_completion_chunk.created,
+                    )
+                elif isinstance(stream_interface, AgentRefreshStreamingInterface):
+                    stream_interface.process_refresh(chat_completion_response)
+                else:
+                    raise TypeError(stream_interface)
+
+            if chunk_idx == 0:
+                # initialize the choice objects which we will increment with the deltas
+                num_choices = len(chat_completion_chunk.choices)
+                assert num_choices > 0
+                chat_completion_response.choices = [
+                    Choice(
+                        finish_reason=TEMP_STREAM_FINISH_REASON,  # NOTE: needs to be ovrerwritten
+                        index=i,
+                        message=Message(
+                            role="assistant",
+                        ),
+                    )
+                    for i in range(len(chat_completion_chunk.choices))
+                ]
+
+            # add the choice delta
+            assert len(chat_completion_chunk.choices) == len(chat_completion_response.choices), chat_completion_chunk
+            for chunk_choice in chat_completion_chunk.choices:
+                if chunk_choice.finish_reason is not None:
+                    chat_completion_response.choices[chunk_choice.index].finish_reason = chunk_choice.finish_reason
+
+                if chunk_choice.logprobs is not None:
+                    chat_completion_response.choices[chunk_choice.index].logprobs = chunk_choice.logprobs
+
+                accum_message = chat_completion_response.choices[chunk_choice.index].message
+                message_delta = chunk_choice.delta
+
+                if message_delta.content is not None:
+                    content_delta = message_delta.content
+                    if accum_message.content is None:
+                        accum_message.content = content_delta
+                    else:
+                        accum_message.content += content_delta
+
+                # TODO(charles) make sure this works for parallel tool calling?
+                if message_delta.tool_calls is not None:
+                    tool_calls_delta = message_delta.tool_calls
+
+                    # If this is the first tool call showing up in a chunk, initialize the list with it
+                    if accum_message.tool_calls is None:
+                        accum_message.tool_calls = [
+                            ToolCall(id=TEMP_STREAM_TOOL_CALL_ID, function=FunctionCall(name="", arguments=""))
+                            for _ in range(len(tool_calls_delta))
+                        ]
+
+                    # There may be many tool calls in a tool calls delta (e.g. parallel tool calls)
+                    for tool_call_delta in tool_calls_delta:
+                        if tool_call_delta.id is not None:
+                            # TODO assert that we're not overwriting?
+                            # TODO += instead of =?
+                            if tool_call_delta.index not in range(len(accum_message.tool_calls)):
+                                warnings.warn(
+                                    f"Tool call index out of range ({tool_call_delta.index})\ncurrent tool calls: {accum_message.tool_calls}\ncurrent delta: {tool_call_delta}"
+                                )
+                                # force index 0
+                                # accum_message.tool_calls[0].id = tool_call_delta.id
+                            else:
+                                accum_message.tool_calls[tool_call_delta.index].id = tool_call_delta.id
+                        if tool_call_delta.function is not None:
+                            if tool_call_delta.function.name is not None:
+                                # TODO assert that we're not overwriting?
+                                # TODO += instead of =?
+                                if tool_call_delta.index not in range(len(accum_message.tool_calls)):
+                                    warnings.warn(
+                                        f"Tool call index out of range ({tool_call_delta.index})\ncurrent tool calls: {accum_message.tool_calls}\ncurrent delta: {tool_call_delta}"
+                                    )
+                                    # force index 0
+                                    # accum_message.tool_calls[0].function.name = tool_call_delta.function.name
+                                else:
+                                    accum_message.tool_calls[tool_call_delta.index].function.name = tool_call_delta.function.name
+                            if tool_call_delta.function.arguments is not None:
+                                if tool_call_delta.index not in range(len(accum_message.tool_calls)):
+                                    warnings.warn(
+                                        f"Tool call index out of range ({tool_call_delta.index})\ncurrent tool calls: {accum_message.tool_calls}\ncurrent delta: {tool_call_delta}"
+                                    )
+                                    # force index 0
+                                    # accum_message.tool_calls[0].function.arguments += tool_call_delta.function.arguments
+                                else:
+                                    accum_message.tool_calls[tool_call_delta.index].function.arguments += tool_call_delta.function.arguments
+
+                if message_delta.function_call is not None:
+                    raise NotImplementedError(f"Old function_call style not support with stream=True")
+
+            # overwrite response fields based on latest chunk
+            if not create_message_id:
+                chat_completion_response.id = chat_completion_chunk.id
+            if not create_message_datetime:
+                chat_completion_response.created = chat_completion_chunk.created
+            chat_completion_response.model = chat_completion_chunk.model
+            chat_completion_response.system_fingerprint = chat_completion_chunk.system_fingerprint
+
+            # increment chunk counter
+            n_chunks += 1
+
+    except Exception as e:
+        if stream_interface:
+            stream_interface.stream_end()
+        print(f"Parsing ChatCompletion stream failed with error:\n{str(e)}")
+        raise e
+    finally:
+        if stream_interface:
+            stream_interface.stream_end()
+
+    # make sure we didn't leave temp stuff in
+    assert all([c.finish_reason != TEMP_STREAM_FINISH_REASON for c in chat_completion_response.choices])
+    assert all(
+        [
+            all([tc.id != TEMP_STREAM_TOOL_CALL_ID for tc in c.message.tool_calls]) if c.message.tool_calls else True
+            for c in chat_completion_response.choices
+        ]
+    )
+    if not create_message_id:
+        assert chat_completion_response.id != dummy_message.id
+
+    # compute token usage before returning
+    # TODO try actually computing the #tokens instead of assuming the chunks is the same
+    chat_completion_response.usage.completion_tokens = n_chunks
+    chat_completion_response.usage.total_tokens = prompt_tokens + n_chunks
+
+    assert len(chat_completion_response.choices) > 0, chat_completion_response
+
+    return chat_completion_response
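For orientation, a minimal usage sketch of the new streaming entry point added above. This sketch is not part of the package: it assumes an Anthropic API key is configured for letta (ANTHROPIC_API_KEY / model settings) and that ChatCompletionRequest will accept plain OpenAI-style message dicts; inside letta the request is built from Message objects instead (see llm_api_tools.py below).

# Hypothetical sketch, not from the diff: consume the raw chunk generator directly.
from letta.llm_api.anthropic import anthropic_chat_completions_request_stream
from letta.schemas.openai.chat_completion_request import ChatCompletionRequest

request = ChatCompletionRequest(
    model="claude-3-5-sonnet-20241022",
    messages=[{"role": "user", "content": "Say hello"}],  # assumed to validate into the request's message models
    max_tokens=1024,  # the Anthropic API requires max_tokens
    stream=True,  # _prepare_anthropic_request pops this before calling Anthropic
)

# Each yielded item is a ChatCompletionChunkResponse in OpenAI chunk format.
for chunk in anthropic_chat_completions_request_stream(data=request):
    for choice in chunk.choices:
        if choice.delta.content:
            print(choice.delta.content, end="", flush=True)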
letta/llm_api/llm_api_tools.py
CHANGED
@@ -6,7 +6,11 @@ import requests
 
 from letta.constants import CLI_WARNING_PREFIX
 from letta.errors import LettaConfigurationError, RateLimitExceededError
-from letta.llm_api.anthropic import
+from letta.llm_api.anthropic import (
+    anthropic_bedrock_chat_completions_request,
+    anthropic_chat_completions_process_stream,
+    anthropic_chat_completions_request,
+)
 from letta.llm_api.aws_bedrock import has_valid_aws_credentials
 from letta.llm_api.azure_openai import azure_openai_chat_completions_request
 from letta.llm_api.google_ai import convert_tools_to_google_ai_format, google_ai_chat_completions_request

@@ -243,27 +247,38 @@ def create(
         )
 
     elif llm_config.model_endpoint_type == "anthropic":
-        if stream:
-            raise NotImplementedError(f"Streaming not yet implemented for {llm_config.model_endpoint_type}")
         if not use_tool_naming:
             raise NotImplementedError("Only tool calling supported on Anthropic API requests")
 
+        # Force tool calling
         tool_call = None
         if force_tool_call is not None:
             tool_call = {"type": "function", "function": {"name": force_tool_call}}
             assert functions is not None
 
+        chat_completion_request = ChatCompletionRequest(
+            model=llm_config.model,
+            messages=[cast_message_to_subtype(m.to_openai_dict()) for m in messages],
+            tools=([{"type": "function", "function": f} for f in functions] if functions else None),
+            tool_choice=tool_call,
+            max_tokens=1024,  # TODO make dynamic
+            temperature=llm_config.temperature,
+            stream=stream,
+        )
+
+        # Handle streaming
+        if stream:  # Client requested token streaming
+            assert isinstance(stream_interface, (AgentChunkStreamingInterface, AgentRefreshStreamingInterface)), type(stream_interface)
+
+            response = anthropic_chat_completions_process_stream(
+                chat_completion_request=chat_completion_request,
+                stream_interface=stream_interface,
+            )
+            return response
+
+        # Client did not request token streaming (expect a blocking backend response)
         return anthropic_chat_completions_request(
-            data=
-                model=llm_config.model,
-                messages=[cast_message_to_subtype(m.to_openai_dict()) for m in messages],
-                tools=[{"type": "function", "function": f} for f in functions] if functions else None,
-                tool_choice=tool_call,
-                # user=str(user_id),
-                # NOTE: max_tokens is required for Anthropic API
-                max_tokens=1024,  # TODO make dynamic
-                temperature=llm_config.temperature,
-            ),
+            data=chat_completion_request,
         )
 
     # elif llm_config.model_endpoint_type == "cohere":
letta/llm_api/openai.py
CHANGED
@@ -5,7 +5,7 @@ import requests
 from openai import OpenAI
 
 from letta.llm_api.helpers import add_inner_thoughts_to_functions, convert_to_structured_output, make_post_request
-from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION
+from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION, INNER_THOUGHTS_KWARG_DESCRIPTION_GO_FIRST
 from letta.local_llm.utils import num_tokens_from_functions, num_tokens_from_messages
 from letta.schemas.llm_config import LLMConfig
 from letta.schemas.message import Message as _Message

@@ -30,7 +30,7 @@ OPENAI_SSE_DONE = "[DONE]"
 
 
 def openai_get_model_list(
-    url: str, api_key:
+    url: str, api_key: Optional[str] = None, fix_url: Optional[bool] = False, extra_params: Optional[dict] = None
 ) -> dict:
     """https://platform.openai.com/docs/api-reference/models/list"""
     from letta.utils import printd

@@ -96,10 +96,15 @@ def build_openai_chat_completions_request(
     max_tokens: Optional[int],
 ) -> ChatCompletionRequest:
     if functions and llm_config.put_inner_thoughts_in_kwargs:
+        # Special case for LM Studio backend since it needs extra guidance to force out the thoughts first
+        # TODO(fix)
+        inner_thoughts_desc = (
+            INNER_THOUGHTS_KWARG_DESCRIPTION_GO_FIRST if ":1234" in llm_config.model_endpoint else INNER_THOUGHTS_KWARG_DESCRIPTION
+        )
         functions = add_inner_thoughts_to_functions(
             functions=functions,
             inner_thoughts_key=INNER_THOUGHTS_KWARG,
-            inner_thoughts_description=
+            inner_thoughts_description=inner_thoughts_desc,
         )
 
     openai_message_list = [
letta/local_llm/constants.py
CHANGED
@@ -27,6 +27,7 @@ DEFAULT_WRAPPER_NAME = "chatml"
 
 INNER_THOUGHTS_KWARG = "inner_thoughts"
 INNER_THOUGHTS_KWARG_DESCRIPTION = "Deep inner monologue private to you only."
+INNER_THOUGHTS_KWARG_DESCRIPTION_GO_FIRST = f"Deep inner monologue private to you only. Think before you act, so always generate arg '{INNER_THOUGHTS_KWARG}' first before any other arg."
 INNER_THOUGHTS_CLI_SYMBOL = "💭"
 
 ASSISTANT_MESSAGE_CLI_SYMBOL = "🤖"
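The new GO_FIRST constant only changes the argument description injected by add_inner_thoughts_to_functions; the choice between the two descriptions is made in openai.py above via a substring check on the endpoint URL. Below is a standalone sketch of that heuristic with hypothetical endpoint strings (LM Studio's local server defaults to port 1234, hence the check).

# Standalone illustration of the ":1234" heuristic; constants copied from letta/local_llm/constants.py.
INNER_THOUGHTS_KWARG = "inner_thoughts"
INNER_THOUGHTS_KWARG_DESCRIPTION = "Deep inner monologue private to you only."
INNER_THOUGHTS_KWARG_DESCRIPTION_GO_FIRST = (
    "Deep inner monologue private to you only. "
    f"Think before you act, so always generate arg '{INNER_THOUGHTS_KWARG}' first before any other arg."
)

def pick_inner_thoughts_description(model_endpoint: str) -> str:
    # LM Studio's local server listens on port 1234 by default, hence the substring check.
    return INNER_THOUGHTS_KWARG_DESCRIPTION_GO_FIRST if ":1234" in model_endpoint else INNER_THOUGHTS_KWARG_DESCRIPTION

print(pick_inner_thoughts_description("http://localhost:1234/v1"))   # -> GO_FIRST variant
print(pick_inner_thoughts_description("https://api.openai.com/v1"))  # -> default variant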
letta/schemas/message.py
CHANGED
@@ -1,6 +1,7 @@
 import copy
 import json
 import warnings
+from collections import OrderedDict
 from datetime import datetime, timezone
 from typing import Any, Dict, List, Literal, Optional, Union
 

@@ -33,18 +34,18 @@ def add_inner_thoughts_to_tool_call(
     inner_thoughts_key: str,
 ) -> OpenAIToolCall:
     """Add inner thoughts (arg + value) to a tool call"""
-    # because the kwargs are stored as strings, we need to load then write the JSON dicts
     try:
         # load the args list
         func_args = json.loads(tool_call.function.arguments)
-        #
-
+        # create new ordered dict with inner thoughts first
+        ordered_args = OrderedDict({inner_thoughts_key: inner_thoughts})
+        # update with remaining args
+        ordered_args.update(func_args)
         # create the updated tool call (as a string)
         updated_tool_call = copy.deepcopy(tool_call)
-        updated_tool_call.function.arguments = json_dumps(
+        updated_tool_call.function.arguments = json_dumps(ordered_args)
         return updated_tool_call
     except json.JSONDecodeError as e:
-        # TODO: change to logging
         warnings.warn(f"Failed to put inner thoughts in kwargs: {e}")
         raise e
 
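The OrderedDict change above makes the injected inner_thoughts argument serialize before the tool's own arguments. A self-contained sketch of that ordering behavior, using made-up argument values:

import json
from collections import OrderedDict

# Pretend these are a tool call's JSON-encoded arguments.
func_args = json.loads('{"location": "Berlin", "unit": "celsius"}')

# Same pattern as add_inner_thoughts_to_tool_call: inner thoughts first, then the rest.
ordered_args = OrderedDict({"inner_thoughts": "The user wants the current weather."})
ordered_args.update(func_args)

print(json.dumps(ordered_args))
# {"inner_thoughts": "The user wants the current weather.", "location": "Berlin", "unit": "celsius"}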