letta-nightly 0.6.15.dev20250125103914__py3-none-any.whl → 0.6.16.dev20250127040412__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of letta-nightly might be problematic. Click here for more details.

@@ -1,21 +1,41 @@
1
1
  import json
2
2
  import re
3
- from typing import List, Optional, Tuple, Union
3
+ import time
4
+ from typing import Generator, List, Optional, Tuple, Union
4
5
 
5
6
  import anthropic
6
7
  from anthropic import PermissionDeniedError
8
+ from anthropic.types.beta import (
9
+ BetaRawContentBlockDeltaEvent,
10
+ BetaRawContentBlockStartEvent,
11
+ BetaRawContentBlockStopEvent,
12
+ BetaRawMessageDeltaEvent,
13
+ BetaRawMessageStartEvent,
14
+ BetaRawMessageStopEvent,
15
+ BetaTextBlock,
16
+ BetaToolUseBlock,
17
+ )
7
18
 
8
19
  from letta.errors import BedrockError, BedrockPermissionError
9
20
  from letta.llm_api.aws_bedrock import get_bedrock_client
10
- from letta.schemas.message import Message
21
+ from letta.local_llm.utils import num_tokens_from_functions, num_tokens_from_messages
22
+ from letta.schemas.message import Message as _Message
23
+ from letta.schemas.message import MessageRole as _MessageRole
11
24
  from letta.schemas.openai.chat_completion_request import ChatCompletionRequest, Tool
12
- from letta.schemas.openai.chat_completion_response import ChatCompletionResponse, Choice, FunctionCall
13
25
  from letta.schemas.openai.chat_completion_response import (
14
- Message as ChoiceMessage, # NOTE: avoid conflict with our own Letta Message datatype
26
+ ChatCompletionChunkResponse,
27
+ ChatCompletionResponse,
28
+ Choice,
29
+ ChunkChoice,
30
+ FunctionCall,
31
+ FunctionCallDelta,
15
32
  )
16
- from letta.schemas.openai.chat_completion_response import ToolCall, UsageStatistics
33
+ from letta.schemas.openai.chat_completion_response import Message
34
+ from letta.schemas.openai.chat_completion_response import Message as ChoiceMessage
35
+ from letta.schemas.openai.chat_completion_response import MessageDelta, ToolCall, ToolCallDelta, UsageStatistics
17
36
  from letta.services.provider_manager import ProviderManager
18
37
  from letta.settings import model_settings
38
+ from letta.streaming_interface import AgentChunkStreamingInterface, AgentRefreshStreamingInterface
19
39
  from letta.utils import get_utc_time, smart_urljoin
20
40
 
21
41
  BASE_URL = "https://api.anthropic.com/v1"
@@ -200,6 +220,28 @@ def strip_xml_tags(string: str, tag: Optional[str]) -> str:
200
220
  return re.sub(tag_pattern, "", string)
201
221
 
202
222
 
223
def strip_xml_tags_streaming(string: str, tag: Optional[str]) -> str:
    """Strip whole or partial ``tag`` XML markers from one streamed text chunk.

    Streaming deltas can split a tag such as ``<thinking>`` across several
    chunks, so a regex for the complete tag is not enough; this removes every
    fragment a split tag can leave behind.

    Args:
        string: The text chunk to clean.
        tag: The XML tag name (without brackets). ``None`` disables stripping.

    Returns:
        ``string`` with all tag fragments and any stray ``<`` / ``>``
        characters removed, or ``string`` unchanged when ``tag`` is ``None``.
    """
    if tag is None:
        return string

    # Longest fragments first. Stripping the bare "<" before "<tag" would make
    # the "<tag" / "</tag" patterns unmatchable and leak the tag name itself
    # (e.g. "<thinking" -> "thinking") into the streamed output.
    fragments = (
        f"</{tag}>",  # Complete closing tag
        f"<{tag}>",  # Complete opening tag
        f"</{tag}",  # Closing tag start
        f"<{tag}",  # Opening tag start
        f"/{tag}>",  # Closing tag end
        f"{tag}>",  # Opening tag end
        f"/{tag}",  # Partial closing tag without >
        "<",  # Leftover start bracket
        ">",  # Leftover end bracket
    )

    result = string
    for fragment in fragments:
        result = result.replace(fragment, "")

    return result
243
+
244
+
203
245
  def convert_anthropic_response_to_chatcompletion(
204
246
  response: anthropic.types.Message,
205
247
  inner_thoughts_xml_tag: Optional[str] = None,
@@ -307,6 +349,166 @@ def convert_anthropic_response_to_chatcompletion(
307
349
  )
308
350
 
309
351
 
352
def convert_anthropic_stream_event_to_chatcompletion(
    event: Union[
        BetaRawMessageStartEvent,
        BetaRawContentBlockStartEvent,
        BetaRawContentBlockDeltaEvent,
        BetaRawContentBlockStopEvent,
        BetaRawMessageDeltaEvent,
        BetaRawMessageStopEvent,
    ],
    message_id: str,
    model: str,
    inner_thoughts_xml_tag: Optional[str] = "thinking",
) -> ChatCompletionChunkResponse:
    """Translate a single Anthropic raw stream event into an OpenAI-style chunk.

    Args:
        event: The Anthropic stream event to translate.
        message_id: ID to stamp on the chunk. Anthropic does not repeat it on
            every event, so the caller tracks it and passes it in.
        model: Model name to stamp on the chunk; tracked by the caller for the
            same reason as ``message_id``.
        inner_thoughts_xml_tag: Tag whose (possibly partial) markers are
            stripped from text deltas.

    Returns:
        A ``ChatCompletionChunkResponse`` holding exactly one choice. Events
        that carry no finish reason, text, or tool-call data produce a chunk
        whose delta fields are all ``None``.

    Example of the OpenAI chunk shape being emulated:

        {
            'id': 'MESSAGE_ID',
            'choices': [
                {
                    'finish_reason': None,
                    'index': 0,
                    'delta': {
                        'content': None,
                        'tool_calls': [
                            {
                                'index': 0,
                                'id': None,
                                'type': 'function',
                                'function': {
                                    'name': None,
                                    'arguments': '_th'
                                }
                            }
                        ],
                        'function_call': None
                    },
                    'logprobs': None
                }
            ],
            'created': datetime.datetime(2025, 1, 24, 0, 18, 55, tzinfo=TzInfo(UTC)),
            'model': 'gpt-4o-mini-2024-07-18',
            'system_fingerprint': 'fp_bd83329f63',
            'object': 'chat.completion.chunk'
        }
    """
    finish_reason = None
    content = None
    tool_calls = None

    # Finish reason: only message-level delta events carry a stop reason, e.g.
    #   BetaRawMessageDeltaEvent(
    #       delta=Delta(stop_reason='tool_use', stop_sequence=None),
    #       type='message_delta',
    #       usage=BetaMessageDeltaUsage(output_tokens=45)
    #   )
    if isinstance(event, BetaRawMessageDeltaEvent):
        finish_reason = remap_finish_reason(event.delta.stop_reason)

    # Content / tool calls come from content-block delta and start events.
    if isinstance(event, BetaRawContentBlockDeltaEvent):
        # Either a JSON fragment of tool arguments:
        #   BetaRawContentBlockDeltaEvent(
        #       delta=BetaInputJSONDelta(partial_json='lo', type='input_json_delta'),
        #       index=0,
        #       type='content_block_delta'
        #   )
        # or a text fragment:
        #   BetaRawContentBlockDeltaEvent(
        #       delta=BetaTextDelta(text='👋 ', type='text_delta'),
        #       index=0,
        #       type='content_block_delta'
        #   )
        delta = event.delta
        delta_kind = delta.type
        if delta_kind == "text_delta":
            content = strip_xml_tags_streaming(string=delta.text, tag=inner_thoughts_xml_tag)
        elif delta_kind == "input_json_delta":
            arguments_fragment = FunctionCallDelta(name=None, arguments=delta.partial_json)
            tool_calls = [ToolCallDelta(index=0, function=arguments_fragment)]
    elif isinstance(event, BetaRawContentBlockStartEvent):
        # Either the opening of a tool-use block:
        #   BetaRawContentBlockStartEvent(
        #       content_block=BetaToolUseBlock(
        #           id='toolu_01LmpZhRhR3WdrRdUrfkKfFw',
        #           input={},
        #           name='get_weather',
        #           type='tool_use'
        #       ),
        #       index=0,
        #       type='content_block_start'
        #   )
        # or the opening of a text block:
        #   BetaRawContentBlockStartEvent(
        #       content_block=BetaTextBlock(text='', type='text'),
        #       index=0,
        #       type='content_block_start'
        #   )
        block = event.content_block
        if isinstance(block, BetaToolUseBlock):
            # The start event supplies the call id and function name; the
            # arguments arrive later as input_json_delta fragments.
            call_header = FunctionCallDelta(name=block.name, arguments="")
            tool_calls = [ToolCallDelta(index=0, id=block.id, function=call_header)]
        elif isinstance(block, BetaTextBlock):
            content = block.text

    # Wrap whatever was extracted in a single-choice chunk.
    message_delta = MessageDelta(content=content, tool_calls=tool_calls)
    chunk_choice = ChunkChoice(index=0, finish_reason=finish_reason, delta=message_delta)
    return ChatCompletionChunkResponse(
        id=message_id,
        choices=[chunk_choice],
        created=get_utc_time(),
        model=model,
    )
510
+
511
+
310
512
  def _prepare_anthropic_request(
311
513
  data: ChatCompletionRequest,
312
514
  inner_thoughts_xml_tag: Optional[str] = "thinking",
@@ -345,7 +547,7 @@ def _prepare_anthropic_request(
345
547
  message["content"] = None
346
548
 
347
549
  # Convert to Anthropic format
348
- msg_objs = [Message.dict_to_message(user_id=None, agent_id=None, openai_message_dict=m) for m in data["messages"]]
550
+ msg_objs = [_Message.dict_to_message(user_id=None, agent_id=None, openai_message_dict=m) for m in data["messages"]]
349
551
  data["messages"] = [m.to_anthropic_dict(inner_thoughts_xml_tag=inner_thoughts_xml_tag) for m in msg_objs]
350
552
 
351
553
  # Ensure first message is user
@@ -359,7 +561,7 @@ def _prepare_anthropic_request(
359
561
  assert "max_tokens" in data, data
360
562
 
361
563
  # Remove OpenAI-specific fields
362
- for field in ["frequency_penalty", "logprobs", "n", "top_p", "presence_penalty", "user"]:
564
+ for field in ["frequency_penalty", "logprobs", "n", "top_p", "presence_penalty", "user", "stream"]:
363
565
  data.pop(field, None)
364
566
 
365
567
  return data
@@ -427,3 +629,279 @@ def anthropic_bedrock_chat_completions_request(
427
629
  raise BedrockPermissionError(f"User does not have access to the Bedrock model with the specified ID. {data['model']}")
428
630
  except Exception as e:
429
631
  raise BedrockError(f"Bedrock error: {e}")
632
+
633
+
634
def anthropic_chat_completions_request_stream(
    data: ChatCompletionRequest,
    inner_thoughts_xml_tag: Optional[str] = "thinking",
    betas: List[str] = ["tools-2024-04-04"],  # never mutated here, so sharing the default list is safe
) -> Generator[ChatCompletionChunkResponse, None, None]:
    """Stream chat completions from the Anthropic API as OpenAI-style chunks.

    Similar to OpenAI's streaming, but using Anthropic's native streaming support.
    See: https://docs.anthropic.com/claude/reference/messages-streaming

    Args:
        data: The chat completion request; converted to Anthropic's request
            format by ``_prepare_anthropic_request`` before sending.
        inner_thoughts_xml_tag: Tag used for inner thoughts; forwarded to
            request preparation and to per-event conversion.
        betas: Anthropic beta feature flags passed to the SDK.

    Yields:
        One ``ChatCompletionChunkResponse`` per raw Anthropic stream event.

    Raises:
        ValueError: If neither a provider override key nor
            ``model_settings.anthropic_api_key`` is configured.
    """
    data = _prepare_anthropic_request(data, inner_thoughts_xml_tag)

    anthropic_override_key = ProviderManager().get_anthropic_override_key()
    if anthropic_override_key:
        anthropic_client = anthropic.Anthropic(api_key=anthropic_override_key)
    elif model_settings.anthropic_api_key:
        # NOTE(review): no key is passed explicitly here; presumably the SDK
        # picks it up from the environment - confirm.
        anthropic_client = anthropic.Anthropic()
    else:
        # Previously this fell through and failed below with an opaque
        # UnboundLocalError on `anthropic_client`.
        raise ValueError("No Anthropic API key configured: set an Anthropic provider override key or model_settings.anthropic_api_key")

    with anthropic_client.beta.messages.stream(
        **data,
        betas=betas,
    ) as stream:
        # Stream: https://github.com/anthropics/anthropic-sdk-python/blob/d212ec9f6d5e956f13bc0ddc3d86b5888a954383/src/anthropic/lib/streaming/_beta_messages.py#L22
        # The id and model only appear on the message_start event, so remember
        # them and stamp them on every chunk converted afterwards.
        message_id = None
        model = None

        # NOTE: iterates the SDK's private `_raw_stream` attribute to get raw
        # events; this may break on SDK upgrades.
        for chunk in stream._raw_stream:
            time.sleep(0.01)  # Anthropic is really fast, faster than frontend can upload.
            if isinstance(chunk, BetaRawMessageStartEvent):
                # Example:
                #   BetaRawMessageStartEvent(
                #       message=BetaMessage(
                #           id='MESSAGE ID HERE',
                #           content=[],
                #           model='claude-3-5-sonnet-20241022',
                #           role='assistant',
                #           stop_reason=None,
                #           stop_sequence=None,
                #           type='message',
                #           usage=BetaUsage(
                #               cache_creation_input_tokens=0,
                #               cache_read_input_tokens=0,
                #               input_tokens=30,
                #               output_tokens=4
                #           )
                #       ),
                #       type='message_start'
                #   )
                message_id = chunk.message.id
                model = chunk.message.model
            yield convert_anthropic_stream_event_to_chatcompletion(chunk, message_id, model, inner_thoughts_xml_tag)
686
+
687
+
688
def _accumulated_tool_call_for_delta(accum_message, tool_call_delta):
    """Return the accumulated tool call that ``tool_call_delta`` targets, or None (after warning) if its index is out of range."""
    # Local import: the module's import block does not import `warnings`.
    import warnings

    if tool_call_delta.index not in range(len(accum_message.tool_calls)):
        warnings.warn(
            f"Tool call index out of range ({tool_call_delta.index})\ncurrent tool calls: {accum_message.tool_calls}\ncurrent delta: {tool_call_delta}"
        )
        return None
    return accum_message.tool_calls[tool_call_delta.index]


def anthropic_chat_completions_process_stream(
    chat_completion_request: ChatCompletionRequest,
    stream_interface: Optional[Union[AgentChunkStreamingInterface, AgentRefreshStreamingInterface]] = None,
    inner_thoughts_xml_tag: Optional[str] = "thinking",
    create_message_id: bool = True,
    create_message_datetime: bool = True,
    betas: List[str] = ["tools-2024-04-04"],  # never mutated here, so sharing the default list is safe
) -> ChatCompletionResponse:
    """Process a streaming completion response from Anthropic, similar to OpenAI's streaming.

    Consumes the chunk stream, forwards each chunk to ``stream_interface``, and
    folds the deltas into one accumulated ``ChatCompletionResponse``.

    Args:
        chat_completion_request: The chat completion request (must have ``stream`` set)
        stream_interface: Interface for handling streaming chunks (required despite the ``None`` default)
        inner_thoughts_xml_tag: Tag for inner thoughts in the response
        create_message_id: Whether to use a locally created message ID instead of the one on the chunks
        create_message_datetime: Whether to use a locally created datetime instead of the one on the chunks
        betas: Beta features to enable

    Returns:
        The final ChatCompletionResponse

    Raises:
        TypeError: If ``stream_interface`` is neither of the two supported interface types.
        NotImplementedError: If a chunk uses the legacy ``function_call`` delta.
        Exception: Any error raised while streaming is printed and re-raised;
            ``stream_interface.stream_end()`` is called exactly once either way.
    """
    assert chat_completion_request.stream == True
    assert stream_interface is not None, "Required"

    # Count prompt tokens - we'll get completion tokens from the final response
    chat_history = [m.model_dump(exclude_none=True) for m in chat_completion_request.messages]
    prompt_tokens = num_tokens_from_messages(
        messages=chat_history,
        model=chat_completion_request.model,
    )

    # Add tokens for tools if present
    if chat_completion_request.tools is not None:
        assert chat_completion_request.functions is None
        prompt_tokens += num_tokens_from_functions(
            functions=[t.function.model_dump() for t in chat_completion_request.tools],
            model=chat_completion_request.model,
        )
    elif chat_completion_request.functions is not None:
        assert chat_completion_request.tools is None
        prompt_tokens += num_tokens_from_functions(
            functions=[f.model_dump() for f in chat_completion_request.functions],
            model=chat_completion_request.model,
        )

    # Create a dummy message for ID/datetime if needed
    dummy_message = _Message(
        role=_MessageRole.assistant,
        text="",
        agent_id="",
        model="",
        name=None,
        tool_calls=None,
        tool_call_id=None,
    )

    # Placeholders that must all be replaced by real values before returning
    # (checked by the asserts after the stream finishes).
    TEMP_STREAM_RESPONSE_ID = "temp_id"
    TEMP_STREAM_FINISH_REASON = "temp_null"
    TEMP_STREAM_TOOL_CALL_ID = "temp_id"
    chat_completion_response = ChatCompletionResponse(
        id=dummy_message.id if create_message_id else TEMP_STREAM_RESPONSE_ID,
        choices=[],
        created=dummy_message.created_at,
        model=chat_completion_request.model,
        usage=UsageStatistics(
            completion_tokens=0,
            prompt_tokens=prompt_tokens,
            total_tokens=prompt_tokens,
        ),
    )

    if stream_interface:
        stream_interface.stream_start()

    n_chunks = 0
    try:
        for chunk_idx, chat_completion_chunk in enumerate(
            anthropic_chat_completions_request_stream(
                data=chat_completion_request,
                inner_thoughts_xml_tag=inner_thoughts_xml_tag,
                betas=betas,
            )
        ):
            assert isinstance(chat_completion_chunk, ChatCompletionChunkResponse), type(chat_completion_chunk)

            if stream_interface:
                if isinstance(stream_interface, AgentChunkStreamingInterface):
                    stream_interface.process_chunk(
                        chat_completion_chunk,
                        message_id=chat_completion_response.id if create_message_id else chat_completion_chunk.id,
                        message_date=chat_completion_response.created if create_message_datetime else chat_completion_chunk.created,
                    )
                elif isinstance(stream_interface, AgentRefreshStreamingInterface):
                    # The refresh interface receives the accumulated response
                    # as it stands before this chunk is folded in.
                    stream_interface.process_refresh(chat_completion_response)
                else:
                    raise TypeError(stream_interface)

            if chunk_idx == 0:
                # initialize the choice objects which we will increment with the deltas
                num_choices = len(chat_completion_chunk.choices)
                assert num_choices > 0
                chat_completion_response.choices = [
                    Choice(
                        finish_reason=TEMP_STREAM_FINISH_REASON,  # NOTE: needs to be overwritten
                        index=i,
                        message=Message(
                            role="assistant",
                        ),
                    )
                    for i in range(num_choices)
                ]

            # add the choice delta
            assert len(chat_completion_chunk.choices) == len(chat_completion_response.choices), chat_completion_chunk
            for chunk_choice in chat_completion_chunk.choices:
                accum_choice = chat_completion_response.choices[chunk_choice.index]

                if chunk_choice.finish_reason is not None:
                    accum_choice.finish_reason = chunk_choice.finish_reason

                if chunk_choice.logprobs is not None:
                    accum_choice.logprobs = chunk_choice.logprobs

                accum_message = accum_choice.message
                message_delta = chunk_choice.delta

                if message_delta.content is not None:
                    content_delta = message_delta.content
                    if accum_message.content is None:
                        accum_message.content = content_delta
                    else:
                        accum_message.content += content_delta

                # TODO(charles) make sure this works for parallel tool calling?
                if message_delta.tool_calls is not None:
                    tool_calls_delta = message_delta.tool_calls

                    # If this is the first tool call showing up in a chunk, initialize the list with it
                    if accum_message.tool_calls is None:
                        accum_message.tool_calls = [
                            ToolCall(id=TEMP_STREAM_TOOL_CALL_ID, function=FunctionCall(name="", arguments=""))
                            for _ in range(len(tool_calls_delta))
                        ]

                    # There may be many tool calls in a tool calls delta (e.g. parallel tool calls)
                    for tool_call_delta in tool_calls_delta:
                        # Each field present on the delta is applied separately;
                        # an out-of-range index only warns and drops that field.
                        if tool_call_delta.id is not None:
                            # TODO assert that we're not overwriting?
                            # TODO += instead of =?
                            target = _accumulated_tool_call_for_delta(accum_message, tool_call_delta)
                            if target is not None:
                                target.id = tool_call_delta.id
                        if tool_call_delta.function is not None:
                            if tool_call_delta.function.name is not None:
                                # TODO assert that we're not overwriting?
                                # TODO += instead of =?
                                target = _accumulated_tool_call_for_delta(accum_message, tool_call_delta)
                                if target is not None:
                                    target.function.name = tool_call_delta.function.name
                            if tool_call_delta.function.arguments is not None:
                                target = _accumulated_tool_call_for_delta(accum_message, tool_call_delta)
                                if target is not None:
                                    # Arguments stream in as JSON fragments, so append.
                                    target.function.arguments += tool_call_delta.function.arguments

                if message_delta.function_call is not None:
                    raise NotImplementedError(f"Old function_call style not support with stream=True")

            # overwrite response fields based on latest chunk
            if not create_message_id:
                chat_completion_response.id = chat_completion_chunk.id
            if not create_message_datetime:
                chat_completion_response.created = chat_completion_chunk.created
            chat_completion_response.model = chat_completion_chunk.model
            chat_completion_response.system_fingerprint = chat_completion_chunk.system_fingerprint

            # increment chunk counter
            n_chunks += 1

    except Exception as e:
        # stream_end() is left to the `finally` clause so it runs exactly once
        # on failure (it used to be called here and again in `finally`).
        print(f"Parsing ChatCompletion stream failed with error:\n{str(e)}")
        raise
    finally:
        if stream_interface:
            stream_interface.stream_end()

    # make sure we didn't leave temp stuff in
    assert all([c.finish_reason != TEMP_STREAM_FINISH_REASON for c in chat_completion_response.choices])
    assert all(
        [
            all([tc.id != TEMP_STREAM_TOOL_CALL_ID for tc in c.message.tool_calls]) if c.message.tool_calls else True
            for c in chat_completion_response.choices
        ]
    )
    if not create_message_id:
        assert chat_completion_response.id != dummy_message.id

    # compute token usage before returning
    # TODO try actually computing the #tokens instead of assuming the chunks is the same
    chat_completion_response.usage.completion_tokens = n_chunks
    chat_completion_response.usage.total_tokens = prompt_tokens + n_chunks

    assert len(chat_completion_response.choices) > 0, chat_completion_response

    return chat_completion_response
@@ -6,7 +6,11 @@ import requests
6
6
 
7
7
  from letta.constants import CLI_WARNING_PREFIX
8
8
  from letta.errors import LettaConfigurationError, RateLimitExceededError
9
- from letta.llm_api.anthropic import anthropic_bedrock_chat_completions_request, anthropic_chat_completions_request
9
+ from letta.llm_api.anthropic import (
10
+ anthropic_bedrock_chat_completions_request,
11
+ anthropic_chat_completions_process_stream,
12
+ anthropic_chat_completions_request,
13
+ )
10
14
  from letta.llm_api.aws_bedrock import has_valid_aws_credentials
11
15
  from letta.llm_api.azure_openai import azure_openai_chat_completions_request
12
16
  from letta.llm_api.google_ai import convert_tools_to_google_ai_format, google_ai_chat_completions_request
@@ -243,27 +247,38 @@ def create(
243
247
  )
244
248
 
245
249
  elif llm_config.model_endpoint_type == "anthropic":
246
- if stream:
247
- raise NotImplementedError(f"Streaming not yet implemented for {llm_config.model_endpoint_type}")
248
250
  if not use_tool_naming:
249
251
  raise NotImplementedError("Only tool calling supported on Anthropic API requests")
250
252
 
253
+ # Force tool calling
251
254
  tool_call = None
252
255
  if force_tool_call is not None:
253
256
  tool_call = {"type": "function", "function": {"name": force_tool_call}}
254
257
  assert functions is not None
255
258
 
259
+ chat_completion_request = ChatCompletionRequest(
260
+ model=llm_config.model,
261
+ messages=[cast_message_to_subtype(m.to_openai_dict()) for m in messages],
262
+ tools=([{"type": "function", "function": f} for f in functions] if functions else None),
263
+ tool_choice=tool_call,
264
+ max_tokens=1024, # TODO make dynamic
265
+ temperature=llm_config.temperature,
266
+ stream=stream,
267
+ )
268
+
269
+ # Handle streaming
270
+ if stream: # Client requested token streaming
271
+ assert isinstance(stream_interface, (AgentChunkStreamingInterface, AgentRefreshStreamingInterface)), type(stream_interface)
272
+
273
+ response = anthropic_chat_completions_process_stream(
274
+ chat_completion_request=chat_completion_request,
275
+ stream_interface=stream_interface,
276
+ )
277
+ return response
278
+
279
+ # Client did not request token streaming (expect a blocking backend response)
256
280
  return anthropic_chat_completions_request(
257
- data=ChatCompletionRequest(
258
- model=llm_config.model,
259
- messages=[cast_message_to_subtype(m.to_openai_dict()) for m in messages],
260
- tools=[{"type": "function", "function": f} for f in functions] if functions else None,
261
- tool_choice=tool_call,
262
- # user=str(user_id),
263
- # NOTE: max_tokens is required for Anthropic API
264
- max_tokens=1024, # TODO make dynamic
265
- temperature=llm_config.temperature,
266
- ),
281
+ data=chat_completion_request,
267
282
  )
268
283
 
269
284
  # elif llm_config.model_endpoint_type == "cohere":
letta/llm_api/openai.py CHANGED
@@ -5,7 +5,7 @@ import requests
5
5
  from openai import OpenAI
6
6
 
7
7
  from letta.llm_api.helpers import add_inner_thoughts_to_functions, convert_to_structured_output, make_post_request
8
- from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION
8
+ from letta.local_llm.constants import INNER_THOUGHTS_KWARG, INNER_THOUGHTS_KWARG_DESCRIPTION, INNER_THOUGHTS_KWARG_DESCRIPTION_GO_FIRST
9
9
  from letta.local_llm.utils import num_tokens_from_functions, num_tokens_from_messages
10
10
  from letta.schemas.llm_config import LLMConfig
11
11
  from letta.schemas.message import Message as _Message
@@ -30,7 +30,7 @@ OPENAI_SSE_DONE = "[DONE]"
30
30
 
31
31
 
32
32
  def openai_get_model_list(
33
- url: str, api_key: Union[str, None], fix_url: Optional[bool] = False, extra_params: Optional[dict] = None
33
+ url: str, api_key: Optional[str] = None, fix_url: Optional[bool] = False, extra_params: Optional[dict] = None
34
34
  ) -> dict:
35
35
  """https://platform.openai.com/docs/api-reference/models/list"""
36
36
  from letta.utils import printd
@@ -96,10 +96,15 @@ def build_openai_chat_completions_request(
96
96
  max_tokens: Optional[int],
97
97
  ) -> ChatCompletionRequest:
98
98
  if functions and llm_config.put_inner_thoughts_in_kwargs:
99
+ # Special case for LM Studio backend since it needs extra guidance to force out the thoughts first
100
+ # TODO(fix)
101
+ inner_thoughts_desc = (
102
+ INNER_THOUGHTS_KWARG_DESCRIPTION_GO_FIRST if ":1234" in llm_config.model_endpoint else INNER_THOUGHTS_KWARG_DESCRIPTION
103
+ )
99
104
  functions = add_inner_thoughts_to_functions(
100
105
  functions=functions,
101
106
  inner_thoughts_key=INNER_THOUGHTS_KWARG,
102
- inner_thoughts_description=INNER_THOUGHTS_KWARG_DESCRIPTION,
107
+ inner_thoughts_description=inner_thoughts_desc,
103
108
  )
104
109
 
105
110
  openai_message_list = [
@@ -27,6 +27,7 @@ DEFAULT_WRAPPER_NAME = "chatml"
27
27
 
28
28
  INNER_THOUGHTS_KWARG = "inner_thoughts"
29
29
  INNER_THOUGHTS_KWARG_DESCRIPTION = "Deep inner monologue private to you only."
30
+ INNER_THOUGHTS_KWARG_DESCRIPTION_GO_FIRST = f"Deep inner monologue private to you only. Think before you act, so always generate arg '{INNER_THOUGHTS_KWARG}' first before any other arg."
30
31
  INNER_THOUGHTS_CLI_SYMBOL = "💭"
31
32
 
32
33
  ASSISTANT_MESSAGE_CLI_SYMBOL = "🤖"
letta/schemas/message.py CHANGED
@@ -1,6 +1,7 @@
1
1
  import copy
2
2
  import json
3
3
  import warnings
4
+ from collections import OrderedDict
4
5
  from datetime import datetime, timezone
5
6
  from typing import Any, Dict, List, Literal, Optional, Union
6
7
 
@@ -33,18 +34,18 @@ def add_inner_thoughts_to_tool_call(
33
34
  inner_thoughts_key: str,
34
35
  ) -> OpenAIToolCall:
35
36
  """Add inner thoughts (arg + value) to a tool call"""
36
- # because the kwargs are stored as strings, we need to load then write the JSON dicts
37
37
  try:
38
38
  # load the args list
39
39
  func_args = json.loads(tool_call.function.arguments)
40
- # add the inner thoughts to the args list
41
- func_args[inner_thoughts_key] = inner_thoughts
40
+ # create new ordered dict with inner thoughts first
41
+ ordered_args = OrderedDict({inner_thoughts_key: inner_thoughts})
42
+ # update with remaining args
43
+ ordered_args.update(func_args)
42
44
  # create the updated tool call (as a string)
43
45
  updated_tool_call = copy.deepcopy(tool_call)
44
- updated_tool_call.function.arguments = json_dumps(func_args)
46
+ updated_tool_call.function.arguments = json_dumps(ordered_args)
45
47
  return updated_tool_call
46
48
  except json.JSONDecodeError as e:
47
- # TODO: change to logging
48
49
  warnings.warn(f"Failed to put inner thoughts in kwargs: {e}")
49
50
  raise e
50
51