xgae 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.

Potentially problematic release: this version of xgae might be problematic.

@@ -0,0 +1,830 @@
1
+ import asyncio
2
+ import json
3
+ import logging
4
+ import uuid
5
+
6
+ from dataclasses import dataclass
7
+ from datetime import datetime, timezone
8
+ from typing import List, Dict, Any, Optional, AsyncGenerator, override
9
+
10
+ from xgae.engine.responser.xga_responser_base import TaskResponseProcessor, ToolExecutionStrategy, XmlAddingStrategy, TaskResponseContext, TaskRunContinuousState
11
+ from xgae.utils.json_helpers import (
12
+ ensure_dict, safe_json_parse,
13
+ to_json_string, format_for_yield
14
+ )
15
+
16
+ @dataclass
17
+ class ProcessorConfig:
18
+ """
19
+ Configuration for response processing and tool execution.
20
+
21
+ This class controls how the LLM's responses are processed, including how tool calls
22
+ are detected, executed, and their results handled.
23
+
24
+ Attributes:
25
+ xml_tool_calling: Enable XML-based tool call detection (<tool>...</tool>)
26
+ native_tool_calling: Enable OpenAI-style function calling format
27
+ execute_tools: Whether to automatically execute detected tool calls
28
+ execute_on_stream: For streaming, execute tools as they appear vs. at the end
29
+ tool_execution_strategy: How to execute multiple tools ("sequential" or "parallel")
30
+ xml_adding_strategy: How to add XML tool results to the conversation
31
+ max_xml_tool_calls: Maximum number of XML tool calls to process (0 = no limit)
32
+ """
33
+
34
+ xml_tool_calling: bool = True
35
+ native_tool_calling: bool = False
36
+
37
+ execute_tools: bool = True
38
+ execute_on_stream: bool = False
39
+ tool_execution_strategy: ToolExecutionStrategy = "sequential"
40
+ xml_adding_strategy: XmlAddingStrategy = "assistant_message"
41
+ max_xml_tool_calls: int = 0 # 0 means no limit
42
+
43
+ def __post_init__(self):
44
+ """Validate configuration after initialization."""
45
+ if not (self.xml_tool_calling or self.native_tool_calling) and self.execute_tools:
46
+ raise ValueError(
47
+ "At least one tool calling format (XML or native) must be enabled if execute_tools is True")
48
+
49
+ if self.xml_adding_strategy not in ["user_message", "assistant_message", "inline_edit"]:
50
+ raise ValueError("xml_adding_strategy must be 'user_message', 'assistant_message', or 'inline_edit'")
51
+
52
+ if self.max_xml_tool_calls < 0:
53
+ raise ValueError("max_xml_tool_calls must be a non-negative integer (0 = no limit)")
54
+
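Editor's note: the following sketch is illustrative only and not part of the package diff. Assuming ProcessorConfig is imported from this module, the defaults and the __post_init__ validation above behave roughly like this:

    config = ProcessorConfig()  # XML tool calling on, native off, sequential execution, no XML call limit
    config = ProcessorConfig(native_tool_calling=True, tool_execution_strategy="parallel", max_xml_tool_calls=3)

    # Inconsistent settings are rejected at construction time:
    ProcessorConfig(xml_tool_calling=False, native_tool_calling=False)  # ValueError: no tool calling format enabled while execute_tools is True
    ProcessorConfig(max_xml_tool_calls=-1)                              # ValueError: must be non-negative (0 = no limit)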
55
+
56
+
57
+ class StreamTaskResponser(TaskResponseProcessor):
58
+ def __init__(self, response_context: TaskResponseContext):
59
+ super().__init__(response_context)
60
+
61
+ @override
62
+ async def process_response(
63
+ self,
64
+ llm_response: AsyncGenerator,
65
+ prompt_messages: List[Dict[str, Any]],
66
+ continuous_state: Optional[TaskRunContinuousState] = None,
67
+ ) -> AsyncGenerator[Dict[str, Any], None]:
68
+ """Process a streaming LLM response, handling tool calls and execution.
69
+
70
+ Args:
71
+ llm_response: Streaming response from the LLM
72
+ prompt_messages: List of messages sent to the LLM (the prompt)
73
+ continuous_state: Carry-over state from a previous auto-continue cycle (accumulated content, sequence, thread_run_id, and auto-continue flags)
79
+
80
+ Yields:
81
+ Complete message objects matching the DB schema; transient content chunks are yielded but not saved.
82
+ """
83
+ # Initialize from continuous state if provided (for auto-continue)
84
+ continuous_state = continuous_state or {}  # guard against None before reading state below
85
+ can_auto_continue = continuous_state.get("auto_continue", False)
86
+ auto_continue_count = continuous_state.get("auto_continue_count", 0)
87
+ llm_model = self.response_context.get("model_name")
88
+ config: ProcessorConfig = ProcessorConfig()
89
+ thread_id = self.response_context.get("task_id")
90
+
91
+ accumulated_content = continuous_state.get('accumulated_content', "")
92
+ tool_calls_buffer = {}
93
+ current_xml_content = accumulated_content # equal to accumulated_content if auto-continuing, else blank
94
+ xml_chunks_buffer = []
95
+ pending_tool_executions = []
96
+ yielded_tool_indices = set() # Stores indices of tools whose *status* has been yielded
97
+ tool_index = 0
98
+ xml_tool_call_count = 0
99
+ finish_reason = None
100
+ should_auto_continue = False
101
+ last_assistant_message_object = None # Store the final saved assistant message object
102
+ tool_result_message_objects = {} # tool_index -> full saved message object
103
+ has_printed_thinking_prefix = False # Flag for printing thinking prefix only once
104
+ agent_should_terminate = False # Flag to track if a terminating tool has been executed
105
+ complete_native_tool_calls = [] # Initialize early for use in assistant_response_end
106
+
107
+ # Collect metadata for reconstructing LiteLLM response object
108
+ streaming_metadata = {
109
+ "model": llm_model,
110
+ "created": None,
111
+ "usage": {
112
+ "prompt_tokens": 0,
113
+ "completion_tokens": 0,
114
+ "total_tokens": 0
115
+ },
116
+ "response_ms": None,
117
+ "first_chunk_time": None,
118
+ "last_chunk_time": None
119
+ }
120
+
121
+ logging.info(f"Streaming Config: XML={config.xml_tool_calling}, Native={config.native_tool_calling}, "
122
+ f"Execute on stream={config.execute_on_stream}, Strategy={config.tool_execution_strategy}")
123
+
124
+ # Reuse thread_run_id for auto-continue or create new one
125
+ thread_run_id = continuous_state.get('thread_run_id') or str(uuid.uuid4())
126
+ continuous_state['thread_run_id'] = thread_run_id
127
+
128
+ try:
129
+ # --- Save and Yield Start Events (only if not auto-continuing) ---
130
+ if auto_continue_count == 0:
131
+ start_content = {"status_type": "thread_run_start", "thread_run_id": thread_run_id}
132
+ start_msg_obj = await self.add_message(
133
+ type="status", content=start_content,
134
+ is_llm_message=False, metadata={"thread_run_id": thread_run_id}
135
+ )
136
+ if start_msg_obj: yield format_for_yield(start_msg_obj)
137
+
138
+ assist_start_content = {"status_type": "assistant_response_start"}
139
+ assist_start_msg_obj = await self.add_message(
140
+ type="status", content=assist_start_content,
141
+ is_llm_message=False, metadata={"thread_run_id": thread_run_id}
142
+ )
143
+ if assist_start_msg_obj: yield format_for_yield(assist_start_msg_obj)
144
+ # --- End Start Events ---
145
+
146
+ __sequence = continuous_state.get('sequence', 0) # get the sequence from the previous auto-continue cycle
147
+
148
+ async for chunk in llm_response:
149
+ # Extract streaming metadata from chunks
150
+ current_time = datetime.now(timezone.utc).timestamp()
151
+ if streaming_metadata["first_chunk_time"] is None:
152
+ streaming_metadata["first_chunk_time"] = current_time
153
+ streaming_metadata["last_chunk_time"] = current_time
154
+
155
+ # Extract metadata from chunk attributes
156
+ if hasattr(chunk, 'created') and chunk.created:
157
+ streaming_metadata["created"] = chunk.created
158
+ if hasattr(chunk, 'model') and chunk.model:
159
+ streaming_metadata["model"] = chunk.model
160
+ if hasattr(chunk, 'usage') and chunk.usage:
161
+ # Update usage information if available (including zero values)
162
+ if hasattr(chunk.usage, 'prompt_tokens') and chunk.usage.prompt_tokens is not None:
163
+ streaming_metadata["usage"]["prompt_tokens"] = chunk.usage.prompt_tokens
164
+ if hasattr(chunk.usage, 'completion_tokens') and chunk.usage.completion_tokens is not None:
165
+ streaming_metadata["usage"]["completion_tokens"] = chunk.usage.completion_tokens
166
+ if hasattr(chunk.usage, 'total_tokens') and chunk.usage.total_tokens is not None:
167
+ streaming_metadata["usage"]["total_tokens"] = chunk.usage.total_tokens
168
+
169
+ if hasattr(chunk, 'choices') and chunk.choices and hasattr(chunk.choices[0], 'finish_reason') and \
170
+ chunk.choices[0].finish_reason:
171
+ finish_reason = chunk.choices[0].finish_reason
172
+ logging.debug(f"Detected finish_reason: {finish_reason}")
173
+
174
+ if hasattr(chunk, 'choices') and chunk.choices:
175
+ delta = chunk.choices[0].delta if hasattr(chunk.choices[0], 'delta') else None
176
+
177
+ # Check for and log Anthropic thinking content
178
+ if delta and hasattr(delta, 'reasoning_content') and delta.reasoning_content:
179
+ if not has_printed_thinking_prefix:
180
+ # print("[THINKING]: ", end='', flush=True)
181
+ has_printed_thinking_prefix = True
182
+ # print(delta.reasoning_content, end='', flush=True)
183
+ # Append reasoning to main content to be saved in the final message
184
+ accumulated_content += delta.reasoning_content
185
+
186
+ # Process content chunk
187
+ if delta and hasattr(delta, 'content') and delta.content:
188
+ chunk_content = delta.content
189
+ # print(chunk_content, end='', flush=True)
190
+ accumulated_content += chunk_content
191
+ current_xml_content += chunk_content
192
+
193
+ if not (config.max_xml_tool_calls > 0 and xml_tool_call_count >= config.max_xml_tool_calls):
194
+ # Yield ONLY content chunk (don't save)
195
+ now_chunk = datetime.now(timezone.utc).isoformat()
196
+ yield {
197
+ "sequence": __sequence,
198
+ "message_id": None, "thread_id": thread_id, "type": "assistant",
199
+ "is_llm_message": True,
200
+ "content": to_json_string({"role": "assistant", "content": chunk_content}),
201
+ "metadata": to_json_string({"stream_status": "chunk", "thread_run_id": thread_run_id}),
202
+ "created_at": now_chunk, "updated_at": now_chunk
203
+ }
204
+ __sequence += 1
205
+ else:
206
+ logging.info("XML tool call limit reached - not yielding more content chunks")
207
+ self.trace.event(name="xml_tool_call_limit_reached", level="DEFAULT", status_message=(
208
+ f"XML tool call limit reached - not yielding more content chunks"))
209
+
210
+ # --- Process XML Tool Calls (if enabled and limit not reached) ---
211
+ if config.xml_tool_calling and not (
212
+ config.max_xml_tool_calls > 0 and xml_tool_call_count >= config.max_xml_tool_calls):
213
+ xml_chunks = self._extract_xml_chunks(current_xml_content)
214
+ for xml_chunk in xml_chunks:
215
+ current_xml_content = current_xml_content.replace(xml_chunk, "", 1)
216
+ xml_chunks_buffer.append(xml_chunk)
217
+ result = self._parse_xml_tool_call(xml_chunk)
218
+ if result:
219
+ tool_call, parsing_details = result
220
+ xml_tool_call_count += 1
221
+ current_assistant_id = last_assistant_message_object[
222
+ 'message_id'] if last_assistant_message_object else None
223
+ context = self._create_tool_context(
224
+ tool_call, tool_index, current_assistant_id, parsing_details
225
+ )
226
+
227
+ if config.execute_tools and config.execute_on_stream:
228
+ # Save and Yield tool_started status
229
+ started_msg_obj = await self._yield_and_save_tool_started(context, thread_id,
230
+ thread_run_id)
231
+ if started_msg_obj: yield format_for_yield(started_msg_obj)
232
+ yielded_tool_indices.add(tool_index) # Mark status as yielded
233
+
234
+ execution_task = asyncio.create_task(self._execute_tool(tool_call))
235
+ pending_tool_executions.append({
236
+ "task": execution_task, "tool_call": tool_call,
237
+ "tool_index": tool_index, "context": context
238
+ })
239
+ tool_index += 1
240
+
241
+ if config.max_xml_tool_calls > 0 and xml_tool_call_count >= config.max_xml_tool_calls:
242
+ logging.debug(f"Reached XML tool call limit ({config.max_xml_tool_calls})")
243
+ finish_reason = "xml_tool_limit_reached"
244
+ break # Stop processing more XML chunks in this delta
245
+
246
+ # --- Process Native Tool Call Chunks ---
247
+ if config.native_tool_calling and delta and hasattr(delta, 'tool_calls') and delta.tool_calls:
248
+ for tool_call_chunk in delta.tool_calls:
249
+ # Yield Native Tool Call Chunk (transient status, not saved)
250
+ # ... (safe extraction logic for tool_call_data_chunk) ...
251
+ tool_call_data_chunk = {} # Placeholder for extracted data
252
+ if hasattr(tool_call_chunk, 'model_dump'):
253
+ tool_call_data_chunk = tool_call_chunk.model_dump()
254
+ else: # Manual extraction...
255
+ if hasattr(tool_call_chunk, 'id'): tool_call_data_chunk['id'] = tool_call_chunk.id
256
+ if hasattr(tool_call_chunk, 'index'): tool_call_data_chunk[
257
+ 'index'] = tool_call_chunk.index
258
+ if hasattr(tool_call_chunk, 'type'): tool_call_data_chunk['type'] = tool_call_chunk.type
259
+ if hasattr(tool_call_chunk, 'function'):
260
+ tool_call_data_chunk['function'] = {}
261
+ if hasattr(tool_call_chunk.function, 'name'): tool_call_data_chunk['function'][
262
+ 'name'] = tool_call_chunk.function.name
263
+ if hasattr(tool_call_chunk.function, 'arguments'): tool_call_data_chunk['function'][
264
+ 'arguments'] = tool_call_chunk.function.arguments if isinstance(
265
+ tool_call_chunk.function.arguments, str) else to_json_string(
266
+ tool_call_chunk.function.arguments)
267
+
268
+ now_tool_chunk = datetime.now(timezone.utc).isoformat()
269
+ yield {
270
+ "message_id": None, "thread_id": thread_id, "type": "status", "is_llm_message": True,
271
+ "content": to_json_string({"role": "assistant", "status_type": "tool_call_chunk",
272
+ "tool_call_chunk": tool_call_data_chunk}),
273
+ "metadata": to_json_string({"thread_run_id": thread_run_id}),
274
+ "created_at": now_tool_chunk, "updated_at": now_tool_chunk
275
+ }
276
+
277
+ # --- Buffer and Execute Complete Native Tool Calls ---
278
+ if not hasattr(tool_call_chunk, 'function'): continue
279
+ idx = tool_call_chunk.index if hasattr(tool_call_chunk, 'index') else 0
280
+ # ... (buffer update logic remains same) ...
281
+ # ... (check complete logic remains same) ...
282
+ has_complete_tool_call = False # Placeholder
283
+ if (tool_calls_buffer.get(idx) and
284
+ tool_calls_buffer[idx]['id'] and
285
+ tool_calls_buffer[idx]['function']['name'] and
286
+ tool_calls_buffer[idx]['function']['arguments']):
287
+ try:
288
+ safe_json_parse(tool_calls_buffer[idx]['function']['arguments'])
289
+ has_complete_tool_call = True
290
+ except json.JSONDecodeError:
291
+ pass
292
+
293
+ if has_complete_tool_call and config.execute_tools and config.execute_on_stream:
294
+ current_tool = tool_calls_buffer[idx]
295
+ tool_call_data = {
296
+ "function_name": current_tool['function']['name'],
297
+ "arguments": safe_json_parse(current_tool['function']['arguments']),
298
+ "id": current_tool['id']
299
+ }
300
+ current_assistant_id = last_assistant_message_object[
301
+ 'message_id'] if last_assistant_message_object else None
302
+ context = self._create_tool_context(
303
+ tool_call_data, tool_index, current_assistant_id
304
+ )
305
+
306
+ # Save and Yield tool_started status
307
+ started_msg_obj = await self._yield_and_save_tool_started(context, thread_id,
308
+ thread_run_id)
309
+ if started_msg_obj: yield format_for_yield(started_msg_obj)
310
+ yielded_tool_indices.add(tool_index) # Mark status as yielded
311
+
312
+ execution_task = asyncio.create_task(self._execute_tool(tool_call_data))
313
+ pending_tool_executions.append({
314
+ "task": execution_task, "tool_call": tool_call_data,
315
+ "tool_index": tool_index, "context": context
316
+ })
317
+ tool_index += 1
318
+
319
+ if finish_reason == "xml_tool_limit_reached":
320
+ logging.info("Stopping stream processing after loop due to XML tool call limit")
321
+ self.trace.event(name="stopping_stream_processing_after_loop_due_to_xml_tool_call_limit",
322
+ level="DEFAULT", status_message=(
323
+ f"Stopping stream processing after loop due to XML tool call limit"))
324
+ break
325
+
326
+ # print() # Add a final newline after the streaming loop finishes
327
+
328
+ # --- After Streaming Loop ---
329
+
330
+ if (
331
+ streaming_metadata["usage"]["total_tokens"] == 0
332
+ ):
333
+ logging.info("🔥 No usage data from provider, counting with litellm.token_counter")
334
+
335
+ try:
336
+ # prompt side
337
+ # prompt_tokens = token_counter(
338
+ # model=llm_model,
339
+ # messages=prompt_messages # chat or plain; token_counter handles both
340
+ # )
341
+ #
342
+ # # completion side
343
+ # completion_tokens = token_counter(
344
+ # model=llm_model,
345
+ # text=accumulated_content or "" # empty string safe
346
+ # )
347
+
348
+ # streaming_metadata["usage"]["prompt_tokens"] = prompt_tokens
349
+ # streaming_metadata["usage"]["completion_tokens"] = completion_tokens
350
+ # streaming_metadata["usage"]["total_tokens"] = prompt_tokens + completion_tokens
351
+ #
352
+ # logging.info(
353
+ # f"🔥 Estimated tokens – prompt: {prompt_tokens}, "
354
+ # f"completion: {completion_tokens}, total: {prompt_tokens + completion_tokens}"
355
+ # )
356
+ self.trace.event(name="usage_calculated_with_litellm_token_counter", level="DEFAULT",
357
+ status_message=(f"Usage calculated with litellm.token_counter"))
358
+ except Exception as e:
359
+ logging.warning(f"Failed to calculate usage: {str(e)}")
360
+ self.trace.event(name="failed_to_calculate_usage", level="WARNING",
361
+ status_message=(f"Failed to calculate usage: {str(e)}"))
362
+
363
+ # Wait for pending tool executions from streaming phase
364
+ tool_results_buffer = [] # Stores (tool_call, result, tool_index, context)
365
+ if pending_tool_executions:
366
+ logging.info(f"Waiting for {len(pending_tool_executions)} pending streamed tool executions")
367
+ self.trace.event(name="waiting_for_pending_streamed_tool_executions", level="DEFAULT", status_message=(
368
+ f"Waiting for {len(pending_tool_executions)} pending streamed tool executions"))
369
+ # ... (asyncio.wait logic) ...
370
+ pending_tasks = [execution["task"] for execution in pending_tool_executions]
371
+ done, _ = await asyncio.wait(pending_tasks)
372
+
373
+ for execution in pending_tool_executions:
374
+ tool_idx = execution.get("tool_index", -1)
375
+ context = execution["context"]
376
+ tool_name = context.function_name
377
+
378
+ # Check if status was already yielded during stream run
379
+ if tool_idx in yielded_tool_indices:
380
+ logging.debug(f"Status for tool index {tool_idx} already yielded.")
381
+ # Still need to process the result for the buffer
382
+ try:
383
+ if execution["task"].done():
384
+ result = execution["task"].result()
385
+ context.result = result
386
+ tool_results_buffer.append((execution["tool_call"], result, tool_idx, context))
387
+
388
+ if tool_name in ['ask', 'complete']:
389
+ logging.info(
390
+ f"Terminating tool '{tool_name}' completed during streaming. Setting termination flag.")
391
+ self.trace.event(name="terminating_tool_completed_during_streaming",
392
+ level="DEFAULT", status_message=(
393
+ f"Terminating tool '{tool_name}' completed during streaming. Setting termination flag."))
394
+ agent_should_terminate = True
395
+
396
+ else: # Should not happen with asyncio.wait
397
+ logging.warning(f"Task for tool index {tool_idx} not done after wait.")
398
+ self.trace.event(name="task_for_tool_index_not_done_after_wait", level="WARNING",
399
+ status_message=(
400
+ f"Task for tool index {tool_idx} not done after wait."))
401
+ except Exception as e:
402
+ logging.error(f"Error getting result for pending tool execution {tool_idx}: {str(e)}")
403
+ self.trace.event(name="error_getting_result_for_pending_tool_execution", level="ERROR",
404
+ status_message=(
405
+ f"Error getting result for pending tool execution {tool_idx}: {str(e)}"))
406
+ context.error = e
407
+ # Save and Yield tool error status message (even if started was yielded)
408
+ error_msg_obj = await self._yield_and_save_tool_error(context, thread_id, thread_run_id)
409
+ if error_msg_obj: yield format_for_yield(error_msg_obj)
410
+ continue # Skip further status yielding for this tool index
411
+
412
+ # If status wasn't yielded before (shouldn't happen with current logic), yield it now
413
+ try:
414
+ if execution["task"].done():
415
+ result = execution["task"].result()
416
+ context.result = result
417
+ tool_results_buffer.append((execution["tool_call"], result, tool_idx, context))
418
+
419
+ # Check if this is a terminating tool
420
+ if tool_name in ['ask', 'complete']:
421
+ logging.info(
422
+ f"Terminating tool '{tool_name}' completed during streaming. Setting termination flag.")
423
+ self.trace.event(name="terminating_tool_completed_during_streaming", level="DEFAULT",
424
+ status_message=(
425
+ f"Terminating tool '{tool_name}' completed during streaming. Setting termination flag."))
426
+ agent_should_terminate = True
427
+
428
+ # Save and Yield tool completed/failed status
429
+ completed_msg_obj = await self._yield_and_save_tool_completed(
430
+ context, None, thread_id, thread_run_id
431
+ )
432
+ if completed_msg_obj: yield format_for_yield(completed_msg_obj)
433
+ yielded_tool_indices.add(tool_idx)
434
+ except Exception as e:
435
+ logging.error(
436
+ f"Error getting result/yielding status for pending tool execution {tool_idx}: {str(e)}")
437
+ self.trace.event(name="error_getting_result_yielding_status_for_pending_tool_execution",
438
+ level="ERROR", status_message=(
439
+ f"Error getting result/yielding status for pending tool execution {tool_idx}: {str(e)}"))
440
+ context.error = e
441
+ # Save and Yield tool error status
442
+ error_msg_obj = await self._yield_and_save_tool_error(context, thread_id, thread_run_id)
443
+ if error_msg_obj: yield format_for_yield(error_msg_obj)
444
+ yielded_tool_indices.add(tool_idx)
445
+
446
+ # Save and yield finish status if limit was reached
447
+ if finish_reason == "xml_tool_limit_reached":
448
+ finish_content = {"status_type": "finish", "finish_reason": "xml_tool_limit_reached"}
449
+ finish_msg_obj = await self.add_message(
450
+ type="status", content=finish_content,
451
+ is_llm_message=False, metadata={"thread_run_id": thread_run_id}
452
+ )
453
+ if finish_msg_obj: yield format_for_yield(finish_msg_obj)
454
+ logging.info(
455
+ f"Stream finished with reason: xml_tool_limit_reached after {xml_tool_call_count} XML tool calls")
456
+ self.trace.event(name="stream_finished_with_reason_xml_tool_limit_reached_after_xml_tool_calls",
457
+ level="DEFAULT", status_message=(
458
+ f"Stream finished with reason: xml_tool_limit_reached after {xml_tool_call_count} XML tool calls"))
459
+
460
+ # Auto-continue only applies when it is enabled and the finish reason is 'length'
461
+ should_auto_continue = (can_auto_continue and finish_reason == 'length')
462
+
463
+ # --- SAVE and YIELD Final Assistant Message ---
464
+ # Only save assistant message if NOT auto-continuing due to length to avoid duplicate messages
465
+ if accumulated_content and not should_auto_continue:
466
+ # ... (Truncate accumulated_content logic) ...
467
+ if config.max_xml_tool_calls > 0 and xml_tool_call_count >= config.max_xml_tool_calls and xml_chunks_buffer:
468
+ last_xml_chunk = xml_chunks_buffer[-1]
469
+ last_chunk_end_pos = accumulated_content.find(last_xml_chunk) + len(last_xml_chunk)
470
+ if last_chunk_end_pos > 0:
471
+ accumulated_content = accumulated_content[:last_chunk_end_pos]
472
+
473
+ # ... (Extract complete_native_tool_calls logic) ...
474
+ # Update complete_native_tool_calls from buffer (initialized earlier)
475
+ if config.native_tool_calling:
476
+ for idx, tc_buf in tool_calls_buffer.items():
477
+ if tc_buf['id'] and tc_buf['function']['name'] and tc_buf['function']['arguments']:
478
+ try:
479
+ args = safe_json_parse(tc_buf['function']['arguments'])
480
+ complete_native_tool_calls.append({
481
+ "id": tc_buf['id'], "type": "function",
482
+ "function": {"name": tc_buf['function']['name'], "arguments": args}
483
+ })
484
+ except json.JSONDecodeError:
485
+ continue
486
+
487
+ message_data = { # Dict to be saved in 'content'
488
+ "role": "assistant", "content": accumulated_content,
489
+ "tool_calls": complete_native_tool_calls or None
490
+ }
491
+
492
+ last_assistant_message_object = await self._add_message_with_agent_info(type="assistant", content=message_data,
493
+ is_llm_message=True, metadata={"thread_run_id": thread_run_id}
494
+ )
495
+
496
+ if last_assistant_message_object:
497
+ # Yield the complete saved object, adding stream_status metadata just for yield
498
+ yield_metadata = ensure_dict(last_assistant_message_object.get('metadata'), {})
499
+ yield_metadata['stream_status'] = 'complete'
500
+ # Format the message for yielding
501
+ yield_message = last_assistant_message_object.copy()
502
+ yield_message['metadata'] = yield_metadata
503
+ yield format_for_yield(yield_message)
504
+ else:
505
+ logging.error(f"Failed to save final assistant message for thread {thread_id}")
506
+ self.trace.event(name="failed_to_save_final_assistant_message_for_thread", level="ERROR",
507
+ status_message=(f"Failed to save final assistant message for thread {thread_id}"))
508
+ # Save and yield an error status
509
+ err_content = {"role": "system", "status_type": "error",
510
+ "message": "Failed to save final assistant message"}
511
+ err_msg_obj = await self.add_message(
512
+ type="status", content=err_content,
513
+ is_llm_message=False, metadata={"thread_run_id": thread_run_id}
514
+ )
515
+ if err_msg_obj: yield format_for_yield(err_msg_obj)
516
+
517
+ # --- Process All Tool Results Now ---
518
+ if config.execute_tools:
519
+ final_tool_calls_to_process = []
520
+ # ... (Gather final_tool_calls_to_process from native and XML buffers) ...
521
+ # Gather native tool calls from buffer
522
+ if config.native_tool_calling and complete_native_tool_calls:
523
+ for tc in complete_native_tool_calls:
524
+ final_tool_calls_to_process.append({
525
+ "function_name": tc["function"]["name"],
526
+ "arguments": tc["function"]["arguments"], # Already parsed object
527
+ "id": tc["id"]
528
+ })
529
+ # Gather XML tool calls from buffer (up to limit)
530
+ parsed_xml_data = []
531
+ if config.xml_tool_calling:
532
+ # Reparse remaining content just in case (should be empty if processed correctly)
533
+ xml_chunks = self._extract_xml_chunks(current_xml_content)
534
+ xml_chunks_buffer.extend(xml_chunks)
535
+ # Process only chunks not already handled in the stream loop
536
+ remaining_limit = config.max_xml_tool_calls - xml_tool_call_count if config.max_xml_tool_calls > 0 else len(
537
+ xml_chunks_buffer)
538
+ xml_chunks_to_process = xml_chunks_buffer[:remaining_limit] # Ensure limit is respected
539
+
540
+ for chunk in xml_chunks_to_process:
541
+ parsed_result = self._parse_xml_tool_call(chunk)
542
+ if parsed_result:
543
+ tool_call, parsing_details = parsed_result
544
+ # Avoid adding if already processed during streaming
545
+ if not any(exec['tool_call'] == tool_call for exec in pending_tool_executions):
546
+ final_tool_calls_to_process.append(tool_call)
547
+ parsed_xml_data.append({'tool_call': tool_call, 'parsing_details': parsing_details})
548
+
549
+ all_tool_data_map = {} # tool_index -> {'tool_call': ..., 'parsing_details': ...}
550
+ # Add native tool data
551
+ native_tool_index = 0
552
+ if config.native_tool_calling and complete_native_tool_calls:
553
+ for tc in complete_native_tool_calls:
554
+ # Find the corresponding entry in final_tool_calls_to_process if needed
555
+ # For now, assume order matches if only native used
556
+ exec_tool_call = {
557
+ "function_name": tc["function"]["name"],
558
+ "arguments": tc["function"]["arguments"],
559
+ "id": tc["id"]
560
+ }
561
+ all_tool_data_map[native_tool_index] = {"tool_call": exec_tool_call, "parsing_details": None}
562
+ native_tool_index += 1
563
+
564
+ # Add XML tool data
565
+ xml_tool_index_start = native_tool_index
566
+ for idx, item in enumerate(parsed_xml_data):
567
+ all_tool_data_map[xml_tool_index_start + idx] = item
568
+
569
+ tool_results_map = {} # tool_index -> (tool_call, result, context)
570
+
571
+ # Populate from buffer if executed on stream
572
+ if config.execute_on_stream and tool_results_buffer:
573
+ logging.info(f"Processing {len(tool_results_buffer)} buffered tool results")
574
+ self.trace.event(name="processing_buffered_tool_results", level="DEFAULT",
575
+ status_message=(f"Processing {len(tool_results_buffer)} buffered tool results"))
576
+ for tool_call, result, tool_idx, context in tool_results_buffer:
577
+ if last_assistant_message_object: context.assistant_message_id = last_assistant_message_object[
578
+ 'message_id']
579
+ tool_results_map[tool_idx] = (tool_call, result, context)
580
+
581
+ # Or execute now if not streamed
582
+ elif final_tool_calls_to_process and not config.execute_on_stream:
583
+ logging.info(
584
+ f"Executing {len(final_tool_calls_to_process)} tools ({config.tool_execution_strategy}) after stream")
585
+ self.trace.event(name="executing_tools_after_stream", level="DEFAULT", status_message=(
586
+ f"Executing {len(final_tool_calls_to_process)} tools ({config.tool_execution_strategy}) after stream"))
587
+ results_list = await self._execute_tools(final_tool_calls_to_process,
588
+ config.tool_execution_strategy)
589
+ current_tool_idx = 0
590
+ for tc, res in results_list:
591
+ # Map back using all_tool_data_map which has correct indices
592
+ if current_tool_idx in all_tool_data_map:
593
+ tool_data = all_tool_data_map[current_tool_idx]
594
+ context = self._create_tool_context(
595
+ tc, current_tool_idx,
596
+ last_assistant_message_object['message_id'] if last_assistant_message_object else None,
597
+ tool_data.get('parsing_details')
598
+ )
599
+ context.result = res
600
+ tool_results_map[current_tool_idx] = (tc, res, context)
601
+ else:
602
+ logging.warning(f"Could not map result for tool index {current_tool_idx}")
603
+ self.trace.event(name="could_not_map_result_for_tool_index", level="WARNING",
604
+ status_message=(f"Could not map result for tool index {current_tool_idx}"))
605
+ current_tool_idx += 1
606
+
607
+ # Save and Yield each result message
608
+ if tool_results_map:
609
+ logging.info(f"Saving and yielding {len(tool_results_map)} final tool result messages")
610
+ self.trace.event(name="saving_and_yielding_final_tool_result_messages", level="DEFAULT",
611
+ status_message=(
612
+ f"Saving and yielding {len(tool_results_map)} final tool result messages"))
613
+ for tool_idx in sorted(tool_results_map.keys()):
614
+ tool_call, result, context = tool_results_map[tool_idx]
615
+ context.result = result
616
+ if not context.assistant_message_id and last_assistant_message_object:
617
+ context.assistant_message_id = last_assistant_message_object['message_id']
618
+
619
+ # Yield start status ONLY IF executing non-streamed (already yielded if streamed)
620
+ if not config.execute_on_stream and tool_idx not in yielded_tool_indices:
621
+ started_msg_obj = await self._yield_and_save_tool_started(context, thread_id, thread_run_id)
622
+ if started_msg_obj: yield format_for_yield(started_msg_obj)
623
+ yielded_tool_indices.add(tool_idx) # Mark status yielded
624
+
625
+ # Save the tool result message to DB
626
+ saved_tool_result_object = await self._add_tool_result( # Returns full object or None
627
+ thread_id, tool_call, result, config.xml_adding_strategy,
628
+ context.assistant_message_id, context.parsing_details
629
+ )
630
+
631
+ # Yield completed/failed status (linked to saved result ID if available)
632
+ completed_msg_obj = await self._yield_and_save_tool_completed(
633
+ context,
634
+ saved_tool_result_object['message_id'] if saved_tool_result_object else None,
635
+ thread_id, thread_run_id
636
+ )
637
+ if completed_msg_obj: yield format_for_yield(completed_msg_obj)
638
+ # Don't add to yielded_tool_indices here, completion status is separate yield
639
+
640
+ # Yield the saved tool result object
641
+ if saved_tool_result_object:
642
+ tool_result_message_objects[tool_idx] = saved_tool_result_object
643
+ yield format_for_yield(saved_tool_result_object)
644
+ else:
645
+ logging.error(
646
+ f"Failed to save tool result for index {tool_idx}, not yielding result message.")
647
+ self.trace.event(name="failed_to_save_tool_result_for_index", level="ERROR",
648
+ status_message=(
649
+ f"Failed to save tool result for index {tool_idx}, not yielding result message."))
650
+ # Optionally yield error status for saving failure?
651
+
652
+ # --- Final Finish Status ---
653
+ if finish_reason and finish_reason != "xml_tool_limit_reached":
654
+ finish_content = {"status_type": "finish", "finish_reason": finish_reason}
655
+ finish_msg_obj = await self.add_message(
656
+ type="status", content=finish_content,
657
+ is_llm_message=False, metadata={"thread_run_id": thread_run_id}
658
+ )
659
+ if finish_msg_obj: yield format_for_yield(finish_msg_obj)
660
+
661
+ # Check if agent should terminate after processing pending tools
662
+ if agent_should_terminate:
663
+ logging.info(
664
+ "Agent termination requested after executing ask/complete tool. Stopping further processing.")
665
+ self.trace.event(name="agent_termination_requested", level="DEFAULT",
666
+ status_message="Agent termination requested after executing ask/complete tool. Stopping further processing.")
667
+
668
+ # Set finish reason to indicate termination
669
+ finish_reason = "agent_terminated"
670
+
671
+ # Save and yield termination status
672
+ finish_content = {"status_type": "finish", "finish_reason": "agent_terminated"}
673
+ finish_msg_obj = await self.add_message(
674
+ type="status", content=finish_content,
675
+ is_llm_message=False, metadata={"thread_run_id": thread_run_id}
676
+ )
677
+ if finish_msg_obj: yield format_for_yield(finish_msg_obj)
678
+
679
+ # Save assistant_response_end BEFORE terminating
680
+ if last_assistant_message_object:
681
+ try:
682
+ # Calculate response time if we have timing data
683
+ if streaming_metadata["first_chunk_time"] and streaming_metadata["last_chunk_time"]:
684
+ streaming_metadata["response_ms"] = (streaming_metadata["last_chunk_time"] -
685
+ streaming_metadata["first_chunk_time"]) * 1000
686
+
687
+ # Create a LiteLLM-like response object for streaming (before termination)
688
+ # Check if we have any actual usage data
689
+ has_usage_data = (
690
+ streaming_metadata["usage"]["prompt_tokens"] > 0 or
691
+ streaming_metadata["usage"]["completion_tokens"] > 0 or
692
+ streaming_metadata["usage"]["total_tokens"] > 0
693
+ )
694
+
695
+ assistant_end_content = {
696
+ "choices": [
697
+ {
698
+ "finish_reason": finish_reason or "stop",
699
+ "index": 0,
700
+ "message": {
701
+ "role": "assistant",
702
+ "content": accumulated_content,
703
+ "tool_calls": complete_native_tool_calls or None
704
+ }
705
+ }
706
+ ],
707
+ "created": streaming_metadata.get("created"),
708
+ "model": streaming_metadata.get("model", llm_model),
709
+ "usage": streaming_metadata["usage"], # Always include usage like LiteLLM does
710
+ "streaming": True, # Add flag to indicate this was reconstructed from streaming
711
+ }
712
+
713
+ # Only include response_ms if we have timing data
714
+ if streaming_metadata.get("response_ms"):
715
+ assistant_end_content["response_ms"] = streaming_metadata["response_ms"]
716
+
717
+ await self.add_message(
718
+ type="assistant_response_end",
719
+ content=assistant_end_content,
720
+ is_llm_message=False,
721
+ metadata={"thread_run_id": thread_run_id}
722
+ )
723
+ logging.info("Assistant response end saved for stream (before termination)")
724
+ except Exception as e:
725
+ logging.error(f"Error saving assistant response end for stream (before termination): {str(e)}")
726
+ self.trace.event(name="error_saving_assistant_response_end_for_stream_before_termination",
727
+ level="ERROR", status_message=(
728
+ f"Error saving assistant response end for stream (before termination): {str(e)}"))
729
+
730
+ # Skip all remaining processing and go to finally block
731
+ return
732
+
733
+ # --- Save and Yield assistant_response_end ---
734
+ # Only save assistant_response_end if not auto-continuing (response is actually complete)
735
+ if not should_auto_continue:
736
+ if last_assistant_message_object: # Only save if assistant message was saved
737
+ try:
738
+ # Calculate response time if we have timing data
739
+ if streaming_metadata["first_chunk_time"] and streaming_metadata["last_chunk_time"]:
740
+ streaming_metadata["response_ms"] = (streaming_metadata["last_chunk_time"] -
741
+ streaming_metadata["first_chunk_time"]) * 1000
742
+
743
+ # Create a LiteLLM-like response object for streaming
744
+ # Check if we have any actual usage data
745
+ has_usage_data = (
746
+ streaming_metadata["usage"]["prompt_tokens"] > 0 or
747
+ streaming_metadata["usage"]["completion_tokens"] > 0 or
748
+ streaming_metadata["usage"]["total_tokens"] > 0
749
+ )
750
+
751
+ assistant_end_content = {
752
+ "choices": [
753
+ {
754
+ "finish_reason": finish_reason or "stop",
755
+ "index": 0,
756
+ "message": {
757
+ "role": "assistant",
758
+ "content": accumulated_content,
759
+ "tool_calls": complete_native_tool_calls or None
760
+ }
761
+ }
762
+ ],
763
+ "created": streaming_metadata.get("created"),
764
+ "model": streaming_metadata.get("model", llm_model),
765
+ "usage": streaming_metadata["usage"], # Always include usage like LiteLLM does
766
+ "streaming": True, # Add flag to indicate this was reconstructed from streaming
767
+ }
768
+
769
+ # Only include response_ms if we have timing data
770
+ if streaming_metadata.get("response_ms"):
771
+ assistant_end_content["response_ms"] = streaming_metadata["response_ms"]
772
+
773
+ await self.add_message(
774
+ type="assistant_response_end",
775
+ content=assistant_end_content,
776
+ is_llm_message=False,
777
+ metadata={"thread_run_id": thread_run_id}
778
+ )
779
+ logging.info("Assistant response end saved for stream")
780
+ except Exception as e:
781
+ logging.error(f"Error saving assistant response end for stream: {str(e)}")
782
+ self.trace.event(name="error_saving_assistant_response_end_for_stream", level="ERROR",
783
+ status_message=(f"Error saving assistant response end for stream: {str(e)}"))
784
+
785
+ except Exception as e:
786
+ logging.error(f"Error processing stream: {str(e)}", exc_info=True)
787
+ self.trace.event(name="error_processing_stream", level="ERROR",
788
+ status_message=(f"Error processing stream: {str(e)}"))
789
+ # Save and yield error status message
790
+
791
+ err_content = {"role": "system", "status_type": "error", "message": str(e)}
792
+ if (not "AnthropicException - Overloaded" in str(e)):
793
+ err_msg_obj = await self.add_message(
794
+ type="status", content=err_content,
795
+ is_llm_message=False,
796
+ metadata={"thread_run_id": thread_run_id if 'thread_run_id' in locals() else None}
797
+ )
798
+ if err_msg_obj: yield format_for_yield(err_msg_obj) # Yield the saved error message
799
+ # Re-raise the same exception (not a new one) to ensure proper error propagation
800
+ logging.critical(f"Re-raising error to stop further processing: {str(e)}")
801
+ self.trace.event(name="re_raising_error_to_stop_further_processing", level="ERROR",
802
+ status_message=(f"Re-raising error to stop further processing: {str(e)}"))
803
+ else:
804
+ logging.error(f"AnthropicException - Overloaded detected - Falling back to OpenRouter: {str(e)}",
805
+ exc_info=True)
806
+ self.trace.event(name="anthropic_exception_overloaded_detected", level="ERROR", status_message=(
807
+ f"AnthropicException - Overloaded detected - Falling back to OpenRouter: {str(e)}"))
808
+ raise # Use bare 'raise' to preserve the original exception with its traceback
809
+
810
+ finally:
811
+ # Update continuous state for potential auto-continue
812
+ if should_auto_continue:
813
+ continuous_state['accumulated_content'] = accumulated_content
814
+ continuous_state['sequence'] = __sequence
815
+
816
+ logging.info(f"Updated continuous state for auto-continue with {len(accumulated_content)} chars")
817
+ else:
818
+ # Save and Yield the final thread_run_end status (only if not auto-continuing and finish_reason is not 'length')
819
+ try:
820
+ end_content = {"status_type": "thread_run_end"}
821
+ end_msg_obj = await self.add_message(
822
+ type="status", content=end_content,
823
+ is_llm_message=False,
824
+ metadata={"thread_run_id": thread_run_id if 'thread_run_id' in locals() else None}
825
+ )
826
+ if end_msg_obj: yield format_for_yield(end_msg_obj)
827
+ except Exception as final_e:
828
+ logging.error(f"Error in finally block: {str(final_e)}", exc_info=True)
829
+ self.trace.event(name="error_in_finally_block", level="ERROR",
830
+ status_message=(f"Error in finally block: {str(final_e)}"))