xgae 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -0,0 +1,787 @@
1
+ import asyncio
2
+ import json
3
+ import logging
4
+ import uuid
5
+ from datetime import datetime, timezone
6
+ from typing import List, Dict, Any, Optional, AsyncGenerator, override
7
+
8
+ from xgae.engine.responser.xga_responser_base import TaskResponseProcessor, ProcessorConfig, TaskResponseContext, TaskRunContinuousState
9
+ from xgae.utils.json_helpers import (
10
+ ensure_dict, safe_json_parse,
11
+ to_json_string, format_for_yield
12
+ )
13
+
14
+ class StreamTaskResponser(TaskResponseProcessor):
15
+ def __init__(self, response_context: TaskResponseContext):
16
+ super().__init__(response_context)
17
+
18
+ @override
19
+ async def process_response(
20
+ self,
21
+ llm_response: AsyncGenerator,
22
+ prompt_messages: List[Dict[str, Any]],
23
+ continuous_state: Optional[TaskRunContinuousState] = None,
24
+ ) -> AsyncGenerator[Dict[str, Any], None]:
25
+ """Process a streaming LLM response, handling tool calls and execution.
26
+
27
+ Args:
28
+ llm_response: Streaming response from the LLM
29
+ prompt_messages: List of messages sent to the LLM (the prompt)
30
+ continuous_state: State carried over from a previous auto-continue cycle
31
+
32
+ Notes:
33
+ The thread ID and model name come from the response context; a default
34
+ ProcessorConfig is constructed internally; the auto-continue flag and
35
+ counter are taken from continuous_state.
36
+
37
+ Yields:
38
+ Complete message objects matching the DB schema, except for content chunks.
39
+ """
40
+ # Initialize from continuous state if provided (for auto-continue)
41
+ continuous_state = continuous_state or {}
42
+ can_auto_continue = continuous_state.get("auto_continue", False)
43
+ auto_continue_count = continuous_state.get("auto_continue_count", 0)
44
+ llm_model = self.response_context.get("model_name")
45
+ config: ProcessorConfig = ProcessorConfig()
46
+ thread_id = self.response_context.get("task_id")
47
+
48
+ accumulated_content = continuous_state.get('accumulated_content', "")
49
+ tool_calls_buffer = {}
50
+ current_xml_content = accumulated_content # equal to accumulated_content if auto-continuing, else blank
51
+ xml_chunks_buffer = []
52
+ pending_tool_executions = []
53
+ yielded_tool_indices = set() # Stores indices of tools whose *status* has been yielded
54
+ tool_index = 0
55
+ xml_tool_call_count = 0
56
+ finish_reason = None
57
+ should_auto_continue = False
58
+ last_assistant_message_object = None # Store the final saved assistant message object
59
+ tool_result_message_objects = {} # tool_index -> full saved message object
60
+ has_printed_thinking_prefix = False # Flag for printing thinking prefix only once
61
+ agent_should_terminate = False # Flag to track if a terminating tool has been executed
62
+ complete_native_tool_calls = [] # Initialize early for use in assistant_response_end
63
+
64
+ # Collect metadata for reconstructing LiteLLM response object
65
+ streaming_metadata = {
66
+ "model": llm_model,
67
+ "created": None,
68
+ "usage": {
69
+ "prompt_tokens": 0,
70
+ "completion_tokens": 0,
71
+ "total_tokens": 0
72
+ },
73
+ "response_ms": None,
74
+ "first_chunk_time": None,
75
+ "last_chunk_time": None
76
+ }
77
+
78
+ logging.info(f"Streaming Config: XML={config.xml_tool_calling}, Native={config.native_tool_calling}, "
79
+ f"Execute on stream={config.execute_on_stream}, Strategy={config.tool_execution_strategy}")
80
+
81
+ # Reuse thread_run_id for auto-continue or create new one
82
+ thread_run_id = continuous_state.get('thread_run_id') or str(uuid.uuid4())
83
+ continuous_state['thread_run_id'] = thread_run_id
84
+
85
+ try:
86
+ # --- Save and Yield Start Events (only if not auto-continuing) ---
87
+ if auto_continue_count == 0:
88
+ start_content = {"status_type": "thread_run_start", "thread_run_id": thread_run_id}
89
+ start_msg_obj = await self.add_message(
90
+ type="status", content=start_content,
91
+ is_llm_message=False, metadata={"thread_run_id": thread_run_id}
92
+ )
93
+ if start_msg_obj: yield format_for_yield(start_msg_obj)
94
+
95
+ assist_start_content = {"status_type": "assistant_response_start"}
96
+ assist_start_msg_obj = await self.add_message(
97
+ type="status", content=assist_start_content,
98
+ is_llm_message=False, metadata={"thread_run_id": thread_run_id}
99
+ )
100
+ if assist_start_msg_obj: yield format_for_yield(assist_start_msg_obj)
101
+ # --- End Start Events ---
102
+
103
+ __sequence = continuous_state.get('sequence', 0) # get the sequence from the previous auto-continue cycle
104
+
105
+ async for chunk in llm_response:
106
+ # Extract streaming metadata from chunks
107
+ current_time = datetime.now(timezone.utc).timestamp()
108
+ if streaming_metadata["first_chunk_time"] is None:
109
+ streaming_metadata["first_chunk_time"] = current_time
110
+ streaming_metadata["last_chunk_time"] = current_time
111
+
112
+ # Extract metadata from chunk attributes
113
+ if hasattr(chunk, 'created') and chunk.created:
114
+ streaming_metadata["created"] = chunk.created
115
+ if hasattr(chunk, 'model') and chunk.model:
116
+ streaming_metadata["model"] = chunk.model
117
+ if hasattr(chunk, 'usage') and chunk.usage:
118
+ # Update usage information if available (including zero values)
119
+ if hasattr(chunk.usage, 'prompt_tokens') and chunk.usage.prompt_tokens is not None:
120
+ streaming_metadata["usage"]["prompt_tokens"] = chunk.usage.prompt_tokens
121
+ if hasattr(chunk.usage, 'completion_tokens') and chunk.usage.completion_tokens is not None:
122
+ streaming_metadata["usage"]["completion_tokens"] = chunk.usage.completion_tokens
123
+ if hasattr(chunk.usage, 'total_tokens') and chunk.usage.total_tokens is not None:
124
+ streaming_metadata["usage"]["total_tokens"] = chunk.usage.total_tokens
125
+
126
+ if hasattr(chunk, 'choices') and chunk.choices and hasattr(chunk.choices[0], 'finish_reason') and \
127
+ chunk.choices[0].finish_reason:
128
+ finish_reason = chunk.choices[0].finish_reason
129
+ logging.debug(f"Detected finish_reason: {finish_reason}")
130
+
131
+ if hasattr(chunk, 'choices') and chunk.choices:
132
+ delta = chunk.choices[0].delta if hasattr(chunk.choices[0], 'delta') else None
133
+
134
+ # Check for and log Anthropic thinking content
135
+ if delta and hasattr(delta, 'reasoning_content') and delta.reasoning_content:
136
+ if not has_printed_thinking_prefix:
137
+ # print("[THINKING]: ", end='', flush=True)
138
+ has_printed_thinking_prefix = True
139
+ # print(delta.reasoning_content, end='', flush=True)
140
+ # Append reasoning to main content to be saved in the final message
141
+ accumulated_content += delta.reasoning_content
142
+
143
+ # Process content chunk
144
+ if delta and hasattr(delta, 'content') and delta.content:
145
+ chunk_content = delta.content
146
+ # print(chunk_content, end='', flush=True)
147
+ accumulated_content += chunk_content
148
+ current_xml_content += chunk_content
149
+
150
+ if not (config.max_xml_tool_calls > 0 and xml_tool_call_count >= config.max_xml_tool_calls):
151
+ # Yield ONLY content chunk (don't save)
152
+ now_chunk = datetime.now(timezone.utc).isoformat()
153
+ yield {
154
+ "sequence": __sequence,
155
+ "message_id": None, "thread_id": thread_id, "type": "assistant",
156
+ "is_llm_message": True,
157
+ "content": to_json_string({"role": "assistant", "content": chunk_content}),
158
+ "metadata": to_json_string({"stream_status": "chunk", "thread_run_id": thread_run_id}),
159
+ "created_at": now_chunk, "updated_at": now_chunk
160
+ }
161
+ __sequence += 1
162
+ else:
163
+ logging.info("XML tool call limit reached - not yielding more content chunks")
164
+ self.trace.event(name="xml_tool_call_limit_reached", level="DEFAULT", status_message=(
165
+ f"XML tool call limit reached - not yielding more content chunks"))
166
+
167
+ # --- Process XML Tool Calls (if enabled and limit not reached) ---
168
+ if config.xml_tool_calling and not (
169
+ config.max_xml_tool_calls > 0 and xml_tool_call_count >= config.max_xml_tool_calls):
170
+ xml_chunks = self._extract_xml_chunks(current_xml_content)
171
+ for xml_chunk in xml_chunks:
172
+ current_xml_content = current_xml_content.replace(xml_chunk, "", 1)
173
+ xml_chunks_buffer.append(xml_chunk)
174
+ result = self._parse_xml_tool_call(xml_chunk)
175
+ if result:
176
+ tool_call, parsing_details = result
177
+ xml_tool_call_count += 1
178
+ current_assistant_id = last_assistant_message_object[
179
+ 'message_id'] if last_assistant_message_object else None
180
+ context = self._create_tool_context(
181
+ tool_call, tool_index, current_assistant_id, parsing_details
182
+ )
183
+
184
+ if config.execute_tools and config.execute_on_stream:
185
+ # Save and Yield tool_started status
186
+ started_msg_obj = await self._yield_and_save_tool_started(context, thread_id,
187
+ thread_run_id)
188
+ if started_msg_obj: yield format_for_yield(started_msg_obj)
189
+ yielded_tool_indices.add(tool_index) # Mark status as yielded
190
+
191
+ execution_task = asyncio.create_task(self._execute_tool(tool_call))
192
+ pending_tool_executions.append({
193
+ "task": execution_task, "tool_call": tool_call,
194
+ "tool_index": tool_index, "context": context
195
+ })
196
+ tool_index += 1
197
+
198
+ if config.max_xml_tool_calls > 0 and xml_tool_call_count >= config.max_xml_tool_calls:
199
+ logging.debug(f"Reached XML tool call limit ({config.max_xml_tool_calls})")
200
+ finish_reason = "xml_tool_limit_reached"
201
+ break # Stop processing more XML chunks in this delta
202
+
203
+ # --- Process Native Tool Call Chunks ---
204
+ if config.native_tool_calling and delta and hasattr(delta, 'tool_calls') and delta.tool_calls:
205
+ for tool_call_chunk in delta.tool_calls:
206
+ # Yield Native Tool Call Chunk (transient status, not saved)
207
+ # ... (safe extraction logic for tool_call_data_chunk) ...
208
+ tool_call_data_chunk = {} # Placeholder for extracted data
209
+ if hasattr(tool_call_chunk, 'model_dump'):
210
+ tool_call_data_chunk = tool_call_chunk.model_dump()
211
+ else: # Manual extraction...
212
+ if hasattr(tool_call_chunk, 'id'): tool_call_data_chunk['id'] = tool_call_chunk.id
213
+ if hasattr(tool_call_chunk, 'index'): tool_call_data_chunk[
214
+ 'index'] = tool_call_chunk.index
215
+ if hasattr(tool_call_chunk, 'type'): tool_call_data_chunk['type'] = tool_call_chunk.type
216
+ if hasattr(tool_call_chunk, 'function'):
217
+ tool_call_data_chunk['function'] = {}
218
+ if hasattr(tool_call_chunk.function, 'name'): tool_call_data_chunk['function'][
219
+ 'name'] = tool_call_chunk.function.name
220
+ if hasattr(tool_call_chunk.function, 'arguments'): tool_call_data_chunk['function'][
221
+ 'arguments'] = tool_call_chunk.function.arguments if isinstance(
222
+ tool_call_chunk.function.arguments, str) else to_json_string(
223
+ tool_call_chunk.function.arguments)
224
+
225
+ now_tool_chunk = datetime.now(timezone.utc).isoformat()
226
+ yield {
227
+ "message_id": None, "thread_id": thread_id, "type": "status", "is_llm_message": True,
228
+ "content": to_json_string({"role": "assistant", "status_type": "tool_call_chunk",
229
+ "tool_call_chunk": tool_call_data_chunk}),
230
+ "metadata": to_json_string({"thread_run_id": thread_run_id}),
231
+ "created_at": now_tool_chunk, "updated_at": now_tool_chunk
232
+ }
233
+
234
+ # --- Buffer and Execute Complete Native Tool Calls ---
235
+ if not hasattr(tool_call_chunk, 'function'): continue
236
+ idx = tool_call_chunk.index if hasattr(tool_call_chunk, 'index') else 0
237
+ # ... (buffer update logic remains same) ...
238
+ # ... (check complete logic remains same) ...
239
+ has_complete_tool_call = False # Placeholder
240
+ if (tool_calls_buffer.get(idx) and
241
+ tool_calls_buffer[idx]['id'] and
242
+ tool_calls_buffer[idx]['function']['name'] and
243
+ tool_calls_buffer[idx]['function']['arguments']):
244
+ try:
245
+ safe_json_parse(tool_calls_buffer[idx]['function']['arguments'])
246
+ has_complete_tool_call = True
247
+ except json.JSONDecodeError:
248
+ pass
249
+
250
+ if has_complete_tool_call and config.execute_tools and config.execute_on_stream:
251
+ current_tool = tool_calls_buffer[idx]
252
+ tool_call_data = {
253
+ "function_name": current_tool['function']['name'],
254
+ "arguments": safe_json_parse(current_tool['function']['arguments']),
255
+ "id": current_tool['id']
256
+ }
257
+ current_assistant_id = last_assistant_message_object[
258
+ 'message_id'] if last_assistant_message_object else None
259
+ context = self._create_tool_context(
260
+ tool_call_data, tool_index, current_assistant_id
261
+ )
262
+
263
+ # Save and Yield tool_started status
264
+ started_msg_obj = await self._yield_and_save_tool_started(context, thread_id,
265
+ thread_run_id)
266
+ if started_msg_obj: yield format_for_yield(started_msg_obj)
267
+ yielded_tool_indices.add(tool_index) # Mark status as yielded
268
+
269
+ execution_task = asyncio.create_task(self._execute_tool(tool_call_data))
270
+ pending_tool_executions.append({
271
+ "task": execution_task, "tool_call": tool_call_data,
272
+ "tool_index": tool_index, "context": context
273
+ })
274
+ tool_index += 1
275
+
276
+ if finish_reason == "xml_tool_limit_reached":
277
+ logging.info("Stopping stream processing after loop due to XML tool call limit")
278
+ self.trace.event(name="stopping_stream_processing_after_loop_due_to_xml_tool_call_limit",
279
+ level="DEFAULT", status_message=(
280
+ f"Stopping stream processing after loop due to XML tool call limit"))
281
+ break
282
+
283
+ # print() # Add a final newline after the streaming loop finishes
284
+
285
+ # --- After Streaming Loop ---
286
+
287
+ if (
288
+ streaming_metadata["usage"]["total_tokens"] == 0
289
+ ):
290
+ logging.info("🔥 No usage data from provider, counting with litellm.token_counter")
291
+
292
+ try:
293
+ # prompt side
294
+ # prompt_tokens = token_counter(
295
+ # model=llm_model,
296
+ # messages=prompt_messages # chat or plain; token_counter handles both
297
+ # )
298
+ #
299
+ # # completion side
300
+ # completion_tokens = token_counter(
301
+ # model=llm_model,
302
+ # text=accumulated_content or "" # empty string safe
303
+ # )
304
+
305
+ # streaming_metadata["usage"]["prompt_tokens"] = prompt_tokens
306
+ # streaming_metadata["usage"]["completion_tokens"] = completion_tokens
307
+ # streaming_metadata["usage"]["total_tokens"] = prompt_tokens + completion_tokens
308
+ #
309
+ # logging.info(
310
+ # f"🔥 Estimated tokens – prompt: {prompt_tokens}, "
311
+ # f"completion: {completion_tokens}, total: {prompt_tokens + completion_tokens}"
312
+ # )
313
+ self.trace.event(name="usage_calculated_with_litellm_token_counter", level="DEFAULT",
314
+ status_message=(f"Usage calculated with litellm.token_counter"))
315
+ except Exception as e:
316
+ logging.warning(f"Failed to calculate usage: {str(e)}")
317
+ self.trace.event(name="failed_to_calculate_usage", level="WARNING",
318
+ status_message=(f"Failed to calculate usage: {str(e)}"))
319
+
320
+ # Wait for pending tool executions from streaming phase
321
+ tool_results_buffer = [] # Stores (tool_call, result, tool_index, context)
322
+ if pending_tool_executions:
323
+ logging.info(f"Waiting for {len(pending_tool_executions)} pending streamed tool executions")
324
+ self.trace.event(name="waiting_for_pending_streamed_tool_executions", level="DEFAULT", status_message=(
325
+ f"Waiting for {len(pending_tool_executions)} pending streamed tool executions"))
326
+ # ... (asyncio.wait logic) ...
327
+ pending_tasks = [execution["task"] for execution in pending_tool_executions]
328
+ done, _ = await asyncio.wait(pending_tasks)
329
+
330
+ for execution in pending_tool_executions:
331
+ tool_idx = execution.get("tool_index", -1)
332
+ context = execution["context"]
333
+ tool_name = context.function_name
334
+
335
+ # Check if status was already yielded during stream run
336
+ if tool_idx in yielded_tool_indices:
337
+ logging.debug(f"Status for tool index {tool_idx} already yielded.")
338
+ # Still need to process the result for the buffer
339
+ try:
340
+ if execution["task"].done():
341
+ result = execution["task"].result()
342
+ context.result = result
343
+ tool_results_buffer.append((execution["tool_call"], result, tool_idx, context))
344
+
345
+ if tool_name in ['ask', 'complete']:
346
+ logging.info(
347
+ f"Terminating tool '{tool_name}' completed during streaming. Setting termination flag.")
348
+ self.trace.event(name="terminating_tool_completed_during_streaming",
349
+ level="DEFAULT", status_message=(
350
+ f"Terminating tool '{tool_name}' completed during streaming. Setting termination flag."))
351
+ agent_should_terminate = True
352
+
353
+ else: # Should not happen with asyncio.wait
354
+ logging.warning(f"Task for tool index {tool_idx} not done after wait.")
355
+ self.trace.event(name="task_for_tool_index_not_done_after_wait", level="WARNING",
356
+ status_message=(
357
+ f"Task for tool index {tool_idx} not done after wait."))
358
+ except Exception as e:
359
+ logging.error(f"Error getting result for pending tool execution {tool_idx}: {str(e)}")
360
+ self.trace.event(name="error_getting_result_for_pending_tool_execution", level="ERROR",
361
+ status_message=(
362
+ f"Error getting result for pending tool execution {tool_idx}: {str(e)}"))
363
+ context.error = e
364
+ # Save and Yield tool error status message (even if started was yielded)
365
+ error_msg_obj = await self._yield_and_save_tool_error(context, thread_id, thread_run_id)
366
+ if error_msg_obj: yield format_for_yield(error_msg_obj)
367
+ continue # Skip further status yielding for this tool index
368
+
369
+ # If status wasn't yielded before (shouldn't happen with current logic), yield it now
370
+ try:
371
+ if execution["task"].done():
372
+ result = execution["task"].result()
373
+ context.result = result
374
+ tool_results_buffer.append((execution["tool_call"], result, tool_idx, context))
375
+
376
+ # Check if this is a terminating tool
377
+ if tool_name in ['ask', 'complete']:
378
+ logging.info(
379
+ f"Terminating tool '{tool_name}' completed during streaming. Setting termination flag.")
380
+ self.trace.event(name="terminating_tool_completed_during_streaming", level="DEFAULT",
381
+ status_message=(
382
+ f"Terminating tool '{tool_name}' completed during streaming. Setting termination flag."))
383
+ agent_should_terminate = True
384
+
385
+ # Save and Yield tool completed/failed status
386
+ completed_msg_obj = await self._yield_and_save_tool_completed(
387
+ context, None, thread_id, thread_run_id
388
+ )
389
+ if completed_msg_obj: yield format_for_yield(completed_msg_obj)
390
+ yielded_tool_indices.add(tool_idx)
391
+ except Exception as e:
392
+ logging.error(
393
+ f"Error getting result/yielding status for pending tool execution {tool_idx}: {str(e)}")
394
+ self.trace.event(name="error_getting_result_yielding_status_for_pending_tool_execution",
395
+ level="ERROR", status_message=(
396
+ f"Error getting result/yielding status for pending tool execution {tool_idx}: {str(e)}"))
397
+ context.error = e
398
+ # Save and Yield tool error status
399
+ error_msg_obj = await self._yield_and_save_tool_error(context, thread_id, thread_run_id)
400
+ if error_msg_obj: yield format_for_yield(error_msg_obj)
401
+ yielded_tool_indices.add(tool_idx)
402
+
403
+ # Save and yield finish status if limit was reached
404
+ if finish_reason == "xml_tool_limit_reached":
405
+ finish_content = {"status_type": "finish", "finish_reason": "xml_tool_limit_reached"}
406
+ finish_msg_obj = await self.add_message(
407
+ type="status", content=finish_content,
408
+ is_llm_message=False, metadata={"thread_run_id": thread_run_id}
409
+ )
410
+ if finish_msg_obj: yield format_for_yield(finish_msg_obj)
411
+ logging.info(
412
+ f"Stream finished with reason: xml_tool_limit_reached after {xml_tool_call_count} XML tool calls")
413
+ self.trace.event(name="stream_finished_with_reason_xml_tool_limit_reached_after_xml_tool_calls",
414
+ level="DEFAULT", status_message=(
415
+ f"Stream finished with reason: xml_tool_limit_reached after {xml_tool_call_count} XML tool calls"))
416
+
417
+ # Calculate if auto-continue is needed if the finish reason is length
418
+ should_auto_continue = (can_auto_continue and finish_reason == 'length')
419
+
420
+ # --- SAVE and YIELD Final Assistant Message ---
421
+ # Only save assistant message if NOT auto-continuing due to length to avoid duplicate messages
422
+ if accumulated_content and not should_auto_continue:
423
+ # ... (Truncate accumulated_content logic) ...
424
+ if config.max_xml_tool_calls > 0 and xml_tool_call_count >= config.max_xml_tool_calls and xml_chunks_buffer:
425
+ last_xml_chunk = xml_chunks_buffer[-1]
426
+ last_chunk_end_pos = accumulated_content.find(last_xml_chunk) + len(last_xml_chunk)
427
+ if last_chunk_end_pos > 0:
428
+ accumulated_content = accumulated_content[:last_chunk_end_pos]
429
+
430
+ # ... (Extract complete_native_tool_calls logic) ...
431
+ # Update complete_native_tool_calls from buffer (initialized earlier)
432
+ if config.native_tool_calling:
433
+ for idx, tc_buf in tool_calls_buffer.items():
434
+ if tc_buf['id'] and tc_buf['function']['name'] and tc_buf['function']['arguments']:
435
+ try:
436
+ args = safe_json_parse(tc_buf['function']['arguments'])
437
+ complete_native_tool_calls.append({
438
+ "id": tc_buf['id'], "type": "function",
439
+ "function": {"name": tc_buf['function']['name'], "arguments": args}
440
+ })
441
+ except json.JSONDecodeError:
442
+ continue
443
+
444
+ message_data = { # Dict to be saved in 'content'
445
+ "role": "assistant", "content": accumulated_content,
446
+ "tool_calls": complete_native_tool_calls or None
447
+ }
448
+
449
+ last_assistant_message_object = await self._add_message_with_agent_info(type="assistant", content=message_data,
450
+ is_llm_message=True, metadata={"thread_run_id": thread_run_id}
451
+ )
452
+
453
+ if last_assistant_message_object:
454
+ # Yield the complete saved object, adding stream_status metadata just for yield
455
+ yield_metadata = ensure_dict(last_assistant_message_object.get('metadata'), {})
456
+ yield_metadata['stream_status'] = 'complete'
457
+ # Format the message for yielding
458
+ yield_message = last_assistant_message_object.copy()
459
+ yield_message['metadata'] = yield_metadata
460
+ yield format_for_yield(yield_message)
461
+ else:
462
+ logging.error(f"Failed to save final assistant message for thread {thread_id}")
463
+ self.trace.event(name="failed_to_save_final_assistant_message_for_thread", level="ERROR",
464
+ status_message=(f"Failed to save final assistant message for thread {thread_id}"))
465
+ # Save and yield an error status
466
+ err_content = {"role": "system", "status_type": "error",
467
+ "message": "Failed to save final assistant message"}
468
+ err_msg_obj = await self.add_message(
469
+ type="status", content=err_content,
470
+ is_llm_message=False, metadata={"thread_run_id": thread_run_id}
471
+ )
472
+ if err_msg_obj: yield format_for_yield(err_msg_obj)
473
+
474
+ # --- Process All Tool Results Now ---
475
+ if config.execute_tools:
476
+ final_tool_calls_to_process = []
477
+ # ... (Gather final_tool_calls_to_process from native and XML buffers) ...
478
+ # Gather native tool calls from buffer
479
+ if config.native_tool_calling and complete_native_tool_calls:
480
+ for tc in complete_native_tool_calls:
481
+ final_tool_calls_to_process.append({
482
+ "function_name": tc["function"]["name"],
483
+ "arguments": tc["function"]["arguments"], # Already parsed object
484
+ "id": tc["id"]
485
+ })
486
+ # Gather XML tool calls from buffer (up to limit)
487
+ parsed_xml_data = []
488
+ if config.xml_tool_calling:
489
+ # Reparse remaining content just in case (should be empty if processed correctly)
490
+ xml_chunks = self._extract_xml_chunks(current_xml_content)
491
+ xml_chunks_buffer.extend(xml_chunks)
492
+ # Process only chunks not already handled in the stream loop
493
+ remaining_limit = config.max_xml_tool_calls - xml_tool_call_count if config.max_xml_tool_calls > 0 else len(
494
+ xml_chunks_buffer)
495
+ xml_chunks_to_process = xml_chunks_buffer[:remaining_limit] # Ensure limit is respected
496
+
497
+ for chunk in xml_chunks_to_process:
498
+ parsed_result = self._parse_xml_tool_call(chunk)
499
+ if parsed_result:
500
+ tool_call, parsing_details = parsed_result
501
+ # Avoid adding if already processed during streaming
502
+ if not any(exec['tool_call'] == tool_call for exec in pending_tool_executions):
503
+ final_tool_calls_to_process.append(tool_call)
504
+ parsed_xml_data.append({'tool_call': tool_call, 'parsing_details': parsing_details})
505
+
506
+ all_tool_data_map = {} # tool_index -> {'tool_call': ..., 'parsing_details': ...}
507
+ # Add native tool data
508
+ native_tool_index = 0
509
+ if config.native_tool_calling and complete_native_tool_calls:
510
+ for tc in complete_native_tool_calls:
511
+ # Find the corresponding entry in final_tool_calls_to_process if needed
512
+ # For now, assume order matches if only native used
513
+ exec_tool_call = {
514
+ "function_name": tc["function"]["name"],
515
+ "arguments": tc["function"]["arguments"],
516
+ "id": tc["id"]
517
+ }
518
+ all_tool_data_map[native_tool_index] = {"tool_call": exec_tool_call, "parsing_details": None}
519
+ native_tool_index += 1
520
+
521
+ # Add XML tool data
522
+ xml_tool_index_start = native_tool_index
523
+ for idx, item in enumerate(parsed_xml_data):
524
+ all_tool_data_map[xml_tool_index_start + idx] = item
525
+
526
+ tool_results_map = {} # tool_index -> (tool_call, result, context)
527
+
528
+ # Populate from buffer if executed on stream
529
+ if config.execute_on_stream and tool_results_buffer:
530
+ logging.info(f"Processing {len(tool_results_buffer)} buffered tool results")
531
+ self.trace.event(name="processing_buffered_tool_results", level="DEFAULT",
532
+ status_message=(f"Processing {len(tool_results_buffer)} buffered tool results"))
533
+ for tool_call, result, tool_idx, context in tool_results_buffer:
534
+ if last_assistant_message_object: context.assistant_message_id = last_assistant_message_object[
535
+ 'message_id']
536
+ tool_results_map[tool_idx] = (tool_call, result, context)
537
+
538
+ # Or execute now if not streamed
539
+ elif final_tool_calls_to_process and not config.execute_on_stream:
540
+ logging.info(
541
+ f"Executing {len(final_tool_calls_to_process)} tools ({config.tool_execution_strategy}) after stream")
542
+ self.trace.event(name="executing_tools_after_stream", level="DEFAULT", status_message=(
543
+ f"Executing {len(final_tool_calls_to_process)} tools ({config.tool_execution_strategy}) after stream"))
544
+ results_list = await self._execute_tools(final_tool_calls_to_process,
545
+ config.tool_execution_strategy)
546
+ current_tool_idx = 0
547
+ for tc, res in results_list:
548
+ # Map back using all_tool_data_map which has correct indices
549
+ if current_tool_idx in all_tool_data_map:
550
+ tool_data = all_tool_data_map[current_tool_idx]
551
+ context = self._create_tool_context(
552
+ tc, current_tool_idx,
553
+ last_assistant_message_object['message_id'] if last_assistant_message_object else None,
554
+ tool_data.get('parsing_details')
555
+ )
556
+ context.result = res
557
+ tool_results_map[current_tool_idx] = (tc, res, context)
558
+ else:
559
+ logging.warning(f"Could not map result for tool index {current_tool_idx}")
560
+ self.trace.event(name="could_not_map_result_for_tool_index", level="WARNING",
561
+ status_message=(f"Could not map result for tool index {current_tool_idx}"))
562
+ current_tool_idx += 1
563
+
564
+ # Save and Yield each result message
565
+ if tool_results_map:
566
+ logging.info(f"Saving and yielding {len(tool_results_map)} final tool result messages")
567
+ self.trace.event(name="saving_and_yielding_final_tool_result_messages", level="DEFAULT",
568
+ status_message=(
569
+ f"Saving and yielding {len(tool_results_map)} final tool result messages"))
570
+ for tool_idx in sorted(tool_results_map.keys()):
571
+ tool_call, result, context = tool_results_map[tool_idx]
572
+ context.result = result
573
+ if not context.assistant_message_id and last_assistant_message_object:
574
+ context.assistant_message_id = last_assistant_message_object['message_id']
575
+
576
+ # Yield start status ONLY IF executing non-streamed (already yielded if streamed)
577
+ if not config.execute_on_stream and tool_idx not in yielded_tool_indices:
578
+ started_msg_obj = await self._yield_and_save_tool_started(context, thread_id, thread_run_id)
579
+ if started_msg_obj: yield format_for_yield(started_msg_obj)
580
+ yielded_tool_indices.add(tool_idx) # Mark status yielded
581
+
582
+ # Save the tool result message to DB
583
+ saved_tool_result_object = await self._add_tool_result( # Returns full object or None
584
+ thread_id, tool_call, result, config.xml_adding_strategy,
585
+ context.assistant_message_id, context.parsing_details
586
+ )
587
+
588
+ # Yield completed/failed status (linked to saved result ID if available)
589
+ completed_msg_obj = await self._yield_and_save_tool_completed(
590
+ context,
591
+ saved_tool_result_object['message_id'] if saved_tool_result_object else None,
592
+ thread_id, thread_run_id
593
+ )
594
+ if completed_msg_obj: yield format_for_yield(completed_msg_obj)
595
+ # Don't add to yielded_tool_indices here, completion status is separate yield
596
+
597
+ # Yield the saved tool result object
598
+ if saved_tool_result_object:
599
+ tool_result_message_objects[tool_idx] = saved_tool_result_object
600
+ yield format_for_yield(saved_tool_result_object)
601
+ else:
602
+ logging.error(
603
+ f"Failed to save tool result for index {tool_idx}, not yielding result message.")
604
+ self.trace.event(name="failed_to_save_tool_result_for_index", level="ERROR",
605
+ status_message=(
606
+ f"Failed to save tool result for index {tool_idx}, not yielding result message."))
607
+ # Optionally yield error status for saving failure?
608
+
609
+ # --- Final Finish Status ---
610
+ if finish_reason and finish_reason != "xml_tool_limit_reached":
611
+ finish_content = {"status_type": "finish", "finish_reason": finish_reason}
612
+ finish_msg_obj = await self.add_message(
613
+ type="status", content=finish_content,
614
+ is_llm_message=False, metadata={"thread_run_id": thread_run_id}
615
+ )
616
+ if finish_msg_obj: yield format_for_yield(finish_msg_obj)
617
+
618
+ # Check if agent should terminate after processing pending tools
619
+ if agent_should_terminate:
620
+ logging.info(
621
+ "Agent termination requested after executing ask/complete tool. Stopping further processing.")
622
+ self.trace.event(name="agent_termination_requested", level="DEFAULT",
623
+ status_message="Agent termination requested after executing ask/complete tool. Stopping further processing.")
624
+
625
+ # Set finish reason to indicate termination
626
+ finish_reason = "agent_terminated"
627
+
628
+ # Save and yield termination status
629
+ finish_content = {"status_type": "finish", "finish_reason": "agent_terminated"}
630
+ finish_msg_obj = await self.add_message(
631
+ type="status", content=finish_content,
632
+ is_llm_message=False, metadata={"thread_run_id": thread_run_id}
633
+ )
634
+ if finish_msg_obj: yield format_for_yield(finish_msg_obj)
635
+
636
+ # Save assistant_response_end BEFORE terminating
637
+ if last_assistant_message_object:
638
+ try:
639
+ # Calculate response time if we have timing data
640
+ if streaming_metadata["first_chunk_time"] and streaming_metadata["last_chunk_time"]:
641
+ streaming_metadata["response_ms"] = (streaming_metadata["last_chunk_time"] -
642
+ streaming_metadata["first_chunk_time"]) * 1000
643
+
644
+ # Create a LiteLLM-like response object for streaming (before termination)
645
+ # Check if we have any actual usage data
646
+ has_usage_data = (
647
+ streaming_metadata["usage"]["prompt_tokens"] > 0 or
648
+ streaming_metadata["usage"]["completion_tokens"] > 0 or
649
+ streaming_metadata["usage"]["total_tokens"] > 0
650
+ )
651
+
652
+ assistant_end_content = {
653
+ "choices": [
654
+ {
655
+ "finish_reason": finish_reason or "stop",
656
+ "index": 0,
657
+ "message": {
658
+ "role": "assistant",
659
+ "content": accumulated_content,
660
+ "tool_calls": complete_native_tool_calls or None
661
+ }
662
+ }
663
+ ],
664
+ "created": streaming_metadata.get("created"),
665
+ "model": streaming_metadata.get("model", llm_model),
666
+ "usage": streaming_metadata["usage"], # Always include usage like LiteLLM does
667
+ "streaming": True, # Add flag to indicate this was reconstructed from streaming
668
+ }
669
+
670
+ # Only include response_ms if we have timing data
671
+ if streaming_metadata.get("response_ms"):
672
+ assistant_end_content["response_ms"] = streaming_metadata["response_ms"]
673
+
674
+ await self.add_message(
675
+ type="assistant_response_end",
676
+ content=assistant_end_content,
677
+ is_llm_message=False,
678
+ metadata={"thread_run_id": thread_run_id}
679
+ )
680
+ logging.info("Assistant response end saved for stream (before termination)")
681
+ except Exception as e:
682
+ logging.error(f"Error saving assistant response end for stream (before termination): {str(e)}")
683
+ self.trace.event(name="error_saving_assistant_response_end_for_stream_before_termination",
684
+ level="ERROR", status_message=(
685
+ f"Error saving assistant response end for stream (before termination): {str(e)}"))
686
+
687
+ # Skip all remaining processing and go to finally block
688
+ return
689
+
690
+ # --- Save and Yield assistant_response_end ---
691
+ # Only save assistant_response_end if not auto-continuing (response is actually complete)
692
+ if not should_auto_continue:
693
+ if last_assistant_message_object: # Only save if assistant message was saved
694
+ try:
695
+ # Calculate response time if we have timing data
696
+ if streaming_metadata["first_chunk_time"] and streaming_metadata["last_chunk_time"]:
697
+ streaming_metadata["response_ms"] = (streaming_metadata["last_chunk_time"] -
698
+ streaming_metadata["first_chunk_time"]) * 1000
699
+
700
+ # Create a LiteLLM-like response object for streaming
701
+ # Check if we have any actual usage data
702
+ has_usage_data = (
703
+ streaming_metadata["usage"]["prompt_tokens"] > 0 or
704
+ streaming_metadata["usage"]["completion_tokens"] > 0 or
705
+ streaming_metadata["usage"]["total_tokens"] > 0
706
+ )
707
+
708
+ assistant_end_content = {
709
+ "choices": [
710
+ {
711
+ "finish_reason": finish_reason or "stop",
712
+ "index": 0,
713
+ "message": {
714
+ "role": "assistant",
715
+ "content": accumulated_content,
716
+ "tool_calls": complete_native_tool_calls or None
717
+ }
718
+ }
719
+ ],
720
+ "created": streaming_metadata.get("created"),
721
+ "model": streaming_metadata.get("model", llm_model),
722
+ "usage": streaming_metadata["usage"], # Always include usage like LiteLLM does
723
+ "streaming": True, # Add flag to indicate this was reconstructed from streaming
724
+ }
725
+
726
+ # Only include response_ms if we have timing data
727
+ if streaming_metadata.get("response_ms"):
728
+ assistant_end_content["response_ms"] = streaming_metadata["response_ms"]
729
+
730
+ await self.add_message(
731
+ type="assistant_response_end",
732
+ content=assistant_end_content,
733
+ is_llm_message=False,
734
+ metadata={"thread_run_id": thread_run_id}
735
+ )
736
+ logging.info("Assistant response end saved for stream")
737
+ except Exception as e:
738
+ logging.error(f"Error saving assistant response end for stream: {str(e)}")
739
+ self.trace.event(name="error_saving_assistant_response_end_for_stream", level="ERROR",
740
+ status_message=(f"Error saving assistant response end for stream: {str(e)}"))
741
+
742
+ except Exception as e:
743
+ logging.error(f"Error processing stream: {str(e)}", exc_info=True)
744
+ self.trace.event(name="error_processing_stream", level="ERROR",
745
+ status_message=(f"Error processing stream: {str(e)}"))
746
+ # Save and yield error status message
747
+
748
+ err_content = {"role": "system", "status_type": "error", "message": str(e)}
749
+ if "AnthropicException - Overloaded" not in str(e):
750
+ err_msg_obj = await self.add_message(
751
+ type="status", content=err_content,
752
+ is_llm_message=False,
753
+ metadata={"thread_run_id": thread_run_id if 'thread_run_id' in locals() else None}
754
+ )
755
+ if err_msg_obj: yield format_for_yield(err_msg_obj) # Yield the saved error message
756
+ # Re-raise the same exception (not a new one) to ensure proper error propagation
757
+ logging.critical(f"Re-raising error to stop further processing: {str(e)}")
758
+ self.trace.event(name="re_raising_error_to_stop_further_processing", level="ERROR",
759
+ status_message=(f"Re-raising error to stop further processing: {str(e)}"))
760
+ else:
761
+ logging.error(f"AnthropicException - Overloaded detected - Falling back to OpenRouter: {str(e)}",
762
+ exc_info=True)
763
+ self.trace.event(name="anthropic_exception_overloaded_detected", level="ERROR", status_message=(
764
+ f"AnthropicException - Overloaded detected - Falling back to OpenRouter: {str(e)}"))
765
+ raise # Use bare 'raise' to preserve the original exception with its traceback
766
+
767
+ finally:
768
+ # Update continuous state for potential auto-continue
769
+ if should_auto_continue:
770
+ continuous_state['accumulated_content'] = accumulated_content
771
+ continuous_state['sequence'] = __sequence
772
+
773
+ logging.info(f"Updated continuous state for auto-continue with {len(accumulated_content)} chars")
774
+ else:
775
+ # Save and Yield the final thread_run_end status (only if not auto-continuing and finish_reason is not 'length')
776
+ try:
777
+ end_content = {"status_type": "thread_run_end"}
778
+ end_msg_obj = await self.add_message(
779
+ type="status", content=end_content,
780
+ is_llm_message=False,
781
+ metadata={"thread_run_id": thread_run_id if 'thread_run_id' in locals() else None}
782
+ )
783
+ if end_msg_obj: yield format_for_yield(end_msg_obj)
784
+ except Exception as final_e:
785
+ logging.error(f"Error in finally block: {str(final_e)}", exc_info=True)
786
+ self.trace.event(name="error_in_finally_block", level="ERROR",
787
+ status_message=(f"Error in finally block: {str(final_e)}"))