xgae 0.1.10__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of xgae might be problematic.
- xgae/cli_app.py +2 -4
- xgae/engine/engine_base.py +3 -3
- xgae/engine/mcp_tool_box.py +11 -6
- xgae/engine/responser/non_stream_responser.py +30 -39
- xgae/engine/responser/responser_base.py +52 -49
- xgae/engine/responser/stream_responser.py +93 -782
- xgae/engine/task_engine.py +77 -48
- xgae/utils/__init__.py +18 -6
- xgae/utils/json_helpers.py +2 -2
- xgae/utils/llm_client.py +21 -19
- xgae/utils/misc.py +1 -2
- xgae/utils/setup_env.py +1 -0
- {xgae-0.1.10.dist-info → xgae-0.1.13.dist-info}/METADATA +1 -1
- xgae-0.1.13.dist-info/RECORD +21 -0
- xgae-0.1.10.dist-info/RECORD +0 -21
- {xgae-0.1.10.dist-info → xgae-0.1.13.dist-info}/WHEEL +0 -0
- {xgae-0.1.10.dist-info → xgae-0.1.13.dist-info}/entry_points.txt +0 -0
xgae/engine/responser/stream_responser.py
@@ -1,57 +1,10 @@
-import asyncio
-import json
 import logging
-import uuid
 
-from
-from datetime import datetime, timezone
-from typing import List, Dict, Any, Optional, AsyncGenerator, override, Literal
-
-from xgae.engine.responser.responser_base import TaskResponseProcessor, TaskResponserContext,TaskRunContinuousState,XmlAddingStrategy,ToolExecutionStrategy
-from xgae.utils.json_helpers import (
-    ensure_dict, safe_json_parse,
-    to_json_string, format_for_yield
-)
-
-@dataclass
-class ProcessorConfig:
-    """
-    Configuration for response processing and tool execution.
-
-    This class controls how the LLM's responses are processed, including how tool calls
-    are detected, executed, and their results handled.
-
-    Attributes:
-        xml_tool_calling: Enable XML-based tool call detection (<tool>...</tool>)
-        native_tool_calling: Enable OpenAI-style function calling format
-        execute_tools: Whether to automatically execute detected tool calls
-        execute_on_stream: For streaming, execute tools as they appear vs. at the end
-        tool_execution_strategy: How to execute multiple tools ("sequential" or "parallel")
-        xml_adding_strategy: How to add XML tool results to the conversation
-        max_xml_tool_calls: Maximum number of XML tool calls to process (0 = no limit)
-    """
-
-    xml_tool_calling: bool = True
-    native_tool_calling: bool = False
-
-    execute_tools: bool = True
-    execute_on_stream: bool = False
-    tool_execution_strategy: ToolExecutionStrategy = "sequential"
-    xml_adding_strategy: XmlAddingStrategy = "assistant_message"
-    max_xml_tool_calls: int = 0  # 0 means no limit
-
-    def __post_init__(self):
-        """Validate configuration after initialization."""
-        if self.xml_tool_calling is False and self.native_tool_calling is False and self.execute_tools:
-            raise ValueError(
-                "At least one tool calling format (XML or native) must be enabled if execute_tools is True")
-
-        if self.xml_adding_strategy not in ["user_message", "assistant_message", "inline_edit"]:
-            raise ValueError("xml_adding_strategy must be 'user_message', 'assistant_message', or 'inline_edit'")
-
-        if self.max_xml_tool_calls < 0:
-            raise ValueError("max_xml_tool_calls must be a non-negative integer (0 = no limit)")
+from typing import List, Dict, Any, Optional, AsyncGenerator, override
 
+from xgae.utils import log_trace
+from xgae.utils.json_helpers import format_for_yield
+from xgae.engine.responser.responser_base import TaskResponseProcessor, TaskResponserContext, TaskRunContinuousState
 
 
 class StreamTaskResponser(TaskResponseProcessor):
@@ -59,767 +12,125 @@ class StreamTaskResponser(TaskResponseProcessor):
         super().__init__(response_context)

     @override
-    async def process_response(
-
-
-
-
-    ) -> AsyncGenerator[Dict[str, Any], None]:
-        """Process a streaming LLM response, handling tool calls and execution.
-
-        Args:
-            llm_response: Streaming response from the LLM
-            thread_id: ID of the conversation thread
-            prompt_messages: List of messages sent to the LLM (the prompt)
-            llm_model: The name of the LLM model used
-            config: Configuration for parsing and execution
-            can_auto_continue: Whether auto-continue is enabled
-            auto_continue_count: Number of auto-continue cycles
-            continuous_state: Previous state of the conversation
-
-        Yields:
-            Complete message objects matching the DB schema, except for content chunks.
-        """
-        # Initialize from continuous state if provided (for auto-continue)
-        can_auto_continue = continuous_state.get("auto_continue", False)
-        auto_continue_count = continuous_state.get("auto_continue_count", 0)
-        llm_model = self.response_context.get("model_name")
-        config: ProcessorConfig = ProcessorConfig()
-        thread_id = self.response_context.get("task_id")
-
-        continuous_state = continuous_state or {}
+    async def process_response(self,
+                               llm_response: AsyncGenerator,
+                               prompt_messages: List[Dict[str, Any]],
+                               continuous_state: TaskRunContinuousState
+                               ) -> AsyncGenerator[Dict[str, Any], None]:
         accumulated_content = continuous_state.get('accumulated_content', "")
-
-
-
-
-        yielded_tool_indices = set()  # Stores indices of tools whose *status* has been yielded
-        tool_index = 0
-        xml_tool_call_count = 0
+        auto_continue_count = continuous_state.get('auto_continue_count', 0)
+        can_auto_continue = continuous_state.get("auto_continue", False)
+        use_assistant_chunk_msg = self.response_context.get("use_assistant_chunk_msg")
+
         finish_reason = None
         should_auto_continue = False
-
-
-        has_printed_thinking_prefix = False  # Flag for printing thinking prefix only once
-        agent_should_terminate = False  # Flag to track if a terminating tool has been executed
-        complete_native_tool_calls = []  # Initialize early for use in assistant_response_end
-
-        # Collect metadata for reconstructing LiteLLM response object
-        streaming_metadata = {
-            "model": llm_model,
-            "created": None,
-            "usage": {
-                "prompt_tokens": 0,
-                "completion_tokens": 0,
-                "total_tokens": 0
-            },
-            "response_ms": None,
-            "first_chunk_time": None,
-            "last_chunk_time": None
-        }
-
-        logging.info(f"Streaming Config: XML={config.xml_tool_calling}, Native={config.native_tool_calling}, "
-                     f"Execute on stream={config.execute_on_stream}, Strategy={config.tool_execution_strategy}")
-
-        # Reuse thread_run_id for auto-continue or create new one
-        thread_run_id = continuous_state.get('thread_run_id') or str(uuid.uuid4())
-        continuous_state['thread_run_id'] = thread_run_id
-
+        sequence = continuous_state.get('assistant_msg_sequence', 0)
+
         try:
-
-
-
-
-
-                is_llm_message=False, metadata={"thread_run_id": thread_run_id}
-            )
-            if start_msg_obj: yield format_for_yield(start_msg_obj)
-
-            assist_start_content = {"status_type": "assistant_response_start"}
-            assist_start_msg_obj = await self.add_response_message(
-                type="status", content=assist_start_content,
-                is_llm_message=False, metadata={"thread_run_id": thread_run_id}
-            )
-            if assist_start_msg_obj: yield format_for_yield(assist_start_msg_obj)
-            # --- End Start Events ---
-
-            __sequence = continuous_state.get('sequence', 0)  # get the sequence from the previous auto-continue cycle
+            async for llm_chunk in llm_response:
+                if hasattr(llm_chunk, 'choices') and llm_chunk.choices and hasattr(llm_chunk.choices[0], 'finish_reason'):
+                    if llm_chunk.choices[0].finish_reason:
+                        finish_reason = llm_chunk.choices[0].finish_reason
+                        logging.info(f"StreamResp:LLM chunk response finish_reason={finish_reason}")

-
-
-                current_time = datetime.now(timezone.utc).timestamp()
-                if streaming_metadata["first_chunk_time"] is None:
-                    streaming_metadata["first_chunk_time"] = current_time
-                streaming_metadata["last_chunk_time"] = current_time
+                if hasattr(llm_chunk, 'choices') and llm_chunk.choices:
+                    llm_chunk_msg = llm_chunk.choices[0].delta if hasattr(llm_chunk.choices[0], 'delta') else None

-
-
-                    streaming_metadata["created"] = chunk.created
-                if hasattr(chunk, 'model') and chunk.model:
-                    streaming_metadata["model"] = chunk.model
-                if hasattr(chunk, 'usage') and chunk.usage:
-                    # Update usage information if available (including zero values)
-                    if hasattr(chunk.usage, 'prompt_tokens') and chunk.usage.prompt_tokens is not None:
-                        streaming_metadata["usage"]["prompt_tokens"] = chunk.usage.prompt_tokens
-                    if hasattr(chunk.usage, 'completion_tokens') and chunk.usage.completion_tokens is not None:
-                        streaming_metadata["usage"]["completion_tokens"] = chunk.usage.completion_tokens
-                    if hasattr(chunk.usage, 'total_tokens') and chunk.usage.total_tokens is not None:
-                        streaming_metadata["usage"]["total_tokens"] = chunk.usage.total_tokens
-
-                if hasattr(chunk, 'choices') and chunk.choices and hasattr(chunk.choices[0], 'finish_reason') and \
-                        chunk.choices[0].finish_reason:
-                    finish_reason = chunk.choices[0].finish_reason
-                    logging.debug(f"Detected finish_reason: {finish_reason}")
-
-                if hasattr(chunk, 'choices') and chunk.choices:
-                    delta = chunk.choices[0].delta if hasattr(chunk.choices[0], 'delta') else None
-
-                    # Check for and log Anthropic thinking content
-                    if delta and hasattr(delta, 'reasoning_content') and delta.reasoning_content:
-                        if not has_printed_thinking_prefix:
-                            # print("[THINKING]: ", end='', flush=True)
-                            has_printed_thinking_prefix = True
-                        # print(delta.reasoning_content, end='', flush=True)
-                        # Append reasoning to main content to be saved in the final message
-                        accumulated_content += delta.reasoning_content
-
-                    # Process content chunk
-                    if delta and hasattr(delta, 'content') and delta.content:
-                        chunk_content = delta.content
-                        # print(chunk_content, end='', flush=True)
+                    if llm_chunk_msg and hasattr(llm_chunk_msg, 'content') and llm_chunk_msg.content:
+                        chunk_content = llm_chunk_msg.content
                         accumulated_content += chunk_content
-                        current_xml_content += chunk_content
-
-                        if not (config.max_xml_tool_calls > 0 and xml_tool_call_count >= config.max_xml_tool_calls):
-                            # Yield ONLY content chunk (don't save)
-                            now_chunk = datetime.now(timezone.utc).isoformat()
-                            yield {
-                                "sequence": __sequence,
-                                "message_id": None, "thread_id": thread_id, "type": "assistant",
-                                "is_llm_message": True,
-                                "content": to_json_string({"role": "assistant", "content": chunk_content}),
-                                "metadata": to_json_string({"stream_status": "chunk", "thread_run_id": thread_run_id}),
-                                "created_at": now_chunk, "updated_at": now_chunk
-                            }
-                            __sequence += 1
-                        else:
-                            logging.info("XML tool call limit reached - not yielding more content chunks")
-                            self.root_span.event(name="xml_tool_call_limit_reached", level="DEFAULT", status_message=(
-                                f"XML tool call limit reached - not yielding more content chunks"))
-
-                        # --- Process XML Tool Calls (if enabled and limit not reached) ---
-                        if config.xml_tool_calling and not (
-                                config.max_xml_tool_calls > 0 and xml_tool_call_count >= config.max_xml_tool_calls):
-                            xml_chunks = self._extract_xml_chunks(current_xml_content)
-                            for xml_chunk in xml_chunks:
-                                current_xml_content = current_xml_content.replace(xml_chunk, "", 1)
-                                xml_chunks_buffer.append(xml_chunk)
-                                result = self._parse_xml_tool_call(xml_chunk)
-                                if result:
-                                    tool_call, parsing_details = result
-                                    xml_tool_call_count += 1
-                                    current_assistant_id = last_assistant_message_object[
-                                        'message_id'] if last_assistant_message_object else None
-                                    context = self._create_tool_context(
-                                        tool_call, tool_index, current_assistant_id, parsing_details
-                                    )
-
-                                    if config.execute_tools and config.execute_on_stream:
-                                        # Save and Yield tool_started status
-                                        started_msg_obj = await self._add_tool_start_message(context)
-                                        if started_msg_obj: yield format_for_yield(started_msg_obj)
-                                        yielded_tool_indices.add(tool_index)  # Mark status as yielded
-
-                                        execution_task = asyncio.create_task(self._execute_tool(tool_call))
-                                        pending_tool_executions.append({
-                                            "task": execution_task, "tool_call": tool_call,
-                                            "tool_index": tool_index, "context": context
-                                        })
-                                        tool_index += 1
-
-                                    if config.max_xml_tool_calls > 0 and xml_tool_call_count >= config.max_xml_tool_calls:
-                                        logging.debug(f"Reached XML tool call limit ({config.max_xml_tool_calls})")
-                                        finish_reason = "xml_tool_limit_reached"
-                                        break  # Stop processing more XML chunks in this delta
-
-                    # --- Process Native Tool Call Chunks ---
-                    if config.native_tool_calling and delta and hasattr(delta, 'tool_calls') and delta.tool_calls:
-                        for tool_call_chunk in delta.tool_calls:
-                            # Yield Native Tool Call Chunk (transient status, not saved)
-                            # ... (safe extraction logic for tool_call_data_chunk) ...
-                            tool_call_data_chunk = {}  # Placeholder for extracted data
-                            if hasattr(tool_call_chunk, 'model_dump'):
-                                tool_call_data_chunk = tool_call_chunk.model_dump()
-                            else:  # Manual extraction...
-                                if hasattr(tool_call_chunk, 'id'): tool_call_data_chunk['id'] = tool_call_chunk.id
-                                if hasattr(tool_call_chunk, 'index'): tool_call_data_chunk[
-                                    'index'] = tool_call_chunk.index
-                                if hasattr(tool_call_chunk, 'type'): tool_call_data_chunk['type'] = tool_call_chunk.type
-                                if hasattr(tool_call_chunk, 'function'):
-                                    tool_call_data_chunk['function'] = {}
-                                    if hasattr(tool_call_chunk.function, 'name'): tool_call_data_chunk['function'][
-                                        'name'] = tool_call_chunk.function.name
-                                    if hasattr(tool_call_chunk.function, 'arguments'): tool_call_data_chunk['function'][
-                                        'arguments'] = tool_call_chunk.function.arguments if isinstance(
-                                        tool_call_chunk.function.arguments, str) else to_json_string(
-                                        tool_call_chunk.function.arguments)
-
-                            now_tool_chunk = datetime.now(timezone.utc).isoformat()
-                            yield {
-                                "message_id": None, "thread_id": thread_id, "type": "status", "is_llm_message": True,
-                                "content": to_json_string({"role": "assistant", "status_type": "tool_call_chunk",
-                                                           "tool_call_chunk": tool_call_data_chunk}),
-                                "metadata": to_json_string({"thread_run_id": thread_run_id}),
-                                "created_at": now_tool_chunk, "updated_at": now_tool_chunk
-                            }
-
-                            # --- Buffer and Execute Complete Native Tool Calls ---
-                            if not hasattr(tool_call_chunk, 'function'): continue
-                            idx = tool_call_chunk.index if hasattr(tool_call_chunk, 'index') else 0
-                            # ... (buffer update logic remains same) ...
-                            # ... (check complete logic remains same) ...
-                            has_complete_tool_call = False  # Placeholder
-                            if (tool_calls_buffer.get(idx) and
-                                    tool_calls_buffer[idx]['id'] and
-                                    tool_calls_buffer[idx]['function']['name'] and
-                                    tool_calls_buffer[idx]['function']['arguments']):
-                                try:
-                                    safe_json_parse(tool_calls_buffer[idx]['function']['arguments'])
-                                    has_complete_tool_call = True
-                                except json.JSONDecodeError:
-                                    pass
-
-                            if has_complete_tool_call and config.execute_tools and config.execute_on_stream:
-                                current_tool = tool_calls_buffer[idx]
-                                tool_call_data = {
-                                    "function_name": current_tool['function']['name'],
-                                    "arguments": safe_json_parse(current_tool['function']['arguments']),
-                                    "id": current_tool['id']
-                                }
-                                current_assistant_id = last_assistant_message_object[
-                                    'message_id'] if last_assistant_message_object else None
-                                context = self._create_tool_context(
-                                    tool_call_data, tool_index, current_assistant_id
-                                )
-
-                                # Save and Yield tool_started status
-                                started_msg_obj = await self._add_tool_start_message(context)
-                                if started_msg_obj: yield format_for_yield(started_msg_obj)
-                                yielded_tool_indices.add(tool_index)  # Mark status as yielded
-
-                                execution_task = asyncio.create_task(self._execute_tool(tool_call_data))
-                                pending_tool_executions.append({
-                                    "task": execution_task, "tool_call": tool_call_data,
-                                    "tool_index": tool_index, "context": context
-                                })
-                                tool_index += 1
-
-                if finish_reason == "xml_tool_limit_reached":
-                    logging.info("Stopping stream processing after loop due to XML tool call limit")
-                    self.root_span.event(name="stopping_stream_processing_after_loop_due_to_xml_tool_call_limit",
-                                         level="DEFAULT", status_message=(
-                            f"Stopping stream processing after loop due to XML tool call limit"))
-                    break
-
-            # print() # Add a final newline after the streaming loop finishes
-
-            # --- After Streaming Loop ---
-
-            if (
-                    streaming_metadata["usage"]["total_tokens"] == 0
-            ):
-                logging.info("🔥 No usage data from provider, counting with litellm.token_counter")

-
-
-
-
-
-
-
-                    # completion_tokens = token_counter(
-                    #     model=llm_model,
-                    #     text=accumulated_content or ""  # empty string safe
-                    # )
+                        xml_tool_call_count = len(self._extract_xml_chunks(accumulated_content))
+                        if self.max_xml_tool_calls <= 0 or xml_tool_call_count < self.max_xml_tool_calls:
+                            if use_assistant_chunk_msg:
+                                message_data = {"role": "assistant", "content": chunk_content}
+                                metadata = {"sequence": sequence}
+                                assistant_chunk_msg = self.create_response_message(type="assistant_chunk", content=message_data,
+                                                                                   is_llm_message=True, metadata=metadata)
+                                yield assistant_chunk_msg

-
-
-
-                # logging.info(
-                #     f"🔥 Estimated tokens – prompt: {prompt_tokens}, "
-                #     f"completion: {completion_tokens}, total: {prompt_tokens + completion_tokens}"
-                # )
-                self.root_span.event(name="usage_calculated_with_litellm_token_counter", level="DEFAULT",
-                                     status_message=(f"Usage calculated with litellm.token_counter"))
-            except Exception as e:
-                logging.warning(f"Failed to calculate usage: {str(e)}")
-                self.root_span.event(name="failed_to_calculate_usage", level="WARNING",
-                                     status_message=(f"Failed to calculate usage: {str(e)}"))
-
-            # Wait for pending tool executions from streaming phase
-            tool_results_buffer = []  # Stores (tool_call, result, tool_index, context)
-            if pending_tool_executions:
-                logging.info(f"Waiting for {len(pending_tool_executions)} pending streamed tool executions")
-                self.root_span.event(name="waiting_for_pending_streamed_tool_executions", level="DEFAULT", status_message=(
-                    f"Waiting for {len(pending_tool_executions)} pending streamed tool executions"))
-                # ... (asyncio.wait logic) ...
-                pending_tasks = [execution["task"] for execution in pending_tool_executions]
-                done, _ = await asyncio.wait(pending_tasks)
-
-                for execution in pending_tool_executions:
-                    tool_idx = execution.get("tool_index", -1)
-                    context = execution["context"]
-                    tool_name = context.function_name
-
-                    # Check if status was already yielded during stream run
-                    if tool_idx in yielded_tool_indices:
-                        logging.debug(f"Status for tool index {tool_idx} already yielded.")
-                        # Still need to process the result for the buffer
-                        try:
-                            if execution["task"].done():
-                                result = execution["task"].result()
-                                context.result = result
-                                tool_results_buffer.append((execution["tool_call"], result, tool_idx, context))
-
-                                if tool_name in ['ask', 'complete']:
-                                    logging.info(
-                                        f"Terminating tool '{tool_name}' completed during streaming. Setting termination flag.")
-                                    self.root_span.event(name="terminating_tool_completed_during_streaming",
-                                                         level="DEFAULT", status_message=(
-                                            f"Terminating tool '{tool_name}' completed during streaming. Setting termination flag."))
-                                    agent_should_terminate = True
-
-                            else:  # Should not happen with asyncio.wait
-                                logging.warning(f"Task for tool index {tool_idx} not done after wait.")
-                                self.root_span.event(name="task_for_tool_index_not_done_after_wait", level="WARNING",
-                                                     status_message=(
-                                                         f"Task for tool index {tool_idx} not done after wait."))
-                        except Exception as e:
-                            logging.error(f"Error getting result for pending tool execution {tool_idx}: {str(e)}")
-                            self.root_span.event(name="error_getting_result_for_pending_tool_execution", level="ERROR",
-                                                 status_message=(
-                                                     f"Error getting result for pending tool execution {tool_idx}: {str(e)}"))
-                            context.error = e
-                            # Save and Yield tool error status message (even if started was yielded)
-                            error_msg_obj = await self._add_tool_error_message(context)
-                            if error_msg_obj: yield format_for_yield(error_msg_obj)
-                        continue  # Skip further status yielding for this tool index
-
-                    # If status wasn't yielded before (shouldn't happen with current logic), yield it now
-                    try:
-                        if execution["task"].done():
-                            result = execution["task"].result()
-                            context.result = result
-                            tool_results_buffer.append((execution["tool_call"], result, tool_idx, context))
-
-                            # Check if this is a terminating tool
-                            if tool_name in ['ask', 'complete']:
-                                logging.info(
-                                    f"Terminating tool '{tool_name}' completed during streaming. Setting termination flag.")
-                                self.root_span.event(name="terminating_tool_completed_during_streaming", level="DEFAULT",
-                                                     status_message=(
-                                                         f"Terminating tool '{tool_name}' completed during streaming. Setting termination flag."))
-                                agent_should_terminate = True
+                                sequence += 1
+                            else:
+                                finish_reason = "xml_tool_limit_reached"
+                                break

-
-
-                                context, None)
-                            if completed_msg_obj: yield format_for_yield(completed_msg_obj)
-                            yielded_tool_indices.add(tool_idx)
-                    except Exception as e:
-                        logging.error(
-                            f"Error getting result/yielding status for pending tool execution {tool_idx}: {str(e)}")
-                        self.root_span.event(name="error_getting_result_yielding_status_for_pending_tool_execution",
-                                             level="ERROR", status_message=(
-                                f"Error getting result/yielding status for pending tool execution {tool_idx}: {str(e)}"))
-                        context.error = e
-                        # Save and Yield tool error status
-                        error_msg_obj = await self._add_tool_error_message(context)
-                        if error_msg_obj: yield format_for_yield(error_msg_obj)
-                        yielded_tool_indices.add(tool_idx)
+            if len(accumulated_content) == 0:
+                logging.warning(f"StreamResp: LLM response_message content is empty")

-            # Save and yield finish status if limit was reached
             if finish_reason == "xml_tool_limit_reached":
-
-
-
-
-
-
-
-
-                self.root_span.event(name="stream_finished_with_reason_xml_tool_limit_reached_after_xml_tool_calls",
-                                     level="DEFAULT", status_message=(
-                        f"Stream finished with reason: xml_tool_limit_reached after {xml_tool_call_count} XML tool calls"))
+                xml_chunks = self._extract_xml_chunks(accumulated_content)
+                if len(xml_chunks) > self.max_xml_tool_calls:
+                    limited_chunks = xml_chunks[:self.max_xml_tool_calls]
+                    if limited_chunks:
+                        last_chunk = limited_chunks[-1]
+                        last_chunk_pos = accumulated_content.find(last_chunk) + len(last_chunk)
+                        accumulated_content = accumulated_content[:last_chunk_pos]
+            parsed_xml_data = self._parse_xml_tool_calls(accumulated_content)

-            # Calculate if auto-continue is needed if the finish reason is length
             should_auto_continue = (can_auto_continue and finish_reason == 'length')

-
-
-
-
-            if config.max_xml_tool_calls > 0 and xml_tool_call_count >= config.max_xml_tool_calls and xml_chunks_buffer:
-                last_xml_chunk = xml_chunks_buffer[-1]
-                last_chunk_end_pos = accumulated_content.find(last_xml_chunk) + len(last_xml_chunk)
-                if last_chunk_end_pos > 0:
-                    accumulated_content = accumulated_content[:last_chunk_end_pos]
-
-            # ... (Extract complete_native_tool_calls logic) ...
-            # Update complete_native_tool_calls from buffer (initialized earlier)
-            if config.native_tool_calling:
-                for idx, tc_buf in tool_calls_buffer.items():
-                    if tc_buf['id'] and tc_buf['function']['name'] and tc_buf['function']['arguments']:
-                        try:
-                            args = safe_json_parse(tc_buf['function']['arguments'])
-                            complete_native_tool_calls.append({
-                                "id": tc_buf['id'], "type": "function",
-                                "function": {"name": tc_buf['function']['name'], "arguments": args}
-                            })
-                        except json.JSONDecodeError:
-                            continue
+            self.root_span.event(name=f"stream_processor_start[{self.task_no}]({auto_continue_count})",level="DEFAULT",
+                                 status_message=f"finish_reason={finish_reason}, tool_exec_strategy={self.tool_execution_strategy}, "
+                                                f"parsed_xml_data_len={len(parsed_xml_data)}, accumulated_content={len(accumulated_content)}, "
+                                                f"should_auto_continue={should_auto_continue}")

-
-
-
-
-
-
-                is_llm_message=True, metadata={"thread_run_id": thread_run_id}
-            )
-
-            if last_assistant_message_object:
-                # Yield the complete saved object, adding stream_status metadata just for yield
-                yield_metadata = ensure_dict(last_assistant_message_object.get('metadata'), {})
-                yield_metadata['stream_status'] = 'complete'
-                # Format the message for yielding
-                yield_message = last_assistant_message_object.copy()
-                yield_message['metadata'] = yield_metadata
-                yield format_for_yield(yield_message)
-            else:
-                logging.error(f"Failed to save final assistant message for thread {thread_id}")
-                self.root_span.event(name="failed_to_save_final_assistant_message_for_thread", level="ERROR",
-                                     status_message=(f"Failed to save final assistant message for thread {thread_id}"))
-                # Save and yield an error status
-                err_content = {"role": "system", "status_type": "error",
-                               "message": "Failed to save final assistant message"}
-                err_msg_obj = await self.add_response_message(
-                    type="status", content=err_content,
-                    is_llm_message=False, metadata={"thread_run_id": thread_run_id}
-                )
-                if err_msg_obj: yield format_for_yield(err_msg_obj)
-
-            # --- Process All Tool Results Now ---
-            if config.execute_tools:
-                final_tool_calls_to_process = []
-                # ... (Gather final_tool_calls_to_process from native and XML buffers) ...
-                # Gather native tool calls from buffer
-                if config.native_tool_calling and complete_native_tool_calls:
-                    for tc in complete_native_tool_calls:
-                        final_tool_calls_to_process.append({
-                            "function_name": tc["function"]["name"],
-                            "arguments": tc["function"]["arguments"],  # Already parsed object
-                            "id": tc["id"]
-                        })
-                # Gather XML tool calls from buffer (up to limit)
-                parsed_xml_data = []
-                if config.xml_tool_calling:
-                    # Reparse remaining content just in case (should be empty if processed correctly)
-                    xml_chunks = self._extract_xml_chunks(current_xml_content)
-                    xml_chunks_buffer.extend(xml_chunks)
-                    # Process only chunks not already handled in the stream loop
-                    remaining_limit = config.max_xml_tool_calls - xml_tool_call_count if config.max_xml_tool_calls > 0 else len(
-                        xml_chunks_buffer)
-                    xml_chunks_to_process = xml_chunks_buffer[:remaining_limit]  # Ensure limit is respected
-
-                    for chunk in xml_chunks_to_process:
-                        parsed_result = self._parse_xml_tool_call(chunk)
-                        if parsed_result:
-                            tool_call, parsing_details = parsed_result
-                            # Avoid adding if already processed during streaming
-                            if not any(exec['tool_call'] == tool_call for exec in pending_tool_executions):
-                                final_tool_calls_to_process.append(tool_call)
-                                parsed_xml_data.append({'tool_call': tool_call, 'parsing_details': parsing_details})
-
-                all_tool_data_map = {}  # tool_index -> {'tool_call': ..., 'parsing_details': ...}
-                # Add native tool data
-                native_tool_index = 0
-                if config.native_tool_calling and complete_native_tool_calls:
-                    for tc in complete_native_tool_calls:
-                        # Find the corresponding entry in final_tool_calls_to_process if needed
-                        # For now, assume order matches if only native used
-                        exec_tool_call = {
-                            "function_name": tc["function"]["name"],
-                            "arguments": tc["function"]["arguments"],
-                            "id": tc["id"]
-                        }
-                        all_tool_data_map[native_tool_index] = {"tool_call": exec_tool_call, "parsing_details": None}
-                        native_tool_index += 1
+            assistant_msg = None
+            if accumulated_content and not should_auto_continue:
+                message_data = {"role": "assistant", "content": accumulated_content}
+                assistant_msg = self.add_response_message(type="assistant", content=message_data,
+                                                          is_llm_message=True)
+                yield assistant_msg

-
-
-
-                        all_tool_data_map[xml_tool_index_start + idx] = item
+            tool_calls_to_execute = [item['tool_call'] for item in parsed_xml_data]
+            if len(tool_calls_to_execute) > 0:
+                tool_results = await self._execute_tools(tool_calls_to_execute, self.tool_execution_strategy)

-
+                tool_index = 0
+                for i, (returned_tool_call, tool_result) in enumerate(tool_results):
+                    parsed_xml_item = parsed_xml_data[i]
+                    tool_call = parsed_xml_item['tool_call']
+                    parsing_details = parsed_xml_item['parsing_details']
+                    assistant_msg_id = assistant_msg['message_id'] if assistant_msg else None

-
-
-                logging.info(f"Processing {len(tool_results_buffer)} buffered tool results")
-                self.root_span.event(name="processing_buffered_tool_results", level="DEFAULT",
-                                     status_message=(f"Processing {len(tool_results_buffer)} buffered tool results"))
-                for tool_call, result, tool_idx, context in tool_results_buffer:
-                    if last_assistant_message_object: context.assistant_message_id = last_assistant_message_object[
-                        'message_id']
-                    tool_results_map[tool_idx] = (tool_call, result, context)
+                    tool_context = self._create_tool_context(tool_call, tool_index, assistant_msg_id, parsing_details)
+                    tool_context.result = tool_result

-
-
-                logging.info(
-                    f"Executing {len(final_tool_calls_to_process)} tools ({config.tool_execution_strategy}) after stream")
-                self.root_span.event(name="executing_tools_after_stream", level="DEFAULT", status_message=(
-                    f"Executing {len(final_tool_calls_to_process)} tools ({config.tool_execution_strategy}) after stream"))
-                results_list = await self._execute_tools(final_tool_calls_to_process,
-                                                         config.tool_execution_strategy)
-                current_tool_idx = 0
-                for tc, res in results_list:
-                    # Map back using all_tool_data_map which has correct indices
-                    if current_tool_idx in all_tool_data_map:
-                        tool_data = all_tool_data_map[current_tool_idx]
-                        context = self._create_tool_context(
-                            tc, current_tool_idx,
-                            last_assistant_message_object['message_id'] if last_assistant_message_object else None,
-                            tool_data.get('parsing_details')
-                        )
-                        context.result = res
-                        tool_results_map[current_tool_idx] = (tc, res, context)
-                    else:
-                        logging.warning(f"Could not map result for tool index {current_tool_idx}")
-                        self.root_span.event(name="could_not_map_result_for_tool_index", level="WARNING",
-                                             status_message=(f"Could not map result for tool index {current_tool_idx}"))
-                    current_tool_idx += 1
+                    tool_start_msg = self._add_tool_start_message(tool_context)
+                    yield format_for_yield(tool_start_msg)

-
-            if tool_results_map:
-                logging.info(f"Saving and yielding {len(tool_results_map)} final tool result messages")
-                self.root_span.event(name="saving_and_yielding_final_tool_result_messages", level="DEFAULT",
-                                     status_message=(
-                                         f"Saving and yielding {len(tool_results_map)} final tool result messages"))
-                for tool_idx in sorted(tool_results_map.keys()):
-                    tool_call, result, context = tool_results_map[tool_idx]
-                    context.result = result
-                    if not context.assistant_message_id and last_assistant_message_object:
-                        context.assistant_message_id = last_assistant_message_object['message_id']
+                    tool_message = self._add_tool_messsage(tool_call, tool_result, self.xml_adding_strategy, assistant_msg_id, parsing_details)

-
-
-                    started_msg_obj = await self._add_tool_start_message(context)
-                    if started_msg_obj: yield format_for_yield(started_msg_obj)
-                    yielded_tool_indices.add(tool_idx)  # Mark status yielded
+                    tool_completed_msg = self._add_tool_completed_message(tool_context, tool_message['message_id'])
+                    yield format_for_yield(tool_completed_msg)

-
-                    saved_tool_result_object = await self._add_tool_messsage(tool_call, result, config.xml_adding_strategy,
-                                                                             context.assistant_message_id, context.parsing_details
-                    )
+                    yield format_for_yield(tool_message)

-
-
-
-                        saved_tool_result_object['message_id'] if saved_tool_result_object else None
-                    )
-                    if completed_msg_obj: yield format_for_yield(completed_msg_obj)
-                    # Don't add to yielded_tool_indices here, completion status is separate yield
+                    if tool_completed_msg["metadata"].get("agent_should_terminate") == "true":
+                        finish_reason = "completed"
+                        break

-
-
-
-
-                else:
-                    logging.error(
-                        f"Failed to save tool result for index {tool_idx}, not yielding result message.")
-                    self.root_span.event(name="failed_to_save_tool_result_for_index", level="ERROR",
-                                         status_message=(
-                                             f"Failed to save tool result for index {tool_idx}, not yielding result message."))
-                    # Optionally yield error status for saving failure?
+                    tool_index += 1
+            else:
+                finish_reason = "non_tool_call"
+                logging.warning(f"StreamResp: tool_calls is empty, No Tool need to call !")

-
-            if finish_reason and finish_reason != "xml_tool_limit_reached":
+            if finish_reason:
                 finish_content = {"status_type": "finish", "finish_reason": finish_reason}
-
-
-                    is_llm_message=False, metadata={"thread_run_id": thread_run_id}
-                )
-                if finish_msg_obj: yield format_for_yield(finish_msg_obj)
-
-            # Check if agent should terminate after processing pending tools
-            if agent_should_terminate:
-                logging.info(
-                    "Agent termination requested after executing ask/complete tool. Stopping further processing.")
-                self.root_span.event(name="agent_termination_requested", level="DEFAULT",
-                                     status_message="Agent termination requested after executing ask/complete tool. Stopping further processing.")
-
-                # Set finish reason to indicate termination
-                finish_reason = "agent_terminated"
-
-                # Save and yield termination status
-                finish_content = {"status_type": "finish", "finish_reason": "agent_terminated"}
-                finish_msg_obj = await self.add_response_message(
-                    type="status", content=finish_content,
-                    is_llm_message=False, metadata={"thread_run_id": thread_run_id}
-                )
-                if finish_msg_obj: yield format_for_yield(finish_msg_obj)
-
-                # Save assistant_response_end BEFORE terminating
-                if last_assistant_message_object:
-                    try:
-                        # Calculate response time if we have timing data
-                        if streaming_metadata["first_chunk_time"] and streaming_metadata["last_chunk_time"]:
-                            streaming_metadata["response_ms"] = (streaming_metadata["last_chunk_time"] -
-                                                                 streaming_metadata["first_chunk_time"]) * 1000
-
-                        # Create a LiteLLM-like response object for streaming (before termination)
-                        # Check if we have any actual usage data
-                        has_usage_data = (
-                            streaming_metadata["usage"]["prompt_tokens"] > 0 or
-                            streaming_metadata["usage"]["completion_tokens"] > 0 or
-                            streaming_metadata["usage"]["total_tokens"] > 0
-                        )
-
-                        assistant_end_content = {
-                            "choices": [
-                                {
-                                    "finish_reason": finish_reason or "stop",
-                                    "index": 0,
-                                    "message": {
-                                        "role": "assistant",
-                                        "content": accumulated_content,
-                                        "tool_calls": complete_native_tool_calls or None
-                                    }
-                                }
-                            ],
-                            "created": streaming_metadata.get("created"),
-                            "model": streaming_metadata.get("model", llm_model),
-                            "usage": streaming_metadata["usage"],  # Always include usage like LiteLLM does
-                            "streaming": True,  # Add flag to indicate this was reconstructed from streaming
-                        }
-
-                        # Only include response_ms if we have timing data
-                        if streaming_metadata.get("response_ms"):
-                            assistant_end_content["response_ms"] = streaming_metadata["response_ms"]
-
-                        await self.add_response_message(
-                            type="assistant_response_end",
-                            content=assistant_end_content,
-                            is_llm_message=False,
-                            metadata={"thread_run_id": thread_run_id}
-                        )
-                        logging.info("Assistant response end saved for stream (before termination)")
-                    except Exception as e:
-                        logging.error(f"Error saving assistant response end for stream (before termination): {str(e)}")
-                        self.root_span.event(name="error_saving_assistant_response_end_for_stream_before_termination",
-                                             level="ERROR", status_message=(
-                                f"Error saving assistant response end for stream (before termination): {str(e)}"))
-
-                # Skip all remaining processing and go to finally block
-                return
-
-            # --- Save and Yield assistant_response_end ---
-            # Only save assistant_response_end if not auto-continuing (response is actually complete)
-            if not should_auto_continue:
-                if last_assistant_message_object:  # Only save if assistant message was saved
-                    try:
-                        # Calculate response time if we have timing data
-                        if streaming_metadata["first_chunk_time"] and streaming_metadata["last_chunk_time"]:
-                            streaming_metadata["response_ms"] = (streaming_metadata["last_chunk_time"] -
-                                                                 streaming_metadata["first_chunk_time"]) * 1000
-
-                        # Create a LiteLLM-like response object for streaming
-                        # Check if we have any actual usage data
-                        has_usage_data = (
-                            streaming_metadata["usage"]["prompt_tokens"] > 0 or
-                            streaming_metadata["usage"]["completion_tokens"] > 0 or
-                            streaming_metadata["usage"]["total_tokens"] > 0
-                        )
-
-                        assistant_end_content = {
-                            "choices": [
-                                {
-                                    "finish_reason": finish_reason or "stop",
-                                    "index": 0,
-                                    "message": {
-                                        "role": "assistant",
-                                        "content": accumulated_content,
-                                        "tool_calls": complete_native_tool_calls or None
-                                    }
-                                }
-                            ],
-                            "created": streaming_metadata.get("created"),
-                            "model": streaming_metadata.get("model", llm_model),
-                            "usage": streaming_metadata["usage"],  # Always include usage like LiteLLM does
-                            "streaming": True,  # Add flag to indicate this was reconstructed from streaming
-                        }
-
-                        # Only include response_ms if we have timing data
-                        if streaming_metadata.get("response_ms"):
-                            assistant_end_content["response_ms"] = streaming_metadata["response_ms"]
-
-                        await self.add_response_message(
-                            type="assistant_response_end",
-                            content=assistant_end_content,
-                            is_llm_message=False,
-                            metadata={"thread_run_id": thread_run_id}
-                        )
-                        logging.info("Assistant response end saved for stream")
-                    except Exception as e:
-                        logging.error(f"Error saving assistant response end for stream: {str(e)}")
-                        self.root_span.event(name="error_saving_assistant_response_end_for_stream", level="ERROR",
-                                             status_message=(f"Error saving assistant response end for stream: {str(e)}"))
-
+                finish_msg = self.add_response_message(type="status", content=finish_content, is_llm_message=False)
+                yield format_for_yield(finish_msg)
         except Exception as e:
-
-            self.root_span.event(name="
-
-
+            trace = log_trace(e, f"StreamResp: Process response accumulated_content:\n {accumulated_content}")
+            self.root_span.event(name="stream_response_process_error", level="ERROR",
+                                 status_message=f"Process streaming response error: {e}",
+                                 metadata={"content": accumulated_content, "trace": trace})

-
-
-
-                type="status", content=err_content,
-                is_llm_message=False,
-                metadata={"thread_run_id": thread_run_id if 'thread_run_id' in locals() else None}
-            )
-            if err_msg_obj: yield format_for_yield(err_msg_obj)  # Yield the saved error message
-            # Re-raise the same exception (not a new one) to ensure proper error propagation
-            logging.critical(f"Re-raising error to stop further processing: {str(e)}")
-            self.root_span.event(name="re_raising_error_to_stop_further_processing", level="ERROR",
-                                 status_message=(f"Re-raising error to stop further processing: {str(e)}"))
-            else:
-                logging.error(f"AnthropicException - Overloaded detected - Falling back to OpenRouter: {str(e)}",
-                              exc_info=True)
-                self.root_span.event(name="anthropic_exception_overloaded_detected", level="ERROR", status_message=(
-                    f"AnthropicException - Overloaded detected - Falling back to OpenRouter: {str(e)}"))
-            raise  # Use bare 'raise' to preserve the original exception with its traceback
+            content = {"role": "system", "status_type": "error", "message": f"Process streaming response error: {e}"}
+            error_msg = self.add_response_message(type="status", content=content, is_llm_message=False)
+            yield format_for_yield(error_msg)

+            raise  # Use bare 'raise' to preserve the original exception with its traceback
         finally:
-            # Update continuous state for potential auto-continue
             if should_auto_continue:
                 continuous_state['accumulated_content'] = accumulated_content
-                continuous_state['
-
-                logging.info(f"Updated continuous state for auto-continue with {len(accumulated_content)} chars")
-            else:
-                # Save and Yield the final thread_run_end status (only if not auto-continuing and finish_reason is not 'length')
-                try:
-                    end_content = {"status_type": "thread_run_end"}
-                    end_msg_obj = await self.add_response_message(
-                        type="status", content=end_content,
-                        is_llm_message=False,
-                        metadata={"thread_run_id": thread_run_id if 'thread_run_id' in locals() else None}
-                    )
-                    if end_msg_obj: yield format_for_yield(end_msg_obj)
-                except Exception as final_e:
-                    logging.error(f"Error in finally block: {str(final_e)}", exc_info=True)
-                    self.root_span.event(name="error_in_finally_block", level="ERROR",
-                                         status_message=(f"Error in finally block: {str(final_e)}"))
+                continuous_state['assistant_msg_sequence'] = sequence
+                logging.warning(f"StreamResp: Updated continuous state for auto-continue with {len(accumulated_content)} chars")