remdb 0.3.226__py3-none-any.whl → 0.3.245__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -1,42 +1,36 @@
 """
 OpenAI-compatible streaming relay for Pydantic AI agents.
 
-Design Pattern:
-- Uses Pydantic AI's agent.iter() to capture full execution including tool calls
-- Emits rich SSE events: reasoning, tool_call, progress, metadata, text_delta
-- Proper OpenAI SSE format with data: prefix and [DONE] terminator
-- Error handling with graceful degradation
-
-Key Insight
-- agent.run_stream() stops after first output, missing tool calls
-- agent.iter() provides complete execution with tool call visibility
-- Use PartStartEvent to detect tool calls and thinking parts
-- Use PartDeltaEvent with TextPartDelta/ThinkingPartDelta for streaming
-- Use PartEndEvent to detect tool completion
-- Use FunctionToolResultEvent to get tool results
-
-Multi-Agent Context Propagation:
-- AgentContext is set via agent_context_scope() before agent.iter()
-- Child agents (via ask_agent tool) can access parent context via get_current_context()
-- Context includes user_id, tenant_id, session_id, is_eval for proper scoping
-
-SSE Format (OpenAI-compatible):
-data: {"id": "chatcmpl-...", "choices": [{"delta": {"content": "..."}}]}\\n\\n
-data: [DONE]\\n\\n
-
-Extended SSE Format (Custom Events):
-event: reasoning\\ndata: {"type": "reasoning", "content": "..."}\\n\\n
-event: tool_call\\ndata: {"type": "tool_call", "tool_name": "...", "status": "started"}\\n\\n
-event: progress\\ndata: {"type": "progress", "step": 1, "total_steps": 3}\\n\\n
-event: metadata\\ndata: {"type": "metadata", "confidence": 0.95}\\n\\n
-
-See sse_events.py for the full event type definitions.
+Architecture:
+```
+User Request → stream_openai_response → agent.iter() → SSE Events → Client
+    ├── Parent agent events (text, tool calls)
+    └── Child agent events (via ask_agent tool)
+            ↓
+        Event Sink (asyncio.Queue)
+            ↓
+        drain_child_events() → SSE + DB
+```
+
+Modules:
+- streaming.py: Main workflow orchestrator (this file)
+- streaming_utils.py: Pure utility functions, StreamingState dataclass
+- child_streaming.py: Child agent event handling
+
+Key Design Decision (DUPLICATION FIX):
+When child_content is streamed, state.child_content_streamed is set True.
+Parent TextPartDelta events are SKIPPED when this flag is True,
+preventing content from being emitted twice.
 """
 
 from __future__ import annotations
 
+import asyncio
 import json
-import time
 import uuid
 from typing import TYPE_CHECKING, AsyncGenerator
 
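For orientation, here is the OpenAI-compatible SSE framing and the duplication guard described in the new docstring, reduced to a minimal sketch. Names mirror the docstring; this is illustrative, not the module's actual code.

```python
import json

def sse_data(payload: dict) -> str:
    # Each chunk is a "data: <json>" line terminated by a blank line.
    return f"data: {json.dumps(payload)}\n\n"

async def relay(deltas, state):
    async for delta in deltas:
        # DUPLICATION FIX: once a child agent has streamed content,
        # parent text deltas are skipped so nothing is emitted twice.
        if state.child_content_streamed:
            continue
        yield sse_data({"choices": [{"delta": {"content": delta}}]})
    # OpenAI-compatible stream terminator
    yield "data: [DONE]\n\n"
```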
@@ -54,7 +48,17 @@ from pydantic_ai.messages import (
     ToolCallPart,
 )
 
-from .otel_utils import get_current_trace_context, get_tracer
+from .child_streaming import drain_child_events, stream_with_child_events, process_child_event
+from .streaming_utils import (
+    StreamingState,
+    build_content_chunk,
+    build_progress_event,
+    build_tool_start_event,
+    extract_metadata_from_result,
+    extract_tool_args,
+    log_tool_call,
+)
+from .otel_utils import get_current_trace_context
 from .models import (
     ChatCompletionMessageDelta,
     ChatCompletionStreamChoice,
@@ -69,6 +73,9 @@ from .sse_events import (
     ToolCallEvent,
     format_sse_event,
 )
+from ....services.session import SessionMessageStore
+from ....settings import settings
+from ....utils.date_utils import to_iso, utc_now
 
 if TYPE_CHECKING:
     from ....agentic.context import AgentContext
@@ -147,35 +154,16 @@ async def stream_openai_response(
         event: done
         data: {"type": "done", "reason": "stop"}
     """
-    if request_id is None:
-        request_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"
-
-    created_at = int(time.time())
-    start_time = time.time()
-    is_first_chunk = True
-    reasoning_step = 0
-    current_step = 0
-    total_steps = 3  # Model request, tool execution (optional), final response
-    token_count = 0
-
-    # Track active tool calls for completion events
-    # Maps index -> (tool_name, tool_id) for correlating start/end events
-    active_tool_calls: dict[int, tuple[str, str]] = {}
-    # Queue of tool calls awaiting completion (FIFO for matching)
-    pending_tool_completions: list[tuple[str, str]] = []
-    # Track if metadata was registered via register_metadata tool
-    metadata_registered = False
-    # Track which agent is actually responding (may be child agent if delegated)
-    responding_agent: str | None = None
-    # Track pending tool calls with full data for persistence
-    # Maps tool_id -> {"tool_name": str, "tool_id": str, "arguments": dict}
-    pending_tool_data: dict[str, dict] = {}
+    # Initialize streaming state
+    state = StreamingState.create(model=model, request_id=request_id)
+
+    # Get effective user_id for database operations
+    effective_user_id = agent_context.user_id if agent_context else None
 
     # Import context functions for multi-agent support
     from ....agentic.context import set_current_context, set_event_sink
 
     # Set up context for multi-agent propagation
-    # This allows child agents (via ask_agent tool) to access parent context
     previous_context = None
     if agent_context is not None:
         from ....agentic.context import get_current_context
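The StreamingState definition itself never appears in this diff. The following is a hedged reconstruction of what streaming_utils.StreamingState plausibly contains, inferred purely from the fields and methods the diff touches (state.request_id, state.created_at, state.is_first_chunk, state.latency_ms(), and so on); the real dataclass may differ.

```python
import time
import uuid
from dataclasses import dataclass, field

@dataclass
class StreamingState:
    """Mutable per-request streaming state (reconstructed sketch)."""
    model: str
    request_id: str
    created_at: int
    start_time: float
    is_first_chunk: bool = True
    reasoning_step: int = 0
    current_step: int = 0
    total_steps: int = 3
    token_count: int = 0
    metadata_registered: bool = False
    child_content_streamed: bool = False
    responding_agent: str | None = None
    active_tool_calls: dict[int, tuple[str, str]] = field(default_factory=dict)
    pending_tool_completions: list[tuple[str, str]] = field(default_factory=list)
    pending_tool_data: dict[str, dict] = field(default_factory=dict)

    @classmethod
    def create(cls, model: str, request_id: str | None = None) -> "StreamingState":
        # Mirrors the initialization the removed inline code performed.
        return cls(
            model=model,
            request_id=request_id or f"chatcmpl-{uuid.uuid4().hex[:24]}",
            created_at=int(time.time()),
            start_time=time.time(),
        )

    def latency_ms(self) -> int:
        return int((time.time() - self.start_time) * 1000)
```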
@@ -183,20 +171,17 @@ async def stream_openai_response(
         set_current_context(agent_context)
 
     # Set up event sink for child agent event proxying
-    # Child agents (via ask_agent) will push their events here
-    import asyncio
     child_event_sink: asyncio.Queue = asyncio.Queue()
     set_event_sink(child_event_sink)
 
     try:
         # Emit initial progress event
-        current_step = 1
-        yield format_sse_event(ProgressEvent(
-            step=current_step,
-            total_steps=total_steps,
+        state.current_step = 1
+        yield build_progress_event(
+            step=state.current_step,
+            total_steps=state.total_steps,
             label="Processing request",
-            status="in_progress"
-        ))
+        )
 
         # Use agent.iter() to get complete execution with tool calls
         # Pass message_history if available for proper tool call/return pairing
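The event sink set up here is the producer/consumer bridge between parent and child agents. A self-contained sketch of the producer side, with hypothetical names (the module-level `_event_sink` and `emit_child_content` are illustrative, not the package's API); the dict keys do match what the consumer in this file reads ("type", "agent_name", "content").

```python
import asyncio

# Hypothetical module-level sink, standing in for whatever
# ....agentic.context keeps behind set_event_sink().
_event_sink: asyncio.Queue | None = None

def set_event_sink(sink: asyncio.Queue | None) -> None:
    global _event_sink
    _event_sink = sink

def emit_child_content(agent_name: str, text: str) -> None:
    # Called from inside a child agent (e.g. the ask_agent tool) to proxy
    # its streamed text up to the parent's SSE relay.
    if _event_sink is not None:
        _event_sink.put_nowait(
            {"type": "child_content", "agent_name": agent_name, "content": text}
        )
```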
@@ -222,11 +207,11 @@ async def stream_openai_response(
                             if isinstance(event, PartStartEvent) and isinstance(
                                 event.part, ThinkingPart
                             ):
-                                reasoning_step += 1
+                                state.reasoning_step += 1
                                 if event.part.content:
                                     yield format_sse_event(ReasoningEvent(
                                         content=event.part.content,
-                                        step=reasoning_step
+                                        step=state.reasoning_step
                                     ))
 
                             # Reasoning delta (streaming thinking)
@@ -236,7 +221,7 @@ async def stream_openai_response(
                                 if event.delta.content_delta:
                                     yield format_sse_event(ReasoningEvent(
                                         content=event.delta.content_delta,
-                                        step=reasoning_step
+                                        step=state.reasoning_step
                                     ))
 
                             # ============================================
@@ -245,28 +230,11 @@ async def stream_openai_response(
                             elif isinstance(event, PartStartEvent) and isinstance(
                                 event.part, TextPart
                             ):
-                                # TextPart may contain initial content that needs to be emitted
+                                # Skip if child already streamed content
+                                if state.child_content_streamed:
+                                    continue
                                 if event.part.content:
-                                    content = event.part.content
-                                    token_count += len(content.split())
-
-                                    content_chunk = ChatCompletionStreamResponse(
-                                        id=request_id,
-                                        created=created_at,
-                                        model=model,
-                                        choices=[
-                                            ChatCompletionStreamChoice(
-                                                index=0,
-                                                delta=ChatCompletionMessageDelta(
-                                                    role="assistant" if is_first_chunk else None,
-                                                    content=content,
-                                                ),
-                                                finish_reason=None,
-                                            )
-                                        ],
-                                    )
-                                    is_first_chunk = False
-                                    yield f"data: {content_chunk.model_dump_json()}\n\n"
+                                    yield build_content_chunk(state, event.part.content)
 
                             # ============================================
                             # TOOL CALL START EVENTS
@@ -276,94 +244,39 @@ async def stream_openai_response(
                             ):
                                 tool_name = event.part.tool_name
 
-                                # Handle final_result specially - it's Pydantic AI's
-                                # internal tool for structured output
+                                # Handle final_result (Pydantic AI's internal tool)
                                 if tool_name == "final_result":
-                                    # Extract the structured result and emit as content
-                                    args_dict = None
-                                    if event.part.args is not None:
-                                        if hasattr(event.part.args, 'args_dict'):
-                                            args_dict = event.part.args.args_dict
-                                        elif isinstance(event.part.args, dict):
-                                            args_dict = event.part.args
-
+                                    args_dict = extract_tool_args(event.part)
                                     if args_dict:
-                                        # Emit the structured result as JSON content
                                         result_json = json.dumps(args_dict, indent=2)
-                                        content_chunk = ChatCompletionStreamResponse(
-                                            id=request_id,
-                                            created=created_at,
-                                            model=model,
-                                            choices=[
-                                                ChatCompletionStreamChoice(
-                                                    index=0,
-                                                    delta=ChatCompletionMessageDelta(
-                                                        role="assistant" if is_first_chunk else None,
-                                                        content=result_json,
-                                                    ),
-                                                    finish_reason=None,
-                                                )
-                                            ],
-                                        )
-                                        is_first_chunk = False
-                                        yield f"data: {content_chunk.model_dump_json()}\n\n"
-                                    continue  # Skip regular tool call handling
+                                        yield build_content_chunk(state, result_json)
+                                    continue
 
                                 tool_id = f"call_{uuid.uuid4().hex[:8]}"
-                                active_tool_calls[event.index] = (tool_name, tool_id)
-                                # Queue for completion matching (FIFO)
-                                pending_tool_completions.append((tool_name, tool_id))
-
-                                # Emit tool_call SSE event (started)
-                                # Try to get arguments as dict
-                                args_dict = None
-                                if event.part.args is not None:
-                                    if hasattr(event.part.args, 'args_dict'):
-                                        args_dict = event.part.args.args_dict
-                                    elif isinstance(event.part.args, dict):
-                                        args_dict = event.part.args
-                                    elif isinstance(event.part.args, str):
-                                        # Parse JSON string args (common with pydantic-ai)
-                                        try:
-                                            args_dict = json.loads(event.part.args)
-                                        except json.JSONDecodeError:
-                                            logger.warning(f"Failed to parse tool args as JSON: {event.part.args[:100]}")
-
-                                # Log tool call with key parameters
-                                if args_dict and tool_name == "search_rem":
-                                    query_type = args_dict.get("query_type", "?")
-                                    limit = args_dict.get("limit", 20)
-                                    table = args_dict.get("table", "")
-                                    query_text = args_dict.get("query_text", args_dict.get("entity_key", ""))
-                                    if query_text and len(query_text) > 50:
-                                        query_text = query_text[:50] + "..."
-                                    logger.info(f"🔧 {tool_name} {query_type.upper()} '{query_text}' table={table} limit={limit}")
-                                else:
-                                    logger.info(f"🔧 {tool_name}")
+                                state.active_tool_calls[event.index] = (tool_name, tool_id)
+                                state.pending_tool_completions.append((tool_name, tool_id))
 
-                                yield format_sse_event(ToolCallEvent(
-                                    tool_name=tool_name,
-                                    tool_id=tool_id,
-                                    status="started",
-                                    arguments=args_dict
-                                ))
+                                # Extract and log arguments
+                                args_dict = extract_tool_args(event.part)
+                                log_tool_call(tool_name, args_dict)
+
+                                yield build_tool_start_event(tool_name, tool_id, args_dict)
 
-                                # Track tool call data for persistence (especially register_metadata)
-                                pending_tool_data[tool_id] = {
+                                # Track for persistence
+                                state.pending_tool_data[tool_id] = {
                                     "tool_name": tool_name,
                                     "tool_id": tool_id,
                                     "arguments": args_dict,
                                 }
 
                                 # Update progress
-                                current_step = 2
-                                total_steps = 4  # Added tool execution step
-                                yield format_sse_event(ProgressEvent(
-                                    step=current_step,
-                                    total_steps=total_steps,
+                                state.current_step = 2
+                                state.total_steps = 4
+                                yield build_progress_event(
+                                    step=state.current_step,
+                                    total_steps=state.total_steps,
                                     label=f"Calling {tool_name}",
-                                    status="in_progress"
-                                ))
+                                )
 
                             # ============================================
                             # TOOL CALL COMPLETION (PartEndEvent)
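The extract_tool_args helper that replaces the inline parsing above is easy to infer: the removed lines show exactly the normalization being factored out. A sketch of its plausible shape (the real streaming_utils version may differ in details):

```python
import json
import logging

logger = logging.getLogger(__name__)

def extract_tool_args(part) -> dict | None:
    """Normalize a ToolCallPart's args (object, dict, or JSON string) to a dict."""
    args = part.args
    if args is None:
        return None
    if hasattr(args, "args_dict"):
        return args.args_dict
    if isinstance(args, dict):
        return args
    if isinstance(args, str) and args:
        # Parse JSON string args (common with pydantic-ai)
        try:
            return json.loads(args)
        except json.JSONDecodeError:
            logger.warning(f"Failed to parse tool args as JSON: {args[:100]}")
    return None
```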
@@ -371,28 +284,14 @@ async def stream_openai_response(
                             elif isinstance(event, PartEndEvent) and isinstance(
                                 event.part, ToolCallPart
                             ):
-                                if event.index in active_tool_calls:
-                                    tool_name, tool_id = active_tool_calls[event.index]
-
-                                    # Extract full args from completed ToolCallPart
-                                    # (PartStartEvent only has empty/partial args during streaming)
-                                    args_dict = None
-                                    if event.part.args is not None:
-                                        if hasattr(event.part.args, 'args_dict'):
-                                            args_dict = event.part.args.args_dict
-                                        elif isinstance(event.part.args, dict):
-                                            args_dict = event.part.args
-                                        elif isinstance(event.part.args, str) and event.part.args:
-                                            try:
-                                                args_dict = json.loads(event.part.args)
-                                            except json.JSONDecodeError:
-                                                logger.warning(f"Failed to parse tool args: {event.part.args[:100]}")
-
-                                    # Update pending_tool_data with complete args
-                                    if tool_id in pending_tool_data:
-                                        pending_tool_data[tool_id]["arguments"] = args_dict
-
-                                    del active_tool_calls[event.index]
+                                if event.index in state.active_tool_calls:
+                                    tool_name, tool_id = state.active_tool_calls[event.index]
+                                    args_dict = extract_tool_args(event.part)
+
+                                    if tool_id in state.pending_tool_data:
+                                        state.pending_tool_data[tool_id]["arguments"] = args_dict
+
+                                    del state.active_tool_calls[event.index]
 
                             # ============================================
                             # TEXT CONTENT DELTA
@@ -400,186 +299,110 @@ async def stream_openai_response(
                             elif isinstance(event, PartDeltaEvent) and isinstance(
                                 event.delta, TextPartDelta
                             ):
+                                # DUPLICATION FIX: Skip parent text if child already streamed content
+                                # Child agents stream via child_content events in ask_agent tool.
+                                # If parent tries to echo that content, skip it.
+                                if state.child_content_streamed:
+                                    logger.debug("Skipping parent TextPartDelta - child content already streamed")
+                                    continue
+
                                 content = event.delta.content_delta
-                                token_count += len(content.split())  # Rough token estimate
-
-                                content_chunk = ChatCompletionStreamResponse(
-                                    id=request_id,
-                                    created=created_at,
-                                    model=model,
-                                    choices=[
-                                        ChatCompletionStreamChoice(
-                                            index=0,
-                                            delta=ChatCompletionMessageDelta(
-                                                role="assistant" if is_first_chunk else None,
-                                                content=content,
-                                            ),
-                                            finish_reason=None,
-                                        )
-                                    ],
-                                )
-                                is_first_chunk = False
-                                yield f"data: {content_chunk.model_dump_json()}\n\n"
+                                yield build_content_chunk(state, content)
 
                 # ============================================
                 # TOOL EXECUTION NODE
                 # ============================================
                 elif Agent.is_call_tools_node(node):
                     async with node.stream(agent_run.ctx) as tools_stream:
-                        async for tool_event in tools_stream:
-                            # First, drain any child agent events that were pushed while tool was executing
-                            # This handles ask_agent streaming - child events are proxied here
-                            while not child_event_sink.empty():
-                                try:
-                                    child_event = child_event_sink.get_nowait()
-                                    event_type = child_event.get("type", "")
-                                    child_agent = child_event.get("agent_name", "child")
-
-                                    if event_type == "child_tool_start":
-                                        # Emit child tool start as a nested tool call
-                                        child_tool_id = f"call_{uuid.uuid4().hex[:8]}"
-                                        # Ensure arguments is a dict or None (not empty string)
-                                        child_args = child_event.get("arguments")
-                                        if not isinstance(child_args, dict):
-                                            child_args = None
-                                        yield format_sse_event(ToolCallEvent(
-                                            tool_name=f"{child_agent}:{child_event.get('tool_name', 'tool')}",
-                                            tool_id=child_tool_id,
-                                            status="started",
-                                            arguments=child_args,
-                                        ))
-                                    elif event_type == "child_content":
-                                        # Emit child content as assistant content
-                                        # Track which child agent is responding
-                                        responding_agent = child_agent
-                                        content = child_event.get("content", "")
-                                        if content:
-                                            content_chunk = ChatCompletionStreamResponse(
-                                                id=request_id,
-                                                created=created_at,
-                                                model=model,
-                                                choices=[
-                                                    ChatCompletionStreamChoice(
-                                                        index=0,
-                                                        delta=ChatCompletionMessageDelta(
-                                                            role="assistant" if is_first_chunk else None,
-                                                            content=content,
-                                                        ),
-                                                        finish_reason=None,
-                                                    )
-                                                ],
-                                            )
-                                            is_first_chunk = False
-                                            yield f"data: {content_chunk.model_dump_json()}\n\n"
-                                    elif event_type == "child_tool_result":
-                                        # Emit child tool completion
-                                        result = child_event.get("result", {})
-                                        # Emit metadata event for child agent if it registered metadata
-                                        if isinstance(result, dict) and result.get("_metadata_event"):
-                                            responding_agent = result.get("agent_schema") or responding_agent
-                                            yield format_sse_event(MetadataEvent(
-                                                message_id=message_id,
-                                                session_id=session_id,
-                                                agent_schema=agent_schema,
-                                                responding_agent=responding_agent,
-                                                confidence=result.get("confidence"),
-                                                extra={"risk_level": result.get("risk_level")} if result.get("risk_level") else None,
-                                            ))
-                                        yield format_sse_event(ToolCallEvent(
-                                            tool_name=f"{child_agent}:tool",
-                                            tool_id=f"call_{uuid.uuid4().hex[:8]}",
-                                            status="completed",
-                                            result=str(result)[:200] if result else None,
-                                        ))
-                                except Exception as e:
-                                    logger.warning(f"Error processing child event: {e}")
+                        # Use concurrent multiplexer to handle both tool events
+                        # and child agent events as they arrive (fixes streaming lag)
+                        async for event_type, event_data in stream_with_child_events(
+                            tools_stream=tools_stream,
+                            child_event_sink=child_event_sink,
+                            state=state,
+                            session_id=session_id,
+                            user_id=effective_user_id,
+                            message_id=message_id,
+                            agent_schema=agent_schema,
+                        ):
+                            # Handle child events (streamed from ask_agent)
+                            if event_type == "child":
+                                async for chunk in process_child_event(
+                                    child_event=event_data,
+                                    state=state,
+                                    session_id=session_id,
+                                    user_id=effective_user_id,
+                                    message_id=message_id,
+                                    agent_schema=agent_schema,
+                                ):
+                                    yield chunk
+                                continue
+
+                            # Handle tool events
+                            tool_event = event_data
 
                             # Tool result event - emit completion
                             if isinstance(tool_event, FunctionToolResultEvent):
                                 # Get the tool name/id from the pending queue (FIFO)
-                                if pending_tool_completions:
-                                    tool_name, tool_id = pending_tool_completions.pop(0)
+                                if state.pending_tool_completions:
+                                    tool_name, tool_id = state.pending_tool_completions.pop(0)
                                 else:
-                                    # Fallback if queue is empty (shouldn't happen)
                                     tool_name = "tool"
                                     tool_id = f"call_{uuid.uuid4().hex[:8]}"
 
-                                # Check if this is a register_metadata tool result
-                                # It returns a dict with _metadata_event: True marker
                                 result_content = tool_event.result.content if hasattr(tool_event.result, 'content') else tool_event.result
                                 is_metadata_event = False
 
-                                if isinstance(result_content, dict) and result_content.get("_metadata_event"):
+                                # Handle register_metadata tool results
+                                metadata = extract_metadata_from_result(result_content)
+                                if metadata:
                                     is_metadata_event = True
-                                    metadata_registered = True  # Skip default metadata at end
-                                    # Emit MetadataEvent with registered values
-                                    registered_confidence = result_content.get("confidence")
-                                    registered_sources = result_content.get("sources")
-                                    registered_references = result_content.get("references")
-                                    registered_flags = result_content.get("flags")
-                                    # Session naming
-                                    registered_session_name = result_content.get("session_name")
-                                    # Risk assessment fields
-                                    registered_risk_level = result_content.get("risk_level")
-                                    registered_risk_score = result_content.get("risk_score")
-                                    registered_risk_reasoning = result_content.get("risk_reasoning")
-                                    registered_recommended_action = result_content.get("recommended_action")
-                                    # Extra fields
-                                    registered_extra = result_content.get("extra")
-                                    # Only set responding_agent if not already set by child events
-                                    # Child agents should take precedence - they're the actual responders
-                                    if not responding_agent:
-                                        responding_agent = result_content.get("agent_schema")
+                                    state.metadata_registered = True
+
+                                    # Only set responding_agent if not already set by child
+                                    if not state.responding_agent and metadata.get("agent_schema"):
+                                        state.responding_agent = metadata["agent_schema"]
 
                                     logger.info(
-                                        f"📊 Metadata registered: confidence={registered_confidence}, "
-                                        f"session_name={registered_session_name}, "
-                                        f"risk_level={registered_risk_level}, sources={registered_sources}"
+                                        f"📊 Metadata: confidence={metadata.get('confidence')}, "
+                                        f"risk_level={metadata.get('risk_level')}"
                                     )
 
-                                    # Build extra dict with risk fields and any custom extras
+                                    # Build extra dict with risk fields
                                     extra_data = {}
-                                    if registered_risk_level is not None:
-                                        extra_data["risk_level"] = registered_risk_level
-                                    if registered_risk_score is not None:
-                                        extra_data["risk_score"] = registered_risk_score
-                                    if registered_risk_reasoning is not None:
-                                        extra_data["risk_reasoning"] = registered_risk_reasoning
-                                    if registered_recommended_action is not None:
-                                        extra_data["recommended_action"] = registered_recommended_action
-                                    if registered_extra:
-                                        extra_data.update(registered_extra)
-
-                                    # Emit metadata event immediately
+                                    for field in ["risk_level", "risk_score", "risk_reasoning", "recommended_action"]:
+                                        if metadata.get(field) is not None:
+                                            extra_data[field] = metadata[field]
+                                    if metadata.get("extra"):
+                                        extra_data.update(metadata["extra"])
+
                                     yield format_sse_event(MetadataEvent(
                                         message_id=message_id,
                                         in_reply_to=in_reply_to,
                                         session_id=session_id,
                                         agent_schema=agent_schema,
-                                        responding_agent=responding_agent,
-                                        session_name=registered_session_name,
-                                        confidence=registered_confidence,
-                                        sources=registered_sources,
+                                        responding_agent=state.responding_agent,
+                                        session_name=metadata.get("session_name"),
+                                        confidence=metadata.get("confidence"),
+                                        sources=metadata.get("sources"),
                                         model_version=model,
-                                        flags=registered_flags,
+                                        flags=metadata.get("flags"),
                                         extra=extra_data if extra_data else None,
                                         hidden=False,
                                     ))
 
-                                # Get complete args from pending_tool_data BEFORE deleting
-                                # (captured at PartEndEvent with full args)
+                                # Get complete args from pending_tool_data
                                 completed_args = None
-                                if tool_id in pending_tool_data:
-                                    completed_args = pending_tool_data[tool_id].get("arguments")
+                                if tool_id in state.pending_tool_data:
+                                    completed_args = state.pending_tool_data[tool_id].get("arguments")
 
-                                # Capture tool call with result for persistence
-                                # Special handling for register_metadata - always capture full data
-                                if tool_calls_out is not None and tool_id in pending_tool_data:
-                                    tool_data = pending_tool_data[tool_id]
+                                # Capture tool call for persistence
+                                if tool_calls_out is not None and tool_id in state.pending_tool_data:
+                                    tool_data = state.pending_tool_data[tool_id]
                                     tool_data["result"] = result_content
                                     tool_data["is_metadata"] = is_metadata_event
                                     tool_calls_out.append(tool_data)
-                                    del pending_tool_data[tool_id]
+                                    del state.pending_tool_data[tool_id]
 
                                 if not is_metadata_event:
                                     # NOTE: text_response fallback is DISABLED
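The stream_with_child_events multiplexer referenced above is not shown in this diff. A minimal sketch of the pattern it names, racing the next tool event against the next queued child event so child output is relayed as it arrives rather than after the tool finishes; the real child_streaming implementation likely differs (for one, it also takes state/session arguments for persistence).

```python
import asyncio
from typing import Any, AsyncGenerator, AsyncIterator

async def stream_with_child_events_sketch(
    tools_stream: AsyncIterator[Any],
    child_event_sink: asyncio.Queue,
) -> AsyncGenerator[tuple[str, Any], None]:
    """Yield ("tool", event) or ("child", event) as each becomes ready."""
    tools_iter = tools_stream.__aiter__()
    tool_task = asyncio.ensure_future(anext(tools_iter))
    child_task = asyncio.ensure_future(child_event_sink.get())
    while True:
        done, _ = await asyncio.wait(
            {tool_task, child_task}, return_when=asyncio.FIRST_COMPLETED
        )
        if child_task in done:
            yield "child", child_task.result()
            child_task = asyncio.ensure_future(child_event_sink.get())
        if tool_task in done:
            try:
                result = tool_task.result()
            except StopAsyncIteration:
                # Tool stream exhausted: drain stragglers, then stop.
                child_task.cancel()
                while not child_event_sink.empty():
                    yield "child", child_event_sink.get_nowait()
                return
            yield "tool", result
            tool_task = asyncio.ensure_future(anext(tools_iter))
```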
@@ -629,10 +452,10 @@ async def stream_openai_response(
                                     ))
 
                                 # Update progress after tool completion
-                                current_step = 3
+                                state.current_step = 3
                                 yield format_sse_event(ProgressEvent(
-                                    step=current_step,
-                                    total_steps=total_steps,
+                                    step=state.current_step,
+                                    total_steps=state.total_steps,
                                     label="Generating response",
                                     status="in_progress"
                                 ))
@@ -661,36 +484,36 @@ async def stream_openai_response(
                 result_dict = {"result": str(output)}
 
             result_json = json.dumps(result_dict, indent=2, default=str)
-            token_count += len(result_json.split())
+            state.token_count += len(result_json.split())
 
             # Emit structured result as content
             result_chunk = ChatCompletionStreamResponse(
-                id=request_id,
-                created=created_at,
+                id=state.request_id,
+                created=state.created_at,
                 model=model,
                 choices=[
                     ChatCompletionStreamChoice(
                         index=0,
                         delta=ChatCompletionMessageDelta(
-                            role="assistant" if is_first_chunk else None,
+                            role="assistant" if state.is_first_chunk else None,
                             content=result_json,
                         ),
                         finish_reason=None,
                     )
                 ],
             )
-            is_first_chunk = False
+            state.is_first_chunk = False
             yield f"data: {result_chunk.model_dump_json()}\n\n"
         except Exception as e:
             logger.debug(f"No structured result available: {e}")
 
         # Calculate latency
-        latency_ms = int((time.time() - start_time) * 1000)
+        latency_ms = state.latency_ms()
 
         # Final OpenAI chunk with finish_reason
         final_chunk = ChatCompletionStreamResponse(
-            id=request_id,
-            created=created_at,
+            id=state.request_id,
+            created=state.created_at,
             model=model,
             choices=[
                 ChatCompletionStreamChoice(
@@ -703,28 +526,28 @@ async def stream_openai_response(
         yield f"data: {final_chunk.model_dump_json()}\n\n"
 
         # Emit metadata event only if not already registered via register_metadata tool
-        if not metadata_registered:
+        if not state.metadata_registered:
             yield format_sse_event(MetadataEvent(
                 message_id=message_id,
                 in_reply_to=in_reply_to,
                 session_id=session_id,
                 agent_schema=agent_schema,
-                responding_agent=responding_agent,
+                responding_agent=state.responding_agent,
                 confidence=1.0,  # Default to 100% confidence
                 model_version=model,
                 latency_ms=latency_ms,
-                token_count=token_count,
+                token_count=state.token_count,
                 # Include deterministic trace context captured from OTEL
                 trace_id=captured_trace_id,
                 span_id=captured_span_id,
             ))
 
         # Mark all progress complete
-        for step in range(1, total_steps + 1):
+        for step in range(1, state.total_steps + 1):
             yield format_sse_event(ProgressEvent(
                 step=step,
-                total_steps=total_steps,
-                label="Complete" if step == total_steps else f"Step {step}",
+                total_steps=state.total_steps,
+                label="Complete" if step == state.total_steps else f"Step {step}",
                 status="completed"
             ))
@@ -919,18 +742,8 @@ async def save_user_message(
     """
     Save user message to database before streaming.
 
-    This is a shared utility used by both API and CLI to ensure consistent
-    user message storage.
-
-    Args:
-        session_id: Session ID for message storage
-        user_id: User ID for message storage
-        content: The user's message content
+    Shared utility used by both API and CLI for consistent user message storage.
     """
-    from ....utils.date_utils import utc_now, to_iso
-    from ....services.session import SessionMessageStore
-    from ....settings import settings
-
     if not settings.postgres.enabled or not session_id:
         return
 
@@ -987,10 +800,6 @@ async def stream_openai_response_with_save(
     Yields:
         SSE-formatted strings
     """
-    from ....utils.date_utils import utc_now, to_iso
-    from ....services.session import SessionMessageStore
-    from ....settings import settings
-
     # Pre-generate message_id so it can be sent in metadata event
     # This allows frontend to use it for feedback before DB persistence
     message_id = str(uuid.uuid4())