remdb 0.3.202__py3-none-any.whl → 0.3.245__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of remdb might be problematic.

Files changed (44)
  1. rem/agentic/README.md +36 -2
  2. rem/agentic/context.py +86 -3
  3. rem/agentic/context_builder.py +39 -33
  4. rem/agentic/mcp/tool_wrapper.py +2 -2
  5. rem/agentic/providers/pydantic_ai.py +68 -51
  6. rem/agentic/schema.py +2 -2
  7. rem/api/mcp_router/resources.py +223 -0
  8. rem/api/mcp_router/tools.py +170 -18
  9. rem/api/routers/admin.py +30 -4
  10. rem/api/routers/auth.py +175 -18
  11. rem/api/routers/chat/child_streaming.py +394 -0
  12. rem/api/routers/chat/completions.py +24 -29
  13. rem/api/routers/chat/sse_events.py +5 -1
  14. rem/api/routers/chat/streaming.py +242 -272
  15. rem/api/routers/chat/streaming_utils.py +327 -0
  16. rem/api/routers/common.py +18 -0
  17. rem/api/routers/dev.py +7 -1
  18. rem/api/routers/feedback.py +9 -1
  19. rem/api/routers/messages.py +80 -15
  20. rem/api/routers/models.py +9 -1
  21. rem/api/routers/query.py +17 -15
  22. rem/api/routers/shared_sessions.py +16 -0
  23. rem/cli/commands/ask.py +205 -114
  24. rem/cli/commands/process.py +12 -4
  25. rem/cli/commands/query.py +109 -0
  26. rem/cli/commands/session.py +117 -0
  27. rem/cli/main.py +2 -0
  28. rem/models/entities/session.py +1 -0
  29. rem/schemas/agents/rem.yaml +1 -1
  30. rem/services/postgres/repository.py +7 -7
  31. rem/services/rem/service.py +47 -0
  32. rem/services/session/__init__.py +2 -1
  33. rem/services/session/compression.py +14 -12
  34. rem/services/session/pydantic_messages.py +111 -11
  35. rem/services/session/reload.py +2 -1
  36. rem/settings.py +71 -0
  37. rem/sql/migrations/001_install.sql +4 -4
  38. rem/sql/migrations/004_cache_system.sql +3 -1
  39. rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
  40. rem/utils/schema_loader.py +139 -111
  41. {remdb-0.3.202.dist-info → remdb-0.3.245.dist-info}/METADATA +2 -2
  42. {remdb-0.3.202.dist-info → remdb-0.3.245.dist-info}/RECORD +44 -39
  43. {remdb-0.3.202.dist-info → remdb-0.3.245.dist-info}/WHEEL +0 -0
  44. {remdb-0.3.202.dist-info → remdb-0.3.245.dist-info}/entry_points.txt +0 -0
rem/api/routers/chat/streaming.py

@@ -1,42 +1,36 @@
  """
  OpenAI-compatible streaming relay for Pydantic AI agents.

- Design Pattern:
- - Uses Pydantic AI's agent.iter() to capture full execution including tool calls
- - Emits rich SSE events: reasoning, tool_call, progress, metadata, text_delta
- - Proper OpenAI SSE format with data: prefix and [DONE] terminator
- - Error handling with graceful degradation
-
- Key Insight:
- - agent.run_stream() stops after first output, missing tool calls
- - agent.iter() provides complete execution with tool call visibility
- - Use PartStartEvent to detect tool calls and thinking parts
- - Use PartDeltaEvent with TextPartDelta/ThinkingPartDelta for streaming
- - Use PartEndEvent to detect tool completion
- - Use FunctionToolResultEvent to get tool results
-
- Multi-Agent Context Propagation:
- - AgentContext is set via agent_context_scope() before agent.iter()
- - Child agents (via ask_agent tool) can access parent context via get_current_context()
- - Context includes user_id, tenant_id, session_id, is_eval for proper scoping
-
- SSE Format (OpenAI-compatible):
- data: {"id": "chatcmpl-...", "choices": [{"delta": {"content": "..."}}]}\\n\\n
- data: [DONE]\\n\\n
-
- Extended SSE Format (Custom Events):
- event: reasoning\\ndata: {"type": "reasoning", "content": "..."}\\n\\n
- event: tool_call\\ndata: {"type": "tool_call", "tool_name": "...", "status": "started"}\\n\\n
- event: progress\\ndata: {"type": "progress", "step": 1, "total_steps": 3}\\n\\n
- event: metadata\\ndata: {"type": "metadata", "confidence": 0.95}\\n\\n
-
- See sse_events.py for the full event type definitions.
+ Architecture:
+ ```
+ User Request → stream_openai_response → agent.iter() → SSE Events → Client
+                        │
+                        ├── Parent agent events (text, tool calls)
+                        │
+                        └── Child agent events (via ask_agent tool)
+                                 │
+                                 ▼
+                        Event Sink (asyncio.Queue)
+                                 │
+                                 ▼
+                        drain_child_events() → SSE + DB
+ ```
+
+ Modules:
+ - streaming.py: Main workflow orchestrator (this file)
+ - streaming_utils.py: Pure utility functions, StreamingState dataclass
+ - child_streaming.py: Child agent event handling
+
+ Key Design Decision (DUPLICATION FIX):
+ When child_content is streamed, state.child_content_streamed is set True.
+ Parent TextPartDelta events are SKIPPED when this flag is True,
+ preventing content from being emitted twice.
  """

  from __future__ import annotations

+ import asyncio
  import json
- import time
  import uuid
  from typing import TYPE_CHECKING, AsyncGenerator
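The "Event Sink (asyncio.Queue)" in the new architecture diagram is the hand-off point between parent and child agents. Its implementation in `rem/agentic/context.py` is not part of this diff; a ContextVar-based sketch consistent with the `set_event_sink` / `asyncio.Queue` usage shown later in this file might look like:

```python
# Hypothetical sketch of the event-sink plumbing in rem/agentic/context.py,
# inferred from how streaming.py uses it; the actual implementation may differ.
import asyncio
from contextvars import ContextVar

# Holds the parent's queue so child agents (ask_agent) can push events into it
_event_sink: ContextVar[asyncio.Queue | None] = ContextVar("event_sink", default=None)


def set_event_sink(sink: asyncio.Queue | None) -> None:
    """Install (or clear, with None) the queue that child agent events go to."""
    _event_sink.set(sink)


def get_event_sink() -> asyncio.Queue | None:
    """Called from tools like ask_agent to emit events to the parent stream."""
    return _event_sink.get()
```

A ContextVar rather than a module global would keep the sink scoped to the current request's task tree, which is what lets a child agent find its own parent's queue under concurrent requests.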
@@ -54,7 +48,17 @@ from pydantic_ai.messages import (
      ToolCallPart,
  )

- from .otel_utils import get_current_trace_context, get_tracer
+ from .child_streaming import drain_child_events, stream_with_child_events, process_child_event
+ from .streaming_utils import (
+     StreamingState,
+     build_content_chunk,
+     build_progress_event,
+     build_tool_start_event,
+     extract_metadata_from_result,
+     extract_tool_args,
+     log_tool_call,
+ )
+ from .otel_utils import get_current_trace_context
  from .models import (
      ChatCompletionMessageDelta,
      ChatCompletionStreamChoice,
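`StreamingState` centralizes the mutable counters that were previously a dozen loose locals in `stream_openai_response`. Its definition in `streaming_utils.py` is not shown in this diff; reconstructing it purely from the fields and methods referenced throughout the hunks below gives roughly:

```python
# Hypothetical reconstruction of StreamingState from streaming_utils.py,
# based solely on the fields and methods this diff references.
import time
import uuid
from dataclasses import dataclass, field


@dataclass
class StreamingState:
    model: str
    request_id: str
    created_at: int
    start_time: float
    is_first_chunk: bool = True
    reasoning_step: int = 0
    current_step: int = 0
    total_steps: int = 3  # model request, optional tool execution, final response
    token_count: int = 0
    metadata_registered: bool = False
    child_content_streamed: bool = False
    responding_agent: str | None = None
    # index -> (tool_name, tool_id) for correlating start/end events
    active_tool_calls: dict[int, tuple[str, str]] = field(default_factory=dict)
    # FIFO queue of tool calls awaiting completion
    pending_tool_completions: list[tuple[str, str]] = field(default_factory=list)
    # tool_id -> {"tool_name", "tool_id", "arguments"} for persistence
    pending_tool_data: dict[str, dict] = field(default_factory=dict)

    @classmethod
    def create(cls, model: str, request_id: str | None = None) -> "StreamingState":
        return cls(
            model=model,
            request_id=request_id or f"chatcmpl-{uuid.uuid4().hex[:24]}",
            created_at=int(time.time()),
            start_time=time.time(),
        )

    def latency_ms(self) -> int:
        return int((time.time() - self.start_time) * 1000)
```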
@@ -69,6 +73,9 @@ from .sse_events import (
      ToolCallEvent,
      format_sse_event,
  )
+ from ....services.session import SessionMessageStore
+ from ....settings import settings
+ from ....utils.date_utils import to_iso, utc_now

  if TYPE_CHECKING:
      from ....agentic.context import AgentContext
@@ -147,48 +154,34 @@ async def stream_openai_response(
      event: done
      data: {"type": "done", "reason": "stop"}
      """
-     if request_id is None:
-         request_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"
-
-     created_at = int(time.time())
-     start_time = time.time()
-     is_first_chunk = True
-     reasoning_step = 0
-     current_step = 0
-     total_steps = 3  # Model request, tool execution (optional), final response
-     token_count = 0
-
-     # Track active tool calls for completion events
-     # Maps index -> (tool_name, tool_id) for correlating start/end events
-     active_tool_calls: dict[int, tuple[str, str]] = {}
-     # Queue of tool calls awaiting completion (FIFO for matching)
-     pending_tool_completions: list[tuple[str, str]] = []
-     # Track if metadata was registered via register_metadata tool
-     metadata_registered = False
-     # Track pending tool calls with full data for persistence
-     # Maps tool_id -> {"tool_name": str, "tool_id": str, "arguments": dict}
-     pending_tool_data: dict[str, dict] = {}
+     # Initialize streaming state
+     state = StreamingState.create(model=model, request_id=request_id)
+
+     # Get effective user_id for database operations
+     effective_user_id = agent_context.user_id if agent_context else None

      # Import context functions for multi-agent support
-     from ....agentic.context import set_current_context
+     from ....agentic.context import set_current_context, set_event_sink

      # Set up context for multi-agent propagation
-     # This allows child agents (via ask_agent tool) to access parent context
      previous_context = None
      if agent_context is not None:
          from ....agentic.context import get_current_context
          previous_context = get_current_context()
          set_current_context(agent_context)

+     # Set up event sink for child agent event proxying
+     child_event_sink: asyncio.Queue = asyncio.Queue()
+     set_event_sink(child_event_sink)
+
      try:
          # Emit initial progress event
-         current_step = 1
-         yield format_sse_event(ProgressEvent(
-             step=current_step,
-             total_steps=total_steps,
+         state.current_step = 1
+         yield build_progress_event(
+             step=state.current_step,
+             total_steps=state.total_steps,
              label="Processing request",
-             status="in_progress"
-         ))
+         )

          # Use agent.iter() to get complete execution with tool calls
          # Pass message_history if available for proper tool call/return pairing
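`build_progress_event` appears to fold the old `format_sse_event(ProgressEvent(...))` boilerplate into one call with a default status. Continuing the `streaming_utils.py` sketch above, with the signature assumed from the call sites in this hunk:

```python
# Hypothetical sketch; reconstructed from the inline code this helper replaces.
from .sse_events import ProgressEvent, format_sse_event


def build_progress_event(step: int, total_steps: int, label: str,
                         status: str = "in_progress") -> str:
    """Return one SSE-formatted progress event string."""
    return format_sse_event(ProgressEvent(
        step=step,
        total_steps=total_steps,
        label=label,
        status=status,
    ))
```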
@@ -214,11 +207,11 @@ async def stream_openai_response(
                      if isinstance(event, PartStartEvent) and isinstance(
                          event.part, ThinkingPart
                      ):
-                         reasoning_step += 1
+                         state.reasoning_step += 1
                          if event.part.content:
                              yield format_sse_event(ReasoningEvent(
                                  content=event.part.content,
-                                 step=reasoning_step
+                                 step=state.reasoning_step
                              ))

                      # Reasoning delta (streaming thinking)
@@ -228,7 +221,7 @@ async def stream_openai_response(
                          if event.delta.content_delta:
                              yield format_sse_event(ReasoningEvent(
                                  content=event.delta.content_delta,
-                                 step=reasoning_step
+                                 step=state.reasoning_step
                              ))

                      # ============================================
@@ -237,28 +230,11 @@ async def stream_openai_response(
                      elif isinstance(event, PartStartEvent) and isinstance(
                          event.part, TextPart
                      ):
-                         # TextPart may contain initial content that needs to be emitted
+                         # Skip if child already streamed content
+                         if state.child_content_streamed:
+                             continue
                          if event.part.content:
-                             content = event.part.content
-                             token_count += len(content.split())
-
-                             content_chunk = ChatCompletionStreamResponse(
-                                 id=request_id,
-                                 created=created_at,
-                                 model=model,
-                                 choices=[
-                                     ChatCompletionStreamChoice(
-                                         index=0,
-                                         delta=ChatCompletionMessageDelta(
-                                             role="assistant" if is_first_chunk else None,
-                                             content=content,
-                                         ),
-                                         finish_reason=None,
-                                     )
-                                 ],
-                             )
-                             is_first_chunk = False
-                             yield f"data: {content_chunk.model_dump_json()}\n\n"
+                             yield build_content_chunk(state, event.part.content)

                      # ============================================
                      # TOOL CALL START EVENTS
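`build_content_chunk` replaces the repeated twenty-line `ChatCompletionStreamResponse` construction. Reconstructed from the inline code it replaces, with the signature assumed from the call sites (`build_content_chunk(state, content)` yielding an SSE string):

```python
# Hypothetical sketch, continuing the streaming_utils.py reconstruction:
# updates the shared state and returns one OpenAI-style SSE data line.
from .models import (
    ChatCompletionMessageDelta,
    ChatCompletionStreamChoice,
    ChatCompletionStreamResponse,
)


def build_content_chunk(state: StreamingState, content: str) -> str:
    state.token_count += len(content.split())  # rough token estimate
    chunk = ChatCompletionStreamResponse(
        id=state.request_id,
        created=state.created_at,
        model=state.model,
        choices=[
            ChatCompletionStreamChoice(
                index=0,
                delta=ChatCompletionMessageDelta(
                    # OpenAI clients expect the role only on the first delta
                    role="assistant" if state.is_first_chunk else None,
                    content=content,
                ),
                finish_reason=None,
            )
        ],
    )
    state.is_first_chunk = False
    return f"data: {chunk.model_dump_json()}\n\n"
```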
@@ -268,94 +244,39 @@ async def stream_openai_response(
                      ):
                          tool_name = event.part.tool_name

-                         # Handle final_result specially - it's Pydantic AI's
-                         # internal tool for structured output
+                         # Handle final_result (Pydantic AI's internal tool)
                          if tool_name == "final_result":
-                             # Extract the structured result and emit as content
-                             args_dict = None
-                             if event.part.args is not None:
-                                 if hasattr(event.part.args, 'args_dict'):
-                                     args_dict = event.part.args.args_dict
-                                 elif isinstance(event.part.args, dict):
-                                     args_dict = event.part.args
-
+                             args_dict = extract_tool_args(event.part)
                              if args_dict:
-                                 # Emit the structured result as JSON content
                                  result_json = json.dumps(args_dict, indent=2)
-                                 content_chunk = ChatCompletionStreamResponse(
-                                     id=request_id,
-                                     created=created_at,
-                                     model=model,
-                                     choices=[
-                                         ChatCompletionStreamChoice(
-                                             index=0,
-                                             delta=ChatCompletionMessageDelta(
-                                                 role="assistant" if is_first_chunk else None,
-                                                 content=result_json,
-                                             ),
-                                             finish_reason=None,
-                                         )
-                                     ],
-                                 )
-                                 is_first_chunk = False
-                                 yield f"data: {content_chunk.model_dump_json()}\n\n"
-                             continue  # Skip regular tool call handling
+                                 yield build_content_chunk(state, result_json)
+                             continue

                          tool_id = f"call_{uuid.uuid4().hex[:8]}"
-                         active_tool_calls[event.index] = (tool_name, tool_id)
-                         # Queue for completion matching (FIFO)
-                         pending_tool_completions.append((tool_name, tool_id))
-
-                         # Emit tool_call SSE event (started)
-                         # Try to get arguments as dict
-                         args_dict = None
-                         if event.part.args is not None:
-                             if hasattr(event.part.args, 'args_dict'):
-                                 args_dict = event.part.args.args_dict
-                             elif isinstance(event.part.args, dict):
-                                 args_dict = event.part.args
-                             elif isinstance(event.part.args, str):
-                                 # Parse JSON string args (common with pydantic-ai)
-                                 try:
-                                     args_dict = json.loads(event.part.args)
-                                 except json.JSONDecodeError:
-                                     logger.warning(f"Failed to parse tool args as JSON: {event.part.args[:100]}")
-
-                         # Log tool call with key parameters
-                         if args_dict and tool_name == "search_rem":
-                             query_type = args_dict.get("query_type", "?")
-                             limit = args_dict.get("limit", 20)
-                             table = args_dict.get("table", "")
-                             query_text = args_dict.get("query_text", args_dict.get("entity_key", ""))
-                             if query_text and len(query_text) > 50:
-                                 query_text = query_text[:50] + "..."
-                             logger.info(f"🔧 {tool_name} {query_type.upper()} '{query_text}' table={table} limit={limit}")
-                         else:
-                             logger.info(f"🔧 {tool_name}")
+                         state.active_tool_calls[event.index] = (tool_name, tool_id)
+                         state.pending_tool_completions.append((tool_name, tool_id))

-                         yield format_sse_event(ToolCallEvent(
-                             tool_name=tool_name,
-                             tool_id=tool_id,
-                             status="started",
-                             arguments=args_dict
-                         ))
+                         # Extract and log arguments
+                         args_dict = extract_tool_args(event.part)
+                         log_tool_call(tool_name, args_dict)

-                         # Track tool call data for persistence (especially register_metadata)
-                         pending_tool_data[tool_id] = {
+                         yield build_tool_start_event(tool_name, tool_id, args_dict)
+
+                         # Track for persistence
+                         state.pending_tool_data[tool_id] = {
                              "tool_name": tool_name,
                              "tool_id": tool_id,
                              "arguments": args_dict,
                          }

                          # Update progress
-                         current_step = 2
-                         total_steps = 4  # Added tool execution step
-                         yield format_sse_event(ProgressEvent(
-                             step=current_step,
-                             total_steps=total_steps,
+                         state.current_step = 2
+                         state.total_steps = 4
+                         yield build_progress_event(
+                             step=state.current_step,
+                             total_steps=state.total_steps,
                              label=f"Calling {tool_name}",
-                             status="in_progress"
-                         ))
+                         )

                      # ============================================
                      # TOOL CALL COMPLETION (PartEndEvent)
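`extract_tool_args` and `log_tool_call` are the other two helpers pulled into `streaming_utils.py`. Their bodies are not in this diff, but the deleted inline logic above shows exactly what they must do; a sketch continuing the same reconstruction:

```python
# Hypothetical sketch, reconstructed from the deleted inline logic above.
import json
import logging

logger = logging.getLogger(__name__)


def extract_tool_args(part) -> dict | None:
    """Normalize ToolCallPart.args (args object, dict, or JSON string) to a dict."""
    args = part.args
    if args is None:
        return None
    if hasattr(args, "args_dict"):
        return args.args_dict
    if isinstance(args, dict):
        return args
    if isinstance(args, str) and args:
        try:
            return json.loads(args)
        except json.JSONDecodeError:
            logger.warning(f"Failed to parse tool args as JSON: {args[:100]}")
    return None


def log_tool_call(tool_name: str, args_dict: dict | None) -> None:
    """Log a tool call, with query details for search_rem."""
    if args_dict and tool_name == "search_rem":
        query_text = args_dict.get("query_text", args_dict.get("entity_key", ""))
        if query_text and len(query_text) > 50:
            query_text = query_text[:50] + "..."
        logger.info(
            f"🔧 {tool_name} {args_dict.get('query_type', '?').upper()} "
            f"'{query_text}' table={args_dict.get('table', '')} "
            f"limit={args_dict.get('limit', 20)}"
        )
    else:
        logger.info(f"🔧 {tool_name}")
```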
@@ -363,28 +284,14 @@ async def stream_openai_response(
                      elif isinstance(event, PartEndEvent) and isinstance(
                          event.part, ToolCallPart
                      ):
-                         if event.index in active_tool_calls:
-                             tool_name, tool_id = active_tool_calls[event.index]
-
-                             # Extract full args from completed ToolCallPart
-                             # (PartStartEvent only has empty/partial args during streaming)
-                             args_dict = None
-                             if event.part.args is not None:
-                                 if hasattr(event.part.args, 'args_dict'):
-                                     args_dict = event.part.args.args_dict
-                                 elif isinstance(event.part.args, dict):
-                                     args_dict = event.part.args
-                                 elif isinstance(event.part.args, str) and event.part.args:
-                                     try:
-                                         args_dict = json.loads(event.part.args)
-                                     except json.JSONDecodeError:
-                                         logger.warning(f"Failed to parse tool args: {event.part.args[:100]}")
-
-                             # Update pending_tool_data with complete args
-                             if tool_id in pending_tool_data:
-                                 pending_tool_data[tool_id]["arguments"] = args_dict
-
-                             del active_tool_calls[event.index]
+                         if event.index in state.active_tool_calls:
+                             tool_name, tool_id = state.active_tool_calls[event.index]
+                             args_dict = extract_tool_args(event.part)
+
+                             if tool_id in state.pending_tool_data:
+                                 state.pending_tool_data[tool_id]["arguments"] = args_dict
+
+                             del state.active_tool_calls[event.index]

                      # ============================================
                      # TEXT CONTENT DELTA
@@ -392,116 +299,117 @@ async def stream_openai_response(
                      elif isinstance(event, PartDeltaEvent) and isinstance(
                          event.delta, TextPartDelta
                      ):
+                         # DUPLICATION FIX: Skip parent text if child already streamed content
+                         # Child agents stream via child_content events in ask_agent tool.
+                         # If parent tries to echo that content, skip it.
+                         if state.child_content_streamed:
+                             logger.debug("Skipping parent TextPartDelta - child content already streamed")
+                             continue
+
                          content = event.delta.content_delta
-                         token_count += len(content.split())  # Rough token estimate
-
-                         content_chunk = ChatCompletionStreamResponse(
-                             id=request_id,
-                             created=created_at,
-                             model=model,
-                             choices=[
-                                 ChatCompletionStreamChoice(
-                                     index=0,
-                                     delta=ChatCompletionMessageDelta(
-                                         role="assistant" if is_first_chunk else None,
-                                         content=content,
-                                     ),
-                                     finish_reason=None,
-                                 )
-                             ],
-                         )
-                         is_first_chunk = False
-                         yield f"data: {content_chunk.model_dump_json()}\n\n"
+                         yield build_content_chunk(state, content)

              # ============================================
              # TOOL EXECUTION NODE
              # ============================================
              elif Agent.is_call_tools_node(node):
                  async with node.stream(agent_run.ctx) as tools_stream:
-                     async for tool_event in tools_stream:
+                     # Use concurrent multiplexer to handle both tool events
+                     # and child agent events as they arrive (fixes streaming lag)
+                     async for event_type, event_data in stream_with_child_events(
+                         tools_stream=tools_stream,
+                         child_event_sink=child_event_sink,
+                         state=state,
+                         session_id=session_id,
+                         user_id=effective_user_id,
+                         message_id=message_id,
+                         agent_schema=agent_schema,
+                     ):
+                         # Handle child events (streamed from ask_agent)
+                         if event_type == "child":
+                             async for chunk in process_child_event(
+                                 child_event=event_data,
+                                 state=state,
+                                 session_id=session_id,
+                                 user_id=effective_user_id,
+                                 message_id=message_id,
+                                 agent_schema=agent_schema,
+                             ):
+                                 yield chunk
+                             continue
+
+                         # Handle tool events
+                         tool_event = event_data
+
                          # Tool result event - emit completion
                          if isinstance(tool_event, FunctionToolResultEvent):
                              # Get the tool name/id from the pending queue (FIFO)
-                             if pending_tool_completions:
-                                 tool_name, tool_id = pending_tool_completions.pop(0)
+                             if state.pending_tool_completions:
+                                 tool_name, tool_id = state.pending_tool_completions.pop(0)
                              else:
-                                 # Fallback if queue is empty (shouldn't happen)
                                  tool_name = "tool"
                                  tool_id = f"call_{uuid.uuid4().hex[:8]}"

-                             # Check if this is a register_metadata tool result
-                             # It returns a dict with _metadata_event: True marker
                              result_content = tool_event.result.content if hasattr(tool_event.result, 'content') else tool_event.result
                              is_metadata_event = False

-                             if isinstance(result_content, dict) and result_content.get("_metadata_event"):
+                             # Handle register_metadata tool results
+                             metadata = extract_metadata_from_result(result_content)
+                             if metadata:
                                  is_metadata_event = True
-                                 metadata_registered = True  # Skip default metadata at end
-                                 # Emit MetadataEvent with registered values
-                                 registered_confidence = result_content.get("confidence")
-                                 registered_sources = result_content.get("sources")
-                                 registered_references = result_content.get("references")
-                                 registered_flags = result_content.get("flags")
-                                 # Session naming
-                                 registered_session_name = result_content.get("session_name")
-                                 # Risk assessment fields
-                                 registered_risk_level = result_content.get("risk_level")
-                                 registered_risk_score = result_content.get("risk_score")
-                                 registered_risk_reasoning = result_content.get("risk_reasoning")
-                                 registered_recommended_action = result_content.get("recommended_action")
-                                 # Extra fields
-                                 registered_extra = result_content.get("extra")
+                                 state.metadata_registered = True
+
+                                 # Only set responding_agent if not already set by child
+                                 if not state.responding_agent and metadata.get("agent_schema"):
+                                     state.responding_agent = metadata["agent_schema"]

                                  logger.info(
-                                     f"📊 Metadata registered: confidence={registered_confidence}, "
-                                     f"session_name={registered_session_name}, "
-                                     f"risk_level={registered_risk_level}, sources={registered_sources}"
+                                     f"📊 Metadata: confidence={metadata.get('confidence')}, "
+                                     f"risk_level={metadata.get('risk_level')}"
                                  )

-                                 # Build extra dict with risk fields and any custom extras
+                                 # Build extra dict with risk fields
                                  extra_data = {}
-                                 if registered_risk_level is not None:
-                                     extra_data["risk_level"] = registered_risk_level
-                                 if registered_risk_score is not None:
-                                     extra_data["risk_score"] = registered_risk_score
-                                 if registered_risk_reasoning is not None:
-                                     extra_data["risk_reasoning"] = registered_risk_reasoning
-                                 if registered_recommended_action is not None:
-                                     extra_data["recommended_action"] = registered_recommended_action
-                                 if registered_extra:
-                                     extra_data.update(registered_extra)
-
-                                 # Emit metadata event immediately
+                                 for field in ["risk_level", "risk_score", "risk_reasoning", "recommended_action"]:
+                                     if metadata.get(field) is not None:
+                                         extra_data[field] = metadata[field]
+                                 if metadata.get("extra"):
+                                     extra_data.update(metadata["extra"])
+
                                  yield format_sse_event(MetadataEvent(
                                      message_id=message_id,
                                      in_reply_to=in_reply_to,
                                      session_id=session_id,
                                      agent_schema=agent_schema,
-                                     session_name=registered_session_name,
-                                     confidence=registered_confidence,
-                                     sources=registered_sources,
+                                     responding_agent=state.responding_agent,
+                                     session_name=metadata.get("session_name"),
+                                     confidence=metadata.get("confidence"),
+                                     sources=metadata.get("sources"),
                                      model_version=model,
-                                     flags=registered_flags,
+                                     flags=metadata.get("flags"),
                                      extra=extra_data if extra_data else None,
                                      hidden=False,
                                  ))

-                             # Get complete args from pending_tool_data BEFORE deleting
-                             # (captured at PartEndEvent with full args)
+                             # Get complete args from pending_tool_data
                              completed_args = None
-                             if tool_id in pending_tool_data:
-                                 completed_args = pending_tool_data[tool_id].get("arguments")
+                             if tool_id in state.pending_tool_data:
+                                 completed_args = state.pending_tool_data[tool_id].get("arguments")

-                             # Capture tool call with result for persistence
-                             # Special handling for register_metadata - always capture full data
-                             if tool_calls_out is not None and tool_id in pending_tool_data:
-                                 tool_data = pending_tool_data[tool_id]
+                             # Capture tool call for persistence
+                             if tool_calls_out is not None and tool_id in state.pending_tool_data:
+                                 tool_data = state.pending_tool_data[tool_id]
                                  tool_data["result"] = result_content
                                  tool_data["is_metadata"] = is_metadata_event
                                  tool_calls_out.append(tool_data)
-                                 del pending_tool_data[tool_id]
+                                 del state.pending_tool_data[tool_id]

                              if not is_metadata_event:
+                                 # NOTE: text_response fallback is DISABLED
+                                 # Child agents now stream content via child_content events (above)
+                                 # which provides real-time streaming. The text_response in tool
+                                 # result would duplicate that content, so we skip it entirely.
+
                                  # Normal tool completion - emit ToolCallEvent
                                  # For finalize_intake, send full result dict for frontend
                                  if tool_name == "finalize_intake" and isinstance(result_content, dict):
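`stream_with_child_events` is only visible through its call site above. The idea named in the comment ("concurrent multiplexer ... fixes streaming lag") is to race the tool-event stream against the child-event queue so child agent output reaches the client as it arrives, rather than after the tool returns. A hypothetical sketch of the core loop in `child_streaming.py`, with the extra call-site parameters (state, session ids, and so on) omitted:

```python
# Hypothetical sketch only; the real implementation in child_streaming.py
# also persists child events and cooperates with drain_child_events().
import asyncio
from typing import Any, AsyncGenerator, AsyncIterator


async def stream_with_child_events(
    tools_stream: AsyncIterator[Any],
    child_event_sink: asyncio.Queue,
    **_ignored: Any,  # state, session_id, user_id, ... not modeled here
) -> AsyncGenerator[tuple[str, Any], None]:
    tool_iter = tools_stream.__aiter__()
    tool_task = asyncio.ensure_future(anext(tool_iter))
    child_task = asyncio.ensure_future(child_event_sink.get())
    while True:
        done, _ = await asyncio.wait(
            {tool_task, child_task}, return_when=asyncio.FIRST_COMPLETED
        )
        if child_task in done:
            # Child agent event pushed via the context's event sink
            yield ("child", child_task.result())
            child_task = asyncio.ensure_future(child_event_sink.get())
        if tool_task in done:
            try:
                tool_event = tool_task.result()
            except StopAsyncIteration:
                # Tool stream exhausted; leftover queued child events would be
                # handled by drain_child_events() in the real module
                child_task.cancel()
                return
            yield ("tool", tool_event)
            tool_task = asyncio.ensure_future(anext(tool_iter))
```

Tagging each yielded item as `("child", ...)` or `("tool", ...)` is what lets the caller above dispatch to `process_child_event` for one and the existing `FunctionToolResultEvent` handling for the other.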
@@ -544,10 +452,10 @@ async def stream_openai_response(
                                  ))

                              # Update progress after tool completion
-                             current_step = 3
+                             state.current_step = 3
                              yield format_sse_event(ProgressEvent(
-                                 step=current_step,
-                                 total_steps=total_steps,
+                                 step=state.current_step,
+                                 total_steps=state.total_steps,
                                  label="Generating response",
                                  status="in_progress"
                              ))
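`extract_metadata_from_result` is likewise only visible through its call site in the tool-result branch above. Given the old inline check for the `_metadata_event: True` marker, a minimal sketch would be:

```python
# Hypothetical sketch, inferred from the deleted marker check it replaces.
def extract_metadata_from_result(result_content) -> dict | None:
    """Return the metadata dict if this tool result came from register_metadata."""
    if isinstance(result_content, dict) and result_content.get("_metadata_event"):
        return result_content
    return None
```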
@@ -576,36 +484,36 @@ async def stream_openai_response(
                  result_dict = {"result": str(output)}

              result_json = json.dumps(result_dict, indent=2, default=str)
-             token_count += len(result_json.split())
+             state.token_count += len(result_json.split())

              # Emit structured result as content
              result_chunk = ChatCompletionStreamResponse(
-                 id=request_id,
-                 created=created_at,
+                 id=state.request_id,
+                 created=state.created_at,
                  model=model,
                  choices=[
                      ChatCompletionStreamChoice(
                          index=0,
                          delta=ChatCompletionMessageDelta(
-                             role="assistant" if is_first_chunk else None,
+                             role="assistant" if state.is_first_chunk else None,
                              content=result_json,
                          ),
                          finish_reason=None,
                      )
                  ],
              )
-             is_first_chunk = False
+             state.is_first_chunk = False
              yield f"data: {result_chunk.model_dump_json()}\n\n"
          except Exception as e:
              logger.debug(f"No structured result available: {e}")

          # Calculate latency
-         latency_ms = int((time.time() - start_time) * 1000)
+         latency_ms = state.latency_ms()

          # Final OpenAI chunk with finish_reason
          final_chunk = ChatCompletionStreamResponse(
-             id=request_id,
-             created=created_at,
+             id=state.request_id,
+             created=state.created_at,
              model=model,
              choices=[
                  ChatCompletionStreamChoice(
@@ -618,27 +526,28 @@ async def stream_openai_response(
          yield f"data: {final_chunk.model_dump_json()}\n\n"

          # Emit metadata event only if not already registered via register_metadata tool
-         if not metadata_registered:
+         if not state.metadata_registered:
              yield format_sse_event(MetadataEvent(
                  message_id=message_id,
                  in_reply_to=in_reply_to,
                  session_id=session_id,
                  agent_schema=agent_schema,
+                 responding_agent=state.responding_agent,
                  confidence=1.0,  # Default to 100% confidence
                  model_version=model,
                  latency_ms=latency_ms,
-                 token_count=token_count,
+                 token_count=state.token_count,
                  # Include deterministic trace context captured from OTEL
                  trace_id=captured_trace_id,
                  span_id=captured_span_id,
              ))

          # Mark all progress complete
-         for step in range(1, total_steps + 1):
+         for step in range(1, state.total_steps + 1):
              yield format_sse_event(ProgressEvent(
                  step=step,
-                 total_steps=total_steps,
-                 label="Complete" if step == total_steps else f"Step {step}",
+                 total_steps=state.total_steps,
+                 label="Complete" if step == state.total_steps else f"Step {step}",
                  status="completed"
              ))

@@ -716,6 +625,8 @@ async def stream_openai_response(
          yield "data: [DONE]\n\n"

      finally:
+         # Clean up event sink for multi-agent streaming
+         set_event_sink(None)
          # Restore previous context for multi-agent support
          # This ensures nested agent calls don't pollute the parent's context
          if agent_context is not None:
@@ -823,6 +734,37 @@ async def stream_minimal_simulator(
          yield sse_string


+ async def save_user_message(
+     session_id: str,
+     user_id: str | None,
+     content: str,
+ ) -> None:
+     """
+     Save user message to database before streaming.
+
+     Shared utility used by both API and CLI for consistent user message storage.
+     """
+     if not settings.postgres.enabled or not session_id:
+         return
+
+     user_msg = {
+         "role": "user",
+         "content": content,
+         "timestamp": to_iso(utc_now()),
+     }
+     try:
+         store = SessionMessageStore(user_id=user_id or settings.test.effective_user_id)
+         await store.store_session_messages(
+             session_id=session_id,
+             messages=[user_msg],
+             user_id=user_id,
+             compress=False,
+         )
+         logger.debug(f"Saved user message to session {session_id}")
+     except Exception as e:
+         logger.error(f"Failed to save user message: {e}", exc_info=True)
+
+
  async def stream_openai_response_with_save(
      agent: Agent,
      prompt: str,
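`save_user_message` is now a shared top-level utility, and per the NOTE added to `stream_openai_response_with_save` in the next hunk, callers are expected to persist the user turn before streaming. An illustrative call order; the handler name and the keyword arguments passed to `stream_openai_response_with_save` beyond `agent` and `prompt` are assumptions, since the full signature is not shown in this diff:

```python
# Illustrative only: handler name and the trailing parameters of
# stream_openai_response_with_save are assumptions, not from this diff.
async def handle_chat_request(agent, prompt: str, session_id: str, user_id: str | None):
    # 1. Persist the user's message before any streaming starts
    await save_user_message(session_id=session_id, user_id=user_id, content=prompt)
    # 2. Relay the stream; this saves tool calls and the assistant response
    async for sse_chunk in stream_openai_response_with_save(
        agent=agent,
        prompt=prompt,
        session_id=session_id,  # assumed keyword; full signature not shown here
    ):
        yield sse_chunk
```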
@@ -842,6 +784,9 @@ async def stream_openai_response_with_save(
      This accumulates all text content during streaming and saves it to the database
      after the stream completes.

+     NOTE: Call save_user_message() BEFORE this function to save the user's message.
+     This function only saves tool calls and assistant responses.
+
      Args:
          agent: Pydantic AI agent instance
          prompt: User prompt
@@ -855,10 +800,6 @@ async def stream_openai_response_with_save(
      Yields:
          SSE-formatted strings
      """
-     from ....utils.date_utils import utc_now, to_iso
-     from ....services.session import SessionMessageStore
-     from ....settings import settings
-
      # Pre-generate message_id so it can be sent in metadata event
      # This allows frontend to use it for feedback before DB persistence
      message_id = str(uuid.uuid4())
@@ -899,6 +840,9 @@ async def stream_openai_response_with_save(
                  delta = data["choices"][0].get("delta", {})
                  content = delta.get("content")
                  if content:
+                     # DEBUG: Check for [Calling markers in content
+                     if "[Calling" in content:
+                         logger.warning(f"DEBUG: Found [Calling in content chunk: {repr(content[:100])}")
                      accumulated_content.append(content)
          except (json.JSONDecodeError, KeyError, IndexError):
              pass  # Skip non-JSON or malformed chunks
@@ -931,8 +875,34 @@ async def stream_openai_response_with_save(
                  messages_to_store.append(tool_message)

          # Then store assistant text response (if any)
+         # Priority: direct TextPartDelta content > tool call text_response
+         # When an agent delegates via ask_agent, the child's text_response becomes
+         # the parent's assistant response (the parent is just orchestrating)
+         full_content = None
+
          if accumulated_content:
              full_content = "".join(accumulated_content)
+             logger.warning(f"DEBUG: Using accumulated_content ({len(accumulated_content)} chunks, {len(full_content)} chars)")
+             logger.warning(f"DEBUG: First 200 chars: {repr(full_content[:200])}")
+         else:
+             logger.warning("DEBUG: accumulated_content is empty, checking text_response fallback")
+             # No direct text from TextPartDelta - check tool results for text_response
+             # This handles multi-agent delegation where child agent output is the response
+             for tool_call in tool_calls:
+                 if not tool_call:
+                     continue
+                 result = tool_call.get("result")
+                 if isinstance(result, dict) and result.get("text_response"):
+                     text_response = result["text_response"]
+                     if text_response and str(text_response).strip():
+                         full_content = str(text_response)
+                         logger.debug(
+                             f"Using text_response from {tool_call.get('tool_name', 'tool')} "
+                             f"({len(full_content)} chars) as assistant message"
+                         )
+                         break
+
+         if full_content:
              assistant_message = {
                  "id": message_id,  # Use pre-generated ID for consistency with metadata event
                  "role": "assistant",
@@ -954,7 +924,7 @@ async def stream_openai_response_with_save(
              )
              logger.debug(
                  f"Saved {len(tool_calls)} tool calls and "
-                 f"{'assistant response' if accumulated_content else 'no text'} "
+                 f"{'assistant response' if full_content else 'no text'} "
                  f"to session {session_id}"
              )
          except Exception as e: