remdb 0.3.171__py3-none-any.whl → 0.3.230__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. rem/agentic/README.md +36 -2
  2. rem/agentic/context.py +173 -0
  3. rem/agentic/context_builder.py +12 -2
  4. rem/agentic/mcp/tool_wrapper.py +39 -16
  5. rem/agentic/providers/pydantic_ai.py +78 -45
  6. rem/agentic/schema.py +6 -5
  7. rem/agentic/tools/rem_tools.py +11 -0
  8. rem/api/main.py +1 -1
  9. rem/api/mcp_router/resources.py +75 -14
  10. rem/api/mcp_router/server.py +31 -24
  11. rem/api/mcp_router/tools.py +621 -166
  12. rem/api/routers/admin.py +30 -4
  13. rem/api/routers/auth.py +114 -15
  14. rem/api/routers/chat/child_streaming.py +379 -0
  15. rem/api/routers/chat/completions.py +74 -37
  16. rem/api/routers/chat/sse_events.py +7 -3
  17. rem/api/routers/chat/streaming.py +352 -257
  18. rem/api/routers/chat/streaming_utils.py +327 -0
  19. rem/api/routers/common.py +18 -0
  20. rem/api/routers/dev.py +7 -1
  21. rem/api/routers/feedback.py +9 -1
  22. rem/api/routers/messages.py +176 -38
  23. rem/api/routers/models.py +9 -1
  24. rem/api/routers/query.py +12 -1
  25. rem/api/routers/shared_sessions.py +16 -0
  26. rem/auth/jwt.py +19 -4
  27. rem/auth/middleware.py +42 -28
  28. rem/cli/README.md +62 -0
  29. rem/cli/commands/ask.py +61 -81
  30. rem/cli/commands/db.py +148 -70
  31. rem/cli/commands/process.py +171 -43
  32. rem/models/entities/ontology.py +91 -101
  33. rem/schemas/agents/rem.yaml +1 -1
  34. rem/services/content/service.py +18 -5
  35. rem/services/email/service.py +11 -2
  36. rem/services/embeddings/worker.py +26 -12
  37. rem/services/postgres/__init__.py +28 -3
  38. rem/services/postgres/diff_service.py +57 -5
  39. rem/services/postgres/programmable_diff_service.py +635 -0
  40. rem/services/postgres/pydantic_to_sqlalchemy.py +2 -2
  41. rem/services/postgres/register_type.py +12 -11
  42. rem/services/postgres/repository.py +39 -29
  43. rem/services/postgres/schema_generator.py +5 -5
  44. rem/services/postgres/sql_builder.py +6 -5
  45. rem/services/session/__init__.py +8 -1
  46. rem/services/session/compression.py +40 -2
  47. rem/services/session/pydantic_messages.py +292 -0
  48. rem/settings.py +34 -0
  49. rem/sql/background_indexes.sql +5 -0
  50. rem/sql/migrations/001_install.sql +157 -10
  51. rem/sql/migrations/002_install_models.sql +160 -132
  52. rem/sql/migrations/004_cache_system.sql +7 -275
  53. rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
  54. rem/utils/model_helpers.py +101 -0
  55. rem/utils/schema_loader.py +79 -51
  56. {remdb-0.3.171.dist-info → remdb-0.3.230.dist-info}/METADATA +2 -2
  57. {remdb-0.3.171.dist-info → remdb-0.3.230.dist-info}/RECORD +59 -53
  58. {remdb-0.3.171.dist-info → remdb-0.3.230.dist-info}/WHEEL +0 -0
  59. {remdb-0.3.171.dist-info → remdb-0.3.230.dist-info}/entry_points.txt +0 -0
@@ -1,37 +1,38 @@
  """
  OpenAI-compatible streaming relay for Pydantic AI agents.
 
- Design Pattern:
- - Uses Pydantic AI's agent.iter() to capture full execution including tool calls
- - Emits rich SSE events: reasoning, tool_call, progress, metadata, text_delta
- - Proper OpenAI SSE format with data: prefix and [DONE] terminator
- - Error handling with graceful degradation
-
- Key Insight
- - agent.run_stream() stops after first output, missing tool calls
- - agent.iter() provides complete execution with tool call visibility
- - Use PartStartEvent to detect tool calls and thinking parts
- - Use PartDeltaEvent with TextPartDelta/ThinkingPartDelta for streaming
- - Use PartEndEvent to detect tool completion
- - Use FunctionToolResultEvent to get tool results
-
- SSE Format (OpenAI-compatible):
- data: {"id": "chatcmpl-...", "choices": [{"delta": {"content": "..."}}]}\\n\\n
- data: [DONE]\\n\\n
-
- Extended SSE Format (Custom Events):
- event: reasoning\\ndata: {"type": "reasoning", "content": "..."}\\n\\n
- event: tool_call\\ndata: {"type": "tool_call", "tool_name": "...", "status": "started"}\\n\\n
- event: progress\\ndata: {"type": "progress", "step": 1, "total_steps": 3}\\n\\n
- event: metadata\\ndata: {"type": "metadata", "confidence": 0.95}\\n\\n
-
- See sse_events.py for the full event type definitions.
+ Architecture:
+ ```
+ User Request → stream_openai_response → agent.iter() → SSE Events → Client
+
+ ├── Parent agent events (text, tool calls)
+
+ └── Child agent events (via ask_agent tool)
+
+
+ Event Sink (asyncio.Queue)
+
+
+ drain_child_events() → SSE + DB
+ ```
+
+ Modules:
+ - streaming.py: Main workflow orchestrator (this file)
+ - streaming_utils.py: Pure utility functions, StreamingState dataclass
+ - child_streaming.py: Child agent event handling
+
+ Key Design Decision (DUPLICATION FIX):
+ When child_content is streamed, state.child_content_streamed is set True.
+ Parent TextPartDelta events are SKIPPED when this flag is True,
+ preventing content from being emitted twice.
  """
 
+ from __future__ import annotations
+
+ import asyncio
  import json
- import time
  import uuid
- from typing import AsyncGenerator
+ from typing import TYPE_CHECKING, AsyncGenerator
 
  from loguru import logger
  from pydantic_ai.agent import Agent
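The event sink referenced in the new docstring comes from rem/agentic/context.py, a file added in this release (+173 lines) whose body is not shown in this diff. A minimal sketch of the pattern as the docstring describes it, assuming a contextvars-based implementation; `set_event_sink`, `set_current_context`, and `get_current_context` are confirmed by the imports below, while `get_event_sink` and `emit_child_event` are illustrative names:

```python
# Sketch of the event-sink plumbing the docstring describes. The real
# implementation lives in rem/agentic/context.py (new in 0.3.230) and is not
# shown in this diff; get_event_sink/emit_child_event are illustrative names.
import asyncio
from contextvars import ContextVar

_event_sink: ContextVar["asyncio.Queue | None"] = ContextVar("event_sink", default=None)

def set_event_sink(queue: "asyncio.Queue | None") -> None:
    """Install (or clear) the queue that child-agent events are proxied into."""
    _event_sink.set(queue)

def get_event_sink() -> "asyncio.Queue | None":
    return _event_sink.get()

async def emit_child_event(event: dict) -> None:
    """Called from inside the ask_agent tool: forward a child event to the parent."""
    sink = get_event_sink()
    if sink is not None:
        await sink.put(event)
```

Because the sink is a ContextVar, each request gets its own queue and child agents spawned anywhere inside that request's task tree see the right one.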
@@ -47,7 +48,17 @@ from pydantic_ai.messages (
  ToolCallPart,
  )
 
- from .otel_utils import get_current_trace_context, get_tracer
+ from .child_streaming import drain_child_events, stream_with_child_events, process_child_event
+ from .streaming_utils import (
+ StreamingState,
+ build_content_chunk,
+ build_progress_event,
+ build_tool_start_event,
+ extract_metadata_from_result,
+ extract_tool_args,
+ log_tool_call,
+ )
+ from .otel_utils import get_current_trace_context
  from .models import (
  ChatCompletionMessageDelta,
  ChatCompletionStreamChoice,
@@ -55,12 +66,19 @@ from .models import (
  )
  from .sse_events import (
  DoneEvent,
+ ErrorEvent,
  MetadataEvent,
  ProgressEvent,
  ReasoningEvent,
  ToolCallEvent,
  format_sse_event,
  )
+ from ....services.session import SessionMessageStore
+ from ....settings import settings
+ from ....utils.date_utils import to_iso, utc_now
+
+ if TYPE_CHECKING:
+ from ....agentic.context import AgentContext
 
 
  async def stream_openai_response(
@@ -79,6 +97,11 @@ async def stream_openai_response(
  # Mutable container to capture tool calls for persistence
  # Format: list of {"tool_name": str, "tool_id": str, "arguments": dict, "result": any}
  tool_calls_out: list | None = None,
+ # Agent context for multi-agent propagation
+ # When set, enables child agents to access parent context via get_current_context()
+ agent_context: "AgentContext | None" = None,
+ # Pydantic-ai native message history for proper tool call/return pairing
+ message_history: list | None = None,
  ) -> AsyncGenerator[str, None]:
  """
  Stream Pydantic AI agent responses with rich SSE events.
@@ -131,40 +154,39 @@
  event: done
  data: {"type": "done", "reason": "stop"}
  """
- if request_id is None:
- request_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"
-
- created_at = int(time.time())
- start_time = time.time()
- is_first_chunk = True
- reasoning_step = 0
- current_step = 0
- total_steps = 3 # Model request, tool execution (optional), final response
- token_count = 0
-
- # Track active tool calls for completion events
- # Maps index -> (tool_name, tool_id) for correlating start/end events
- active_tool_calls: dict[int, tuple[str, str]] = {}
- # Queue of tool calls awaiting completion (FIFO for matching)
- pending_tool_completions: list[tuple[str, str]] = []
- # Track if metadata was registered via register_metadata tool
- metadata_registered = False
- # Track pending tool calls with full data for persistence
- # Maps tool_id -> {"tool_name": str, "tool_id": str, "arguments": dict}
- pending_tool_data: dict[str, dict] = {}
+ # Initialize streaming state
+ state = StreamingState.create(model=model, request_id=request_id)
+
+ # Get effective user_id for database operations
+ effective_user_id = agent_context.user_id if agent_context else None
+
+ # Import context functions for multi-agent support
+ from ....agentic.context import set_current_context, set_event_sink
+
+ # Set up context for multi-agent propagation
+ previous_context = None
+ if agent_context is not None:
+ from ....agentic.context import get_current_context
+ previous_context = get_current_context()
+ set_current_context(agent_context)
+
+ # Set up event sink for child agent event proxying
+ child_event_sink: asyncio.Queue = asyncio.Queue()
+ set_event_sink(child_event_sink)
 
  try:
  # Emit initial progress event
- current_step = 1
- yield format_sse_event(ProgressEvent(
- step=current_step,
- total_steps=total_steps,
+ state.current_step = 1
+ yield build_progress_event(
+ step=state.current_step,
+ total_steps=state.total_steps,
  label="Processing request",
- status="in_progress"
- ))
+ )
 
  # Use agent.iter() to get complete execution with tool calls
- async with agent.iter(prompt) as agent_run:
+ # Pass message_history if available for proper tool call/return pairing
+ iter_kwargs = {"message_history": message_history} if message_history else {}
+ async with agent.iter(prompt, **iter_kwargs) as agent_run:
  # Capture trace context IMMEDIATELY inside agent execution
  # This is deterministic - it's the OTEL context from Pydantic AI instrumentation
  # NOT dependent on any AI-generated content
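The loose counters and dicts that this hunk deletes all move into StreamingState, which is defined in the new streaming_utils.py (+327 lines) and not shown in this diff. Its shape can be inferred from how this file uses it; a hedged reconstruction:

```python
# Hedged reconstruction of StreamingState from its usage in this file;
# the authoritative definition is in streaming_utils.py.
import time
import uuid
from dataclasses import dataclass, field

@dataclass
class StreamingState:
    model: str
    request_id: str
    created_at: int
    start_time: float
    is_first_chunk: bool = True
    reasoning_step: int = 0
    current_step: int = 0
    total_steps: int = 3
    token_count: int = 0
    active_tool_calls: dict[int, tuple[str, str]] = field(default_factory=dict)
    pending_tool_completions: list[tuple[str, str]] = field(default_factory=list)
    pending_tool_data: dict[str, dict] = field(default_factory=dict)
    metadata_registered: bool = False
    child_content_streamed: bool = False
    responding_agent: str | None = None

    @classmethod
    def create(cls, model: str, request_id: str | None = None) -> "StreamingState":
        # Mirrors the deleted inline defaults: generated request id, wall-clock timestamps.
        return cls(
            model=model,
            request_id=request_id or f"chatcmpl-{uuid.uuid4().hex[:24]}",
            created_at=int(time.time()),
            start_time=time.time(),
        )

    def latency_ms(self) -> int:
        return int((time.time() - self.start_time) * 1000)
```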
@@ -185,11 +207,11 @@
  if isinstance(event, PartStartEvent) and isinstance(
  event.part, ThinkingPart
  ):
- reasoning_step += 1
+ state.reasoning_step += 1
  if event.part.content:
  yield format_sse_event(ReasoningEvent(
  content=event.part.content,
- step=reasoning_step
+ step=state.reasoning_step
  ))
 
  # Reasoning delta (streaming thinking)
@@ -199,7 +221,7 @@
  if event.delta.content_delta:
  yield format_sse_event(ReasoningEvent(
  content=event.delta.content_delta,
- step=reasoning_step
+ step=state.reasoning_step
  ))
 
  # ============================================
@@ -208,28 +230,11 @@
  elif isinstance(event, PartStartEvent) and isinstance(
  event.part, TextPart
  ):
- # TextPart may contain initial content that needs to be emitted
+ # Skip if child already streamed content
+ if state.child_content_streamed:
+ continue
  if event.part.content:
- content = event.part.content
- token_count += len(content.split())
-
- content_chunk = ChatCompletionStreamResponse(
- id=request_id,
- created=created_at,
- model=model,
- choices=[
- ChatCompletionStreamChoice(
- index=0,
- delta=ChatCompletionMessageDelta(
- role="assistant" if is_first_chunk else None,
- content=content,
- ),
- finish_reason=None,
- )
- ],
- )
- is_first_chunk = False
- yield f"data: {content_chunk.model_dump_json()}\n\n"
+ yield build_content_chunk(state, event.part.content)
 
  # ============================================
  # TOOL CALL START EVENTS
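build_content_chunk replaces the ChatCompletionStreamResponse boilerplate the old code repeated in three places. Judging from the removed blocks, it is roughly the following (a sketch inferred from the deleted inline code, not the streaming_utils.py source; the response models are the ones imported from .models above):

```python
# Sketch of build_content_chunk, inferred from the inline code it replaces.
def build_content_chunk(state: StreamingState, content: str) -> str:
    state.token_count += len(content.split())  # rough token estimate, as before
    chunk = ChatCompletionStreamResponse(
        id=state.request_id,
        created=state.created_at,
        model=state.model,
        choices=[
            ChatCompletionStreamChoice(
                index=0,
                delta=ChatCompletionMessageDelta(
                    # OpenAI convention: only the first delta carries the role
                    role="assistant" if state.is_first_chunk else None,
                    content=content,
                ),
                finish_reason=None,
            )
        ],
    )
    state.is_first_chunk = False
    return f"data: {chunk.model_dump_json()}\n\n"
```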
@@ -239,88 +244,39 @@
  ):
  tool_name = event.part.tool_name
 
- # Handle final_result specially - it's Pydantic AI's
- # internal tool for structured output
+ # Handle final_result (Pydantic AI's internal tool)
  if tool_name == "final_result":
- # Extract the structured result and emit as content
- args_dict = None
- if event.part.args is not None:
- if hasattr(event.part.args, 'args_dict'):
- args_dict = event.part.args.args_dict
- elif isinstance(event.part.args, dict):
- args_dict = event.part.args
-
+ args_dict = extract_tool_args(event.part)
  if args_dict:
- # Emit the structured result as JSON content
  result_json = json.dumps(args_dict, indent=2)
- content_chunk = ChatCompletionStreamResponse(
- id=request_id,
- created=created_at,
- model=model,
- choices=[
- ChatCompletionStreamChoice(
- index=0,
- delta=ChatCompletionMessageDelta(
- role="assistant" if is_first_chunk else None,
- content=result_json,
- ),
- finish_reason=None,
- )
- ],
- )
- is_first_chunk = False
- yield f"data: {content_chunk.model_dump_json()}\n\n"
- continue # Skip regular tool call handling
+ yield build_content_chunk(state, result_json)
+ continue
 
  tool_id = f"call_{uuid.uuid4().hex[:8]}"
- active_tool_calls[event.index] = (tool_name, tool_id)
- # Queue for completion matching (FIFO)
- pending_tool_completions.append((tool_name, tool_id))
-
- # Emit tool_call SSE event (started)
- # Try to get arguments as dict
- args_dict = None
- if event.part.args is not None:
- if hasattr(event.part.args, 'args_dict'):
- args_dict = event.part.args.args_dict
- elif isinstance(event.part.args, dict):
- args_dict = event.part.args
-
- # Log tool call with key parameters
- if args_dict and tool_name == "search_rem":
- query_type = args_dict.get("query_type", "?")
- limit = args_dict.get("limit", 20)
- table = args_dict.get("table", "")
- query_text = args_dict.get("query_text", args_dict.get("entity_key", ""))
- if query_text and len(query_text) > 50:
- query_text = query_text[:50] + "..."
- logger.info(f"🔧 {tool_name} {query_type.upper()} '{query_text}' table={table} limit={limit}")
- else:
- logger.info(f"🔧 {tool_name}")
+ state.active_tool_calls[event.index] = (tool_name, tool_id)
+ state.pending_tool_completions.append((tool_name, tool_id))
 
- yield format_sse_event(ToolCallEvent(
- tool_name=tool_name,
- tool_id=tool_id,
- status="started",
- arguments=args_dict
- ))
+ # Extract and log arguments
+ args_dict = extract_tool_args(event.part)
+ log_tool_call(tool_name, args_dict)
+
+ yield build_tool_start_event(tool_name, tool_id, args_dict)
 
- # Track tool call data for persistence (especially register_metadata)
- pending_tool_data[tool_id] = {
+ # Track for persistence
+ state.pending_tool_data[tool_id] = {
  "tool_name": tool_name,
  "tool_id": tool_id,
  "arguments": args_dict,
  }
 
  # Update progress
- current_step = 2
- total_steps = 4 # Added tool execution step
- yield format_sse_event(ProgressEvent(
- step=current_step,
- total_steps=total_steps,
+ state.current_step = 2
+ state.total_steps = 4
+ yield build_progress_event(
+ step=state.current_step,
+ total_steps=state.total_steps,
  label=f"Calling {tool_name}",
- status="in_progress"
- ))
+ )
 
  # ============================================
  # TOOL CALL COMPLETION (PartEndEvent)
@@ -328,11 +284,14 @@
  elif isinstance(event, PartEndEvent) and isinstance(
  event.part, ToolCallPart
  ):
- if event.index in active_tool_calls:
- tool_name, tool_id = active_tool_calls[event.index]
- # Note: result comes from FunctionToolResultEvent below
- # For now, mark as completed without result
- del active_tool_calls[event.index]
+ if event.index in state.active_tool_calls:
+ tool_name, tool_id = state.active_tool_calls[event.index]
+ args_dict = extract_tool_args(event.part)
+
+ if tool_id in state.pending_tool_data:
+ state.pending_tool_data[tool_id]["arguments"] = args_dict
+
+ del state.active_tool_calls[event.index]
 
  # ============================================
  # TEXT CONTENT DELTA
@@ -340,113 +299,124 @@
  elif isinstance(event, PartDeltaEvent) and isinstance(
  event.delta, TextPartDelta
  ):
+ # DUPLICATION FIX: Skip parent text if child already streamed content
+ # Child agents stream via child_content events in ask_agent tool.
+ # If parent tries to echo that content, skip it.
+ if state.child_content_streamed:
+ logger.debug("Skipping parent TextPartDelta - child content already streamed")
+ continue
+
  content = event.delta.content_delta
- token_count += len(content.split()) # Rough token estimate
-
- content_chunk = ChatCompletionStreamResponse(
- id=request_id,
- created=created_at,
- model=model,
- choices=[
- ChatCompletionStreamChoice(
- index=0,
- delta=ChatCompletionMessageDelta(
- role="assistant" if is_first_chunk else None,
- content=content,
- ),
- finish_reason=None,
- )
- ],
- )
- is_first_chunk = False
- yield f"data: {content_chunk.model_dump_json()}\n\n"
+ yield build_content_chunk(state, content)
 
  # ============================================
  # TOOL EXECUTION NODE
  # ============================================
  elif Agent.is_call_tools_node(node):
  async with node.stream(agent_run.ctx) as tools_stream:
- async for tool_event in tools_stream:
+ # Use concurrent multiplexer to handle both tool events
+ # and child agent events as they arrive (fixes streaming lag)
+ async for event_type, event_data in stream_with_child_events(
+ tools_stream=tools_stream,
+ child_event_sink=child_event_sink,
+ state=state,
+ session_id=session_id,
+ user_id=effective_user_id,
+ message_id=message_id,
+ agent_schema=agent_schema,
+ ):
+ # Handle child events (streamed from ask_agent)
+ if event_type == "child":
+ async for chunk in process_child_event(
+ child_event=event_data,
+ state=state,
+ session_id=session_id,
+ user_id=effective_user_id,
+ message_id=message_id,
+ agent_schema=agent_schema,
+ ):
+ yield chunk
+ continue
+
+ # Handle tool events
+ tool_event = event_data
+
  # Tool result event - emit completion
  if isinstance(tool_event, FunctionToolResultEvent):
  # Get the tool name/id from the pending queue (FIFO)
- if pending_tool_completions:
- tool_name, tool_id = pending_tool_completions.pop(0)
+ if state.pending_tool_completions:
+ tool_name, tool_id = state.pending_tool_completions.pop(0)
  else:
- # Fallback if queue is empty (shouldn't happen)
  tool_name = "tool"
  tool_id = f"call_{uuid.uuid4().hex[:8]}"
 
- # Check if this is a register_metadata tool result
- # It returns a dict with _metadata_event: True marker
  result_content = tool_event.result.content if hasattr(tool_event.result, 'content') else tool_event.result
  is_metadata_event = False
 
- if isinstance(result_content, dict) and result_content.get("_metadata_event"):
+ # Handle register_metadata tool results
+ metadata = extract_metadata_from_result(result_content)
+ if metadata:
  is_metadata_event = True
- metadata_registered = True # Skip default metadata at end
- # Emit MetadataEvent with registered values
- registered_confidence = result_content.get("confidence")
- registered_sources = result_content.get("sources")
- registered_references = result_content.get("references")
- registered_flags = result_content.get("flags")
- # Session naming
- registered_session_name = result_content.get("session_name")
- # Risk assessment fields
- registered_risk_level = result_content.get("risk_level")
- registered_risk_score = result_content.get("risk_score")
- registered_risk_reasoning = result_content.get("risk_reasoning")
- registered_recommended_action = result_content.get("recommended_action")
- # Extra fields
- registered_extra = result_content.get("extra")
+ state.metadata_registered = True
+
+ # Only set responding_agent if not already set by child
+ if not state.responding_agent and metadata.get("agent_schema"):
+ state.responding_agent = metadata["agent_schema"]
 
  logger.info(
- f"📊 Metadata registered: confidence={registered_confidence}, "
- f"session_name={registered_session_name}, "
- f"risk_level={registered_risk_level}, sources={registered_sources}"
+ f"📊 Metadata: confidence={metadata.get('confidence')}, "
+ f"risk_level={metadata.get('risk_level')}"
  )
 
- # Build extra dict with risk fields and any custom extras
+ # Build extra dict with risk fields
  extra_data = {}
- if registered_risk_level is not None:
- extra_data["risk_level"] = registered_risk_level
- if registered_risk_score is not None:
- extra_data["risk_score"] = registered_risk_score
- if registered_risk_reasoning is not None:
- extra_data["risk_reasoning"] = registered_risk_reasoning
- if registered_recommended_action is not None:
- extra_data["recommended_action"] = registered_recommended_action
- if registered_extra:
- extra_data.update(registered_extra)
-
- # Emit metadata event immediately
+ for field in ["risk_level", "risk_score", "risk_reasoning", "recommended_action"]:
+ if metadata.get(field) is not None:
+ extra_data[field] = metadata[field]
+ if metadata.get("extra"):
+ extra_data.update(metadata["extra"])
+
  yield format_sse_event(MetadataEvent(
  message_id=message_id,
  in_reply_to=in_reply_to,
  session_id=session_id,
  agent_schema=agent_schema,
- session_name=registered_session_name,
- confidence=registered_confidence,
- sources=registered_sources,
+ responding_agent=state.responding_agent,
+ session_name=metadata.get("session_name"),
+ confidence=metadata.get("confidence"),
+ sources=metadata.get("sources"),
  model_version=model,
- flags=registered_flags,
+ flags=metadata.get("flags"),
  extra=extra_data if extra_data else None,
  hidden=False,
  ))
 
- # Capture tool call with result for persistence
- # Special handling for register_metadata - always capture full data
- if tool_calls_out is not None and tool_id in pending_tool_data:
- tool_data = pending_tool_data[tool_id]
+ # Get complete args from pending_tool_data
+ completed_args = None
+ if tool_id in state.pending_tool_data:
+ completed_args = state.pending_tool_data[tool_id].get("arguments")
+
+ # Capture tool call for persistence
+ if tool_calls_out is not None and tool_id in state.pending_tool_data:
+ tool_data = state.pending_tool_data[tool_id]
  tool_data["result"] = result_content
  tool_data["is_metadata"] = is_metadata_event
  tool_calls_out.append(tool_data)
- del pending_tool_data[tool_id]
+ del state.pending_tool_data[tool_id]
 
  if not is_metadata_event:
+ # NOTE: text_response fallback is DISABLED
+ # Child agents now stream content via child_content events (above)
+ # which provides real-time streaming. The text_response in tool
+ # result would duplicate that content, so we skip it entirely.
+
  # Normal tool completion - emit ToolCallEvent
- result_str = str(result_content)
- result_summary = result_str[:200] + "..." if len(result_str) > 200 else result_str
+ # For finalize_intake, send full result dict for frontend
+ if tool_name == "finalize_intake" and isinstance(result_content, dict):
+ result_for_sse = result_content
+ else:
+ result_str = str(result_content)
+ result_for_sse = result_str[:200] + "..." if len(result_str) > 200 else result_str
 
  # Log result count for search_rem
  if tool_name == "search_rem" and isinstance(result_content, dict):
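stream_with_child_events lives in the new child_streaming.py (+379 lines); only its call site is visible in this hunk. A minimal sketch of such a multiplexer, assuming it races the tool stream against the child queue with asyncio.wait (the real function takes more parameters and also persists child events):

```python
# Minimal multiplexer sketch under the assumptions above; the shipped
# implementation is in child_streaming.py and is not shown in this diff.
import asyncio
from typing import Any, AsyncGenerator, AsyncIterator

async def multiplex(
    tools_stream: AsyncIterator[Any],
    child_event_sink: asyncio.Queue,
) -> AsyncGenerator[tuple[str, Any], None]:
    tool_task = asyncio.ensure_future(anext(tools_stream, None))
    child_task = asyncio.ensure_future(child_event_sink.get())
    while True:
        done, _ = await asyncio.wait(
            {tool_task, child_task}, return_when=asyncio.FIRST_COMPLETED
        )
        if child_task in done:
            # Child event arrived first: yield it immediately (this is what
            # "fixes streaming lag" - child output is not held behind tool events).
            yield ("child", child_task.result())
            child_task = asyncio.ensure_future(child_event_sink.get())
        if tool_task in done:
            event = tool_task.result()
            if event is None:  # tool stream exhausted
                # Remaining queued child events would be handled by a
                # drain step (cf. drain_child_events) in the real code.
                child_task.cancel()
                return
            yield ("tool", event)
            tool_task = asyncio.ensure_future(anext(tools_stream, None))
```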
@@ -477,14 +447,15 @@
  tool_name=tool_name,
  tool_id=tool_id,
  status="completed",
- result=result_summary
+ arguments=completed_args,
+ result=result_for_sse
  ))
 
  # Update progress after tool completion
- current_step = 3
+ state.current_step = 3
  yield format_sse_event(ProgressEvent(
- step=current_step,
- total_steps=total_steps,
+ step=state.current_step,
+ total_steps=state.total_steps,
  label="Generating response",
  status="in_progress"
  ))
@@ -513,36 +484,36 @@
  result_dict = {"result": str(output)}
 
  result_json = json.dumps(result_dict, indent=2, default=str)
- token_count += len(result_json.split())
+ state.token_count += len(result_json.split())
 
  # Emit structured result as content
  result_chunk = ChatCompletionStreamResponse(
- id=request_id,
- created=created_at,
+ id=state.request_id,
+ created=state.created_at,
  model=model,
  choices=[
  ChatCompletionStreamChoice(
  index=0,
  delta=ChatCompletionMessageDelta(
- role="assistant" if is_first_chunk else None,
+ role="assistant" if state.is_first_chunk else None,
  content=result_json,
  ),
  finish_reason=None,
  )
  ],
  )
- is_first_chunk = False
+ state.is_first_chunk = False
  yield f"data: {result_chunk.model_dump_json()}\n\n"
  except Exception as e:
  logger.debug(f"No structured result available: {e}")
 
  # Calculate latency
- latency_ms = int((time.time() - start_time) * 1000)
+ latency_ms = state.latency_ms()
 
  # Final OpenAI chunk with finish_reason
  final_chunk = ChatCompletionStreamResponse(
- id=request_id,
- created=created_at,
+ id=state.request_id,
+ created=state.created_at,
  model=model,
  choices=[
  ChatCompletionStreamChoice(
@@ -555,27 +526,28 @@
  yield f"data: {final_chunk.model_dump_json()}\n\n"
 
  # Emit metadata event only if not already registered via register_metadata tool
- if not metadata_registered:
+ if not state.metadata_registered:
  yield format_sse_event(MetadataEvent(
  message_id=message_id,
  in_reply_to=in_reply_to,
  session_id=session_id,
  agent_schema=agent_schema,
+ responding_agent=state.responding_agent,
  confidence=1.0, # Default to 100% confidence
  model_version=model,
  latency_ms=latency_ms,
- token_count=token_count,
+ token_count=state.token_count,
  # Include deterministic trace context captured from OTEL
  trace_id=captured_trace_id,
  span_id=captured_span_id,
  ))
 
  # Mark all progress complete
- for step in range(1, total_steps + 1):
+ for step in range(1, state.total_steps + 1):
  yield format_sse_event(ProgressEvent(
  step=step,
- total_steps=total_steps,
- label="Complete" if step == total_steps else f"Step {step}",
+ total_steps=state.total_steps,
+ label="Complete" if step == state.total_steps else f"Step {step}",
  status="completed"
  ))
 
@@ -587,25 +559,79 @@
 
  except Exception as e:
  import traceback
+ import re
 
  error_msg = str(e)
- logger.error(f"Streaming error: {error_msg}")
- logger.error(traceback.format_exc())
-
- # Send error as final chunk
- error_data = {
- "error": {
- "message": error_msg,
- "type": "internal_error",
- "code": "stream_error",
- }
- }
- yield f"data: {json.dumps(error_data)}\n\n"
+
+ # Parse error details for better client handling
+ error_code = "stream_error"
+ error_details: dict = {}
+ recoverable = True
+
+ # Check for rate limit errors (OpenAI 429)
+ if "429" in error_msg or "rate_limit" in error_msg.lower() or "RateLimitError" in type(e).__name__:
+ error_code = "rate_limit_exceeded"
+ recoverable = True
+
+ # Extract retry-after time from error message
+ # Pattern: "Please try again in X.XXs" or "Please try again in Xs"
+ retry_match = re.search(r"try again in (\d+(?:\.\d+)?)\s*s", error_msg)
+ if retry_match:
+ retry_seconds = float(retry_match.group(1))
+ error_details["retry_after_seconds"] = retry_seconds
+ error_details["retry_after_ms"] = int(retry_seconds * 1000)
+
+ # Extract token usage info if available
+ used_match = re.search(r"Used (\d+)", error_msg)
+ limit_match = re.search(r"Limit (\d+)", error_msg)
+ requested_match = re.search(r"Requested (\d+)", error_msg)
+ if used_match:
+ error_details["tokens_used"] = int(used_match.group(1))
+ if limit_match:
+ error_details["tokens_limit"] = int(limit_match.group(1))
+ if requested_match:
+ error_details["tokens_requested"] = int(requested_match.group(1))
+
+ logger.error(f"🔴 Streaming error: status_code: 429, model_name: {model}, body: {error_msg[:200]}")
+
+ # Check for authentication errors
+ elif "401" in error_msg or "AuthenticationError" in type(e).__name__:
+ error_code = "authentication_error"
+ recoverable = False
+ logger.error(f"🔴 Streaming error: Authentication failed")
+
+ # Check for model not found / invalid model
+ elif "404" in error_msg or "model" in error_msg.lower() and "not found" in error_msg.lower():
+ error_code = "model_not_found"
+ recoverable = False
+ logger.error(f"🔴 Streaming error: Model not found")
+
+ # Generic error
+ else:
+ logger.error(f"🔴 Streaming error: {error_msg}")
+
+ logger.error(f"🔴 {traceback.format_exc()}")
+
+ # Emit proper ErrorEvent via SSE (with event: prefix for client parsing)
+ yield format_sse_event(ErrorEvent(
+ code=error_code,
+ message=error_msg,
+ details=error_details if error_details else None,
+ recoverable=recoverable,
+ ))
 
  # Emit done event with error reason
  yield format_sse_event(DoneEvent(reason="error"))
  yield "data: [DONE]\n\n"
 
+ finally:
+ # Clean up event sink for multi-agent streaming
+ set_event_sink(None)
+ # Restore previous context for multi-agent support
+ # This ensures nested agent calls don't pollute the parent's context
+ if agent_context is not None:
+ set_current_context(previous_context)
+
 
  async def stream_simulator_response(
  prompt: str,
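The retry-after and token-usage regexes added above can be exercised against a typical OpenAI 429 body (the sample message here is illustrative, not taken from this diff):

```python
# Illustrative check of the 429 parsing; the regexes are verbatim from the
# diff above, the sample message is a typical OpenAI rate-limit body.
import re

error_msg = (
    "Error code: 429 - Rate limit reached for gpt-4o. "
    "Limit 30000, Used 29500, Requested 1200. Please try again in 1.34s."
)

retry_match = re.search(r"try again in (\d+(?:\.\d+)?)\s*s", error_msg)
assert retry_match and float(retry_match.group(1)) == 1.34

used = re.search(r"Used (\d+)", error_msg)
limit = re.search(r"Limit (\d+)", error_msg)
requested = re.search(r"Requested (\d+)", error_msg)
assert (int(used.group(1)), int(limit.group(1)), int(requested.group(1))) == (29500, 30000, 1200)
```

Clients can use retry_after_ms from the ErrorEvent details to schedule a retry instead of parsing the raw message themselves.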
@@ -708,6 +734,37 @@ async def stream_minimal_simulator(
  yield sse_string
 
 
+ async def save_user_message(
+ session_id: str,
+ user_id: str | None,
+ content: str,
+ ) -> None:
+ """
+ Save user message to database before streaming.
+
+ Shared utility used by both API and CLI for consistent user message storage.
+ """
+ if not settings.postgres.enabled or not session_id:
+ return
+
+ user_msg = {
+ "role": "user",
+ "content": content,
+ "timestamp": to_iso(utc_now()),
+ }
+ try:
+ store = SessionMessageStore(user_id=user_id or settings.test.effective_user_id)
+ await store.store_session_messages(
+ session_id=session_id,
+ messages=[user_msg],
+ user_id=user_id,
+ compress=False,
+ )
+ logger.debug(f"Saved user message to session {session_id}")
+ except Exception as e:
+ logger.error(f"Failed to save user message: {e}", exc_info=True)
+
+
  async def stream_openai_response_with_save(
  agent: Agent,
  prompt: str,
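Per the NOTE added to stream_openai_response_with_save below, callers persist the user turn first and then stream. A hypothetical call site (other parameters of stream_openai_response_with_save are elided since the full signature is not visible in this diff):

```python
# Hypothetical call site: save the user turn, then stream and persist the reply.
async def handle_chat(agent, prompt: str, session_id: str, user_id: str):
    await save_user_message(session_id=session_id, user_id=user_id, content=prompt)
    async for sse_chunk in stream_openai_response_with_save(
        agent=agent,
        prompt=prompt,
        session_id=session_id,
        user_id=user_id,
    ):
        yield sse_chunk  # forward SSE strings to the HTTP response
```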
@@ -716,6 +773,10 @@ async def stream_openai_response_with_save(
  agent_schema: str | None = None,
  session_id: str | None = None,
  user_id: str | None = None,
+ # Agent context for multi-agent propagation
+ agent_context: "AgentContext | None" = None,
+ # Pydantic-ai native message history for proper tool call/return pairing
+ message_history: list | None = None,
  ) -> AsyncGenerator[str, None]:
  """
  Wrapper around stream_openai_response that saves the assistant response after streaming.
@@ -723,6 +784,9 @@
  This accumulates all text content during streaming and saves it to the database
  after the stream completes.
 
+ NOTE: Call save_user_message() BEFORE this function to save the user's message.
+ This function only saves tool calls and assistant responses.
+
  Args:
  agent: Pydantic AI agent instance
  prompt: User prompt
@@ -731,14 +795,11 @@
  agent_schema: Agent schema name
  session_id: Session ID for message storage
  user_id: User ID for message storage
+ agent_context: Agent context for multi-agent propagation (enables child agents)
 
  Yields:
  SSE-formatted strings
  """
- from ....utils.date_utils import utc_now, to_iso
- from ....services.session import SessionMessageStore
- from ....settings import settings
-
  # Pre-generate message_id so it can be sent in metadata event
  # This allows frontend to use it for feedback before DB persistence
  message_id = str(uuid.uuid4())
@@ -763,6 +824,8 @@
  message_id=message_id,
  trace_context_out=trace_context, # Pass container to capture trace IDs
  tool_calls_out=tool_calls, # Capture tool calls for persistence
+ agent_context=agent_context, # Pass context for multi-agent support
+ message_history=message_history, # Native pydantic-ai message history
  ):
  yield chunk
 
@@ -777,6 +840,9 @@
  delta = data["choices"][0].get("delta", {})
  content = delta.get("content")
  if content:
+ # DEBUG: Check for [Calling markers in content
+ if "[Calling" in content:
+ logger.warning(f"DEBUG: Found [Calling in content chunk: {repr(content[:100])}")
  accumulated_content.append(content)
  except (json.JSONDecodeError, KeyError, IndexError):
  pass # Skip non-JSON or malformed chunks
@@ -793,6 +859,8 @@
 
  # First, store tool call messages (message_type: "tool")
  for tool_call in tool_calls:
+ if not tool_call:
+ continue
  tool_message = {
  "role": "tool",
  "content": json.dumps(tool_call.get("result", {}), default=str),
@@ -807,8 +875,34 @@
  messages_to_store.append(tool_message)
 
  # Then store assistant text response (if any)
+ # Priority: direct TextPartDelta content > tool call text_response
+ # When an agent delegates via ask_agent, the child's text_response becomes
+ # the parent's assistant response (the parent is just orchestrating)
+ full_content = None
+
  if accumulated_content:
  full_content = "".join(accumulated_content)
+ logger.warning(f"DEBUG: Using accumulated_content ({len(accumulated_content)} chunks, {len(full_content)} chars)")
+ logger.warning(f"DEBUG: First 200 chars: {repr(full_content[:200])}")
+ else:
+ logger.warning("DEBUG: accumulated_content is empty, checking text_response fallback")
+ # No direct text from TextPartDelta - check tool results for text_response
+ # This handles multi-agent delegation where child agent output is the response
+ for tool_call in tool_calls:
+ if not tool_call:
+ continue
+ result = tool_call.get("result")
+ if isinstance(result, dict) and result.get("text_response"):
+ text_response = result["text_response"]
+ if text_response and str(text_response).strip():
+ full_content = str(text_response)
+ logger.debug(
+ f"Using text_response from {tool_call.get('tool_name', 'tool')} "
+ f"({len(full_content)} chars) as assistant message"
+ )
+ break
+
+ if full_content:
  assistant_message = {
  "id": message_id, # Use pre-generated ID for consistency with metadata event
  "role": "assistant",
@@ -830,7 +924,7 @@
  )
  logger.debug(
  f"Saved {len(tool_calls)} tool calls and "
- f"{'assistant response' if accumulated_content else 'no text'} "
+ f"{'assistant response' if full_content else 'no text'} "
  f"to session {session_id}"
  )
  except Exception as e:
@@ -838,8 +932,9 @@
 
  # Update session description with session_name (non-blocking, after all yields)
  for tool_call in tool_calls:
- if tool_call.get("tool_name") == "register_metadata" and tool_call.get("is_metadata"):
- session_name = tool_call.get("arguments", {}).get("session_name")
+ if tool_call and tool_call.get("tool_name") == "register_metadata" and tool_call.get("is_metadata"):
+ arguments = tool_call.get("arguments") or {}
+ session_name = arguments.get("session_name")
  if session_name:
  try:
  from ....models.entities import Session