remdb 0.3.171__py3-none-any.whl → 0.3.230__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. rem/agentic/README.md +36 -2
  2. rem/agentic/context.py +173 -0
  3. rem/agentic/context_builder.py +12 -2
  4. rem/agentic/mcp/tool_wrapper.py +39 -16
  5. rem/agentic/providers/pydantic_ai.py +78 -45
  6. rem/agentic/schema.py +6 -5
  7. rem/agentic/tools/rem_tools.py +11 -0
  8. rem/api/main.py +1 -1
  9. rem/api/mcp_router/resources.py +75 -14
  10. rem/api/mcp_router/server.py +31 -24
  11. rem/api/mcp_router/tools.py +621 -166
  12. rem/api/routers/admin.py +30 -4
  13. rem/api/routers/auth.py +114 -15
  14. rem/api/routers/chat/child_streaming.py +379 -0
  15. rem/api/routers/chat/completions.py +74 -37
  16. rem/api/routers/chat/sse_events.py +7 -3
  17. rem/api/routers/chat/streaming.py +352 -257
  18. rem/api/routers/chat/streaming_utils.py +327 -0
  19. rem/api/routers/common.py +18 -0
  20. rem/api/routers/dev.py +7 -1
  21. rem/api/routers/feedback.py +9 -1
  22. rem/api/routers/messages.py +176 -38
  23. rem/api/routers/models.py +9 -1
  24. rem/api/routers/query.py +12 -1
  25. rem/api/routers/shared_sessions.py +16 -0
  26. rem/auth/jwt.py +19 -4
  27. rem/auth/middleware.py +42 -28
  28. rem/cli/README.md +62 -0
  29. rem/cli/commands/ask.py +61 -81
  30. rem/cli/commands/db.py +148 -70
  31. rem/cli/commands/process.py +171 -43
  32. rem/models/entities/ontology.py +91 -101
  33. rem/schemas/agents/rem.yaml +1 -1
  34. rem/services/content/service.py +18 -5
  35. rem/services/email/service.py +11 -2
  36. rem/services/embeddings/worker.py +26 -12
  37. rem/services/postgres/__init__.py +28 -3
  38. rem/services/postgres/diff_service.py +57 -5
  39. rem/services/postgres/programmable_diff_service.py +635 -0
  40. rem/services/postgres/pydantic_to_sqlalchemy.py +2 -2
  41. rem/services/postgres/register_type.py +12 -11
  42. rem/services/postgres/repository.py +39 -29
  43. rem/services/postgres/schema_generator.py +5 -5
  44. rem/services/postgres/sql_builder.py +6 -5
  45. rem/services/session/__init__.py +8 -1
  46. rem/services/session/compression.py +40 -2
  47. rem/services/session/pydantic_messages.py +292 -0
  48. rem/settings.py +34 -0
  49. rem/sql/background_indexes.sql +5 -0
  50. rem/sql/migrations/001_install.sql +157 -10
  51. rem/sql/migrations/002_install_models.sql +160 -132
  52. rem/sql/migrations/004_cache_system.sql +7 -275
  53. rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
  54. rem/utils/model_helpers.py +101 -0
  55. rem/utils/schema_loader.py +79 -51
  56. {remdb-0.3.171.dist-info → remdb-0.3.230.dist-info}/METADATA +2 -2
  57. {remdb-0.3.171.dist-info → remdb-0.3.230.dist-info}/RECORD +59 -53
  58. {remdb-0.3.171.dist-info → remdb-0.3.230.dist-info}/WHEEL +0 -0
  59. {remdb-0.3.171.dist-info → remdb-0.3.230.dist-info}/entry_points.txt +0 -0
@@ -1,37 +1,38 @@
  """
  OpenAI-compatible streaming relay for Pydantic AI agents.
 
- Design Pattern:
- - Uses Pydantic AI's agent.iter() to capture full execution including tool calls
- - Emits rich SSE events: reasoning, tool_call, progress, metadata, text_delta
- - Proper OpenAI SSE format with data: prefix and [DONE] terminator
- - Error handling with graceful degradation
-
- Key Insight
- - agent.run_stream() stops after first output, missing tool calls
- - agent.iter() provides complete execution with tool call visibility
- - Use PartStartEvent to detect tool calls and thinking parts
- - Use PartDeltaEvent with TextPartDelta/ThinkingPartDelta for streaming
- - Use PartEndEvent to detect tool completion
- - Use FunctionToolResultEvent to get tool results
-
- SSE Format (OpenAI-compatible):
- data: {"id": "chatcmpl-...", "choices": [{"delta": {"content": "..."}}]}\\n\\n
- data: [DONE]\\n\\n
-
- Extended SSE Format (Custom Events):
- event: reasoning\\ndata: {"type": "reasoning", "content": "..."}\\n\\n
- event: tool_call\\ndata: {"type": "tool_call", "tool_name": "...", "status": "started"}\\n\\n
- event: progress\\ndata: {"type": "progress", "step": 1, "total_steps": 3}\\n\\n
- event: metadata\\ndata: {"type": "metadata", "confidence": 0.95}\\n\\n
-
- See sse_events.py for the full event type definitions.
+ Architecture:
+ ```
+ User Request → stream_openai_response → agent.iter() → SSE Events → Client
+
+ ├── Parent agent events (text, tool calls)
+
+ └── Child agent events (via ask_agent tool)
+
+
+ Event Sink (asyncio.Queue)
+
+
+ drain_child_events() → SSE + DB
+ ```
+
+ Modules:
+ - streaming.py: Main workflow orchestrator (this file)
+ - streaming_utils.py: Pure utility functions, StreamingState dataclass
+ - child_streaming.py: Child agent event handling
+
+ Key Design Decision (DUPLICATION FIX):
+ When child_content is streamed, state.child_content_streamed is set True.
+ Parent TextPartDelta events are SKIPPED when this flag is True,
+ preventing content from being emitted twice.
  """
 
+ from __future__ import annotations
+
+ import asyncio
  import json
- import time
  import uuid
- from typing import AsyncGenerator
+ from typing import TYPE_CHECKING, AsyncGenerator
 
  from loguru import logger
  from pydantic_ai.agent import Agent
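The event sink referenced in the new docstring comes from rem/agentic/context.py, a file added in this release (+173 lines) whose body is not shown in this diff. A minimal sketch of the pattern as the docstring describes it, assuming a contextvars-based implementation; `set_event_sink`, `set_current_context`, and `get_current_context` are confirmed by the imports below, while `get_event_sink` and `emit_child_event` are illustrative names:

```python
# Sketch of the event-sink plumbing the docstring describes. The real
# implementation lives in rem/agentic/context.py (new in 0.3.230) and is not
# shown in this diff; get_event_sink/emit_child_event are illustrative names.
import asyncio
from contextvars import ContextVar

_event_sink: ContextVar["asyncio.Queue | None"] = ContextVar("event_sink", default=None)

def set_event_sink(queue: "asyncio.Queue | None") -> None:
    """Install (or clear) the queue that child-agent events are proxied into."""
    _event_sink.set(queue)

def get_event_sink() -> "asyncio.Queue | None":
    return _event_sink.get()

async def emit_child_event(event: dict) -> None:
    """Called from inside the ask_agent tool: forward a child event to the parent."""
    sink = get_event_sink()
    if sink is not None:
        await sink.put(event)
```

Because the sink is a ContextVar, each request gets its own queue and child agents spawned anywhere inside that request's task tree see the right one.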
@@ -47,7 +48,17 @@ from pydantic_ai.messages (
  ToolCallPart,
  )
 
- from .otel_utils import get_current_trace_context, get_tracer
+ from .child_streaming import drain_child_events, stream_with_child_events, process_child_event
+ from .streaming_utils import (
+ StreamingState,
+ build_content_chunk,
+ build_progress_event,
+ build_tool_start_event,
+ extract_metadata_from_result,
+ extract_tool_args,
+ log_tool_call,
+ )
+ from .otel_utils import get_current_trace_context
  from .models import (
  ChatCompletionMessageDelta,
  ChatCompletionStreamChoice,
@@ -55,12 +66,19 @@ from .models import (
  )
  from .sse_events import (
  DoneEvent,
+ ErrorEvent,
  MetadataEvent,
  ProgressEvent,
  ReasoningEvent,
  ToolCallEvent,
  format_sse_event,
  )
+ from ....services.session import SessionMessageStore
+ from ....settings import settings
+ from ....utils.date_utils import to_iso, utc_now
+
+ if TYPE_CHECKING:
+ from ....agentic.context import AgentContext
 
 
  async def stream_openai_response(
@@ -79,6 +97,11 @@ async def stream_openai_response(
  # Mutable container to capture tool calls for persistence
  # Format: list of {"tool_name": str, "tool_id": str, "arguments": dict, "result": any}
  tool_calls_out: list | None = None,
+ # Agent context for multi-agent propagation
+ # When set, enables child agents to access parent context via get_current_context()
+ agent_context: "AgentContext | None" = None,
+ # Pydantic-ai native message history for proper tool call/return pairing
+ message_history: list | None = None,
  ) -> AsyncGenerator[str, None]:
  """
  Stream Pydantic AI agent responses with rich SSE events.
@@ -131,40 +154,39 @@
  event: done
  data: {"type": "done", "reason": "stop"}
  """
- if request_id is None:
- request_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"
-
- created_at = int(time.time())
- start_time = time.time()
- is_first_chunk = True
- reasoning_step = 0
- current_step = 0
- total_steps = 3 # Model request, tool execution (optional), final response
- token_count = 0
-
- # Track active tool calls for completion events
- # Maps index -> (tool_name, tool_id) for correlating start/end events
- active_tool_calls: dict[int, tuple[str, str]] = {}
- # Queue of tool calls awaiting completion (FIFO for matching)
- pending_tool_completions: list[tuple[str, str]] = []
- # Track if metadata was registered via register_metadata tool
- metadata_registered = False
- # Track pending tool calls with full data for persistence
- # Maps tool_id -> {"tool_name": str, "tool_id": str, "arguments": dict}
- pending_tool_data: dict[str, dict] = {}
+ # Initialize streaming state
+ state = StreamingState.create(model=model, request_id=request_id)
+
+ # Get effective user_id for database operations
+ effective_user_id = agent_context.user_id if agent_context else None
+
+ # Import context functions for multi-agent support
+ from ....agentic.context import set_current_context, set_event_sink
+
+ # Set up context for multi-agent propagation
+ previous_context = None
+ if agent_context is not None:
+ from ....agentic.context import get_current_context
+ previous_context = get_current_context()
+ set_current_context(agent_context)
+
+ # Set up event sink for child agent event proxying
+ child_event_sink: asyncio.Queue = asyncio.Queue()
+ set_event_sink(child_event_sink)
 
  try:
  # Emit initial progress event
- current_step = 1
- yield format_sse_event(ProgressEvent(
- step=current_step,
- total_steps=total_steps,
+ state.current_step = 1
+ yield build_progress_event(
+ step=state.current_step,
+ total_steps=state.total_steps,
  label="Processing request",
- status="in_progress"
- ))
+ )
 
  # Use agent.iter() to get complete execution with tool calls
- async with agent.iter(prompt) as agent_run:
+ # Pass message_history if available for proper tool call/return pairing
+ iter_kwargs = {"message_history": message_history} if message_history else {}
+ async with agent.iter(prompt, **iter_kwargs) as agent_run:
  # Capture trace context IMMEDIATELY inside agent execution
  # This is deterministic - it's the OTEL context from Pydantic AI instrumentation
  # NOT dependent on any AI-generated content
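The loose counters and dicts that this hunk deletes all move into StreamingState, which is defined in the new streaming_utils.py (+327 lines) and not shown in this diff. Its shape can be inferred from how this file uses it; a hedged reconstruction:

```python
# Hedged reconstruction of StreamingState from its usage in this file;
# the authoritative definition is in streaming_utils.py.
import time
import uuid
from dataclasses import dataclass, field

@dataclass
class StreamingState:
    model: str
    request_id: str
    created_at: int
    start_time: float
    is_first_chunk: bool = True
    reasoning_step: int = 0
    current_step: int = 0
    total_steps: int = 3
    token_count: int = 0
    active_tool_calls: dict[int, tuple[str, str]] = field(default_factory=dict)
    pending_tool_completions: list[tuple[str, str]] = field(default_factory=list)
    pending_tool_data: dict[str, dict] = field(default_factory=dict)
    metadata_registered: bool = False
    child_content_streamed: bool = False
    responding_agent: str | None = None

    @classmethod
    def create(cls, model: str, request_id: str | None = None) -> "StreamingState":
        # Mirrors the deleted inline defaults: generated request id, wall-clock timestamps.
        return cls(
            model=model,
            request_id=request_id or f"chatcmpl-{uuid.uuid4().hex[:24]}",
            created_at=int(time.time()),
            start_time=time.time(),
        )

    def latency_ms(self) -> int:
        return int((time.time() - self.start_time) * 1000)
```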
@@ -185,11 +207,11 @@
  if isinstance(event, PartStartEvent) and isinstance(
  event.part, ThinkingPart
  ):
- reasoning_step += 1
+ state.reasoning_step += 1
  if event.part.content:
  yield format_sse_event(ReasoningEvent(
  content=event.part.content,
- step=reasoning_step
+ step=state.reasoning_step
  ))
 
  # Reasoning delta (streaming thinking)
@@ -199,7 +221,7 @@
  if event.delta.content_delta:
  yield format_sse_event(ReasoningEvent(
  content=event.delta.content_delta,
- step=reasoning_step
+ step=state.reasoning_step
  ))
 
  # ============================================
@@ -208,28 +230,11 @@
  elif isinstance(event, PartStartEvent) and isinstance(
  event.part, TextPart
  ):
- # TextPart may contain initial content that needs to be emitted
+ # Skip if child already streamed content
+ if state.child_content_streamed:
+ continue
  if event.part.content:
- content = event.part.content
- token_count += len(content.split())
-
- content_chunk = ChatCompletionStreamResponse(
- id=request_id,
- created=created_at,
- model=model,
- choices=[
- ChatCompletionStreamChoice(
- index=0,
- delta=ChatCompletionMessageDelta(
- role="assistant" if is_first_chunk else None,
- content=content,
- ),
- finish_reason=None,
- )
- ],
- )
- is_first_chunk = False
- yield f"data: {content_chunk.model_dump_json()}\n\n"
+ yield build_content_chunk(state, event.part.content)
 
  # ============================================
  # TOOL CALL START EVENTS
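build_content_chunk replaces the ChatCompletionStreamResponse boilerplate the old code repeated in three places. Judging from the removed blocks, it is roughly the following (a sketch inferred from the deleted inline code, not the streaming_utils.py source; the response models are the ones imported from .models above):

```python
# Sketch of build_content_chunk, inferred from the inline code it replaces.
def build_content_chunk(state: StreamingState, content: str) -> str:
    state.token_count += len(content.split())  # rough token estimate, as before
    chunk = ChatCompletionStreamResponse(
        id=state.request_id,
        created=state.created_at,
        model=state.model,
        choices=[
            ChatCompletionStreamChoice(
                index=0,
                delta=ChatCompletionMessageDelta(
                    # OpenAI convention: only the first delta carries the role
                    role="assistant" if state.is_first_chunk else None,
                    content=content,
                ),
                finish_reason=None,
            )
        ],
    )
    state.is_first_chunk = False
    return f"data: {chunk.model_dump_json()}\n\n"
```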
@@ -239,88 +244,39 @@
  ):
  tool_name = event.part.tool_name
 
- # Handle final_result specially - it's Pydantic AI's
- # internal tool for structured output
+ # Handle final_result (Pydantic AI's internal tool)
  if tool_name == "final_result":
- # Extract the structured result and emit as content
- args_dict = None
- if event.part.args is not None:
- if hasattr(event.part.args, 'args_dict'):
- args_dict = event.part.args.args_dict
- elif isinstance(event.part.args, dict):
- args_dict = event.part.args
-
+ args_dict = extract_tool_args(event.part)
  if args_dict:
- # Emit the structured result as JSON content
  result_json = json.dumps(args_dict, indent=2)
- content_chunk = ChatCompletionStreamResponse(
- id=request_id,
- created=created_at,
- model=model,
- choices=[
- ChatCompletionStreamChoice(
- index=0,
- delta=ChatCompletionMessageDelta(
- role="assistant" if is_first_chunk else None,
- content=result_json,
- ),
- finish_reason=None,
- )
- ],
- )
- is_first_chunk = False
- yield f"data: {content_chunk.model_dump_json()}\n\n"
- continue # Skip regular tool call handling
+ yield build_content_chunk(state, result_json)
+ continue
 
  tool_id = f"call_{uuid.uuid4().hex[:8]}"
- active_tool_calls[event.index] = (tool_name, tool_id)
- # Queue for completion matching (FIFO)
- pending_tool_completions.append((tool_name, tool_id))
-
- # Emit tool_call SSE event (started)
- # Try to get arguments as dict
- args_dict = None
- if event.part.args is not None:
- if hasattr(event.part.args, 'args_dict'):
- args_dict = event.part.args.args_dict
- elif isinstance(event.part.args, dict):
- args_dict = event.part.args
-
- # Log tool call with key parameters
- if args_dict and tool_name == "search_rem":
- query_type = args_dict.get("query_type", "?")
- limit = args_dict.get("limit", 20)
- table = args_dict.get("table", "")
- query_text = args_dict.get("query_text", args_dict.get("entity_key", ""))
- if query_text and len(query_text) > 50:
- query_text = query_text[:50] + "..."
- logger.info(f"🔧 {tool_name} {query_type.upper()} '{query_text}' table={table} limit={limit}")
- else:
- logger.info(f"🔧 {tool_name}")
+ state.active_tool_calls[event.index] = (tool_name, tool_id)
+ state.pending_tool_completions.append((tool_name, tool_id))
 
- yield format_sse_event(ToolCallEvent(
- tool_name=tool_name,
- tool_id=tool_id,
- status="started",
- arguments=args_dict
- ))
+ # Extract and log arguments
+ args_dict = extract_tool_args(event.part)
+ log_tool_call(tool_name, args_dict)
+
+ yield build_tool_start_event(tool_name, tool_id, args_dict)
 
- # Track tool call data for persistence (especially register_metadata)
- pending_tool_data[tool_id] = {
+ # Track for persistence
+ state.pending_tool_data[tool_id] = {
  "tool_name": tool_name,
  "tool_id": tool_id,
  "arguments": args_dict,
  }
 
  # Update progress
- current_step = 2
- total_steps = 4 # Added tool execution step
- yield format_sse_event(ProgressEvent(
- step=current_step,
- total_steps=total_steps,
+ state.current_step = 2
+ state.total_steps = 4
+ yield build_progress_event(
+ step=state.current_step,
+ total_steps=state.total_steps,
  label=f"Calling {tool_name}",
- status="in_progress"
- ))
+ )
 
  # ============================================
  # TOOL CALL COMPLETION (PartEndEvent)
@@ -328,11 +284,14 @@
  elif isinstance(event, PartEndEvent) and isinstance(
  event.part, ToolCallPart
  ):
- if event.index in active_tool_calls:
- tool_name, tool_id = active_tool_calls[event.index]
- # Note: result comes from FunctionToolResultEvent below
- # For now, mark as completed without result
- del active_tool_calls[event.index]
+ if event.index in state.active_tool_calls:
+ tool_name, tool_id = state.active_tool_calls[event.index]
+ args_dict = extract_tool_args(event.part)
+
+ if tool_id in state.pending_tool_data:
+ state.pending_tool_data[tool_id]["arguments"] = args_dict
+
+ del state.active_tool_calls[event.index]
 
  # ============================================
  # TEXT CONTENT DELTA
@@ -340,113 +299,124 @@
  elif isinstance(event, PartDeltaEvent) and isinstance(
  event.delta, TextPartDelta
  ):
+ # DUPLICATION FIX: Skip parent text if child already streamed content
+ # Child agents stream via child_content events in ask_agent tool.
+ # If parent tries to echo that content, skip it.
+ if state.child_content_streamed:
+ logger.debug("Skipping parent TextPartDelta - child content already streamed")
+ continue
+
  content = event.delta.content_delta
- token_count += len(content.split()) # Rough token estimate
-
- content_chunk = ChatCompletionStreamResponse(
- id=request_id,
- created=created_at,
- model=model,
- choices=[
- ChatCompletionStreamChoice(
- index=0,
- delta=ChatCompletionMessageDelta(
- role="assistant" if is_first_chunk else None,
- content=content,
- ),
- finish_reason=None,
- )
- ],
- )
- is_first_chunk = False
- yield f"data: {content_chunk.model_dump_json()}\n\n"
+ yield build_content_chunk(state, content)
 
  # ============================================
  # TOOL EXECUTION NODE
  # ============================================
  elif Agent.is_call_tools_node(node):
  async with node.stream(agent_run.ctx) as tools_stream:
- async for tool_event in tools_stream:
+ # Use concurrent multiplexer to handle both tool events
+ # and child agent events as they arrive (fixes streaming lag)
+ async for event_type, event_data in stream_with_child_events(
+ tools_stream=tools_stream,
+ child_event_sink=child_event_sink,
+ state=state,
+ session_id=session_id,
+ user_id=effective_user_id,
+ message_id=message_id,
+ agent_schema=agent_schema,
+ ):
+ # Handle child events (streamed from ask_agent)
+ if event_type == "child":
+ async for chunk in process_child_event(
+ child_event=event_data,
+ state=state,
+ session_id=session_id,
+ user_id=effective_user_id,
+ message_id=message_id,
+ agent_schema=agent_schema,
+ ):
+ yield chunk
+ continue
+
+ # Handle tool events
+ tool_event = event_data
+
  # Tool result event - emit completion
  if isinstance(tool_event, FunctionToolResultEvent):
  # Get the tool name/id from the pending queue (FIFO)
- if pending_tool_completions:
- tool_name, tool_id = pending_tool_completions.pop(0)
+ if state.pending_tool_completions:
+ tool_name, tool_id = state.pending_tool_completions.pop(0)
  else:
- # Fallback if queue is empty (shouldn't happen)
  tool_name = "tool"
  tool_id = f"call_{uuid.uuid4().hex[:8]}"
 
- # Check if this is a register_metadata tool result
- # It returns a dict with _metadata_event: True marker
  result_content = tool_event.result.content if hasattr(tool_event.result, 'content') else tool_event.result
  is_metadata_event = False
 
- if isinstance(result_content, dict) and result_content.get("_metadata_event"):
+ # Handle register_metadata tool results
+ metadata = extract_metadata_from_result(result_content)
+ if metadata:
  is_metadata_event = True
- metadata_registered = True # Skip default metadata at end
- # Emit MetadataEvent with registered values
- registered_confidence = result_content.get("confidence")
- registered_sources = result_content.get("sources")
- registered_references = result_content.get("references")
- registered_flags = result_content.get("flags")
- # Session naming
- registered_session_name = result_content.get("session_name")
- # Risk assessment fields
- registered_risk_level = result_content.get("risk_level")
- registered_risk_score = result_content.get("risk_score")
- registered_risk_reasoning = result_content.get("risk_reasoning")
- registered_recommended_action = result_content.get("recommended_action")
- # Extra fields
- registered_extra = result_content.get("extra")
+ state.metadata_registered = True
+
+ # Only set responding_agent if not already set by child
+ if not state.responding_agent and metadata.get("agent_schema"):
+ state.responding_agent = metadata["agent_schema"]
 
  logger.info(
- f"📊 Metadata registered: confidence={registered_confidence}, "
- f"session_name={registered_session_name}, "
- f"risk_level={registered_risk_level}, sources={registered_sources}"
+ f"📊 Metadata: confidence={metadata.get('confidence')}, "
+ f"risk_level={metadata.get('risk_level')}"
  )
 
- # Build extra dict with risk fields and any custom extras
+ # Build extra dict with risk fields
  extra_data = {}
- if registered_risk_level is not None:
- extra_data["risk_level"] = registered_risk_level
- if registered_risk_score is not None:
- extra_data["risk_score"] = registered_risk_score
- if registered_risk_reasoning is not None:
- extra_data["risk_reasoning"] = registered_risk_reasoning
- if registered_recommended_action is not None:
- extra_data["recommended_action"] = registered_recommended_action
- if registered_extra:
- extra_data.update(registered_extra)
-
- # Emit metadata event immediately
+ for field in ["risk_level", "risk_score", "risk_reasoning", "recommended_action"]:
+ if metadata.get(field) is not None:
+ extra_data[field] = metadata[field]
+ if metadata.get("extra"):
+ extra_data.update(metadata["extra"])
+
  yield format_sse_event(MetadataEvent(
  message_id=message_id,
  in_reply_to=in_reply_to,
  session_id=session_id,
  agent_schema=agent_schema,
- session_name=registered_session_name,
- confidence=registered_confidence,
- sources=registered_sources,
+ responding_agent=state.responding_agent,
+ session_name=metadata.get("session_name"),
+ confidence=metadata.get("confidence"),
+ sources=metadata.get("sources"),
  model_version=model,
- flags=registered_flags,
+ flags=metadata.get("flags"),
  extra=extra_data if extra_data else None,
  hidden=False,
  ))
 
- # Capture tool call with result for persistence
- # Special handling for register_metadata - always capture full data
- if tool_calls_out is not None and tool_id in pending_tool_data:
- tool_data = pending_tool_data[tool_id]
+ # Get complete args from pending_tool_data
+ completed_args = None
+ if tool_id in state.pending_tool_data:
+ completed_args = state.pending_tool_data[tool_id].get("arguments")
+
+ # Capture tool call for persistence
+ if tool_calls_out is not None and tool_id in state.pending_tool_data:
+ tool_data = state.pending_tool_data[tool_id]
  tool_data["result"] = result_content
  tool_data["is_metadata"] = is_metadata_event
  tool_calls_out.append(tool_data)
- del pending_tool_data[tool_id]
+ del state.pending_tool_data[tool_id]
 
  if not is_metadata_event:
+ # NOTE: text_response fallback is DISABLED
+ # Child agents now stream content via child_content events (above)
+ # which provides real-time streaming. The text_response in tool
+ # result would duplicate that content, so we skip it entirely.
+
  # Normal tool completion - emit ToolCallEvent
- result_str = str(result_content)
- result_summary = result_str[:200] + "..." if len(result_str) > 200 else result_str
+ # For finalize_intake, send full result dict for frontend
+ if tool_name == "finalize_intake" and isinstance(result_content, dict):
+ result_for_sse = result_content
+ else:
+ result_str = str(result_content)
+ result_for_sse = result_str[:200] + "..." if len(result_str) > 200 else result_str
 
  # Log result count for search_rem
  if tool_name == "search_rem" and isinstance(result_content, dict):
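stream_with_child_events lives in the new child_streaming.py (+379 lines); only its call site is visible in this hunk. A minimal sketch of such a multiplexer, assuming it races the tool stream against the child queue with asyncio.wait (the real function takes more parameters and also persists child events):

```python
# Minimal multiplexer sketch under the assumptions above; the shipped
# implementation is in child_streaming.py and is not shown in this diff.
import asyncio
from typing import Any, AsyncGenerator, AsyncIterator

async def multiplex(
    tools_stream: AsyncIterator[Any],
    child_event_sink: asyncio.Queue,
) -> AsyncGenerator[tuple[str, Any], None]:
    tool_task = asyncio.ensure_future(anext(tools_stream, None))
    child_task = asyncio.ensure_future(child_event_sink.get())
    while True:
        done, _ = await asyncio.wait(
            {tool_task, child_task}, return_when=asyncio.FIRST_COMPLETED
        )
        if child_task in done:
            # Child event arrived first: yield it immediately (this is what
            # "fixes streaming lag" - child output is not held behind tool events).
            yield ("child", child_task.result())
            child_task = asyncio.ensure_future(child_event_sink.get())
        if tool_task in done:
            event = tool_task.result()
            if event is None:  # tool stream exhausted
                # Remaining queued child events would be handled by a
                # drain step (cf. drain_child_events) in the real code.
                child_task.cancel()
                return
            yield ("tool", event)
            tool_task = asyncio.ensure_future(anext(tools_stream, None))
```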
@@ -477,14 +447,15 @@
  tool_name=tool_name,
  tool_id=tool_id,
  status="completed",
- result=result_summary
+ arguments=completed_args,
+ result=result_for_sse
  ))
 
  # Update progress after tool completion
- current_step = 3
+ state.current_step = 3
  yield format_sse_event(ProgressEvent(
- step=current_step,
- total_steps=total_steps,
+ step=state.current_step,
+ total_steps=state.total_steps,
  label="Generating response",
  status="in_progress"
  ))
@@ -513,36 +484,36 @@
  result_dict = {"result": str(output)}
 
  result_json = json.dumps(result_dict, indent=2, default=str)
- token_count += len(result_json.split())
+ state.token_count += len(result_json.split())
 
  # Emit structured result as content
  result_chunk = ChatCompletionStreamResponse(
- id=request_id,
- created=created_at,
+ id=state.request_id,
+ created=state.created_at,
  model=model,
  choices=[
  ChatCompletionStreamChoice(
  index=0,
  delta=ChatCompletionMessageDelta(
- role="assistant" if is_first_chunk else None,
+ role="assistant" if state.is_first_chunk else None,
  content=result_json,
  ),
  finish_reason=None,
  )
  ],
  )
- is_first_chunk = False
+ state.is_first_chunk = False
  yield f"data: {result_chunk.model_dump_json()}\n\n"
  except Exception as e:
  logger.debug(f"No structured result available: {e}")
 
  # Calculate latency
- latency_ms = int((time.time() - start_time) * 1000)
+ latency_ms = state.latency_ms()
 
  # Final OpenAI chunk with finish_reason
  final_chunk = ChatCompletionStreamResponse(
- id=request_id,
- created=created_at,
+ id=state.request_id,
+ created=state.created_at,
  model=model,
  choices=[
  ChatCompletionStreamChoice(
@@ -555,27 +526,28 @@
  yield f"data: {final_chunk.model_dump_json()}\n\n"
 
  # Emit metadata event only if not already registered via register_metadata tool
- if not metadata_registered:
+ if not state.metadata_registered:
  yield format_sse_event(MetadataEvent(
  message_id=message_id,
  in_reply_to=in_reply_to,
  session_id=session_id,
  agent_schema=agent_schema,
+ responding_agent=state.responding_agent,
  confidence=1.0, # Default to 100% confidence
  model_version=model,
  latency_ms=latency_ms,
- token_count=token_count,
+ token_count=state.token_count,
  # Include deterministic trace context captured from OTEL
  trace_id=captured_trace_id,
  span_id=captured_span_id,
  ))
 
  # Mark all progress complete
- for step in range(1, total_steps + 1):
+ for step in range(1, state.total_steps + 1):
  yield format_sse_event(ProgressEvent(
  step=step,
- total_steps=total_steps,
- label="Complete" if step == total_steps else f"Step {step}",
+ total_steps=state.total_steps,
+ label="Complete" if step == state.total_steps else f"Step {step}",
  status="completed"
  ))
 
@@ -587,25 +559,79 @@
 
  except Exception as e:
  import traceback
+ import re
 
  error_msg = str(e)
- logger.error(f"Streaming error: {error_msg}")
- logger.error(traceback.format_exc())
-
- # Send error as final chunk
- error_data = {
- "error": {
- "message": error_msg,
- "type": "internal_error",
- "code": "stream_error",
- }
- }
- yield f"data: {json.dumps(error_data)}\n\n"
+
+ # Parse error details for better client handling
+ error_code = "stream_error"
+ error_details: dict = {}
+ recoverable = True
+
+ # Check for rate limit errors (OpenAI 429)
+ if "429" in error_msg or "rate_limit" in error_msg.lower() or "RateLimitError" in type(e).__name__:
+ error_code = "rate_limit_exceeded"
+ recoverable = True
+
+ # Extract retry-after time from error message
+ # Pattern: "Please try again in X.XXs" or "Please try again in Xs"
+ retry_match = re.search(r"try again in (\d+(?:\.\d+)?)\s*s", error_msg)
+ if retry_match:
+ retry_seconds = float(retry_match.group(1))
+ error_details["retry_after_seconds"] = retry_seconds
+ error_details["retry_after_ms"] = int(retry_seconds * 1000)
+
+ # Extract token usage info if available
+ used_match = re.search(r"Used (\d+)", error_msg)
+ limit_match = re.search(r"Limit (\d+)", error_msg)
+ requested_match = re.search(r"Requested (\d+)", error_msg)
+ if used_match:
+ error_details["tokens_used"] = int(used_match.group(1))
+ if limit_match:
+ error_details["tokens_limit"] = int(limit_match.group(1))
+ if requested_match:
+ error_details["tokens_requested"] = int(requested_match.group(1))
+
+ logger.error(f"🔴 Streaming error: status_code: 429, model_name: {model}, body: {error_msg[:200]}")
+
+ # Check for authentication errors
+ elif "401" in error_msg or "AuthenticationError" in type(e).__name__:
+ error_code = "authentication_error"
+ recoverable = False
+ logger.error(f"🔴 Streaming error: Authentication failed")
+
+ # Check for model not found / invalid model
+ elif "404" in error_msg or "model" in error_msg.lower() and "not found" in error_msg.lower():
+ error_code = "model_not_found"
+ recoverable = False
+ logger.error(f"🔴 Streaming error: Model not found")
+
+ # Generic error
+ else:
+ logger.error(f"🔴 Streaming error: {error_msg}")
+
+ logger.error(f"🔴 {traceback.format_exc()}")
+
+ # Emit proper ErrorEvent via SSE (with event: prefix for client parsing)
+ yield format_sse_event(ErrorEvent(
+ code=error_code,
+ message=error_msg,
+ details=error_details if error_details else None,
+ recoverable=recoverable,
+ ))
 
  # Emit done event with error reason
  yield format_sse_event(DoneEvent(reason="error"))
  yield "data: [DONE]\n\n"
 
+ finally:
+ # Clean up event sink for multi-agent streaming
+ set_event_sink(None)
+ # Restore previous context for multi-agent support
+ # This ensures nested agent calls don't pollute the parent's context
+ if agent_context is not None:
+ set_current_context(previous_context)
+
 
  async def stream_simulator_response(
  prompt: str,
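The retry-after and token-usage regexes added above can be exercised against a typical OpenAI 429 body (the sample message here is illustrative, not taken from this diff):

```python
# Illustrative check of the 429 parsing; the regexes are verbatim from the
# diff above, the sample message is a typical OpenAI rate-limit body.
import re

error_msg = (
    "Error code: 429 - Rate limit reached for gpt-4o. "
    "Limit 30000, Used 29500, Requested 1200. Please try again in 1.34s."
)

retry_match = re.search(r"try again in (\d+(?:\.\d+)?)\s*s", error_msg)
assert retry_match and float(retry_match.group(1)) == 1.34

used = re.search(r"Used (\d+)", error_msg)
limit = re.search(r"Limit (\d+)", error_msg)
requested = re.search(r"Requested (\d+)", error_msg)
assert (int(used.group(1)), int(limit.group(1)), int(requested.group(1))) == (29500, 30000, 1200)
```

Clients can use retry_after_ms from the ErrorEvent details to schedule a retry instead of parsing the raw message themselves.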
@@ -708,6 +734,37 @@ async def stream_minimal_simulator(
  yield sse_string
 
 
+ async def save_user_message(
+ session_id: str,
+ user_id: str | None,
+ content: str,
+ ) -> None:
+ """
+ Save user message to database before streaming.
+
+ Shared utility used by both API and CLI for consistent user message storage.
+ """
+ if not settings.postgres.enabled or not session_id:
+ return
+
+ user_msg = {
+ "role": "user",
+ "content": content,
+ "timestamp": to_iso(utc_now()),
+ }
+ try:
+ store = SessionMessageStore(user_id=user_id or settings.test.effective_user_id)
+ await store.store_session_messages(
+ session_id=session_id,
+ messages=[user_msg],
+ user_id=user_id,
+ compress=False,
+ )
+ logger.debug(f"Saved user message to session {session_id}")
+ except Exception as e:
+ logger.error(f"Failed to save user message: {e}", exc_info=True)
+
+
  async def stream_openai_response_with_save(
  agent: Agent,
  prompt: str,
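Per the NOTE added to stream_openai_response_with_save below, callers persist the user turn first and then stream. A hypothetical call site (other parameters of stream_openai_response_with_save are elided since the full signature is not visible in this diff):

```python
# Hypothetical call site: save the user turn, then stream and persist the reply.
async def handle_chat(agent, prompt: str, session_id: str, user_id: str):
    await save_user_message(session_id=session_id, user_id=user_id, content=prompt)
    async for sse_chunk in stream_openai_response_with_save(
        agent=agent,
        prompt=prompt,
        session_id=session_id,
        user_id=user_id,
    ):
        yield sse_chunk  # forward SSE strings to the HTTP response
```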
@@ -716,6 +773,10 @@ async def stream_openai_response_with_save(
  agent_schema: str | None = None,
  session_id: str | None = None,
  user_id: str | None = None,
+ # Agent context for multi-agent propagation
+ agent_context: "AgentContext | None" = None,
+ # Pydantic-ai native message history for proper tool call/return pairing
+ message_history: list | None = None,
  ) -> AsyncGenerator[str, None]:
  """
  Wrapper around stream_openai_response that saves the assistant response after streaming.
@@ -723,6 +784,9 @@
  This accumulates all text content during streaming and saves it to the database
  after the stream completes.
 
+ NOTE: Call save_user_message() BEFORE this function to save the user's message.
+ This function only saves tool calls and assistant responses.
+
  Args:
  agent: Pydantic AI agent instance
  prompt: User prompt
@@ -731,14 +795,11 @@
  agent_schema: Agent schema name
  session_id: Session ID for message storage
  user_id: User ID for message storage
+ agent_context: Agent context for multi-agent propagation (enables child agents)
 
  Yields:
  SSE-formatted strings
  """
- from ....utils.date_utils import utc_now, to_iso
- from ....services.session import SessionMessageStore
- from ....settings import settings
-
  # Pre-generate message_id so it can be sent in metadata event
  # This allows frontend to use it for feedback before DB persistence
  message_id = str(uuid.uuid4())
@@ -763,6 +824,8 @@
  message_id=message_id,
  trace_context_out=trace_context, # Pass container to capture trace IDs
  tool_calls_out=tool_calls, # Capture tool calls for persistence
+ agent_context=agent_context, # Pass context for multi-agent support
+ message_history=message_history, # Native pydantic-ai message history
  ):
  yield chunk
 
@@ -777,6 +840,9 @@
  delta = data["choices"][0].get("delta", {})
  content = delta.get("content")
  if content:
+ # DEBUG: Check for [Calling markers in content
+ if "[Calling" in content:
+ logger.warning(f"DEBUG: Found [Calling in content chunk: {repr(content[:100])}")
  accumulated_content.append(content)
  except (json.JSONDecodeError, KeyError, IndexError):
  pass # Skip non-JSON or malformed chunks
@@ -793,6 +859,8 @@
 
  # First, store tool call messages (message_type: "tool")
  for tool_call in tool_calls:
+ if not tool_call:
+ continue
  tool_message = {
  "role": "tool",
  "content": json.dumps(tool_call.get("result", {}), default=str),
@@ -807,8 +875,34 @@
  messages_to_store.append(tool_message)
 
  # Then store assistant text response (if any)
+ # Priority: direct TextPartDelta content > tool call text_response
+ # When an agent delegates via ask_agent, the child's text_response becomes
+ # the parent's assistant response (the parent is just orchestrating)
+ full_content = None
+
  if accumulated_content:
  full_content = "".join(accumulated_content)
+ logger.warning(f"DEBUG: Using accumulated_content ({len(accumulated_content)} chunks, {len(full_content)} chars)")
+ logger.warning(f"DEBUG: First 200 chars: {repr(full_content[:200])}")
+ else:
+ logger.warning("DEBUG: accumulated_content is empty, checking text_response fallback")
+ # No direct text from TextPartDelta - check tool results for text_response
+ # This handles multi-agent delegation where child agent output is the response
+ for tool_call in tool_calls:
+ if not tool_call:
+ continue
+ result = tool_call.get("result")
+ if isinstance(result, dict) and result.get("text_response"):
+ text_response = result["text_response"]
+ if text_response and str(text_response).strip():
+ full_content = str(text_response)
+ logger.debug(
+ f"Using text_response from {tool_call.get('tool_name', 'tool')} "
+ f"({len(full_content)} chars) as assistant message"
+ )
+ break
+
+ if full_content:
  assistant_message = {
  "id": message_id, # Use pre-generated ID for consistency with metadata event
  "role": "assistant",
@@ -830,7 +924,7 @@
  )
  logger.debug(
  f"Saved {len(tool_calls)} tool calls and "
- f"{'assistant response' if accumulated_content else 'no text'} "
+ f"{'assistant response' if full_content else 'no text'} "
  f"to session {session_id}"
  )
  except Exception as e:
@@ -838,8 +932,9 @@
 
  # Update session description with session_name (non-blocking, after all yields)
  for tool_call in tool_calls:
- if tool_call.get("tool_name") == "register_metadata" and tool_call.get("is_metadata"):
- session_name = tool_call.get("arguments", {}).get("session_name")
+ if tool_call and tool_call.get("tool_name") == "register_metadata" and tool_call.get("is_metadata"):
+ arguments = tool_call.get("arguments") or {}
+ session_name = arguments.get("session_name")
  if session_name:
  try:
  from ....models.entities import Session