remdb 0.3.180__py3-none-any.whl → 0.3.258__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rem/agentic/README.md +36 -2
- rem/agentic/__init__.py +10 -1
- rem/agentic/context.py +185 -1
- rem/agentic/context_builder.py +56 -35
- rem/agentic/mcp/tool_wrapper.py +2 -2
- rem/agentic/providers/pydantic_ai.py +303 -111
- rem/agentic/schema.py +2 -2
- rem/api/main.py +1 -1
- rem/api/mcp_router/resources.py +223 -0
- rem/api/mcp_router/server.py +4 -0
- rem/api/mcp_router/tools.py +608 -166
- rem/api/routers/admin.py +30 -4
- rem/api/routers/auth.py +219 -20
- rem/api/routers/chat/child_streaming.py +393 -0
- rem/api/routers/chat/completions.py +77 -40
- rem/api/routers/chat/sse_events.py +7 -3
- rem/api/routers/chat/streaming.py +381 -291
- rem/api/routers/chat/streaming_utils.py +325 -0
- rem/api/routers/common.py +18 -0
- rem/api/routers/dev.py +7 -1
- rem/api/routers/feedback.py +11 -3
- rem/api/routers/messages.py +176 -38
- rem/api/routers/models.py +9 -1
- rem/api/routers/query.py +17 -15
- rem/api/routers/shared_sessions.py +16 -0
- rem/auth/jwt.py +19 -4
- rem/auth/middleware.py +42 -28
- rem/cli/README.md +62 -0
- rem/cli/commands/ask.py +205 -114
- rem/cli/commands/db.py +55 -31
- rem/cli/commands/experiments.py +1 -1
- rem/cli/commands/process.py +179 -43
- rem/cli/commands/query.py +109 -0
- rem/cli/commands/session.py +117 -0
- rem/cli/main.py +2 -0
- rem/models/core/experiment.py +1 -1
- rem/models/entities/ontology.py +18 -20
- rem/models/entities/session.py +1 -0
- rem/schemas/agents/core/agent-builder.yaml +1 -1
- rem/schemas/agents/rem.yaml +1 -1
- rem/schemas/agents/test_orchestrator.yaml +42 -0
- rem/schemas/agents/test_structured_output.yaml +52 -0
- rem/services/content/providers.py +151 -49
- rem/services/content/service.py +18 -5
- rem/services/embeddings/worker.py +26 -12
- rem/services/postgres/__init__.py +28 -3
- rem/services/postgres/diff_service.py +57 -5
- rem/services/postgres/programmable_diff_service.py +635 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +2 -2
- rem/services/postgres/register_type.py +11 -10
- rem/services/postgres/repository.py +39 -28
- rem/services/postgres/schema_generator.py +5 -5
- rem/services/postgres/sql_builder.py +6 -5
- rem/services/rem/README.md +4 -3
- rem/services/rem/parser.py +7 -10
- rem/services/rem/service.py +47 -0
- rem/services/session/__init__.py +8 -1
- rem/services/session/compression.py +47 -5
- rem/services/session/pydantic_messages.py +310 -0
- rem/services/session/reload.py +2 -1
- rem/settings.py +92 -7
- rem/sql/migrations/001_install.sql +125 -7
- rem/sql/migrations/002_install_models.sql +159 -149
- rem/sql/migrations/004_cache_system.sql +10 -276
- rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
- rem/utils/schema_loader.py +180 -120
- {remdb-0.3.180.dist-info → remdb-0.3.258.dist-info}/METADATA +7 -6
- {remdb-0.3.180.dist-info → remdb-0.3.258.dist-info}/RECORD +70 -61
- {remdb-0.3.180.dist-info → remdb-0.3.258.dist-info}/WHEEL +0 -0
- {remdb-0.3.180.dist-info → remdb-0.3.258.dist-info}/entry_points.txt +0 -0
rem/cli/README.md
CHANGED

@@ -434,6 +434,68 @@ Ensure you're using the correct model format:
  - OpenAI: `openai:gpt-4o-mini`, `openai:gpt-4o`
  - Anthropic: `anthropic:claude-sonnet-4-5-20250929`

+ ## Data Visibility: PUBLIC vs PRIVATE
+
+ **IMPORTANT: All ingested data is PUBLIC by default.** This is the correct behavior
+ for shared knowledge bases (ontologies, procedures, reference data).
+
+ ### Why PUBLIC by Default?
+
+ Most data in REM should be searchable by all users:
+ - Clinical ontologies (disorders, symptoms, drugs)
+ - Procedures and protocols (SCID-5, PHQ-9, etc.)
+ - Reference documentation
+ - Shared domain knowledge
+
+ The `rem_lookup()` function searches for data where `user_id IS NULL`, which means
+ public data. If you set `user_id` on data, it becomes invisible to other users.
+
+ ### Ingesting Public Data (Default)
+
+ ```bash
+ # Standard ingestion - data is PUBLIC
+ rem process ingest ontology/procedures/ --table ontologies
+
+ # From S3 - also PUBLIC
+ rem process ingest s3://bucket/docs/reference.pdf
+ ```
+
+ ### Ingesting Private Data (Rare)
+
+ Private data requires explicit `--make-private` flag:
+
+ ```bash
+ # Private user data - requires --make-private and --user-id
+ rem process ingest personal-notes.md --make-private --user-id user-123
+ ```
+
+ **When to use private data:**
+ - User-uploaded personal documents
+ - Session-specific content
+ - User notes and annotations
+
+ **NEVER use private data for:**
+ - Ontologies and reference material
+ - Clinical procedures and protocols
+ - Shared knowledge bases
+ - Anything that should be searchable by agents
+
+ ### Common Mistake
+
+ If agents can't find data via `search_rem`, the most common cause is that the data
+ was ingested with a `user_id` set. Check with:
+
+ ```sql
+ SELECT name, user_id FROM ontologies WHERE name = 'phq-9-procedure';
+ -- user_id should be NULL for public data
+ ```
+
+ Fix by setting user_id to NULL:
+ ```sql
+ UPDATE ontologies SET user_id = NULL WHERE user_id IS NOT NULL;
+ UPDATE kv_store SET user_id = NULL WHERE entity_type = 'ontologies' AND user_id IS NOT NULL;
+ ```
+
  ## Next Steps

  1. **Implement Schema Registry**
rem/cli/commands/ask.py
CHANGED

@@ -71,16 +71,18 @@ async def run_agent_streaming(
  max_turns: int = 10,
  context: AgentContext | None = None,
  max_iterations: int | None = None,
+ user_message: str | None = None,
  ) -> None:
  """
- Run agent in streaming mode using
+ Run agent in streaming mode using the SAME code path as the API.

-
-
-
-
-
-
+ This uses stream_openai_response_with_save from the API to ensure:
+ 1. Tool calls are saved as separate "tool" messages (not embedded in content)
+ 2. Assistant response is clean text only (no [Calling: ...] markers)
+ 3. CLI testing is equivalent to API testing
+
+ The CLI displays tool calls as [Calling: tool_name] for visibility,
+ but these are NOT saved to the database.

  Args:
  agent: Pydantic AI agent

@@ -88,88 +90,66 @@ async def run_agent_streaming(
  max_turns: Maximum turns for agent execution (not used in current API)
  context: Optional AgentContext for session persistence
  max_iterations: Maximum iterations/requests (from agent schema or settings)
+ user_message: The user's original message (for database storage)
  """
-
- from rem.
+ import json
+ from rem.api.routers.chat.streaming import stream_openai_response_with_save, save_user_message

  logger.info("Running agent in streaming mode...")

  try:
- #
-
-
-
- # Accumulate assistant response for session persistence
- assistant_response_parts = []
-
- # Use agent.iter() to get complete execution with tool calls
- usage_limits = UsageLimits(request_limit=max_iterations) if max_iterations else None
- async with agent.iter(prompt, usage_limits=usage_limits) as agent_run:
- async for node in agent_run:
- # Check if this is a model request node (includes tool calls and text)
- if PydanticAgent.is_model_request_node(node):
- # Stream events from model request
- request_stream: Any
- async with node.stream(agent_run.ctx) as request_stream:
- async for event in request_stream:
- # Tool call start event
- if isinstance(event, PartStartEvent) and isinstance(
- event.part, ToolCallPart
- ):
- tool_marker = f"\n[Calling: {event.part.tool_name}]"
- print(tool_marker, flush=True)
- assistant_response_parts.append(tool_marker)
-
- # Text content delta
- elif isinstance(event, PartDeltaEvent) and isinstance(
- event.delta, TextPartDelta
- ):
- print(event.delta.content_delta, end="", flush=True)
- assistant_response_parts.append(event.delta.content_delta)
-
- print("\n") # Final newline after streaming
-
- # Get final result from agent_run
- result = agent_run.result
- if hasattr(result, "output"):
- logger.info("Final structured result:")
- output = result.output
- from rem.agentic.serialization import serialize_agent_result
- output_json = json.dumps(serialize_agent_result(output), indent=2)
- print(output_json)
- assistant_response_parts.append(f"\n{output_json}")
-
- # Save session messages (if session_id provided and postgres enabled)
- if context and context.session_id and settings.postgres.enabled:
- from ...services.session.compression import SessionMessageStore
-
- # Extract just the user query from prompt
- # Prompt format from ContextBuilder: system + history + user message
- # We need to extract the last user message
- user_message_content = prompt.split("\n\n")[-1] if "\n\n" in prompt else prompt
-
- user_message = {
- "role": "user",
- "content": user_message_content,
- "timestamp": to_iso_with_z(utc_now()),
- }
-
- assistant_message = {
- "role": "assistant",
- "content": "".join(assistant_response_parts),
- "timestamp": to_iso_with_z(utc_now()),
- }
-
- # Store messages with compression
- store = SessionMessageStore(user_id=context.user_id or settings.test.effective_user_id)
- await store.store_session_messages(
+ # Save user message BEFORE streaming (same as API, using shared utility)
+ if context and context.session_id and user_message:
+ await save_user_message(
  session_id=context.session_id,
- messages=[user_message, assistant_message],
  user_id=context.user_id,
-
+ content=user_message,
  )

-
+ # Use the API streaming code path for consistency
+ # This properly handles tool calls and message persistence
+ model_name = getattr(agent, 'model', 'unknown')
+ if hasattr(model_name, 'model_name'):
+ model_name = model_name.model_name
+ elif hasattr(model_name, 'name'):
+ model_name = model_name.name
+ else:
+ model_name = str(model_name)
+
+ async for chunk in stream_openai_response_with_save(
+ agent=agent.agent if hasattr(agent, 'agent') else agent,
+ prompt=prompt,
+ model=model_name,
+ session_id=context.session_id if context else None,
+ user_id=context.user_id if context else None,
+ agent_context=context,
+ ):
+ # Parse SSE chunks for CLI display
+ if chunk.startswith("event: tool_call"):
+ # Extract tool call info from next data line
+ continue
+ elif chunk.startswith("data: ") and not chunk.startswith("data: [DONE]"):
+ try:
+ data_str = chunk[6:].strip()
+ if data_str:
+ data = json.loads(data_str)
+ # Check for tool_call event
+ if data.get("type") == "tool_call":
+ tool_name = data.get("tool_name", "tool")
+ status = data.get("status", "")
+ if status == "started":
+ print(f"\n[Calling: {tool_name}]", flush=True)
+ # Check for text content (OpenAI format)
+ elif "choices" in data and data["choices"]:
+ delta = data["choices"][0].get("delta", {})
+ content = delta.get("content")
+ if content:
+ print(content, end="", flush=True)
+ except (json.JSONDecodeError, KeyError, IndexError):
+ pass
+
+ print("\n") # Final newline after streaming
+ logger.info("Final structured result:")

  except Exception as e:
  logger.error(f"Agent execution failed: {e}")

@@ -184,9 +164,13 @@ async def run_agent_non_streaming(
  context: AgentContext | None = None,
  plan: bool = False,
  max_iterations: int | None = None,
+ user_message: str | None = None,
  ) -> dict[str, Any] | None:
  """
- Run agent in non-streaming mode using agent.
+ Run agent in non-streaming mode using agent.iter() to capture tool calls.
+
+ This mirrors the streaming code path to ensure tool messages are properly
+ persisted to the database for state tracking across turns.

  Args:
  agent: Pydantic AI agent

@@ -196,77 +180,183 @@ async def run_agent_non_streaming(
  context: Optional AgentContext for session persistence
  plan: If True, output only the generated query (for query-agent)
  max_iterations: Maximum iterations/requests (from agent schema or settings)
+ user_message: The user's original message (for database storage)

  Returns:
  Output data if successful, None otherwise
  """
  from pydantic_ai import UsageLimits
+ from pydantic_ai.agent import Agent
+ from pydantic_ai.messages import (
+ FunctionToolResultEvent,
+ PartStartEvent,
+ PartEndEvent,
+ TextPart,
+ ToolCallPart,
+ )
  from rem.utils.date_utils import to_iso_with_z, utc_now

  logger.info("Running agent in non-streaming mode...")

  try:
- #
-
-
+ # Track tool calls for persistence (same as streaming code path)
+ tool_calls: list = []
+ pending_tool_data: dict = {}
+ pending_tool_completions: list = []
+ accumulated_content: list = []
+
+ # Get the underlying pydantic-ai agent
+ pydantic_agent = agent.agent if hasattr(agent, 'agent') else agent
+
+ # Use agent.iter() to capture tool calls (same as streaming)
+ async with pydantic_agent.iter(prompt) as agent_run:
+ async for node in agent_run:
+ # Handle model request nodes (text + tool call starts)
+ if Agent.is_model_request_node(node):
+ async with node.stream(agent_run.ctx) as request_stream:
+ async for event in request_stream:
+ # Capture text content
+ if isinstance(event, PartStartEvent) and isinstance(event.part, TextPart):
+ if event.part.content:
+ accumulated_content.append(event.part.content)
+
+ # Capture tool call starts
+ elif isinstance(event, PartStartEvent) and isinstance(event.part, ToolCallPart):
+ tool_name = event.part.tool_name
+ if tool_name == "final_result":
+ continue
+
+ import uuid
+ tool_id = f"call_{uuid.uuid4().hex[:8]}"
+ pending_tool_completions.append((tool_name, tool_id))
+
+ # Extract arguments
+ args_dict = {}
+ if hasattr(event.part, 'args'):
+ args = event.part.args
+ if isinstance(args, str):
+ try:
+ args_dict = json.loads(args)
+ except json.JSONDecodeError:
+ args_dict = {"raw": args}
+ elif isinstance(args, dict):
+ args_dict = args
+
+ pending_tool_data[tool_id] = {
+ "tool_name": tool_name,
+ "tool_id": tool_id,
+ "arguments": args_dict,
+ }
+
+ # Print tool call for CLI visibility
+ print(f"\n[Calling: {tool_name}]", flush=True)
+
+ # Capture tool call end (update arguments if changed)
+ elif isinstance(event, PartEndEvent) and isinstance(event.part, ToolCallPart):
+ pass # Arguments already captured at start
+
+ # Handle tool execution nodes (results)
+ elif Agent.is_call_tools_node(node):
+ async with node.stream(agent_run.ctx) as tools_stream:
+ async for event in tools_stream:
+ if isinstance(event, FunctionToolResultEvent):
+ # Get tool info from pending queue
+ if pending_tool_completions:
+ tool_name, tool_id = pending_tool_completions.pop(0)
+ else:
+ import uuid
+ tool_name = "tool"
+ tool_id = f"call_{uuid.uuid4().hex[:8]}"
+
+ result_content = event.result.content if hasattr(event.result, 'content') else event.result
+
+ # Capture tool call for persistence
+ if tool_id in pending_tool_data:
+ tool_data = pending_tool_data[tool_id]
+ tool_data["result"] = result_content
+ tool_calls.append(tool_data)
+ del pending_tool_data[tool_id]
+
+ # Get final result
+ result = agent_run.result

  # Extract output data
  output_data = None
  assistant_content = None
- if hasattr(result, "output"):
+ if result is not None and hasattr(result, "output"):
  output = result.output
  from rem.agentic.serialization import serialize_agent_result
  output_data = serialize_agent_result(output)

  if plan and isinstance(output_data, dict) and "query" in output_data:
- # Plan mode: Output only the query
- # Use sql formatting if possible or just raw string
  assistant_content = output_data["query"]
  print(assistant_content)
  else:
- #
-
+ # For string output, use it directly
+ if isinstance(output_data, str):
+ assistant_content = output_data
+ else:
+ assistant_content = json.dumps(output_data, indent=2)
  print(assistant_content)
  else:
-
- assistant_content
-
+ assistant_content = str(result) if result else ""
+ if assistant_content:
+ print(assistant_content)

  # Save to file if requested
  if output_file and output_data:
  await _save_output_file(output_file, output_data)

- # Save session messages (
+ # Save session messages including tool calls (same as streaming code path)
  if context and context.session_id and settings.postgres.enabled:
  from ...services.session.compression import SessionMessageStore

-
-
- # We need to extract the last user message
- user_message_content = prompt.split("\n\n")[-1] if "\n\n" in prompt else prompt
+ timestamp = to_iso_with_z(utc_now())
+ messages_to_store = []

-
+ # Save user message first
+ user_message_content = user_message or (prompt.split("\n\n")[-1] if "\n\n" in prompt else prompt)
+ messages_to_store.append({
  "role": "user",
  "content": user_message_content,
- "timestamp":
- }
-
-
-
-
-
-
-
+ "timestamp": timestamp,
+ })
+
+ # Save tool call messages (message_type: "tool") - CRITICAL for state tracking
+ for tool_call in tool_calls:
+ if not tool_call:
+ continue
+ tool_message = {
+ "role": "tool",
+ "content": json.dumps(tool_call.get("result", {}), default=str),
+ "timestamp": timestamp,
+ "tool_call_id": tool_call.get("tool_id"),
+ "tool_name": tool_call.get("tool_name"),
+ "tool_arguments": tool_call.get("arguments"),
+ }
+ messages_to_store.append(tool_message)
+
+ # Save assistant message
+ if assistant_content:
+ messages_to_store.append({
+ "role": "assistant",
+ "content": assistant_content,
+ "timestamp": timestamp,
+ })
+
+ # Store all messages
  store = SessionMessageStore(user_id=context.user_id or settings.test.effective_user_id)
  await store.store_session_messages(
  session_id=context.session_id,
- messages=
+ messages=messages_to_store,
  user_id=context.user_id,
- compress=
+ compress=False, # Store uncompressed; compression happens on reload
  )

- logger.debug(
+ logger.debug(
+ f"Saved {len(tool_calls)} tool calls + user/assistant messages "
+ f"to session {context.session_id}"
+ )

  return output_data

@@ -352,8 +442,8 @@ async def _save_output_file(file_path: Path, data: dict[str, Any]) -> None:
  )
  @click.option(
  "--stream/--no-stream",
- default=
- help="Enable streaming mode (default:
+ default=True,
+ help="Enable streaming mode (default: enabled)",
  )
  @click.option(
  "--user-id",

@@ -549,7 +639,7 @@ async def _ask_async(
  # Run agent with session persistence
  if stream:
- await run_agent_streaming(agent, prompt, max_turns=max_turns, context=context)
+ await run_agent_streaming(agent, prompt, max_turns=max_turns, context=context, user_message=query)
  else:
  await run_agent_non_streaming(
  agent,

@@ -558,6 +648,7 @@ async def _ask_async(
  output_file=output_file,
  context=context,
  plan=plan,
+ user_message=query,
  )

  # Log session ID for reuse
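The new streaming path above has the CLI consume the same SSE chunks the API emits via `stream_openai_response_with_save` and parse them for terminal display. That parsing branch can be read on its own; the following is a self-contained sketch of it, assuming the chunk shapes shown in the diff (`event:`/`data:` lines, a `tool_call` event type, OpenAI-style `choices[0].delta.content`). The `render_sse_chunk` helper and the synthetic chunks are illustrative, not part of remdb.

```python
# Stand-alone sketch of the SSE handling added to the CLI above.
# It mirrors the diff's branching: skip "event:" lines and "[DONE]",
# announce started tool calls, and print text deltas as they arrive.
import json

def render_sse_chunk(chunk: str) -> None:
    if chunk.startswith("event: tool_call"):
        return  # details arrive on the following "data:" line
    if not chunk.startswith("data: ") or chunk.startswith("data: [DONE]"):
        return
    try:
        data = json.loads(chunk[6:].strip() or "{}")
    except json.JSONDecodeError:
        return
    if data.get("type") == "tool_call" and data.get("status") == "started":
        print(f"\n[Calling: {data.get('tool_name', 'tool')}]", flush=True)
    elif data.get("choices"):
        content = data["choices"][0].get("delta", {}).get("content")
        if content:
            print(content, end="", flush=True)

# Synthetic chunks for illustration only:
render_sse_chunk('data: {"type": "tool_call", "tool_name": "search_rem", "status": "started"}')
render_sse_chunk('data: {"choices": [{"delta": {"content": "Hello"}}]}')
```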
rem/cli/commands/db.py
CHANGED

@@ -375,8 +375,10 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d
  import polars as pl
  import yaml
  from ...models.core.inline_edge import InlineEdge
- from ...models.entities import
+ from ...models.entities import SharedSession
  from ...services.postgres import get_postgres_service
+ from ...utils.model_helpers import get_table_name
+ from ... import get_model_registry

  logger.info(f"Loading data from: {file_path}")
  scope_msg = f"user: {user_id}" if user_id else "public"

@@ -385,13 +387,12 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d
  suffix = file_path.suffix.lower()
  is_yaml = suffix in {".yaml", ".yml"}

- #
+ # Build MODEL_MAP dynamically from registry
+ registry = get_model_registry()
+ registry.register_core_models()
  MODEL_MAP = {
-
-
- "resources": Resource,
- "messages": Message,
- "schemas": Schema,
+ get_table_name(model): model
+ for model in registry.get_model_classes().values()
  }

  # Non-CoreModel tables that need direct SQL insertion

@@ -432,12 +433,9 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d
  logger.info(f"Columns: {list(df.columns)}")

  # Validate first row against model if table is known
- if table in
- from ...models.entities import Resource, Moment, User, Message, Schema
+ if table in MODEL_MAP and rows:
  from ...utils.model_helpers import validate_data_for_model
-
- "messages": Message, "schemas": Schema}
- result = validate_data_for_model(model_map[table], rows[0])
+ result = validate_data_for_model(MODEL_MAP[table], rows[0])
  if result.extra_fields:
  logger.warning(f"Unknown fields (ignored): {result.extra_fields}")
  if result.valid:

@@ -457,6 +455,10 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d

  await pg.connect()

+ # Start embedding worker for generating embeddings
+ if pg.embedding_worker:
+ await pg.embedding_worker.start()
+
  try:
  total_loaded = 0

@@ -467,8 +469,7 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d
  # Handle direct insert tables (non-CoreModel)
  if table_name in DIRECT_INSERT_TABLES:
  for row_data in rows:
-
- row_data["tenant_id"] = "default"
+ # tenant_id is optional - NULL means public/shared

  if table_name == "shared_sessions":
  await pg.fetch(

@@ -479,7 +480,7 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d
  row_data["session_id"],
  row_data["owner_user_id"],
  row_data["shared_with_user_id"],
- row_data
+ row_data.get("tenant_id"), # Optional - NULL means public
  )
  total_loaded += 1
  logger.success(f"Loaded shared_session: {row_data['owner_user_id']} -> {row_data['shared_with_user_id']}")

@@ -492,10 +493,8 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d
  model_class = MODEL_MAP[table_name]

  for row_idx, row_data in enumerate(rows):
-
-
- if "tenant_id" not in row_data and user_id is not None:
- row_data["tenant_id"] = row_data.get("user_id", user_id)
+ # tenant_id and user_id are optional - NULL means public/shared data
+ # Data files can explicitly set tenant_id/user_id if needed

  # Convert graph_edges to InlineEdge format if present
  if "graph_edges" in row_data:

@@ -530,6 +529,14 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d

  logger.success(f"Data loaded successfully! Total rows: {total_loaded}")

+ # Wait for embeddings to complete
+ if pg.embedding_worker and pg.embedding_worker.running:
+ queue_size = pg.embedding_worker.task_queue.qsize()
+ if queue_size > 0:
+ logger.info(f"Waiting for {queue_size} embeddings to complete...")
+ await pg.embedding_worker.stop()
+ logger.success("Embeddings generated successfully")
+
  finally:
  await pg.disconnect()

@@ -634,7 +641,7 @@ async def _diff_async(

  if not result.has_changes:
  click.secho("✓ No schema drift detected", fg="green")
- click.echo(" Database matches
+ click.echo(" Database matches source (tables, functions, triggers, views)")
  if result.filtered_count > 0:
  click.echo()
  click.secho(f" ({result.filtered_count} destructive change(s) hidden by '{strategy}' strategy)", fg="yellow")

@@ -646,17 +653,34 @@ async def _diff_async(
  if result.filtered_count > 0:
  click.secho(f" ({result.filtered_count} destructive change(s) hidden by '{strategy}' strategy)", fg="yellow")
  click.echo()
-
-
-
-
-
-
-
-
-
-
-
+
+ # Table/column changes (Alembic)
+ if result.summary:
+ click.echo("Table Changes:")
+ for line in result.summary:
+ if line.startswith("+"):
+ click.secho(f" {line}", fg="green")
+ elif line.startswith("-"):
+ click.secho(f" {line}", fg="red")
+ elif line.startswith("~"):
+ click.secho(f" {line}", fg="yellow")
+ else:
+ click.echo(f" {line}")
+ click.echo()
+
+ # Programmable object changes (functions, triggers, views)
+ if result.programmable_summary:
+ click.echo("Programmable Objects (functions/triggers/views):")
+ for line in result.programmable_summary:
+ if line.startswith("+"):
+ click.secho(f" {line}", fg="green")
+ elif line.startswith("-"):
+ click.secho(f" {line}", fg="red")
+ elif line.startswith("~"):
+ click.secho(f" {line}", fg="yellow")
+ else:
+ click.echo(f" {line}")
+ click.echo()

  # Generate migration if requested
  if generate:
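A note on the db.py change above: the hard-coded table-to-model dictionary is replaced by one built from the model registry (`get_table_name(model): model for model in registry.get_model_classes().values()`). Below is a toy sketch of that pattern with a simplified stand-in registry and a pluralising naming convention; these are assumptions for illustration, not the actual `get_model_registry()`/`get_table_name()` behavior.

```python
# Toy illustration of the dynamic MODEL_MAP pattern; the registry, the models,
# and the pluralising get_table_name below are simplified assumptions,
# not the actual remdb implementations.
from dataclasses import dataclass

@dataclass
class Resource:
    name: str

@dataclass
class Message:
    content: str

def get_table_name(model: type) -> str:
    # Assumed convention: lower-cased class name with a trailing "s".
    return model.__name__.lower() + "s"

registry = {"Resource": Resource, "Message": Message}  # stand-in for the model registry

MODEL_MAP = {get_table_name(model): model for model in registry.values()}
print(sorted(MODEL_MAP))  # ['messages', 'resources']
```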
rem/cli/commands/experiments.py
CHANGED

@@ -1568,7 +1568,7 @@ def export(
  rem experiments export my-experiment

  # Export to specific bucket
- rem experiments export my-experiment --bucket
+ rem experiments export my-experiment --bucket my-data-lake

  # Include results in export
  rem experiments export my-experiment --include-results