remdb 0.3.163__py3-none-any.whl → 0.3.200__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of remdb might be problematic.
- rem/agentic/agents/agent_manager.py +2 -1
- rem/agentic/context.py +101 -0
- rem/agentic/context_builder.py +30 -8
- rem/agentic/mcp/tool_wrapper.py +43 -14
- rem/agentic/providers/pydantic_ai.py +76 -34
- rem/agentic/schema.py +4 -3
- rem/agentic/tools/rem_tools.py +11 -0
- rem/api/main.py +1 -1
- rem/api/mcp_router/resources.py +75 -14
- rem/api/mcp_router/server.py +31 -24
- rem/api/mcp_router/tools.py +476 -155
- rem/api/routers/auth.py +11 -6
- rem/api/routers/chat/completions.py +52 -10
- rem/api/routers/chat/sse_events.py +2 -2
- rem/api/routers/chat/streaming.py +162 -19
- rem/api/routers/messages.py +96 -23
- rem/auth/middleware.py +59 -42
- rem/cli/README.md +62 -0
- rem/cli/commands/ask.py +1 -1
- rem/cli/commands/db.py +148 -70
- rem/cli/commands/process.py +171 -43
- rem/models/entities/ontology.py +93 -101
- rem/schemas/agents/core/agent-builder.yaml +143 -42
- rem/services/content/service.py +18 -5
- rem/services/email/service.py +17 -6
- rem/services/embeddings/worker.py +26 -12
- rem/services/postgres/__init__.py +28 -3
- rem/services/postgres/diff_service.py +57 -5
- rem/services/postgres/programmable_diff_service.py +635 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +2 -2
- rem/services/postgres/register_type.py +12 -11
- rem/services/postgres/repository.py +32 -21
- rem/services/postgres/schema_generator.py +5 -5
- rem/services/postgres/sql_builder.py +6 -5
- rem/services/session/__init__.py +7 -1
- rem/services/session/pydantic_messages.py +210 -0
- rem/services/user_service.py +12 -9
- rem/settings.py +7 -1
- rem/sql/background_indexes.sql +5 -0
- rem/sql/migrations/001_install.sql +148 -11
- rem/sql/migrations/002_install_models.sql +162 -132
- rem/sql/migrations/004_cache_system.sql +7 -275
- rem/utils/model_helpers.py +101 -0
- rem/utils/schema_loader.py +51 -13
- {remdb-0.3.163.dist-info → remdb-0.3.200.dist-info}/METADATA +1 -1
- {remdb-0.3.163.dist-info → remdb-0.3.200.dist-info}/RECORD +48 -46
- {remdb-0.3.163.dist-info → remdb-0.3.200.dist-info}/WHEEL +0 -0
- {remdb-0.3.163.dist-info → remdb-0.3.200.dist-info}/entry_points.txt +0 -0
rem/auth/middleware.py
CHANGED
```diff
@@ -7,14 +7,21 @@ Anonymous access with rate limiting when allow_anonymous=True.
 MCP endpoints are always protected unless explicitly disabled.
 
 Design Pattern:
-
-
-
-
+- API Key (X-API-Key): Access control guardrail, NOT user identity
+- JWT (Authorization: Bearer): Primary method for user identity
+- Dev token: Non-production testing (starts with "dev_")
+- Session: Backward compatibility for browser-based auth
 - MCP paths always require authentication (protected service)
-
-
-
+
+Authentication Flow:
+1. Check JWT/dev token/session for user identity first
+2. If user is admin: bypass API key check (admin privilege)
+3. If API key enabled and user is not admin: Validate X-API-Key header
+4. If allow_anonymous=True: Allow as anonymous (rate-limited)
+5. If allow_anonymous=False: Return 401 / redirect to login
+
+IMPORTANT: API key validates ACCESS, JWT identifies USER.
+Admin users bypass the API key requirement (trusted identity).
 
 Access Modes (configured in settings.auth):
 - enabled=true, allow_anonymous=true: Auth available, anonymous gets rate-limited access
@@ -24,10 +31,9 @@ Access Modes (configured in settings.auth):
 - mcp_requires_auth=false: MCP follows normal allow_anonymous rules (dev only)
 
 API Key Authentication (configured in settings.api):
-- api_key_enabled=true: Require X-API-Key header for
+- api_key_enabled=true: Require X-API-Key header for access
 - api_key: The secret key to validate against
-
-- X-API-Key header takes precedence over session auth
+- API key is an ACCESS GATE, not user identity - JWT still needed for user
 
 Dev Token Support (non-production only):
 - GET /api/auth/dev/token returns a Bearer token for test-user
@@ -188,6 +194,12 @@ class AuthMiddleware(BaseHTTPMiddleware):
 
         return None
 
+    def _is_admin(self, user: dict | None) -> bool:
+        """Check if user has admin role."""
+        if not user:
+            return False
+        return "admin" in user.get("roles", [])
+
     async def dispatch(self, request: Request, call_next):
         """
         Check authentication for protected paths.
@@ -212,50 +224,55 @@ class AuthMiddleware(BaseHTTPMiddleware):
         if not is_protected or is_excluded:
             return await call_next(request)
 
-        # Check for
-
-
-
+        # Check for user identity FIRST (JWT, dev token, session)
+        # This allows admin users to bypass API key requirement
+        user = None
+
+        # Check for JWT token in Authorization header (primary user identity)
+        jwt_user = self._check_jwt_token(request)
+        if jwt_user:
+            user = jwt_user
+
+        # Check for dev token (non-production only)
+        if not user:
+            dev_user = self._check_dev_token(request)
+            if dev_user:
+                user = dev_user
+
+        # Check for valid session (backward compatibility)
+        if not user:
+            session_user = request.session.get("user")
+            if session_user:
+                user = session_user
+
+        # If user is admin, bypass API key check entirely
+        if self._is_admin(user):
+            logger.debug(f"Admin user {user.get('email')} bypassing API key check")
+            request.state.user = user
            request.state.is_anonymous = False
            return await call_next(request)
 
-        #
+        # API key validation for non-admin users (access control guardrail)
         if settings.api.api_key_enabled:
-
-            if
+            api_key = request.headers.get("x-api-key")
+            if not api_key:
+                logger.debug(f"Missing X-API-Key for: {path}")
+                return JSONResponse(
+                    status_code=401,
+                    content={"detail": "API key required. Include X-API-Key header."},
+                    headers={"WWW-Authenticate": 'ApiKey realm="REM API"'},
+                )
+            if api_key != settings.api.api_key:
                 logger.warning(f"Invalid X-API-Key for: {path}")
                 return JSONResponse(
                     status_code=401,
                     content={"detail": "Invalid API key"},
                     headers={"WWW-Authenticate": 'ApiKey realm="REM API"'},
                 )
-
-            logger.debug(f"Missing X-API-Key for: {path}")
-            return JSONResponse(
-                status_code=401,
-                content={"detail": "API key required. Include X-API-Key header."},
-                headers={"WWW-Authenticate": 'ApiKey realm="REM API"'},
-            )
-
-        # Check for JWT token in Authorization header
-        jwt_user = self._check_jwt_token(request)
-        if jwt_user:
-            request.state.user = jwt_user
-            request.state.is_anonymous = False
-            return await call_next(request)
-
-        # Check for dev token (non-production only)
-        dev_user = self._check_dev_token(request)
-        if dev_user:
-            request.state.user = dev_user
-            request.state.is_anonymous = False
-            return await call_next(request)
-
-        # Check for valid session (backward compatibility)
-        user = request.session.get("user")
+            logger.debug("X-API-Key validated for access")
 
+        # If we have a valid user (non-admin, but passed API key check), allow access
         if user:
-            # Authenticated user - add to request state
             request.state.user = user
             request.state.is_anonymous = False
             return await call_next(request)
```
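For context, a minimal client-side sketch of the new flow: non-admin callers send both headers, admins need only the JWT. The base URL and the `/api/messages` path are assumptions for illustration, not taken from this diff:

```python
import httpx

BASE_URL = "http://localhost:8000"  # assumed dev server address

# Non-admin callers pass BOTH headers: X-API-Key is the access gate,
# the JWT carries the user identity.
headers = {
    "X-API-Key": "shared-secret",                # access control guardrail
    "Authorization": "Bearer <jwt-from-login>",  # user identity
}
resp = httpx.get(f"{BASE_URL}/api/messages", headers=headers)
if resp.status_code == 401:
    # Middleware returns 401 with WWW-Authenticate: ApiKey realm="REM API"
    print(resp.json()["detail"])

# Admin JWTs bypass the API key check entirely, so no X-API-Key is needed:
resp = httpx.get(
    f"{BASE_URL}/api/messages",
    headers={"Authorization": "Bearer <admin-jwt>"},
)
```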
rem/cli/README.md
CHANGED
````diff
@@ -434,6 +434,68 @@ Ensure you're using the correct model format:
 - OpenAI: `openai:gpt-4o-mini`, `openai:gpt-4o`
 - Anthropic: `anthropic:claude-sonnet-4-5-20250929`
 
+## Data Visibility: PUBLIC vs PRIVATE
+
+**IMPORTANT: All ingested data is PUBLIC by default.** This is the correct behavior
+for shared knowledge bases (ontologies, procedures, reference data).
+
+### Why PUBLIC by Default?
+
+Most data in REM should be searchable by all users:
+- Clinical ontologies (disorders, symptoms, drugs)
+- Procedures and protocols (SCID-5, PHQ-9, etc.)
+- Reference documentation
+- Shared domain knowledge
+
+The `rem_lookup()` function searches for data where `user_id IS NULL`, which means
+public data. If you set `user_id` on data, it becomes invisible to other users.
+
+### Ingesting Public Data (Default)
+
+```bash
+# Standard ingestion - data is PUBLIC
+rem process ingest ontology/procedures/ --table ontologies
+
+# From S3 - also PUBLIC
+rem process ingest s3://bucket/docs/reference.pdf
+```
+
+### Ingesting Private Data (Rare)
+
+Private data requires explicit `--make-private` flag:
+
+```bash
+# Private user data - requires --make-private and --user-id
+rem process ingest personal-notes.md --make-private --user-id user-123
+```
+
+**When to use private data:**
+- User-uploaded personal documents
+- Session-specific content
+- User notes and annotations
+
+**NEVER use private data for:**
+- Ontologies and reference material
+- Clinical procedures and protocols
+- Shared knowledge bases
+- Anything that should be searchable by agents
+
+### Common Mistake
+
+If agents can't find data via `search_rem`, the most common cause is that the data
+was ingested with a `user_id` set. Check with:
+
+```sql
+SELECT name, user_id FROM ontologies WHERE name = 'phq-9-procedure';
+-- user_id should be NULL for public data
+```
+
+Fix by setting user_id to NULL:
+```sql
+UPDATE ontologies SET user_id = NULL WHERE user_id IS NOT NULL;
+UPDATE kv_store SET user_id = NULL WHERE entity_type = 'ontologies' AND user_id IS NOT NULL;
+```
+
 ## Next Steps
 
 1. **Implement Schema Registry**
````
rem/cli/commands/ask.py
CHANGED
```diff
@@ -75,7 +75,7 @@ async def run_agent_streaming(
     """
     Run agent in streaming mode using agent.iter() with usage limits.
 
-    Design Pattern
+    Design Pattern:
     - Use agent.iter() for complete execution with tool call visibility
     - run_stream() stops after first output, missing tool calls
     - Stream tool call markers: [Calling: tool_name]
```
rem/cli/commands/db.py
CHANGED
```diff
@@ -333,64 +333,120 @@ def rebuild_cache(connection: str | None):
 
 @click.command()
 @click.argument("file_path", type=click.Path(exists=True, path_type=Path))
+@click.option("--table", "-t", default=None, help="Target table name (required for non-YAML formats)")
 @click.option("--user-id", default=None, help="User ID to scope data privately (default: public/shared)")
 @click.option("--dry-run", is_flag=True, help="Show what would be loaded without loading")
-def load(file_path: Path, user_id: str | None, dry_run: bool):
+def load(file_path: Path, table: str | None, user_id: str | None, dry_run: bool):
     """
-    Load data from
+    Load data from file into database.
 
-
-
-        key_field: name
-        rows:
-          - name: Example
-            content: Test data...
+    Supports YAML with embedded metadata, or any tabular format via Polars
+    (jsonl, parquet, csv, json, arrow, etc.). For non-YAML formats, use --table.
 
     Examples:
-        rem db load
-        rem db load data.
-        rem db load data.yaml --dry-run
+        rem db load data.yaml                # YAML with metadata
+        rem db load data.jsonl -t resources  # Any Polars-supported format
     """
-    asyncio.run(_load_async(file_path, user_id, dry_run))
+    asyncio.run(_load_async(file_path, table, user_id, dry_run))
 
 
-
+def _load_dataframe_from_file(file_path: Path) -> "pl.DataFrame":
+    """Load any Polars-supported file format into a DataFrame."""
+    import polars as pl
+
+    suffix = file_path.suffix.lower()
+
+    if suffix in {".jsonl", ".ndjson"}:
+        return pl.read_ndjson(file_path)
+    elif suffix in {".parquet", ".pq"}:
+        return pl.read_parquet(file_path)
+    elif suffix == ".csv":
+        return pl.read_csv(file_path)
+    elif suffix == ".json":
+        return pl.read_json(file_path)
+    elif suffix in {".ipc", ".arrow"}:
+        return pl.read_ipc(file_path)
+    else:
+        raise ValueError(f"Unsupported file format: {suffix}. Use any Polars-supported format.")
+
+
+async def _load_async(file_path: Path, table: str | None, user_id: str | None, dry_run: bool):
     """Async implementation of load command."""
+    import polars as pl
     import yaml
     from ...models.core.inline_edge import InlineEdge
-    from ...models.entities import
+    from ...models.entities import SharedSession
     from ...services.postgres import get_postgres_service
+    from ...utils.model_helpers import get_table_name
+    from ... import get_model_registry
 
     logger.info(f"Loading data from: {file_path}")
     scope_msg = f"user: {user_id}" if user_id else "public"
     logger.info(f"Scope: {scope_msg}")
 
-
-
-        data = yaml.safe_load(f)
-
-    if not isinstance(data, list):
-        logger.error("YAML must be a list of table definitions")
-        raise click.Abort()
-
-    if dry_run:
-        logger.info("DRY RUN - Would load:")
-        logger.info(yaml.dump(data, default_flow_style=False))
-        return
+    suffix = file_path.suffix.lower()
+    is_yaml = suffix in {".yaml", ".yml"}
 
-    #
-
+    # Build MODEL_MAP dynamically from registry
+    registry = get_model_registry()
+    registry.register_core_models()
     MODEL_MAP = {
-
-
-        "resources": Resource,
-        "messages": Message,
-        "schemas": Schema,
+        get_table_name(model): model
+        for model in registry.get_model_classes().values()
     }
 
     # Non-CoreModel tables that need direct SQL insertion
     DIRECT_INSERT_TABLES = {"shared_sessions"}
 
+    # Parse file based on format
+    if is_yaml:
+        # YAML with embedded metadata
+        with open(file_path) as f:
+            data = yaml.safe_load(f)
+
+        if not isinstance(data, list):
+            logger.error("YAML must be a list of table definitions")
+            raise click.Abort()
+
+        if dry_run:
+            logger.info("DRY RUN - Would load:")
+            logger.info(yaml.dump(data, default_flow_style=False))
+            return
+
+        table_defs = data
+    else:
+        # Polars-supported format - require --table
+        if not table:
+            logger.error(f"For {suffix} files, --table is required. Example: rem db load {file_path.name} -t resources")
+            raise click.Abort()
+
+        try:
+            df = _load_dataframe_from_file(file_path)
+        except Exception as e:
+            logger.error(f"Failed to load file: {e}")
+            raise click.Abort()
+
+        rows = df.to_dicts()
+
+        if dry_run:
+            logger.info(f"DRY RUN - Would load {len(rows)} rows to table '{table}':")
+            logger.info(f"Columns: {list(df.columns)}")
+
+            # Validate first row against model if table is known
+            if table in MODEL_MAP and rows:
+                from ...utils.model_helpers import validate_data_for_model
+                result = validate_data_for_model(MODEL_MAP[table], rows[0])
+                if result.extra_fields:
+                    logger.warning(f"Unknown fields (ignored): {result.extra_fields}")
+                if result.valid:
+                    logger.success(f"Sample row validates OK. Required: {result.required_fields or '(none)'}")
+                else:
+                    result.log_errors("Sample row")
+            return
+
+        # Wrap as single table definition
+        table_defs = [{"table": table, "rows": rows}]
+
     # Connect to database
     pg = get_postgres_service()
     if not pg:
@@ -399,23 +455,23 @@ async def _load_async(file_path: Path, user_id: str | None, dry_run: bool):
 
     await pg.connect()
 
+    # Start embedding worker for generating embeddings
+    if pg.embedding_worker:
+        await pg.embedding_worker.start()
+
     try:
         total_loaded = 0
 
-        for table_def in
+        for table_def in table_defs:
             table_name = table_def["table"]
-            key_field = table_def.get("key_field", "id")
             rows = table_def.get("rows", [])
 
             # Handle direct insert tables (non-CoreModel)
             if table_name in DIRECT_INSERT_TABLES:
                 for row_data in rows:
-                    #
-                    if "tenant_id" not in row_data:
-                        row_data["tenant_id"] = "default"
+                    # tenant_id is optional - NULL means public/shared
 
                     if table_name == "shared_sessions":
-                        # Insert shared_session directly
                         await pg.fetch(
                             """INSERT INTO shared_sessions
                             (session_id, owner_user_id, shared_with_user_id, tenant_id)
@@ -424,7 +480,7 @@ async def _load_async(file_path: Path, user_id: str | None, dry_run: bool):
                             row_data["session_id"],
                             row_data["owner_user_id"],
                             row_data["shared_with_user_id"],
-                            row_data
+                            row_data.get("tenant_id"),  # Optional - NULL means public
                         )
                         total_loaded += 1
                         logger.success(f"Loaded shared_session: {row_data['owner_user_id']} -> {row_data['shared_with_user_id']}")
@@ -434,16 +490,11 @@ async def _load_async(file_path: Path, user_id: str | None, dry_run: bool):
                 logger.warning(f"Unknown table: {table_name}, skipping")
                 continue
 
-            model_class = MODEL_MAP[table_name]
+            model_class = MODEL_MAP[table_name]
 
-            for row_data in rows:
-                #
-                #
-                # Pass --user-id to scope data privately to a specific user
-                if "user_id" not in row_data and user_id is not None:
-                    row_data["user_id"] = user_id
-                if "tenant_id" not in row_data and user_id is not None:
-                    row_data["tenant_id"] = row_data.get("user_id", user_id)
+            for row_idx, row_data in enumerate(rows):
+                # tenant_id and user_id are optional - NULL means public/shared data
+                # Data files can explicitly set tenant_id/user_id if needed
 
                 # Convert graph_edges to InlineEdge format if present
                 if "graph_edges" in row_data:
@@ -452,30 +503,40 @@ async def _load_async(file_path: Path, user_id: str | None, dry_run: bool):
                         for edge in row_data["graph_edges"]
                     ]
 
-                # Convert
-                # This handles fields like starts_timestamp, ends_timestamp, etc.
+                # Convert ISO timestamp strings
                 from ...utils.date_utils import parse_iso
                 for key, value in list(row_data.items()):
                     if isinstance(value, str) and (key.endswith("_timestamp") or key.endswith("_at")):
                         try:
                             row_data[key] = parse_iso(value)
                         except (ValueError, TypeError):
-                            pass
+                            pass
 
-                # Create model instance and upsert via repository
                 from ...services.postgres.repository import Repository
+                from ...utils.model_helpers import validate_data_for_model
 
-
-
-
+                result = validate_data_for_model(model_class, row_data)
+                if not result.valid:
+                    result.log_errors(f"Row {row_idx + 1} ({table_name})")
+                    raise click.Abort()
+
+                repo = Repository(model_class, table_name, pg)
+                await repo.upsert(result.instance)  # type: ignore[arg-type]
                 total_loaded += 1
 
-
-                name = getattr(instance, 'name', getattr(instance, 'id', '?'))
+                name = getattr(result.instance, 'name', getattr(result.instance, 'id', '?'))
                 logger.success(f"Loaded {table_name[:-1]}: {name}")
 
         logger.success(f"Data loaded successfully! Total rows: {total_loaded}")
 
+        # Wait for embeddings to complete
+        if pg.embedding_worker and pg.embedding_worker.running:
+            queue_size = pg.embedding_worker.task_queue.qsize()
+            if queue_size > 0:
+                logger.info(f"Waiting for {queue_size} embeddings to complete...")
+            await pg.embedding_worker.stop()
+            logger.success("Embeddings generated successfully")
+
     finally:
         await pg.disconnect()
 
@@ -580,7 +641,7 @@ async def _diff_async(
 
     if not result.has_changes:
         click.secho("✓ No schema drift detected", fg="green")
-        click.echo("  Database matches
+        click.echo("  Database matches source (tables, functions, triggers, views)")
         if result.filtered_count > 0:
             click.echo()
             click.secho(f"  ({result.filtered_count} destructive change(s) hidden by '{strategy}' strategy)", fg="yellow")
@@ -592,17 +653,34 @@ async def _diff_async(
     if result.filtered_count > 0:
         click.secho(f"  ({result.filtered_count} destructive change(s) hidden by '{strategy}' strategy)", fg="yellow")
     click.echo()
-
-
-
-
-
-
-
-
-
-
-
+
+    # Table/column changes (Alembic)
+    if result.summary:
+        click.echo("Table Changes:")
+        for line in result.summary:
+            if line.startswith("+"):
+                click.secho(f"  {line}", fg="green")
+            elif line.startswith("-"):
+                click.secho(f"  {line}", fg="red")
+            elif line.startswith("~"):
+                click.secho(f"  {line}", fg="yellow")
+            else:
+                click.echo(f"  {line}")
+        click.echo()
+
+    # Programmable object changes (functions, triggers, views)
+    if result.programmable_summary:
+        click.echo("Programmable Objects (functions/triggers/views):")
+        for line in result.programmable_summary:
+            if line.startswith("+"):
+                click.secho(f"  {line}", fg="green")
+            elif line.startswith("-"):
+                click.secho(f"  {line}", fg="red")
+            elif line.startswith("~"):
+                click.secho(f"  {line}", fg="yellow")
+            else:
+                click.echo(f"  {line}")
+        click.echo()
 
     # Generate migration if requested
     if generate:
```