PyPI - remdb - Versions diffs - 0.3.181__py3-none-any.whl → 0.3.223__py3-none-any.whl - Mend

remdb 0.3.181__py3-none-any.whl → 0.3.223__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of remdb might be problematic. Click here for more details.

Files changed (48) hide show

rem/agentic/README.md +262 -2
rem/agentic/context.py +173 -0
rem/agentic/context_builder.py +12 -2
rem/agentic/mcp/tool_wrapper.py +2 -2
rem/agentic/providers/pydantic_ai.py +1 -1
rem/agentic/schema.py +2 -2
rem/api/main.py +1 -1
rem/api/mcp_router/server.py +4 -0
rem/api/mcp_router/tools.py +542 -170
rem/api/routers/admin.py +30 -4
rem/api/routers/auth.py +106 -10
rem/api/routers/chat/completions.py +66 -18
rem/api/routers/chat/sse_events.py +7 -3
rem/api/routers/chat/streaming.py +254 -22
rem/api/routers/common.py +18 -0
rem/api/routers/dev.py +7 -1
rem/api/routers/feedback.py +9 -1
rem/api/routers/messages.py +176 -38
rem/api/routers/models.py +9 -1
rem/api/routers/query.py +12 -1
rem/api/routers/shared_sessions.py +16 -0
rem/auth/jwt.py +19 -4
rem/auth/middleware.py +42 -28
rem/cli/README.md +62 -0
rem/cli/commands/db.py +33 -19
rem/cli/commands/process.py +171 -43
rem/models/entities/ontology.py +18 -20
rem/schemas/agents/rem.yaml +1 -1
rem/services/content/service.py +18 -5
rem/services/postgres/__init__.py +28 -3
rem/services/postgres/diff_service.py +57 -5
rem/services/postgres/programmable_diff_service.py +635 -0
rem/services/postgres/pydantic_to_sqlalchemy.py +2 -2
rem/services/postgres/register_type.py +11 -10
rem/services/postgres/repository.py +14 -4
rem/services/session/__init__.py +8 -1
rem/services/session/compression.py +40 -2
rem/services/session/pydantic_messages.py +276 -0
rem/settings.py +28 -0
rem/sql/migrations/001_install.sql +125 -7
rem/sql/migrations/002_install_models.sql +136 -126
rem/sql/migrations/004_cache_system.sql +7 -275
rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
rem/utils/schema_loader.py +6 -6
{remdb-0.3.181.dist-info → remdb-0.3.223.dist-info}/METADATA +1 -1
{remdb-0.3.181.dist-info → remdb-0.3.223.dist-info}/RECORD +48 -44
{remdb-0.3.181.dist-info → remdb-0.3.223.dist-info}/WHEEL +0 -0
{remdb-0.3.181.dist-info → remdb-0.3.223.dist-info}/entry_points.txt +0 -0

rem/cli/README.md CHANGED Viewed

@@ -434,6 +434,68 @@ Ensure you're using the correct model format:
 - OpenAI: `openai:gpt-4o-mini`, `openai:gpt-4o`
 - Anthropic: `anthropic:claude-sonnet-4-5-20250929`
+## Data Visibility: PUBLIC vs PRIVATE
+**IMPORTANT: All ingested data is PUBLIC by default.** This is the correct behavior
+for shared knowledge bases (ontologies, procedures, reference data).
+### Why PUBLIC by Default?
+Most data in REM should be searchable by all users:
+- Clinical ontologies (disorders, symptoms, drugs)
+- Procedures and protocols (SCID-5, PHQ-9, etc.)
+- Reference documentation
+- Shared domain knowledge
+The `rem_lookup()` function searches for data where `user_id IS NULL`, which means
+public data. If you set `user_id` on data, it becomes invisible to other users.
+### Ingesting Public Data (Default)
+```bash
+# Standard ingestion - data is PUBLIC
+rem process ingest ontology/procedures/ --table ontologies
+# From S3 - also PUBLIC
+rem process ingest s3://bucket/docs/reference.pdf
+```
+### Ingesting Private Data (Rare)
+Private data requires explicit `--make-private` flag:
+```bash
+# Private user data - requires --make-private and --user-id
+rem process ingest personal-notes.md --make-private --user-id user-123
+```
+**When to use private data:**
+- User-uploaded personal documents
+- Session-specific content
+- User notes and annotations
+**NEVER use private data for:**
+- Ontologies and reference material
+- Clinical procedures and protocols
+- Shared knowledge bases
+- Anything that should be searchable by agents
+### Common Mistake
+If agents can't find data via `search_rem`, the most common cause is that the data
+was ingested with a `user_id` set. Check with:
+```sql
+SELECT name, user_id FROM ontologies WHERE name = 'phq-9-procedure';
+-- user_id should be NULL for public data
+```
+Fix by setting user_id to NULL:
+```sql
+UPDATE ontologies SET user_id = NULL WHERE user_id IS NOT NULL;
+UPDATE kv_store SET user_id = NULL WHERE entity_type = 'ontologies' AND user_id IS NOT NULL;
+```
 ## Next Steps
 1. **Implement Schema Registry**

rem/cli/commands/db.py CHANGED Viewed

@@ -469,8 +469,7 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d
             # Handle direct insert tables (non-CoreModel)
             if table_name in DIRECT_INSERT_TABLES:
                 for row_data in rows:
-                    if "tenant_id" not in row_data:
-                        row_data["tenant_id"] = "default"
+                    # tenant_id is optional - NULL means public/shared
                     if table_name == "shared_sessions":
                         await pg.fetch(
@@ -481,7 +480,7 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d
                             row_data["session_id"],
                             row_data["owner_user_id"],
                             row_data["shared_with_user_id"],
-                            row_data["tenant_id"],
+                            row_data.get("tenant_id"),  # Optional - NULL means public
                         )
                         total_loaded += 1
                         logger.success(f"Loaded shared_session: {row_data['owner_user_id']} -> {row_data['shared_with_user_id']}")
@@ -494,10 +493,8 @@ async def _load_async(file_path: Path, table: str | None, user_id: str | None, d
             model_class = MODEL_MAP[table_name]
             for row_idx, row_data in enumerate(rows):
-                # user_id stays NULL for public data (accessible by any user)
-                # Only set tenant_id for scoping - the --user-id flag controls tenant scope
-                if "tenant_id" not in row_data and user_id is not None:
-                    row_data["tenant_id"] = user_id
+                # tenant_id and user_id are optional - NULL means public/shared data
+                # Data files can explicitly set tenant_id/user_id if needed
                 # Convert graph_edges to InlineEdge format if present
                 if "graph_edges" in row_data:
@@ -644,7 +641,7 @@ async def _diff_async(
         if not result.has_changes:
             click.secho("✓ No schema drift detected", fg="green")
-            click.echo("  Database matches Pydantic models")
+            click.echo("  Database matches source (tables, functions, triggers, views)")
             if result.filtered_count > 0:
                 click.echo()
                 click.secho(f"  ({result.filtered_count} destructive change(s) hidden by '{strategy}' strategy)", fg="yellow")
@@ -656,17 +653,34 @@ async def _diff_async(
         if result.filtered_count > 0:
             click.secho(f"   ({result.filtered_count} destructive change(s) hidden by '{strategy}' strategy)", fg="yellow")
         click.echo()
-        click.echo("Changes:")
-        for line in result.summary:
-            if line.startswith("+"):
-                click.secho(f"  {line}", fg="green")
-            elif line.startswith("-"):
-                click.secho(f"  {line}", fg="red")
-            elif line.startswith("~"):
-                click.secho(f"  {line}", fg="yellow")
-            else:
-                click.echo(f"  {line}")
-        click.echo()
+        # Table/column changes (Alembic)
+        if result.summary:
+            click.echo("Table Changes:")
+            for line in result.summary:
+                if line.startswith("+"):
+                    click.secho(f"  {line}", fg="green")
+                elif line.startswith("-"):
+                    click.secho(f"  {line}", fg="red")
+                elif line.startswith("~"):
+                    click.secho(f"  {line}", fg="yellow")
+                else:
+                    click.echo(f"  {line}")
+            click.echo()
+        # Programmable object changes (functions, triggers, views)
+        if result.programmable_summary:
+            click.echo("Programmable Objects (functions/triggers/views):")
+            for line in result.programmable_summary:
+                if line.startswith("+"):
+                    click.secho(f"  {line}", fg="green")
+                elif line.startswith("-"):
+                    click.secho(f"  {line}", fg="red")
+                elif line.startswith("~"):
+                    click.secho(f"  {line}", fg="yellow")
+                else:
+                    click.echo(f"  {line}")
+            click.echo()
         # Generate migration if requested
         if generate:

rem/cli/commands/process.py CHANGED Viewed

@@ -11,39 +11,102 @@ from rem.services.content import ContentService
 @click.command(name="ingest")
-@click.argument("file_path", type=click.Path(exists=True))
-@click.option("--user-id", default=None, help="User ID to scope file privately (default: public/shared)")
+@click.argument("path", type=click.Path(exists=True))
+@click.option("--table", "-t", default=None, help="Target table (e.g., ontologies, resources). Auto-detected for schemas.")
+@click.option("--make-private", is_flag=True, help="Make data private to a specific user. RARELY NEEDED - most data should be public/shared.")
+@click.option("--user-id", default=None, help="User ID for private data. REQUIRES --make-private flag.")
 @click.option("--category", help="Optional file category")
 @click.option("--tags", help="Optional comma-separated tags")
+@click.option("--pattern", "-p", default="**/*.md", help="Glob pattern for directory ingestion (default: **/*.md)")
+@click.option("--dry-run", is_flag=True, help="Show what would be ingested without making changes")
 def process_ingest(
-    file_path: str,
+    path: str,
+    table: str | None,
+    make_private: bool,
     user_id: str | None,
     category: str | None,
     tags: str | None,
+    pattern: str,
+    dry_run: bool,
 ):
     """
-    Ingest a file into REM (storage + parsing + embedding).
+    Ingest files into REM (storage + parsing + embedding).
-    This command performs the full ingestion pipeline:
-    1. Reads the file from the local path.
-    2. Stores it in the configured storage (local/S3).
-    3. Parses the content.
-    4. Chunks and embeds the content into Resources.
-    5. Creates a File entity record.
+    Supports both single files and directories. For directories, recursively
+    processes files matching the pattern (default: **/*.md).
+    **IMPORTANT: Data is PUBLIC by default.** This is the correct behavior for
+    shared knowledge bases (ontologies, procedures, reference data). Private
+    user-scoped data is rarely needed and requires explicit --make-private flag.
+    Target table is auto-detected for schemas (agent.yaml → schemas table).
+    Use --table to explicitly set the target (e.g., ontologies for clinical knowledge).
     Examples:
         rem process ingest sample.pdf
         rem process ingest contract.docx --category legal --tags contract,2023
         rem process ingest agent.yaml  # Auto-detects kind=agent, saves to schemas table
+        # Directory ingestion into ontologies table (PUBLIC - no user-id needed)
+        rem process ingest ontology/procedures/scid-5/ --table ontologies
+        rem process ingest ontology/ --table ontologies --pattern "**/*.md"
+        # Preview what would be ingested
+        rem process ingest ontology/ --table ontologies --dry-run
+        # RARE: Private user-scoped data (requires --make-private)
+        rem process ingest private-notes.md --make-private --user-id user-123
     """
     import asyncio
+    # Validate: user_id requires --make-private flag
+    if user_id and not make_private:
+        raise click.UsageError(
+            "Setting --user-id requires the --make-private flag.\n\n"
+            "Data should be PUBLIC by default (no user-id). Private user-scoped data\n"
+            "is rarely needed - only use --make-private for truly personal content.\n\n"
+            "Example: rem process ingest file.md --make-private --user-id user-123"
+        )
+    # If --make-private is set, user_id is required
+    if make_private and not user_id:
+        raise click.UsageError(
+            "--make-private requires --user-id to specify which user owns the data.\n\n"
+            "Example: rem process ingest file.md --make-private --user-id user-123"
+        )
+    # Clear user_id if not making private (ensure None for public data)
+    effective_user_id = user_id if make_private else None
+    from pathlib import Path
     from ...services.content import ContentService
     async def _ingest():
-        # Initialize ContentService with repositories for proper resource saving
         from rem.services.postgres import get_postgres_service
         from rem.services.postgres.repository import Repository
-        from rem.models.entities import File, Resource
+        from rem.models.entities import File, Resource, Ontology
+        input_path = Path(path)
+        tag_list = tags.split(",") if tags else None
+        # Collect files to process
+        if input_path.is_dir():
+            files_to_process = list(input_path.glob(pattern))
+            if not files_to_process:
+                logger.error(f"No files matching '{pattern}' found in {input_path}")
+                sys.exit(1)
+            logger.info(f"Found {len(files_to_process)} files matching '{pattern}'")
+        else:
+            files_to_process = [input_path]
+        # Dry run: just show what would be processed
+        if dry_run:
+            logger.info("DRY RUN - Would ingest:")
+            for f in files_to_process[:20]:
+                entity_key = f.stem  # filename without extension
+                logger.info(f"  {f} → {table or 'auto-detect'} (key: {entity_key})")
+            if len(files_to_process) > 20:
+                logger.info(f"  ... and {len(files_to_process) - 20} more files")
+            return
         db = get_postgres_service()
         if not db:
@@ -51,53 +114,118 @@ def process_ingest(
         await db.connect()
         try:
-            file_repo = Repository(File, "files", db=db)
-            resource_repo = Repository(Resource, "resources", db=db)
-            service = ContentService(file_repo=file_repo, resource_repo=resource_repo)
-            tag_list = tags.split(",") if tags else None
-            scope_msg = f"user: {user_id}" if user_id else "public"
-            logger.info(f"Ingesting file: {file_path} ({scope_msg})")
-            result = await service.ingest_file(
-                file_uri=file_path,
-                user_id=user_id,
-                category=category,
-                tags=tag_list,
-                is_local_server=True, # CLI is local
-            )
-            # Handle schema ingestion (agents/evaluators)
-            if result.get("schema_name"):
-                logger.success(f"Schema ingested: {result['schema_name']} (kind={result.get('kind', 'agent')})")
-                logger.info(f"Version: {result.get('version', '1.0.0')}")
-            # Handle file ingestion
-            elif result.get("processing_status") == "completed":
-                logger.success(f"File ingested: {result['file_name']}")
-                logger.info(f"File ID: {result['file_id']}")
-                logger.info(f"Resources created: {result['resources_created']}")
+            # Direct table ingestion (ontologies, etc.)
+            if table:
+                await _ingest_to_table(
+                    db=db,
+                    files=files_to_process,
+                    table_name=table,
+                    user_id=effective_user_id,
+                    category=category,
+                    tag_list=tag_list,
+                )
             else:
-                logger.error(f"Ingestion failed: {result.get('message', 'Unknown error')}")
-                sys.exit(1)
+                # Standard file ingestion via ContentService
+                file_repo = Repository(File, "files", db=db)
+                resource_repo = Repository(Resource, "resources", db=db)
+                service = ContentService(file_repo=file_repo, resource_repo=resource_repo)
+                for file_path in files_to_process:
+                    scope_msg = f"user: {effective_user_id}" if effective_user_id else "public"
+                    logger.info(f"Ingesting: {file_path} ({scope_msg})")
+                    result = await service.ingest_file(
+                        file_uri=str(file_path),
+                        user_id=effective_user_id,
+                        category=category,
+                        tags=tag_list,
+                        is_local_server=True,
+                    )
+                    # Handle schema ingestion (agents/evaluators)
+                    if result.get("schema_name"):
+                        logger.success(f"Schema: {result['schema_name']} (kind={result.get('kind', 'agent')})")
+                    elif result.get("processing_status") == "completed":
+                        logger.success(f"File: {result['file_name']} ({result['resources_created']} resources)")
+                    else:
+                        logger.error(f"Failed: {result.get('message', 'Unknown error')}")
         except Exception as e:
             logger.error(f"Error during ingestion: {e}")
             sys.exit(1)
         finally:
-            # Wait for global embedding worker to finish queued tasks
+            # Wait for embedding worker to finish
             from rem.services.embeddings.worker import get_global_embedding_worker
             try:
                 worker = get_global_embedding_worker()
                 if worker and worker.running and not worker.task_queue.empty():
-                    logger.info(f"Waiting for {worker.task_queue.qsize()} embedding tasks to complete...")
-                    # Worker.stop() waits for queue to drain (see worker.py line ~148)
+                    logger.info(f"Waiting for {worker.task_queue.qsize()} embedding tasks...")
                     await worker.stop()
             except RuntimeError:
-                # Worker doesn't exist yet - no tasks queued
                 pass
             await db.disconnect()
+    async def _ingest_to_table(db, files, table_name, user_id, category, tag_list):
+        """Direct ingestion of files to a specific table (ontologies, etc.)."""
+        from rem.services.postgres.repository import Repository
+        from rem import get_model_registry
+        from rem.utils.model_helpers import get_table_name
+        # Get model class for table
+        registry = get_model_registry()
+        registry.register_core_models()
+        model_class = None
+        for model in registry.get_model_classes().values():
+            if get_table_name(model) == table_name:
+                model_class = model
+                break
+        if not model_class:
+            logger.error(f"Unknown table: {table_name}")
+            sys.exit(1)
+        repo = Repository(model_class, table_name, db=db)
+        processed = 0
+        failed = 0
+        for file_path in files:
+            try:
+                # Read file content
+                content = file_path.read_text(encoding="utf-8")
+                entity_key = file_path.stem  # filename without extension
+                # Build entity based on table
+                entity_data = {
+                    "name": entity_key,
+                    "content": content,
+                    "tags": tag_list or [],
+                }
+                # Add optional fields
+                if category:
+                    entity_data["category"] = category
+                # Scoping: user_id for private data, "public" for shared
+                # tenant_id="public" is the default for shared knowledge bases
+                entity_data["tenant_id"] = user_id or "public"
+                entity_data["user_id"] = user_id  # None = public/shared
+                # For ontologies, add URI
+                if table_name == "ontologies":
+                    entity_data["uri"] = f"file://{file_path.absolute()}"
+                entity = model_class(**entity_data)
+                await repo.upsert(entity, embeddable_fields=["content"], generate_embeddings=True)
+                processed += 1
+                logger.success(f"  ✓ {entity_key}")
+            except Exception as e:
+                failed += 1
+                logger.error(f"  ✗ {file_path.name}: {e}")
+        logger.info(f"Completed: {processed} succeeded, {failed} failed")
     asyncio.run(_ingest())
 def register_commands(group: click.Group):

rem/models/entities/ontology.py CHANGED Viewed

@@ -103,32 +103,30 @@ class Ontology(CoreModel):
             tags=["cv", "engineering"]
         )
-        # Direct-loaded: Medical knowledge base from git
-        disorder_ontology = Ontology(
-            name="panic-disorder",
-            uri="git://bwolfson-siggie/Siggy-MVP/ontology/disorders/anxiety/panic-disorder.md",
-            content="# Panic Disorder\\n\\nPanic disorder is characterized by...",
+        # Direct-loaded: Knowledge base from git
+        api_docs = Ontology(
+            name="rest-api-guide",
+            uri="git://example-org/docs/api/rest-api-guide.md",
+            content="# REST API Guide\\n\\nThis guide covers RESTful API design...",
             extracted_data={
-                "type": "disorder",
-                "category": "anxiety",
-                "icd10": "F41.0",
-                "dsm5_criteria": ["A", "B", "C", "D"],
+                "type": "documentation",
+                "category": "api",
+                "version": "2.0",
             },
-            tags=["disorder", "anxiety", "dsm5"]
+            tags=["api", "rest", "documentation"]
         )
-        # Direct-loaded: Clinical procedure from git
-        scid_node = Ontology(
-            name="scid-5-f1",
-            uri="git://bwolfson-siggie/Siggy-MVP/ontology/procedures/scid-5/module-f/scid-5-f1.md",
-            content="# scid-5-f1: Panic Attack Screening\\n\\n...",
+        # Direct-loaded: Technical spec from git
+        config_spec = Ontology(
+            name="config-schema",
+            uri="git://example-org/docs/specs/config-schema.md",
+            content="# Configuration Schema\\n\\nThis document defines...",
             extracted_data={
-                "type": "procedure",
-                "module": "F",
-                "section": "Panic Disorder",
-                "dsm5_criterion": "Panic Attack Specifier",
+                "type": "specification",
+                "format": "yaml",
+                "version": "1.0",
             },
-            tags=["scid-5", "procedure", "anxiety"]
+            tags=["config", "schema", "specification"]
         )
     """

rem/schemas/agents/rem.yaml CHANGED Viewed

@@ -124,7 +124,7 @@ json_schema_extra:
   # Explicit resource declarations for reference data
   resources:
-    - uri: rem://schemas
+    - uri: rem://agents
       name: Agent Schemas List
       description: List all available agent schemas in the system
     - uri: rem://status

rem/services/content/service.py CHANGED Viewed

@@ -274,7 +274,7 @@ class ContentService:
     async def ingest_file(
         self,
         file_uri: str,
-        user_id: str,
+        user_id: str | None = None,
         category: str | None = None,
         tags: list[str] | None = None,
         is_local_server: bool = False,
@@ -283,6 +283,10 @@ class ContentService:
         """
         Complete file ingestion pipeline: read → store → parse → chunk → embed.
+        **IMPORTANT: Data is PUBLIC by default (user_id=None).**
+        This is correct for shared knowledge bases (ontologies, procedures, reference data).
+        Private user-scoped data is rarely needed - only set user_id for truly personal content.
         **CENTRALIZED INGESTION**: This is the single entry point for all file ingestion
         in REM. It handles:
@@ -319,7 +323,9 @@ class ContentService:
         Args:
             file_uri: Source file location (local path, s3://, or https://)
-            user_id: User identifier for data isolation and ownership
+            user_id: User identifier for PRIVATE data only. Default None = PUBLIC/shared.
+                Leave as None for shared knowledge bases, ontologies, reference data.
+                Only set for truly private user-specific content.
             category: Optional category tag (document, code, audio, etc.)
             tags: Optional list of tags
             is_local_server: True if running as local/stdio MCP server
@@ -347,12 +353,19 @@ class ContentService:
         Example:
             >>> service = ContentService()
+            >>> # PUBLIC data (default) - visible to all users
             >>> result = await service.ingest_file(
-            ...     file_uri="s3://bucket/contract.pdf",
-            ...     user_id="user-123",
-            ...     category="legal"
+            ...     file_uri="s3://bucket/procedure.pdf",
+            ...     category="medical"
             ... )
             >>> print(f"Created {result['resources_created']} searchable chunks")
+            >>>
+            >>> # PRIVATE data (rare) - only for user-specific content
+            >>> result = await service.ingest_file(
+            ...     file_uri="s3://bucket/personal-notes.pdf",
+            ...     user_id="user-123",  # Only this user can access
+            ...     category="personal"
+            ... )
         """
         from pathlib import Path
         from uuid import uuid4

rem/services/postgres/__init__.py CHANGED Viewed

@@ -3,22 +3,47 @@ PostgreSQL service for CloudNativePG database operations.
 """
 from .diff_service import DiffService, SchemaDiff
+from .programmable_diff_service import (
+    DiffResult,
+    ObjectDiff,
+    ObjectType,
+    ProgrammableDiffService,
+)
 from .repository import Repository
 from .service import PostgresService
+_postgres_instance: PostgresService | None = None
 def get_postgres_service() -> PostgresService | None:
     """
-    Get PostgresService instance.
+    Get PostgresService singleton instance.
     Returns None if Postgres is disabled.
+    Uses singleton pattern to prevent connection pool exhaustion.
     """
+    global _postgres_instance
     from ...settings import settings
     if not settings.postgres.enabled:
         return None
-    return PostgresService()
+    if _postgres_instance is None:
+        _postgres_instance = PostgresService()
+    return _postgres_instance
-__all__ = ["PostgresService", "get_postgres_service", "Repository", "DiffService", "SchemaDiff"]
+__all__ = [
+    "DiffResult",
+    "DiffService",
+    "ObjectDiff",
+    "ObjectType",
+    "PostgresService",
+    "ProgrammableDiffService",
+    "Repository",
+    "SchemaDiff",
+    "get_postgres_service",
+]

remdb 0.3.181__py3-none-any.whl → 0.3.223__py3-none-any.whl

Potentially problematic release.

remdb 0.3.181__py3-none-any.whl → 0.3.223__py3-none-any.whl