remdb 0.3.171__py3-none-any.whl → 0.3.230__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. rem/agentic/README.md +36 -2
  2. rem/agentic/context.py +173 -0
  3. rem/agentic/context_builder.py +12 -2
  4. rem/agentic/mcp/tool_wrapper.py +39 -16
  5. rem/agentic/providers/pydantic_ai.py +78 -45
  6. rem/agentic/schema.py +6 -5
  7. rem/agentic/tools/rem_tools.py +11 -0
  8. rem/api/main.py +1 -1
  9. rem/api/mcp_router/resources.py +75 -14
  10. rem/api/mcp_router/server.py +31 -24
  11. rem/api/mcp_router/tools.py +621 -166
  12. rem/api/routers/admin.py +30 -4
  13. rem/api/routers/auth.py +114 -15
  14. rem/api/routers/chat/child_streaming.py +379 -0
  15. rem/api/routers/chat/completions.py +74 -37
  16. rem/api/routers/chat/sse_events.py +7 -3
  17. rem/api/routers/chat/streaming.py +352 -257
  18. rem/api/routers/chat/streaming_utils.py +327 -0
  19. rem/api/routers/common.py +18 -0
  20. rem/api/routers/dev.py +7 -1
  21. rem/api/routers/feedback.py +9 -1
  22. rem/api/routers/messages.py +176 -38
  23. rem/api/routers/models.py +9 -1
  24. rem/api/routers/query.py +12 -1
  25. rem/api/routers/shared_sessions.py +16 -0
  26. rem/auth/jwt.py +19 -4
  27. rem/auth/middleware.py +42 -28
  28. rem/cli/README.md +62 -0
  29. rem/cli/commands/ask.py +61 -81
  30. rem/cli/commands/db.py +148 -70
  31. rem/cli/commands/process.py +171 -43
  32. rem/models/entities/ontology.py +91 -101
  33. rem/schemas/agents/rem.yaml +1 -1
  34. rem/services/content/service.py +18 -5
  35. rem/services/email/service.py +11 -2
  36. rem/services/embeddings/worker.py +26 -12
  37. rem/services/postgres/__init__.py +28 -3
  38. rem/services/postgres/diff_service.py +57 -5
  39. rem/services/postgres/programmable_diff_service.py +635 -0
  40. rem/services/postgres/pydantic_to_sqlalchemy.py +2 -2
  41. rem/services/postgres/register_type.py +12 -11
  42. rem/services/postgres/repository.py +39 -29
  43. rem/services/postgres/schema_generator.py +5 -5
  44. rem/services/postgres/sql_builder.py +6 -5
  45. rem/services/session/__init__.py +8 -1
  46. rem/services/session/compression.py +40 -2
  47. rem/services/session/pydantic_messages.py +292 -0
  48. rem/settings.py +34 -0
  49. rem/sql/background_indexes.sql +5 -0
  50. rem/sql/migrations/001_install.sql +157 -10
  51. rem/sql/migrations/002_install_models.sql +160 -132
  52. rem/sql/migrations/004_cache_system.sql +7 -275
  53. rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
  54. rem/utils/model_helpers.py +101 -0
  55. rem/utils/schema_loader.py +79 -51
  56. {remdb-0.3.171.dist-info → remdb-0.3.230.dist-info}/METADATA +2 -2
  57. {remdb-0.3.171.dist-info → remdb-0.3.230.dist-info}/RECORD +59 -53
  58. {remdb-0.3.171.dist-info → remdb-0.3.230.dist-info}/WHEEL +0 -0
  59. {remdb-0.3.171.dist-info → remdb-0.3.230.dist-info}/entry_points.txt +0 -0
rem/cli/commands/process.py
@@ -11,39 +11,102 @@ from rem.services.content import ContentService
 
 
 @click.command(name="ingest")
-@click.argument("file_path", type=click.Path(exists=True))
-@click.option("--user-id", default=None, help="User ID to scope file privately (default: public/shared)")
+@click.argument("path", type=click.Path(exists=True))
+@click.option("--table", "-t", default=None, help="Target table (e.g., ontologies, resources). Auto-detected for schemas.")
+@click.option("--make-private", is_flag=True, help="Make data private to a specific user. RARELY NEEDED - most data should be public/shared.")
+@click.option("--user-id", default=None, help="User ID for private data. REQUIRES --make-private flag.")
 @click.option("--category", help="Optional file category")
 @click.option("--tags", help="Optional comma-separated tags")
+@click.option("--pattern", "-p", default="**/*.md", help="Glob pattern for directory ingestion (default: **/*.md)")
+@click.option("--dry-run", is_flag=True, help="Show what would be ingested without making changes")
 def process_ingest(
-    file_path: str,
+    path: str,
+    table: str | None,
+    make_private: bool,
     user_id: str | None,
     category: str | None,
     tags: str | None,
+    pattern: str,
+    dry_run: bool,
 ):
     """
-    Ingest a file into REM (storage + parsing + embedding).
+    Ingest files into REM (storage + parsing + embedding).
 
-    This command performs the full ingestion pipeline:
-    1. Reads the file from the local path.
-    2. Stores it in the configured storage (local/S3).
-    3. Parses the content.
-    4. Chunks and embeds the content into Resources.
-    5. Creates a File entity record.
+    Supports both single files and directories. For directories, recursively
+    processes files matching the pattern (default: **/*.md).
+
+    **IMPORTANT: Data is PUBLIC by default.** This is the correct behavior for
+    shared knowledge bases (ontologies, procedures, reference data). Private
+    user-scoped data is rarely needed and requires explicit --make-private flag.
+
+    Target table is auto-detected for schemas (agent.yaml → schemas table).
+    Use --table to explicitly set the target (e.g., ontologies for clinical knowledge).
 
     Examples:
         rem process ingest sample.pdf
        rem process ingest contract.docx --category legal --tags contract,2023
        rem process ingest agent.yaml  # Auto-detects kind=agent, saves to schemas table
+
+        # Directory ingestion into ontologies table (PUBLIC - no user-id needed)
+        rem process ingest ontology/procedures/scid-5/ --table ontologies
+        rem process ingest ontology/ --table ontologies --pattern "**/*.md"
+
+        # Preview what would be ingested
+        rem process ingest ontology/ --table ontologies --dry-run
+
+        # RARE: Private user-scoped data (requires --make-private)
+        rem process ingest private-notes.md --make-private --user-id user-123
     """
     import asyncio
+
+    # Validate: user_id requires --make-private flag
+    if user_id and not make_private:
+        raise click.UsageError(
+            "Setting --user-id requires the --make-private flag.\n\n"
+            "Data should be PUBLIC by default (no user-id). Private user-scoped data\n"
+            "is rarely needed - only use --make-private for truly personal content.\n\n"
+            "Example: rem process ingest file.md --make-private --user-id user-123"
+        )
+
+    # If --make-private is set, user_id is required
+    if make_private and not user_id:
+        raise click.UsageError(
+            "--make-private requires --user-id to specify which user owns the data.\n\n"
+            "Example: rem process ingest file.md --make-private --user-id user-123"
+        )
+
+    # Clear user_id if not making private (ensure None for public data)
+    effective_user_id = user_id if make_private else None
+    from pathlib import Path
     from ...services.content import ContentService
 
     async def _ingest():
-        # Initialize ContentService with repositories for proper resource saving
        from rem.services.postgres import get_postgres_service
        from rem.services.postgres.repository import Repository
-        from rem.models.entities import File, Resource
+        from rem.models.entities import File, Resource, Ontology
+
+        input_path = Path(path)
+        tag_list = tags.split(",") if tags else None
+
+        # Collect files to process
+        if input_path.is_dir():
+            files_to_process = list(input_path.glob(pattern))
+            if not files_to_process:
+                logger.error(f"No files matching '{pattern}' found in {input_path}")
+                sys.exit(1)
+            logger.info(f"Found {len(files_to_process)} files matching '{pattern}'")
+        else:
+            files_to_process = [input_path]
+
+        # Dry run: just show what would be processed
+        if dry_run:
+            logger.info("DRY RUN - Would ingest:")
+            for f in files_to_process[:20]:
+                entity_key = f.stem  # filename without extension
+                logger.info(f"  {f} → {table or 'auto-detect'} (key: {entity_key})")
+            if len(files_to_process) > 20:
+                logger.info(f"  ... and {len(files_to_process) - 20} more files")
+            return
 
        db = get_postgres_service()
        if not db:
@@ -51,53 +114,118 @@ def process_ingest(
        await db.connect()
 
        try:
-            file_repo = Repository(File, "files", db=db)
-            resource_repo = Repository(Resource, "resources", db=db)
-            service = ContentService(file_repo=file_repo, resource_repo=resource_repo)
-
-            tag_list = tags.split(",") if tags else None
-
-            scope_msg = f"user: {user_id}" if user_id else "public"
-            logger.info(f"Ingesting file: {file_path} ({scope_msg})")
-            result = await service.ingest_file(
-                file_uri=file_path,
-                user_id=user_id,
-                category=category,
-                tags=tag_list,
-                is_local_server=True,  # CLI is local
-            )
-
-            # Handle schema ingestion (agents/evaluators)
-            if result.get("schema_name"):
-                logger.success(f"Schema ingested: {result['schema_name']} (kind={result.get('kind', 'agent')})")
-                logger.info(f"Version: {result.get('version', '1.0.0')}")
-            # Handle file ingestion
-            elif result.get("processing_status") == "completed":
-                logger.success(f"File ingested: {result['file_name']}")
-                logger.info(f"File ID: {result['file_id']}")
-                logger.info(f"Resources created: {result['resources_created']}")
+            # Direct table ingestion (ontologies, etc.)
+            if table:
+                await _ingest_to_table(
+                    db=db,
+                    files=files_to_process,
+                    table_name=table,
+                    user_id=effective_user_id,
+                    category=category,
+                    tag_list=tag_list,
+                )
            else:
-                logger.error(f"Ingestion failed: {result.get('message', 'Unknown error')}")
-                sys.exit(1)
+                # Standard file ingestion via ContentService
+                file_repo = Repository(File, "files", db=db)
+                resource_repo = Repository(Resource, "resources", db=db)
+                service = ContentService(file_repo=file_repo, resource_repo=resource_repo)
+
+                for file_path in files_to_process:
+                    scope_msg = f"user: {effective_user_id}" if effective_user_id else "public"
+                    logger.info(f"Ingesting: {file_path} ({scope_msg})")
+
+                    result = await service.ingest_file(
+                        file_uri=str(file_path),
+                        user_id=effective_user_id,
+                        category=category,
+                        tags=tag_list,
+                        is_local_server=True,
+                    )
+
+                    # Handle schema ingestion (agents/evaluators)
+                    if result.get("schema_name"):
+                        logger.success(f"Schema: {result['schema_name']} (kind={result.get('kind', 'agent')})")
+                    elif result.get("processing_status") == "completed":
+                        logger.success(f"File: {result['file_name']} ({result['resources_created']} resources)")
+                    else:
+                        logger.error(f"Failed: {result.get('message', 'Unknown error')}")
 
        except Exception as e:
            logger.error(f"Error during ingestion: {e}")
            sys.exit(1)
        finally:
-            # Wait for global embedding worker to finish queued tasks
+            # Wait for embedding worker to finish
            from rem.services.embeddings.worker import get_global_embedding_worker
            try:
                worker = get_global_embedding_worker()
                if worker and worker.running and not worker.task_queue.empty():
-                    logger.info(f"Waiting for {worker.task_queue.qsize()} embedding tasks to complete...")
-                    # Worker.stop() waits for queue to drain (see worker.py line ~148)
+                    logger.info(f"Waiting for {worker.task_queue.qsize()} embedding tasks...")
                    await worker.stop()
            except RuntimeError:
-                # Worker doesn't exist yet - no tasks queued
                pass
 
            await db.disconnect()
 
+    async def _ingest_to_table(db, files, table_name, user_id, category, tag_list):
+        """Direct ingestion of files to a specific table (ontologies, etc.)."""
+        from rem.services.postgres.repository import Repository
+        from rem import get_model_registry
+        from rem.utils.model_helpers import get_table_name
+
+        # Get model class for table
+        registry = get_model_registry()
+        registry.register_core_models()
+        model_class = None
+        for model in registry.get_model_classes().values():
+            if get_table_name(model) == table_name:
+                model_class = model
+                break
+
+        if not model_class:
+            logger.error(f"Unknown table: {table_name}")
+            sys.exit(1)
+
+        repo = Repository(model_class, table_name, db=db)
+        processed = 0
+        failed = 0
+
+        for file_path in files:
+            try:
+                # Read file content
+                content = file_path.read_text(encoding="utf-8")
+                entity_key = file_path.stem  # filename without extension
+
+                # Build entity based on table
+                entity_data = {
+                    "name": entity_key,
+                    "content": content,
+                    "tags": tag_list or [],
+                }
+
+                # Add optional fields
+                if category:
+                    entity_data["category"] = category
+
+                # Scoping: user_id for private data, "public" for shared
+                # tenant_id="public" is the default for shared knowledge bases
+                entity_data["tenant_id"] = user_id or "public"
+                entity_data["user_id"] = user_id  # None = public/shared
+
+                # For ontologies, add URI
+                if table_name == "ontologies":
+                    entity_data["uri"] = f"file://{file_path.absolute()}"
+
+                entity = model_class(**entity_data)
+                await repo.upsert(entity, embeddable_fields=["content"], generate_embeddings=True)
+                processed += 1
+                logger.success(f"  ✓ {entity_key}")
+
+            except Exception as e:
+                failed += 1
+                logger.error(f"  ✗ {file_path.name}: {e}")
+
+        logger.info(f"Completed: {processed} succeeded, {failed} failed")
+
    asyncio.run(_ingest())
 
 def register_commands(group: click.Group):
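
For readers skimming the diff, the new public/private handling in `rem process ingest` boils down to one rule: --user-id and --make-private must be supplied together, otherwise ingestion stays public. A minimal sketch of that rule outside Click (the helper name resolve_scope is illustrative, not part of the package):

    def resolve_scope(user_id: str | None, make_private: bool) -> str | None:
        """Return the effective user_id for ingestion, or None for public data."""
        if user_id and not make_private:
            raise ValueError("--user-id requires --make-private")
        if make_private and not user_id:
            raise ValueError("--make-private requires --user-id")
        # Both flags present -> private; neither -> public (the default)
        return user_id if make_private else None

    assert resolve_scope(None, False) is None             # public, the default
    assert resolve_scope("user-123", True) == "user-123"  # private, both flags given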
rem/models/entities/ontology.py
@@ -1,63 +1,55 @@
-"""Ontology entity for tenant-specific knowledge extensions.
+"""Ontology entity for domain-specific knowledge.
 
-**What is Ontology Extraction?**
+**What are Ontologies?**
 
-Ontologies are **domain-specific structured knowledge** extracted from files using custom
-agent schemas. They extend REM's normal file processing pipeline with tenant-specific
-parsers that extract structured data the standard chunking pipeline would miss.
+Ontologies are **domain-specific structured knowledge** that can be:
+1. **Extracted** from files using custom agent schemas (agent-extracted)
+2. **Loaded directly** from external sources like git repos or S3 (direct-loaded)
 
-**Normal File Processing:**
-File → extract text → chunk → embed → resources (semantic search ready)
+**Use Case 1: Agent-Extracted Ontologies**
 
-**Ontology Processing (Tenant Knowledge Extensions):**
 File → custom agent → structured JSON → ontology (domain knowledge)
 
-**Why Ontologies?**
-- Standard chunking gives you semantic search over raw content
-- Ontologies give you **structured queryable fields** from domain logic
-- Example: A contract PDF becomes both searchable chunks AND a structured record with
-  parties, dates, payment terms, obligations as queryable fields
+Example: A contract PDF becomes a structured record with parties, dates, payment terms.
+
+**Use Case 2: Direct-Loaded Ontologies (Knowledge Bases)**
+
+External source (git/S3) → load → ontology (reference knowledge)
+
+Example: A psychiatric ontology of disorders, symptoms, and drugs loaded from markdown
+files in a git repository. Each markdown file becomes an ontology node with:
+- `uri`: git path (e.g., `git://org/repo/ontology/disorders/anxiety/panic-disorder.md`)
+- `content`: markdown content for embedding/search
+- `extracted_data`: parsed frontmatter or structure
 
 **Architecture:**
-- Runs as part of dreaming worker (background knowledge extraction)
-- OntologyConfig defines which files trigger which extractors (MIME type, URI pattern, tags)
+- Runs as part of dreaming worker (background knowledge extraction) OR
+- Loaded directly via `rem db load` for external knowledge bases
+- OntologyConfig defines which files trigger which extractors
 - Multiple ontologies per file (apply different domain lenses)
-- Tenant-scoped: Each tenant can define their own extractors
+- Tenant-scoped: Each tenant can define their own extractors and knowledge bases
 
 **Use Cases:**
 
-1. **Recruitment (CV Parsing)**
-   - Standard pipeline: Chunks for "find me candidates with Python experience"
-   - Ontology: Structured fields for filtering/sorting (years_experience, seniority_level, skills[])
-
-2. **Legal (Contract Analysis)**
-   - Standard pipeline: Semantic search over contract text
-   - Ontology: Queryable fields (parties, effective_date, payment_amount, key_obligations[])
+1. **Recruitment (CV Parsing)** - Agent-extracted
+   - Ontology: Structured fields for filtering/sorting (years_experience, skills[])
 
-3. **Medical (Health Records)**
-   - Standard pipeline: Find mentions of conditions
-   - Ontology: Structured diagnoses, medications, dosages, treatment plans
+2. **Legal (Contract Analysis)** - Agent-extracted
+   - Ontology: Queryable fields (parties, effective_date, payment_amount)
 
-4. **Finance (Report Analysis)**
-   - Standard pipeline: Search for financial terms
-   - Ontology: Extracted metrics, risk_flags, trends, forecasts
+3. **Medical Knowledge Base** - Direct-loaded
+   - Ontology: Disorders, symptoms, medications from curated markdown files
+   - Enables semantic search over psychiatric/medical domain knowledge
 
-**Example Flow:**
-1. Tenant creates OntologyConfig: "Run cv-parser-v1 on files with mime_type='application/pdf' and tags=['resume']"
-2. File uploaded with tags=["resume"]
-3. Normal processing: File → chunks → resources
-4. Dreaming worker detects matching OntologyConfig
-5. Loads cv-parser-v1 agent schema from database
-6. Runs agent on file content → extracts structured data
-7. Stores Ontology with extracted_data = {candidate_name, skills, experience, education, ...}
-8. Ontology is now queryable via LOOKUP, SEARCH, or direct SQL
+4. **Documentation/Procedures** - Direct-loaded
+   - Ontology: Clinical procedures (e.g., SCID-5 assessment steps)
+   - Reference material accessible via RAG
 
 **Design:**
-- Each ontology links to a File via file_id
-- Agent schema tracked via agent_schema_id (human-readable label, not UUID)
-- Structured data in `extracted_data` (arbitrary JSON, schema defined by agent)
-- Embeddings generated for semantic search (configurable fields via agent schema)
-- Multiple ontologies per file using different schemas
+- `file_id` and `agent_schema_id` are optional (only needed for agent-extracted)
+- `uri` field for external source references (git://, s3://, https://)
+- Structured data in `extracted_data` (arbitrary JSON)
+- Embeddings generated for semantic search via `content` field
 - Tenant-isolated: OntologyConfigs are tenant-scoped
 """
 
@@ -70,18 +62,19 @@ from ..core.core_model import CoreModel
 
 
 class Ontology(CoreModel):
-    """Domain-specific knowledge extracted from files using custom agents.
+    """Domain-specific knowledge - either agent-extracted or direct-loaded.
 
    Attributes:
        name: Human-readable label for this ontology instance
-        file_id: Foreign key to File entity that was processed
-        agent_schema_id: Foreign key to Schema entity that performed extraction
-        provider_name: LLM provider used for extraction (e.g., "anthropic", "openai")
-        model_name: Specific model used (e.g., "claude-sonnet-4-5")
-        extracted_data: Structured data extracted by agent (arbitrary JSON)
+        uri: External source reference (git://, s3://, https://) for direct-loaded ontologies
+        file_id: Foreign key to File entity (optional - only for agent-extracted)
+        agent_schema_id: Schema that performed extraction (optional - only for agent-extracted)
+        provider_name: LLM provider used for extraction (optional)
+        model_name: Specific model used (optional)
+        extracted_data: Structured data - either extracted by agent or parsed from source
        confidence_score: Optional confidence score from extraction (0.0-1.0)
        extraction_timestamp: When extraction was performed
-        embedding_text: Text used for generating embedding (derived from extracted_data)
+        content: Text used for generating embedding
 
    Inherited from CoreModel:
        id: UUID or string identifier
@@ -93,10 +86,9 @@ class Ontology(CoreModel):
        graph_edges: Relationships to other entities
        metadata: Flexible metadata storage
        tags: Classification tags
-        column: Database schema metadata
 
    Example Usage:
-        # CV extraction
+        # Agent-extracted: CV parsing
        cv_ontology = Ontology(
            name="john-doe-cv-2024",
            file_id="file-uuid-123",
@@ -105,73 +97,72 @@ class Ontology(CoreModel):
            model_name="claude-sonnet-4-5-20250929",
            extracted_data={
                "candidate_name": "John Doe",
-                "email": "john@example.com",
                "skills": ["Python", "PostgreSQL", "Kubernetes"],
-                "experience": [
-                    {
-                        "company": "TechCorp",
-                        "role": "Senior Engineer",
-                        "years": 3,
-                        "achievements": ["Led migration to k8s", "Reduced costs 40%"]
-                    }
-                ],
-                "education": [
-                    {"degree": "BS Computer Science", "institution": "MIT", "year": 2018}
-                ]
            },
            confidence_score=0.95,
-            tags=["cv", "engineering", "senior-level"]
+            tags=["cv", "engineering"]
        )
 
-        # Contract extraction
-        contract_ontology = Ontology(
-            name="acme-supplier-agreement-2024",
-            file_id="file-uuid-456",
-            agent_schema_id="contract-parser-v2",
-            provider_name="openai",
-            model_name="gpt-4.1",
+        # Direct-loaded: Knowledge base from git
+        api_docs = Ontology(
+            name="rest-api-guide",
+            uri="git://example-org/docs/api/rest-api-guide.md",
+            content="# REST API Guide\\n\\nThis guide covers RESTful API design...",
            extracted_data={
-                "contract_type": "supplier_agreement",
-                "parties": [
-                    {"name": "ACME Corp", "role": "buyer"},
-                    {"name": "SupplyChain Inc", "role": "supplier"}
-                ],
-                "effective_date": "2024-01-01",
-                "termination_date": "2026-12-31",
-                "payment_terms": {
-                    "amount": 500000,
-                    "currency": "USD",
-                    "frequency": "quarterly"
-                },
-                "key_obligations": [
-                    "Supplier must deliver within 30 days",
-                    "Buyer must pay within 60 days of invoice"
-                ]
+                "type": "documentation",
+                "category": "api",
+                "version": "2.0",
+            },
+            tags=["api", "rest", "documentation"]
+        )
+
+        # Direct-loaded: Technical spec from git
+        config_spec = Ontology(
+            name="config-schema",
+            uri="git://example-org/docs/specs/config-schema.md",
+            content="# Configuration Schema\\n\\nThis document defines...",
+            extracted_data={
+                "type": "specification",
+                "format": "yaml",
+                "version": "1.0",
            },
-            confidence_score=0.92,
-            tags=["contract", "supplier", "procurement"]
+            tags=["config", "schema", "specification"]
        )
    """
 
    # Core fields
    name: str
-    file_id: UUID | str
-    agent_schema_id: str  # Natural language label of Schema entity
+    uri: Optional[str] = None  # External source: git://, s3://, https://
 
-    # Extraction metadata
-    provider_name: str  # LLM provider (anthropic, openai, etc.)
-    model_name: str  # Specific model used
-    extracted_data: dict[str, Any]  # Arbitrary structured data from agent
+    # Agent extraction fields (optional - only for agent-extracted ontologies)
+    file_id: Optional[UUID | str] = None  # FK to File entity
+    agent_schema_id: Optional[str] = None  # Schema that performed extraction
+    provider_name: Optional[str] = None  # LLM provider (anthropic, openai, etc.)
+    model_name: Optional[str] = None  # Specific model used
+
+    # Data fields
+    extracted_data: Optional[dict[str, Any]] = None  # Structured data
    confidence_score: Optional[float] = None  # 0.0-1.0 if provided by agent
    extraction_timestamp: Optional[str] = None  # ISO8601 timestamp
 
-    # Semantic search support
-    embedding_text: Optional[str] = None  # Text for embedding generation
+    # Semantic search support - 'content' is a default embeddable field name
+    content: Optional[str] = None  # Text for embedding generation
 
    model_config = ConfigDict(
        json_schema_extra={
-            "description": "Domain-specific knowledge extracted from files using custom agents",
+            "description": "Domain-specific knowledge - agent-extracted or direct-loaded from external sources",
            "examples": [
+                {
+                    "name": "panic-disorder",
+                    "uri": "git://org/repo/ontology/disorders/anxiety/panic-disorder.md",
+                    "content": "# Panic Disorder\n\nPanic disorder is characterized by...",
+                    "extracted_data": {
+                        "type": "disorder",
+                        "category": "anxiety",
+                        "icd10": "F41.0"
+                    },
+                    "tags": ["disorder", "anxiety"]
+                },
                {
                    "name": "john-doe-cv-2024",
                    "file_id": "550e8400-e29b-41d4-a716-446655440000",
@@ -180,8 +171,7 @@ class Ontology(CoreModel):
                    "model_name": "claude-sonnet-4-5-20250929",
                    "extracted_data": {
                        "candidate_name": "John Doe",
-                        "skills": ["Python", "PostgreSQL"],
-                        "experience": []
+                        "skills": ["Python", "PostgreSQL"]
                    },
                    "confidence_score": 0.95,
                    "tags": ["cv", "engineering"]
rem/schemas/agents/rem.yaml
@@ -124,7 +124,7 @@ json_schema_extra:
 
 # Explicit resource declarations for reference data
 resources:
-  - uri: rem://schemas
+  - uri: rem://agents
    name: Agent Schemas List
    description: List all available agent schemas in the system
  - uri: rem://status
rem/services/content/service.py
@@ -274,7 +274,7 @@ class ContentService:
    async def ingest_file(
        self,
        file_uri: str,
-        user_id: str,
+        user_id: str | None = None,
        category: str | None = None,
        tags: list[str] | None = None,
        is_local_server: bool = False,
@@ -283,6 +283,10 @@ class ContentService:
        """
        Complete file ingestion pipeline: read → store → parse → chunk → embed.
 
+        **IMPORTANT: Data is PUBLIC by default (user_id=None).**
+        This is correct for shared knowledge bases (ontologies, procedures, reference data).
+        Private user-scoped data is rarely needed - only set user_id for truly personal content.
+
        **CENTRALIZED INGESTION**: This is the single entry point for all file ingestion
        in REM. It handles:
 
@@ -319,7 +323,9 @@ class ContentService:
 
        Args:
            file_uri: Source file location (local path, s3://, or https://)
-            user_id: User identifier for data isolation and ownership
+            user_id: User identifier for PRIVATE data only. Default None = PUBLIC/shared.
+                Leave as None for shared knowledge bases, ontologies, reference data.
+                Only set for truly private user-specific content.
            category: Optional category tag (document, code, audio, etc.)
            tags: Optional list of tags
            is_local_server: True if running as local/stdio MCP server
@@ -347,12 +353,19 @@ class ContentService:
 
        Example:
            >>> service = ContentService()
+            >>> # PUBLIC data (default) - visible to all users
            >>> result = await service.ingest_file(
-            ...     file_uri="s3://bucket/contract.pdf",
-            ...     user_id="user-123",
-            ...     category="legal"
+            ...     file_uri="s3://bucket/procedure.pdf",
+            ...     category="medical"
            ... )
            >>> print(f"Created {result['resources_created']} searchable chunks")
+            >>>
+            >>> # PRIVATE data (rare) - only for user-specific content
+            >>> result = await service.ingest_file(
+            ...     file_uri="s3://bucket/personal-notes.pdf",
+            ...     user_id="user-123",  # Only this user can access
+            ...     category="personal"
+            ... )
        """
        from pathlib import Path
        from uuid import uuid4
rem/services/email/service.py
@@ -376,8 +376,17 @@ class EmailService:
            await user_repo.upsert(existing_user)
            return {"allowed": True, "error": None}
        else:
-            # New user - check if domain is trusted
-            if settings and hasattr(settings, 'email') and settings.email.trusted_domain_list:
+            # New user - first check if they're a subscriber (by email lookup)
+            from ...models.entities import Subscriber
+            subscriber_repo = Repository(Subscriber, db=db)
+            existing_subscriber = await subscriber_repo.find_one({"email": email})
+
+            if existing_subscriber:
+                # Subscriber exists - allow them to create account
+                # (approved field may not exist in older schemas, so just check existence)
+                logger.info(f"Subscriber {email} creating user account")
+            elif settings and hasattr(settings, 'email') and settings.email.trusted_domain_list:
+                # Not an approved subscriber - check if domain is trusted
                if not settings.email.is_domain_trusted(email):
                    email_domain = email.split("@")[-1]
                    logger.warning(f"Untrusted domain attempted signup: {email_domain}")