remdb-0.3.157-py3-none-any.whl → remdb-0.3.180-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rem/agentic/agents/agent_manager.py +2 -1
- rem/agentic/context.py +81 -3
- rem/agentic/context_builder.py +31 -6
- rem/agentic/mcp/tool_wrapper.py +43 -14
- rem/agentic/providers/pydantic_ai.py +76 -34
- rem/agentic/schema.py +4 -3
- rem/agentic/tools/rem_tools.py +11 -0
- rem/api/deps.py +1 -3
- rem/api/main.py +21 -2
- rem/api/mcp_router/resources.py +75 -14
- rem/api/mcp_router/server.py +27 -24
- rem/api/mcp_router/tools.py +83 -2
- rem/api/middleware/tracking.py +5 -5
- rem/api/routers/auth.py +152 -10
- rem/api/routers/chat/completions.py +5 -3
- rem/api/routers/chat/streaming.py +18 -0
- rem/api/routers/messages.py +24 -15
- rem/auth/jwt.py +352 -0
- rem/auth/middleware.py +70 -30
- rem/cli/commands/ask.py +1 -1
- rem/cli/commands/db.py +98 -44
- rem/models/entities/ontology.py +93 -101
- rem/schemas/agents/core/agent-builder.yaml +143 -42
- rem/services/email/service.py +72 -9
- rem/services/postgres/register_type.py +1 -1
- rem/services/postgres/repository.py +5 -4
- rem/services/user_service.py +41 -9
- rem/settings.py +15 -1
- rem/sql/background_indexes.sql +5 -0
- rem/sql/migrations/001_install.sql +33 -4
- rem/sql/migrations/002_install_models.sql +186 -168
- rem/utils/model_helpers.py +101 -0
- rem/utils/schema_loader.py +45 -7
- {remdb-0.3.157.dist-info → remdb-0.3.180.dist-info}/METADATA +1 -1
- {remdb-0.3.157.dist-info → remdb-0.3.180.dist-info}/RECORD +37 -36
- {remdb-0.3.157.dist-info → remdb-0.3.180.dist-info}/WHEEL +0 -0
- {remdb-0.3.157.dist-info → remdb-0.3.180.dist-info}/entry_points.txt +0 -0
rem/cli/commands/db.py
CHANGED
@@ -333,29 +333,46 @@ def rebuild_cache(connection: str | None):
 
 @click.command()
 @click.argument("file_path", type=click.Path(exists=True, path_type=Path))
+@click.option("--table", "-t", default=None, help="Target table name (required for non-YAML formats)")
 @click.option("--user-id", default=None, help="User ID to scope data privately (default: public/shared)")
 @click.option("--dry-run", is_flag=True, help="Show what would be loaded without loading")
-def load(file_path: Path, user_id: str | None, dry_run: bool):
+def load(file_path: Path, table: str | None, user_id: str | None, dry_run: bool):
     """
-    Load data from
+    Load data from file into database.
 
-
-
-        key_field: name
-        rows:
-          - name: Example
-            content: Test data...
+    Supports YAML with embedded metadata, or any tabular format via Polars
+    (jsonl, parquet, csv, json, arrow, etc.). For non-YAML formats, use --table.
 
     Examples:
-        rem db load
-        rem db load data.
-        rem db load data.yaml --dry-run
+        rem db load data.yaml                # YAML with metadata
+        rem db load data.jsonl -t resources  # Any Polars-supported format
     """
-    asyncio.run(_load_async(file_path, user_id, dry_run))
+    asyncio.run(_load_async(file_path, table, user_id, dry_run))
 
 
-async def _load_async(file_path: Path, user_id: str | None, dry_run: bool):
+def _load_dataframe_from_file(file_path: Path) -> "pl.DataFrame":
+    """Load any Polars-supported file format into a DataFrame."""
+    import polars as pl
+
+    suffix = file_path.suffix.lower()
+
+    if suffix in {".jsonl", ".ndjson"}:
+        return pl.read_ndjson(file_path)
+    elif suffix in {".parquet", ".pq"}:
+        return pl.read_parquet(file_path)
+    elif suffix == ".csv":
+        return pl.read_csv(file_path)
+    elif suffix == ".json":
+        return pl.read_json(file_path)
+    elif suffix in {".ipc", ".arrow"}:
+        return pl.read_ipc(file_path)
+    else:
+        raise ValueError(f"Unsupported file format: {suffix}. Use any Polars-supported format.")
+
+
+async def _load_async(file_path: Path, table: str | None, user_id: str | None, dry_run: bool):
     """Async implementation of load command."""
+    import polars as pl
     import yaml
     from ...models.core.inline_edge import InlineEdge
     from ...models.entities import Resource, Moment, User, Message, SharedSession, Schema

@@ -365,21 +382,10 @@ async def _load_async(file_path: Path, user_id: str | None, dry_run: bool):
     scope_msg = f"user: {user_id}" if user_id else "public"
     logger.info(f"Scope: {scope_msg}")
 
-
-
-        data = yaml.safe_load(f)
-
-    if not isinstance(data, list):
-        logger.error("YAML must be a list of table definitions")
-        raise click.Abort()
-
-    if dry_run:
-        logger.info("DRY RUN - Would load:")
-        logger.info(yaml.dump(data, default_flow_style=False))
-        return
+    suffix = file_path.suffix.lower()
+    is_yaml = suffix in {".yaml", ".yml"}
 
     # Map table names to model classes
-    # CoreModel subclasses use Repository.upsert()
     MODEL_MAP = {
         "users": User,
         "moments": Moment,

@@ -391,6 +397,58 @@ async def _load_async(file_path: Path, user_id: str | None, dry_run: bool):
     # Non-CoreModel tables that need direct SQL insertion
     DIRECT_INSERT_TABLES = {"shared_sessions"}
 
+    # Parse file based on format
+    if is_yaml:
+        # YAML with embedded metadata
+        with open(file_path) as f:
+            data = yaml.safe_load(f)
+
+        if not isinstance(data, list):
+            logger.error("YAML must be a list of table definitions")
+            raise click.Abort()
+
+        if dry_run:
+            logger.info("DRY RUN - Would load:")
+            logger.info(yaml.dump(data, default_flow_style=False))
+            return
+
+        table_defs = data
+    else:
+        # Polars-supported format - require --table
+        if not table:
+            logger.error(f"For {suffix} files, --table is required. Example: rem db load {file_path.name} -t resources")
+            raise click.Abort()
+
+        try:
+            df = _load_dataframe_from_file(file_path)
+        except Exception as e:
+            logger.error(f"Failed to load file: {e}")
+            raise click.Abort()
+
+        rows = df.to_dicts()
+
+        if dry_run:
+            logger.info(f"DRY RUN - Would load {len(rows)} rows to table '{table}':")
+            logger.info(f"Columns: {list(df.columns)}")
+
+            # Validate first row against model if table is known
+            if table in {"users", "moments", "resources", "messages", "schemas"} and rows:
+                from ...models.entities import Resource, Moment, User, Message, Schema
+                from ...utils.model_helpers import validate_data_for_model
+                model_map = {"users": User, "moments": Moment, "resources": Resource,
+                             "messages": Message, "schemas": Schema}
+                result = validate_data_for_model(model_map[table], rows[0])
+                if result.extra_fields:
+                    logger.warning(f"Unknown fields (ignored): {result.extra_fields}")
+                if result.valid:
+                    logger.success(f"Sample row validates OK. Required: {result.required_fields or '(none)'}")
+                else:
+                    result.log_errors("Sample row")
+            return
+
+        # Wrap as single table definition
+        table_defs = [{"table": table, "rows": rows}]
+
     # Connect to database
     pg = get_postgres_service()
     if not pg:

@@ -402,20 +460,17 @@ async def _load_async(file_path: Path, user_id: str | None, dry_run: bool):
     try:
         total_loaded = 0
 
-        for table_def in
+        for table_def in table_defs:
             table_name = table_def["table"]
-            key_field = table_def.get("key_field", "id")
             rows = table_def.get("rows", [])
 
             # Handle direct insert tables (non-CoreModel)
             if table_name in DIRECT_INSERT_TABLES:
                 for row_data in rows:
-                    # Add tenant_id if not present
                    if "tenant_id" not in row_data:
                         row_data["tenant_id"] = "default"
 
                     if table_name == "shared_sessions":
-                        # Insert shared_session directly
                         await pg.fetch(
                             """INSERT INTO shared_sessions
                             (session_id, owner_user_id, shared_with_user_id, tenant_id)

@@ -434,12 +489,9 @@ async def _load_async(file_path: Path, user_id: str | None, dry_run: bool):
                 logger.warning(f"Unknown table: {table_name}, skipping")
                 continue
 
-            model_class = MODEL_MAP[table_name]
+            model_class = MODEL_MAP[table_name]
 
-            for row_data in rows:
-                # Add user_id and tenant_id only if explicitly provided
-                # Default is public (None) - data is shared/visible to all
-                # Pass --user-id to scope data privately to a specific user
+            for row_idx, row_data in enumerate(rows):
                 if "user_id" not in row_data and user_id is not None:
                     row_data["user_id"] = user_id
                 if "tenant_id" not in row_data and user_id is not None:

@@ -452,26 +504,28 @@ async def _load_async(file_path: Path, user_id: str | None, dry_run: bool):
                     for edge in row_data["graph_edges"]
                 ]
 
-                # Convert
-                # This handles fields like starts_timestamp, ends_timestamp, etc.
+                # Convert ISO timestamp strings
                 from ...utils.date_utils import parse_iso
                 for key, value in list(row_data.items()):
                     if isinstance(value, str) and (key.endswith("_timestamp") or key.endswith("_at")):
                         try:
                             row_data[key] = parse_iso(value)
                         except (ValueError, TypeError):
-                            pass
+                            pass
 
-                # Create model instance and upsert via repository
                 from ...services.postgres.repository import Repository
+                from ...utils.model_helpers import validate_data_for_model
+
+                result = validate_data_for_model(model_class, row_data)
+                if not result.valid:
+                    result.log_errors(f"Row {row_idx + 1} ({table_name})")
+                    raise click.Abort()
 
-
-                repo
-                await repo.upsert(instance)  # type: ignore[arg-type]
+                repo = Repository(model_class, table_name, pg)
+                await repo.upsert(result.instance)  # type: ignore[arg-type]
                 total_loaded += 1
 
-
-                name = getattr(instance, 'name', getattr(instance, 'id', '?'))
+                name = getattr(result.instance, 'name', getattr(result.instance, 'id', '?'))
                 logger.success(f"Loaded {table_name[:-1]}: {name}")
 
         logger.success(f"Data loaded successfully! Total rows: {total_loaded}")
rem/models/entities/ontology.py
CHANGED
@@ -1,63 +1,55 @@
-"""Ontology entity for
+"""Ontology entity for domain-specific knowledge.
 
-**What
+**What are Ontologies?**
 
-Ontologies are **domain-specific structured knowledge**
-
-
+Ontologies are **domain-specific structured knowledge** that can be:
+1. **Extracted** from files using custom agent schemas (agent-extracted)
+2. **Loaded directly** from external sources like git repos or S3 (direct-loaded)
 
-**
-File → extract text → chunk → embed → resources (semantic search ready)
+**Use Case 1: Agent-Extracted Ontologies**
 
-**Ontology Processing (Tenant Knowledge Extensions):**
 File → custom agent → structured JSON → ontology (domain knowledge)
 
-
-
-
-
-
+Example: A contract PDF becomes a structured record with parties, dates, payment terms.
+
+**Use Case 2: Direct-Loaded Ontologies (Knowledge Bases)**
+
+External source (git/S3) → load → ontology (reference knowledge)
+
+Example: A psychiatric ontology of disorders, symptoms, and drugs loaded from markdown
+files in a git repository. Each markdown file becomes an ontology node with:
+- `uri`: git path (e.g., `git://org/repo/ontology/disorders/anxiety/panic-disorder.md`)
+- `content`: markdown content for embedding/search
+- `extracted_data`: parsed frontmatter or structure
 
 **Architecture:**
-- Runs as part of dreaming worker (background knowledge extraction)
--
+- Runs as part of dreaming worker (background knowledge extraction) OR
+- Loaded directly via `rem db load` for external knowledge bases
+- OntologyConfig defines which files trigger which extractors
 - Multiple ontologies per file (apply different domain lenses)
-- Tenant-scoped: Each tenant can define their own extractors
+- Tenant-scoped: Each tenant can define their own extractors and knowledge bases
 
 **Use Cases:**
 
-1. **Recruitment (CV Parsing)**
-   -
-   - Ontology: Structured fields for filtering/sorting (years_experience, seniority_level, skills[])
-
-2. **Legal (Contract Analysis)**
-   - Standard pipeline: Semantic search over contract text
-   - Ontology: Queryable fields (parties, effective_date, payment_amount, key_obligations[])
+1. **Recruitment (CV Parsing)** - Agent-extracted
+   - Ontology: Structured fields for filtering/sorting (years_experience, skills[])
 
-
-   -
-   - Ontology: Structured diagnoses, medications, dosages, treatment plans
+2. **Legal (Contract Analysis)** - Agent-extracted
+   - Ontology: Queryable fields (parties, effective_date, payment_amount)
 
-
-   -
-   -
+3. **Medical Knowledge Base** - Direct-loaded
+   - Ontology: Disorders, symptoms, medications from curated markdown files
+   - Enables semantic search over psychiatric/medical domain knowledge
 
-**
-
-
-3. Normal processing: File → chunks → resources
-4. Dreaming worker detects matching OntologyConfig
-5. Loads cv-parser-v1 agent schema from database
-6. Runs agent on file content → extracts structured data
-7. Stores Ontology with extracted_data = {candidate_name, skills, experience, education, ...}
-8. Ontology is now queryable via LOOKUP, SEARCH, or direct SQL
+4. **Documentation/Procedures** - Direct-loaded
+   - Ontology: Clinical procedures (e.g., SCID-5 assessment steps)
+   - Reference material accessible via RAG
 
 **Design:**
--
--
-- Structured data in `extracted_data` (arbitrary JSON
-- Embeddings generated for semantic search
-- Multiple ontologies per file using different schemas
+- `file_id` and `agent_schema_id` are optional (only needed for agent-extracted)
+- `uri` field for external source references (git://, s3://, https://)
+- Structured data in `extracted_data` (arbitrary JSON)
+- Embeddings generated for semantic search via `content` field
 - Tenant-isolated: OntologyConfigs are tenant-scoped
 """
 

@@ -70,18 +62,19 @@ from ..core.core_model import CoreModel
 
 
 class Ontology(CoreModel):
-    """Domain-specific knowledge
+    """Domain-specific knowledge - either agent-extracted or direct-loaded.
 
     Attributes:
         name: Human-readable label for this ontology instance
-
-
-
-
-
+        uri: External source reference (git://, s3://, https://) for direct-loaded ontologies
+        file_id: Foreign key to File entity (optional - only for agent-extracted)
+        agent_schema_id: Schema that performed extraction (optional - only for agent-extracted)
+        provider_name: LLM provider used for extraction (optional)
+        model_name: Specific model used (optional)
+        extracted_data: Structured data - either extracted by agent or parsed from source
         confidence_score: Optional confidence score from extraction (0.0-1.0)
         extraction_timestamp: When extraction was performed
-
+        content: Text used for generating embedding
 
     Inherited from CoreModel:
         id: UUID or string identifier

@@ -93,10 +86,9 @@ class Ontology(CoreModel):
         graph_edges: Relationships to other entities
         metadata: Flexible metadata storage
         tags: Classification tags
-        column: Database schema metadata
 
     Example Usage:
-        # CV
+        # Agent-extracted: CV parsing
         cv_ontology = Ontology(
             name="john-doe-cv-2024",
             file_id="file-uuid-123",

@@ -105,73 +97,74 @@ class Ontology(CoreModel):
             model_name="claude-sonnet-4-5-20250929",
             extracted_data={
                 "candidate_name": "John Doe",
-                "email": "john@example.com",
                 "skills": ["Python", "PostgreSQL", "Kubernetes"],
-                "experience": [
-                    {
-                        "company": "TechCorp",
-                        "role": "Senior Engineer",
-                        "years": 3,
-                        "achievements": ["Led migration to k8s", "Reduced costs 40%"]
-                    }
-                ],
-                "education": [
-                    {"degree": "BS Computer Science", "institution": "MIT", "year": 2018}
-                ]
             },
             confidence_score=0.95,
-            tags=["cv", "engineering"
+            tags=["cv", "engineering"]
         )
 
-        #
-
-            name="
-
-
-            provider_name="openai",
-            model_name="gpt-4.1",
+        # Direct-loaded: Medical knowledge base from git
+        disorder_ontology = Ontology(
+            name="panic-disorder",
+            uri="git://bwolfson-siggie/Siggy-MVP/ontology/disorders/anxiety/panic-disorder.md",
+            content="# Panic Disorder\\n\\nPanic disorder is characterized by...",
             extracted_data={
-                "
-                "
-
-
-
-
-
-
-
-
-
-
-
-
-
+                "type": "disorder",
+                "category": "anxiety",
+                "icd10": "F41.0",
+                "dsm5_criteria": ["A", "B", "C", "D"],
+            },
+            tags=["disorder", "anxiety", "dsm5"]
+        )
+
+        # Direct-loaded: Clinical procedure from git
+        scid_node = Ontology(
+            name="scid-5-f1",
+            uri="git://bwolfson-siggie/Siggy-MVP/ontology/procedures/scid-5/module-f/scid-5-f1.md",
+            content="# scid-5-f1: Panic Attack Screening\\n\\n...",
+            extracted_data={
+                "type": "procedure",
+                "module": "F",
+                "section": "Panic Disorder",
+                "dsm5_criterion": "Panic Attack Specifier",
             },
-
-            tags=["contract", "supplier", "procurement"]
+            tags=["scid-5", "procedure", "anxiety"]
         )
         """
 
     # Core fields
     name: str
-
-    agent_schema_id: str  # Natural language label of Schema entity
+    uri: Optional[str] = None  # External source: git://, s3://, https://
 
-    #
-
-
-
+    # Agent extraction fields (optional - only for agent-extracted ontologies)
+    file_id: Optional[UUID | str] = None  # FK to File entity
+    agent_schema_id: Optional[str] = None  # Schema that performed extraction
+    provider_name: Optional[str] = None  # LLM provider (anthropic, openai, etc.)
+    model_name: Optional[str] = None  # Specific model used
+
+    # Data fields
+    extracted_data: Optional[dict[str, Any]] = None  # Structured data
     confidence_score: Optional[float] = None  # 0.0-1.0 if provided by agent
     extraction_timestamp: Optional[str] = None  # ISO8601 timestamp
 
-    # Semantic search support
-
+    # Semantic search support - 'content' is a default embeddable field name
+    content: Optional[str] = None  # Text for embedding generation
 
     model_config = ConfigDict(
         json_schema_extra={
-            "description": "Domain-specific knowledge extracted
+            "description": "Domain-specific knowledge - agent-extracted or direct-loaded from external sources",
             "examples": [
+                {
+                    "name": "panic-disorder",
+                    "uri": "git://org/repo/ontology/disorders/anxiety/panic-disorder.md",
+                    "content": "# Panic Disorder\n\nPanic disorder is characterized by...",
+                    "extracted_data": {
+                        "type": "disorder",
+                        "category": "anxiety",
+                        "icd10": "F41.0"
+                    },
+                    "tags": ["disorder", "anxiety"]
+                },
                 {
                     "name": "john-doe-cv-2024",
                     "file_id": "550e8400-e29b-41d4-a716-446655440000",

@@ -180,8 +173,7 @@ class Ontology(CoreModel):
                 "model_name": "claude-sonnet-4-5-20250929",
                 "extracted_data": {
                     "candidate_name": "John Doe",
-                    "skills": ["Python", "PostgreSQL"]
-                    "experience": []
+                    "skills": ["Python", "PostgreSQL"]
                 },
                 "confidence_score": 0.95,
                 "tags": ["cv", "engineering"]