remdb-0.3.0-py3-none-any.whl
- rem/__init__.py +2 -0
- rem/agentic/README.md +650 -0
- rem/agentic/__init__.py +39 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +8 -0
- rem/agentic/context.py +148 -0
- rem/agentic/context_builder.py +329 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +107 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +151 -0
- rem/agentic/providers/phoenix.py +674 -0
- rem/agentic/providers/pydantic_ai.py +572 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +396 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +231 -0
- rem/api/README.md +420 -0
- rem/api/main.py +324 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +536 -0
- rem/api/mcp_router/server.py +213 -0
- rem/api/mcp_router/tools.py +584 -0
- rem/api/routers/auth.py +229 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/completions.py +281 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +124 -0
- rem/api/routers/chat/streaming.py +185 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +26 -0
- rem/auth/middleware.py +100 -0
- rem/auth/providers/__init__.py +13 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +455 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +126 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +566 -0
- rem/cli/commands/configure.py +497 -0
- rem/cli/commands/db.py +493 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1302 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +245 -0
- rem/cli/commands/schema.py +183 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +96 -0
- rem/config.py +237 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +64 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +628 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +243 -0
- rem/models/entities/__init__.py +43 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +35 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +191 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/user.py +85 -0
- rem/py.typed +0 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +128 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +16 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +806 -0
- rem/services/content/service.py +676 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +336 -0
- rem/services/dreaming/moment_service.py +264 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +120 -0
- rem/services/embeddings/worker.py +421 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +686 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +575 -0
- rem/services/postgres/__init__.py +23 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
- rem/services/postgres/register_type.py +352 -0
- rem/services/postgres/repository.py +337 -0
- rem/services/postgres/schema_generator.py +379 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +354 -0
- rem/services/rem/README.md +304 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +145 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +527 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +6 -0
- rem/services/session/compression.py +360 -0
- rem/services/session/reload.py +77 -0
- rem/settings.py +1235 -0
- rem/sql/002_install_models.sql +1068 -0
- rem/sql/background_indexes.sql +42 -0
- rem/sql/install_models.sql +1038 -0
- rem/sql/migrations/001_install.sql +503 -0
- rem/sql/migrations/002_install_models.sql +1202 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +583 -0
- rem/utils/__init__.py +43 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +423 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/markdown.py +16 -0
- rem/utils/model_helpers.py +236 -0
- rem/utils/schema_loader.py +336 -0
- rem/utils/sql_types.py +348 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +330 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +5 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- remdb-0.3.0.dist-info/METADATA +1455 -0
- remdb-0.3.0.dist-info/RECORD +187 -0
- remdb-0.3.0.dist-info/WHEEL +4 -0
- remdb-0.3.0.dist-info/entry_points.txt +2 -0
rem/models/entities/moment.py
@@ -0,0 +1,123 @@
"""
Moment - Temporal narrative in REM.

Moments are extracted from Resources through first-order dreaming workflows.
They represent temporal narratives like meetings, coding sessions, conversations,
or any classified time period when users were focused on specific activities.

Moments provide temporal structure to the REM graph:
- Temporal boundaries (starts_timestamp, ends_timestamp)
- Present persons (who was involved)
- Emotion tags (team sentiment)
- Topic tags (what was discussed)
- Natural language summaries

Moments enable temporal queries:
- "What happened between milestone A and B?"
- "When did Sarah and Mike meet?"
- "What was discussed in Q4 retrospective?"

Data Model:
- Inherits from CoreModel (id, tenant_id, timestamps, graph_edges, etc.)
- name: Human-readable moment name
- moment_type: Classification (meeting, coding-session, conversation, etc.)
- starts_timestamp: Start time
- ends_timestamp: End time
- present_persons: List of Person objects with id, name, role
- emotion_tags: Sentiment tags (happy, frustrated, focused)
- topic_tags: Topic/concept tags (project names, technologies)
- summary: Natural language description
- source_resource_ids: Resources used to construct this moment
"""

from datetime import datetime
from typing import Optional

from pydantic import BaseModel, Field, model_validator

from ..core import CoreModel


class Person(BaseModel):
    """Person reference in a moment."""

    id: str = Field(..., description="Person entity label")
    name: str = Field(..., description="Person name")
    role: Optional[str] = Field(default=None, description="Person role in moment")


class Moment(CoreModel):
    """
    Temporal narrative extracted from resources.

    Moments provide temporal structure and context for the REM graph,
    enabling time-based queries and understanding of when events occurred.
    Tenant isolation is provided via CoreModel.tenant_id field.
    """

    name: Optional[str] = Field(
        default=None,
        description="Human-readable moment name (used as graph label). Auto-generated from starts_timestamp+moment_type if not provided.",
        json_schema_extra={"entity_key": True},  # Primary business key for KV lookups
    )
    moment_type: Optional[str] = Field(
        default=None,
        description="Moment classification (meeting, coding-session, conversation, etc.)",
    )
    category: Optional[str] = Field(
        default=None,
        description="Moment category for grouping and filtering",
    )
    starts_timestamp: datetime = Field(
        ...,
        description="Moment start time",
    )
    ends_timestamp: Optional[datetime] = Field(
        default=None,
        description="Moment end time",
    )
    present_persons: list[Person] = Field(
        default_factory=list,
        description="People present in the moment",
    )

    emotion_tags: list[str] = Field(
        default_factory=list,
        description="Emotion/sentiment tags (happy, frustrated, focused, etc.)",
    )
    topic_tags: list[str] = Field(
        default_factory=list,
        description="Topic/concept tags (project names, technologies, etc.)",
    )
    summary: Optional[str] = Field(
        default=None,
        description="Natural language summary of the moment",
    )
    source_resource_ids: list[str] = Field(
        default_factory=list,
        description="Resource IDs used to construct this moment",
    )

    @model_validator(mode='after')
    def generate_name_if_missing(self) -> 'Moment':
        """Auto-generate name from starts_timestamp+moment_type if not provided."""
        if not self.name:
            # Format: "Meeting on 2024-12-20" or "Coding Session on 2024-12-20 14:30"
            if self.starts_timestamp:
                date_str = self.starts_timestamp.strftime("%Y-%m-%d")
                time_str = self.starts_timestamp.strftime("%H:%M")

                if self.moment_type:
                    moment_label = self.moment_type.replace('-', ' ').replace('_', ' ').title()
                    self.name = f"{moment_label} on {date_str}"
                else:
                    self.name = f"Moment on {date_str} {time_str}"
            else:
                # Fallback: use ID or generic name
                if self.id:
                    self.name = f"moment-{str(self.id)[:8]}"
                else:
                    self.name = "unnamed-moment"

        return self
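For reference, a minimal usage sketch (not part of the wheel) of how the generate_name_if_missing validator derives name when only starts_timestamp and moment_type are supplied. It assumes the CoreModel fields inherited here (id, tenant_id, timestamps, etc.) all have defaults; the import path follows the file layout listed above.

from datetime import datetime
from rem.models.entities.moment import Moment, Person

# name is omitted, so the model_validator derives it from moment_type + starts_timestamp
session = Moment(
    starts_timestamp=datetime(2024, 12, 20, 14, 30),
    moment_type="coding-session",
    present_persons=[Person(id="person-sarah", name="Sarah", role="engineer")],
    topic_tags=["rem", "postgres"],
)
print(session.name)  # "Coding Session on 2024-12-20"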
rem/models/entities/ontology.py
@@ -0,0 +1,191 @@
"""Ontology entity for tenant-specific knowledge extensions.

**What is Ontology Extraction?**

Ontologies are **domain-specific structured knowledge** extracted from files using custom
agent schemas. They extend REM's normal file processing pipeline with tenant-specific
parsers that extract structured data the standard chunking pipeline would miss.

**Normal File Processing:**
File → extract text → chunk → embed → resources (semantic search ready)

**Ontology Processing (Tenant Knowledge Extensions):**
File → custom agent → structured JSON → ontology (domain knowledge)

**Why Ontologies?**
- Standard chunking gives you semantic search over raw content
- Ontologies give you **structured queryable fields** from domain logic
- Example: A contract PDF becomes both searchable chunks AND a structured record with
  parties, dates, payment terms, obligations as queryable fields

**Architecture:**
- Runs as part of dreaming worker (background knowledge extraction)
- OntologyConfig defines which files trigger which extractors (MIME type, URI pattern, tags)
- Multiple ontologies per file (apply different domain lenses)
- Tenant-scoped: Each tenant can define their own extractors

**Use Cases:**

1. **Recruitment (CV Parsing)**
   - Standard pipeline: Chunks for "find me candidates with Python experience"
   - Ontology: Structured fields for filtering/sorting (years_experience, seniority_level, skills[])

2. **Legal (Contract Analysis)**
   - Standard pipeline: Semantic search over contract text
   - Ontology: Queryable fields (parties, effective_date, payment_amount, key_obligations[])

3. **Medical (Health Records)**
   - Standard pipeline: Find mentions of conditions
   - Ontology: Structured diagnoses, medications, dosages, treatment plans

4. **Finance (Report Analysis)**
   - Standard pipeline: Search for financial terms
   - Ontology: Extracted metrics, risk_flags, trends, forecasts

**Example Flow:**
1. Tenant creates OntologyConfig: "Run cv-parser-v1 on files with mime_type='application/pdf' and tags=['resume']"
2. File uploaded with tags=["resume"]
3. Normal processing: File → chunks → resources
4. Dreaming worker detects matching OntologyConfig
5. Loads cv-parser-v1 agent schema from database
6. Runs agent on file content → extracts structured data
7. Stores Ontology with extracted_data = {candidate_name, skills, experience, education, ...}
8. Ontology is now queryable via LOOKUP, SEARCH, or direct SQL

**Design:**
- Each ontology links to a File via file_id
- Agent schema tracked via agent_schema_id (human-readable label, not UUID)
- Structured data in `extracted_data` (arbitrary JSON, schema defined by agent)
- Embeddings generated for semantic search (configurable fields via agent schema)
- Multiple ontologies per file using different schemas
- Tenant-isolated: OntologyConfigs are tenant-scoped
"""

from typing import Any, Optional
from uuid import UUID

from pydantic import ConfigDict

from ..core.core_model import CoreModel


class Ontology(CoreModel):
    """Domain-specific knowledge extracted from files using custom agents.

    Attributes:
        name: Human-readable label for this ontology instance
        file_id: Foreign key to File entity that was processed
        agent_schema_id: Foreign key to Schema entity that performed extraction
        provider_name: LLM provider used for extraction (e.g., "anthropic", "openai")
        model_name: Specific model used (e.g., "claude-sonnet-4-5")
        extracted_data: Structured data extracted by agent (arbitrary JSON)
        confidence_score: Optional confidence score from extraction (0.0-1.0)
        extraction_timestamp: When extraction was performed
        embedding_text: Text used for generating embedding (derived from extracted_data)

    Inherited from CoreModel:
        id: UUID or string identifier
        created_at: Entity creation timestamp
        updated_at: Last update timestamp
        deleted_at: Soft deletion timestamp
        tenant_id: Multi-tenancy isolation
        user_id: Ownership
        graph_edges: Relationships to other entities
        metadata: Flexible metadata storage
        tags: Classification tags
        column: Database schema metadata

    Example Usage:
        # CV extraction
        cv_ontology = Ontology(
            name="john-doe-cv-2024",
            file_id="file-uuid-123",
            agent_schema_id="cv-parser-v1",
            provider_name="anthropic",
            model_name="claude-sonnet-4-5-20250929",
            extracted_data={
                "candidate_name": "John Doe",
                "email": "john@example.com",
                "skills": ["Python", "PostgreSQL", "Kubernetes"],
                "experience": [
                    {
                        "company": "TechCorp",
                        "role": "Senior Engineer",
                        "years": 3,
                        "achievements": ["Led migration to k8s", "Reduced costs 40%"]
                    }
                ],
                "education": [
                    {"degree": "BS Computer Science", "institution": "MIT", "year": 2018}
                ]
            },
            confidence_score=0.95,
            tags=["cv", "engineering", "senior-level"]
        )

        # Contract extraction
        contract_ontology = Ontology(
            name="acme-supplier-agreement-2024",
            file_id="file-uuid-456",
            agent_schema_id="contract-parser-v2",
            provider_name="openai",
            model_name="gpt-4o",
            extracted_data={
                "contract_type": "supplier_agreement",
                "parties": [
                    {"name": "ACME Corp", "role": "buyer"},
                    {"name": "SupplyChain Inc", "role": "supplier"}
                ],
                "effective_date": "2024-01-01",
                "termination_date": "2026-12-31",
                "payment_terms": {
                    "amount": 500000,
                    "currency": "USD",
                    "frequency": "quarterly"
                },
                "key_obligations": [
                    "Supplier must deliver within 30 days",
                    "Buyer must pay within 60 days of invoice"
                ]
            },
            confidence_score=0.92,
            tags=["contract", "supplier", "procurement"]
        )
    """

    # Core fields
    name: str
    file_id: UUID | str
    agent_schema_id: str  # Natural language label of Schema entity

    # Extraction metadata
    provider_name: str  # LLM provider (anthropic, openai, etc.)
    model_name: str  # Specific model used
    extracted_data: dict[str, Any]  # Arbitrary structured data from agent
    confidence_score: Optional[float] = None  # 0.0-1.0 if provided by agent
    extraction_timestamp: Optional[str] = None  # ISO8601 timestamp

    # Semantic search support
    embedding_text: Optional[str] = None  # Text for embedding generation

    model_config = ConfigDict(
        json_schema_extra={
            "description": "Domain-specific knowledge extracted from files using custom agents",
            "examples": [
                {
                    "name": "john-doe-cv-2024",
                    "file_id": "550e8400-e29b-41d4-a716-446655440000",
                    "agent_schema_id": "cv-parser-v1",
                    "provider_name": "anthropic",
                    "model_name": "claude-sonnet-4-5-20250929",
                    "extracted_data": {
                        "candidate_name": "John Doe",
                        "skills": ["Python", "PostgreSQL"],
                        "experience": []
                    },
                    "confidence_score": 0.95,
                    "tags": ["cv", "engineering"]
                }
            ]
        }
    )
rem/models/entities/ontology_config.py
@@ -0,0 +1,131 @@
"""OntologyConfig entity for user-defined ontology extraction rules.

OntologyConfig allows users to define which agent schemas should be applied to
which files during the dreaming/processing workflow. This enables domain-specific
knowledge extraction tailored to user needs.

Examples:
- "Apply cv-parser-v1 to all PDF files in /resumes/"
- "Apply contract-analyzer-v2 to files tagged with 'legal'"
- "Apply medical-records-extractor to files with mime_type application/pdf AND tags ['medical']"

Design:
- Each config is tenant-scoped for isolation
- File matching via mime_type patterns, tag filters, and URI patterns
- Multiple configs can match a single file (all will be applied)
- Priority field for execution order when multiple configs match
- Enabled/disabled toggle for temporary deactivation
"""

from typing import Optional

from pydantic import ConfigDict

from ..core.core_model import CoreModel


class OntologyConfig(CoreModel):
    """User configuration for automatic ontology extraction.

    Attributes:
        name: Human-readable config name
        agent_schema_id: Foreign key to Schema entity to use for extraction
        description: Purpose and scope of this config

        # File matching rules (ANY matching rule triggers extraction)
        mime_type_pattern: Regex pattern for file MIME types (e.g., "application/pdf")
        uri_pattern: Regex pattern for file URIs (e.g., "s3://bucket/resumes/.*")
        tag_filter: List of tags (file must have ALL tags to match)

        # Execution control
        priority: Execution order (higher = earlier, default 100)
        enabled: Whether this config is active (default True)

        # LLM provider configuration
        provider_name: Optional LLM provider override (defaults to settings)
        model_name: Optional model override (defaults to settings)

    Inherited from CoreModel:
        id, created_at, updated_at, deleted_at, tenant_id, user_id,
        graph_edges, metadata, tags, column

    Example Usage:
        # CV extraction for recruitment
        cv_config = OntologyConfig(
            name="recruitment-cv-parser",
            agent_schema_id="cv-parser-v1",
            description="Extract candidate information from resumes",
            mime_type_pattern="application/pdf",
            uri_pattern=".*/resumes/.*",
            tag_filter=["cv", "candidate"],
            priority=100,
            enabled=True,
            tenant_id="acme-corp",
            tags=["recruitment", "hr"]
        )

        # Contract analysis for legal team
        contract_config = OntologyConfig(
            name="legal-contract-analyzer",
            agent_schema_id="contract-parser-v2",
            description="Extract key terms from supplier contracts",
            mime_type_pattern="application/(pdf|msword|vnd.openxmlformats.*)",
            tag_filter=["legal", "contract"],
            priority=200,  # Higher priority = runs first
            enabled=True,
            provider_name="openai",  # Override default provider
            model_name="gpt-4o",
            tenant_id="acme-corp",
            tags=["legal", "procurement"]
        )

        # Medical records for healthcare
        medical_config = OntologyConfig(
            name="medical-records-extractor",
            agent_schema_id="medical-parser-v1",
            description="Extract diagnoses and treatments from medical records",
            mime_type_pattern="application/pdf",
            tag_filter=["medical", "patient-record"],
            priority=50,
            enabled=True,
            tenant_id="healthsystem",
            tags=["medical", "hipaa-compliant"]
        )
    """

    # Core fields
    name: str
    agent_schema_id: str  # Foreign key to Schema entity
    description: Optional[str] = None

    # File matching rules (ANY rule can trigger match)
    mime_type_pattern: Optional[str] = None  # Regex for MIME type
    uri_pattern: Optional[str] = None  # Regex for file URI
    tag_filter: list[str] = []  # File must have ALL tags

    # Execution control
    priority: int = 100  # Higher = runs first
    enabled: bool = True  # Toggle to disable without deleting

    # Optional provider overrides
    provider_name: Optional[str] = None  # Override default provider
    model_name: Optional[str] = None  # Override default model

    model_config = ConfigDict(
        json_schema_extra={
            "description": "Configuration for automatic ontology extraction from files",
            "examples": [
                {
                    "name": "recruitment-cv-parser",
                    "agent_schema_id": "cv-parser-v1",
                    "description": "Extract candidate information from resumes",
                    "mime_type_pattern": "application/pdf",
                    "uri_pattern": ".*/resumes/.*",
                    "tag_filter": ["cv", "candidate"],
                    "priority": 100,
                    "enabled": True,
                    "tenant_id": "acme-corp"
                }
            ]
        }
    )
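The matching logic itself lives in the dreaming worker, not in this entity. As a rough sketch of one plausible reading of the docstring ("ANY matching rule triggers extraction", tag_filter requires ALL listed tags), the helper below is hypothetical and only illustrates how the fields above could be evaluated against a file's MIME type, URI, and tags; the actual worker may differ.

import re

from rem.models.entities.ontology_config import OntologyConfig


def config_matches_file(config: OntologyConfig, mime_type: str, uri: str, tags: list[str]) -> bool:
    # Illustrative only: any configured rule that matches triggers extraction;
    # tag_filter matches only if the file carries ALL listed tags.
    if config.mime_type_pattern and re.search(config.mime_type_pattern, mime_type):
        return True
    if config.uri_pattern and re.search(config.uri_pattern, uri):
        return True
    if config.tag_filter and all(t in tags for t in config.tag_filter):
        return True
    return False


cv_config = OntologyConfig(
    name="recruitment-cv-parser",
    agent_schema_id="cv-parser-v1",
    mime_type_pattern="application/pdf",
    tag_filter=["cv", "candidate"],
)
print(config_matches_file(cv_config, "application/pdf", "s3://bucket/resumes/jane.pdf", ["cv"]))
# True: the MIME rule matched even though the tag filter did not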
rem/models/entities/resource.py
@@ -0,0 +1,95 @@
"""
Resource - Base content unit in REM.

Resources represent documents, conversations, artifacts, and any other
content units that form the foundation of the REM memory system.

Resources are the primary input to dreaming workflows:
- First-order dreaming extracts Moments from Resources
- Second-order dreaming creates affinity edges between Resources
- Entity extraction populates related_entities field
- Graph edges stored in graph_edges (inherited from CoreModel)

Key Fields:
- name: Human-readable resource identifier (used in graph labels)
- uri: Content location or identifier
- content: Actual content text
- timestamp: Content creation/publication time
- category: Resource classification (document, conversation, artifact, etc.)
- related_entities: Extracted entities (people, projects, concepts)
"""

from datetime import datetime
from typing import Optional

from pydantic import Field, model_validator

from ..core import CoreModel


class Resource(CoreModel):
    """
    Base content unit in REM.

    Resources are content units that feed into dreaming workflows for moment
    extraction and affinity graph construction. Tenant isolation is provided
    via CoreModel.tenant_id field.
    """

    name: Optional[str] = Field(
        default=None,
        description="Human-readable resource name (used as graph label). Auto-generated from uri+ordinal if not provided.",
        json_schema_extra={"entity_key": True},  # Primary business key for KV lookups
    )
    uri: Optional[str] = Field(
        default=None,
        description="Content URI or identifier (file path, URL, etc.)",
    )
    ordinal: int = Field(
        default=0,
        description="Chunk ordinal for splitting large documents (0 for single-chunk resources)",
        json_schema_extra={"composite_key": True},  # Part of composite unique constraint
    )
    content: str = Field(
        default="",
        description="Resource content text",
    )
    timestamp: datetime = Field(
        default_factory=datetime.utcnow,
        description="Resource timestamp (content creation/publication time)",
    )
    category: Optional[str] = Field(
        default=None,
        description="Resource category (document, conversation, artifact, etc.)",
    )
    related_entities: list[dict] = Field(
        default_factory=list,
        description="Extracted entities (people, projects, concepts) with metadata",
    )

    @model_validator(mode='after')
    def generate_name_if_missing(self) -> 'Resource':
        """Auto-generate name from uri+ordinal if not provided."""
        if not self.name:
            if self.uri:
                # Extract filename from URI if possible
                uri_parts = self.uri.rstrip('/').split('/')
                filename = uri_parts[-1]

                # Remove file extension for cleaner names
                if '.' in filename:
                    filename = filename.rsplit('.', 1)[0]

                # Generate name with ordinal
                if self.ordinal > 0:
                    self.name = f"{filename}-chunk-{self.ordinal}"
                else:
                    self.name = filename
            else:
                # Fallback: use ID or generic name
                if self.id:
                    self.name = f"resource-{str(self.id)[:8]}"
                else:
                    self.name = "unnamed-resource"

        return self
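A short usage sketch (again assuming the inherited CoreModel fields have defaults) of the name auto-generation above: the filename is taken from the URI, the extension is stripped, and non-zero ordinals get a chunk suffix.

from rem.models.entities.resource import Resource

# Single-chunk resource: name falls back to the URI's filename without extension
doc = Resource(uri="s3://tenant-bucket/docs/q4-retro.md", content="...", ordinal=0)
print(doc.name)  # "q4-retro"

# Later chunks of the same document get a "-chunk-<ordinal>" suffix
chunk = Resource(uri="s3://tenant-bucket/docs/q4-retro.md", content="...", ordinal=2)
print(chunk.name)  # "q4-retro-chunk-2"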
rem/models/entities/schema.py
@@ -0,0 +1,87 @@
"""
Schema - Agent schema definitions in REM.

Schemas represent agent definitions that can be loaded into Pydantic AI.
They store JsonSchema specifications that define agent capabilities, tools,
and output structures.

Schemas are used for:
- Agent definition storage and versioning
- Dynamic agent loading via X-Agent-Schema header
- Agent registry and discovery
- Schema validation and documentation
- Ontology extraction configuration

Key Fields:
- name: Human-readable schema identifier
- content: Markdown documentation and instructions
- spec: JsonSchema specification (Pydantic model definition)
- category: Schema classification (agent-type, workflow, ontology-extractor, etc.)
- provider_configs: Optional LLM provider configurations (for multi-provider support)
- embedding_fields: Fields in extracted_data that should be embedded for semantic search
"""

from typing import Optional

from pydantic import Field

from ..core import CoreModel


class Schema(CoreModel):
    """
    Agent schema definition.

    Schemas define agents that can be dynamically loaded into Pydantic AI.
    They store JsonSchema specifications with embedded metadata for tools,
    resources, and system prompts.

    For ontology extraction agents:
    - `provider_configs` enables multi-provider support (test across Anthropic, OpenAI, etc.)
    - `embedding_fields` specifies which output fields should be embedded for semantic search

    Tenant isolation is provided via CoreModel.tenant_id field.
    """

    name: str = Field(
        ...,
        description="Human-readable schema name (used as identifier)",
    )

    content: str = Field(
        default="",
        description="Markdown documentation and instructions for the schema",
    )

    spec: dict = Field(
        ...,
        description="JsonSchema specification defining the agent structure and capabilities",
    )

    category: Optional[str] = Field(
        default=None,
        description=(
            "Schema category distinguishing schema types. "
            "Values: 'agent' (AI agents), 'evaluator' (LLM-as-a-Judge evaluators). "
            "Maps directly from json_schema_extra.kind field during ingestion."
        ),
    )

    # Ontology extraction support
    provider_configs: list[dict] = Field(
        default_factory=list,
        description=(
            "Optional provider configurations for multi-provider testing. "
            "Each dict has 'provider_name' and 'model_name'. "
            "Example: [{'provider_name': 'anthropic', 'model_name': 'claude-sonnet-4-5'}]"
        ),
    )

    embedding_fields: list[str] = Field(
        default_factory=list,
        description=(
            "JSON paths in extracted_data to embed for semantic search. "
            "Example: ['summary', 'candidate_name', 'skills'] for CV extraction. "
            "Values will be concatenated and embedded using configured embedding provider."
        ),
    )
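To make the Schema fields concrete, here is a hedged sketch of defining an extraction-agent schema and of how embedding_fields might drive the "values will be concatenated and embedded" behavior described above. The spec payload and the to_text helper are illustrative assumptions; the real concatenation and embedding happen in the embedding worker, not in this entity.

from rem.models.entities.schema import Schema

cv_parser = Schema(
    name="cv-parser-v1",
    category="agent",
    content="Extract structured candidate data from resume text.",
    spec={  # Illustrative JsonSchema for the agent's structured output
        "type": "object",
        "properties": {
            "candidate_name": {"type": "string"},
            "skills": {"type": "array", "items": {"type": "string"}},
        },
    },
    provider_configs=[{"provider_name": "anthropic", "model_name": "claude-sonnet-4-5"}],
    embedding_fields=["candidate_name", "skills"],
)


def to_text(value) -> str:
    # Illustrative only: join list values, stringify everything else
    return " ".join(map(str, value)) if isinstance(value, list) else str(value)


extracted = {"candidate_name": "John Doe", "skills": ["Python", "PostgreSQL"]}
embedding_text = " ".join(
    to_text(extracted[f]) for f in cv_parser.embedding_fields if f in extracted
)
print(embedding_text)  # "John Doe Python PostgreSQL"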