PyPI - remdb - Versions diffs - 0.3.242__py3-none-any.whl - Mend

remdb 0.3.242__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of remdb might be problematic. Click here for more details.

Files changed (235)

rem/__init__.py +129 -0
rem/agentic/README.md +760 -0
rem/agentic/__init__.py +54 -0
rem/agentic/agents/README.md +155 -0
rem/agentic/agents/__init__.py +38 -0
rem/agentic/agents/agent_manager.py +311 -0
rem/agentic/agents/sse_simulator.py +502 -0
rem/agentic/context.py +425 -0
rem/agentic/context_builder.py +360 -0
rem/agentic/llm_provider_models.py +301 -0
rem/agentic/mcp/__init__.py +0 -0
rem/agentic/mcp/tool_wrapper.py +273 -0
rem/agentic/otel/__init__.py +5 -0
rem/agentic/otel/setup.py +240 -0
rem/agentic/providers/phoenix.py +926 -0
rem/agentic/providers/pydantic_ai.py +854 -0
rem/agentic/query.py +117 -0
rem/agentic/query_helper.py +89 -0
rem/agentic/schema.py +737 -0
rem/agentic/serialization.py +245 -0
rem/agentic/tools/__init__.py +5 -0
rem/agentic/tools/rem_tools.py +242 -0
rem/api/README.md +657 -0
rem/api/deps.py +253 -0
rem/api/main.py +460 -0
rem/api/mcp_router/prompts.py +182 -0
rem/api/mcp_router/resources.py +820 -0
rem/api/mcp_router/server.py +243 -0
rem/api/mcp_router/tools.py +1605 -0
rem/api/middleware/tracking.py +172 -0
rem/api/routers/admin.py +520 -0
rem/api/routers/auth.py +898 -0
rem/api/routers/chat/__init__.py +5 -0
rem/api/routers/chat/child_streaming.py +394 -0
rem/api/routers/chat/completions.py +702 -0
rem/api/routers/chat/json_utils.py +76 -0
rem/api/routers/chat/models.py +202 -0
rem/api/routers/chat/otel_utils.py +33 -0
rem/api/routers/chat/sse_events.py +546 -0
rem/api/routers/chat/streaming.py +950 -0
rem/api/routers/chat/streaming_utils.py +327 -0
rem/api/routers/common.py +18 -0
rem/api/routers/dev.py +87 -0
rem/api/routers/feedback.py +276 -0
rem/api/routers/messages.py +620 -0
rem/api/routers/models.py +86 -0
rem/api/routers/query.py +362 -0
rem/api/routers/shared_sessions.py +422 -0
rem/auth/README.md +258 -0
rem/auth/__init__.py +36 -0
rem/auth/jwt.py +367 -0
rem/auth/middleware.py +318 -0
rem/auth/providers/__init__.py +16 -0
rem/auth/providers/base.py +376 -0
rem/auth/providers/email.py +215 -0
rem/auth/providers/google.py +163 -0
rem/auth/providers/microsoft.py +237 -0
rem/cli/README.md +517 -0
rem/cli/__init__.py +8 -0
rem/cli/commands/README.md +299 -0
rem/cli/commands/__init__.py +3 -0
rem/cli/commands/ask.py +549 -0
rem/cli/commands/cluster.py +1808 -0
rem/cli/commands/configure.py +495 -0
rem/cli/commands/db.py +828 -0
rem/cli/commands/dreaming.py +324 -0
rem/cli/commands/experiments.py +1698 -0
rem/cli/commands/mcp.py +66 -0
rem/cli/commands/process.py +388 -0
rem/cli/commands/query.py +109 -0
rem/cli/commands/scaffold.py +47 -0
rem/cli/commands/schema.py +230 -0
rem/cli/commands/serve.py +106 -0
rem/cli/commands/session.py +453 -0
rem/cli/dreaming.py +363 -0
rem/cli/main.py +123 -0
rem/config.py +244 -0
rem/mcp_server.py +41 -0
rem/models/core/__init__.py +49 -0
rem/models/core/core_model.py +70 -0
rem/models/core/engram.py +333 -0
rem/models/core/experiment.py +672 -0
rem/models/core/inline_edge.py +132 -0
rem/models/core/rem_query.py +246 -0
rem/models/entities/__init__.py +68 -0
rem/models/entities/domain_resource.py +38 -0
rem/models/entities/feedback.py +123 -0
rem/models/entities/file.py +57 -0
rem/models/entities/image_resource.py +88 -0
rem/models/entities/message.py +64 -0
rem/models/entities/moment.py +123 -0
rem/models/entities/ontology.py +181 -0
rem/models/entities/ontology_config.py +131 -0
rem/models/entities/resource.py +95 -0
rem/models/entities/schema.py +87 -0
rem/models/entities/session.py +84 -0
rem/models/entities/shared_session.py +180 -0
rem/models/entities/subscriber.py +175 -0
rem/models/entities/user.py +93 -0
rem/py.typed +0 -0
rem/registry.py +373 -0
rem/schemas/README.md +507 -0
rem/schemas/__init__.py +6 -0
rem/schemas/agents/README.md +92 -0
rem/schemas/agents/core/agent-builder.yaml +235 -0
rem/schemas/agents/core/moment-builder.yaml +178 -0
rem/schemas/agents/core/rem-query-agent.yaml +226 -0
rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
rem/schemas/agents/core/simple-assistant.yaml +19 -0
rem/schemas/agents/core/user-profile-builder.yaml +163 -0
rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
rem/schemas/agents/examples/contract-extractor.yaml +134 -0
rem/schemas/agents/examples/cv-parser.yaml +263 -0
rem/schemas/agents/examples/hello-world.yaml +37 -0
rem/schemas/agents/examples/query.yaml +54 -0
rem/schemas/agents/examples/simple.yaml +21 -0
rem/schemas/agents/examples/test.yaml +29 -0
rem/schemas/agents/rem.yaml +132 -0
rem/schemas/evaluators/hello-world/default.yaml +77 -0
rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
rem/services/__init__.py +18 -0
rem/services/audio/INTEGRATION.md +308 -0
rem/services/audio/README.md +376 -0
rem/services/audio/__init__.py +15 -0
rem/services/audio/chunker.py +354 -0
rem/services/audio/transcriber.py +259 -0
rem/services/content/README.md +1269 -0
rem/services/content/__init__.py +5 -0
rem/services/content/providers.py +760 -0
rem/services/content/service.py +762 -0
rem/services/dreaming/README.md +230 -0
rem/services/dreaming/__init__.py +53 -0
rem/services/dreaming/affinity_service.py +322 -0
rem/services/dreaming/moment_service.py +251 -0
rem/services/dreaming/ontology_service.py +54 -0
rem/services/dreaming/user_model_service.py +297 -0
rem/services/dreaming/utils.py +39 -0
rem/services/email/__init__.py +10 -0
rem/services/email/service.py +522 -0
rem/services/email/templates.py +360 -0
rem/services/embeddings/__init__.py +11 -0
rem/services/embeddings/api.py +127 -0
rem/services/embeddings/worker.py +435 -0
rem/services/fs/README.md +662 -0
rem/services/fs/__init__.py +62 -0
rem/services/fs/examples.py +206 -0
rem/services/fs/examples_paths.py +204 -0
rem/services/fs/git_provider.py +935 -0
rem/services/fs/local_provider.py +760 -0
rem/services/fs/parsing-hooks-examples.md +172 -0
rem/services/fs/paths.py +276 -0
rem/services/fs/provider.py +460 -0
rem/services/fs/s3_provider.py +1042 -0
rem/services/fs/service.py +186 -0
rem/services/git/README.md +1075 -0
rem/services/git/__init__.py +17 -0
rem/services/git/service.py +469 -0
rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
rem/services/phoenix/README.md +453 -0
rem/services/phoenix/__init__.py +46 -0
rem/services/phoenix/client.py +960 -0
rem/services/phoenix/config.py +88 -0
rem/services/phoenix/prompt_labels.py +477 -0
rem/services/postgres/README.md +757 -0
rem/services/postgres/__init__.py +49 -0
rem/services/postgres/diff_service.py +599 -0
rem/services/postgres/migration_service.py +427 -0
rem/services/postgres/programmable_diff_service.py +635 -0
rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
rem/services/postgres/register_type.py +353 -0
rem/services/postgres/repository.py +481 -0
rem/services/postgres/schema_generator.py +661 -0
rem/services/postgres/service.py +802 -0
rem/services/postgres/sql_builder.py +355 -0
rem/services/rate_limit.py +113 -0
rem/services/rem/README.md +318 -0
rem/services/rem/__init__.py +23 -0
rem/services/rem/exceptions.py +71 -0
rem/services/rem/executor.py +293 -0
rem/services/rem/parser.py +180 -0
rem/services/rem/queries.py +196 -0
rem/services/rem/query.py +371 -0
rem/services/rem/service.py +608 -0
rem/services/session/README.md +374 -0
rem/services/session/__init__.py +13 -0
rem/services/session/compression.py +488 -0
rem/services/session/pydantic_messages.py +310 -0
rem/services/session/reload.py +85 -0
rem/services/user_service.py +130 -0
rem/settings.py +1877 -0
rem/sql/background_indexes.sql +52 -0
rem/sql/migrations/001_install.sql +983 -0
rem/sql/migrations/002_install_models.sql +3157 -0
rem/sql/migrations/003_optional_extensions.sql +326 -0
rem/sql/migrations/004_cache_system.sql +282 -0
rem/sql/migrations/005_schema_update.sql +145 -0
rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
rem/utils/AGENTIC_CHUNKING.md +597 -0
rem/utils/README.md +628 -0
rem/utils/__init__.py +61 -0
rem/utils/agentic_chunking.py +622 -0
rem/utils/batch_ops.py +343 -0
rem/utils/chunking.py +108 -0
rem/utils/clip_embeddings.py +276 -0
rem/utils/constants.py +97 -0
rem/utils/date_utils.py +228 -0
rem/utils/dict_utils.py +98 -0
rem/utils/embeddings.py +436 -0
rem/utils/examples/embeddings_example.py +305 -0
rem/utils/examples/sql_types_example.py +202 -0
rem/utils/files.py +323 -0
rem/utils/markdown.py +16 -0
rem/utils/mime_types.py +158 -0
rem/utils/model_helpers.py +492 -0
rem/utils/schema_loader.py +649 -0
rem/utils/sql_paths.py +146 -0
rem/utils/sql_types.py +350 -0
rem/utils/user_id.py +81 -0
rem/utils/vision.py +325 -0
rem/workers/README.md +506 -0
rem/workers/__init__.py +7 -0
rem/workers/db_listener.py +579 -0
rem/workers/db_maintainer.py +74 -0
rem/workers/dreaming.py +502 -0
rem/workers/engram_processor.py +312 -0
rem/workers/sqs_file_processor.py +193 -0
rem/workers/unlogged_maintainer.py +463 -0
remdb-0.3.242.dist-info/METADATA +1632 -0
remdb-0.3.242.dist-info/RECORD +235 -0
remdb-0.3.242.dist-info/WHEEL +4 -0
remdb-0.3.242.dist-info/entry_points.txt +2 -0

rem/models/entities/image_resource.py ADDED Viewed

@@ -0,0 +1,88 @@
+"""
+ImageResource - Image-specific resource with CLIP embeddings.
+ImageResources are a specialized subclass of Resource for images,
+with support for CLIP embeddings and vision LLM descriptions.
+Key differences from base Resource:
+- **Separate table**: Stored in `image_resources` table, not `resources`
+- **Different embeddings**: Uses CLIP embeddings (multimodal) instead of text embeddings
+- **Embedding provider override**: Must use CLIP-compatible provider (Jina AI, self-hosted)
+- **Vision descriptions**: Optional vision LLM descriptions (tier/sampling gated)
+- **Image metadata**: Dimensions, format, and other image-specific fields
+Why separate table?
+1. Different embedding dimensionality (512/768 vs 1536)
+2. Different embedding model (CLIP vs text-embedding-3-small)
+3. Multimodal search capabilities (text-to-image, image-to-image)
+4. Image-specific indexes and queries
+5. Cost tracking (CLIP tokens vs text tokens)
+Usage:
+- ImageProvider saves to ImageResource table with CLIP embeddings
+- Regular text Resources use standard text embeddings
+- Cross-modal search: text queries can search ImageResources via CLIP
+"""
+from typing import Optional
+from pydantic import Field
+from .resource import Resource
+class ImageResource(Resource):
+    """
+    Image-specific resource with CLIP embeddings.
+    Stored in separate `image_resources` table with CLIP embeddings
+    instead of text embeddings. This enables:
+    - Multimodal search (text-to-image, image-to-image)
+    - Proper dimensionality (512/768 for CLIP vs 1536 for text)
+    - Cost tracking (CLIP tokens separate from text tokens)
+    Embedding Strategy:
+    - Default (when JINA_API_KEY set): Jina CLIP API (jina-clip-v2)
+    - Future: Self-hosted OpenCLIP models via KEDA-scaled pods
+    - Fallback: No embeddings (images searchable by metadata only)
+    Vision LLM Strategy (tier/sampling gated):
+    - Gold tier: Always get vision descriptions
+    - Silver/Free: Probabilistic sampling (IMAGE_VLLM_SAMPLE_RATE)
+    - Fallback: Basic metadata only
+    Tenant isolation provided via CoreModel.tenant_id field.
+    """
+    # Every field declared below is optional and defaults to None. This class
+    # declares no validators of its own, so the value sets named in the field
+    # descriptions (formats, providers, dimensions) are documentation only here.
+
+    # --- Image metadata ---
+    image_width: Optional[int] = Field(
+        default=None,
+        description="Image width in pixels",
+    )
+    image_height: Optional[int] = Field(
+        default=None,
+        description="Image height in pixels",
+    )
+    image_format: Optional[str] = Field(
+        default=None,
+        description="Image format (PNG, JPEG, GIF, WebP)",
+    )
+    # --- Vision LLM output (left as None when no description was generated) ---
+    vision_description: Optional[str] = Field(
+        default=None,
+        description="Vision LLM generated description (markdown, only for gold tier or sampled images)",
+    )
+    vision_provider: Optional[str] = Field(
+        default=None,
+        description="Vision provider used (anthropic, gemini, openai)",
+    )
+    vision_model: Optional[str] = Field(
+        default=None,
+        description="Vision model used for description",
+    )
+    # --- CLIP embedding ---
+    clip_embedding: Optional[list[float]] = Field(
+        default=None,
+        description="CLIP embedding vector (512 or 768 dimensions, from Jina AI or self-hosted)",
+    )
+    # NOTE(review): the description below pairs 512 dimensions with jina-clip-v2;
+    # Jina's published model card lists 1024 as the v2 default (truncatable to
+    # smaller sizes) — confirm which size is actually requested.
+    # Nothing in this class checks clip_dimensions against len(clip_embedding);
+    # whoever populates the two fields has to keep them in sync.
+    clip_dimensions: Optional[int] = Field(
+        default=None,
+        description="CLIP embedding dimensionality (512 for jina-clip-v2, 768 for jina-clip-v1)",
+    )

rem/models/entities/message.py ADDED Viewed

@@ -0,0 +1,64 @@
+"""
+Message - Communication content in REM.
+Messages represent individual communication units (chat messages, emails, etc.)
+that can be grouped into conversations or moments.
+Messages are simpler than Resources but share the same graph connectivity
+through CoreModel inheritance.
+Trace Integration:
+- trace_id: OTEL trace ID for linking to observability
+- span_id: OTEL span ID for specific span reference
+- These enable feedback to be attached to Phoenix annotations
+"""
+from pydantic import Field
+from ..core import CoreModel
+class Message(CoreModel):
+    """
+    Communication content unit.
+    Represents individual messages in conversations, chats, or other
+    communication contexts. Tenant isolation is provided via CoreModel.tenant_id field.
+    Trace fields (trace_id, span_id) enable integration with OTEL/Phoenix
+    for observability and feedback annotation.
+    """
+    # `content` is the only required field; everything else defaults to None.
+    content: str = Field(
+        ...,
+        description="Message content text",
+    )
+    # Free-form string: the roles listed in the description are examples, not
+    # an enforced enum (no validator or Literal type is declared here).
+    message_type: str | None = Field(
+        default=None,
+        description="Message type e.g. role: 'user', 'assistant', 'system', 'tool'",
+    )
+    # NOTE(review): typed as a plain str here, yet the package also ships
+    # sql/migrations/migrate_session_id_to_uuid.sql — confirm whether callers
+    # are expected to pass UUID-formatted values.
+    session_id: str | None = Field(
+        default=None,
+        description="Session identifier for tracking message context",
+    )
+    prompt: str | None = Field(
+        default=None,
+        description="Custom prompt used for this message (if overridden from default)",
+    )
+    # The "provider:model" format named in the description is not validated here.
+    model: str | None = Field(
+        default=None,
+        description="Model used for generating this message (provider:model format)",
+    )
+    token_count: int | None = Field(
+        default=None,
+        description="Token count for this message",
+    )
+    # OTEL/Phoenix trace integration
+    # Both IDs are stored as opaque strings; no hex/length check is applied here.
+    trace_id: str | None = Field(
+        default=None,
+        description="OTEL trace ID for observability integration",
+    )
+    span_id: str | None = Field(
+        default=None,
+        description="OTEL span ID for specific span reference",
+    )

rem/models/entities/moment.py ADDED Viewed

@@ -0,0 +1,123 @@
+"""
+Moment - Temporal narrative in REM.
+Moments are extracted from Resources through first-order dreaming workflows.
+They represent temporal narratives like meetings, coding sessions, conversations,
+or any classified time period when users were focused on specific activities.
+Moments provide temporal structure to the REM graph:
+- Temporal boundaries (starts_timestamp, ends_timestamp)
+- Present persons (who was involved)
+- Emotion tags (team sentiment)
+- Topic tags (what was discussed)
+- Natural language summaries
+Moments enable temporal queries:
+- "What happened between milestone A and B?"
+- "When did Sarah and Mike meet?"
+- "What was discussed in Q4 retrospective?"
+Data Model:
+- Inherits from CoreModel (id, tenant_id, timestamps, graph_edges, etc.)
+- name: Human-readable moment name
+- moment_type: Classification (meeting, coding-session, conversation, etc.)
+- starts_timestamp: Start time
+- ends_timestamp: End time
+- present_persons: List of Person objects with id, name, role
+- emotion_tags: Sentiment tags (happy, frustrated, focused)
+- topic_tags: Topic/concept tags (project names, technologies)
+- summary: Natural language description
+- source_resource_ids: Resources used to construct this moment
+"""
+from datetime import datetime
+from typing import Optional
+from pydantic import BaseModel, Field, model_validator
+from ..core import CoreModel
+class Person(BaseModel):
+    """Person reference in a moment."""
+    # Plain pydantic BaseModel (not a CoreModel): `id` and `name` are required,
+    # `role` is optional. Used as the element type of Moment.present_persons.
+    id: str = Field(..., description="Person entity label")
+    name: str = Field(..., description="Person name")
+    role: Optional[str] = Field(default=None, description="Person role in moment")
+class Moment(CoreModel):
+    """
+    Temporal narrative extracted from resources.
+    Moments provide temporal structure and context for the REM graph,
+    enabling time-based queries and understanding of when events occurred.
+    Tenant isolation is provided via CoreModel.tenant_id field.
+    """
+    # `starts_timestamp` is the only required field. `name` is filled in by
+    # generate_name_if_missing() below when the caller leaves it empty.
+    name: Optional[str] = Field(
+        default=None,
+        description="Human-readable moment name (used as graph label). Auto-generated from starts_timestamp+moment_type if not provided.",
+        json_schema_extra={"entity_key": True},  # Primary business key for KV lookups
+    )
+    moment_type: Optional[str] = Field(
+        default=None,
+        description="Moment classification (meeting, coding-session, conversation, etc.)",
+    )
+    category: Optional[str] = Field(
+        default=None,
+        description="Moment category for grouping and filtering",
+    )
+    starts_timestamp: datetime = Field(
+        ...,
+        description="Moment start time",
+    )
+    # No check in this class that ends_timestamp is later than starts_timestamp.
+    ends_timestamp: Optional[datetime] = Field(
+        default=None,
+        description="Moment end time",
+    )
+    present_persons: list[Person] = Field(
+        default_factory=list,
+        description="People present in the moment",
+    )
+    emotion_tags: list[str] = Field(
+        default_factory=list,
+        description="Emotion/sentiment tags (happy, frustrated, focused, etc.)",
+    )
+    topic_tags: list[str] = Field(
+        default_factory=list,
+        description="Topic/concept tags (project names, technologies, etc.)",
+    )
+    summary: Optional[str] = Field(
+        default=None,
+        description="Natural language summary of the moment",
+    )
+    source_resource_ids: list[str] = Field(
+        default_factory=list,
+        description="Resource IDs used to construct this moment",
+    )
+    @model_validator(mode='after')
+    def generate_name_if_missing(self) -> 'Moment':
+        """Auto-generate name from starts_timestamp+moment_type if not provided."""
+        # `not self.name` treats both None and the empty string as "missing".
+        if not self.name:
+            # Resulting formats:
+            #   with moment_type:    "<Title Cased Type> on YYYY-MM-DD"  (date only)
+            #   without moment_type: "Moment on YYYY-MM-DD HH:MM"
+            if self.starts_timestamp:
+                date_str = self.starts_timestamp.strftime("%Y-%m-%d")
+                # time_str is only consumed by the untyped branch below.
+                time_str = self.starts_timestamp.strftime("%H:%M")
+                if self.moment_type:
+                    # "coding-session" / "coding_session" -> "Coding Session"
+                    moment_label = self.moment_type.replace('-', ' ').replace('_', ' ').title()
+                    # NOTE(review): no time component here, so two moments of the
+                    # same type on the same date get identical names; `name` is
+                    # flagged entity_key above — confirm collisions are acceptable.
+                    self.name = f"{moment_label} on {date_str}"
+                else:
+                    self.name = f"Moment on {date_str} {time_str}"
+            else:
+                # Fallback: use ID or generic name
+                # NOTE(review): starts_timestamp is declared required, so this
+                # branch looks unreachable after normal validation — confirm.
+                if self.id:
+                    self.name = f"moment-{str(self.id)[:8]}"
+                else:
+                    self.name = "unnamed-moment"
+        return self

rem/models/entities/ontology.py ADDED Viewed

@@ -0,0 +1,181 @@
+"""Ontology entity for domain-specific knowledge.
+**What are Ontologies?**
+Ontologies are **domain-specific structured knowledge** that can be:
+1. **Extracted** from files using custom agent schemas (agent-extracted)
+2. **Loaded directly** from external sources like git repos or S3 (direct-loaded)
+**Use Case 1: Agent-Extracted Ontologies**
+File → custom agent → structured JSON → ontology (domain knowledge)
+Example: A contract PDF becomes a structured record with parties, dates, payment terms.
+**Use Case 2: Direct-Loaded Ontologies (Knowledge Bases)**
+External source (git/S3) → load → ontology (reference knowledge)
+Example: A psychiatric ontology of disorders, symptoms, and drugs loaded from markdown
+files in a git repository. Each markdown file becomes an ontology node with:
+- `uri`: git path (e.g., `git://org/repo/ontology/disorders/anxiety/panic-disorder.md`)
+- `content`: markdown content for embedding/search
+- `extracted_data`: parsed frontmatter or structure
+**Architecture:**
+- Runs as part of dreaming worker (background knowledge extraction) OR
+- Loaded directly via `rem db load` for external knowledge bases
+- OntologyConfig defines which files trigger which extractors
+- Multiple ontologies per file (apply different domain lenses)
+- Tenant-scoped: Each tenant can define their own extractors and knowledge bases
+**Use Cases:**
+1. **Recruitment (CV Parsing)** - Agent-extracted
+   - Ontology: Structured fields for filtering/sorting (years_experience, skills[])
+2. **Legal (Contract Analysis)** - Agent-extracted
+   - Ontology: Queryable fields (parties, effective_date, payment_amount)
+3. **Medical Knowledge Base** - Direct-loaded
+   - Ontology: Disorders, symptoms, medications from curated markdown files
+   - Enables semantic search over psychiatric/medical domain knowledge
+4. **Documentation/Procedures** - Direct-loaded
+   - Ontology: Clinical procedures (e.g., SCID-5 assessment steps)
+   - Reference material accessible via RAG
+**Design:**
+- `file_id` and `agent_schema_id` are optional (only needed for agent-extracted)
+- `uri` field for external source references (git://, s3://, https://)
+- Structured data in `extracted_data` (arbitrary JSON)
+- Embeddings generated for semantic search via `content` field
+- Tenant-isolated: OntologyConfigs are tenant-scoped
+"""
+from typing import Any, Optional
+from uuid import UUID
+from pydantic import ConfigDict
+from ..core.core_model import CoreModel
+class Ontology(CoreModel):
+    """Domain-specific knowledge - either agent-extracted or direct-loaded.
+    Attributes:
+        name: Human-readable label for this ontology instance
+        uri: External source reference (git://, s3://, https://) for direct-loaded ontologies
+        file_id: Foreign key to File entity (optional - only for agent-extracted)
+        agent_schema_id: Schema that performed extraction (optional - only for agent-extracted)
+        provider_name: LLM provider used for extraction (optional)
+        model_name: Specific model used (optional)
+        extracted_data: Structured data - either extracted by agent or parsed from source
+        confidence_score: Optional confidence score from extraction (0.0-1.0)
+        extraction_timestamp: When extraction was performed
+        content: Text used for generating embedding
+    Inherited from CoreModel:
+        id: UUID or string identifier
+        created_at: Entity creation timestamp
+        updated_at: Last update timestamp
+        deleted_at: Soft deletion timestamp
+        tenant_id: Multi-tenancy isolation
+        user_id: Ownership
+        graph_edges: Relationships to other entities
+        metadata: Flexible metadata storage
+        tags: Classification tags
+    Example Usage:
+        # Agent-extracted: CV parsing
+        cv_ontology = Ontology(
+            name="john-doe-cv-2024",
+            file_id="file-uuid-123",
+            agent_schema_id="cv-parser-v1",
+            provider_name="anthropic",
+            model_name="claude-sonnet-4-5-20250929",
+            extracted_data={
+                "candidate_name": "John Doe",
+                "skills": ["Python", "PostgreSQL", "Kubernetes"],
+            },
+            confidence_score=0.95,
+            tags=["cv", "engineering"]
+        )
+        # Direct-loaded: Knowledge base from git
+        api_docs = Ontology(
+            name="rest-api-guide",
+            uri="git://example-org/docs/api/rest-api-guide.md",
+            content="# REST API Guide\\n\\nThis guide covers RESTful API design...",
+            extracted_data={
+                "type": "documentation",
+                "category": "api",
+                "version": "2.0",
+            },
+            tags=["api", "rest", "documentation"]
+        )
+        # Direct-loaded: Technical spec from git
+        config_spec = Ontology(
+            name="config-schema",
+            uri="git://example-org/docs/specs/config-schema.md",
+            content="# Configuration Schema\\n\\nThis document defines...",
+            extracted_data={
+                "type": "specification",
+                "format": "yaml",
+                "version": "1.0",
+            },
+            tags=["config", "schema", "specification"]
+        )
+    """
+    # Core fields
+    # `name` is the only required field declared by this class.
+    name: str
+    uri: Optional[str] = None  # External source: git://, s3://, https://
+    # Agent extraction fields (optional - only for agent-extracted ontologies)
+    # file_id accepts either a UUID or an arbitrary string (the docstring
+    # example passes a non-UUID string).
+    file_id: Optional[UUID | str] = None  # FK to File entity
+    agent_schema_id: Optional[str] = None  # Schema that performed extraction
+    provider_name: Optional[str] = None  # LLM provider (anthropic, openai, etc.)
+    model_name: Optional[str] = None  # Specific model used
+    # Data fields
+    extracted_data: Optional[dict[str, Any]] = None  # Structured data
+    # The 0.0-1.0 range is a convention only: no ge/le constraint is declared here.
+    confidence_score: Optional[float] = None  # 0.0-1.0 if provided by agent
+    # Stored as a string, not a datetime; the ISO8601 format is not validated here.
+    extraction_timestamp: Optional[str] = None  # ISO8601 timestamp
+    # Semantic search support - 'content' is a default embeddable field name
+    content: Optional[str] = None  # Text for embedding generation
+    # Schema-level description and example payloads, attached via json_schema_extra;
+    # one example of each flavour (direct-loaded, then agent-extracted).
+    model_config = ConfigDict(
+        json_schema_extra={
+            "description": "Domain-specific knowledge - agent-extracted or direct-loaded from external sources",
+            "examples": [
+                {
+                    "name": "panic-disorder",
+                    "uri": "git://org/repo/ontology/disorders/anxiety/panic-disorder.md",
+                    "content": "# Panic Disorder\n\nPanic disorder is characterized by...",
+                    "extracted_data": {
+                        "type": "disorder",
+                        "category": "anxiety",
+                        "icd10": "F41.0"
+                    },
+                    "tags": ["disorder", "anxiety"]
+                },
+                {
+                    "name": "john-doe-cv-2024",
+                    "file_id": "550e8400-e29b-41d4-a716-446655440000",
+                    "agent_schema_id": "cv-parser-v1",
+                    "provider_name": "anthropic",
+                    "model_name": "claude-sonnet-4-5-20250929",
+                    "extracted_data": {
+                        "candidate_name": "John Doe",
+                        "skills": ["Python", "PostgreSQL"]
+                    },
+                    "confidence_score": 0.95,
+                    "tags": ["cv", "engineering"]
+                }
+            ]
+        }
+    )

rem/models/entities/ontology_config.py ADDED Viewed

@@ -0,0 +1,131 @@
+"""OntologyConfig entity for user-defined ontology extraction rules.
+OntologyConfig allows users to define which agent schemas should be applied to
+which files during the dreaming/processing workflow. This enables domain-specific
+knowledge extraction tailored to user needs.
+Examples:
+- "Apply cv-parser-v1 to all PDF files in /resumes/"
+- "Apply contract-analyzer-v2 to files tagged with 'legal'"
+- "Apply medical-records-extractor to files with mime_type application/pdf AND tags ['medical']"
+Design:
+- Each config is tenant-scoped for isolation
+- File matching via mime_type patterns, tag filters, and URI patterns
+- Multiple configs can match a single file (all will be applied)
+- Priority field for execution order when multiple configs match
+- Enabled/disabled toggle for temporary deactivation
+"""
+from typing import Optional
+from pydantic import ConfigDict
+from ..core.core_model import CoreModel
+class OntologyConfig(CoreModel):
+    """User configuration for automatic ontology extraction.
+    Attributes:
+        name: Human-readable config name
+        agent_schema_id: Foreign key to Schema entity to use for extraction
+        description: Purpose and scope of this config
+        # File matching rules (ANY matching rule triggers extraction)
+        mime_type_pattern: Regex pattern for file MIME types (e.g., "application/pdf")
+        uri_pattern: Regex pattern for file URIs (e.g., "s3://bucket/resumes/.*")
+        tag_filter: List of tags (file must have ALL tags to match)
+        # Execution control
+        priority: Execution order (higher = earlier, default 100)
+        enabled: Whether this config is active (default True)
+        # LLM provider configuration
+        provider_name: Optional LLM provider override (defaults to settings)
+        model_name: Optional model override (defaults to settings)
+    Inherited from CoreModel:
+        id, created_at, updated_at, deleted_at, tenant_id, user_id,
+        graph_edges, metadata, tags, column
+    Example Usage:
+        # CV extraction for recruitment
+        cv_config = OntologyConfig(
+            name="recruitment-cv-parser",
+            agent_schema_id="cv-parser-v1",
+            description="Extract candidate information from resumes",
+            mime_type_pattern="application/pdf",
+            uri_pattern=".*/resumes/.*",
+            tag_filter=["cv", "candidate"],
+            priority=100,
+            enabled=True,
+            tenant_id="acme-corp",
+            tags=["recruitment", "hr"]
+        )
+        # Contract analysis for legal team
+        contract_config = OntologyConfig(
+            name="legal-contract-analyzer",
+            agent_schema_id="contract-parser-v2",
+            description="Extract key terms from supplier contracts",
+            mime_type_pattern="application/(pdf|msword|vnd.openxmlformats.*)",
+            tag_filter=["legal", "contract"],
+            priority=200,  # Higher priority = runs first
+            enabled=True,
+            provider_name="openai",  # Override default provider
+            model_name="gpt-4.1",
+            tenant_id="acme-corp",
+            tags=["legal", "procurement"]
+        )
+        # Medical records for healthcare
+        medical_config = OntologyConfig(
+            name="medical-records-extractor",
+            agent_schema_id="medical-parser-v1",
+            description="Extract diagnoses and treatments from medical records",
+            mime_type_pattern="application/pdf",
+            tag_filter=["medical", "patient-record"],
+            priority=50,
+            enabled=True,
+            tenant_id="healthsystem",
+            tags=["medical", "hipaa-compliant"]
+        )
+    """
+    # This class only stores the rules. The matching and ordering semantics
+    # described in the comments below are implemented by whatever consumes the
+    # config, not here.
+    # Core fields
+    # `name` and `agent_schema_id` are the two required fields.
+    name: str
+    agent_schema_id: str  # Foreign key to Schema entity
+    description: Optional[str] = None
+    # File matching rules (ANY rule can trigger match)
+    # Patterns are kept as plain strings; they are not compiled or checked for
+    # regex validity in this class.
+    mime_type_pattern: Optional[str] = None  # Regex for MIME type
+    uri_pattern: Optional[str] = None  # Regex for file URI
+    # NOTE(review): mutable literal default. Pydantic documents that it copies
+    # such defaults per instance, so this should not be shared state, but
+    # Field(default_factory=list) would make that explicit — confirm preference.
+    tag_filter: list[str] = []  # File must have ALL tags
+    # Execution control
+    priority: int = 100  # Higher = runs first
+    enabled: bool = True  # Toggle to disable without deleting
+    # Optional provider overrides
+    provider_name: Optional[str] = None  # Override default provider
+    model_name: Optional[str] = None  # Override default model
+    # Schema-level description and a single example payload, attached via
+    # json_schema_extra.
+    model_config = ConfigDict(
+        json_schema_extra={
+            "description": "Configuration for automatic ontology extraction from files",
+            "examples": [
+                {
+                    "name": "recruitment-cv-parser",
+                    "agent_schema_id": "cv-parser-v1",
+                    "description": "Extract candidate information from resumes",
+                    "mime_type_pattern": "application/pdf",
+                    "uri_pattern": ".*/resumes/.*",
+                    "tag_filter": ["cv", "candidate"],
+                    "priority": 100,
+                    "enabled": True,
+                    "tenant_id": "acme-corp"
+                }
+            ]
+        }
+    )

rem/models/entities/resource.py ADDED Viewed

@@ -0,0 +1,95 @@
+"""
+Resource - Base content unit in REM.
+Resources represent documents, conversations, artifacts, and any other
+content units that form the foundation of the REM memory system.
+Resources are the primary input to dreaming workflows:
+- First-order dreaming extracts Moments from Resources
+- Second-order dreaming creates affinity edges between Resources
+- Entity extraction populates related_entities field
+- Graph edges stored in graph_edges (inherited from CoreModel)
+Key Fields:
+- name: Human-readable resource identifier (used in graph labels)
+- uri: Content location or identifier
+- ordinal: Chunk index for resources split from a large document (0 for single-chunk resources)
+- content: Actual content text
+- timestamp: Content creation/publication time
+- category: Resource classification (document, conversation, artifact, etc.)
+- related_entities: Extracted entities (people, projects, concepts)
+"""
+from datetime import datetime
+from typing import Optional
+from pydantic import Field, model_validator
+from ..core import CoreModel
+class Resource(CoreModel):
+    """
+    Base content unit in REM.
+    Resources are content units that feed into dreaming workflows for moment
+    extraction and affinity graph construction. Tenant isolation is provided
+    via CoreModel.tenant_id field.
+    """
+    name: Optional[str] = Field(
+        default=None,
+        description="Human-readable resource name (used as graph label). Auto-generated from uri+ordinal if not provided.",
+        json_schema_extra={"entity_key": True},  # Primary business key for KV lookups
+    )
+    uri: Optional[str] = Field(
+        default=None,
+        description="Content URI or identifier (file path, URL, etc.)",
+    )
+    ordinal: int = Field(
+        default=0,
+        description="Chunk ordinal for splitting large documents (0 for single-chunk resources)",
+        json_schema_extra={"composite_key": True},  # Part of composite unique constraint
+    )
+    content: str = Field(
+        default="",
+        description="Resource content text",
+    )
+    timestamp: datetime = Field(
+        default_factory=datetime.utcnow,
+        description="Resource timestamp (content creation/publication time)",
+    )
+    category: Optional[str] = Field(
+        default=None,
+        description="Resource category (document, conversation, artifact, etc.)",
+    )
+    related_entities: list[dict] = Field(
+        default_factory=list,
+        description="Extracted entities (people, projects, concepts) with metadata",
+    )
+    @model_validator(mode='after')
+    def generate_name_if_missing(self) -> 'Resource':
+        """Auto-generate name from uri+ordinal if not provided."""
+        if not self.name:
+            if self.uri:
+                # Extract filename from URI if possible
+                uri_parts = self.uri.rstrip('/').split('/')
+                filename = uri_parts[-1]
+                # Remove file extension for cleaner names
+                if '.' in filename:
+                    filename = filename.rsplit('.', 1)[0]
+                # Generate name with ordinal
+                if self.ordinal > 0:
+                    self.name = f"{filename}-chunk-{self.ordinal}"
+                else:
+                    self.name = filename
+            else:
+                # Fallback: use ID or generic name
+                if self.id:
+                    self.name = f"resource-{str(self.id)[:8]}"
+                else:
+                    self.name = "unnamed-resource"
+        return self