PyPI - contextual-engine - Versions diffs - 0.1.0__py3-none-any.whl - Mend

contextual-engine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60)

contextual/__init__.py +18 -0
contextual/__main__.py +11 -0
contextual/cli.py +339 -0
contextual/cli_docs.py +685 -0
contextual/config.py +7 -0
contextual/core/__init__.py +11 -0
contextual/core/errors.py +470 -0
contextual/core/models.py +590 -0
contextual/docs/__init__.py +66 -0
contextual/docs/chunker.py +550 -0
contextual/docs/pipeline.py +513 -0
contextual/docs/retrieval.py +654 -0
contextual/docs/watcher.py +265 -0
contextual/embedding/__init__.py +87 -0
contextual/embedding/cache.py +455 -0
contextual/embedding/embedder.py +414 -0
contextual/embedding/helpers.py +252 -0
contextual/git/__init__.py +22 -0
contextual/git/blame.py +334 -0
contextual/indexing/__init__.py +20 -0
contextual/indexing/bug_sweep.py +119 -0
contextual/indexing/chunker.py +691 -0
contextual/indexing/embedder.py +271 -0
contextual/indexing/file_watcher.py +154 -0
contextual/indexing/incremental.py +260 -0
contextual/indexing/index_writer.py +442 -0
contextual/indexing/pipeline.py +438 -0
contextual/indexing/processor.py +436 -0
contextual/indexing/queries/readme.md +22 -0
contextual/indexing/symbol_extractor.py +426 -0
contextual/indexing/tokenizer.py +203 -0
contextual/integrations/__init__.py +10 -0
contextual/mcp/__init__.py +15 -0
contextual/mcp/__main__.py +24 -0
contextual/mcp/docs_tools.py +286 -0
contextual/mcp/server.py +118 -0
contextual/mcp/tools.py +443 -0
contextual/observability/__init__.py +21 -0
contextual/observability/logging.py +115 -0
contextual/py.typed +0 -0
contextual/retrieval/__init__.py +24 -0
contextual/retrieval/context_assembler.py +372 -0
contextual/retrieval/ranker.py +193 -0
contextual/retrieval/search.py +548 -0
contextual/security/__init__.py +52 -0
contextual/security/paths.py +347 -0
contextual/security/sanitize.py +349 -0
contextual/security/workspace.py +348 -0
contextual/storage/__init__.py +36 -0
contextual/storage/fts_manager.py +273 -0
contextual/storage/migration_v2.py +289 -0
contextual/storage/migrations.py +316 -0
contextual/storage/schema.py +210 -0
contextual/storage/sqlite_pool.py +468 -0
contextual/storage/vec0_manager.py +421 -0
contextual_engine-0.1.0.dist-info/METADATA +297 -0
contextual_engine-0.1.0.dist-info/RECORD +60 -0
contextual_engine-0.1.0.dist-info/WHEEL +4 -0
contextual_engine-0.1.0.dist-info/entry_points.txt +2 -0
contextual_engine-0.1.0.dist-info/licenses/LICENSE +111 -0

contextual/core/models.py ADDED

@@ -0,0 +1,590 @@
+"""Core data models for Contextual.
+This module defines all data shapes used throughout the system. Every other module
+imports from here. These contracts are the architectural foundation - changing them
+requires cascading updates across storage, indexing, and retrieval layers.
+All models use Pydantic v2 for validation and serialization.
+"""
+from __future__ import annotations
+from datetime import UTC, datetime
+from enum import StrEnum
+from pathlib import Path
+from typing import Any, Literal
+from pydantic import BaseModel, Field, field_validator
+# ============================================================================
+# ENUMS - Type-safe categorical values
+# ============================================================================
+class EntityType(StrEnum):
+    """Types of entities that can be tracked in the knowledge graph.
+    These align with code structure and decision artifacts. Because this is a
+    ``StrEnum``, every member is also a ``str`` equal to its value.
+    """
+    # Code structures
+    FUNCTION = "function"
+    CLASS = "class"
+    MODULE = "module"
+    FILE = "file"
+    SYMBOL = "symbol"
+    # Decision artifacts and external references
+    # NOTE(review): DECISION's comment already lists ADR while ADR is also its
+    # own member -- confirm when each one is meant to be used.
+    DECISION = "decision"  # ADR, RFC, design doc
+    ADR = "adr"  # Architecture Decision Record
+    DEPENDENCY = "dependency"  # External package/library
+class ChunkType(StrEnum):
+    """Types of code chunks extracted during indexing.
+    Determines chunking strategy and context assembly behavior.
+    Note that ``Chunk.chunk_type`` is annotated ``ChunkType | str`` and defaults
+    to ``"code"``, so values outside this enum are accepted on chunks.
+    """
+    # Callable and container structures
+    FUNCTION = "function"
+    METHOD = "method"
+    CLASS = "class"
+    MODULE = "module"
+    # Declarations
+    IMPORT_BLOCK = "import_block"
+    TYPE_DEFINITION = "type_definition"
+    INTERFACE = "interface"
+    CONSTANT = "constant"
+class EpisodeSource(StrEnum):
+    """Sources of ingestion events.
+    Tracks provenance of information entering the system; used as the type of
+    ``Episode.source``. Members are ``str`` instances equal to their values.
+    """
+    COMMIT = "commit"  # Git commit
+    FILE = "file"  # Direct file ingestion
+    USER = "user"  # User-provided context
+    HOOK = "hook"  # Git hook trigger
+    MANUAL = "manual"  # Manual re-index
+class FactSource(StrEnum):
+    """Sources of factual assertions.
+    Determines confidence and invalidation rules; used as the type of
+    ``Fact.source``. The confidence levels in the member comments are
+    descriptive only -- this enum does not carry numeric confidence values.
+    """
+    # Note: values are hyphenated, unlike the underscore-separated member names.
+    TREE_SITTER = "tree-sitter"  # AST parsing (high confidence)
+    GIT_BLAME = "git-blame"  # Temporal attribution (high confidence)
+    USER = "user"  # User annotation (medium confidence)
+    LLM_EXTRACT = "llm-extract"  # LLM extraction (lower confidence)
+    HEURISTIC = "heuristic"  # Pattern matching (medium confidence)
+class Language(StrEnum):
+    """Supported programming languages for indexing.
+    Week 1 MVP: Python, TypeScript, JavaScript, Go, Java, Rust, C#
+    TSX is listed as its own member, and several config/documentation formats
+    (JSON, YAML, TOML, Markdown, Dockerfile) are included as well.
+    """
+    # Programming languages
+    PYTHON = "python"
+    TYPESCRIPT = "typescript"
+    JAVASCRIPT = "javascript"
+    TSX = "tsx"
+    GO = "go"
+    JAVA = "java"
+    RUST = "rust"
+    CSHARP = "csharp"
+    # Config formats (free wins)
+    JSON = "json"
+    YAML = "yaml"
+    TOML = "toml"
+    MARKDOWN = "markdown"
+    DOCKERFILE = "dockerfile"
+class ModelType(StrEnum):
+    """Embedding models used for vector generation.
+    Determines which index to search and dimension expectations.
+    Currently a single member; its value matches the only literal accepted by
+    ``EmbeddingConfig.code_model``.
+    """
+    # NOTE(review): the comment says 768d, but Chunk's vector validator also
+    # accepts 256d vectors -- confirm where the 256d variant comes from.
+    JINA_CODE_V2 = "jina-v2-code"  # 768d, code-specific
+# ============================================================================
+# CORE ENTITIES - Bi-temporal knowledge graph nodes
+# ============================================================================
+class Entity(BaseModel):
+    """A tracked entity in the code knowledge graph.
+    Entities represent code structures (functions, classes, modules),
+    decisions (ADRs), and dependencies that have facts associated with them.
+    Attributes:
+        id: Database primary key (assigned on insert); None until then.
+        name: Fully qualified name (e.g., "myapp.utils.helpers.parse_json"),
+            1 to 500 characters long.
+        entity_type: Category of entity.
+        metadata: Flexible JSON blob for entity-specific data; defaults to a
+            fresh empty dict for each instance.
+        created_at: When this entity was first observed (unix milliseconds UTC);
+            defaults to the current time when the model is constructed.
+    """
+    id: int | None = None
+    name: str = Field(..., min_length=1, max_length=500)
+    entity_type: EntityType
+    metadata: dict[str, Any] = Field(default_factory=dict)
+    # Current UTC time in whole milliseconds (int() drops the fractional part).
+    created_at: int = Field(
+        default_factory=lambda: int(datetime.now(UTC).timestamp() * 1000),
+    )
+    model_config = {"frozen": False}  # Attribute assignment stays allowed
+class Fact(BaseModel):
+    """A bi-temporal fact about an entity.
+    Facts are assertions like "function X has signature Y" or "module A imports B".
+    They carry four timestamps for complete temporal tracking:
+    - valid_at: When this became true in reality (from git blame)
+    - invalid_at: When this stopped being true (or None if still valid)
+    - created_at: When we recorded this assertion
+    - expired_at: When we retracted our belief (or None if still believed)
+    This bi-temporal model (Snodgrass/Jensen formalism) enables AS-OF queries
+    and contradiction detection without deleting history.
+    Attributes:
+        id: Database primary key.
+        entity_id: Subject of the fact.
+        object_id: Object entity (for entity-entity relations) or None.
+        predicate: Relationship type (e.g., "has_signature", "calls", "imports").
+        value: Literal value (for entity-value facts) as JSON string.
+        valid_at: When this became true (unix ms UTC, from git blame).
+        invalid_at: When this stopped being true (or None).
+        created_at: When we recorded this (unix ms UTC).
+        expired_at: When we retracted this belief (or None).
+        episode_id: Source episode that created this fact.
+        confidence: 0.0-1.0 confidence score.
+        source: Where this fact came from.
+    """
+    id: int | None = None
+    entity_id: int
+    object_id: int | None = None
+    predicate: str = Field(..., min_length=1, max_length=100)
+    value: str | None = Field(None, max_length=10000)
+    valid_at: int  # Unix milliseconds UTC
+    invalid_at: int | None = None
+    created_at: int = Field(
+        default_factory=lambda: int(datetime.now(UTC).timestamp() * 1000),
+    )
+    expired_at: int | None = None
+    episode_id: int | None = None
+    confidence: float = Field(1.0, ge=0.0, le=1.0)
+    source: FactSource
+    model_config = {"frozen": False}
+    @field_validator("valid_at")
+    @classmethod
+    def validate_valid_at(cls, v: int) -> int:
+        """Ensure valid_at is a reasonable timestamp."""
+        if v < 0:
+            msg = "valid_at must be non-negative"
+            raise ValueError(msg)
+        return v
+class Episode(BaseModel):
+    """An ingestion event that created facts.
+    Episodes provide provenance tracking - every fact can be traced back to
+    the episode that created it (a commit, file change, user annotation, etc.).
+    Attributes:
+        id: Database primary key.
+        source: Type of ingestion event.
+        content: Raw data for provenance (commit SHA, file path, user input);
+            optional, at most 50000 characters.
+        timestamp: When this episode occurred (unix ms UTC); defaults to the
+            current time when the model is constructed.
+        metadata: Flexible JSON blob for episode-specific data; defaults to a
+            fresh empty dict for each instance.
+    """
+    id: int | None = None
+    source: EpisodeSource
+    content: str | None = Field(None, max_length=50000)
+    # Current UTC time in whole milliseconds (int() drops the fractional part).
+    timestamp: int = Field(
+        default_factory=lambda: int(datetime.now(UTC).timestamp() * 1000),
+    )
+    metadata: dict[str, Any] = Field(default_factory=dict)
+    model_config = {"frozen": False}  # Attribute assignment stays allowed
+# ============================================================================
+# CODE CHUNKS - Vector search payloads
+# ============================================================================
+class Chunk(BaseModel):
+    """A code chunk with embedding vector and metadata.
+    Chunks are the atomic units of retrieval. Each represents a coherent piece
+    of code (function, class, module section) with a structural header and body.
+    The content_hash enables deduplication - identical chunks are never re-embedded.
+    Attributes:
+        vector: 768-dimensional embedding (jina-v2-code).
+        path: File path relative to project root.
+        language: Programming language.
+        symbol_name: Function/class name (or None for module-level chunks).
+        chunk_type: Category of code structure.
+        content_hash: SHA-256 of (header + body) for deduplication.
+        header: Structural context (file path, parent class, imports, decorators).
+        body: Actual code content.
+        model_type: Which embedding model generated the vector.
+        start_line: First line of the code chunk.
+        end_line: Last line of the code chunk.
+    """
+    vector: list[float] = Field(..., min_length=256, max_length=768)
+    path: str = Field(..., min_length=1, max_length=500)
+    language: Language
+    symbol_name: str | None = Field(None, max_length=200)
+    chunk_type: ChunkType | str = "code"
+    source_type: str = "code"
+    heading_path: str = ""
+    heading_level: int = 0
+    content_hash: str = Field(..., min_length=64, max_length=64)  # SHA-256 hex
+    header: str = Field(..., max_length=2000)
+    body: str = Field(..., max_length=50000)
+    model_type: ModelType
+    start_line: int = Field(..., ge=0)
+    end_line: int = Field(..., ge=0)
+    model_config = {"frozen": True}  # Chunks are immutable once created
+    @field_validator("vector")
+    @classmethod
+    def validate_vector_dimensions(cls, v: list[float]) -> list[float]:
+        """Ensure vector has valid dimensions (256 or 768)."""
+        if len(v) not in {256, 768}:
+            msg = f"Vector must be 256d or 768d, got {len(v)}d"
+            raise ValueError(msg)
+        return v
+    @field_validator("content_hash")
+    @classmethod
+    def validate_content_hash_format(cls, v: str) -> str:
+        """Ensure content_hash is valid SHA-256 hex."""
+        if not all(c in "0123456789abcdef" for c in v.lower()):
+            msg = "content_hash must be valid SHA-256 hex string"
+            raise ValueError(msg)
+        return v.lower()
+# ============================================================================
+# CONFIGURATION MODELS
+# ============================================================================
+class IndexingConfig(BaseModel):
+    """Configuration for code indexing pipeline.
+    Controls chunking, file discovery, and git integration behavior.
+    Each numeric field is range-checked on its own through its ``Field``
+    bounds; no cross-field relationship (for example min <= target <= max
+    chunk size) is enforced by this model.
+    """
+    # Chunking parameters
+    # NOTE(review): the unit of these sizes (characters vs. tokens) is not
+    # stated in this module -- confirm against the chunker.
+    target_chunk_size: int = Field(1500, ge=500, le=5000)
+    max_chunk_size: int = Field(2000, ge=1000, le=10000)
+    min_chunk_size: int = Field(50, ge=10, le=500)
+    # File discovery
+    max_file_size_bytes: int = Field(2_097_152, ge=0)  # 2 MB default
+    max_file_size_override_bytes: int = Field(10_485_760, ge=0)  # 10 MB override
+    respect_gitignore: bool = True
+    respect_contextualignore: bool = True
+    # Git integration
+    enable_git_blame: bool = True
+    enable_incremental_indexing: bool = True
+    max_commits_per_walk: int = Field(1000, ge=1)
+    # Performance
+    parser_pool_size: int = Field(4, ge=1, le=32)
+    batch_size_chunks: int = Field(100, ge=1, le=1000)
+    model_config = {"frozen": False}  # Settings may be changed after creation
+class EmbeddingConfig(BaseModel):
+    """Configuration for embedding models.
+    Controls which models to use and inference parameters.
+    """
+    # Model selection
+    # Only one literal is accepted; it equals ModelType.JINA_CODE_V2's value.
+    code_model: Literal["jina-v2-code"] = "jina-v2-code"
+    # Inference parameters
+    batch_size: int = Field(64, ge=1, le=256)
+    max_seq_length: int = Field(512, ge=128, le=8192)
+    # Caching
+    # cache_size may be 0; enable_cache is a separate on/off switch.
+    cache_size: int = Field(2000, ge=0)
+    enable_cache: bool = True
+    model_config = {"frozen": False}  # Settings may be changed after creation
+class RetrievalConfig(BaseModel):
+    """Configuration for hybrid retrieval pipeline.
+    Controls BM25, dense search, fusion, and reranking.
+    Every field is bounded individually; relationships between fields (such as
+    the ordering of the various top_k values) are not validated here.
+    """
+    # Retrieval counts
+    bm25_top_k: int = Field(100, ge=1, le=500)
+    dense_top_k: int = Field(100, ge=1, le=500)
+    rerank_top_k: int = Field(20, ge=1, le=100)
+    final_top_k: int = Field(10, ge=1, le=50)
+    # RRF parameters
+    # The default weights sum to 1.0, but nothing here requires that of
+    # user-supplied values.
+    rrf_k: int = Field(60, ge=1, le=100)
+    bm25_weight: float = Field(0.6, ge=0.0, le=1.0)
+    dense_weight: float = Field(0.4, ge=0.0, le=1.0)
+    # MMR parameters
+    mmr_lambda: float = Field(0.7, ge=0.0, le=1.0)
+    max_chunks_per_file: int = Field(2, ge=1, le=10)
+    max_chunks_per_symbol: int = Field(1, ge=1, le=5)
+    # Context assembly
+    max_context_tokens: int = Field(8000, ge=1000, le=200000)
+    structural_context_ratio: float = Field(0.15, ge=0.0, le=0.5)
+    model_config = {"frozen": False}  # Settings may be changed after creation
+class StorageConfig(BaseModel):
+    """Configuration for storage backends.
+    Controls SQLite, LanceDB, and tantivy settings.
+    NOTE(review): the package's storage modules are named fts_manager and
+    vec0_manager -- confirm whether the lance_* and tantivy_* settings below
+    are still consumed.
+    """
+    # Paths (relative to project root)
+    contextual_dir: Path = Field(default=Path(".contextual"))
+    sqlite_db_name: str = "contextual.db"
+    lance_db_name: str = "lance"
+    tantivy_index_name: str = "tantivy"
+    # SQLite
+    sqlite_cache_size_mb: int = Field(64, ge=8, le=1024)
+    sqlite_mmap_size_mb: int = Field(256, ge=64, le=2048)
+    sqlite_wal_autocheckpoint_pages: int = Field(4000, ge=1000, le=10000)
+    # LanceDB
+    lance_ivf_pq_threshold: int = Field(100_000, ge=10000)
+    lance_num_partitions: int | None = None  # Auto: sqrt(n)
+    lance_num_sub_vectors: int = Field(48, ge=8, le=96)
+    lance_refine_factor: int = Field(20, ge=1, le=100)
+    model_config = {"frozen": False}  # Settings may be changed after creation
+class SecurityConfig(BaseModel):
+    """Configuration for security hardening.
+    Controls path validation, sanitization, and workspace isolation.
+    """
+    # Path safety
+    enable_path_traversal_check: bool = True
+    enable_symlink_resolution: bool = True
+    max_path_depth: int = Field(20, ge=5, le=50)
+    # Sanitization
+    strip_unicode_control_chars: bool = True
+    escape_fts5_special_chars: bool = True
+    # Workspace isolation
+    per_project_workspace: bool = True
+    # The ge/le bounds compare the mode as a plain integer, not bit by bit:
+    # any value in the numeric range passes (for example 0o644 is accepted for
+    # db_file_permissions even though it grants group/other read access).
+    workspace_permissions: int = Field(0o700, ge=0o600, le=0o777)
+    db_file_permissions: int = Field(0o600, ge=0o600, le=0o666)
+    model_config = {"frozen": False}  # Settings may be changed after creation
+class ContextualConfig(BaseModel):
+    """Top-level configuration for Contextual.
+    Aggregates all subsystem configurations and provides project-level settings.
+    Attributes:
+        project_root: Absolute path to project being indexed.
+        indexing: Indexing pipeline configuration.
+        embedding: Embedding model configuration.
+        retrieval: Retrieval pipeline configuration.
+        storage: Storage backend configuration.
+        security: Security hardening configuration.
+    """
+    project_root: Path
+    indexing: IndexingConfig = Field(default_factory=lambda: IndexingConfig())  # type: ignore[call-arg]  # noqa: PLW0108
+    embedding: EmbeddingConfig = Field(default_factory=lambda: EmbeddingConfig())  # type: ignore[call-arg]  # noqa: PLW0108
+    retrieval: RetrievalConfig = Field(default_factory=lambda: RetrievalConfig())  # type: ignore[call-arg]  # noqa: PLW0108
+    storage: StorageConfig = Field(default_factory=lambda: StorageConfig())  # type: ignore[call-arg]  # noqa: PLW0108
+    security: SecurityConfig = Field(default_factory=lambda: SecurityConfig())  # type: ignore[call-arg]  # noqa: PLW0108
+    model_config = {"frozen": False}
+    @field_validator("project_root")
+    @classmethod
+    def validate_project_root_exists(cls, v: Path) -> Path:
+        """Ensure project root exists and is absolute."""
+        if not v.is_absolute():
+            msg = "project_root must be an absolute path"
+            raise ValueError(msg)
+        if not v.exists():
+            msg = f"project_root does not exist: {v}"
+            raise ValueError(msg)
+        if not v.is_dir():
+            msg = f"project_root is not a directory: {v}"
+            raise ValueError(msg)
+        return v
+# ============================================================================
+# QUERY & RESULT MODELS - API contracts for search operations
+# ============================================================================
+class SearchQuery(BaseModel):
+    """Query parameters for semantic search.
+    Instances are frozen: fields cannot be reassigned after construction.
+    Attributes:
+        query: Natural language or code query string (1 to 1000 characters).
+        top_k: Number of results to return (1 to 100, default 10).
+        language_filter: Optional language filter.
+        path_filter: Optional path prefix filter.
+        include_scores: Whether to include similarity scores (default True).
+    """
+    query: str = Field(..., min_length=1, max_length=1000)
+    top_k: int = Field(10, ge=1, le=100)
+    language_filter: Language | None = None
+    path_filter: str | None = None
+    include_scores: bool = True
+    model_config = {"frozen": True}  # Queries are immutable once created
+class SearchResult(BaseModel):
+    """A single search result with chunk and metadata.
+    Instances are frozen: fields cannot be reassigned after construction.
+    Attributes:
+        chunk: The matched code chunk.
+        score: Similarity score (0.0-1.0, higher is better); values outside
+            that range fail validation.
+        rank: Result rank (1-indexed).
+    """
+    chunk: Chunk
+    score: float = Field(..., ge=0.0, le=1.0)
+    rank: int = Field(..., ge=1)
+    model_config = {"frozen": True}  # Results are immutable once created
+class SearchResponse(BaseModel):
+    """Complete search response with results and metadata.
+    Instances are frozen: fields cannot be reassigned after construction.
+    Attributes:
+        query: Original query (the plain string, not a SearchQuery object).
+        results: Ranked search results.
+        total_candidates: Total chunks considered (non-negative).
+        pipeline_latency_ms: End-to-end latency in milliseconds (non-negative).
+    """
+    query: str
+    results: list[SearchResult]
+    total_candidates: int = Field(..., ge=0)
+    pipeline_latency_ms: float = Field(..., ge=0.0)
+    model_config = {"frozen": True}  # Responses are immutable once created
+class RecallQuery(BaseModel):
+    """Query parameters for entity recall (temporal queries).
+    Instances are frozen: fields cannot be reassigned after construction.
+    The length limits on entity_name and predicate match those declared on
+    ``Entity.name`` and ``Fact.predicate``.
+    Attributes:
+        entity_name: Name of entity to recall (1 to 500 characters).
+        predicate: Optional predicate filter (at most 100 characters).
+        as_of_timestamp: Optional AS-OF timestamp (unix ms UTC).
+    """
+    entity_name: str = Field(..., min_length=1, max_length=500)
+    predicate: str | None = Field(None, max_length=100)
+    as_of_timestamp: int | None = None
+    model_config = {"frozen": True}  # Queries are immutable once created
+class RecallResult(BaseModel):
+    """Facts about an entity at a specific time.
+    Freezing applies to this model's own fields; the contained Entity and Fact
+    models are themselves declared with ``frozen: False``.
+    Attributes:
+        entity: The queried entity.
+        facts: List of valid facts at query time.
+        as_of: Query timestamp (or None for current).
+    """
+    entity: Entity
+    facts: list[Fact]
+    as_of: int | None = None
+    model_config = {"frozen": True}  # Fields cannot be reassigned
+class IndexProgress(BaseModel):
+    """Progress update during indexing.
+    All counters are required and must be non-negative. The model does not
+    check that files_processed stays at or below total_files.
+    Attributes:
+        phase: Current indexing phase (free-form string).
+        files_processed: Number of files processed.
+        total_files: Total files to process.
+        chunks_created: Number of chunks created.
+        facts_created: Number of facts created.
+    """
+    phase: str
+    files_processed: int = Field(..., ge=0)
+    total_files: int = Field(..., ge=0)
+    chunks_created: int = Field(..., ge=0)
+    facts_created: int = Field(..., ge=0)
+    model_config = {"frozen": False}  # Counters may be updated in place
+class IndexResult(BaseModel):
+    """Result of an indexing operation.
+    Only ``success`` is required; every counter defaults to zero. The model
+    does not tie ``error`` to ``success`` (either may be set independently).
+    Instances are frozen: fields cannot be reassigned after construction.
+    Attributes:
+        success: Whether indexing completed successfully.
+        files_indexed: Number of files successfully indexed.
+        chunks_created: Number of new chunks created.
+        chunks_skipped: Number of chunks skipped (deduped).
+        facts_created: Number of new facts created.
+        duration_seconds: Total indexing time.
+        error: Error message if not successful.
+    """
+    success: bool
+    files_indexed: int = Field(0, ge=0)
+    chunks_created: int = Field(0, ge=0)
+    chunks_skipped: int = Field(0, ge=0)
+    facts_created: int = Field(0, ge=0)
+    duration_seconds: float = Field(0.0, ge=0.0)
+    error: str | None = None
+    model_config = {"frozen": True}  # Results are immutable once created

contextual/docs/__init__.py ADDED

@@ -0,0 +1,66 @@
+"""Docs module — Phase 2: heading-aware document indexing and retrieval.
+Public API
+----------
+Chunking:
+    DocsChunker         Heading-aware markdown/MDX/RST/TXT chunker.
+    DocChunk            Dataclass representing a single doc chunk.
+    chunk_doc_file      Convenience: chunk a single file with defaults.
+Indexing:
+    DocsPipeline        Full indexing pipeline (file discovery → embed → store).
+    DocsIndexStats      Stats dataclass returned by pipeline runs.
+    index_docs          Convenience: index all docs in a repo.
+Retrieval:
+    docs_search         Hybrid BM25 + vector search over doc chunks.
+    docs_get_section    Fetch exact section by heading path (5-tier resolver).
+    docs_list_files     List all indexed doc files.
+    DocSearchHit        Search result dataclass.
+    DocSection          Resolved section dataclass.
+    DocFileInfo         File metadata dataclass.
+File Watching:
+    DocsFileWatcher     Watchdog-based watcher; composable with code watcher.
+"""
+from __future__ import annotations
+from contextual.docs.chunker import (
+    DocChunk,
+    DocsChunker,
+    chunk_doc_file,
+)
+from contextual.docs.pipeline import (
+    DocsPipeline,
+    DocsIndexStats,
+    index_docs,
+)
+from contextual.docs.retrieval import (
+    DocSearchHit,
+    DocSection,
+    DocFileInfo,
+    docs_search,
+    docs_get_section,
+    docs_list_files,
+)
+from contextual.docs.watcher import DocsFileWatcher
+# Public API of the docs package: exactly the names imported above from the
+# chunker, pipeline, retrieval, and watcher submodules, grouped the same way.
+__all__ = [
+    # Chunking
+    "DocChunk",
+    "DocsChunker",
+    "chunk_doc_file",
+    # Indexing
+    "DocsPipeline",
+    "DocsIndexStats",
+    "index_docs",
+    # Retrieval
+    "DocSearchHit",
+    "DocSection",
+    "DocFileInfo",
+    "docs_search",
+    "docs_get_section",
+    "docs_list_files",
+    # Watching
+    "DocsFileWatcher",
+]