agent_brain_rag-1.1.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
@@ -0,0 +1,157 @@
+ """Indexing request, response, and state models."""
+
+ from datetime import datetime
+ from enum import Enum
+ from typing import Optional
+
+ from pydantic import BaseModel, Field
+
+
+ class CodeChunkStrategy(str, Enum):
+     """Strategy for chunking code files."""
+
+     AST_AWARE = "ast_aware"  # Use LlamaIndex CodeSplitter for AST boundaries
+     TEXT_BASED = "text_based"  # Use regular text chunking
+
+
+ class IndexingStatusEnum(str, Enum):
+     """Enumeration of indexing status values."""
+
+     IDLE = "idle"
+     INDEXING = "indexing"
+     COMPLETED = "completed"
+     FAILED = "failed"
+
+
+ class IndexRequest(BaseModel):
+     """Request model for indexing documents."""
+
+     folder_path: str = Field(
+         ...,
+         min_length=1,
+         description="Path to folder containing documents to index",
+     )
+     chunk_size: int = Field(
+         default=512,
+         ge=128,
+         le=2048,
+         description="Target chunk size in tokens",
+     )
+     chunk_overlap: int = Field(
+         default=50,
+         ge=0,
+         le=200,
+         description="Overlap between chunks in tokens",
+     )
+     recursive: bool = Field(
+         default=True,
+         description="Whether to scan folder recursively",
+     )
+
+     # Code indexing options
+     include_code: bool = Field(
+         default=False,
+         description="Whether to index source code files alongside documents",
+     )
+     supported_languages: Optional[list[str]] = Field(
+         default=None,
+         description="Programming languages to index (defaults to all supported)",
+         examples=[["python", "typescript"], ["java", "kotlin"]],
+     )
+     code_chunk_strategy: CodeChunkStrategy = Field(
+         default=CodeChunkStrategy.AST_AWARE,
+         description="Strategy for chunking code files",
+     )
+     generate_summaries: bool = Field(
+         default=False,
+         description="Generate LLM summaries for code chunks to improve semantic search",
+     )
+
+     # File filtering options
+     include_patterns: Optional[list[str]] = Field(
+         default=None,
+         description="Additional file patterns to include (supports wildcards)",
+         examples=[["*.md", "*.py"], ["docs/**/*.md", "src/**/*.py"]],
+     )
+     exclude_patterns: Optional[list[str]] = Field(
+         default=None,
+         description="Additional file patterns to exclude (supports wildcards)",
+         examples=[["*.log", "__pycache__/**"], ["node_modules/**", "*.tmp"]],
+     )
+
+     model_config = {
+         "json_schema_extra": {
+             "examples": [
+                 {
+                     "folder_path": "/path/to/documents",
+                     "chunk_size": 512,
+                     "chunk_overlap": 50,
+                     "recursive": True,
+                 },
+                 {
+                     "folder_path": "/path/to/project",
+                     "chunk_size": 512,
+                     "chunk_overlap": 50,
+                     "recursive": True,
+                     "include_code": True,
+                     "supported_languages": ["python", "typescript", "javascript"],
+                     "code_chunk_strategy": "ast_aware",
+                     "include_patterns": ["docs/**/*.md", "src/**/*.py", "src/**/*.ts"],
+                     "exclude_patterns": ["node_modules/**", "__pycache__/**", "*.log"],
+                 },
+                 {
+                     "folder_path": "/path/to/codebase",
+                     "include_code": True,
+                     "supported_languages": ["java", "kotlin"],
+                     "code_chunk_strategy": "ast_aware",
+                 },
+             ]
+         }
+     }
+
+
+ class IndexResponse(BaseModel):
+     """Response model for indexing operations."""
+
+     job_id: str = Field(..., description="Unique identifier for the indexing job")
+     status: str = Field(..., description="Current status of the indexing job")
+     message: Optional[str] = Field(None, description="Additional status message")
+
+     model_config = {
+         "json_schema_extra": {
+             "examples": [
+                 {
+                     "job_id": "job_abc123",
+                     "status": "started",
+                     "message": "Indexing started for /path/to/documents",
+                 }
+             ]
+         }
+     }
+
+
+ class IndexingState(BaseModel):
+     """Internal state model for tracking indexing progress."""
+
+     current_job_id: Optional[str] = Field(None, description="Current job ID")
+     status: IndexingStatusEnum = Field(
+         default=IndexingStatusEnum.IDLE,
+         description="Current indexing status",
+     )
+     is_indexing: bool = Field(default=False, description="Whether indexing is active")
+     folder_path: Optional[str] = Field(None, description="Folder being indexed")
+     total_documents: int = Field(default=0, description="Total documents found")
+     processed_documents: int = Field(default=0, description="Documents processed")
+     total_chunks: int = Field(default=0, description="Total chunks created")
+     started_at: Optional[datetime] = Field(None, description="When indexing started")
+     completed_at: Optional[datetime] = Field(
+         None, description="When indexing completed"
+     )
+     error: Optional[str] = Field(None, description="Error message if failed")
+
+     @property
+     def progress_percent(self) -> float:
+         """Calculate progress percentage."""
+         if self.total_documents == 0:
+             return 0.0
+         return (self.processed_documents / self.total_documents) * 100
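
For orientation, a minimal usage sketch of these models (not part of the package); the import path is an assumption, since the diff does not show the wheel's module layout:

# Sketch only: the module path below is assumed, not confirmed by this diff.
from pydantic import ValidationError

from agent_brain_rag.models.indexing import IndexingState, IndexRequest  # hypothetical path

req = IndexRequest(folder_path="/path/to/documents")
print(req.chunk_size, req.code_chunk_strategy.value)  # 512 ast_aware

try:
    IndexRequest(folder_path="/path/to/documents", chunk_size=64)  # violates ge=128
except ValidationError as exc:
    print(exc.errors()[0]["type"])  # greater_than_equal

state = IndexingState(total_documents=40, processed_documents=10)
print(state.progress_percent)  # 25.0
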
@@ -0,0 +1,191 @@
+ """Query request and response models."""
+
+ from enum import Enum
+ from typing import Any, Optional
+
+ from pydantic import BaseModel, Field, field_validator
+
+ from ..indexing.document_loader import LanguageDetector
+
+
+ class QueryMode(str, Enum):
+     """Retrieval modes."""
+
+     VECTOR = "vector"
+     BM25 = "bm25"
+     HYBRID = "hybrid"
+
+
+ class QueryRequest(BaseModel):
+     """Request model for document queries."""
+
+     query: str = Field(
+         ...,
+         min_length=1,
+         max_length=1000,
+         description="The search query text",
+     )
+     top_k: int = Field(
+         default=5,
+         ge=1,
+         le=50,
+         description="Number of results to return",
+     )
+     similarity_threshold: float = Field(
+         default=0.7,
+         ge=0.0,
+         le=1.0,
+         description="Minimum similarity score (0-1)",
+     )
+     mode: QueryMode = Field(
+         default=QueryMode.HYBRID,
+         description="Retrieval mode (vector, bm25, hybrid)",
+     )
+     alpha: float = Field(
+         default=0.5,
+         ge=0.0,
+         le=1.0,
+         description="Weight for hybrid search (1.0 = pure vector, 0.0 = pure bm25)",
+     )
+
+     # Content filtering
+     source_types: list[str] | None = Field(
+         default=None,
+         description="Filter by source types: 'doc', 'code', 'test'",
+         examples=[["doc"], ["code"], ["doc", "code"]],
+     )
+     languages: list[str] | None = Field(
+         default=None,
+         description="Filter by programming languages for code files",
+         examples=[["python"], ["typescript", "javascript"], ["java", "kotlin"]],
+     )
+     file_paths: list[str] | None = Field(
+         default=None,
+         description="Filter by specific file paths (supports wildcards)",
+         examples=[["docs/*.md"], ["src/**/*.py"]],
+     )
+
+     @field_validator("languages")
+     @classmethod
+     def validate_languages(cls, v: Optional[list[str]]) -> Optional[list[str]]:
+         """Validate that provided languages are supported."""
+         if v is None:
+             return v
+
+         detector = LanguageDetector()
+         supported_languages = detector.get_supported_languages()
+
+         invalid_languages = [lang for lang in v if lang not in supported_languages]
+         if invalid_languages:
+             raise ValueError(
+                 f"Unsupported languages: {invalid_languages}. "
+                 f"Supported languages: {supported_languages}"
+             )
+
+         return v
+
+     model_config = {
+         "json_schema_extra": {
+             "examples": [
+                 {
+                     "query": "How do I configure authentication?",
+                     "top_k": 5,
+                     "similarity_threshold": 0.7,
+                     "mode": "hybrid",
+                     "alpha": 0.5,
+                 },
+                 {
+                     "query": "implement user authentication",
+                     "top_k": 10,
+                     "source_types": ["code"],
+                     "languages": ["python", "typescript"],
+                 },
+                 {
+                     "query": "API endpoints",
+                     "top_k": 5,
+                     "source_types": ["doc", "code"],
+                     "file_paths": ["docs/api/*.md", "src/**/*.py"],
+                 },
+             ]
+         }
+     }
+
+
+ class QueryResult(BaseModel):
+     """Single query result with source and score."""
+
+     text: str = Field(..., description="The chunk text content")
+     source: str = Field(..., description="Source file path")
+     score: float = Field(..., description="Primary score (rank or similarity)")
+     vector_score: float | None = Field(
+         default=None, description="Score from vector search"
+     )
+     bm25_score: float | None = Field(default=None, description="Score from BM25 search")
+     chunk_id: str = Field(..., description="Unique chunk identifier")
+
+     # Content type information
+     source_type: str = Field(
+         default="doc", description="Type of content: 'doc', 'code', or 'test'"
+     )
+     language: str | None = Field(
+         default=None, description="Programming language for code files"
+     )
+
+     # Additional metadata
+     metadata: dict[str, Any] = Field(
+         default_factory=dict, description="Additional metadata"
+     )
+
+
+ class QueryResponse(BaseModel):
+     """Response model for document queries."""
+
+     results: list[QueryResult] = Field(
+         default_factory=list,
+         description="List of matching document chunks",
+     )
+     query_time_ms: float = Field(
+         ...,
+         ge=0,
+         description="Query execution time in milliseconds",
+     )
+     total_results: int = Field(
+         default=0,
+         ge=0,
+         description="Total number of results found",
+     )
+
+     model_config = {
+         "json_schema_extra": {
+             "examples": [
+                 {
+                     "results": [
+                         {
+                             "text": "Authentication is configured via...",
+                             "source": "docs/auth.md",
+                             "score": 0.92,
+                             "vector_score": 0.92,
+                             "bm25_score": 0.85,
+                             "chunk_id": "chunk_abc123",
+                             "source_type": "doc",
+                             "language": "markdown",
+                             "metadata": {"chunk_index": 0},
+                         },
+                         {
+                             "text": "def authenticate_user(username, password):",
+                             "source": "src/auth.py",
+                             "score": 0.88,
+                             "vector_score": 0.88,
+                             "bm25_score": 0.82,
+                             "chunk_id": "chunk_def456",
+                             "source_type": "code",
+                             "language": "python",
+                             "metadata": {"symbol_name": "authenticate_user"},
+                         },
+                     ],
+                     "query_time_ms": 125.5,
+                     "total_results": 2,
+                 }
+             ]
+         }
+     }
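
Likewise a minimal sketch for the query models (not part of the package); the import path is assumed, and "cobol" merely stands in for a language the package's LanguageDetector does not support:

# Sketch only: module path is assumed, not confirmed by this diff.
from agent_brain_rag.models.query import QueryRequest  # hypothetical path

req = QueryRequest(query="How do I configure authentication?")
print(req.mode.value, req.top_k, req.alpha)  # hybrid 5 0.5

# validate_languages runs at construction time; pydantic wraps the
# ValueError in a ValidationError (itself a ValueError subclass).
try:
    QueryRequest(query="auth flow", languages=["cobol"])  # assumed unsupported
except ValueError as exc:
    print(exc)  # names the unsupported and the supported languages
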
@@ -0,0 +1,85 @@
+ """Project root resolution for per-project doc-serve instances."""
+
+ import logging
+ import subprocess
+ from pathlib import Path
+ from typing import Optional
+
+ logger = logging.getLogger(__name__)
+
+
+ def resolve_project_root(start_path: Optional[Path] = None) -> Path:
+     """Resolve the canonical project root directory.
+
+     Resolution order:
+     1. Git repository root (git rev-parse --show-toplevel)
+     2. Walk up looking for .claude/ directory
+     3. Walk up looking for pyproject.toml
+     4. Fall back to cwd
+
+     Always resolves symlinks for canonical paths.
+
+     Args:
+         start_path: Starting path for resolution. Defaults to cwd.
+
+     Returns:
+         Resolved project root path.
+     """
+     start = (start_path or Path.cwd()).resolve()
+
+     # Try git root first
+     git_root = _resolve_git_root(start)
+     if git_root:
+         return git_root
+
+     # Walk up looking for markers
+     marker_root = _walk_up_for_marker(start)
+     if marker_root:
+         return marker_root
+
+     return start
+
+
+ def _resolve_git_root(start: Path) -> Optional[Path]:
+     """Resolve git repository root with timeout.
+
+     Args:
+         start: Directory to start searching from.
+
+     Returns:
+         Git root path or None if not in a git repo.
+     """
+     try:
+         result = subprocess.run(
+             ["git", "rev-parse", "--show-toplevel"],
+             capture_output=True,
+             text=True,
+             timeout=5,
+             cwd=str(start),
+         )
+         if result.returncode == 0:
+             return Path(result.stdout.strip()).resolve()
+     except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
+         pass
+     return None
+
+
+ def _walk_up_for_marker(start: Path) -> Optional[Path]:
+     """Walk up directories looking for project markers.
+
+     Looks for .claude/ directory or pyproject.toml file.
+
+     Args:
+         start: Directory to start walking from.
+
+     Returns:
+         Directory containing a marker, or None.
+     """
+     current = start
+     while current != current.parent:
+         if (current / ".claude").is_dir():
+             return current
+         if (current / "pyproject.toml").is_file():
+             return current
+         current = current.parent
+     return None
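
A small sketch of the fallback order (not part of the package), assuming the temporary directory is not itself inside a git repository and that the module path is as guessed:

# Sketch only: import path is assumed, not confirmed by this diff.
import tempfile
from pathlib import Path

from agent_brain_rag.project_root import resolve_project_root  # hypothetical path

with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp).resolve()
    (root / "pyproject.toml").touch()
    nested = root / "src" / "pkg"
    nested.mkdir(parents=True)
    # No git root and no .claude/ here, so the pyproject.toml marker wins.
    print(resolve_project_root(nested) == root)  # True
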
@@ -0,0 +1,112 @@
+ """Runtime state management for doc-serve instances."""
+
+ import json
+ import logging
+ import os
+ import urllib.request
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import Optional
+ from uuid import uuid4
+
+ from pydantic import BaseModel, Field
+
+ logger = logging.getLogger(__name__)
+
+
+ class RuntimeState(BaseModel):
+     """Runtime state for a doc-serve instance."""
+
+     schema_version: str = "1.0"
+     mode: str = "project"  # "project" or "shared"
+     project_root: str = ""
+     instance_id: str = Field(default_factory=lambda: uuid4().hex[:12])
+     base_url: str = ""
+     bind_host: str = "127.0.0.1"
+     port: int = 0
+     pid: int = 0
+     started_at: str = Field(
+         default_factory=lambda: datetime.now(timezone.utc).isoformat()
+     )
+     # Shared mode fields
+     project_id: Optional[str] = None
+     active_projects: Optional[list[str]] = None
+
+
+ def write_runtime(state_dir: Path, state: RuntimeState) -> None:
+     """Write runtime state to state directory.
+
+     Args:
+         state_dir: Path to the state directory.
+         state: Runtime state to write.
+     """
+     state_dir.mkdir(parents=True, exist_ok=True)
+     runtime_path = state_dir / "runtime.json"
+     runtime_path.write_text(state.model_dump_json(indent=2))
+     logger.info(f"Runtime state written to {runtime_path}")
+
+
+ def read_runtime(state_dir: Path) -> Optional[RuntimeState]:
+     """Read runtime state from state directory.
+
+     Args:
+         state_dir: Path to the state directory.
+
+     Returns:
+         RuntimeState if file exists and is valid, None otherwise.
+     """
+     runtime_path = state_dir / "runtime.json"
+     if not runtime_path.exists():
+         return None
+     try:
+         data = json.loads(runtime_path.read_text())
+         return RuntimeState(**data)
+     except Exception as e:
+         logger.warning(f"Failed to read runtime state: {e}")
+         return None
+
+
+ def delete_runtime(state_dir: Path) -> None:
+     """Delete runtime state file.
+
+     Args:
+         state_dir: Path to the state directory.
+     """
+     runtime_path = state_dir / "runtime.json"
+     if runtime_path.exists():
+         runtime_path.unlink()
+         logger.info(f"Runtime state deleted: {runtime_path}")
+
+
+ def validate_runtime(state: RuntimeState) -> bool:
+     """Validate that the runtime state is still valid.
+
+     Checks:
+     1. PID is still alive
+     2. Health endpoint responds
+
+     Args:
+         state: Runtime state to validate.
+
+     Returns:
+         True if the instance is still running, False otherwise.
+     """
+     # Check PID
+     if state.pid:
+         try:
+             os.kill(state.pid, 0)
+         except ProcessLookupError:
+             return False
+         except PermissionError:
+             pass  # Process exists but we can't signal it
+
+     # Check health endpoint
+     if state.base_url:
+         try:
+             req = urllib.request.Request(f"{state.base_url}/health/", method="GET")
+             with urllib.request.urlopen(req, timeout=3) as resp:
+                 return bool(resp.status == 200)
+         except Exception:
+             return False
+
+     return False
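
And a write/read/validate round trip for the runtime state (not part of the package); the import path and state directory are assumptions:

# Sketch only: import path and state directory are assumed.
import os
from pathlib import Path

from agent_brain_rag.runtime import (  # hypothetical path
    RuntimeState,
    read_runtime,
    validate_runtime,
    write_runtime,
)

state_dir = Path(".doc-serve-state")  # hypothetical location
write_runtime(state_dir, RuntimeState(project_root=str(Path.cwd()), pid=os.getpid()))

loaded = read_runtime(state_dir)
assert loaded is not None and loaded.pid == os.getpid()

# With base_url left empty, validate_runtime returns False even for a
# live PID: it only returns True after a 200 from the /health/ endpoint.
print(validate_runtime(loaded))  # False
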
@@ -0,0 +1,11 @@
+ """Business logic services for indexing and querying."""
+
+ from .indexing_service import IndexingService, get_indexing_service
+ from .query_service import QueryService, get_query_service
+
+ __all__ = [
+     "IndexingService",
+     "get_indexing_service",
+     "QueryService",
+     "get_query_service",
+ ]