dbs-vector 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dbs_vector/__init__.py +6 -0
- dbs_vector/api/__init__.py +0 -0
- dbs_vector/api/main.py +137 -0
- dbs_vector/api/mcp_server.py +100 -0
- dbs_vector/api/state.py +18 -0
- dbs_vector/cli.py +264 -0
- dbs_vector/config.py +110 -0
- dbs_vector/core/__init__.py +0 -0
- dbs_vector/core/models.py +107 -0
- dbs_vector/core/ports.py +89 -0
- dbs_vector/core/registry.py +36 -0
- dbs_vector/infrastructure/__init__.py +0 -0
- dbs_vector/infrastructure/chunking/__init__.py +0 -0
- dbs_vector/infrastructure/chunking/api.py +139 -0
- dbs_vector/infrastructure/chunking/document.py +100 -0
- dbs_vector/infrastructure/chunking/duckdb.py +119 -0
- dbs_vector/infrastructure/chunking/sql.py +65 -0
- dbs_vector/infrastructure/embeddings/__init__.py +0 -0
- dbs_vector/infrastructure/embeddings/mlx_engine.py +106 -0
- dbs_vector/infrastructure/storage/__init__.py +0 -0
- dbs_vector/infrastructure/storage/lancedb_engine.py +145 -0
- dbs_vector/infrastructure/storage/mappers.py +174 -0
- dbs_vector/logger.py +43 -0
- dbs_vector/py.typed +0 -0
- dbs_vector/services/__init__.py +0 -0
- dbs_vector/services/ingestion.py +127 -0
- dbs_vector/services/search.py +76 -0
- dbs_vector-0.5.1.dist-info/METADATA +178 -0
- dbs_vector-0.5.1.dist-info/RECORD +32 -0
- dbs_vector-0.5.1.dist-info/WHEEL +4 -0
- dbs_vector-0.5.1.dist-info/entry_points.txt +2 -0
- dbs_vector-0.5.1.dist-info/licenses/LICENSE.md +10 -0
dbs_vector/__init__.py
ADDED
|
File without changes
|
dbs_vector/api/main.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from collections.abc import AsyncGenerator
|
|
3
|
+
from contextlib import asynccontextmanager
|
|
4
|
+
|
|
5
|
+
from fastapi import FastAPI, HTTPException
|
|
6
|
+
from fastapi.middleware.cors import CORSMiddleware
|
|
7
|
+
from loguru import logger
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
|
|
10
|
+
from dbs_vector.api.mcp_server import mcp
|
|
11
|
+
from dbs_vector.api.state import _services, initialize_services
|
|
12
|
+
from dbs_vector.config import settings
|
|
13
|
+
from dbs_vector.core.models import SearchResult, SqlSearchResult
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    """Startup and shutdown events for the API.

    Startup: populates the shared ``_services`` map via
    ``initialize_services()`` and starts the MCP session manager that backs
    the mounted ``/mcp`` transport. Shutdown: clears the service map.
    """
    logger.info("Initializing MLX Embedders and LanceDB connections")

    try:
        initialize_services()
        logger.success("API is ready to accept concurrent requests")
    except Exception as e:
        # Fail fast: without initialized services every endpoint would 503.
        logger.error("Failed to initialize search services: {}", e)
        raise

    # The MCP session manager must stay running for the whole app lifetime;
    # the app serves requests while suspended at this yield.
    async with mcp.session_manager.run():
        yield

    logger.info("Cleaning up resources")
    _services.clear()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
app = FastAPI(
    title="dbs-vector Search API",
    description="Async API for high-performance Arrow-native local codebase search.",
    version="0.1.0",
    lifespan=lifespan,
)

# CORS: only the Claude web client is allowed to call this API cross-origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["https://claude.ai"],
    allow_methods=["GET", "POST", "OPTIONS"],
    allow_headers=["*"],
)

# Expose the MCP streamable-HTTP transport under /mcp.
app.mount("/mcp", mcp.streamable_http_app())
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class SearchRequest(BaseModel):
    """Schema for a standard document search request (POST /search/md)."""

    query: str = Field(..., description="The semantic search query.")
    limit: int = Field(5, ge=1, le=100, description="Maximum number of results to return.")
    source_filter: str | None = Field(None, description="Optional path/file to filter the search.")
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class SqlSearchRequest(BaseModel):
    """Schema for an SQL search request (POST /search/sql)."""

    query: str = Field(..., description="The semantic SQL search query.")
    limit: int = Field(5, ge=1, le=100, description="Maximum number of results to return.")
    source_filter: str | None = Field(None, description="Optional database to filter the search.")
    # Forwarded as the "min_time" extra filter only when set by the caller.
    min_time: float | None = Field(None, description="Minimum execution time in ms.")
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class SearchResponse(BaseModel):
    """Schema for returning standard search results."""

    # Echo of the original query alongside the ranked result list.
    query: str
    results: list[SearchResult]
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class SqlSearchResponse(BaseModel):
    """Schema for returning SQL search results."""

    # Echo of the original query alongside the ranked result list.
    query: str
    results: list[SqlSearchResult]
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@app.get("/health")
async def health_check() -> dict[str, str]:
    """Report liveness plus the embedding model configured for each engine."""
    if not _services:
        raise HTTPException(status_code=503, detail="Search service initializing or failed")

    per_engine = {
        f"{name}_model": cfg.model_name for name, cfg in settings.engines.items()
    }
    return {"status": "healthy", **per_engine}
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@app.post("/search/md", response_model=SearchResponse)
async def search_md(request: SearchRequest) -> SearchResponse:
    """Run a hybrid document search in a worker thread and return ranked hits."""
    service = _services.get("md")
    if not service:
        raise HTTPException(status_code=503, detail="Document search service is not initialized.")

    try:
        # Offload the synchronous embedding + LanceDB work to a thread so the
        # event loop keeps serving other requests.
        hits = await asyncio.to_thread(
            service.execute_query,
            request.query,
            request.source_filter,
            request.limit,
            extra_filters={},
        )
        return SearchResponse(query=request.query, results=hits)  # type: ignore
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Search execution failed: {e}") from e
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
@app.post("/search/sql", response_model=SqlSearchResponse)
async def search_sql(request: SqlSearchRequest) -> SqlSearchResponse:
    """Run a hybrid SQL-log search in a worker thread and return ranked hits."""
    service = _services.get("sql")
    if not service:
        raise HTTPException(status_code=503, detail="SQL search service is not initialized.")

    # Forward the optional execution-time floor only when the caller set it.
    filters = {"min_time": request.min_time} if request.min_time is not None else {}

    try:
        hits = await asyncio.to_thread(
            service.execute_query,
            request.query,
            request.source_filter,
            request.limit,
            extra_filters=filters,
        )
        return SqlSearchResponse(query=request.query, results=hits)  # type: ignore
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Search execution failed: {e}") from e
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import asyncio

from mcp.server.fastmcp import FastMCP

from dbs_vector.api.state import _services
|
|
4
|
+
|
|
5
|
+
# Stateless streamable-HTTP MCP server; mounted under /mcp by the FastAPI app
# and also runnable standalone over stdio (see the CLI `mcp` command).
mcp = FastMCP(
    "dbs-vector",
    stateless_http=True,
    streamable_http_path="/",
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@mcp.tool()
async def search_documents(query: str, limit: int = 5, source_filter: str | None = None) -> str:
    """
    Search indexed codebase documents (Markdown, Python, etc.) via semantic vector search.

    Args:
        query: The semantic search query or concept you are looking for.
        limit: Maximum number of results to return.
        source_filter: Optional file path or pattern to restrict the search.

    Returns:
        A human-readable block of ranked results, or an error/empty message.
    """
    service = _services.get("md")
    if not service:
        return "Error: Document search service ('md' engine) is not initialized."

    try:
        # execute_query is synchronous (embedding + LanceDB I/O). Run it in a
        # worker thread so the shared event loop is not blocked while this MCP
        # server is mounted inside the FastAPI app — mirrors api/main.py.
        results = await asyncio.to_thread(
            service.execute_query,
            query=query,
            source_filter=source_filter,
            limit=limit,
            extra_filters={},
        )

        if not results:
            return f"No results found for query: '{query}'"

        output = [f"Found {len(results)} results for '{query}':\n"]
        for res in results:
            # Vector hits carry a distance; pure full-text hits do not.
            dist_str = f"{res.distance:.4f}" if res.distance is not None else "N/A (FTS)"
            chunk = res.chunk
            output.append(
                f"--- Result (Score: {dist_str}) ---\n"
                f"Source: {chunk.source}\n"
                f"Content:\n{chunk.text}\n"
            )

        return "\n".join(output)

    except Exception as e:
        # MCP tools report failures as strings rather than raising.
        return f"Search execution failed: {e}"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@mcp.tool()
async def search_sql_logs(
    query: str, limit: int = 5, source_filter: str | None = None, min_time: float | None = None
) -> str:
    """
    Search indexed SQL query logs via semantic vector search.

    Args:
        query: The semantic search query, e.g. 'find user by email' or partial SQL.
        limit: Maximum number of results to return.
        source_filter: Optional database name to restrict the search.
        min_time: Minimum execution time in milliseconds.

    Returns:
        A human-readable block of ranked results, or an error/empty message.
    """
    service = _services.get("sql")
    if not service:
        return "Error: SQL search service ('sql' engine) is not initialized."

    extra_filters = {}
    if min_time is not None:
        extra_filters["min_time"] = min_time

    try:
        # Run the synchronous search in a worker thread so the event loop is
        # not blocked (consistent with the HTTP endpoints in api/main.py).
        results = await asyncio.to_thread(
            service.execute_query,
            query=query,
            source_filter=source_filter,
            limit=limit,
            extra_filters=extra_filters,
        )

        if not results:
            return f"No results found for query: '{query}'"

        output = [f"Found {len(results)} results for '{query}':\n"]
        for res in results:
            # Vector hits carry a distance; pure full-text hits do not.
            dist_str = f"{res.distance:.4f}" if res.distance is not None else "N/A (FTS)"
            chunk = res.chunk
            output.append(
                f"--- Result (Score: {dist_str}) ---\n"
                f"Source Database: {chunk.source}\n"
                f"Execution Time: {chunk.execution_time_ms}ms (Calls: {chunk.calls})\n"
                f"SQL Query:\n{chunk.raw_query}\n"
            )

        return "\n".join(output)

    except Exception as e:
        # MCP tools report failures as strings rather than raising.
        return f"Search execution failed: {e}"
|
dbs_vector/api/state.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from loguru import logger
|
|
2
|
+
|
|
3
|
+
from dbs_vector.cli import _build_dependencies
|
|
4
|
+
from dbs_vector.config import settings
|
|
5
|
+
from dbs_vector.services.search import SearchService
|
|
6
|
+
|
|
7
|
+
# Global service instances holding the initialized models and databases.
# Keyed by engine name (e.g. "md", "sql"); populated by initialize_services()
# and read by the API endpoints and the MCP tools.
_services: dict[str, SearchService] = {}
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def initialize_services() -> dict[str, SearchService]:
    """Build one SearchService per configured engine and publish them in _services."""
    _services.clear()
    for name in settings.engines:
        logger.info("Loading engine: {}", name)
        resolved = _build_dependencies(name)
        _services[name] = SearchService(resolved.embedder, resolved.store)
    return _services
|
dbs_vector/cli.py
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
import os

# Suppress Hugging Face progress bars BEFORE any imports that might use huggingface_hub.
# These must run at module import time, ahead of the embedding-engine imports below.
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"  # Disable hf-transfer to use standard downloads
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
|
|
8
|
+
|
|
9
|
+
from typing import Annotated, Any, NamedTuple
|
|
10
|
+
|
|
11
|
+
import typer
|
|
12
|
+
from loguru import logger
|
|
13
|
+
|
|
14
|
+
from dbs_vector.config import settings
|
|
15
|
+
from dbs_vector.core.registry import ComponentRegistry
|
|
16
|
+
from dbs_vector.infrastructure.embeddings.mlx_engine import MLXEmbedder
|
|
17
|
+
from dbs_vector.infrastructure.storage.lancedb_engine import LanceDBStore
|
|
18
|
+
from dbs_vector.logger import configure_logger
|
|
19
|
+
from dbs_vector.services.ingestion import IngestionService
|
|
20
|
+
from dbs_vector.services.search import SearchService
|
|
21
|
+
|
|
22
|
+
# CLI entry point; rich markup disabled so help text renders as plain text.
app = typer.Typer(
    help="dbs-vector: Local Arrow-Native Codebase Search Engine",
    no_args_is_help=True,
    rich_markup_mode=None,
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class EngineDeps(NamedTuple):
    """Container for resolved engine dependencies.

    Produced by _build_dependencies(): concretely an MLXEmbedder, a
    LanceDBStore, and a registry-resolved chunker instance, typed ``Any``
    here rather than with the concrete classes.
    """

    embedder: Any
    store: Any
    chunker: Any
    workflow: str
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def version_callback(value: bool) -> None:
    """Eager --version handler: print the package version and stop."""
    if not value:
        return
    from dbs_vector import __version__

    typer.echo(f"dbs-vector version: {__version__}")
    raise typer.Exit()
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@app.callback()
def main(
    ctx: typer.Context,
    config_file: Annotated[
        str, typer.Option("--config-file", "-c", help="Path to config.yaml file.")
    ] = "config.yaml",
    version: Annotated[
        bool | None,
        typer.Option(
            "--version",
            "-v",
            help="Show the version and exit.",
            callback=version_callback,
            is_eager=True,
        ),
    ] = None,
) -> None:
    """dbs-vector: Configurable Arrow-Native Search Engine."""
    # Skip config loading when just showing help or version (no subcommand invoked)
    if ctx.invoked_subcommand is None:
        return

    from dbs_vector.config import load_settings

    # Export to environment so uvicorn subprocesses (in API mode) inherit it.
    # (os is already imported at module top; no local re-import needed.)
    os.environ["DBS_CONFIG_FILE"] = config_file

    # Refresh the in-process settings singleton in place. Copy every declared
    # field (instead of a hand-maintained attribute list) so newly added
    # Settings fields are never silently dropped here.
    new_settings = load_settings(config_file)
    for field_name in type(new_settings).model_fields:
        setattr(settings, field_name, getattr(new_settings, field_name))

    # Configure logger based on settings
    configure_logger(level=settings.log_level, serialize=settings.log_serialize)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _build_dependencies(
    engine_name: str,
    query_override: str | None = None,
    url_override: str | None = None,
) -> EngineDeps:
    """Dependency Injection Factory driven by config.yaml configuration.

    Args:
        engine_name: Key into ``settings.engines`` (e.g. "md", "sql").
        query_override: Optional SQL passed through to the chunker config.
        url_override: Optional base URL passed through to the chunker config.

    Returns:
        EngineDeps with the embedder, store, chunker, and workflow name.

    Raises:
        ValueError: If ``engine_name`` is not configured.
        typer.Exit: On a LanceDB "Schema mismatch" error (exit code 1).
    """
    if engine_name not in settings.engines:
        raise ValueError(
            f"Unknown engine: '{engine_name}'. Check {os.environ.get('DBS_CONFIG_FILE', 'config.yaml')}."
        )

    config = settings.engines[engine_name]

    # Initialize Embedder
    embedder = MLXEmbedder(
        model_name=config.model_name,
        max_token_length=config.max_token_length,
        dimension=config.vector_dimension,
        passage_prefix=config.passage_prefix,
        query_prefix=config.query_prefix,
    )

    # Resolve components via Registry (string keys from config.yaml)
    MapperClass = ComponentRegistry.get_mapper(config.mapper_type)
    ChunkerClass = ComponentRegistry.get_chunker(config.chunker_type)

    mapper = MapperClass(vector_dimension=config.vector_dimension)

    chunker = ChunkerClass(
        **config.chunker_kwargs(query_override=query_override, url_override=url_override)
    )

    try:
        store = LanceDBStore(
            db_path=settings.db_path,
            table_name=config.table_name,
            vector_dimension=config.vector_dimension,
            mapper=mapper,
            nprobes=settings.nprobes,
        )
    except ValueError as e:
        # Surface schema mismatches as a friendly CLI error; re-raise the rest.
        if "Schema mismatch" in str(e):
            typer.echo(f"\n[!] Database Error: {e}", err=True)
            raise typer.Exit(code=1) from e
        raise

    return EngineDeps(embedder=embedder, store=store, chunker=chunker, workflow=config.workflow)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
@app.command()
def ingest(
    path: Annotated[
        str, typer.Argument(help="Directory path, glob pattern, or JSON file to ingest.")
    ],
    engine_name: Annotated[
        str, typer.Option("--type", "-t", help="The type of data to ingest (md, sql, etc).")
    ] = "md",
    rebuild: Annotated[
        bool,
        typer.Option(
            "--rebuild", "-r", help="Drop the existing vector store and recreate it from scratch."
        ),
    ] = False,
    force: Annotated[
        bool,
        typer.Option("--force", "-f", help="Bypass confirmation prompt when rebuilding."),
    ] = False,
    query: Annotated[
        str | None,
        typer.Option("--query", "-q", help="Custom SQL query for DuckDB extraction."),
    ] = None,
) -> None:
    """Ingests documents or SQL query logs into the Arrow-native vector store."""
    if engine_name not in settings.engines:
        typer.echo(
            f"Error: Unknown engine type '{engine_name}'. Available: {list(settings.engines.keys())}"
        )
        raise typer.Exit(code=1)

    if rebuild and not force:
        # abort=True exits the command if the user declines.
        typer.confirm(
            f"Are you sure you want to completely rebuild the '{engine_name}' vector store? This will erase all existing data.",
            abort=True,
        )

    # HTTP(S) paths are forwarded to the chunker as a base-URL override.
    url_override = path if path.startswith(("http://", "https://")) else None
    deps = _build_dependencies(engine_name, query_override=query, url_override=url_override)
    service = IngestionService(deps.chunker, deps.embedder, deps.store, deps.workflow)
    service.ingest_directory(path, rebuild=rebuild)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
@app.command()
def search(
    query: Annotated[
        str, typer.Argument(help="The text or SQL to search for within the indexed data.")
    ],
    engine_name: Annotated[
        str, typer.Option("--type", "-t", help="The type of data to search (md, sql, etc).")
    ] = "md",
    filter_source: Annotated[
        str | None,
        typer.Option("--source", "-s", help="Filter results to a specific file or database."),
    ] = None,
    limit: Annotated[
        int, typer.Option("--limit", "-l", help="Maximum number of search results to return.")
    ] = 5,
    # SQL specific filters
    min_time: Annotated[
        float | None, typer.Option("--min-time", help="(SQL Only) Minimum execution time in ms.")
    ] = None,
) -> None:
    """Searches the vector store using hybrid retrieval (Vector + Full-Text)."""
    if engine_name not in settings.engines:
        typer.echo(
            f"Error: Unknown engine type '{engine_name}'. Available: {list(settings.engines.keys())}"
        )
        raise typer.Exit(code=1)

    deps = _build_dependencies(engine_name)
    service = SearchService(deps.embedder, deps.store)

    # --min-time is applied only for the "sql" engine; silently ignored otherwise.
    extra_filters = {}
    if min_time is not None and engine_name == "sql":
        extra_filters["min_time"] = min_time

    results = service.execute_query(
        query, source_filter=filter_source, limit=limit, extra_filters=extra_filters
    )
    service.print_results(results)
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
@app.command()
def serve(
    host: Annotated[
        str, typer.Option("--host", "-h", help="Host to bind the API server to.")
    ] = "127.0.0.1",
    port: Annotated[
        int, typer.Option("--port", "-p", help="Port to bind the API server to.")
    ] = 8000,
    reload: Annotated[
        bool, typer.Option("--reload", help="Enable auto-reload for development.")
    ] = False,
) -> None:
    """Starts the asynchronous FastAPI search server."""
    # Deferred import: uvicorn is only needed for this command.
    import uvicorn

    logger.info("Starting dbs-vector API server at http://{}:{}", host, port)
    # Pass the app as an import string so --reload can re-import it in workers.
    uvicorn.run("dbs_vector.api.main:app", host=host, port=port, reload=reload)
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
@app.command()
def mcp(
    config_file: Annotated[
        str, typer.Option("--config-file", "-c", help="Path to config.yaml file.")
    ] = "config.yaml",
) -> None:
    """Starts the FastMCP standard input/output (stdio) server for integrations.

    Loads all configured engines eagerly, then blocks serving MCP over stdio
    until the client disconnects.
    """
    from dbs_vector.api.mcp_server import mcp as mcp_server
    from dbs_vector.api.state import initialize_services

    # Export to environment so the MCP subprocess inherits it.
    # (os is already imported at module top; the previous local re-import
    # needlessly shadowed it.)
    os.environ["DBS_CONFIG_FILE"] = config_file

    logger.info("Initializing MLX Embedders and LanceDB connections")
    try:
        initialize_services()
    except Exception as e:
        logger.error("Failed to initialize search services: {}", e)
        raise

    mcp_server.run()
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
# Allow running the CLI directly (python -m dbs_vector.cli) in addition to
# the installed entry point.
if __name__ == "__main__":
    app()
|
dbs_vector/config.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import yaml
|
|
5
|
+
from loguru import logger
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class EngineConfig(BaseModel):
    """Configuration specific to a single AI engine/data source."""

    # Human-readable summary of what this engine indexes.
    description: str
    # Embedding model identifier plus its vector/token limits (fed to MLXEmbedder).
    model_name: str
    vector_dimension: int
    max_token_length: int
    # LanceDB table name and registry keys for the mapper/chunker implementations.
    table_name: str
    mapper_type: str
    chunker_type: str
    # Max characters per chunk for document-style chunkers; <= 0 means "use
    # the chunker's own default" (see chunker_kwargs).
    chunk_max_chars: int

    # Task Prefixes for models like embeddinggemma
    query_prefix: str = ""
    passage_prefix: str = ""
    workflow: str = "default"
    duckdb_query: str | None = None

    # API chunker fields
    api_base_url: str = ""
    api_key: str = ""
    api_page_size: int = 200
    api_since_days: int = 15
    api_timeout_sec: int = 30
    api_min_execution_ms: float = 0.0
    api_database: str = ""

    def chunker_kwargs(
        self, query_override: str | None = None, url_override: str | None = None
    ) -> dict[str, object]:
        """Resolve chunker initialization kwargs from engine config.

        Args:
            query_override: Replaces ``duckdb_query`` (duckdb chunker) or is
                added as ``custom_query`` (api chunker).
            url_override: Replaces ``api_base_url`` (api chunker only).

        Returns:
            Keyword arguments for the chunker class selected by ``chunker_type``.
        """
        if self.chunker_type == "duckdb":
            return {"query": query_override or self.duckdb_query}
        if self.chunker_type == "api":
            kwargs: dict[str, object] = {
                "base_url": url_override or self.api_base_url,
                "api_key": self.api_key,
                "page_size": self.api_page_size,
                "since_days": self.api_since_days,
                "timeout_sec": self.api_timeout_sec,
                "min_execution_ms": self.api_min_execution_ms,
            }
            # Optional keys are only included when explicitly configured.
            if self.api_database:
                kwargs["database"] = self.api_database
            if query_override:
                kwargs["custom_query"] = query_override
            return kwargs
        # Document-style chunkers: pass max_chars only when a positive limit is set.
        if self.chunk_max_chars > 0:
            return {"max_chars": self.chunk_max_chars}
        return {}
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class Settings(BaseSettings):
    """Global configuration for the dbs-vector application.

    Defaults may be overridden by DBS_-prefixed environment variables / .env
    (pydantic-settings) and then by config.yaml via load_settings().
    """

    # General System
    db_path: str = "./lancedb_dbs_vector"  # LanceDB root directory
    batch_size: int = 64  # batch size (used by ingestion/embedding; confirm at call sites)
    nprobes: int = 20  # ANN probe count passed to LanceDBStore
    log_level: str = "INFO"
    log_serialize: bool = False

    # Engines dictionary
    engines: dict[str, EngineConfig] = {}

    model_config = SettingsConfigDict(env_prefix="DBS_", env_file=".env")
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def load_settings(config_file: str | None = None) -> Settings:
    """Loads base settings and overrides them from config.yaml."""
    base = Settings()

    # Resolve the config path: explicit arg > DBS_CONFIG_FILE env > default.
    if config_file is None:
        config_file = os.getenv("DBS_CONFIG_FILE", "config.yaml")
    yaml_path = Path(config_file)

    if not yaml_path.exists():
        logger.warning("Configuration file '{}' not found, using defaults", yaml_path)
        return base

    with open(yaml_path, encoding="utf-8") as fh:
        data = yaml.safe_load(fh)
    if not data:
        return base

    # Override System configuration (only keys Settings already declares).
    system_section = data.get("system")
    if isinstance(system_section, dict):
        for key, value in system_section.items():
            if hasattr(base, key):
                setattr(base, key, value)

    # Override Engine configuration.
    engines_section = data.get("engines")
    if isinstance(engines_section, dict):
        base.engines = {name: EngineConfig(**raw) for name, raw in engines_section.items()}

    return base
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
# Global singleton instance
|
|
110
|
+
settings = load_settings()
|
|
File without changes
|