dbs-vector 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. dbs_vector-0.5.1/.github/workflows/ci.yml +32 -0
  2. dbs_vector-0.5.1/.github/workflows/release.yml +94 -0
  3. dbs_vector-0.5.1/.gitignore +16 -0
  4. dbs_vector-0.5.1/.python-version +1 -0
  5. dbs_vector-0.5.1/CLAUDE.md +95 -0
  6. dbs_vector-0.5.1/LICENSE.md +10 -0
  7. dbs_vector-0.5.1/PKG-INFO +178 -0
  8. dbs_vector-0.5.1/README.md +149 -0
  9. dbs_vector-0.5.1/config.yaml +52 -0
  10. dbs_vector-0.5.1/config.yaml.example +141 -0
  11. dbs_vector-0.5.1/docs/README.md +97 -0
  12. dbs_vector-0.5.1/docs/README_API.md +141 -0
  13. dbs_vector-0.5.1/docs/README_ARCHITECTURE.md +77 -0
  14. dbs_vector-0.5.1/docs/README_DOCS.md +68 -0
  15. dbs_vector-0.5.1/docs/README_MCP.md +369 -0
  16. dbs_vector-0.5.1/docs/README_REMOTE_SQL_API.md +515 -0
  17. dbs_vector-0.5.1/docs/README_SQL.md +66 -0
  18. dbs_vector-0.5.1/docs/README_WORKFLOW.md +88 -0
  19. dbs_vector-0.5.1/docs/README_duckdb.md +219 -0
  20. dbs_vector-0.5.1/pyproject.toml +103 -0
  21. dbs_vector-0.5.1/scripts/README.md +96 -0
  22. dbs_vector-0.5.1/scripts/check_remote_api.py +596 -0
  23. dbs_vector-0.5.1/src/dbs_vector/__init__.py +6 -0
  24. dbs_vector-0.5.1/src/dbs_vector/api/__init__.py +0 -0
  25. dbs_vector-0.5.1/src/dbs_vector/api/main.py +137 -0
  26. dbs_vector-0.5.1/src/dbs_vector/api/mcp_server.py +100 -0
  27. dbs_vector-0.5.1/src/dbs_vector/api/state.py +18 -0
  28. dbs_vector-0.5.1/src/dbs_vector/cli.py +264 -0
  29. dbs_vector-0.5.1/src/dbs_vector/config.py +110 -0
  30. dbs_vector-0.5.1/src/dbs_vector/core/__init__.py +0 -0
  31. dbs_vector-0.5.1/src/dbs_vector/core/models.py +107 -0
  32. dbs_vector-0.5.1/src/dbs_vector/core/ports.py +89 -0
  33. dbs_vector-0.5.1/src/dbs_vector/core/registry.py +36 -0
  34. dbs_vector-0.5.1/src/dbs_vector/infrastructure/__init__.py +0 -0
  35. dbs_vector-0.5.1/src/dbs_vector/infrastructure/chunking/__init__.py +0 -0
  36. dbs_vector-0.5.1/src/dbs_vector/infrastructure/chunking/api.py +139 -0
  37. dbs_vector-0.5.1/src/dbs_vector/infrastructure/chunking/document.py +100 -0
  38. dbs_vector-0.5.1/src/dbs_vector/infrastructure/chunking/duckdb.py +119 -0
  39. dbs_vector-0.5.1/src/dbs_vector/infrastructure/chunking/sql.py +65 -0
  40. dbs_vector-0.5.1/src/dbs_vector/infrastructure/embeddings/__init__.py +0 -0
  41. dbs_vector-0.5.1/src/dbs_vector/infrastructure/embeddings/mlx_engine.py +106 -0
  42. dbs_vector-0.5.1/src/dbs_vector/infrastructure/storage/__init__.py +0 -0
  43. dbs_vector-0.5.1/src/dbs_vector/infrastructure/storage/lancedb_engine.py +145 -0
  44. dbs_vector-0.5.1/src/dbs_vector/infrastructure/storage/mappers.py +174 -0
  45. dbs_vector-0.5.1/src/dbs_vector/logger.py +43 -0
  46. dbs_vector-0.5.1/src/dbs_vector/py.typed +0 -0
  47. dbs_vector-0.5.1/src/dbs_vector/services/__init__.py +0 -0
  48. dbs_vector-0.5.1/src/dbs_vector/services/ingestion.py +127 -0
  49. dbs_vector-0.5.1/src/dbs_vector/services/search.py +76 -0
  50. dbs_vector-0.5.1/tests/conftest.py +18 -0
  51. dbs_vector-0.5.1/tests/integration/test_api.py +315 -0
  52. dbs_vector-0.5.1/tests/integration/test_cli.py +560 -0
  53. dbs_vector-0.5.1/tests/integration/test_ingestion.py +80 -0
  54. dbs_vector-0.5.1/tests/unit/api/test_mcp_server.py +115 -0
  55. dbs_vector-0.5.1/tests/unit/test_api_chunker.py +246 -0
  56. dbs_vector-0.5.1/tests/unit/test_chunker.py +170 -0
  57. dbs_vector-0.5.1/tests/unit/test_config.py +299 -0
  58. dbs_vector-0.5.1/tests/unit/test_duckdb_chunker.py +78 -0
  59. dbs_vector-0.5.1/tests/unit/test_lancedb_engine.py +478 -0
  60. dbs_vector-0.5.1/tests/unit/test_mappers.py +412 -0
  61. dbs_vector-0.5.1/tests/unit/test_mlx_engine.py +312 -0
  62. dbs_vector-0.5.1/tests/unit/test_registry.py +75 -0
  63. dbs_vector-0.5.1/tests/unit/test_search_service.py +251 -0
  64. dbs_vector-0.5.1/tests/unit/test_sql_chunker.py +388 -0
  65. dbs_vector-0.5.1/uv.lock +2766 -0
@@ -0,0 +1,32 @@
1
+ name: CI
2
+
3
+ on:
4
+ pull_request:
5
+ branches: [main]
6
+ push:
7
+ branches: [main]
8
+
9
+ permissions:
10
+ contents: read
11
+
12
+ jobs:
13
+ checks:
14
+ name: Lint, Typecheck, Tests
15
+ runs-on: macos-14
16
+ steps:
17
+ - name: Checkout
18
+ uses: actions/checkout@v4
19
+
20
+ - name: Setup Python
21
+ uses: actions/setup-python@v5
22
+ with:
23
+ python-version: "3.12"
24
+
25
+ - name: Setup uv
26
+ uses: astral-sh/setup-uv@v4
27
+
28
+ - name: Install dependencies
29
+ run: uv sync --frozen
30
+
31
+ - name: Run quality checks
32
+ run: uv run poe github-release-test
@@ -0,0 +1,94 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ permissions:
9
+ contents: read
10
+
11
+ jobs:
12
+ verify_main:
13
+ name: Verify tag points to main
14
+ runs-on: ubuntu-latest
15
+ steps:
16
+ - name: Checkout
17
+ uses: actions/checkout@v4
18
+ with:
19
+ fetch-depth: 0
20
+
21
+ - name: Ensure tagged commit is on main
22
+ run: |
23
+ git fetch origin main
24
+ if ! git branch -r --contains "$GITHUB_SHA" | grep -q "origin/main"; then
25
+ echo "Tag commit is not on main. Aborting release."
26
+ exit 1
27
+ fi
28
+
29
+ build:
30
+ name: Build distribution
31
+ runs-on: ubuntu-latest
32
+ needs: verify_main
33
+ steps:
34
+ - name: Checkout
35
+ uses: actions/checkout@v4
36
+
37
+ - name: Setup Python
38
+ uses: actions/setup-python@v5
39
+ with:
40
+ python-version: "3.12"
41
+
42
+ - name: Install build tooling
43
+ run: python -m pip install --upgrade pip build twine
44
+
45
+ - name: Build package
46
+ run: python -m build
47
+
48
+ - name: Validate distributions
49
+ run: twine check dist/*
50
+
51
+ - name: Upload dist artifact
52
+ uses: actions/upload-artifact@v4
53
+ with:
54
+ name: python-dist
55
+ path: dist/
56
+
57
+ github_release:
58
+ name: Create GitHub Release
59
+ runs-on: ubuntu-latest
60
+ needs: build
61
+ permissions:
62
+ contents: write
63
+ steps:
64
+ - name: Download dist artifact
65
+ uses: actions/download-artifact@v4
66
+ with:
67
+ name: python-dist
68
+ path: dist/
69
+
70
+ - name: Publish GitHub Release
71
+ uses: softprops/action-gh-release@v2
72
+ with:
73
+ generate_release_notes: true
74
+ files: dist/*
75
+
76
+ publish_pypi:
77
+ name: Publish to PyPI
78
+ runs-on: ubuntu-latest
79
+ needs: build
80
+ if: vars.PYPI_PUBLISH == 'true'
81
+ environment:
82
+ name: pypi
83
+ permissions:
84
+ id-token: write
85
+ contents: read
86
+ steps:
87
+ - name: Download dist artifact
88
+ uses: actions/download-artifact@v4
89
+ with:
90
+ name: python-dist
91
+ path: dist/
92
+
93
+ - name: Publish package distributions to PyPI
94
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,16 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+ .DS_Store
9
+
10
+ # Virtual environments
11
+ .venv
12
+ .ayder
13
+ .coverage
14
+
15
+ # test db
16
+ lancedb_dbs_vector/
@@ -0,0 +1 @@
1
+ 3.12
@@ -0,0 +1,95 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Commands
6
+
7
+ ```bash
8
+ # Install dependencies
9
+ uv sync
10
+
11
+ # Run full validation suite (format, lint, typecheck, test)
12
+ uv run poe check
13
+
14
+ # Run tests only
15
+ uv run poe test
16
+
17
+ # Run tests with coverage
18
+ uv run poe test-cov
19
+
20
+ # Run a single test file
21
+ uv run pytest tests/unit/test_chunker.py -v
22
+
23
+ # Run a single test by name
24
+ uv run pytest tests/unit/test_chunker.py::test_function_name -v
25
+
26
+ # Lint and format
27
+ uv run poe lint
28
+ uv run poe format
29
+
30
+ # Type checking
31
+ uv run poe typecheck
32
+
33
+ # CLI commands
34
+ uv run dbs-vector ingest "docs/" --type md
35
+ uv run dbs-vector ingest "queries.json" --type sql
36
+ uv run dbs-vector search "query text" --type md
37
+ uv run dbs-vector serve
38
+ uv run dbs-vector mcp
39
+ ```
40
+
41
+ ## Architecture
42
+
43
+ This is a Clean Architecture, configuration-driven RAG search engine for Apple Silicon (MLX). The dependency flow is: **CLI/API → Services → Core Protocols → Infrastructure**.
44
+
45
+ ### Layers
46
+
47
+ **`core/`** — Pure domain layer with no external dependencies.
48
+ - `models.py`: Pydantic domain models (`Document`, `Chunk`, `SqlChunk`, `SearchResult`, `SqlSearchResult`).
49
+ - `ports.py`: Protocol interfaces (`IEmbedder`, `IChunker`, `IVectorStore`, `IStoreMapper`) that decouple infrastructure from services.
50
+ - `registry.py`: `ComponentRegistry` maps string names from `config.yaml` to concrete mapper/chunker classes.
51
+
52
+ **`infrastructure/`** — Concrete implementations of the core protocols.
53
+ - `embeddings/mlx_engine.py`: `MLXEmbedder` — runs models on Apple GPU via MLX, casts tensors to NumPy via Unified Memory. Includes a process-level `_MODEL_CACHE` dict to avoid reloading models.
54
+ - `storage/lancedb_engine.py`: `LanceDBStore` — Arrow-native storage; uses `IVF_PQ` vector index + Tantivy FTS. Schema mismatch on startup means `--rebuild --force` is needed.
55
+ - `storage/mappers.py`: `DocumentMapper` and `SqlMapper` convert domain chunks ↔ PyArrow `RecordBatch` for zero-copy ingestion and back to domain models on retrieval.
56
+ - `chunking/document.py`: `DocumentChunker` — uses `markdown-it-py` to parse `.md` semantically (code fences are kept atomic); falls back to naive splitting for `.txt`.
57
+ - `chunking/sql.py`: `SqlChunker` — parses JSON slow query log format.
58
+
59
+ **`services/`** — Orchestration, depend only on protocols.
60
+ - `ingestion.py`: `IngestionService` — reads files, chunks, deduplicates via SHA-256 content hashes, batches, embeds, and streams to `IVectorStore`.
61
+ - `search.py`: `SearchService` — embeds query and delegates hybrid search; also formats results for CLI output.
62
+
63
+ **`api/`** — FastAPI HTTP server + MCP stdio server.
64
+ - `main.py`: FastAPI app with lifespan startup (loads all engines from config), `/health`, `/search/md`, `/search/sql` endpoints. MCP is mounted at `/mcp` (SSE).
65
+ - `mcp_server.py`: `FastMCP` server exposing search as MCP tools.
66
+ - `state.py`: Shared `_services` dict (engine name → `SearchService`) used by both the FastAPI lifespan and the MCP command.
67
+
68
+ **`config.py`** — `Settings` (pydantic-settings) + `EngineConfig` per engine. Loaded from `config.yaml` at startup. Env prefix: `DBS_`. The path can be overridden with `--config-file` or `DBS_CONFIG_FILE` env var.
69
+
70
+ ### Configuration-Driven Registry Pattern
71
+
72
+ Adding a new engine type requires:
73
+ 1. Implement `IChunker` and `IStoreMapper` concrete classes.
74
+ 2. Register them in `ComponentRegistry._chunkers` / `ComponentRegistry._mappers`.
75
+ 3. Add the engine block to `config.yaml` with appropriate `mapper_type`, `chunker_type`, `model_name`, etc.
76
+
77
+ No changes to services, CLI, or API are needed.
78
+
79
+ ### Key Design Details
80
+
81
+ - **Deduplication**: Content hashes (SHA-256 truncated to 16 chars) are computed at the file level and stored per chunk. Ingestion skips chunks whose hash already exists in the store.
82
+ - **Schema evolution**: If `LanceDBStore` detects a schema mismatch on startup, it raises a descriptive `ValueError` that the CLI surfaces with a `--rebuild --force` hint.
83
+ - **Asymmetric embeddings**: `MLXEmbedder` prepends different prefixes for passages (`passage_prefix`) vs queries (`query_prefix`), supporting instruction-tuned models like `embeddinggemma`.
84
+ - **Thread safety**: `MLXEmbedder` uses a per-model `threading.Lock`; FastAPI offloads synchronous search to `asyncio.to_thread`.
85
+ - **IVF_PQ indexing**: Only created when `total_rows > 256`; partitions scale as `sqrt(total_rows)` capped at 256.
86
+
87
+ ### Test Structure
88
+
89
+ ```
90
+ tests/
91
+ unit/ # Mock-based, no I/O — fast
92
+ integration/ # Uses tmpdir LanceDB + real chunkers/mappers
93
+ ```
94
+
95
+ Mypy ignores `lancedb`, `pyarrow`, and `mlx_embeddings` (no stubs). Ruff enforces pycodestyle, pyflakes, bugbear, pyupgrade, and isort at line length 100.
@@ -0,0 +1,10 @@
1
+ # GNU General Public License v3.0 or later
2
+
3
+ SPDX-License-Identifier: GPL-3.0-or-later
4
+
5
+ This project is licensed under the **GNU General Public License, version 3 or (at your option) any later version**.
6
+
7
+ - Official GPL-3.0 license text: https://www.gnu.org/licenses/gpl-3.0.txt
8
+ - GPL license overview: https://www.gnu.org/licenses/gpl-3.0.en.html
9
+
10
+ By contributing to or distributing this project, you agree that it is provided under the terms of GPL-3.0-or-later.
@@ -0,0 +1,178 @@
1
+ Metadata-Version: 2.4
2
+ Name: dbs-vector
3
+ Version: 0.5.1
4
+ Summary: High-performance, local RAG search engine and MCP/API server for Apple Silicon
5
+ License-File: LICENSE.md
6
+ Requires-Python: >=3.12
7
+ Requires-Dist: fastapi>=0.133.0
8
+ Requires-Dist: lance==1.2.1
9
+ Requires-Dist: lancedb==0.29.2
10
+ Requires-Dist: loguru>=0.7.3
11
+ Requires-Dist: markdown-it-py>=4.0.0
12
+ Requires-Dist: mcp>=1.26.0
13
+ Requires-Dist: mlx-embeddings>=0.0.5
14
+ Requires-Dist: numpy>=2.2.3
15
+ Requires-Dist: polars==1.38.1
16
+ Requires-Dist: pyarrow==19.0.1
17
+ Requires-Dist: pydantic-settings==2.13.1
18
+ Requires-Dist: pydantic>=2.10.6
19
+ Requires-Dist: pyyaml>=6.0.3
20
+ Requires-Dist: tantivy==0.25.1
21
+ Requires-Dist: typer>=0.15.1
22
+ Requires-Dist: types-pyyaml>=6.0.12.20250915
23
+ Requires-Dist: uvicorn>=0.41.0
24
+ Provides-Extra: api
25
+ Requires-Dist: httpx>=0.27.0; extra == 'api'
26
+ Provides-Extra: sql
27
+ Requires-Dist: duckdb>=1.2.0; extra == 'sql'
28
+ Description-Content-Type: text/markdown
29
+
30
+ # ⚡️ dbs-vector
31
+
32
+ **A High-Performance, Arrow-Native Local Codebase Search Engine for Apple Silicon.**
33
+
34
+ `dbs-vector` is an optimized Retrieval-Augmented Generation (RAG) search engine designed specifically for macOS (M-Series chips). It bypasses traditional Python serialization bottlenecks by utilizing Apple's Unified Memory Architecture (UMA) and pure Apache Arrow data pipelines.
35
+
36
+ It enables lightning-fast, hybrid (Vector + Full-Text) search across your local codebase, entirely offline.
37
+
38
+ ---
39
+
40
+ ## ✨ Features
41
+
42
+ * **Zero-Copy Memory Pipelines**: Uses **MLX** to compute embeddings on the Mac GPU, casting the resulting tensors instantly into NumPy arrays via Unified Memory without costly `float` object instantiation.
43
+ * **Arrow-Native Storage**: Uses **LanceDB** to stream ingestion batches directly to disk via PyArrow, avoiding the massive memory overhead of JSON and dictionary comprehensions.
44
+ * **Hybrid Retrieval**: Simultaneously executes Approximate Nearest Neighbor (ANN) cosine vector search and native **Tantivy** Full-Text Search (FTS).
45
+ * **Code-Aware Chunking**: Intelligently splits documentation and code, respecting markdown fences so that code blocks are indexed as atomic units.
46
+ * **Production Robustness**: Features dynamic `IVF_PQ` indexing, Rust-level predicate pushdown (metadata filtering), and dataset compaction for delta-updates.
47
+ * **Remote SQL API Ingestion**: `ApiChunker` pulls pre-aggregated slow-query records from any networked backend over HTTP, replacing local files with a paginated REST API — no changes to the embedding or storage layers.
48
+
49
+ ## 🚀 Installation
50
+
51
+ This project is built using `uv`, an extremely fast Python package manager.
52
+
53
+ 1. **Clone the repository:**
54
+ ```bash
55
+ git clone https://github.com/dbsmedya/dbs-vector.git
56
+ cd dbs-vector
57
+ ```
58
+
59
+ 2. **Install the CLI package:**
60
+ ```bash
61
+ uv sync
62
+ ```
63
+ *This automatically sets up the environment and creates the `dbs-vector` executable in your path.*
64
+
65
+ Optional extras unlock additional ingestion sources:
66
+
67
+ ```bash
68
+ uv sync --extra sql # DuckDB ingestion
69
+ uv sync --extra api # Remote HTTP API ingestion
70
+ ```
71
+
72
+ ## 💻 Usage
73
+
74
+ The application is entirely configuration-driven via `config.yaml`. It supports multiple data types (Engines) such as Markdown and SQL.
75
+
76
+ ### Global Options
77
+ * `--config-file` / `-c`: Path to your custom `config.yaml` (Defaults to `./config.yaml`).
78
+
79
+ ### Ingesting Documents
80
+ Index markdown files, JSON SQL logs, DuckDB analytical files, or a remote HTTP slow-query API into the local vector store.
81
+
82
+ ```bash
83
+ # Ingest all markdown files (default)
84
+ uv run dbs-vector ingest "docs/"
85
+
86
+ # Ingest SQL slow query logs (JSON format)
87
+ uv run dbs-vector ingest "slow_queries.json" --type sql
88
+
89
+ # Ingest SQL slow queries from DuckDB (High-Performance Columnar)
90
+ uv run dbs-vector ingest "slow_queries.duckdb" --type sql --rebuild
91
+
92
+ # Ingest from a remote HTTP API (paginated GET)
93
+ uv run dbs-vector ingest "https://slow-log-api.internal/api/v1" --type sql-api
94
+
95
+ # Ingest via a custom SELECT sent to the remote API
96
+ uv run dbs-vector ingest "https://slow-log-api.internal/api/v1" --type sql-api \
97
+ --query "SELECT fingerprint_id AS id, sanitized_sql AS text, db AS source, ..."
98
+ ```
99
+
100
+ ### Searching the Codebase
101
+ Execute queries against your chosen engine.
102
+
103
+ ```bash
104
+ # Semantic hybrid search across markdown
105
+ uv run dbs-vector search "What is MLX?"
106
+
107
+ # Find similar slow queries (SQL clustering)
108
+ uv run dbs-vector search "SELECT * FROM users" --type sql --min-time 1000
109
+ ```
110
+
111
+ > **Indexes are built automatically at the end of every `ingest` run.** Two indexes are created:
112
+ > - **IVF_PQ** vector index (only when the table has > 256 rows)
113
+ > - **Tantivy FTS** inverted index (required for hybrid search)
114
+ >
115
+ > If you see a *"Cannot perform full text search unless an INVERTED index has been created"* error, it means the FTS index was never built for your table. Fix it by re-running ingestion — use `--rebuild` to wipe and re-index from scratch:
116
+ > ```bash
117
+ > uv run dbs-vector ingest "docs/" --rebuild
118
+ > uv run dbs-vector ingest "slow_queries.json" --type sql --rebuild
119
+ > ```
120
+
121
+ For detailed specifications on each ingestion source, see:
122
+ 👉 **[SQL Engine Documentation](docs/README_SQL.md)**
123
+ 👉 **[DuckDB Ingestion Documentation](docs/README_duckdb.md)**
124
+ 👉 **[Remote SQL API Ingestion](docs/README_REMOTE_SQL_API.md)**
125
+
126
+ ### Async API Server
127
+ The application includes a high-performance FastAPI server to expose the search engine over HTTP.
128
+
129
+ ```bash
130
+ # Start the API server (loads all engines defined in config.yaml)
131
+ uv run dbs-vector serve
132
+ ```
133
+
134
+ For full API specifications and swagger documentation, see:
135
+ 👉 **[API Usage & Documentation](docs/README_API.md)**
136
+
137
+ ### Model Context Protocol (MCP) Server
138
+ `dbs-vector` includes a built-in MCP server compatible with Claude Desktop, Claude Code (CLI), and Cursor. Supports both **stdio** (no server required) and **Streamable HTTP** (shared instance, saves VRAM).
139
+
140
+ ```bash
141
+ # stdio — each client spawns its own process
142
+ uv run dbs-vector mcp
143
+
144
+ # HTTP — one shared server for all clients
145
+ uv run dbs-vector serve # MCP endpoint: http://127.0.0.1:8000/mcp
146
+ ```
147
+
148
+ For setup instructions for all clients and transport types, see:
149
+ 👉 **[MCP Server Documentation](docs/README_MCP.md)**
150
+
151
+ ## 🏗 Architecture & Roadmap
152
+
153
+ `dbs-vector` is built upon strict **Clean Architecture** and **SOLID** principles. It utilizes a **Configuration-Driven Registry Pattern**, allowing new data engines (e.g., LibCST, Logs) to be added by simply updating `config.yaml` and registering new mappers/chunkers without modifying core orchestration logic.
154
+
155
+ ### Specialized Gemma Workflows
156
+ The project is optimized for instruction-tuned models like `embeddinggemma`. It supports asymmetric task-based workflows defined in `config.yaml`:
157
+ * **Markdown (Search Result)**: Uses the `task: search result` prefix for queries and `title: none | text: ` for documents, maximizing retrieval accuracy for RAG.
158
+ * **SQL (Clustering)**: Uses the `task: clustering` prefix for both ingestion and search, enabling high-precision semantic grouping of logically similar slow queries.
159
+
160
+ ### Future Hardware Support (CUDA/TPU)
161
+ Because the core RAG orchestration relies exclusively on the `IEmbedder` Protocol, the application is strictly hardware-agnostic at its core. While currently optimized for Apple Silicon via `MLXEmbedder`, future deployment to cloud GPUs or Linux environments simply requires implementing a new `CudaEmbedder` (using PyTorch/Transformers) that returns standard NumPy arrays. No changes to the ingestion, storage, or API layers are necessary to support new hardware accelerators. We currently have no access to CUDA hardware for testing.
162
+
163
+ For a deep dive into the engineering, the Apache Arrow ingestion lifecycle, and the blueprint for AST/LibCST integration, see the official documentation:
164
+
165
+ 👉 **[Architecture & Engineering Documentation](docs/README.md)**
166
+
167
+ ## 🛠 Development
168
+
169
+ To contribute to `dbs-vector`, the project utilizes `poethepoet` as a task runner and implements strict quality gates (Ruff & Mypy).
170
+
171
+ ```bash
172
+ # Run the entire validation suite (Format, Lint, Typecheck, Pytest)
173
+ uv run poe check
174
+
175
+ # Run tests with coverage
176
+ uv run poe test-cov
177
+ ```
178
+
@@ -0,0 +1,149 @@
1
+ # ⚡️ dbs-vector
2
+
3
+ **A High-Performance, Arrow-Native Local Codebase Search Engine for Apple Silicon.**
4
+
5
+ `dbs-vector` is an optimized Retrieval-Augmented Generation (RAG) search engine designed specifically for macOS (M-Series chips). It bypasses traditional Python serialization bottlenecks by utilizing Apple's Unified Memory Architecture (UMA) and pure Apache Arrow data pipelines.
6
+
7
+ It enables lightning-fast, hybrid (Vector + Full-Text) search across your local codebase, entirely offline.
8
+
9
+ ---
10
+
11
+ ## ✨ Features
12
+
13
+ * **Zero-Copy Memory Pipelines**: Uses **MLX** to compute embeddings on the Mac GPU, casting the resulting tensors instantly into NumPy arrays via Unified Memory without costly `float` object instantiation.
14
+ * **Arrow-Native Storage**: Uses **LanceDB** to stream ingestion batches directly to disk via PyArrow, avoiding the massive memory overhead of JSON and dictionary comprehensions.
15
+ * **Hybrid Retrieval**: Simultaneously executes Approximate Nearest Neighbor (ANN) cosine vector search and native **Tantivy** Full-Text Search (FTS).
16
+ * **Code-Aware Chunking**: Intelligently splits documentation and code, respecting markdown fences so that code blocks are indexed as atomic units.
17
+ * **Production Robustness**: Features dynamic `IVF_PQ` indexing, Rust-level predicate pushdown (metadata filtering), and dataset compaction for delta-updates.
18
+ * **Remote SQL API Ingestion**: `ApiChunker` pulls pre-aggregated slow-query records from any networked backend over HTTP, replacing local files with a paginated REST API — no changes to the embedding or storage layers.
19
+
20
+ ## 🚀 Installation
21
+
22
+ This project is built using `uv`, an extremely fast Python package manager.
23
+
24
+ 1. **Clone the repository:**
25
+ ```bash
26
+ git clone https://github.com/dbsmedya/dbs-vector.git
27
+ cd dbs-vector
28
+ ```
29
+
30
+ 2. **Install the CLI package:**
31
+ ```bash
32
+ uv sync
33
+ ```
34
+ *This automatically sets up the environment and creates the `dbs-vector` executable in your path.*
35
+
36
+ Optional extras unlock additional ingestion sources:
37
+
38
+ ```bash
39
+ uv sync --extra sql # DuckDB ingestion
40
+ uv sync --extra api # Remote HTTP API ingestion
41
+ ```
42
+
43
+ ## 💻 Usage
44
+
45
+ The application is entirely configuration-driven via `config.yaml`. It supports multiple data types (Engines) such as Markdown and SQL.
46
+
47
+ ### Global Options
48
+ * `--config-file` / `-c`: Path to your custom `config.yaml` (Defaults to `./config.yaml`).
49
+
50
+ ### Ingesting Documents
51
+ Index markdown files, JSON SQL logs, DuckDB analytical files, or a remote HTTP slow-query API into the local vector store.
52
+
53
+ ```bash
54
+ # Ingest all markdown files (default)
55
+ uv run dbs-vector ingest "docs/"
56
+
57
+ # Ingest SQL slow query logs (JSON format)
58
+ uv run dbs-vector ingest "slow_queries.json" --type sql
59
+
60
+ # Ingest SQL slow queries from DuckDB (High-Performance Columnar)
61
+ uv run dbs-vector ingest "slow_queries.duckdb" --type sql --rebuild
62
+
63
+ # Ingest from a remote HTTP API (paginated GET)
64
+ uv run dbs-vector ingest "https://slow-log-api.internal/api/v1" --type sql-api
65
+
66
+ # Ingest via a custom SELECT sent to the remote API
67
+ uv run dbs-vector ingest "https://slow-log-api.internal/api/v1" --type sql-api \
68
+ --query "SELECT fingerprint_id AS id, sanitized_sql AS text, db AS source, ..."
69
+ ```
70
+
71
+ ### Searching the Codebase
72
+ Execute queries against your chosen engine.
73
+
74
+ ```bash
75
+ # Semantic hybrid search across markdown
76
+ uv run dbs-vector search "What is MLX?"
77
+
78
+ # Find similar slow queries (SQL clustering)
79
+ uv run dbs-vector search "SELECT * FROM users" --type sql --min-time 1000
80
+ ```
81
+
82
+ > **Indexes are built automatically at the end of every `ingest` run.** Two indexes are created:
83
+ > - **IVF_PQ** vector index (only when the table has > 256 rows)
84
+ > - **Tantivy FTS** inverted index (required for hybrid search)
85
+ >
86
+ > If you see a *"Cannot perform full text search unless an INVERTED index has been created"* error, it means the FTS index was never built for your table. Fix it by re-running ingestion — use `--rebuild` to wipe and re-index from scratch:
87
+ > ```bash
88
+ > uv run dbs-vector ingest "docs/" --rebuild
89
+ > uv run dbs-vector ingest "slow_queries.json" --type sql --rebuild
90
+ > ```
91
+
92
+ For detailed specifications on each ingestion source, see:
93
+ 👉 **[SQL Engine Documentation](docs/README_SQL.md)**
94
+ 👉 **[DuckDB Ingestion Documentation](docs/README_duckdb.md)**
95
+ 👉 **[Remote SQL API Ingestion](docs/README_REMOTE_SQL_API.md)**
96
+
97
+ ### Async API Server
98
+ The application includes a high-performance FastAPI server to expose the search engine over HTTP.
99
+
100
+ ```bash
101
+ # Start the API server (loads all engines defined in config.yaml)
102
+ uv run dbs-vector serve
103
+ ```
104
+
105
+ For full API specifications and swagger documentation, see:
106
+ 👉 **[API Usage & Documentation](docs/README_API.md)**
107
+
108
+ ### Model Context Protocol (MCP) Server
109
+ `dbs-vector` includes a built-in MCP server compatible with Claude Desktop, Claude Code (CLI), and Cursor. Supports both **stdio** (no server required) and **Streamable HTTP** (shared instance, saves VRAM).
110
+
111
+ ```bash
112
+ # stdio — each client spawns its own process
113
+ uv run dbs-vector mcp
114
+
115
+ # HTTP — one shared server for all clients
116
+ uv run dbs-vector serve # MCP endpoint: http://127.0.0.1:8000/mcp
117
+ ```
118
+
119
+ For setup instructions for all clients and transport types, see:
120
+ 👉 **[MCP Server Documentation](docs/README_MCP.md)**
121
+
122
+ ## 🏗 Architecture & Roadmap
123
+
124
+ `dbs-vector` is built upon strict **Clean Architecture** and **SOLID** principles. It utilizes a **Configuration-Driven Registry Pattern**, allowing new data engines (e.g., LibCST, Logs) to be added by simply updating `config.yaml` and registering new mappers/chunkers without modifying core orchestration logic.
125
+
126
+ ### Specialized Gemma Workflows
127
+ The project is optimized for instruction-tuned models like `embeddinggemma`. It supports asymmetric task-based workflows defined in `config.yaml`:
128
+ * **Markdown (Search Result)**: Uses the `task: search result` prefix for queries and `title: none | text: ` for documents, maximizing retrieval accuracy for RAG.
129
+ * **SQL (Clustering)**: Uses the `task: clustering` prefix for both ingestion and search, enabling high-precision semantic grouping of logically similar slow queries.
130
+
131
+ ### Future Hardware Support (CUDA/TPU)
132
+ Because the core RAG orchestration relies exclusively on the `IEmbedder` Protocol, the application is strictly hardware-agnostic at its core. While currently optimized for Apple Silicon via `MLXEmbedder`, future deployment to cloud GPUs or Linux environments simply requires implementing a new `CudaEmbedder` (using PyTorch/Transformers) that returns standard NumPy arrays. No changes to the ingestion, storage, or API layers are necessary to support new hardware accelerators. We currently have no access to CUDA hardware for testing.
133
+
134
+ For a deep dive into the engineering, the Apache Arrow ingestion lifecycle, and the blueprint for AST/LibCST integration, see the official documentation:
135
+
136
+ 👉 **[Architecture & Engineering Documentation](docs/README.md)**
137
+
138
+ ## 🛠 Development
139
+
140
+ To contribute to `dbs-vector`, the project utilizes `poethepoet` as a task runner and implements strict quality gates (Ruff & Mypy).
141
+
142
+ ```bash
143
+ # Run the entire validation suite (Format, Lint, Typecheck, Pytest)
144
+ uv run poe check
145
+
146
+ # Run tests with coverage
147
+ uv run poe test-cov
148
+ ```
149
+
@@ -0,0 +1,52 @@
1
+ system:
2
+ db_path: "./lancedb_dbs_vector"
3
+ batch_size: 64
4
+ nprobes: 20
5
+
6
+ engines:
7
+ md:
8
+ description: "Markdown & Prose Document Engine (Gemma Search)"
9
+ model_name: "mlx-community/embeddinggemma-300m-bf16"
10
+ vector_dimension: 768
11
+ max_token_length: 2048
12
+ table_name: "knowledge_vault"
13
+ mapper_type: "document"
14
+ chunker_type: "document"
15
+ chunk_max_chars: 1000
16
+ passage_prefix: "title: none | text: "
17
+ query_prefix: "task: search result | query: "
18
+ workflow: "md_search"
19
+
20
+ sql:
21
+ description: "SQL Slow Query Log Engine (Gemma Clustering)"
22
+ model_name: "mlx-community/embeddinggemma-300m-bf16"
23
+ vector_dimension: 768
24
+ max_token_length: 2048
25
+ table_name: "query_vault"
26
+ mapper_type: "sql"
27
+ chunker_type: "duckdb"
28
+ chunk_max_chars: 0
29
+ passage_prefix: "task: clustering | query: "
30
+ query_prefix: "task: clustering | query: "
31
+ workflow: "sql_clustering"
32
+
33
+ sql-api:
34
+ description: "Remote slow query log via HTTP API"
35
+ model_name: "mlx-community/embeddinggemma-300m-bf16"
36
+ vector_dimension: 768
37
+ max_token_length: 2048
38
+ table_name: "query_vault"
39
+ mapper_type: "sql"
40
+ chunker_type: "api"
41
+ chunk_max_chars: 0
42
+ passage_prefix: "task: clustering | query: "
43
+ query_prefix: "task: clustering | query: "
44
+ workflow: "sql_clustering"
45
+ # --- ApiChunker-specific fields ---
46
+ api_base_url: "http://localhost:8080/api/v1"
47
+ api_key: ""  # REQUIRED — set via the DBS_API_KEY env var; never commit real keys to version control (the previously committed key must be rotated)
48
+ api_page_size: 200 # records per GET request (max 1000)
49
+ api_since_days: 60 # lower bound on latest_ts (default: 15)
50
+ api_timeout_sec: 30 # HTTP request timeout in seconds
51
+ api_min_execution_ms: 0 # filter: skip queries below this threshold
52
+ api_database: "" # leave empty to fetch all databases