dbs-vector 0.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dbs_vector-0.5.1/.github/workflows/ci.yml +32 -0
- dbs_vector-0.5.1/.github/workflows/release.yml +94 -0
- dbs_vector-0.5.1/.gitignore +16 -0
- dbs_vector-0.5.1/.python-version +1 -0
- dbs_vector-0.5.1/CLAUDE.md +95 -0
- dbs_vector-0.5.1/LICENSE.md +10 -0
- dbs_vector-0.5.1/PKG-INFO +178 -0
- dbs_vector-0.5.1/README.md +149 -0
- dbs_vector-0.5.1/config.yaml +52 -0
- dbs_vector-0.5.1/config.yaml.example +141 -0
- dbs_vector-0.5.1/docs/README.md +97 -0
- dbs_vector-0.5.1/docs/README_API.md +141 -0
- dbs_vector-0.5.1/docs/README_ARCHITECTURE.md +77 -0
- dbs_vector-0.5.1/docs/README_DOCS.md +68 -0
- dbs_vector-0.5.1/docs/README_MCP.md +369 -0
- dbs_vector-0.5.1/docs/README_REMOTE_SQL_API.md +515 -0
- dbs_vector-0.5.1/docs/README_SQL.md +66 -0
- dbs_vector-0.5.1/docs/README_WORKFLOW.md +88 -0
- dbs_vector-0.5.1/docs/README_duckdb.md +219 -0
- dbs_vector-0.5.1/pyproject.toml +103 -0
- dbs_vector-0.5.1/scripts/README.md +96 -0
- dbs_vector-0.5.1/scripts/check_remote_api.py +596 -0
- dbs_vector-0.5.1/src/dbs_vector/__init__.py +6 -0
- dbs_vector-0.5.1/src/dbs_vector/api/__init__.py +0 -0
- dbs_vector-0.5.1/src/dbs_vector/api/main.py +137 -0
- dbs_vector-0.5.1/src/dbs_vector/api/mcp_server.py +100 -0
- dbs_vector-0.5.1/src/dbs_vector/api/state.py +18 -0
- dbs_vector-0.5.1/src/dbs_vector/cli.py +264 -0
- dbs_vector-0.5.1/src/dbs_vector/config.py +110 -0
- dbs_vector-0.5.1/src/dbs_vector/core/__init__.py +0 -0
- dbs_vector-0.5.1/src/dbs_vector/core/models.py +107 -0
- dbs_vector-0.5.1/src/dbs_vector/core/ports.py +89 -0
- dbs_vector-0.5.1/src/dbs_vector/core/registry.py +36 -0
- dbs_vector-0.5.1/src/dbs_vector/infrastructure/__init__.py +0 -0
- dbs_vector-0.5.1/src/dbs_vector/infrastructure/chunking/__init__.py +0 -0
- dbs_vector-0.5.1/src/dbs_vector/infrastructure/chunking/api.py +139 -0
- dbs_vector-0.5.1/src/dbs_vector/infrastructure/chunking/document.py +100 -0
- dbs_vector-0.5.1/src/dbs_vector/infrastructure/chunking/duckdb.py +119 -0
- dbs_vector-0.5.1/src/dbs_vector/infrastructure/chunking/sql.py +65 -0
- dbs_vector-0.5.1/src/dbs_vector/infrastructure/embeddings/__init__.py +0 -0
- dbs_vector-0.5.1/src/dbs_vector/infrastructure/embeddings/mlx_engine.py +106 -0
- dbs_vector-0.5.1/src/dbs_vector/infrastructure/storage/__init__.py +0 -0
- dbs_vector-0.5.1/src/dbs_vector/infrastructure/storage/lancedb_engine.py +145 -0
- dbs_vector-0.5.1/src/dbs_vector/infrastructure/storage/mappers.py +174 -0
- dbs_vector-0.5.1/src/dbs_vector/logger.py +43 -0
- dbs_vector-0.5.1/src/dbs_vector/py.typed +0 -0
- dbs_vector-0.5.1/src/dbs_vector/services/__init__.py +0 -0
- dbs_vector-0.5.1/src/dbs_vector/services/ingestion.py +127 -0
- dbs_vector-0.5.1/src/dbs_vector/services/search.py +76 -0
- dbs_vector-0.5.1/tests/conftest.py +18 -0
- dbs_vector-0.5.1/tests/integration/test_api.py +315 -0
- dbs_vector-0.5.1/tests/integration/test_cli.py +560 -0
- dbs_vector-0.5.1/tests/integration/test_ingestion.py +80 -0
- dbs_vector-0.5.1/tests/unit/api/test_mcp_server.py +115 -0
- dbs_vector-0.5.1/tests/unit/test_api_chunker.py +246 -0
- dbs_vector-0.5.1/tests/unit/test_chunker.py +170 -0
- dbs_vector-0.5.1/tests/unit/test_config.py +299 -0
- dbs_vector-0.5.1/tests/unit/test_duckdb_chunker.py +78 -0
- dbs_vector-0.5.1/tests/unit/test_lancedb_engine.py +478 -0
- dbs_vector-0.5.1/tests/unit/test_mappers.py +412 -0
- dbs_vector-0.5.1/tests/unit/test_mlx_engine.py +312 -0
- dbs_vector-0.5.1/tests/unit/test_registry.py +75 -0
- dbs_vector-0.5.1/tests/unit/test_search_service.py +251 -0
- dbs_vector-0.5.1/tests/unit/test_sql_chunker.py +388 -0
- dbs_vector-0.5.1/uv.lock +2766 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
pull_request:
|
|
5
|
+
branches: [main]
|
|
6
|
+
push:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
permissions:
|
|
10
|
+
contents: read
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
checks:
|
|
14
|
+
name: Lint, Typecheck, Tests
|
|
15
|
+
runs-on: macos-14
|
|
16
|
+
steps:
|
|
17
|
+
- name: Checkout
|
|
18
|
+
uses: actions/checkout@v4
|
|
19
|
+
|
|
20
|
+
- name: Setup Python
|
|
21
|
+
uses: actions/setup-python@v5
|
|
22
|
+
with:
|
|
23
|
+
python-version: "3.12"
|
|
24
|
+
|
|
25
|
+
- name: Setup uv
|
|
26
|
+
uses: astral-sh/setup-uv@v4
|
|
27
|
+
|
|
28
|
+
- name: Install dependencies
|
|
29
|
+
run: uv sync --frozen
|
|
30
|
+
|
|
31
|
+
- name: Run quality checks
|
|
32
|
+
run: uv run poe github-release-test
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
contents: read
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
verify_main:
|
|
13
|
+
name: Verify tag points to main
|
|
14
|
+
runs-on: ubuntu-latest
|
|
15
|
+
steps:
|
|
16
|
+
- name: Checkout
|
|
17
|
+
uses: actions/checkout@v4
|
|
18
|
+
with:
|
|
19
|
+
fetch-depth: 0
|
|
20
|
+
|
|
21
|
+
- name: Ensure tagged commit is on main
|
|
22
|
+
run: |
|
|
23
|
+
git fetch origin main
|
|
24
|
+
if ! git branch -r --contains "$GITHUB_SHA" | grep -q "origin/main"; then
|
|
25
|
+
echo "Tag commit is not on main. Aborting release."
|
|
26
|
+
exit 1
|
|
27
|
+
fi
|
|
28
|
+
|
|
29
|
+
build:
|
|
30
|
+
name: Build distribution
|
|
31
|
+
runs-on: ubuntu-latest
|
|
32
|
+
needs: verify_main
|
|
33
|
+
steps:
|
|
34
|
+
- name: Checkout
|
|
35
|
+
uses: actions/checkout@v4
|
|
36
|
+
|
|
37
|
+
- name: Setup Python
|
|
38
|
+
uses: actions/setup-python@v5
|
|
39
|
+
with:
|
|
40
|
+
python-version: "3.12"
|
|
41
|
+
|
|
42
|
+
- name: Install build tooling
|
|
43
|
+
run: python -m pip install --upgrade pip build twine
|
|
44
|
+
|
|
45
|
+
- name: Build package
|
|
46
|
+
run: python -m build
|
|
47
|
+
|
|
48
|
+
- name: Validate distributions
|
|
49
|
+
run: twine check dist/*
|
|
50
|
+
|
|
51
|
+
- name: Upload dist artifact
|
|
52
|
+
uses: actions/upload-artifact@v4
|
|
53
|
+
with:
|
|
54
|
+
name: python-dist
|
|
55
|
+
path: dist/
|
|
56
|
+
|
|
57
|
+
github_release:
|
|
58
|
+
name: Create GitHub Release
|
|
59
|
+
runs-on: ubuntu-latest
|
|
60
|
+
needs: build
|
|
61
|
+
permissions:
|
|
62
|
+
contents: write
|
|
63
|
+
steps:
|
|
64
|
+
- name: Download dist artifact
|
|
65
|
+
uses: actions/download-artifact@v4
|
|
66
|
+
with:
|
|
67
|
+
name: python-dist
|
|
68
|
+
path: dist/
|
|
69
|
+
|
|
70
|
+
- name: Publish GitHub Release
|
|
71
|
+
uses: softprops/action-gh-release@v2
|
|
72
|
+
with:
|
|
73
|
+
generate_release_notes: true
|
|
74
|
+
files: dist/*
|
|
75
|
+
|
|
76
|
+
publish_pypi:
|
|
77
|
+
name: Publish to PyPI
|
|
78
|
+
runs-on: ubuntu-latest
|
|
79
|
+
needs: build
|
|
80
|
+
if: vars.PYPI_PUBLISH == 'true'
|
|
81
|
+
environment:
|
|
82
|
+
name: pypi
|
|
83
|
+
permissions:
|
|
84
|
+
id-token: write
|
|
85
|
+
contents: read
|
|
86
|
+
steps:
|
|
87
|
+
- name: Download dist artifact
|
|
88
|
+
uses: actions/download-artifact@v4
|
|
89
|
+
with:
|
|
90
|
+
name: python-dist
|
|
91
|
+
path: dist/
|
|
92
|
+
|
|
93
|
+
- name: Publish package distributions to PyPI
|
|
94
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Commands
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
# Install dependencies
|
|
9
|
+
uv sync
|
|
10
|
+
|
|
11
|
+
# Run full validation suite (format, lint, typecheck, test)
|
|
12
|
+
uv run poe check
|
|
13
|
+
|
|
14
|
+
# Run tests only
|
|
15
|
+
uv run poe test
|
|
16
|
+
|
|
17
|
+
# Run tests with coverage
|
|
18
|
+
uv run poe test-cov
|
|
19
|
+
|
|
20
|
+
# Run a single test file
|
|
21
|
+
uv run pytest tests/unit/test_chunker.py -v
|
|
22
|
+
|
|
23
|
+
# Run a single test by name
|
|
24
|
+
uv run pytest tests/unit/test_chunker.py::test_function_name -v
|
|
25
|
+
|
|
26
|
+
# Lint and format
|
|
27
|
+
uv run poe lint
|
|
28
|
+
uv run poe format
|
|
29
|
+
|
|
30
|
+
# Type checking
|
|
31
|
+
uv run poe typecheck
|
|
32
|
+
|
|
33
|
+
# CLI commands
|
|
34
|
+
uv run dbs-vector ingest "docs/" --type md
|
|
35
|
+
uv run dbs-vector ingest "queries.json" --type sql
|
|
36
|
+
uv run dbs-vector search "query text" --type md
|
|
37
|
+
uv run dbs-vector serve
|
|
38
|
+
uv run dbs-vector mcp
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Architecture
|
|
42
|
+
|
|
43
|
+
This is a Clean Architecture, configuration-driven RAG search engine for Apple Silicon (MLX). The dependency flow is: **CLI/API → Services → Core Protocols → Infrastructure**.
|
|
44
|
+
|
|
45
|
+
### Layers
|
|
46
|
+
|
|
47
|
+
**`core/`** — Pure domain layer with no external dependencies.
|
|
48
|
+
- `models.py`: Pydantic domain models (`Document`, `Chunk`, `SqlChunk`, `SearchResult`, `SqlSearchResult`).
|
|
49
|
+
- `ports.py`: Protocol interfaces (`IEmbedder`, `IChunker`, `IVectorStore`, `IStoreMapper`) that decouple infrastructure from services.
|
|
50
|
+
- `registry.py`: `ComponentRegistry` maps string names from `config.yaml` to concrete mapper/chunker classes.
|
|
51
|
+
|
|
52
|
+
**`infrastructure/`** — Concrete implementations of the core protocols.
|
|
53
|
+
- `embeddings/mlx_engine.py`: `MLXEmbedder` — runs models on Apple GPU via MLX, casts tensors to NumPy via Unified Memory. Includes a process-level `_MODEL_CACHE` dict to avoid reloading models.
|
|
54
|
+
- `storage/lancedb_engine.py`: `LanceDBStore` — Arrow-native storage; uses `IVF_PQ` vector index + Tantivy FTS. Schema mismatch on startup means `--rebuild --force` is needed.
|
|
55
|
+
- `storage/mappers.py`: `DocumentMapper` and `SqlMapper` convert domain chunks ↔ PyArrow `RecordBatch` for zero-copy ingestion and back to domain models on retrieval.
|
|
56
|
+
- `chunking/document.py`: `DocumentChunker` — uses `markdown-it-py` to parse `.md` semantically (code fences are kept atomic); falls back to naive splitting for `.txt`.
|
|
57
|
+
- `chunking/sql.py`: `SqlChunker` — parses JSON slow query log format.
|
|
58
|
+
|
|
59
|
+
**`services/`** — Orchestration layer; depends only on protocols.
|
|
60
|
+
- `ingestion.py`: `IngestionService` — reads files, chunks, deduplicates via SHA-256 content hashes, batches, embeds, and streams to `IVectorStore`.
|
|
61
|
+
- `search.py`: `SearchService` — embeds query and delegates hybrid search; also formats results for CLI output.
|
|
62
|
+
|
|
63
|
+
**`api/`** — FastAPI HTTP server + MCP stdio server.
|
|
64
|
+
- `main.py`: FastAPI app with lifespan startup (loads all engines from config), `/health`, `/search/md`, `/search/sql` endpoints. MCP is mounted at `/mcp` (SSE).
|
|
65
|
+
- `mcp_server.py`: `FastMCP` server exposing search as MCP tools.
|
|
66
|
+
- `state.py`: Shared `_services` dict (engine name → `SearchService`) used by both the FastAPI lifespan and the MCP command.
|
|
67
|
+
|
|
68
|
+
**`config.py`** — `Settings` (pydantic-settings) + `EngineConfig` per engine. Loaded from `config.yaml` at startup. Env prefix: `DBS_`. The path can be overridden with `--config-file` or `DBS_CONFIG_FILE` env var.
|
|
69
|
+
|
|
70
|
+
### Configuration-Driven Registry Pattern
|
|
71
|
+
|
|
72
|
+
Adding a new engine type requires:
|
|
73
|
+
1. Implement `IChunker` and `IStoreMapper` concrete classes.
|
|
74
|
+
2. Register them in `ComponentRegistry._chunkers` / `ComponentRegistry._mappers`.
|
|
75
|
+
3. Add the engine block to `config.yaml` with appropriate `mapper_type`, `chunker_type`, `model_name`, etc.
|
|
76
|
+
|
|
77
|
+
No changes to services, CLI, or API are needed.
|
|
78
|
+
|
|
79
|
+
### Key Design Details
|
|
80
|
+
|
|
81
|
+
- **Deduplication**: Content hashes (SHA-256 truncated to 16 chars) are computed at the file level and stored per chunk. Ingestion skips chunks whose hash already exists in the store.
|
|
82
|
+
- **Schema evolution**: If `LanceDBStore` detects a schema mismatch on startup, it raises a descriptive `ValueError` that the CLI surfaces with a `--rebuild --force` hint.
|
|
83
|
+
- **Asymmetric embeddings**: `MLXEmbedder` prepends different prefixes for passages (`passage_prefix`) vs queries (`query_prefix`), supporting instruction-tuned models like `embeddinggemma`.
|
|
84
|
+
- **Thread safety**: `MLXEmbedder` uses a per-model `threading.Lock`; FastAPI offloads synchronous search to `asyncio.to_thread`.
|
|
85
|
+
- **IVF_PQ indexing**: Only created when `total_rows > 256`; partitions scale as `sqrt(total_rows)` capped at 256.
|
|
86
|
+
|
|
87
|
+
### Test Structure
|
|
88
|
+
|
|
89
|
+
```
|
|
90
|
+
tests/
|
|
91
|
+
unit/ # Mock-based, no I/O — fast
|
|
92
|
+
integration/ # Uses tmpdir LanceDB + real chunkers/mappers
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Mypy ignores `lancedb`, `pyarrow`, and `mlx_embeddings` (no stubs). Ruff enforces pycodestyle, pyflakes, bugbear, pyupgrade, and isort at line length 100.
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# GNU General Public License v3.0 or later
|
|
2
|
+
|
|
3
|
+
SPDX-License-Identifier: GPL-3.0-or-later
|
|
4
|
+
|
|
5
|
+
This project is licensed under the **GNU General Public License, version 3 or (at your option) any later version**.
|
|
6
|
+
|
|
7
|
+
- Official GPL-3.0 license text: https://www.gnu.org/licenses/gpl-3.0.txt
|
|
8
|
+
- GPL license overview: https://www.gnu.org/licenses/gpl-3.0.en.html
|
|
9
|
+
|
|
10
|
+
By contributing to or distributing this project, you agree that it is provided under the terms of GPL-3.0-or-later.
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dbs-vector
|
|
3
|
+
Version: 0.5.1
|
|
4
|
+
Summary: High-performance, local RAG search engine and MCP/API server for Apple Silicon
|
|
5
|
+
License-File: LICENSE.md
|
|
6
|
+
Requires-Python: >=3.12
|
|
7
|
+
Requires-Dist: fastapi>=0.133.0
|
|
8
|
+
Requires-Dist: lance==1.2.1
|
|
9
|
+
Requires-Dist: lancedb==0.29.2
|
|
10
|
+
Requires-Dist: loguru>=0.7.3
|
|
11
|
+
Requires-Dist: markdown-it-py>=4.0.0
|
|
12
|
+
Requires-Dist: mcp>=1.26.0
|
|
13
|
+
Requires-Dist: mlx-embeddings>=0.0.5
|
|
14
|
+
Requires-Dist: numpy>=2.2.3
|
|
15
|
+
Requires-Dist: polars==1.38.1
|
|
16
|
+
Requires-Dist: pyarrow==19.0.1
|
|
17
|
+
Requires-Dist: pydantic-settings==2.13.1
|
|
18
|
+
Requires-Dist: pydantic>=2.10.6
|
|
19
|
+
Requires-Dist: pyyaml>=6.0.3
|
|
20
|
+
Requires-Dist: tantivy==0.25.1
|
|
21
|
+
Requires-Dist: typer>=0.15.1
|
|
22
|
+
Requires-Dist: types-pyyaml>=6.0.12.20250915
|
|
23
|
+
Requires-Dist: uvicorn>=0.41.0
|
|
24
|
+
Provides-Extra: api
|
|
25
|
+
Requires-Dist: httpx>=0.27.0; extra == 'api'
|
|
26
|
+
Provides-Extra: sql
|
|
27
|
+
Requires-Dist: duckdb>=1.2.0; extra == 'sql'
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# ⚡️ dbs-vector
|
|
31
|
+
|
|
32
|
+
**A High-Performance, Arrow-Native Local Codebase Search Engine for Apple Silicon.**
|
|
33
|
+
|
|
34
|
+
`dbs-vector` is an optimized Retrieval-Augmented Generation (RAG) search engine designed specifically for macOS (M-Series chips). It bypasses traditional Python serialization bottlenecks by utilizing Apple's Unified Memory Architecture (UMA) and pure Apache Arrow data pipelines.
|
|
35
|
+
|
|
36
|
+
It enables lightning-fast, hybrid (Vector + Full-Text) search across your local codebase, entirely offline.
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
## ✨ Features
|
|
41
|
+
|
|
42
|
+
* **Zero-Copy Memory Pipelines**: Uses **MLX** to compute embeddings on the Mac GPU, casting the resulting tensors instantly into NumPy arrays via Unified Memory without costly `float` object instantiation.
|
|
43
|
+
* **Arrow-Native Storage**: Uses **LanceDB** to stream ingestion batches directly to disk via PyArrow, avoiding the massive memory overhead of JSON and dictionary comprehensions.
|
|
44
|
+
* **Hybrid Retrieval**: Simultaneously executes Approximate Nearest Neighbor (ANN) cosine vector search and native **Tantivy** Full-Text Search (FTS).
|
|
45
|
+
* **Code-Aware Chunking**: Intelligently splits documentation and code, respecting markdown fences so that code blocks are indexed as atomic units.
|
|
46
|
+
* **Production Robustness**: Features dynamic `IVF_PQ` indexing, Rust-level predicate pushdown (metadata filtering), and dataset compaction for delta-updates.
|
|
47
|
+
* **Remote SQL API Ingestion**: `ApiChunker` pulls pre-aggregated slow-query records from any networked backend over HTTP, replacing local files with a paginated REST API — no changes to the embedding or storage layers.
|
|
48
|
+
|
|
49
|
+
## 🚀 Installation
|
|
50
|
+
|
|
51
|
+
This project is built using `uv`, an extremely fast Python package manager.
|
|
52
|
+
|
|
53
|
+
1. **Clone the repository:**
|
|
54
|
+
```bash
|
|
55
|
+
git clone https://github.com/dbsmedya/dbs-vector.git
|
|
56
|
+
cd dbs-vector
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
2. **Install the CLI package:**
|
|
60
|
+
```bash
|
|
61
|
+
uv sync
|
|
62
|
+
```
|
|
63
|
+
*This automatically sets up the environment and creates the `dbs-vector` executable in your path.*
|
|
64
|
+
|
|
65
|
+
Optional extras unlock additional ingestion sources:
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
uv sync --extra sql # DuckDB ingestion
|
|
69
|
+
uv sync --extra api # Remote HTTP API ingestion
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## 💻 Usage
|
|
73
|
+
|
|
74
|
+
The application is entirely configuration-driven via `config.yaml`. It supports multiple data types (Engines) such as Markdown and SQL.
|
|
75
|
+
|
|
76
|
+
### Global Options
|
|
77
|
+
* `--config-file` / `-c`: Path to your custom `config.yaml` (Defaults to `./config.yaml`).
|
|
78
|
+
|
|
79
|
+
### Ingesting Documents
|
|
80
|
+
Index markdown files, JSON SQL logs, DuckDB analytical files, or a remote HTTP slow-query API into the local vector store.
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
# Ingest all markdown files (default)
|
|
84
|
+
uv run dbs-vector ingest "docs/"
|
|
85
|
+
|
|
86
|
+
# Ingest SQL slow query logs (JSON format)
|
|
87
|
+
uv run dbs-vector ingest "slow_queries.json" --type sql
|
|
88
|
+
|
|
89
|
+
# Ingest SQL slow queries from DuckDB (High-Performance Columnar)
|
|
90
|
+
uv run dbs-vector ingest "slow_queries.duckdb" --type sql --rebuild
|
|
91
|
+
|
|
92
|
+
# Ingest from a remote HTTP API (paginated GET)
|
|
93
|
+
uv run dbs-vector ingest "https://slow-log-api.internal/api/v1" --type sql-api
|
|
94
|
+
|
|
95
|
+
# Ingest via a custom SELECT sent to the remote API
|
|
96
|
+
uv run dbs-vector ingest "https://slow-log-api.internal/api/v1" --type sql-api \
|
|
97
|
+
--query "SELECT fingerprint_id AS id, sanitized_sql AS text, db AS source, ..."
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### Searching the Codebase
|
|
101
|
+
Execute queries against your chosen engine.
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
# Semantic hybrid search across markdown
|
|
105
|
+
uv run dbs-vector search "What is MLX?"
|
|
106
|
+
|
|
107
|
+
# Find similar slow queries (SQL clustering)
|
|
108
|
+
uv run dbs-vector search "SELECT * FROM users" --type sql --min-time 1000
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
> **Indexes are built automatically at the end of every `ingest` run.** Two indexes are created:
|
|
112
|
+
> - **IVF_PQ** vector index (only when the table has > 256 rows)
|
|
113
|
+
> - **Tantivy FTS** inverted index (required for hybrid search)
|
|
114
|
+
>
|
|
115
|
+
> If you see a *"Cannot perform full text search unless an INVERTED index has been created"* error, it means the FTS index was never built for your table. Fix it by re-running ingestion — use `--rebuild` to wipe and re-index from scratch:
|
|
116
|
+
> ```bash
|
|
117
|
+
> uv run dbs-vector ingest "docs/" --rebuild
|
|
118
|
+
> uv run dbs-vector ingest "slow_queries.json" --type sql --rebuild
|
|
119
|
+
> ```
|
|
120
|
+
|
|
121
|
+
For detailed specifications on each ingestion source, see:
|
|
122
|
+
👉 **[SQL Engine Documentation](docs/README_SQL.md)**
|
|
123
|
+
👉 **[DuckDB Ingestion Documentation](docs/README_duckdb.md)**
|
|
124
|
+
👉 **[Remote SQL API Ingestion](docs/README_REMOTE_SQL_API.md)**
|
|
125
|
+
|
|
126
|
+
### Async API Server
|
|
127
|
+
The application includes a high-performance FastAPI server to expose the search engine over HTTP.
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
# Start the API server (loads all engines defined in config.yaml)
|
|
131
|
+
uv run dbs-vector serve
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
For full API specifications and swagger documentation, see:
|
|
135
|
+
👉 **[API Usage & Documentation](docs/README_API.md)**
|
|
136
|
+
|
|
137
|
+
### Model Context Protocol (MCP) Server
|
|
138
|
+
`dbs-vector` includes a built-in MCP server compatible with Claude Desktop, Claude Code (CLI), and Cursor. It supports both **stdio** (no server required) and **Streamable HTTP** (shared instance, saves VRAM).
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
# stdio — each client spawns its own process
|
|
142
|
+
uv run dbs-vector mcp
|
|
143
|
+
|
|
144
|
+
# HTTP — one shared server for all clients
|
|
145
|
+
uv run dbs-vector serve # MCP endpoint: http://127.0.0.1:8000/mcp
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
For setup instructions for all clients and transport types, see:
|
|
149
|
+
👉 **[MCP Server Documentation](docs/README_MCP.md)**
|
|
150
|
+
|
|
151
|
+
## 🏗 Architecture & Roadmap
|
|
152
|
+
|
|
153
|
+
`dbs-vector` is built upon strict **Clean Architecture** and **SOLID** principles. It utilizes a **Configuration-Driven Registry Pattern**, allowing new data engines (e.g., LibCST, Logs) to be added by simply updating `config.yaml` and registering new mappers/chunkers without modifying core orchestration logic.
|
|
154
|
+
|
|
155
|
+
### Specialized Gemma Workflows
|
|
156
|
+
The project is optimized for instruction-tuned models like `embeddinggemma`. It supports asymmetric task-based workflows defined in `config.yaml`:
|
|
157
|
+
* **Markdown (Search Result)**: Uses the `task: search result` prefix for queries and `title: none | text: ` for documents, maximizing retrieval accuracy for RAG.
|
|
158
|
+
* **SQL (Clustering)**: Uses the `task: clustering` prefix for both ingestion and search, enabling high-precision semantic grouping of logically similar slow queries.
|
|
159
|
+
|
|
160
|
+
### Future Hardware Support (CUDA/TPU)
|
|
161
|
+
Because the core RAG orchestration relies exclusively on the `IEmbedder` Protocol, the application is strictly hardware-agnostic at its core. While currently optimized for Apple Silicon via `MLXEmbedder`, future deployment to cloud GPUs or Linux environments simply requires implementing a new `CudaEmbedder` (using PyTorch/Transformers) that returns standard NumPy arrays. No changes to the ingestion, storage, or API layers are necessary to support new hardware accelerators. A CUDA embedder has not been implemented yet because no CUDA hardware is available for development at the moment.
|
|
162
|
+
|
|
163
|
+
For a deep dive into the engineering, the Apache Arrow ingestion lifecycle, and the blueprint for AST/LibCST integration, see the official documentation:
|
|
164
|
+
|
|
165
|
+
👉 **[Architecture & Engineering Documentation](docs/README.md)**
|
|
166
|
+
|
|
167
|
+
## 🛠 Development
|
|
168
|
+
|
|
169
|
+
To contribute to `dbs-vector`, the project utilizes `poethepoet` as a task runner and implements strict quality gates (Ruff & Mypy).
|
|
170
|
+
|
|
171
|
+
```bash
|
|
172
|
+
# Run the entire validation suite (Format, Lint, Typecheck, Pytest)
|
|
173
|
+
uv run poe check
|
|
174
|
+
|
|
175
|
+
# Run tests with coverage
|
|
176
|
+
uv run poe test-cov
|
|
177
|
+
```
|
|
178
|
+
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
# ⚡️ dbs-vector
|
|
2
|
+
|
|
3
|
+
**A High-Performance, Arrow-Native Local Codebase Search Engine for Apple Silicon.**
|
|
4
|
+
|
|
5
|
+
`dbs-vector` is an optimized Retrieval-Augmented Generation (RAG) search engine designed specifically for macOS (M-Series chips). It bypasses traditional Python serialization bottlenecks by utilizing Apple's Unified Memory Architecture (UMA) and pure Apache Arrow data pipelines.
|
|
6
|
+
|
|
7
|
+
It enables lightning-fast, hybrid (Vector + Full-Text) search across your local codebase, entirely offline.
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## ✨ Features
|
|
12
|
+
|
|
13
|
+
* **Zero-Copy Memory Pipelines**: Uses **MLX** to compute embeddings on the Mac GPU, casting the resulting tensors instantly into NumPy arrays via Unified Memory without costly `float` object instantiation.
|
|
14
|
+
* **Arrow-Native Storage**: Uses **LanceDB** to stream ingestion batches directly to disk via PyArrow, avoiding the massive memory overhead of JSON and dictionary comprehensions.
|
|
15
|
+
* **Hybrid Retrieval**: Simultaneously executes Approximate Nearest Neighbor (ANN) cosine vector search and native **Tantivy** Full-Text Search (FTS).
|
|
16
|
+
* **Code-Aware Chunking**: Intelligently splits documentation and code, respecting markdown fences so that code blocks are indexed as atomic units.
|
|
17
|
+
* **Production Robustness**: Features dynamic `IVF_PQ` indexing, Rust-level predicate pushdown (metadata filtering), and dataset compaction for delta-updates.
|
|
18
|
+
* **Remote SQL API Ingestion**: `ApiChunker` pulls pre-aggregated slow-query records from any networked backend over HTTP, replacing local files with a paginated REST API — no changes to the embedding or storage layers.
|
|
19
|
+
|
|
20
|
+
## 🚀 Installation
|
|
21
|
+
|
|
22
|
+
This project is built using `uv`, an extremely fast Python package manager.
|
|
23
|
+
|
|
24
|
+
1. **Clone the repository:**
|
|
25
|
+
```bash
|
|
26
|
+
git clone https://github.com/dbsmedya/dbs-vector.git
|
|
27
|
+
cd dbs-vector
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
2. **Install the CLI package:**
|
|
31
|
+
```bash
|
|
32
|
+
uv sync
|
|
33
|
+
```
|
|
34
|
+
*This automatically sets up the environment and creates the `dbs-vector` executable in your path.*
|
|
35
|
+
|
|
36
|
+
Optional extras unlock additional ingestion sources:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
uv sync --extra sql # DuckDB ingestion
|
|
40
|
+
uv sync --extra api # Remote HTTP API ingestion
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## 💻 Usage
|
|
44
|
+
|
|
45
|
+
The application is entirely configuration-driven via `config.yaml`. It supports multiple data types (Engines) such as Markdown and SQL.
|
|
46
|
+
|
|
47
|
+
### Global Options
|
|
48
|
+
* `--config-file` / `-c`: Path to your custom `config.yaml` (Defaults to `./config.yaml`).
|
|
49
|
+
|
|
50
|
+
### Ingesting Documents
|
|
51
|
+
Index markdown files, JSON SQL logs, DuckDB analytical files, or a remote HTTP slow-query API into the local vector store.
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
# Ingest all markdown files (default)
|
|
55
|
+
uv run dbs-vector ingest "docs/"
|
|
56
|
+
|
|
57
|
+
# Ingest SQL slow query logs (JSON format)
|
|
58
|
+
uv run dbs-vector ingest "slow_queries.json" --type sql
|
|
59
|
+
|
|
60
|
+
# Ingest SQL slow queries from DuckDB (High-Performance Columnar)
|
|
61
|
+
uv run dbs-vector ingest "slow_queries.duckdb" --type sql --rebuild
|
|
62
|
+
|
|
63
|
+
# Ingest from a remote HTTP API (paginated GET)
|
|
64
|
+
uv run dbs-vector ingest "https://slow-log-api.internal/api/v1" --type sql-api
|
|
65
|
+
|
|
66
|
+
# Ingest via a custom SELECT sent to the remote API
|
|
67
|
+
uv run dbs-vector ingest "https://slow-log-api.internal/api/v1" --type sql-api \
|
|
68
|
+
--query "SELECT fingerprint_id AS id, sanitized_sql AS text, db AS source, ..."
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Searching the Codebase
|
|
72
|
+
Execute queries against your chosen engine.
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
# Semantic hybrid search across markdown
|
|
76
|
+
uv run dbs-vector search "What is MLX?"
|
|
77
|
+
|
|
78
|
+
# Find similar slow queries (SQL clustering)
|
|
79
|
+
uv run dbs-vector search "SELECT * FROM users" --type sql --min-time 1000
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
> **Indexes are built automatically at the end of every `ingest` run.** Two indexes are created:
|
|
83
|
+
> - **IVF_PQ** vector index (only when the table has > 256 rows)
|
|
84
|
+
> - **Tantivy FTS** inverted index (required for hybrid search)
|
|
85
|
+
>
|
|
86
|
+
> If you see a *"Cannot perform full text search unless an INVERTED index has been created"* error, it means the FTS index was never built for your table. Fix it by re-running ingestion — use `--rebuild` to wipe and re-index from scratch:
|
|
87
|
+
> ```bash
|
|
88
|
+
> uv run dbs-vector ingest "docs/" --rebuild
|
|
89
|
+
> uv run dbs-vector ingest "slow_queries.json" --type sql --rebuild
|
|
90
|
+
> ```
|
|
91
|
+
|
|
92
|
+
For detailed specifications on each ingestion source, see:
|
|
93
|
+
👉 **[SQL Engine Documentation](docs/README_SQL.md)**
|
|
94
|
+
👉 **[DuckDB Ingestion Documentation](docs/README_duckdb.md)**
|
|
95
|
+
👉 **[Remote SQL API Ingestion](docs/README_REMOTE_SQL_API.md)**
|
|
96
|
+
|
|
97
|
+
### Async API Server
|
|
98
|
+
The application includes a high-performance FastAPI server to expose the search engine over HTTP.
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
# Start the API server (loads all engines defined in config.yaml)
|
|
102
|
+
uv run dbs-vector serve
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
For full API specifications and swagger documentation, see:
|
|
106
|
+
👉 **[API Usage & Documentation](docs/README_API.md)**
|
|
107
|
+
|
|
108
|
+
### Model Context Protocol (MCP) Server
|
|
109
|
+
`dbs-vector` includes a built-in MCP server compatible with Claude Desktop, Claude Code (CLI), and Cursor. It supports both **stdio** (no server required) and **Streamable HTTP** (shared instance, saves VRAM).
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
# stdio — each client spawns its own process
|
|
113
|
+
uv run dbs-vector mcp
|
|
114
|
+
|
|
115
|
+
# HTTP — one shared server for all clients
|
|
116
|
+
uv run dbs-vector serve # MCP endpoint: http://127.0.0.1:8000/mcp
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
For setup instructions for all clients and transport types, see:
|
|
120
|
+
👉 **[MCP Server Documentation](docs/README_MCP.md)**
|
|
121
|
+
|
|
122
|
+
## 🏗 Architecture & Roadmap
|
|
123
|
+
|
|
124
|
+
`dbs-vector` is built upon strict **Clean Architecture** and **SOLID** principles. It utilizes a **Configuration-Driven Registry Pattern**, allowing new data engines (e.g., LibCST, Logs) to be added by simply updating `config.yaml` and registering new mappers/chunkers without modifying core orchestration logic.
|
|
125
|
+
|
|
126
|
+
### Specialized Gemma Workflows
|
|
127
|
+
The project is optimized for instruction-tuned models like `embeddinggemma`. It supports asymmetric task-based workflows defined in `config.yaml`:
|
|
128
|
+
* **Markdown (Search Result)**: Uses the `task: search result` prefix for queries and `title: none | text: ` for documents, maximizing retrieval accuracy for RAG.
|
|
129
|
+
* **SQL (Clustering)**: Uses the `task: clustering` prefix for both ingestion and search, enabling high-precision semantic grouping of logically similar slow queries.
|
|
130
|
+
|
|
131
|
+
### Future Hardware Support (CUDA/TPU)
|
|
132
|
+
Because the core RAG orchestration relies exclusively on the `IEmbedder` Protocol, the application is strictly hardware-agnostic at its core. While currently optimized for Apple Silicon via `MLXEmbedder`, future deployment to cloud GPUs or Linux environments simply requires implementing a new `CudaEmbedder` (using PyTorch/Transformers) that returns standard NumPy arrays. No changes to the ingestion, storage, or API layers are necessary to support new hardware accelerators. A CUDA embedder has not been implemented yet because no CUDA hardware is available for development at the moment.
|
|
133
|
+
|
|
134
|
+
For a deep dive into the engineering, the Apache Arrow ingestion lifecycle, and the blueprint for AST/LibCST integration, see the official documentation:
|
|
135
|
+
|
|
136
|
+
👉 **[Architecture & Engineering Documentation](docs/README.md)**
|
|
137
|
+
|
|
138
|
+
## 🛠 Development
|
|
139
|
+
|
|
140
|
+
To contribute to `dbs-vector`, the project utilizes `poethepoet` as a task runner and implements strict quality gates (Ruff & Mypy).
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
# Run the entire validation suite (Format, Lint, Typecheck, Pytest)
|
|
144
|
+
uv run poe check
|
|
145
|
+
|
|
146
|
+
# Run tests with coverage
|
|
147
|
+
uv run poe test-cov
|
|
148
|
+
```
|
|
149
|
+
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
system:
|
|
2
|
+
db_path: "./lancedb_dbs_vector"
|
|
3
|
+
batch_size: 64
|
|
4
|
+
nprobes: 20
|
|
5
|
+
|
|
6
|
+
engines:
|
|
7
|
+
md:
|
|
8
|
+
description: "Markdown & Prose Document Engine (Gemma Search)"
|
|
9
|
+
model_name: "mlx-community/embeddinggemma-300m-bf16"
|
|
10
|
+
vector_dimension: 768
|
|
11
|
+
max_token_length: 2048
|
|
12
|
+
table_name: "knowledge_vault"
|
|
13
|
+
mapper_type: "document"
|
|
14
|
+
chunker_type: "document"
|
|
15
|
+
chunk_max_chars: 1000
|
|
16
|
+
passage_prefix: "title: none | text: "
|
|
17
|
+
query_prefix: "task: search result | query: "
|
|
18
|
+
workflow: "md_search"
|
|
19
|
+
|
|
20
|
+
sql:
|
|
21
|
+
description: "SQL Slow Query Log Engine (Gemma Clustering)"
|
|
22
|
+
model_name: "mlx-community/embeddinggemma-300m-bf16"
|
|
23
|
+
vector_dimension: 768
|
|
24
|
+
max_token_length: 2048
|
|
25
|
+
table_name: "query_vault"
|
|
26
|
+
mapper_type: "sql"
|
|
27
|
+
chunker_type: "duckdb"
|
|
28
|
+
chunk_max_chars: 0
|
|
29
|
+
passage_prefix: "task: clustering | query: "
|
|
30
|
+
query_prefix: "task: clustering | query: "
|
|
31
|
+
workflow: "sql_clustering"
|
|
32
|
+
|
|
33
|
+
sql-api:
|
|
34
|
+
description: "Remote slow query log via HTTP API"
|
|
35
|
+
model_name: "mlx-community/embeddinggemma-300m-bf16"
|
|
36
|
+
vector_dimension: 768
|
|
37
|
+
max_token_length: 2048
|
|
38
|
+
table_name: "query_vault"
|
|
39
|
+
mapper_type: "sql"
|
|
40
|
+
chunker_type: "api"
|
|
41
|
+
chunk_max_chars: 0
|
|
42
|
+
passage_prefix: "task: clustering | query: "
|
|
43
|
+
query_prefix: "task: clustering | query: "
|
|
44
|
+
workflow: "sql_clustering"
|
|
45
|
+
# --- ApiChunker-specific fields ---
|
|
46
|
+
api_base_url: "http://localhost:8080/api/v1"
|
|
47
|
+
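api_key: "0Byuf9P9e5UxUNIsngNjr6b9u8sldoHd1ek_ImBbxiI" # NOTE(review): looks like a real credential shipped in the package — rotate it and replace with a placeholder; in production set via DBS_API_KEY env var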
|
|
48
|
+
api_page_size: 200 # records per GET request (max 1000)
|
|
49
|
+
api_since_days: 60 # lower bound on latest_ts (default: 15)
|
|
50
|
+
api_timeout_sec: 30 # HTTP request timeout in seconds
|
|
51
|
+
api_min_execution_ms: 0 # filter: skip queries below this threshold
|
|
52
|
+
api_database: "" # leave empty to fetch all databases
|