embenx 1.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- embenx-1.3.0/.gitignore +8 -0
- embenx-1.3.0/LICENSE +21 -0
- embenx-1.3.0/PKG-INFO +164 -0
- embenx-1.3.0/README.md +126 -0
- embenx-1.3.0/SKILL.md +4 -0
- embenx-1.3.0/benchmark.py +239 -0
- embenx-1.3.0/cli.py +310 -0
- embenx-1.3.0/core.py +819 -0
- embenx-1.3.0/data.py +77 -0
- embenx-1.3.0/docs_src/conf.py +23 -0
- embenx-1.3.0/examples/advanced_reranking.py +54 -0
- embenx-1.3.0/examples/agentic_self_healing.py +51 -0
- embenx-1.3.0/examples/ann_libraries.py +47 -0
- embenx-1.3.0/examples/cluster_kv_optimization.py +32 -0
- embenx-1.3.0/examples/complex_indexers.py +44 -0
- embenx-1.3.0/examples/custom_indexer.py +28 -0
- embenx-1.3.0/examples/db_indexers.py +56 -0
- embenx-1.3.0/examples/echo_temporal_memory.py +44 -0
- embenx-1.3.0/examples/faiss_variants.py +41 -0
- embenx-1.3.0/examples/filtering_reranking.py +45 -0
- embenx-1.3.0/examples/hybrid_search.py +51 -0
- embenx-1.3.0/examples/kv_cache_augmented.py +47 -0
- embenx-1.3.0/examples/library_benchmark.py +23 -0
- embenx-1.3.0/examples/matryoshka_search.py +30 -0
- embenx-1.3.0/examples/numpy_interop.py +34 -0
- embenx-1.3.0/examples/recall_evaluation.py +31 -0
- embenx-1.3.0/examples/spatial_cognitive_memory.py +38 -0
- embenx-1.3.0/examples/ssm_hydration.py +42 -0
- embenx-1.3.0/examples/trajectory_search.py +46 -0
- embenx-1.3.0/examples/turbo_quant_caching.py +47 -0
- embenx-1.3.0/examples/usearch_quantization.py +30 -0
- embenx-1.3.0/explorer.py +193 -0
- embenx-1.3.0/indexers/__init__.py +53 -0
- embenx-1.3.0/indexers/annoy_indexer.py +47 -0
- embenx-1.3.0/indexers/base.py +51 -0
- embenx-1.3.0/indexers/bm25_indexer.py +63 -0
- embenx-1.3.0/indexers/chroma_indexer.py +51 -0
- embenx-1.3.0/indexers/duckdb_indexer.py +44 -0
- embenx-1.3.0/indexers/elasticsearch_indexer.py +94 -0
- embenx-1.3.0/indexers/faiss_indexer.py +114 -0
- embenx-1.3.0/indexers/hnswlib_indexer.py +57 -0
- embenx-1.3.0/indexers/lance_indexer.py +56 -0
- embenx-1.3.0/indexers/milvus_indexer.py +74 -0
- embenx-1.3.0/indexers/pgvector_indexer.py +80 -0
- embenx-1.3.0/indexers/qdrant_indexer.py +39 -0
- embenx-1.3.0/indexers/scann_indexer.py +76 -0
- embenx-1.3.0/indexers/simple_indexer.py +55 -0
- embenx-1.3.0/indexers/usearch_indexer.py +63 -0
- embenx-1.3.0/indexers/vespa_indexer.py +54 -0
- embenx-1.3.0/indexers/weaviate_indexer.py +82 -0
- embenx-1.3.0/llm.py +76 -0
- embenx-1.3.0/mcp_server.py +113 -0
- embenx-1.3.0/pyproject.toml +112 -0
- embenx-1.3.0/rerank.py +53 -0
- embenx-1.3.0/tests/test_absolute_final_100.py +106 -0
- embenx-1.3.0/tests/test_benchmark.py +155 -0
- embenx-1.3.0/tests/test_cli.py +81 -0
- embenx-1.3.0/tests/test_complex_indexers.py +63 -0
- embenx-1.3.0/tests/test_core.py +386 -0
- embenx-1.3.0/tests/test_coverage_edge_cases.py +95 -0
- embenx-1.3.0/tests/test_coverage_final.py +117 -0
- embenx-1.3.0/tests/test_data.py +16 -0
- embenx-1.3.0/tests/test_explorer.py +14 -0
- embenx-1.3.0/tests/test_hybrid.py +72 -0
- embenx-1.3.0/tests/test_indexers.py +178 -0
- embenx-1.3.0/tests/test_llm.py +46 -0
- embenx-1.3.0/tests/test_milestone_4.py +85 -0
- embenx-1.3.0/tests/test_milestone_5.py +58 -0
- embenx-1.3.0/tests/test_more_indexers.py +71 -0
- embenx-1.3.0/tests/test_weaviate.py +28 -0
embenx-1.3.0/.gitignore
ADDED
embenx-1.3.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Aditya Karnam
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
embenx-1.3.0/PKG-INFO
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: embenx
|
|
3
|
+
Version: 1.3.0
|
|
4
|
+
Summary: Universal embedding retrieval toolkit & benchmark. Search, filter, and rerank across 15+ vector backends with a unified Python API and CLI.
|
|
5
|
+
Author-email: adityak74 <adityakarnam@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Requires-Dist: annoy>=1.17.0
|
|
10
|
+
Requires-Dist: chromadb>=0.4.0
|
|
11
|
+
Requires-Dist: datasets>=2.14.0
|
|
12
|
+
Requires-Dist: duckdb>=1.0.0
|
|
13
|
+
Requires-Dist: faiss-cpu>=1.7.4
|
|
14
|
+
Requires-Dist: flashrank>=0.2.10
|
|
15
|
+
Requires-Dist: hnswlib>=0.8.0
|
|
16
|
+
Requires-Dist: lancedb>=0.2.0
|
|
17
|
+
Requires-Dist: litellm>=1.0.0
|
|
18
|
+
Requires-Dist: mcp>=1.26.0
|
|
19
|
+
Requires-Dist: networkx>=3.0
|
|
20
|
+
Requires-Dist: pillow>=10.0.0
|
|
21
|
+
Requires-Dist: plotly>=5.18.0
|
|
22
|
+
Requires-Dist: psutil>=5.9.0
|
|
23
|
+
Requires-Dist: psycopg2-binary>=2.9.0
|
|
24
|
+
Requires-Dist: pyarrow>=12.0.0
|
|
25
|
+
Requires-Dist: pymilvus>=2.3.0
|
|
26
|
+
Requires-Dist: qdrant-client>=1.5.0
|
|
27
|
+
Requires-Dist: rank-bm25>=0.2.2
|
|
28
|
+
Requires-Dist: requests>=2.31.0
|
|
29
|
+
Requires-Dist: rerankers>=0.10.0
|
|
30
|
+
Requires-Dist: rich>=10.11.0
|
|
31
|
+
Requires-Dist: safetensors>=0.4.0
|
|
32
|
+
Requires-Dist: scikit-learn>=1.3.0
|
|
33
|
+
Requires-Dist: streamlit>=1.32.0
|
|
34
|
+
Requires-Dist: typer>=0.9.0
|
|
35
|
+
Requires-Dist: usearch>=2.9.0
|
|
36
|
+
Requires-Dist: weaviate-client>=4.5.4
|
|
37
|
+
Description-Content-Type: text/markdown
|
|
38
|
+
|
|
39
|
+
<div align="center">
|
|
40
|
+
|
|
41
|
+
<h1>Embenx 🚀</h1>
|
|
42
|
+
|
|
43
|
+
<p>
|
|
44
|
+
<strong>Universal embedding retrieval toolkit & benchmark.</strong><br/>
|
|
45
|
+
Search, filter, and rerank across 15+ vector backends (FAISS, ScaNN, pgvector, etc.) with a unified Python API and CLI.
|
|
46
|
+
</p>
|
|
47
|
+
|
|
48
|
+
<p>
|
|
49
|
+
<a href="https://github.com/adityak74/embenx/stargazers"><img src="https://img.shields.io/github/stars/adityak74/embenx?style=flat-square&color=yellow" alt="Stars"/></a>
|
|
50
|
+
<a href="https://github.com/adityak74/embenx/issues"><img src="https://img.shields.io/github/issues/adityak74/embenx?style=flat-square" alt="Issues"/></a>
|
|
51
|
+
<a href="https://opensource.org/licenses/MIT"><img src="https://img.shields.io/badge/License-MIT-green.svg?style=flat-square" alt="MIT License"/></a>
|
|
52
|
+
<a href="https://www.python.org/downloads/"><img src="https://img.shields.io/badge/python-3.10+-blue.svg?style=flat-square" alt="Python 3.10+"/></a>
|
|
53
|
+
<a href="https://adityak74.github.io/embenx/"><img src="https://img.shields.io/badge/docs-live-brightgreen?style=flat-square" alt="Docs"/></a>
|
|
54
|
+
<a href="https://github.com/astral-sh/uv"><img src="https://img.shields.io/badge/uv-ready-purple.svg?style=flat-square" alt="uv ready"/></a>
|
|
55
|
+
</p>
|
|
56
|
+
|
|
57
|
+
<p>
|
|
58
|
+
<a href="https://adityak74.github.io/embenx/">Documentation</a> ·
|
|
59
|
+
<a href="https://github.com/adityak74/embenx/issues">Report Bug</a> ·
|
|
60
|
+
<a href="https://github.com/adityak74/embenx/issues">Request Feature</a>
|
|
61
|
+
</p>
|
|
62
|
+
|
|
63
|
+
</div>
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## What is Embenx?
|
|
68
|
+
|
|
69
|
+
Embenx is a Python-native retrieval library that sits between raw vector indices and full-blown vector databases. It provides a high-level `Collection` API for managing embeddings and metadata, supporting advanced features like **filtering**, **reranking**, and **quantization** across 15+ backends.
|
|
70
|
+
|
|
71
|
+
## Library Usage
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
from embenx import Collection
|
|
75
|
+
|
|
76
|
+
# 1. Initialize a collection
|
|
77
|
+
col = Collection(dimension=768, indexer_type="faiss-hnsw")
|
|
78
|
+
|
|
79
|
+
# 2. Add data
|
|
80
|
+
col.add(
|
|
81
|
+
vectors=[[0.1, 0.2, ...], [0.3, 0.4, ...]],
|
|
82
|
+
metadata=[{"category": "AI", "id": 1, "text": "The quick brown fox"}]
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
# 3. Search with filtering
|
|
86
|
+
results = col.search(
|
|
87
|
+
query=[0.1, 0.2, ...],
|
|
88
|
+
top_k=5,
|
|
89
|
+
where={"category": "AI"}
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
# 4. Export to production
|
|
93
|
+
col.export_to_production(backend="qdrant", connection_url="http://localhost:6333")
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Agentic Memory (MCP)
|
|
97
|
+
|
|
98
|
+
Embenx ships with a built-in **Model Context Protocol (MCP)** server. This allows AI agents (like Claude Desktop) to use Embenx collections as their own long-term memory.
|
|
99
|
+
|
|
100
|
+
### 1. Start the server
|
|
101
|
+
```bash
|
|
102
|
+
embenx mcp-start
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Visual Explorer
|
|
106
|
+
|
|
107
|
+
Embenx provides a built-in web UI to visualize your vector collections, including an interactive **HNSW Graph Visualizer** and a **RAG Playground**.
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
embenx explorer
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Features
|
|
114
|
+
|
|
115
|
+
- **Multimodal Support** — Native support for image embeddings (CLIP).
|
|
116
|
+
- **RAG Playground** — Test retrieval quality with an integrated LLM chat loop.
|
|
117
|
+
- **HNSW Graph Visualizer** — Interactive 3D visualization of navigation layers.
|
|
118
|
+
- **Export to Production** — One-click migration to Qdrant or Milvus clusters.
|
|
119
|
+
- **Unified Collection API** — Table-like interface for vectors and metadata.
|
|
120
|
+
- **Retrieval Zoo** — Instant access to pre-indexed collections (SQuAD, MS-MARCO, etc.).
|
|
121
|
+
- **Agentic Memory (MCP)** — Native Model Context Protocol support for AI agents.
|
|
122
|
+
- **Self-Healing Retrieval** — Integrated feedback loops to automatically improve ranking accuracy.
|
|
123
|
+
- **Temporal Memory (Echo)** — Recency-biased retrieval and time-window filtering (arXiv:2502.16090).
|
|
124
|
+
- **Spatial Memory (ESWM)** — Neuroscience-inspired spatial cognitive maps for navigation (ICLR 2026).
|
|
125
|
+
- **TurboQuant Compression** — 1-bit sign-based quantization for activation tensors (arXiv:2504.19874).
|
|
126
|
+
- **ClusterKV Optimization** — Semantic clustering for high-throughput retrieval (arXiv:2412.03213).
|
|
127
|
+
- **Hybrid Search** — Combine dense vectors with sparse BM25 retrieval using RRF.
|
|
128
|
+
- **KV Cache Offloading (RA-KVC)** — Store and retrieve high-dimensional LLM activations using `safetensors`.
|
|
129
|
+
- **SSM State Hydration** — Persist and prime hidden states ($h_0$) for State Space Models (Mamba-2).
|
|
130
|
+
- **Trajectory Retrieval** — Search for similar state/action sequences for World Models.
|
|
131
|
+
- **Visual Explorer** — Built-in web UI to visualize vector clusters and metadata.
|
|
132
|
+
- **Universal model support** — Integrated LiteLLM for any embedding provider.
|
|
133
|
+
- **Portable Formats** — Native support for Parquet, NumPy (.npy/.npz), and FAISS (.index).
|
|
134
|
+
|
|
135
|
+
## Supported Indexers
|
|
136
|
+
|
|
137
|
+
| Indexer | Family | Best For |
|
|
138
|
+
| :--- | :--- | :--- |
|
|
139
|
+
| `faiss` | HNSW, IVF, Flat | Production-grade local search |
|
|
140
|
+
| `scann` | Tree-AH | State-of-the-art speed/recall (Linux) |
|
|
141
|
+
| `usearch` | HNSW | High-performance C++, low latency |
|
|
142
|
+
| `pgvector` | Postgres | Embeddings next to relational data |
|
|
143
|
+
| `lancedb` | Columnar | Large disk-based datasets |
|
|
144
|
+
| `simple` | NumPy | Exact search baseline |
|
|
145
|
+
|
|
146
|
+
## Installation
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
pip install embenx
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
## Roadmap
|
|
153
|
+
|
|
154
|
+
See [ROADMAP.md](ROADMAP.md) for our journey towards production-grade agentic retrieval.
|
|
155
|
+
|
|
156
|
+
## License
|
|
157
|
+
|
|
158
|
+
Distributed under the **MIT License**.
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
<div align="center">
|
|
163
|
+
Built with ❤️ for the AI engineering community by <a href="https://github.com/adityak74">adityak74</a>
|
|
164
|
+
</div>
|
embenx-1.3.0/README.md
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
<h1>Embenx 🚀</h1>
|
|
4
|
+
|
|
5
|
+
<p>
|
|
6
|
+
<strong>Universal embedding retrieval toolkit & benchmark.</strong><br/>
|
|
7
|
+
Search, filter, and rerank across 15+ vector backends (FAISS, ScaNN, pgvector, etc.) with a unified Python API and CLI.
|
|
8
|
+
</p>
|
|
9
|
+
|
|
10
|
+
<p>
|
|
11
|
+
<a href="https://github.com/adityak74/embenx/stargazers"><img src="https://img.shields.io/github/stars/adityak74/embenx?style=flat-square&color=yellow" alt="Stars"/></a>
|
|
12
|
+
<a href="https://github.com/adityak74/embenx/issues"><img src="https://img.shields.io/github/issues/adityak74/embenx?style=flat-square" alt="Issues"/></a>
|
|
13
|
+
<a href="https://opensource.org/licenses/MIT"><img src="https://img.shields.io/badge/License-MIT-green.svg?style=flat-square" alt="MIT License"/></a>
|
|
14
|
+
<a href="https://www.python.org/downloads/"><img src="https://img.shields.io/badge/python-3.10+-blue.svg?style=flat-square" alt="Python 3.10+"/></a>
|
|
15
|
+
<a href="https://adityak74.github.io/embenx/"><img src="https://img.shields.io/badge/docs-live-brightgreen?style=flat-square" alt="Docs"/></a>
|
|
16
|
+
<a href="https://github.com/astral-sh/uv"><img src="https://img.shields.io/badge/uv-ready-purple.svg?style=flat-square" alt="uv ready"/></a>
|
|
17
|
+
</p>
|
|
18
|
+
|
|
19
|
+
<p>
|
|
20
|
+
<a href="https://adityak74.github.io/embenx/">Documentation</a> ·
|
|
21
|
+
<a href="https://github.com/adityak74/embenx/issues">Report Bug</a> ·
|
|
22
|
+
<a href="https://github.com/adityak74/embenx/issues">Request Feature</a>
|
|
23
|
+
</p>
|
|
24
|
+
|
|
25
|
+
</div>
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## What is Embenx?
|
|
30
|
+
|
|
31
|
+
Embenx is a Python-native retrieval library that sits between raw vector indices and full-blown vector databases. It provides a high-level `Collection` API for managing embeddings and metadata, supporting advanced features like **filtering**, **reranking**, and **quantization** across 15+ backends.
|
|
32
|
+
|
|
33
|
+
## Library Usage
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
from embenx import Collection
|
|
37
|
+
|
|
38
|
+
# 1. Initialize a collection
|
|
39
|
+
col = Collection(dimension=768, indexer_type="faiss-hnsw")
|
|
40
|
+
|
|
41
|
+
# 2. Add data
|
|
42
|
+
col.add(
|
|
43
|
+
vectors=[[0.1, 0.2, ...], [0.3, 0.4, ...]],
|
|
44
|
+
metadata=[{"category": "AI", "id": 1, "text": "The quick brown fox"}]
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
# 3. Search with filtering
|
|
48
|
+
results = col.search(
|
|
49
|
+
query=[0.1, 0.2, ...],
|
|
50
|
+
top_k=5,
|
|
51
|
+
where={"category": "AI"}
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
# 4. Export to production
|
|
55
|
+
col.export_to_production(backend="qdrant", connection_url="http://localhost:6333")
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Agentic Memory (MCP)
|
|
59
|
+
|
|
60
|
+
Embenx ships with a built-in **Model Context Protocol (MCP)** server. This allows AI agents (like Claude Desktop) to use Embenx collections as their own long-term memory.
|
|
61
|
+
|
|
62
|
+
### 1. Start the server
|
|
63
|
+
```bash
|
|
64
|
+
embenx mcp-start
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Visual Explorer
|
|
68
|
+
|
|
69
|
+
Embenx provides a built-in web UI to visualize your vector collections, including an interactive **HNSW Graph Visualizer** and a **RAG Playground**.
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
embenx explorer
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Features
|
|
76
|
+
|
|
77
|
+
- **Multimodal Support** — Native support for image embeddings (CLIP).
|
|
78
|
+
- **RAG Playground** — Test retrieval quality with an integrated LLM chat loop.
|
|
79
|
+
- **HNSW Graph Visualizer** — Interactive 3D visualization of navigation layers.
|
|
80
|
+
- **Export to Production** — One-click migration to Qdrant or Milvus clusters.
|
|
81
|
+
- **Unified Collection API** — Table-like interface for vectors and metadata.
|
|
82
|
+
- **Retrieval Zoo** — Instant access to pre-indexed collections (SQuAD, MS-MARCO, etc.).
|
|
83
|
+
- **Agentic Memory (MCP)** — Native Model Context Protocol support for AI agents.
|
|
84
|
+
- **Self-Healing Retrieval** — Integrated feedback loops to automatically improve ranking accuracy.
|
|
85
|
+
- **Temporal Memory (Echo)** — Recency-biased retrieval and time-window filtering (arXiv:2502.16090).
|
|
86
|
+
- **Spatial Memory (ESWM)** — Neuroscience-inspired spatial cognitive maps for navigation (ICLR 2026).
|
|
87
|
+
- **TurboQuant Compression** — 1-bit sign-based quantization for activation tensors (arXiv:2504.19874).
|
|
88
|
+
- **ClusterKV Optimization** — Semantic clustering for high-throughput retrieval (arXiv:2412.03213).
|
|
89
|
+
- **Hybrid Search** — Combine dense vectors with sparse BM25 retrieval using RRF.
|
|
90
|
+
- **KV Cache Offloading (RA-KVC)** — Store and retrieve high-dimensional LLM activations using `safetensors`.
|
|
91
|
+
- **SSM State Hydration** — Persist and prime hidden states ($h_0$) for State Space Models (Mamba-2).
|
|
92
|
+
- **Trajectory Retrieval** — Search for similar state/action sequences for World Models.
|
|
93
|
+
- **Visual Explorer** — Built-in web UI to visualize vector clusters and metadata.
|
|
94
|
+
- **Universal model support** — Integrated LiteLLM for any embedding provider.
|
|
95
|
+
- **Portable Formats** — Native support for Parquet, NumPy (.npy/.npz), and FAISS (.index).
|
|
96
|
+
|
|
97
|
+
## Supported Indexers
|
|
98
|
+
|
|
99
|
+
| Indexer | Family | Best For |
|
|
100
|
+
| :--- | :--- | :--- |
|
|
101
|
+
| `faiss` | HNSW, IVF, Flat | Production-grade local search |
|
|
102
|
+
| `scann` | Tree-AH | State-of-the-art speed/recall (Linux) |
|
|
103
|
+
| `usearch` | HNSW | High-performance C++, low latency |
|
|
104
|
+
| `pgvector` | Postgres | Embeddings next to relational data |
|
|
105
|
+
| `lancedb` | Columnar | Large disk-based datasets |
|
|
106
|
+
| `simple` | NumPy | Exact search baseline |
|
|
107
|
+
|
|
108
|
+
## Installation
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
pip install embenx
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Roadmap
|
|
115
|
+
|
|
116
|
+
See [ROADMAP.md](ROADMAP.md) for our journey towards production-grade agentic retrieval.
|
|
117
|
+
|
|
118
|
+
## License
|
|
119
|
+
|
|
120
|
+
Distributed under the **MIT License**.
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
<div align="center">
|
|
125
|
+
Built with ❤️ for the AI engineering community by <a href="https://github.com/adityak74">adityak74</a>
|
|
126
|
+
</div>
|
embenx-1.3.0/SKILL.md
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
import importlib.util
|
|
2
|
+
import inspect
|
|
3
|
+
import os
|
|
4
|
+
import time
|
|
5
|
+
from typing import List, Dict, Any, Optional
|
|
6
|
+
|
|
7
|
+
import psutil
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
from rich.table import Table
|
|
10
|
+
|
|
11
|
+
from data import load_documents
|
|
12
|
+
from indexers import BaseIndexer, get_indexer_map
|
|
13
|
+
from llm import Embedder
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def get_memory_usage():
    """Return the resident set size (RSS) of the current process, in MB."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / (1024 * 1024)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def load_custom_indexer(script_path: str, console: Console):
    """
    Dynamically load a class inheriting from BaseIndexer from a given script.

    Returns a ``(class_name, class_object)`` pair, or ``(None, None)`` when the
    script cannot be loaded or defines no suitable class (errors are reported
    on the console rather than raised).
    """
    try:
        spec = importlib.util.spec_from_file_location("custom_indexer", script_path)
        if spec is None or spec.loader is None:
            console.print(f"[red]Could not load spec for {script_path}[/red]")
            return None, None

        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)

        # First class in the module (alphabetically, per getmembers) that
        # subclasses BaseIndexer without being BaseIndexer itself wins.
        for attr_name, attr in inspect.getmembers(module, inspect.isclass):
            if issubclass(attr, BaseIndexer) and attr is not BaseIndexer:
                return attr_name, attr

        console.print(f"[red]No class inheriting from BaseIndexer found in {script_path}[/red]")
        return None, None
    except Exception as e:
        console.print(f"[red]Error loading custom indexer from {script_path}: {e}[/red]")
        return None, None
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def benchmark_single_indexer(name, indexer_cls, dimension, embeddings, metadata, console, cleanup=True):
    """Benchmark a single indexer class: build time, average query latency,
    on-disk/index size, and resident-memory growth.

    Returns a dict of formatted metric strings, or None if the build failed.
    """
    console.print(f"\n[bold cyan]--- Benchmarking {name.upper()} ---[/bold cyan]")
    instance = indexer_cls(dimension=dimension)

    # Build phase: wall-clock time plus RSS delta around index construction.
    rss_start = get_memory_usage()
    build_start = time.perf_counter()
    try:
        instance.build_index(embeddings, metadata)
    except Exception as e:
        console.print(f"[red]Failed to build index for {name}: {e}[/red]")
        return None
    elapsed_build = time.perf_counter() - build_start

    rss_delta = get_memory_usage() - rss_start
    size_bytes = instance.get_size()

    # Query phase: average latency over (up to) the first 10 stored vectors.
    probes = embeddings[: min(10, len(embeddings))]
    avg_query_ms = 0
    if probes:
        probe_start = time.perf_counter()
        for probe in probes:
            instance.search(probe, top_k=5)
        avg_query_ms = (time.perf_counter() - probe_start) / len(probes) * 1000  # ms per query

    metrics = {
        "Indexer": name.upper(),
        "Build Time (s)": f"{elapsed_build:.4f}",
        "Query Time (ms)": f"{avg_query_ms:.2f}",
        "Index Size (KB)": f"{size_bytes / 1024:.2f}",
        "Memory Diff (MB)": f"{rss_delta:.2f}",
    }

    if cleanup:
        instance.cleanup()

    console.print(f"Done {name.upper()}.")
    return metrics
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def run_benchmark(
    dataset_name: str,
    split: str,
    text_column: str,
    max_docs: int,
    indexer_names: List[str],
    model_name: str,
    console: Console,
    data_files: Optional[str] = None,
    cleanup: bool = True,
    custom_indexer_script: Optional[str] = None,
    subset: str = "default",  # Added as optional
):
    """
    Run Embenx benchmarks. Matches original signature for test compatibility.

    Args:
        dataset_name: HF dataset name / zoo alias, or a path to a ``.parquet``
            collection whose precomputed vectors are used directly.
        split: Dataset split to load (non-Parquet path).
        text_column: Preferred text column; falls back to "text"/"content".
        max_docs: Maximum number of documents to load.
        indexer_names: Indexer names to benchmark. Never mutated — a copy is
            extended when a custom indexer script is supplied.
        model_name: LiteLLM embedding model (only used when vectors must be
            generated).
        console: rich Console used for all progress/error output.
        data_files: Unused; retained for signature compatibility.
        cleanup: Whether each indexer's cleanup() runs after its benchmark.
        custom_indexer_script: Optional path to a script defining a
            BaseIndexer subclass to benchmark alongside the built-ins.
        subset: Dataset subset/config name.

    Returns:
        List of per-indexer result dicts ([] if nothing ran), or None when
        loading or embedding failed.
    """
    # Load Data
    console.print(f"\n[bold]Loading up to {max_docs} documents from {dataset_name}...[/bold]")

    # A Parquet path ships its own vectors; anything else goes through the
    # standard HF/Zoo loader and is embedded below.
    embeddings = None
    dimension = None
    if os.path.exists(dataset_name) and dataset_name.endswith(".parquet"):
        from core import Collection

        col = Collection.from_parquet(dataset_name)
        docs = col._metadata
        embeddings = col._vectors.tolist()
        dimension = col.dimension
    else:
        docs = load_documents(dataset_name, subset, split, max_docs)

    if not docs:
        console.print("[red]No documents loaded. Exiting.[/red]")
        return
    console.print(f"Loaded {len(docs)} documents.")

    if embeddings is None:
        # Embed only when the dataset did not provide vectors.
        # FIX: previously this step ran unconditionally, discarding the
        # precomputed Parquet embeddings and re-embedding every document.
        console.print(f"\n[bold]Generating embeddings using LiteLLM ({model_name})...[/bold]")
        embedder = Embedder(model_name)

        # Fall back to common column names when the requested one is absent.
        text_field = text_column
        if text_field not in docs[0] and "text" in docs[0]:
            text_field = "text"
        elif text_field not in docs[0] and "content" in docs[0]:
            text_field = "content"

        texts = [d.get(text_field, str(d)) for d in docs]

        t0 = time.perf_counter()
        embeddings = embedder.embed_texts(texts)
        emb_time = time.perf_counter() - t0

        if not embeddings:
            console.print("[red]Failed to generate embeddings.[/red]")
            return

        dimension = len(embeddings[0])
        console.print(f"Generated {len(embeddings)} embeddings of dimension {dimension} in {emb_time:.2f}s.")

    # Initialize Indexers — operate on a copy so the caller's list is never
    # mutated as a side effect of registering a custom indexer.
    indexers_map = get_indexer_map()
    names_to_run = list(indexer_names)

    if custom_indexer_script:
        custom_name, custom_cls = load_custom_indexer(custom_indexer_script, console)
        if custom_cls:
            c_name_lower = custom_name.lower()
            indexers_map[c_name_lower] = custom_cls
            console.print(f"[green]✓[/green] Successfully loaded custom indexer: [bold]{custom_name}[/bold]")
            if c_name_lower not in [x.lower() for x in names_to_run]:
                names_to_run.append(c_name_lower)

    results = []
    for name in names_to_run:
        name_lower = name.lower()
        if name_lower not in indexers_map:
            console.print(f"[yellow]Warning: Indexer '{name}' not found. Skipping.[/yellow]")
            continue

        res = benchmark_single_indexer(
            name, indexers_map[name_lower], dimension, embeddings, docs, console, cleanup
        )
        if res:
            results.append(res)

    # Report
    if results:
        display_results(results, console)
        return results
    return []
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def display_results(results, console):
    """Render the benchmark result dicts as a rich table on the console."""
    console.print("\n[bold green]Benchmark Results[/bold green]")

    table = Table(show_header=True, header_style="bold magenta")
    table.add_column("Indexer", style="cyan")
    for heading in ("Build Time (s)", "Query Time (ms/query)", "Index Size (KB)", "Memory Added (MB)"):
        table.add_column(heading, justify="right")

    # Result dicts use slightly different key names than the column headers.
    row_keys = ("Indexer", "Build Time (s)", "Query Time (ms)", "Index Size (KB)", "Memory Diff (MB)")
    for entry in results:
        table.add_row(*(entry[key] for key in row_keys))

    console.print(table)
|
|
193
|
+
|
|
194
|
+
def generate_report(results: List[Dict[str, Any]], dataset_name: str, output_path: str = "benchmark_report.md"):
    """
    Generate a formatted Markdown technical report from benchmark results.

    Writes the report to ``output_path`` and returns that path.
    """
    import datetime

    lines = [
        f"# Embenx Retrieval Benchmark Report 🚀",
        f"Generated on: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        f"Dataset: **{dataset_name}**",
        "\n## Executive Summary",
    ]

    if not results:
        lines.append("No results to report.")
    else:
        # Summary bullets: best query latency and smallest index footprint.
        latencies = [float(entry["Query Time (ms)"]) for entry in results]
        footprints = [float(entry["Index Size (KB)"]) for entry in results]
        fastest_name = results[latencies.index(min(latencies))]["Indexer"]
        smallest_name = results[footprints.index(min(footprints))]["Indexer"]

        lines.append(f"- **Fastest Indexer**: {fastest_name} ({min(latencies):.2f} ms/query)")
        lines.append(f"- **Most Memory Efficient**: {smallest_name} ({min(footprints):.2f} KB)")

    lines.append("\n## Results Table")
    lines.append("| Indexer | Build Time (s) | Query Time (ms) | Index Size (KB) | Memory Diff (MB) |")
    lines.append("| :--- | :--- | :--- | :--- | :--- |")

    for entry in results:
        lines.append(f"| {entry['Indexer']} | {entry['Build Time (s)']} | {entry['Query Time (ms)']} | {entry['Index Size (KB)']} | {entry['Memory Diff (MB)']} |")

    lines.append("\n## Analysis & Recommendations")
    lines.append("Based on the data above, we recommend:")
    reported = [entry["Indexer"] for entry in results]
    if "FAISS-HNSW" in reported:
        lines.append("- Use **FAISS-HNSW** for production-grade local search balancing speed and memory.")
    if "SCANN" in reported:
        lines.append("- Use **ScaNN** for state-of-the-art speed/recall if on supported hardware.")
    lines.append("- For ultra-low latency requirements, prioritize indexers with sub-1ms query times.")

    with open(output_path, "w") as f:
        f.write("\n".join(lines))

    return output_path
|