embenx 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. embenx-1.3.0/.gitignore +8 -0
  2. embenx-1.3.0/LICENSE +21 -0
  3. embenx-1.3.0/PKG-INFO +164 -0
  4. embenx-1.3.0/README.md +126 -0
  5. embenx-1.3.0/SKILL.md +4 -0
  6. embenx-1.3.0/benchmark.py +239 -0
  7. embenx-1.3.0/cli.py +310 -0
  8. embenx-1.3.0/core.py +819 -0
  9. embenx-1.3.0/data.py +77 -0
  10. embenx-1.3.0/docs_src/conf.py +23 -0
  11. embenx-1.3.0/examples/advanced_reranking.py +54 -0
  12. embenx-1.3.0/examples/agentic_self_healing.py +51 -0
  13. embenx-1.3.0/examples/ann_libraries.py +47 -0
  14. embenx-1.3.0/examples/cluster_kv_optimization.py +32 -0
  15. embenx-1.3.0/examples/complex_indexers.py +44 -0
  16. embenx-1.3.0/examples/custom_indexer.py +28 -0
  17. embenx-1.3.0/examples/db_indexers.py +56 -0
  18. embenx-1.3.0/examples/echo_temporal_memory.py +44 -0
  19. embenx-1.3.0/examples/faiss_variants.py +41 -0
  20. embenx-1.3.0/examples/filtering_reranking.py +45 -0
  21. embenx-1.3.0/examples/hybrid_search.py +51 -0
  22. embenx-1.3.0/examples/kv_cache_augmented.py +47 -0
  23. embenx-1.3.0/examples/library_benchmark.py +23 -0
  24. embenx-1.3.0/examples/matryoshka_search.py +30 -0
  25. embenx-1.3.0/examples/numpy_interop.py +34 -0
  26. embenx-1.3.0/examples/recall_evaluation.py +31 -0
  27. embenx-1.3.0/examples/spatial_cognitive_memory.py +38 -0
  28. embenx-1.3.0/examples/ssm_hydration.py +42 -0
  29. embenx-1.3.0/examples/trajectory_search.py +46 -0
  30. embenx-1.3.0/examples/turbo_quant_caching.py +47 -0
  31. embenx-1.3.0/examples/usearch_quantization.py +30 -0
  32. embenx-1.3.0/explorer.py +193 -0
  33. embenx-1.3.0/indexers/__init__.py +53 -0
  34. embenx-1.3.0/indexers/annoy_indexer.py +47 -0
  35. embenx-1.3.0/indexers/base.py +51 -0
  36. embenx-1.3.0/indexers/bm25_indexer.py +63 -0
  37. embenx-1.3.0/indexers/chroma_indexer.py +51 -0
  38. embenx-1.3.0/indexers/duckdb_indexer.py +44 -0
  39. embenx-1.3.0/indexers/elasticsearch_indexer.py +94 -0
  40. embenx-1.3.0/indexers/faiss_indexer.py +114 -0
  41. embenx-1.3.0/indexers/hnswlib_indexer.py +57 -0
  42. embenx-1.3.0/indexers/lance_indexer.py +56 -0
  43. embenx-1.3.0/indexers/milvus_indexer.py +74 -0
  44. embenx-1.3.0/indexers/pgvector_indexer.py +80 -0
  45. embenx-1.3.0/indexers/qdrant_indexer.py +39 -0
  46. embenx-1.3.0/indexers/scann_indexer.py +76 -0
  47. embenx-1.3.0/indexers/simple_indexer.py +55 -0
  48. embenx-1.3.0/indexers/usearch_indexer.py +63 -0
  49. embenx-1.3.0/indexers/vespa_indexer.py +54 -0
  50. embenx-1.3.0/indexers/weaviate_indexer.py +82 -0
  51. embenx-1.3.0/llm.py +76 -0
  52. embenx-1.3.0/mcp_server.py +113 -0
  53. embenx-1.3.0/pyproject.toml +112 -0
  54. embenx-1.3.0/rerank.py +53 -0
  55. embenx-1.3.0/tests/test_absolute_final_100.py +106 -0
  56. embenx-1.3.0/tests/test_benchmark.py +155 -0
  57. embenx-1.3.0/tests/test_cli.py +81 -0
  58. embenx-1.3.0/tests/test_complex_indexers.py +63 -0
  59. embenx-1.3.0/tests/test_core.py +386 -0
  60. embenx-1.3.0/tests/test_coverage_edge_cases.py +95 -0
  61. embenx-1.3.0/tests/test_coverage_final.py +117 -0
  62. embenx-1.3.0/tests/test_data.py +16 -0
  63. embenx-1.3.0/tests/test_explorer.py +14 -0
  64. embenx-1.3.0/tests/test_hybrid.py +72 -0
  65. embenx-1.3.0/tests/test_indexers.py +178 -0
  66. embenx-1.3.0/tests/test_llm.py +46 -0
  67. embenx-1.3.0/tests/test_milestone_4.py +85 -0
  68. embenx-1.3.0/tests/test_milestone_5.py +58 -0
  69. embenx-1.3.0/tests/test_more_indexers.py +71 -0
  70. embenx-1.3.0/tests/test_weaviate.py +28 -0
@@ -0,0 +1,8 @@
1
+ __pycache__/
2
+ *.pyc
3
+ .DS_Store
4
+ *.db
5
+ *.lance
6
+ docs/*.html
7
+ docs/*
8
+ !docs/index.html
embenx-1.3.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Aditya Karnam
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
embenx-1.3.0/PKG-INFO ADDED
@@ -0,0 +1,164 @@
1
+ Metadata-Version: 2.4
2
+ Name: embenx
3
+ Version: 1.3.0
4
+ Summary: Universal embedding retrieval toolkit & benchmark. Search, filter, and rerank across 15+ vector backends with a unified Python API and CLI.
5
+ Author-email: adityak74 <adityakarnam@gmail.com>
6
+ License: MIT
7
+ License-File: LICENSE
8
+ Requires-Python: >=3.10
9
+ Requires-Dist: annoy>=1.17.0
10
+ Requires-Dist: chromadb>=0.4.0
11
+ Requires-Dist: datasets>=2.14.0
12
+ Requires-Dist: duckdb>=1.0.0
13
+ Requires-Dist: faiss-cpu>=1.7.4
14
+ Requires-Dist: flashrank>=0.2.10
15
+ Requires-Dist: hnswlib>=0.8.0
16
+ Requires-Dist: lancedb>=0.2.0
17
+ Requires-Dist: litellm>=1.0.0
18
+ Requires-Dist: mcp>=1.26.0
19
+ Requires-Dist: networkx>=3.0
20
+ Requires-Dist: pillow>=10.0.0
21
+ Requires-Dist: plotly>=5.18.0
22
+ Requires-Dist: psutil>=5.9.0
23
+ Requires-Dist: psycopg2-binary>=2.9.0
24
+ Requires-Dist: pyarrow>=12.0.0
25
+ Requires-Dist: pymilvus>=2.3.0
26
+ Requires-Dist: qdrant-client>=1.5.0
27
+ Requires-Dist: rank-bm25>=0.2.2
28
+ Requires-Dist: requests>=2.31.0
29
+ Requires-Dist: rerankers>=0.10.0
30
+ Requires-Dist: rich>=10.11.0
31
+ Requires-Dist: safetensors>=0.4.0
32
+ Requires-Dist: scikit-learn>=1.3.0
33
+ Requires-Dist: streamlit>=1.32.0
34
+ Requires-Dist: typer>=0.9.0
35
+ Requires-Dist: usearch>=2.9.0
36
+ Requires-Dist: weaviate-client>=4.5.4
37
+ Description-Content-Type: text/markdown
38
+
39
+ <div align="center">
40
+
41
+ <h1>Embenx 🚀</h1>
42
+
43
+ <p>
44
+ <strong>Universal embedding retrieval toolkit & benchmark.</strong><br/>
45
+ Search, filter, and rerank across 15+ vector backends (FAISS, ScaNN, pgvector, etc.) with a unified Python API and CLI.
46
+ </p>
47
+
48
+ <p>
49
+ <a href="https://github.com/adityak74/embenx/stargazers"><img src="https://img.shields.io/github/stars/adityak74/embenx?style=flat-square&color=yellow" alt="Stars"/></a>
50
+ <a href="https://github.com/adityak74/embenx/issues"><img src="https://img.shields.io/github/issues/adityak74/embenx?style=flat-square" alt="Issues"/></a>
51
+ <a href="https://opensource.org/licenses/MIT"><img src="https://img.shields.io/badge/License-MIT-green.svg?style=flat-square" alt="MIT License"/></a>
52
+ <a href="https://www.python.org/downloads/"><img src="https://img.shields.io/badge/python-3.10+-blue.svg?style=flat-square" alt="Python 3.10+"/></a>
53
+ <a href="https://adityak74.github.io/embenx/"><img src="https://img.shields.io/badge/docs-live-brightgreen?style=flat-square" alt="Docs"/></a>
54
+ <a href="https://github.com/astral-sh/uv"><img src="https://img.shields.io/badge/uv-ready-purple.svg?style=flat-square" alt="uv ready"/></a>
55
+ </p>
56
+
57
+ <p>
58
+ <a href="https://adityak74.github.io/embenx/">Documentation</a> ·
59
+ <a href="https://github.com/adityak74/embenx/issues">Report Bug</a> ·
60
+ <a href="https://github.com/adityak74/embenx/issues">Request Feature</a>
61
+ </p>
62
+
63
+ </div>
64
+
65
+ ---
66
+
67
+ ## What is Embenx?
68
+
69
+ Embenx is a Python-native retrieval library that sits between raw vector indices and full-blown vector databases. It provides a high-level `Collection` API for managing embeddings and metadata, supporting advanced features like **filtering**, **reranking**, and **quantization** across 15+ backends.
70
+
71
+ ## Library Usage
72
+
73
+ ```python
74
+ from embenx import Collection
75
+
76
+ # 1. Initialize a collection
77
+ col = Collection(dimension=768, indexer_type="faiss-hnsw")
78
+
79
+ # 2. Add data
80
+ col.add(
81
+ vectors=[[0.1, 0.2, ...], [0.3, 0.4, ...]],
82
+ metadata=[{"category": "AI", "id": 1, "text": "The quick brown fox"}]
83
+ )
84
+
85
+ # 3. Search with filtering
86
+ results = col.search(
87
+ query=[0.1, 0.2, ...],
88
+ top_k=5,
89
+ where={"category": "AI"}
90
+ )
91
+
92
+ # 4. Export to production
93
+ col.export_to_production(backend="qdrant", connection_url="http://localhost:6333")
94
+ ```
95
+
96
+ ## Agentic Memory (MCP)
97
+
98
+ Embenx ships with a built-in **Model Context Protocol (MCP)** server. This allows AI agents (like Claude Desktop) to use Embenx collections as their own long-term memory.
99
+
100
+ ### 1. Start the server
101
+ ```bash
102
+ embenx mcp-start
103
+ ```
104
+
105
+ ## Visual Explorer
106
+
107
+ Embenx provides a built-in web UI to visualize your vector collections, including an interactive **HNSW Graph Visualizer** and a **RAG Playground**.
108
+
109
+ ```bash
110
+ embenx explorer
111
+ ```
112
+
113
+ ## Features
114
+
115
+ - **Multimodal Support** — Native support for image embeddings (CLIP).
116
+ - **RAG Playground** — Test retrieval quality with an integrated LLM chat loop.
117
+ - **HNSW Graph Visualizer** — Interactive 3D visualization of navigation layers.
118
+ - **Export to Production** — One-click migration to Qdrant or Milvus clusters.
119
+ - **Unified Collection API** — Table-like interface for vectors and metadata.
120
+ - **Retrieval Zoo** — Instant access to pre-indexed collections (SQuAD, MS-MARCO, etc.).
121
+ - **Agentic Memory (MCP)** — Native Model Context Protocol support for AI agents.
122
+ - **Self-Healing Retrieval** — Integrated feedback loops to automatically improve ranking accuracy.
123
+ - **Temporal Memory (Echo)** — Recency-biased retrieval and time-window filtering (arXiv:2502.16090).
124
+ - **Spatial Memory (ESWM)** — Neuroscience-inspired spatial cognitive maps for navigation (ICLR 2026).
125
+ - **TurboQuant Compression** — 1-bit sign-based quantization for activation tensors (arXiv:2504.19874).
126
+ - **ClusterKV Optimization** — Semantic clustering for high-throughput retrieval (arXiv:2412.03213).
127
+ - **Hybrid Search** — Combine dense vectors with sparse BM25 retrieval using RRF.
128
+ - **KV Cache Offloading (RA-KVC)** — Store and retrieve high-dimensional LLM activations using `safetensors`.
129
+ - **SSM State Hydration** — Persist and prime hidden states ($h_0$) for State Space Models (Mamba-2).
130
+ - **Trajectory Retrieval** — Search for similar state/action sequences for World Models.
131
+ - **Visual Explorer** — Built-in web UI to visualize vector clusters and metadata.
132
+ - **Universal model support** — Integrated LiteLLM for any embedding provider.
133
+ - **Portable Formats** — Native support for Parquet, NumPy (.npy/.npz), and FAISS (.index).
134
+
135
+ ## Supported Indexers
136
+
137
+ | Indexer | Family | Best For |
138
+ | :--- | :--- | :--- |
139
+ | `faiss` | HNSW, IVF, Flat | Production-grade local search |
140
+ | `scann` | Tree-AH | State-of-the-art speed/recall (Linux) |
141
+ | `usearch` | HNSW | High-performance C++, low latency |
142
+ | `pgvector` | Postgres | Embeddings next to relational data |
143
+ | `lancedb` | Columnar | Large disk-based datasets |
144
+ | `simple` | NumPy | Exact search baseline |
145
+
146
+ ## Installation
147
+
148
+ ```bash
149
+ pip install embenx
150
+ ```
151
+
152
+ ## Roadmap
153
+
154
+ See [ROADMAP.md](ROADMAP.md) for our journey towards production-grade agentic retrieval.
155
+
156
+ ## License
157
+
158
+ Distributed under the **MIT License**.
159
+
160
+ ---
161
+
162
+ <div align="center">
163
+ Built with ❤️ for the AI engineering community by <a href="https://github.com/adityak74">adityak74</a>
164
+ </div>
embenx-1.3.0/README.md ADDED
@@ -0,0 +1,126 @@
1
+ <div align="center">
2
+
3
+ <h1>Embenx 🚀</h1>
4
+
5
+ <p>
6
+ <strong>Universal embedding retrieval toolkit & benchmark.</strong><br/>
7
+ Search, filter, and rerank across 15+ vector backends (FAISS, ScaNN, pgvector, etc.) with a unified Python API and CLI.
8
+ </p>
9
+
10
+ <p>
11
+ <a href="https://github.com/adityak74/embenx/stargazers"><img src="https://img.shields.io/github/stars/adityak74/embenx?style=flat-square&color=yellow" alt="Stars"/></a>
12
+ <a href="https://github.com/adityak74/embenx/issues"><img src="https://img.shields.io/github/issues/adityak74/embenx?style=flat-square" alt="Issues"/></a>
13
+ <a href="https://opensource.org/licenses/MIT"><img src="https://img.shields.io/badge/License-MIT-green.svg?style=flat-square" alt="MIT License"/></a>
14
+ <a href="https://www.python.org/downloads/"><img src="https://img.shields.io/badge/python-3.10+-blue.svg?style=flat-square" alt="Python 3.10+"/></a>
15
+ <a href="https://adityak74.github.io/embenx/"><img src="https://img.shields.io/badge/docs-live-brightgreen?style=flat-square" alt="Docs"/></a>
16
+ <a href="https://github.com/astral-sh/uv"><img src="https://img.shields.io/badge/uv-ready-purple.svg?style=flat-square" alt="uv ready"/></a>
17
+ </p>
18
+
19
+ <p>
20
+ <a href="https://adityak74.github.io/embenx/">Documentation</a> ·
21
+ <a href="https://github.com/adityak74/embenx/issues">Report Bug</a> ·
22
+ <a href="https://github.com/adityak74/embenx/issues">Request Feature</a>
23
+ </p>
24
+
25
+ </div>
26
+
27
+ ---
28
+
29
+ ## What is Embenx?
30
+
31
+ Embenx is a Python-native retrieval library that sits between raw vector indices and full-blown vector databases. It provides a high-level `Collection` API for managing embeddings and metadata, supporting advanced features like **filtering**, **reranking**, and **quantization** across 15+ backends.
32
+
33
+ ## Library Usage
34
+
35
+ ```python
36
+ from embenx import Collection
37
+
38
+ # 1. Initialize a collection
39
+ col = Collection(dimension=768, indexer_type="faiss-hnsw")
40
+
41
+ # 2. Add data
42
+ col.add(
43
+ vectors=[[0.1, 0.2, ...], [0.3, 0.4, ...]],
44
+ metadata=[{"category": "AI", "id": 1, "text": "The quick brown fox"}]
45
+ )
46
+
47
+ # 3. Search with filtering
48
+ results = col.search(
49
+ query=[0.1, 0.2, ...],
50
+ top_k=5,
51
+ where={"category": "AI"}
52
+ )
53
+
54
+ # 4. Export to production
55
+ col.export_to_production(backend="qdrant", connection_url="http://localhost:6333")
56
+ ```
57
+
58
+ ## Agentic Memory (MCP)
59
+
60
+ Embenx ships with a built-in **Model Context Protocol (MCP)** server. This allows AI agents (like Claude Desktop) to use Embenx collections as their own long-term memory.
61
+
62
+ ### 1. Start the server
63
+ ```bash
64
+ embenx mcp-start
65
+ ```
66
+
67
+ ## Visual Explorer
68
+
69
+ Embenx provides a built-in web UI to visualize your vector collections, including an interactive **HNSW Graph Visualizer** and a **RAG Playground**.
70
+
71
+ ```bash
72
+ embenx explorer
73
+ ```
74
+
75
+ ## Features
76
+
77
+ - **Multimodal Support** — Native support for image embeddings (CLIP).
78
+ - **RAG Playground** — Test retrieval quality with an integrated LLM chat loop.
79
+ - **HNSW Graph Visualizer** — Interactive 3D visualization of navigation layers.
80
+ - **Export to Production** — One-click migration to Qdrant or Milvus clusters.
81
+ - **Unified Collection API** — Table-like interface for vectors and metadata.
82
+ - **Retrieval Zoo** — Instant access to pre-indexed collections (SQuAD, MS-MARCO, etc.).
83
+ - **Agentic Memory (MCP)** — Native Model Context Protocol support for AI agents.
84
+ - **Self-Healing Retrieval** — Integrated feedback loops to automatically improve ranking accuracy.
85
+ - **Temporal Memory (Echo)** — Recency-biased retrieval and time-window filtering (arXiv:2502.16090).
86
+ - **Spatial Memory (ESWM)** — Neuroscience-inspired spatial cognitive maps for navigation (ICLR 2026).
87
+ - **TurboQuant Compression** — 1-bit sign-based quantization for activation tensors (arXiv:2504.19874).
88
+ - **ClusterKV Optimization** — Semantic clustering for high-throughput retrieval (arXiv:2412.03213).
89
+ - **Hybrid Search** — Combine dense vectors with sparse BM25 retrieval using RRF.
90
+ - **KV Cache Offloading (RA-KVC)** — Store and retrieve high-dimensional LLM activations using `safetensors`.
91
+ - **SSM State Hydration** — Persist and prime hidden states ($h_0$) for State Space Models (Mamba-2).
92
+ - **Trajectory Retrieval** — Search for similar state/action sequences for World Models.
93
+ - **Visual Explorer** — Built-in web UI to visualize vector clusters and metadata.
94
+ - **Universal model support** — Integrated LiteLLM for any embedding provider.
95
+ - **Portable Formats** — Native support for Parquet, NumPy (.npy/.npz), and FAISS (.index).
96
+
97
+ ## Supported Indexers
98
+
99
+ | Indexer | Family | Best For |
100
+ | :--- | :--- | :--- |
101
+ | `faiss` | HNSW, IVF, Flat | Production-grade local search |
102
+ | `scann` | Tree-AH | State-of-the-art speed/recall (Linux) |
103
+ | `usearch` | HNSW | High-performance C++, low latency |
104
+ | `pgvector` | Postgres | Embeddings next to relational data |
105
+ | `lancedb` | Columnar | Large disk-based datasets |
106
+ | `simple` | NumPy | Exact search baseline |
107
+
108
+ ## Installation
109
+
110
+ ```bash
111
+ pip install embenx
112
+ ```
113
+
114
+ ## Roadmap
115
+
116
+ See [ROADMAP.md](ROADMAP.md) for our journey towards production-grade agentic retrieval.
117
+
118
+ ## License
119
+
120
+ Distributed under the **MIT License**.
121
+
122
+ ---
123
+
124
+ <div align="center">
125
+ Built with ❤️ for the AI engineering community by <a href="https://github.com/adityak74">adityak74</a>
126
+ </div>
embenx-1.3.0/SKILL.md ADDED
@@ -0,0 +1,4 @@
1
+ # Embenx Skill 🚀
2
+ Optimized for high-performance embedding retrieval and benchmarking.
3
+ Use `Collection` for high-level operations.
4
+
@@ -0,0 +1,239 @@
1
+ import importlib.util
2
+ import inspect
3
+ import os
4
+ import time
5
+ from typing import List, Dict, Any, Optional
6
+
7
+ import psutil
8
+ from rich.console import Console
9
+ from rich.table import Table
10
+
11
+ from data import load_documents
12
+ from indexers import BaseIndexer, get_indexer_map
13
+ from llm import Embedder
14
+
15
+
16
def get_memory_usage():
    """Return the resident set size (RSS) of the current process, in MB."""
    # psutil reports rss in bytes; scale to megabytes for readable deltas.
    return psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)
19
+
20
+
21
def load_custom_indexer(script_path: str, console: Console):
    """
    Dynamically load a class inheriting from BaseIndexer from a given script.

    Returns a ``(class_name, class_object)`` pair, or ``(None, None)`` after
    reporting the problem on *console* when nothing suitable could be loaded.
    """
    try:
        spec = importlib.util.spec_from_file_location("custom_indexer", script_path)
        if spec is None or spec.loader is None:
            console.print(f"[red]Could not load spec for {script_path}[/red]")
            return None, None

        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)

        # First concrete BaseIndexer subclass wins; getmembers yields members
        # sorted by name, matching the original scan order.
        for member_name, member in inspect.getmembers(module, inspect.isclass):
            if issubclass(member, BaseIndexer) and member is not BaseIndexer:
                return member_name, member

        console.print(f"[red]No class inheriting from BaseIndexer found in {script_path}[/red]")
        return None, None
    except Exception as e:
        console.print(f"[red]Error loading custom indexer from {script_path}: {e}[/red]")
        return None, None
43
+
44
+
45
def benchmark_single_indexer(name, indexer_cls, dimension, embeddings, metadata, console, cleanup=True):
    """Build, query, and measure one indexer; return a row of formatted metrics.

    Returns None when the index build fails (the failure is reported on
    *console*); otherwise a dict of pre-formatted metric strings keyed by the
    report column names.
    """
    console.print(f"\n[bold cyan]--- Benchmarking {name.upper()} ---[/bold cyan]")
    indexer = indexer_cls(dimension=dimension)

    # Build phase: wall-clock time plus the process-memory delta around it.
    mem_before = get_memory_usage()
    started = time.perf_counter()
    try:
        indexer.build_index(embeddings, metadata)
        build_time = time.perf_counter() - started
    except Exception as e:
        console.print(f"[red]Failed to build index for {name}: {e}[/red]")
        return None

    mem_diff = get_memory_usage() - mem_before
    index_size = indexer.get_size()

    # Query phase: average latency over (up to) the first 10 vectors.
    sample = embeddings[: min(10, len(embeddings))]
    query_time = 0
    if sample:
        started = time.perf_counter()
        for vector in sample:
            indexer.search(vector, top_k=5)
        query_time = (time.perf_counter() - started) / len(sample) * 1000  # ms per query

    row = {
        "Indexer": name.upper(),
        "Build Time (s)": f"{build_time:.4f}",
        "Query Time (ms)": f"{query_time:.2f}",
        "Index Size (KB)": f"{index_size / 1024:.2f}",
        "Memory Diff (MB)": f"{mem_diff:.2f}",
    }

    if cleanup:
        indexer.cleanup()

    console.print(f"Done {name.upper()}.")
    return row
85
+
86
+
87
def run_benchmark(
    dataset_name: str,
    split: str,
    text_column: str,
    max_docs: int,
    indexer_names: List[str],
    model_name: str,
    console: Console,
    data_files: Optional[str] = None,
    cleanup: bool = True,
    custom_indexer_script: Optional[str] = None,
    subset: str = "default",  # Added as optional
):
    """
    Run Embenx benchmarks. Matches original signature for test compatibility.

    Loads documents either from a local Parquet collection (when *dataset_name*
    is an existing ``.parquet`` path) or via the HF/Zoo loader, embeds them when
    no precomputed vectors are available, then benchmarks each requested
    indexer and prints a results table.

    Returns the list of per-indexer result dicts, ``[]`` when every indexer was
    skipped or failed, or ``None`` when loading/embedding failed.

    Note: *data_files* is accepted for signature compatibility but is not used
    here.
    """
    # Load Data
    console.print(f"\n[bold]Loading up to {max_docs} documents from {dataset_name}...[/bold]")

    embeddings = None  # set eagerly only when the source ships precomputed vectors
    dimension = None

    # Check if dataset_name is actually a path (Parquet benchmark use case)
    if os.path.exists(dataset_name) and dataset_name.endswith(".parquet"):
        from core import Collection
        col = Collection.from_parquet(dataset_name)
        docs = col._metadata
        embeddings = col._vectors.tolist()
        dimension = col.dimension
    else:
        # Standard HF/Zoo load
        docs = load_documents(dataset_name, subset, split, max_docs)

    if not docs:
        console.print("[red]No documents loaded. Exiting.[/red]")
        return
    console.print(f"Loaded {len(docs)} documents.")

    # Embed Data — skipped for Parquet collections, which already carry vectors.
    # (BUG FIX: previously this section ran unconditionally, discarding the
    # precomputed Parquet embeddings and re-embedding the documents from text.)
    if embeddings is None:
        console.print(f"\n[bold]Generating embeddings using LiteLLM ({model_name})...[/bold]")
        embedder = Embedder(model_name)

        # Fall back to common field names when the requested column is absent.
        text_field = text_column
        if text_field not in docs[0] and "text" in docs[0]:
            text_field = "text"
        elif text_field not in docs[0] and "content" in docs[0]:
            text_field = "content"

        texts = [d.get(text_field, str(d)) for d in docs]

        t0 = time.perf_counter()
        embeddings = embedder.embed_texts(texts)
        emb_time = time.perf_counter() - t0

        if not embeddings:
            console.print("[red]Failed to generate embeddings.[/red]")
            return

        dimension = len(embeddings[0])
        console.print(f"Generated {len(embeddings)} embeddings of dimension {dimension} in {emb_time:.2f}s.")

    # Initialize Indexers
    indexers_map = get_indexer_map()

    if custom_indexer_script:
        custom_name, custom_cls = load_custom_indexer(custom_indexer_script, console)
        if custom_cls:
            c_name_lower = custom_name.lower()
            indexers_map[c_name_lower] = custom_cls
            console.print(f"[green]✓[/green] Successfully loaded custom indexer: [bold]{custom_name}[/bold]")
            if c_name_lower not in [x.lower() for x in indexer_names]:
                indexer_names.append(c_name_lower)

    results = []
    for name in indexer_names:
        name_lower = name.lower()
        if name_lower not in indexers_map:
            console.print(f"[yellow]Warning: Indexer '{name}' not found. Skipping.[/yellow]")
            continue

        res = benchmark_single_indexer(
            name, indexers_map[name_lower], dimension, embeddings, docs, console, cleanup
        )
        if res:
            results.append(res)

    # Report
    if results:
        display_results(results, console)
        return results
    return []
173
+
174
+
175
def display_results(results, console):
    """Render the benchmark result rows as a rich table on *console*."""
    console.print("\n[bold green]Benchmark Results[/bold green]")

    table = Table(show_header=True, header_style="bold magenta")
    table.add_column("Indexer", style="cyan")
    for heading in (
        "Build Time (s)",
        "Query Time (ms/query)",
        "Index Size (KB)",
        "Memory Added (MB)",
    ):
        table.add_column(heading, justify="right")

    # Row cells come pre-formatted from benchmark_single_indexer, keyed by
    # the original metric names (the table headings differ slightly).
    keys = ("Indexer", "Build Time (s)", "Query Time (ms)", "Index Size (KB)", "Memory Diff (MB)")
    for row in results:
        table.add_row(*(row[key] for key in keys))

    console.print(table)
193
+
194
def generate_report(results: List[Dict[str, Any]], dataset_name: str, output_path: str = "benchmark_report.md"):
    """
    Generate a formatted Markdown technical report from benchmark results.

    Args:
        results: Rows produced by the benchmark run; each maps the column names
            ("Indexer", "Build Time (s)", "Query Time (ms)", "Index Size (KB)",
            "Memory Diff (MB)") to pre-formatted strings.
        dataset_name: Human-readable dataset label shown in the report header.
        output_path: Destination file for the Markdown report.

    Returns:
        The path the report was written to (``output_path``).
    """
    import datetime

    report = []
    report.append("# Embenx Retrieval Benchmark Report 🚀")
    report.append(f"Generated on: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report.append(f"Dataset: **{dataset_name}**")
    report.append("\n## Executive Summary")

    if not results:
        report.append("No results to report.")
    else:
        # Find winners: lowest query latency and smallest on-disk index.
        query_times = [float(r["Query Time (ms)"]) for r in results]
        fastest = results[query_times.index(min(query_times))]["Indexer"]

        sizes = [float(r["Index Size (KB)"]) for r in results]
        smallest = results[sizes.index(min(sizes))]["Indexer"]

        report.append(f"- **Fastest Indexer**: {fastest} ({min(query_times):.2f} ms/query)")
        report.append(f"- **Most Memory Efficient**: {smallest} ({min(sizes):.2f} KB)")

    report.append("\n## Results Table")
    report.append("| Indexer | Build Time (s) | Query Time (ms) | Index Size (KB) | Memory Diff (MB) |")
    report.append("| :--- | :--- | :--- | :--- | :--- |")

    for r in results:
        report.append(
            f"| {r['Indexer']} | {r['Build Time (s)']} | {r['Query Time (ms)']} | "
            f"{r['Index Size (KB)']} | {r['Memory Diff (MB)']} |"
        )

    report.append("\n## Analysis & Recommendations")
    report.append("Based on the data above, we recommend:")
    indexer_names = [r["Indexer"] for r in results]
    if "FAISS-HNSW" in indexer_names:
        report.append("- Use **FAISS-HNSW** for production-grade local search balancing speed and memory.")
    if "SCANN" in indexer_names:
        report.append("- Use **ScaNN** for state-of-the-art speed/recall if on supported hardware.")
    report.append("- For ultra-low latency requirements, prioritize indexers with sub-1ms query times.")

    # BUG FIX: write explicitly as UTF-8 — the report contains emoji, which the
    # platform-default encoding (e.g. cp1252 on Windows) cannot encode, making
    # the unqualified open() raise UnicodeEncodeError there.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n".join(report))

    return output_path