vecforge 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vecforge/__init__.py +59 -0
- vecforge/cli/__init__.py +3 -0
- vecforge/cli/main.py +197 -0
- vecforge/core/__init__.py +3 -0
- vecforge/core/bm25.py +187 -0
- vecforge/core/embedder.py +152 -0
- vecforge/core/indexer.py +196 -0
- vecforge/core/reranker.py +120 -0
- vecforge/core/storage.py +493 -0
- vecforge/core/vault.py +760 -0
- vecforge/exceptions.py +164 -0
- vecforge/ingest/__init__.py +3 -0
- vecforge/ingest/dispatcher.py +181 -0
- vecforge/ingest/document.py +237 -0
- vecforge/search/__init__.py +3 -0
- vecforge/search/cascade.py +186 -0
- vecforge/search/filters.py +146 -0
- vecforge/search/hybrid.py +146 -0
- vecforge/security/__init__.py +3 -0
- vecforge/security/audit.py +169 -0
- vecforge/security/encryption.py +84 -0
- vecforge/security/namespaces.py +127 -0
- vecforge/security/rbac.py +172 -0
- vecforge/security/snapshots.py +135 -0
- vecforge/server/__init__.py +3 -0
- vecforge/server/app.py +54 -0
- vecforge/server/routes.py +215 -0
- vecforge-0.2.0.dist-info/METADATA +302 -0
- vecforge-0.2.0.dist-info/RECORD +34 -0
- vecforge-0.2.0.dist-info/WHEEL +5 -0
- vecforge-0.2.0.dist-info/entry_points.txt +2 -0
- vecforge-0.2.0.dist-info/licenses/LICENSE +45 -0
- vecforge-0.2.0.dist-info/licenses/NOTICE +14 -0
- vecforge-0.2.0.dist-info/top_level.txt +1 -0
vecforge/core/indexer.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
# VecForge — Universal Local-First Vector Database
|
|
2
|
+
# Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
|
|
3
|
+
# Built by Suneel Bose K (Founder & CEO, ArcGX TechLabs)
|
|
4
|
+
#
|
|
5
|
+
# Licensed under the Business Source License 1.1 (BSL 1.1)
|
|
6
|
+
# Free for personal, research, open-source, and non-commercial use.
|
|
7
|
+
# Commercial use requires a separate license from ArcGX TechLabs.
|
|
8
|
+
# See LICENSE file in the project root or contact: suneelbose@arcgx.in
|
|
9
|
+
|
|
10
|
+
"""
|
|
11
|
+
FAISS index management for VecForge.
|
|
12
|
+
|
|
13
|
+
Provides nearest neighbour search using FAISS.
|
|
14
|
+
New indexes are flat (exact inner product). The IVF constants defined below
|
|
15
|
+
are not referenced by this module, so no automatic IVF training happens here.
|
|
16
|
+
|
|
17
|
+
Built by Suneel Bose K · ArcGX TechLabs Private Limited.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import logging
|
|
23
|
+
|
|
24
|
+
import faiss
|
|
25
|
+
import numpy as np
|
|
26
|
+
from numpy.typing import NDArray
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
# perf: Intended collection size above which an IVF index would replace the
# flat one. NOTE(review): none of the three constants below is referenced by
# the FaissIndexer code in this file — confirm whether another module uses them.
|
|
31
|
+
_IVF_THRESHOLD = 10_000
|
|
32
|
+
_IVF_NLIST = 100 # number of Voronoi cells (inverted lists) for an IVF index
|
|
33
|
+
_IVF_NPROBE = 10 # number of cells to visit per query on an IVF index
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class FaissIndexer:
    """FAISS-backed vector index for nearest-neighbour search.

    A freshly constructed indexer wraps a ``faiss.IndexFlatIP`` (exact
    inner-product search). An index of any other FAISS type can only enter
    through :meth:`from_bytes`; nothing in this class builds or trains an
    IVF index.

    Built by Suneel Bose K · ArcGX TechLabs Private Limited.

    Args:
        dimension: Embedding vector dimension.

    Performance:
        Flat index: O(N * d) per search — exact.

    Example:
        >>> indexer = FaissIndexer(dimension=384)
        >>> indexer.add(np.random.randn(100, 384).astype(np.float32))
        >>> distances, indices = indexer.search(query_vec, top_k=5)
    """

    def __init__(self, dimension: int) -> None:
        self._dimension = dimension
        self._index: faiss.Index = faiss.IndexFlatIP(dimension)
        # Vector count is tracked here rather than read from FAISS; add(),
        # reset() and from_bytes() keep it in step with the index.
        self._count = 0
        # False for every index built by this class; from_bytes() derives it
        # from the type of the loaded index.
        self._is_ivf = False

    @property
    def count(self) -> int:
        """Return number of vectors in the index.

        Performance:
            Time: O(1)
        """
        return self._count

    @property
    def dimension(self) -> int:
        """Return vector dimension.

        Performance:
            Time: O(1)
        """
        return self._dimension

    def _validated_matrix(self, array, label: str) -> NDArray[np.float32]:
        """Return ``array`` as a contiguous float32 matrix of shape (n, dimension).

        A 1-D input is treated as a single row.

        Raises:
            ValueError: If the input is not 1-D/2-D or its width differs
                from the index dimension.
        """
        matrix = np.asarray(array)
        if matrix.ndim == 1:
            matrix = matrix.reshape(1, -1)

        # security: Validate rank and dimension before handing data to FAISS
        if matrix.ndim != 2 or matrix.shape[1] != self._dimension:
            got = matrix.shape[1] if matrix.ndim == 2 else matrix.shape
            raise ValueError(
                f"{label} dimension mismatch: expected {self._dimension}, "
                f"got {got}.\n"
                f"VecForge by Suneel Bose K · ArcGX TechLabs"
            )

        # perf: Ensure contiguous float32 for FAISS
        return np.ascontiguousarray(matrix, dtype=np.float32)

    def add(self, vectors: NDArray[np.float32]) -> None:
        """Add vectors to the index.

        Args:
            vectors: Array of shape (n, dimension), or a single vector of
                shape (dimension,). Converted to contiguous float32.

        Raises:
            ValueError: If the input is not 1-D/2-D or its width differs
                from the index dimension.

        Example:
            >>> indexer = FaissIndexer(384)
            >>> indexer.add(np.random.randn(50, 384).astype(np.float32))
            >>> indexer.count
            50
        """
        vectors = self._validated_matrix(vectors, "Vector")
        self._index.add(vectors)
        self._count += vectors.shape[0]

    def search(
        self, query: NDArray[np.float32], top_k: int = 10
    ) -> tuple[NDArray[np.float32], NDArray[np.int64]]:
        """Search for nearest neighbours to query vector.

        Args:
            query: Query vector of shape (dimension,) or (1, dimension).
                If several rows are passed, only the first row's results
                are returned.
            top_k: Number of nearest neighbours to return. Clamped to the
                number of stored vectors; zero or negative yields no results.

        Returns:
            Tuple of (distances, indices) as returned by the FAISS index for
            the first query row, each of length ``min(top_k, count)``. Both
            are empty arrays when there is nothing to return.

        Raises:
            ValueError: If the index is non-empty and the query width
                differs from the index dimension.

        Example:
            >>> distances, indices = indexer.search(query_vec, top_k=5)
            >>> print(f"Best match: index={indices[0]}, score={distances[0]:.4f}")
        """
        # why: Clamp top_k to available vectors; FAISS rejects k <= 0
        effective_k = min(top_k, self._count)
        if effective_k <= 0:
            return (
                np.array([], dtype=np.float32),
                np.array([], dtype=np.int64),
            )

        # why: add() validates dimensions, so search() must agree with it
        query = self._validated_matrix(query, "Query")

        distances, indices = self._index.search(query, effective_k)
        return distances[0], indices[0]

    def to_bytes(self) -> bytes:
        """Serialize the FAISS index to bytes for storage.

        Returns:
            Raw bytes of the serialized FAISS index.
        """
        data: bytes = faiss.serialize_index(self._index).tobytes()
        return data

    @classmethod
    def from_bytes(cls, data: bytes, dimension: int) -> "FaissIndexer":
        """Deserialize a FAISS index from bytes.

        Args:
            data: Serialized FAISS index bytes (as produced by ``to_bytes``).
            dimension: Expected embedding dimension.

        Returns:
            Reconstructed FaissIndexer instance whose count is the loaded
            index's ``ntotal``.

        Raises:
            ValueError: If the loaded index's dimension differs from
                ``dimension``.
        """
        index_array = np.frombuffer(data, dtype=np.uint8)
        index = faiss.deserialize_index(index_array)

        # security: Reject a stored index that does not match the caller's
        # embedding dimension instead of failing later inside FAISS.
        if index.d != dimension:
            raise ValueError(
                f"Index dimension mismatch: expected {dimension}, "
                f"got {index.d}.\n"
                f"VecForge by Suneel Bose K · ArcGX TechLabs"
            )

        instance = cls(dimension)
        instance._index = index
        instance._count = index.ntotal
        # why: Keep the flag truthful for indexes that were not built here
        instance._is_ivf = isinstance(index, faiss.IndexIVF)
        return instance

    def reset(self) -> None:
        """Reset the index, removing all vectors.

        Replaces the current index with a new empty ``IndexFlatIP``.
        """
        self._index = faiss.IndexFlatIP(self._dimension)
        self._count = 0
        self._is_ivf = False
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# VecForge — Universal Local-First Vector Database
|
|
2
|
+
# Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
|
|
3
|
+
# Built by Suneel Bose K (Founder & CEO, ArcGX TechLabs)
|
|
4
|
+
#
|
|
5
|
+
# Licensed under the Business Source License 1.1 (BSL 1.1)
|
|
6
|
+
# Free for personal, research, open-source, and non-commercial use.
|
|
7
|
+
# Commercial use requires a separate license from ArcGX TechLabs.
|
|
8
|
+
# See LICENSE file in the project root or contact: suneelbose@arcgx.in
|
|
9
|
+
|
|
10
|
+
"""
|
|
11
|
+
Cross-encoder reranker for VecForge.
|
|
12
|
+
|
|
13
|
+
Provides high-precision reranking of search candidates using a
|
|
14
|
+
cross-encoder model. Applied as the final stage of the cascade
|
|
15
|
+
search pipeline for improved accuracy.
|
|
16
|
+
|
|
17
|
+
Built by Suneel Bose K · ArcGX TechLabs Private Limited.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import logging
|
|
23
|
+
from typing import Any
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
_DEFAULT_RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"


class Reranker:
    """Cross-encoder reranker for precision search refinement.

    The cross-encoder model is loaded lazily on the first call that needs
    it. Candidates are scored as (query, document) pairs and returned in
    descending score order.

    Built by Suneel Bose K · ArcGX TechLabs Private Limited.

    Args:
        model_name: Cross-encoder model name.
            Defaults to 'cross-encoder/ms-marco-MiniLM-L-6-v2'.

    Example:
        >>> reranker = Reranker()
        >>> scored = reranker.rerank("diabetes treatment", candidates)
        >>> print(scored[0])  # highest relevance
    """

    def __init__(self, model_name: str = _DEFAULT_RERANK_MODEL) -> None:
        self._model_name = model_name
        # None until _load_model() has run successfully.
        self._model: Any = None

    def _load_model(self) -> None:
        """Load the cross-encoder model if it has not been loaded yet.

        Raises:
            ImportError: If ``sentence-transformers`` is not installed.
        """
        if self._model is not None:
            return

        # why: Import lazily so the package works without the optional
        # sentence-transformers dependency until reranking is requested.
        try:
            from sentence_transformers import CrossEncoder
        except ImportError as e:
            raise ImportError(
                "sentence-transformers is required for reranking.\n"
                "Install with: pip install sentence-transformers\n"
                "VecForge by Suneel Bose K · ArcGX TechLabs"
            ) from e

        logger.info("Loading reranker model: %s", self._model_name)
        self._model = CrossEncoder(self._model_name)
        logger.info("Reranker model loaded: %s", self._model_name)

    def rerank(
        self,
        query: str,
        texts: list[str],
        top_k: int | None = None,
    ) -> list[tuple[int, float]]:
        """Rerank candidate texts by relevance to query.

        Args:
            query: Search query string.
            texts: List of candidate document texts to rerank.
            top_k: Number of top results to return. If None, returns all.
                Zero or negative returns an empty list.

        Returns:
            List of (original_index, score) tuples sorted by descending
            relevance score. original_index maps back to the input texts.
            Candidates with equal scores keep their input order.

        Raises:
            ImportError: If the model must be loaded and
                ``sentence-transformers`` is not installed.

        Example:
            >>> ranked = reranker.rerank("hip fracture", ["broken hip", "diabetes"])
            >>> ranked[0]  # (0, 0.95) — "broken hip" most relevant
        """
        if not texts:
            return []

        # why: A negative top_k would otherwise slice from the end of the
        # ranking; nothing was requested, so skip loading and scoring too.
        if top_k is not None and top_k <= 0:
            return []

        self._load_model()

        # why: Cross-encoder expects (query, doc) pairs
        pairs = [(query, text) for text in texts]
        scores = self._model.predict(pairs)

        # why: Pair each score with its input position, best score first
        ranked = sorted(
            ((position, float(score)) for position, score in enumerate(scores)),
            key=lambda item: item[1],
            reverse=True,
        )

        return ranked if top_k is None else ranked[:top_k]
|