vecforge 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,196 @@
1
+ # VecForge — Universal Local-First Vector Database
2
+ # Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
3
+ # Built by Suneel Bose K (Founder & CEO, ArcGX TechLabs)
4
+ #
5
+ # Licensed under the Business Source License 1.1 (BSL 1.1)
6
+ # Free for personal, research, open-source, and non-commercial use.
7
+ # Commercial use requires a separate license from ArcGX TechLabs.
8
+ # See LICENSE file in the project root or contact: suneelbose@arcgx.in
9
+
10
+ """
11
+ FAISS index management for VecForge.
12
+
13
+ Provides nearest neighbour search using FAISS.
14
+ FaissIndexer creates a flat (exact) inner-product index. The IVF (approximate)
15
+ settings defined below are not yet applied automatically as the collection grows.
16
+
17
+ Built by Suneel Bose K · ArcGX TechLabs Private Limited.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import logging
23
+
24
+ import faiss
25
+ import numpy as np
26
+ from numpy.typing import NDArray
27
+
28
logger = logging.getLogger(__name__)

# perf: Intended settings for an IVF (approximate) index.
# NOTE(review): nothing in this module reads these three constants —
# FaissIndexer only ever constructs an IndexFlatIP. Confirm whether the IVF
# upgrade is implemented elsewhere or is still pending.
_IVF_THRESHOLD = 10_000  # collection size above which IVF is meant to be used
_IVF_NLIST = 100  # number of Voronoi cells for IVF
_IVF_NPROBE = 10  # number of cells to search


class FaissIndexer:
    """FAISS-based vector index for nearest-neighbour search.

    A new instance wraps an ``IndexFlatIP`` (exact inner-product search).
    An index restored through :meth:`from_bytes` keeps whatever FAISS index
    type was serialized.

    Built by Suneel Bose K · ArcGX TechLabs Private Limited.

    Args:
        dimension: Embedding vector dimension.

    Performance:
        Flat index: O(N * d) search — exact.

    Example:
        >>> indexer = FaissIndexer(dimension=384)
        >>> indexer.add(np.random.randn(100, 384).astype(np.float32))
        >>> distances, indices = indexer.search(query_vec, top_k=5)
    """

    def __init__(self, dimension: int) -> None:
        self._dimension = dimension
        self._index: faiss.Index = faiss.IndexFlatIP(dimension)
        # Tracked separately from the FAISS index so `count` stays O(1).
        self._count = 0
        self._is_ivf = False

    @property
    def count(self) -> int:
        """Return number of vectors in the index.

        Performance:
            Time: O(1)
        """
        return self._count

    @property
    def dimension(self) -> int:
        """Return vector dimension.

        Performance:
            Time: O(1)
        """
        return self._dimension

    def _check_dimension(self, what: str, actual: int) -> None:
        """Raise ValueError when *actual* differs from the index dimension.

        Args:
            what: Label used as the first word of the error message.
            actual: Size of the last axis of the array being checked.

        Raises:
            ValueError: If ``actual`` is not equal to ``self._dimension``.
        """
        if actual != self._dimension:
            raise ValueError(
                f"{what} dimension mismatch: expected {self._dimension}, "
                f"got {actual}.\n"
                f"VecForge by Suneel Bose K · ArcGX TechLabs"
            )

    def add(self, vectors: NDArray[np.float32]) -> None:
        """Add vectors to the index.

        Args:
            vectors: Array of shape (n, dimension), or a single vector of
                shape (dimension,). Converted to contiguous float32.

        Raises:
            ValueError: If the array is not 1-D or 2-D, or if its last axis
                does not match the index dimension.

        Example:
            >>> indexer = FaissIndexer(384)
            >>> indexer.add(np.random.randn(50, 384).astype(np.float32))
            >>> indexer.count
            50
        """
        if vectors.ndim == 1:
            vectors = vectors.reshape(1, -1)

        # why: Reject 0-D / >2-D input up front with a clear message instead
        # of failing later on shape indexing or inside FAISS.
        if vectors.ndim != 2:
            raise ValueError(
                f"Vectors must be a 1-D or 2-D array, got {vectors.ndim}-D.\n"
                f"VecForge by Suneel Bose K · ArcGX TechLabs"
            )

        # security: Validate dimensions match
        self._check_dimension("Vector", vectors.shape[1])

        # perf: Ensure contiguous float32 for FAISS
        vectors = np.ascontiguousarray(vectors, dtype=np.float32)

        self._index.add(vectors)
        self._count += vectors.shape[0]

    def search(
        self, query: NDArray[np.float32], top_k: int = 10
    ) -> tuple[NDArray[np.float32], NDArray[np.int64]]:
        """Search for nearest neighbours to query vector.

        Args:
            query: Query vector of shape (dimension,) or (1, dimension).
            top_k: Number of nearest neighbours to return.

        Returns:
            Tuple of (distances, indices) arrays, each of shape
            (min(top_k, count),). Both are empty when the index holds no
            vectors or ``top_k`` is zero or negative. Only the results for
            the first query row are returned.

        Raises:
            ValueError: If results are requested and the query is not 1-D or
                2-D, or its last axis does not match the index dimension.

        Example:
            >>> distances, indices = indexer.search(query_vec, top_k=5)
            >>> print(f"Best match: index={indices[0]}, score={distances[0]:.4f}")
        """
        if query.ndim == 1:
            query = query.reshape(1, -1)

        # perf: Ensure contiguous float32
        query = np.ascontiguousarray(query, dtype=np.float32)

        # why: Clamp top_k to available vectors; a non-positive value means
        # "no results" and must never be handed to FAISS as k.
        effective_k = min(top_k, self._count)
        if effective_k <= 0:
            return (
                np.array([], dtype=np.float32),
                np.array([], dtype=np.int64),
            )

        if query.ndim != 2:
            raise ValueError(
                f"Query must be a 1-D or 2-D array, got {query.ndim}-D.\n"
                f"VecForge by Suneel Bose K · ArcGX TechLabs"
            )

        # security: Same dimension guard as add()
        self._check_dimension("Query", query.shape[1])

        distances, indices = self._index.search(query, effective_k)
        return distances[0], indices[0]

    def to_bytes(self) -> bytes:
        """Serialize the FAISS index to bytes for storage.

        Returns:
            Raw bytes of the serialized FAISS index.
        """
        data: bytes = faiss.serialize_index(self._index).tobytes()
        return data

    @classmethod
    def from_bytes(cls, data: bytes, dimension: int) -> FaissIndexer:
        """Deserialize a FAISS index from bytes.

        Args:
            data: Serialized FAISS index bytes.
            dimension: Expected embedding dimension.

        Returns:
            Reconstructed FaissIndexer instance whose count is taken from
            the deserialized index.

        Raises:
            ValueError: If the deserialized index was built for a different
                dimension than ``dimension``.
        """
        index_array = np.frombuffer(data, dtype=np.uint8)
        index = faiss.deserialize_index(index_array)

        # security: Refuse an index that disagrees with the declared
        # dimension; otherwise add()/search() would validate against the
        # wrong size.
        if index.d != dimension:
            raise ValueError(
                f"Serialized index dimension mismatch: expected {dimension}, "
                f"got {index.d}.\n"
                f"VecForge by Suneel Bose K · ArcGX TechLabs"
            )

        instance = cls(dimension)
        instance._index = index
        instance._count = index.ntotal
        # why: Keep the flag in step with the loaded index type. Only an
        # index that is itself an IVF index is detected here.
        instance._is_ivf = isinstance(
            faiss.downcast_index(index), faiss.IndexIVF
        )
        return instance

    def reset(self) -> None:
        """Reset the index, removing all vectors.

        Replaces the current index with a fresh, empty IndexFlatIP.
        """
        self._index = faiss.IndexFlatIP(self._dimension)
        self._count = 0
        self._is_ivf = False
@@ -0,0 +1,120 @@
1
+ # VecForge — Universal Local-First Vector Database
2
+ # Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
3
+ # Built by Suneel Bose K (Founder & CEO, ArcGX TechLabs)
4
+ #
5
+ # Licensed under the Business Source License 1.1 (BSL 1.1)
6
+ # Free for personal, research, open-source, and non-commercial use.
7
+ # Commercial use requires a separate license from ArcGX TechLabs.
8
+ # See LICENSE file in the project root or contact: suneelbose@arcgx.in
9
+
10
+ """
11
+ Cross-encoder reranker for VecForge.
12
+
13
+ Provides high-precision reranking of search candidates using a
14
+ cross-encoder model. Applied as the final stage of the cascade
15
+ search pipeline for improved accuracy.
16
+
17
+ Built by Suneel Bose K · ArcGX TechLabs Private Limited.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import logging
23
+ from typing import Any
24
+
25
logger = logging.getLogger(__name__)

# Model used when Reranker is constructed without an explicit model_name.
_DEFAULT_RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"


class Reranker:
    """Cross-encoder reranker for precision search refinement.

    Lazily loads the cross-encoder model on first use, then scores each
    (query, document) pair and orders candidates by that score.

    Built by Suneel Bose K · ArcGX TechLabs Private Limited.

    Args:
        model_name: Cross-encoder model name.
            Defaults to 'cross-encoder/ms-marco-MiniLM-L-6-v2'.

    Example:
        >>> reranker = Reranker()
        >>> scored = reranker.rerank("diabetes treatment", candidates)
        >>> print(scored[0])  # highest relevance
    """

    def __init__(self, model_name: str = _DEFAULT_RERANK_MODEL) -> None:
        self._model_name = model_name
        # Populated by _load_model() on the first rerank() call.
        self._model: Any = None

    def _load_model(self) -> None:
        """Load the cross-encoder model unless it is already loaded.

        Raises:
            ImportError: If sentence-transformers is not installed.
        """
        if self._model is not None:
            return

        # why: Import lazily so the package is only needed when reranking
        # is actually used.
        try:
            from sentence_transformers import CrossEncoder
        except ImportError as e:
            raise ImportError(
                "sentence-transformers is required for reranking.\n"
                "Install with: pip install sentence-transformers\n"
                "VecForge by Suneel Bose K · ArcGX TechLabs"
            ) from e

        logger.info("Loading reranker model: %s", self._model_name)
        self._model = CrossEncoder(self._model_name)
        logger.info("Reranker model loaded: %s", self._model_name)

    def rerank(
        self,
        query: str,
        texts: list[str],
        top_k: int | None = None,
    ) -> list[tuple[int, float]]:
        """Rerank candidate texts by relevance to query.

        Args:
            query: Search query string.
            texts: List of candidate document texts to rerank.
            top_k: Number of top results to return. If None, returns all.
                Zero or a negative value returns an empty list.

        Returns:
            List of (original_index, score) tuples sorted by descending
            relevance score. original_index maps back to the input texts.
            Empty when ``texts`` is empty (the model is not loaded then).

        Raises:
            ImportError: If sentence-transformers is not installed.

        Example:
            >>> ranked = reranker.rerank("hip fracture", ["broken hip", "diabetes"])
            >>> ranked[0]  # (0, 0.95) — "broken hip" most relevant
        """
        if not texts:
            return []

        self._load_model()

        # why: Cross-encoder expects (query, doc) pairs
        pairs = [(query, text) for text in texts]
        scores = self._model.predict(pairs)

        # why: Pair each score with its input position, best score first.
        # sorted() is stable, so equal scores keep their input order.
        ranked = sorted(
            ((position, float(score)) for position, score in enumerate(scores)),
            key=lambda item: item[1],
            reverse=True,
        )

        if top_k is not None:
            # why: A negative bound would slice from the end and silently
            # drop the lowest-ranked items; treat it as "no results".
            ranked = ranked[: max(top_k, 0)]

        return ranked