vecforge 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vecforge/__init__.py +59 -0
- vecforge/cli/__init__.py +3 -0
- vecforge/cli/main.py +197 -0
- vecforge/core/__init__.py +3 -0
- vecforge/core/bm25.py +187 -0
- vecforge/core/embedder.py +152 -0
- vecforge/core/indexer.py +196 -0
- vecforge/core/reranker.py +120 -0
- vecforge/core/storage.py +493 -0
- vecforge/core/vault.py +760 -0
- vecforge/exceptions.py +164 -0
- vecforge/ingest/__init__.py +3 -0
- vecforge/ingest/dispatcher.py +181 -0
- vecforge/ingest/document.py +237 -0
- vecforge/search/__init__.py +3 -0
- vecforge/search/cascade.py +186 -0
- vecforge/search/filters.py +146 -0
- vecforge/search/hybrid.py +146 -0
- vecforge/security/__init__.py +3 -0
- vecforge/security/audit.py +169 -0
- vecforge/security/encryption.py +84 -0
- vecforge/security/namespaces.py +127 -0
- vecforge/security/rbac.py +172 -0
- vecforge/security/snapshots.py +135 -0
- vecforge/server/__init__.py +3 -0
- vecforge/server/app.py +54 -0
- vecforge/server/routes.py +215 -0
- vecforge-0.2.0.dist-info/METADATA +302 -0
- vecforge-0.2.0.dist-info/RECORD +34 -0
- vecforge-0.2.0.dist-info/WHEEL +5 -0
- vecforge-0.2.0.dist-info/entry_points.txt +2 -0
- vecforge-0.2.0.dist-info/licenses/LICENSE +45 -0
- vecforge-0.2.0.dist-info/licenses/NOTICE +14 -0
- vecforge-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# VecForge — Universal Local-First Vector Database
|
|
2
|
+
# Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
|
|
3
|
+
# Built by Suneel Bose K (Founder & CEO, ArcGX TechLabs)
|
|
4
|
+
#
|
|
5
|
+
# Licensed under the Business Source License 1.1 (BSL 1.1)
|
|
6
|
+
# Free for personal, research, open-source, and non-commercial use.
|
|
7
|
+
# Commercial use requires a separate license from ArcGX TechLabs.
|
|
8
|
+
# See LICENSE file in the project root or contact: suneelbose@arcgx.in
|
|
9
|
+
|
|
10
|
+
"""
|
|
11
|
+
4-stage cascading retrieval pipeline for VecForge.
|
|
12
|
+
|
|
13
|
+
Pipeline stages:
|
|
14
|
+
1. FAISS dense retrieval (broad recall)
|
|
15
|
+
2. BM25 keyword merge via hybrid fusion (precision boost)
|
|
16
|
+
3. Metadata + namespace filtering
|
|
17
|
+
4. Optional cross-encoder reranking
|
|
18
|
+
|
|
19
|
+
Built by Suneel Bose K · ArcGX TechLabs Private Limited.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import logging
|
|
25
|
+
from dataclasses import dataclass
|
|
26
|
+
from typing import Any
|
|
27
|
+
|
|
28
|
+
import numpy as np
|
|
29
|
+
from numpy.typing import NDArray
|
|
30
|
+
|
|
31
|
+
from vecforge.core.bm25 import BM25Engine
|
|
32
|
+
from vecforge.core.indexer import FaissIndexer
|
|
33
|
+
from vecforge.core.reranker import Reranker
|
|
34
|
+
from vecforge.search.filters import MetadataFilter
|
|
35
|
+
from vecforge.search.hybrid import reciprocal_rank_fusion
|
|
36
|
+
|
|
37
|
+
logger = logging.getLogger(__name__)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class CascadeCandidate:
    """A working record that flows through the cascade search pipeline.

    Carries the document reference plus whatever scoring and payload
    information each pipeline stage has attached so far. Fields other
    than ``doc_index`` and ``score`` start at neutral defaults and are
    filled in as the cascade progresses.
    """

    doc_index: int  # index in the document store
    score: float  # current relevance score (updated per stage)
    text: str = ""  # document text (loaded during cascade)
    metadata: dict[str, Any] | None = None  # document metadata, if loaded
    namespace: str = "default"  # namespace the document belongs to
    doc_id: str = ""  # unique document identifier
    modality: str = "text"  # content modality
    created_at: float = 0.0  # document creation timestamp
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class CascadeSearcher:
    """4-stage cascading search pipeline.

    Processes search through increasingly precise stages:
    1. Dense: FAISS retrieves broad candidate set
    2. Sparse: BM25 scores merged via RRF or linear fusion
    3. Filter: Metadata and namespace filtering applied
    4. Rerank: Optional cross-encoder reranking for precision

    Args:
        indexer: FAISS index for dense retrieval.
        bm25: BM25 engine for keyword search.
        reranker: Optional cross-encoder reranker.

    Example:
        >>> searcher = CascadeSearcher(indexer, bm25_engine)
        >>> results = searcher.search(query_vec, "diabetes", top_k=10)
    """

    def __init__(
        self,
        indexer: FaissIndexer,
        bm25: BM25Engine,
        reranker: Reranker | None = None,
    ) -> None:
        self._indexer = indexer
        self._bm25 = bm25
        self._reranker = reranker

    def search(
        self,
        query_vector: NDArray[np.float32],
        query_text: str,
        top_k: int = 10,
        alpha: float = 0.5,
        rerank: bool = False,
        filters: dict[str, Any] | None = None,
        recency_weight: float = 0.0,
    ) -> list[CascadeCandidate]:
        """Execute 4-stage cascading search.

        Args:
            query_vector: Dense query embedding from embedder.
            query_text: Original query string for BM25 and reranking.
            top_k: Number of final results to return.
            alpha: Semantic vs keyword weight (0.0-1.0).
            rerank: Enable cross-encoder reranking (Stage 4).
            filters: Metadata filter conditions.
            recency_weight: Weight for document recency (0.0-1.0).
                NOTE(review): accepted for interface compatibility but not
                applied in this method — candidates built here carry
                ``created_at=0.0``; confirm recency boosting happens in a
                caller before relying on this parameter.

        Returns:
            List of CascadeCandidate sorted by descending relevance.
        """
        # why: Retrieve more candidates than top_k to allow filtering
        # fix: use <= 0 (was == 0) so a non-positive top_k cannot push a
        # negative k into the FAISS search call.
        retrieval_k = min(top_k * 4, self._indexer.count)
        if retrieval_k <= 0:
            return []

        # ─── Stage 1: Dense retrieval via FAISS ───
        dense_scores, dense_ids = self._indexer.search(query_vector, top_k=retrieval_k)
        logger.debug("Stage 1 (Dense): retrieved %d candidates", len(dense_ids))

        # ─── Stage 2: Sparse keyword merge via RRF ───
        bm25_results = self._bm25.search(query_text, top_k=retrieval_k)
        sparse_ids = [r.doc_index for r in bm25_results]
        sparse_scores = [r.score for r in bm25_results]

        fused = reciprocal_rank_fusion(
            dense_ids=dense_ids,
            dense_scores=dense_scores,
            sparse_ids=sparse_ids,
            sparse_scores=sparse_scores,
            alpha=alpha,
        )
        logger.debug("Stage 2 (Hybrid): fused %d candidates", len(fused))

        # why: Convert fused (doc_index, score) pairs into candidates
        candidates = [
            CascadeCandidate(doc_index=doc_idx, score=score) for doc_idx, score in fused
        ]

        # ─── Stage 3: Metadata filtering ───
        if filters:
            # NOTE(review): candidates created above have metadata=None, so
            # with filters present this stage drops every candidate unless
            # metadata is populated between fusion and filtering — confirm
            # against the callers that drive this pipeline.
            meta_filter = MetadataFilter(filters)
            candidates = [
                c
                for c in candidates
                if c.metadata is not None and meta_filter.matches(c.metadata)
            ]
            logger.debug(
                "Stage 3 (Filter): %d candidates after filtering",
                len(candidates),
            )

        # ─── Stage 4: Cross-encoder reranking (optional) ───
        if rerank and self._reranker is not None and candidates:
            # fix: the reranker returns indices into the list of texts it was
            # given. The original code indexed those back into `candidates`
            # directly, which mis-assigns scores whenever candidates with
            # empty text were skipped. Keep (candidate_position, text) pairs
            # so reranker indices map back to the correct candidate.
            indexed_texts = [(i, c.text) for i, c in enumerate(candidates) if c.text]
            if indexed_texts:
                texts = [text for _, text in indexed_texts]
                reranked = self._reranker.rerank(query_text, texts, top_k=top_k)
                reranked_candidates = []
                for text_idx, rerank_score in reranked:
                    if 0 <= text_idx < len(indexed_texts):
                        cand = candidates[indexed_texts[text_idx][0]]
                        cand.score = rerank_score
                        reranked_candidates.append(cand)
                candidates = reranked_candidates
                logger.debug(
                    "Stage 4 (Rerank): reranked to %d results",
                    len(candidates),
                )

        return candidates[:top_k]
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
# VecForge — Universal Local-First Vector Database
|
|
2
|
+
# Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
|
|
3
|
+
# Built by Suneel Bose K (Founder & CEO, ArcGX TechLabs)
|
|
4
|
+
#
|
|
5
|
+
# Licensed under the Business Source License 1.1 (BSL 1.1)
|
|
6
|
+
# Free for personal, research, open-source, and non-commercial use.
|
|
7
|
+
# Commercial use requires a separate license from ArcGX TechLabs.
|
|
8
|
+
# See LICENSE file in the project root or contact: suneelbose@arcgx.in
|
|
9
|
+
|
|
10
|
+
"""
|
|
11
|
+
Metadata filtering logic for VecForge.
|
|
12
|
+
|
|
13
|
+
Supports equality, range (gte, lte, gt, lt), in/not_in operators
|
|
14
|
+
for flexible metadata-based result filtering.
|
|
15
|
+
|
|
16
|
+
Built by Suneel Bose K · ArcGX TechLabs Private Limited.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class MetadataFilter:
    """Filter search results based on metadata conditions.

    Supports operators: equality, gte, lte, gt, lt, in, not_in, ne.
    Filters are specified as nested dictionaries.

    Performance:
        Time: O(N * F) where N = results, F = number of filter keys

    Example:
        >>> f = MetadataFilter({"type": "NDA", "year": {"gte": 2023}})
        >>> f.matches({"type": "NDA", "year": 2024})
        True
        >>> f.matches({"type": "NDA", "year": 2020})
        False
    """

    _OPERATORS = {"gte", "lte", "gt", "lt", "in", "not_in", "ne"}

    def __init__(self, filters: dict[str, Any]) -> None:
        self._filters = filters

    def matches(self, metadata: dict[str, Any] | None) -> bool:
        """Check if metadata satisfies all filter conditions.

        Args:
            metadata: Document metadata dictionary. ``None`` is treated
                as an empty dictionary.

        Returns:
            True if all filter conditions are satisfied.

        Performance:
            Time: O(F) where F = number of filter keys
        """
        # fix: candidates may carry metadata=None (e.g. CascadeCandidate's
        # default) — treat it as empty instead of raising TypeError on
        # `key not in None`.
        if metadata is None:
            metadata = {}

        for key, condition in self._filters.items():
            if key not in metadata:
                return False

            value = metadata[key]

            if isinstance(condition, dict):
                # why: Operator-based filtering
                if not self._check_operators(value, condition):
                    return False
            elif value != condition:
                # why: Simple equality check
                return False

        return True

    def _check_operators(self, value: Any, operators: dict[str, Any]) -> bool:
        """Check operator-based conditions on a value.

        Args:
            value: Metadata field value.
            operators: Dictionary of operator → threshold pairs.

        Returns:
            True if all operator conditions pass.

        Performance:
            Time: O(number of operators)

        NOTE(review): range operators compare ``value`` and the threshold
        directly, so mismatched types (e.g. str vs int) raise TypeError —
        confirm callers validate filter types upstream.
        """
        for op, threshold in operators.items():
            if op not in self._OPERATORS:
                # why: Treat unknown keys as nested equality
                if value != threshold:
                    return False
                continue

            if op == "gte" and not (value >= threshold):
                return False
            if op == "lte" and not (value <= threshold):
                return False
            if op == "gt" and not (value > threshold):
                return False
            if op == "lt" and not (value < threshold):
                return False
            if op == "in" and value not in threshold:
                return False
            if op == "not_in" and value in threshold:
                return False
            if op == "ne" and value == threshold:
                return False

        return True

    def filter_results(
        self, results: list[Any], metadata_getter: Any = None
    ) -> list[Any]:
        """Filter a list of results by metadata conditions.

        Args:
            results: List of result objects.
            metadata_getter: Callable or attribute name to extract metadata.
                If None, expects results to have a 'metadata' attribute.

        Returns:
            Filtered list of results.

        Performance:
            Time: O(N * F) where N = results, F = filter keys
        """
        if not self._filters:
            # why: no conditions — return the original list untouched
            return results

        filtered = []
        for result in results:
            if metadata_getter is not None:
                if callable(metadata_getter):
                    meta = metadata_getter(result)
                else:
                    meta = getattr(result, metadata_getter)
            else:
                meta = getattr(result, "metadata", {})

            if self.matches(meta):
                filtered.append(result)

        return filtered
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
# VecForge — Universal Local-First Vector Database
|
|
2
|
+
# Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
|
|
3
|
+
# Built by Suneel Bose K (Founder & CEO, ArcGX TechLabs)
|
|
4
|
+
#
|
|
5
|
+
# Licensed under the Business Source License 1.1 (BSL 1.1)
|
|
6
|
+
# Free for personal, research, open-source, and non-commercial use.
|
|
7
|
+
# Commercial use requires a separate license from ArcGX TechLabs.
|
|
8
|
+
# See LICENSE file in the project root or contact: suneelbose@arcgx.in
|
|
9
|
+
|
|
10
|
+
"""
|
|
11
|
+
Hybrid dense + sparse search fusion for VecForge.
|
|
12
|
+
|
|
13
|
+
Combines FAISS dense retrieval scores with BM25 sparse keyword scores
|
|
14
|
+
using configurable weighted fusion. Alpha controls the balance between
|
|
15
|
+
semantic understanding and keyword matching.
|
|
16
|
+
|
|
17
|
+
Built by Suneel Bose K · ArcGX TechLabs Private Limited.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import numpy as np
|
|
23
|
+
from numpy.typing import NDArray
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def reciprocal_rank_fusion(
    dense_ids: NDArray[np.int64],
    dense_scores: NDArray[np.float32],
    sparse_ids: list[int],
    sparse_scores: list[float],
    alpha: float = 0.5,
    k: int = 60,
) -> list[tuple[int, float]]:
    """Merge dense and sparse rankings with weighted Reciprocal Rank Fusion.

    Each document contributes ``alpha * 1/(k + dense_rank)`` from the dense
    list and ``(1 - alpha) * 1/(k + sparse_rank)`` from the sparse list;
    contributions for the same document are summed. Only rank positions
    matter — the raw scores are accepted for interface symmetry but not
    read.

    Args:
        dense_ids: Document indices from FAISS dense retrieval.
        dense_scores: Corresponding FAISS inner-product scores (unused).
        sparse_ids: Document indices from BM25 sparse retrieval.
        sparse_scores: Corresponding BM25 scores (unused).
        alpha: Weight for dense vs sparse (0.0 = keyword only,
            1.0 = semantic only). Defaults to 0.5.
        k: RRF damping constant; larger values flatten the rank effect.
            Defaults to 60 (standard RRF constant).

    Returns:
        (doc_index, fused_score) tuples, highest fused score first.

    Example:
        >>> fused = reciprocal_rank_fusion(
        ...     dense_ids=np.array([0, 2, 5]),
        ...     dense_scores=np.array([0.9, 0.7, 0.5]),
        ...     sparse_ids=[2, 0, 3],
        ...     sparse_scores=[5.0, 3.0, 1.0],
        ...     alpha=0.5,
        ... )
        >>> fused[0]  # (doc_id, combined_score)
    """
    totals: dict[int, float] = {}

    # why: dense list is already ordered best-first, so position == rank + 1
    for position, raw_id in enumerate(dense_ids, start=1):
        idx = int(raw_id)
        if idx < 0:
            # why: FAISS pads short result sets with -1 sentinels
            continue
        totals[idx] = totals.get(idx, 0.0) + alpha * (1.0 / (k + position))

    # why: sparse list is likewise best-first
    for position, idx in enumerate(sparse_ids, start=1):
        totals[idx] = totals.get(idx, 0.0) + (1.0 - alpha) * (1.0 / (k + position))

    # why: highest combined score first
    return sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def weighted_linear_fusion(
|
|
92
|
+
dense_ids: NDArray[np.int64],
|
|
93
|
+
dense_scores: NDArray[np.float32],
|
|
94
|
+
sparse_ids: list[int],
|
|
95
|
+
sparse_scores: list[float],
|
|
96
|
+
alpha: float = 0.5,
|
|
97
|
+
) -> list[tuple[int, float]]:
|
|
98
|
+
"""Fuse dense and sparse scores using weighted linear combination.
|
|
99
|
+
|
|
100
|
+
Normalizes both score distributions to [0, 1] and combines:
|
|
101
|
+
|
|
102
|
+
score = alpha * norm_dense + (1 - alpha) * norm_sparse
|
|
103
|
+
|
|
104
|
+
Built by Suneel Bose K · ArcGX TechLabs Private Limited.
|
|
105
|
+
|
|
106
|
+
Args:
|
|
107
|
+
dense_ids: Document indices from FAISS.
|
|
108
|
+
dense_scores: FAISS scores.
|
|
109
|
+
sparse_ids: Document indices from BM25.
|
|
110
|
+
sparse_scores: BM25 scores.
|
|
111
|
+
alpha: Semantic weight. Defaults to 0.5.
|
|
112
|
+
|
|
113
|
+
Returns:
|
|
114
|
+
Sorted list of (doc_index, fused_score) tuples.
|
|
115
|
+
|
|
116
|
+
Performance:
|
|
117
|
+
Time: O(D + S + U*log(U)) where U = unique docs
|
|
118
|
+
"""
|
|
119
|
+
fused_scores: dict[int, float] = {}
|
|
120
|
+
|
|
121
|
+
# perf: Normalize dense scores to [0, 1]
|
|
122
|
+
if len(dense_scores) > 0:
|
|
123
|
+
d_min, d_max = float(dense_scores.min()), float(dense_scores.max())
|
|
124
|
+
d_range = d_max - d_min if d_max > d_min else 1.0
|
|
125
|
+
|
|
126
|
+
for doc_id, score in zip(dense_ids, dense_scores, strict=False):
|
|
127
|
+
doc_id_int = int(doc_id)
|
|
128
|
+
if doc_id_int < 0:
|
|
129
|
+
continue
|
|
130
|
+
norm_score = (float(score) - d_min) / d_range
|
|
131
|
+
fused_scores[doc_id_int] = alpha * norm_score
|
|
132
|
+
|
|
133
|
+
# perf: Normalize sparse scores to [0, 1]
|
|
134
|
+
if len(sparse_scores) > 0:
|
|
135
|
+
s_min = min(sparse_scores)
|
|
136
|
+
s_max = max(sparse_scores)
|
|
137
|
+
s_range = s_max - s_min if s_max > s_min else 1.0
|
|
138
|
+
|
|
139
|
+
for doc_id, score in zip(sparse_ids, sparse_scores, strict=False):
|
|
140
|
+
norm_score = (score - s_min) / s_range
|
|
141
|
+
fused_scores[doc_id] = fused_scores.get(doc_id, 0.0) + (
|
|
142
|
+
(1.0 - alpha) * norm_score
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
results = sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
|
|
146
|
+
return results
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
# VecForge — Universal Local-First Vector Database
|
|
2
|
+
# Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
|
|
3
|
+
# Built by Suneel Bose K (Founder & CEO, ArcGX TechLabs)
|
|
4
|
+
#
|
|
5
|
+
# Licensed under the Business Source License 1.1 (BSL 1.1)
|
|
6
|
+
# Free for personal, research, open-source, and non-commercial use.
|
|
7
|
+
# Commercial use requires a separate license from ArcGX TechLabs.
|
|
8
|
+
# See LICENSE file in the project root or contact: suneelbose@arcgx.in
|
|
9
|
+
|
|
10
|
+
"""
|
|
11
|
+
Audit log writer + reader for VecForge.
|
|
12
|
+
|
|
13
|
+
Records all mutating operations as append-only JSONL audit events.
|
|
14
|
+
Every add, delete, update, and admin action is logged for compliance.
|
|
15
|
+
|
|
16
|
+
Built by Suneel Bose K · ArcGX TechLabs Private Limited.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import json
|
|
22
|
+
import logging
|
|
23
|
+
import time
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from typing import Any
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class AuditLogger:
    """Append-only JSONL audit trail for compliance and security review.

    Each mutating operation is recorded as one JSON line holding the
    actor, operation, target, timestamp, and free-form metadata. The log
    file is only ever appended to, never rewritten in place.

    Args:
        log_path: Path to the JSONL audit log file. When None, audit
            logging is disabled.

    Performance:
        Write: O(1) per event (append-only)
        Read: O(N) where N = total events

    Example:
        >>> audit = AuditLogger("audit.jsonl")
        >>> audit.log("admin", "add", doc_id="d123", namespace="default")
        >>> events = audit.read_log()
        >>> print(events[0]["operation"])
        'add'
    """

    def __init__(self, log_path: str | None = None) -> None:
        self._enabled = log_path is not None
        self._path = Path(log_path) if log_path else None

        if self._enabled and self._path is not None:
            # why: the log directory may not exist yet on first run
            self._path.parent.mkdir(parents=True, exist_ok=True)
            logger.info("Audit logging enabled: %s", self._path)
        else:
            logger.debug("Audit logging disabled")

    @property
    def enabled(self) -> bool:
        """True when an audit log path was configured.

        Performance:
            Time: O(1)
        """
        return self._enabled

    def log(
        self,
        actor: str,
        operation: str,
        doc_id: str | None = None,
        namespace: str | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> None:
        """Append one audit event; a silent no-op when logging is disabled.

        Args:
            actor: Identifier of the user/key performing the action.
            operation: Operation name (add, delete, update, search, etc.).
            doc_id: Target document ID, if applicable.
            namespace: Target namespace, if applicable.
            metadata: Additional event metadata.

        Performance:
            Time: O(1) — single file append

        Example:
            >>> audit.log(
            ...     actor="key-abc123",
            ...     operation="add",
            ...     doc_id="doc-xyz",
            ...     namespace="ward_7",
            ...     metadata={"chars": 1500}
            ... )
        """
        if not self._enabled or self._path is None:
            return

        record = {
            "timestamp": time.time(),
            "actor": actor,
            "operation": operation,
            "doc_id": doc_id,
            "namespace": namespace,
            "metadata": metadata or {},
        }

        # security: Append-only write — no modification of existing entries
        with open(self._path, "a", encoding="utf-8") as handle:
            handle.write(json.dumps(record, default=str) + "\n")

    def read_log(
        self,
        since: float | None = None,
        until: float | None = None,
        actor: str | None = None,
        operation: str | None = None,
    ) -> list[dict[str, Any]]:
        """Return audit events, optionally narrowed by time/actor/operation.

        Args:
            since: Unix timestamp — keep events at or after this time.
            until: Unix timestamp — keep events at or before this time.
            actor: Keep only events by this actor.
            operation: Keep only events of this operation type.

        Returns:
            Matching audit event dictionaries, in file order.

        Performance:
            Time: O(N) where N = total events in log
        """
        if not self._enabled or self._path is None or not self._path.exists():
            return []

        selected: list[dict[str, Any]] = []
        with open(self._path, encoding="utf-8") as handle:
            for raw in handle:
                raw = raw.strip()
                if not raw:
                    continue

                try:
                    record = json.loads(raw)
                except json.JSONDecodeError:
                    logger.warning("Skipping malformed audit log entry")
                    continue

                # why: drop events outside the requested window / identity
                stamp = record.get("timestamp", 0)
                if since is not None and stamp < since:
                    continue
                if until is not None and stamp > until:
                    continue
                if actor is not None and record.get("actor") != actor:
                    continue
                if operation is not None and record.get("operation") != operation:
                    continue

                selected.append(record)

        return selected
|