vecforge 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vecforge/__init__.py +59 -0
- vecforge/cli/__init__.py +3 -0
- vecforge/cli/main.py +197 -0
- vecforge/core/__init__.py +3 -0
- vecforge/core/bm25.py +187 -0
- vecforge/core/embedder.py +152 -0
- vecforge/core/indexer.py +196 -0
- vecforge/core/reranker.py +120 -0
- vecforge/core/storage.py +493 -0
- vecforge/core/vault.py +760 -0
- vecforge/exceptions.py +164 -0
- vecforge/ingest/__init__.py +3 -0
- vecforge/ingest/dispatcher.py +181 -0
- vecforge/ingest/document.py +237 -0
- vecforge/search/__init__.py +3 -0
- vecforge/search/cascade.py +186 -0
- vecforge/search/filters.py +146 -0
- vecforge/search/hybrid.py +146 -0
- vecforge/security/__init__.py +3 -0
- vecforge/security/audit.py +169 -0
- vecforge/security/encryption.py +84 -0
- vecforge/security/namespaces.py +127 -0
- vecforge/security/rbac.py +172 -0
- vecforge/security/snapshots.py +135 -0
- vecforge/server/__init__.py +3 -0
- vecforge/server/app.py +54 -0
- vecforge/server/routes.py +215 -0
- vecforge-0.2.0.dist-info/METADATA +302 -0
- vecforge-0.2.0.dist-info/RECORD +34 -0
- vecforge-0.2.0.dist-info/WHEEL +5 -0
- vecforge-0.2.0.dist-info/entry_points.txt +2 -0
- vecforge-0.2.0.dist-info/licenses/LICENSE +45 -0
- vecforge-0.2.0.dist-info/licenses/NOTICE +14 -0
- vecforge-0.2.0.dist-info/top_level.txt +1 -0
vecforge/__init__.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# VecForge — Universal Local-First Vector Database
|
|
2
|
+
# Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
|
|
3
|
+
# Built by Suneel Bose K (Founder & CEO, ArcGX TechLabs)
|
|
4
|
+
#
|
|
5
|
+
# Licensed under the Business Source License 1.1 (BSL 1.1)
|
|
6
|
+
# Free for personal, research, open-source, and non-commercial use.
|
|
7
|
+
# Commercial use requires a separate license from ArcGX TechLabs.
|
|
8
|
+
# See LICENSE file in the project root or contact: suneelbose@arcgx.in
|
|
9
|
+
|
|
10
|
+
"""
|
|
11
|
+
VecForge — Forge your vector database. Own it forever.
|
|
12
|
+
|
|
13
|
+
A universal, local-first Python vector database with enterprise security,
|
|
14
|
+
multimodal ingestion, and optional quantum-inspired acceleration.
|
|
15
|
+
|
|
16
|
+
Built by Suneel Bose K · ArcGX TechLabs Private Limited.
|
|
17
|
+
|
|
18
|
+
Quick Start::
|
|
19
|
+
|
|
20
|
+
from vecforge import VecForge
|
|
21
|
+
|
|
22
|
+
db = VecForge("my_vault")
|
|
23
|
+
db.add("Patient admitted with type 2 diabetes", metadata={"ward": "7"})
|
|
24
|
+
results = db.search("diabetic patient")
|
|
25
|
+
print(results[0].text)
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
from vecforge.core.vault import SearchResult, VecForge
|
|
31
|
+
from vecforge.exceptions import (
|
|
32
|
+
DeletionProtectedError,
|
|
33
|
+
EncryptionKeyError,
|
|
34
|
+
IngestError,
|
|
35
|
+
InvalidAlphaError,
|
|
36
|
+
NamespaceNotFoundError,
|
|
37
|
+
VaultEmptyError,
|
|
38
|
+
VecForgeError,
|
|
39
|
+
VecForgePermissionError,
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
# Names re-exported as the package's public API: the vault entry point, its
# result type, and the exception classes imported above.
__all__ = [
    "VecForge",
    "SearchResult",
    "VecForgeError",
    "VaultEmptyError",
    "NamespaceNotFoundError",
    "VecForgePermissionError",
    "InvalidAlphaError",
    "EncryptionKeyError",
    "DeletionProtectedError",
    "IngestError",
]

# Package metadata. The same version string is also hard-coded in the
# `click.version_option` call in vecforge/cli/main.py; keep the two in sync.
__version__ = "0.2.0"
__author__ = "Suneel Bose K"
__company__ = "ArcGX TechLabs Private Limited"
__license__ = "BSL-1.1"
__copyright__ = "Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs"
|
vecforge/cli/__init__.py
ADDED
vecforge/cli/main.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
# VecForge — Universal Local-First Vector Database
|
|
2
|
+
# Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
|
|
3
|
+
# Built by Suneel Bose K (Founder & CEO, ArcGX TechLabs)
|
|
4
|
+
#
|
|
5
|
+
# Licensed under the Business Source License 1.1 (BSL 1.1)
|
|
6
|
+
# Free for personal, research, open-source, and non-commercial use.
|
|
7
|
+
# Commercial use requires a separate license from ArcGX TechLabs.
|
|
8
|
+
# See LICENSE file in the project root or contact: suneelbose@arcgx.in
|
|
9
|
+
|
|
10
|
+
"""
|
|
11
|
+
VecForge CLI — command-line interface.
|
|
12
|
+
|
|
13
|
+
Provides commands for ingestion, search, statistics, and serving.
|
|
14
|
+
|
|
15
|
+
Built by Suneel Bose K · ArcGX TechLabs Private Limited.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import json
|
|
21
|
+
|
|
22
|
+
import click
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Root command group: every subcommand in this module registers itself on it
# through `@cli.command()`. The group body is intentionally empty.
#
# The docstring is rendered by click as the top-level `--help` text, so it is
# user-facing output rather than internal documentation.
@click.group()
@click.version_option(version="0.2.0", prog_name="VecForge")
def cli() -> None:
    """VecForge — Forge your vector database. Own it forever.

    Built by Suneel Bose K · ArcGX TechLabs Private Limited.
    """
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@cli.command()
@click.argument("path")
@click.option("--vault", required=True, help="Path to vault database")
@click.option("--namespace", default="default", help="Target namespace")
@click.option(
    "--chunk-size",
    default=1000,
    type=click.IntRange(min=1),
    help="Chunk size in characters",
)
@click.option(
    "--chunk-overlap",
    default=200,
    type=click.IntRange(min=0),
    help="Chunk overlap in characters",
)
def ingest(
    path: str,
    vault: str,
    namespace: str,
    chunk_size: int,
    chunk_overlap: int,
) -> None:
    """Ingest documents from PATH into the vault.

    Supports: .txt, .md, .pdf, .docx, .html

    Example: vecforge ingest my_docs/ --vault my.db
    """
    # The docstring above is the `--help` text, so implementation notes live
    # in comments.
    #
    # Parameters (all supplied by click):
    #   path          -- file or directory handed to `VecForge.ingest`.
    #   vault         -- vault database path passed to `VecForge(...)`.
    #   namespace     -- namespace forwarded to `VecForge.ingest`.
    #   chunk_size    -- forwarded as-is; click rejects values below 1.
    #   chunk_overlap -- forwarded as-is; click rejects negative values.
    #
    # NOTE(review): an overlap that is >= the chunk size is still accepted
    # here (e.g. `--chunk-size 150` with the default overlap of 200); confirm
    # how the chunker treats that combination.

    # why: imported lazily so that `--help` and argument errors do not pay for
    # importing the full library.
    from vecforge import VecForge

    click.echo(f"VecForge — Ingesting from {path}...")
    with VecForge(vault) as db:
        count = db.ingest(
            path,
            namespace=namespace,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    click.echo(f"✅ Ingested {count} chunks into vault '{vault}'")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@cli.command()
@click.argument("query")
@click.option("--vault", required=True, help="Path to vault database")
@click.option(
    "--top-k",
    default=5,
    type=click.IntRange(min=1),
    help="Number of results",
)
@click.option("--namespace", default=None, help="Restrict to namespace")
@click.option(
    "--alpha",
    default=0.5,
    type=click.FloatRange(0.0, 1.0),
    help="Semantic weight (0.0-1.0)",
)
@click.option("--rerank", is_flag=True, help="Enable cross-encoder reranking")
def search(
    query: str,
    vault: str,
    top_k: int,
    namespace: str | None,
    alpha: float,
    rerank: bool,
) -> None:
    """Search the vault with a natural language query.

    Example: vecforge search "diabetes treatment" --vault my.db
    """
    # The docstring above is the `--help` text, so implementation notes live
    # in comments.
    #
    # Parameters (all supplied by click):
    #   query     -- query text passed to `VecForge.search`.
    #   vault     -- vault database path passed to `VecForge(...)`.
    #   top_k     -- result count; click rejects values below 1.
    #   namespace -- optional namespace restriction, None when not given.
    #   alpha     -- weight forwarded as `alpha`; click enforces the
    #                0.0-1.0 range that the help text advertises.
    #   rerank    -- forwarded as `rerank`.

    # why: imported lazily so that `--help` and argument errors do not pay for
    # importing the full library.
    from vecforge import VecForge

    with VecForge(vault) as db:
        results = db.search(
            query,
            top_k=top_k,
            namespace=namespace,
            alpha=alpha,
            rerank=rerank,
        )

        if not results:
            click.echo("No results found.")
            return

        # why: results are rendered while the vault is still open, so this
        # works regardless of whether a result holds on to the vault.
        for i, r in enumerate(results, 1):
            click.echo(f"\n{'─' * 60}")
            click.echo(f"Result {i} | Score: {r.score:.4f} | ID: {r.doc_id[:8]}...")
            click.echo(f"Namespace: {r.namespace} | Modality: {r.modality}")
            if r.metadata:
                # default=str: metadata values json cannot encode are
                # rendered through str() instead of raising.
                click.echo(f"Metadata: {json.dumps(r.metadata, default=str)}")
            # Only the first 500 characters of the text are shown.
            click.echo(f"\n{r.text[:500]}")
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@cli.command()
@click.argument("vault")
def stats(vault: str) -> None:
    """Show vault statistics.

    Example: vecforge stats my.db
    """
    # The docstring above is the `--help` text, so implementation notes live
    # in comments. The command opens the vault, reads `VecForge.stats()` and
    # prints one line per reported field.
    from vecforge import VecForge

    with VecForge(vault) as db:
        info = db.stats()

    rule = "═" * 50
    click.echo("\n" + rule)
    click.echo("VecForge Vault Statistics")
    click.echo(rule)

    # Fields are echoed one at a time, in this order, so a missing key stops
    # the report exactly where it would have stopped before.
    leading_fields = (
        ("Path", "path"),
        ("Documents", "documents"),
        ("Encrypted", "encrypted"),
        ("Quantum", "quantum"),
        ("Protection", "deletion_protection"),
    )
    for label, key in leading_fields:
        click.echo(f"{label}: {info[key]}")

    # Namespaces are the one field rendered as a comma-separated list.
    click.echo("Namespaces: " + ", ".join(info["namespaces"]))

    trailing_fields = (
        ("Index vectors", "index_vectors"),
        ("BM25 docs", "bm25_documents"),
    )
    for label, key in trailing_fields:
        click.echo(f"{label}: {info[key]}")

    click.echo(f"\nBuilt by {info['built_by']}")
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
@cli.command()
@click.argument("vault")
@click.option(
    "--format",
    "fmt",
    default="json",
    type=click.Choice(["json"], case_sensitive=False),
    help="Export format (json)",
)
@click.option("--output", "-o", default=None, help="Output file path")
@click.option("--namespace", default=None, help="Export specific namespace")
def export(vault: str, fmt: str, output: str | None, namespace: str | None) -> None:
    """Export vault data to JSON.

    Example: vecforge export my.db -o data.json
    """
    # The docstring above is the `--help` text, so implementation notes live
    # in comments.
    #
    # Parameters (all supplied by click):
    #   vault     -- vault database path passed to `StorageBackend(path=...)`.
    #   fmt       -- export format. Only "json" is implemented, so click now
    #                rejects anything else instead of silently emitting JSON.
    #                The value is not consulted further below.
    #   output    -- file to write; when omitted the JSON goes to stdout.
    #   namespace -- optional namespace forwarded to `get_all_docs`.
    from vecforge.core.storage import StorageBackend

    storage = StorageBackend(path=vault)
    try:
        docs = [
            {
                "doc_id": doc.doc_id,
                "text": doc.text,
                "metadata": doc.metadata,
                "namespace": doc.namespace,
                "modality": doc.modality,
                "created_at": doc.created_at,
            }
            for doc in storage.get_all_docs(namespace=namespace)
        ]
    finally:
        # why: release the storage handle even when reading documents fails.
        storage.close()

    data = {"vault": vault, "documents": docs, "count": len(docs)}
    # default=str: values json cannot encode are rendered through str().
    json_str = json.dumps(data, indent=2, default=str)

    if output:
        with open(output, "w", encoding="utf-8") as f:
            f.write(json_str)
        click.echo(f"✅ Exported {len(docs)} documents to {output}")
    else:
        click.echo(json_str)
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
@cli.command()
@click.option("--vault", required=True, help="Path to vault database")
@click.option(
    "--port",
    default=8080,
    type=click.IntRange(0, 65535),
    help="Server port",
)
@click.option("--host", default="0.0.0.0", help="Server host")
def serve(vault: str, port: int, host: str) -> None:
    """Start VecForge REST API server.

    Example: vecforge serve --vault my.db --port 8080
    """
    # The docstring above is the `--help` text, so implementation notes live
    # in comments.
    #
    # Parameters (all supplied by click):
    #   vault -- vault database path passed to `create_app`.
    #   port  -- TCP port handed to uvicorn; click rejects values outside
    #            0-65535.
    #   host  -- bind address handed to uvicorn.
    #
    # NOTE(review): the default host 0.0.0.0 listens on every network
    # interface. Confirm that is the intended default; `--host 127.0.0.1`
    # keeps the server local.

    # why: import and build the app *before* printing the banner, so a
    # missing module or an unusable vault is not reported after a
    # "Listening on ..." line that never became true.
    try:
        import uvicorn

        from vecforge.server.app import create_app
    except ImportError as e:
        raise click.ClickException(
            f"Cannot start the REST server, a required module is missing: {e}"
        ) from e

    app = create_app(vault)

    click.echo(f"VecForge REST Server — {vault}")
    click.echo(f"Listening on {host}:{port}")
    click.echo("Built by Suneel Bose K · ArcGX TechLabs\n")

    uvicorn.run(app, host=host, port=port)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
# Run the click group when this module is executed as a script rather than
# imported.
if __name__ == "__main__":
    cli()
|
vecforge/core/bm25.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
# VecForge — Universal Local-First Vector Database
|
|
2
|
+
# Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
|
|
3
|
+
# Built by Suneel Bose K (Founder & CEO, ArcGX TechLabs)
|
|
4
|
+
#
|
|
5
|
+
# Licensed under the Business Source License 1.1 (BSL 1.1)
|
|
6
|
+
# Free for personal, research, open-source, and non-commercial use.
|
|
7
|
+
# Commercial use requires a separate license from ArcGX TechLabs.
|
|
8
|
+
# See LICENSE file in the project root or contact: suneelbose@arcgx.in
|
|
9
|
+
|
|
10
|
+
"""
|
|
11
|
+
BM25 keyword search engine for VecForge.
|
|
12
|
+
|
|
13
|
+
Provides sparse keyword-based retrieval using BM25Okapi. Used alongside
|
|
14
|
+
FAISS dense retrieval for hybrid search.
|
|
15
|
+
|
|
16
|
+
Built by Suneel Bose K · ArcGX TechLabs Private Limited.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import logging
|
|
22
|
+
import re
|
|
23
|
+
from dataclasses import dataclass
|
|
24
|
+
|
|
25
|
+
import numpy as np
|
|
26
|
+
from rank_bm25 import BM25Okapi
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class BM25Result:
    """One ranked hit produced by a BM25 keyword search.

    Attributes:
        doc_index: Position of the matched document within the corpus.
        score: BM25 relevance score; larger values mean a better match.
    """

    doc_index: int
    score: float
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class BM25Engine:
    """BM25 keyword search engine built on ``rank_bm25.BM25Okapi``.

    Keeps the tokenized corpus in memory and rebuilds the ``BM25Okapi``
    index every time documents are added, so batching additions into one
    ``add_documents`` call is the cheap way to load many documents.

    Built by Suneel Bose K · ArcGX TechLabs Private Limited.

    Performance:
        Build: O(N * L) where N = docs, L = avg doc length
        Search: O(V * N) where V = query terms

    Example:
        >>> engine = BM25Engine()
        >>> engine.add_documents(["patient with diabetes", "hip fracture case"])
        >>> results = engine.search("diabetes", top_k=1)
        >>> results[0].doc_index
        0
    """

    def __init__(self) -> None:
        # Tokenized documents, in insertion order; a document's position in
        # this list is the ``doc_index`` reported by ``search``.
        self._corpus: list[list[str]] = []
        # None until at least one document has been indexed.
        self._bm25: BM25Okapi | None = None

    @property
    def count(self) -> int:
        """Return the number of documents in the corpus.

        Performance:
            Time: O(1)
        """
        return len(self._corpus)

    @staticmethod
    def _tokenize(text: str) -> list[str]:
        """Split text into lowercase word tokens.

        Lowercases the text and extracts every run of ``\\w`` characters,
        so punctuation and whitespace act as separators.

        Args:
            text: Raw text string to tokenize.

        Returns:
            List of lowercase word tokens (empty if the text has none).

        Performance:
            Time: O(L) where L = length of text
        """
        # why: plain regex tokenizer — no stemming or stop-word handling
        return re.findall(r"\b\w+\b", text.lower())

    def add_documents(self, texts: list[str]) -> None:
        """Add documents to the corpus and rebuild the BM25 index.

        When ``texts`` contributes no documents the call is a no-op: the
        existing index stays valid, and an empty corpus is never handed to
        ``BM25Okapi`` (which divides by the corpus size while computing the
        average document length).

        Args:
            texts: Document texts to add.

        Performance:
            Time: O(N * L) where N = total docs, L = avg doc length

        Example:
            >>> engine = BM25Engine()
            >>> engine.add_documents(["doc one", "doc two", "doc three"])
            >>> engine.count
            3
        """
        size_before = len(self._corpus)
        self._corpus.extend(self._tokenize(text) for text in texts)
        if len(self._corpus) == size_before:
            # Nothing was added, so there is nothing to rebuild.
            return

        # why: Rebuild entire index — BM25Okapi doesn't support incremental add
        self._bm25 = BM25Okapi(self._corpus)
        logger.debug("BM25 index rebuilt with %d documents", len(self._corpus))

    def add_document(self, text: str) -> None:
        """Add a single document to the BM25 index.

        Args:
            text: Document text to add.

        Performance:
            Time: O(N * L) — rebuilds entire index
        """
        self.add_documents([text])

    def search(self, query: str, top_k: int = 10) -> list[BM25Result]:
        """Search for documents matching the query keywords.

        Args:
            query: Search query string.
            top_k: Maximum number of results to return. Zero or a negative
                value yields an empty list.

        Returns:
            List of BM25Result sorted by descending score, containing only
            documents whose score is greater than zero. Empty when the
            corpus is empty or the query has no word tokens.

        Performance:
            Time: O(V * N) where V = query terms, N = corpus size

        Example:
            >>> results = engine.search("diabetes treatment", top_k=5)
            >>> for r in results:
            ...     print(f"Doc {r.doc_index}: score={r.score:.4f}")
        """
        # why: a negative top_k would become a positive partition index
        # below and select the wrong slice, so reject it up front.
        if top_k <= 0 or self._bm25 is None or not self._corpus:
            return []

        query_tokens = self._tokenize(query)
        if not query_tokens:
            return []

        # perf: BM25Okapi.get_scores returns all scores in one pass
        scores = self._bm25.get_scores(query_tokens)

        effective_k = min(top_k, len(scores))
        if effective_k == 0:
            return []

        # perf: Use argpartition for O(N) top-k instead of O(N log N) sort
        top_indices = np.argpartition(scores, -effective_k)[-effective_k:]
        # why: argpartition leaves the top-k unordered; sort them descending
        top_indices = top_indices[np.argsort(scores[top_indices])[::-1]]

        return [
            BM25Result(doc_index=int(idx), score=float(scores[idx]))
            for idx in top_indices
            if scores[idx] > 0.0  # why: Filter zero-score matches
        ]

    def reset(self) -> None:
        """Reset the BM25 index, removing all documents.

        Performance:
            Time: O(1)
        """
        self._corpus = []
        self._bm25 = None
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
# VecForge — Universal Local-First Vector Database
|
|
2
|
+
# Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
|
|
3
|
+
# Built by Suneel Bose K (Founder & CEO, ArcGX TechLabs)
|
|
4
|
+
#
|
|
5
|
+
# Licensed under the Business Source License 1.1 (BSL 1.1)
|
|
6
|
+
# Free for personal, research, open-source, and non-commercial use.
|
|
7
|
+
# Commercial use requires a separate license from ArcGX TechLabs.
|
|
8
|
+
# See LICENSE file in the project root or contact: suneelbose@arcgx.in
|
|
9
|
+
|
|
10
|
+
"""
|
|
11
|
+
Embedding engine for VecForge.
|
|
12
|
+
|
|
13
|
+
Wraps sentence-transformers for local text embedding. No internet
|
|
14
|
+
required — models are downloaded once and cached locally.
|
|
15
|
+
|
|
16
|
+
Built by Suneel Bose K · ArcGX TechLabs Private Limited.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import logging
|
|
22
|
+
from typing import Any
|
|
23
|
+
|
|
24
|
+
import numpy as np
|
|
25
|
+
from numpy.typing import NDArray
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
# perf: Default model balances quality and speed for most use cases
_DEFAULT_MODEL = "all-MiniLM-L6-v2"


class Embedder:
    """Local text embedding engine using sentence-transformers.

    The model is loaded lazily, on the first call that needs it, so
    constructing an ``Embedder`` is cheap.

    Built by Suneel Bose K · ArcGX TechLabs Private Limited.

    Args:
        model_name: Name of the sentence-transformers model.
            Defaults to 'all-MiniLM-L6-v2' (384-dim).
        device: Device string passed to ``SentenceTransformer``
            (e.g. 'cpu', 'cuda'), or None to let the library choose.

    Example:
        >>> embedder = Embedder()
        >>> vectors = embedder.encode(["hello world", "vector search"])
        >>> vectors.shape
        (2, 384)
    """

    def __init__(
        self,
        model_name: str = _DEFAULT_MODEL,
        device: str | None = None,
    ) -> None:
        self._model_name = model_name
        self._device = device
        self._model: Any = None  # Lazy-loaded SentenceTransformer
        self._dimension: int | None = None  # Filled in by _load_model

    @property
    def dimension(self) -> int:
        """Return the embedding dimension, loading the model if needed.

        Returns:
            Integer dimension of the embedding vectors.

        Raises:
            RuntimeError: If the dimension is still unknown after the model
                has been loaded.

        Performance:
            Time: O(1) after first call
        """
        if self._dimension is None:
            self._load_model()
        if self._dimension is None:
            # why: an explicit raise instead of `assert`, which is removed
            # under `python -O` and would let None escape as an int.
            raise RuntimeError(
                f"Embedding model {self._model_name!r} did not report "
                "an embedding dimension"
            )
        return self._dimension

    def _load_model(self) -> None:
        """Load the sentence-transformers model unless already loaded.

        Raises:
            ImportError: If sentence-transformers is not installed.
        """
        if self._model is not None:
            return

        try:
            from sentence_transformers import SentenceTransformer
        except ImportError as e:
            raise ImportError(
                "sentence-transformers is required for VecForge embeddings.\n"
                "Install with: pip install sentence-transformers\n"
                "VecForge by Suneel Bose K · ArcGX TechLabs"
            ) from e

        logger.info("Loading embedding model: %s", self._model_name)
        self._model = SentenceTransformer(self._model_name, device=self._device)
        self._dimension = self._model.get_sentence_embedding_dimension()
        # why: %s rather than %d so the log call cannot fail to format when
        # the model reports no dimension (None).
        logger.info(
            "Embedding model loaded: %s (dim=%s)",
            self._model_name,
            self._dimension,
        )

    def encode(
        self,
        texts: list[str] | str,
        batch_size: int = 64,
        normalize: bool = True,
        show_progress: bool = False,
    ) -> NDArray[np.float32]:
        """Encode texts into dense embedding vectors.

        Args:
            texts: Single string or list of strings to embed.
            batch_size: Batch size for encoding. Defaults to 64.
            normalize: Passed to the model as ``normalize_embeddings``.
                Defaults to True.
            show_progress: Passed to the model as ``show_progress_bar``.

        Returns:
            NumPy float32 array of shape (n_texts, dimension). A single
            string is treated as a one-element list. An empty list yields
            an empty array of shape (0, dimension) without calling the
            model.

        Example:
            >>> embedder = Embedder()
            >>> vec = embedder.encode("patient with diabetes")
            >>> vec.shape
            (1, 384)
        """
        self._load_model()

        if isinstance(texts, str):
            texts = [texts]

        if len(texts) == 0:
            # why: keep the documented 2-D shape for empty input so callers
            # can stack or index the result without special-casing it.
            width = self._dimension if self._dimension is not None else 0
            return np.empty((0, width), dtype=np.float32)

        # perf: sentence-transformers handles batching internally
        vectors = self._model.encode(
            texts,
            batch_size=batch_size,
            normalize_embeddings=normalize,
            show_progress_bar=show_progress,
            convert_to_numpy=True,
        )

        # perf: asarray only copies when the dtype actually has to change
        return np.asarray(vectors, dtype=np.float32)
|