vecforge 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vecforge/__init__.py ADDED
@@ -0,0 +1,59 @@
1
+ # VecForge — Universal Local-First Vector Database
2
+ # Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
3
+ # Built by Suneel Bose K (Founder & CEO, ArcGX TechLabs)
4
+ #
5
+ # Licensed under the Business Source License 1.1 (BSL 1.1)
6
+ # Free for personal, research, open-source, and non-commercial use.
7
+ # Commercial use requires a separate license from ArcGX TechLabs.
8
+ # See LICENSE file in the project root or contact: suneelbose@arcgx.in
9
+
10
+ """
11
+ VecForge — Forge your vector database. Own it forever.
12
+
13
+ A universal, local-first Python vector database with enterprise security,
14
+ multimodal ingestion, and optional quantum-inspired acceleration.
15
+
16
+ Built by Suneel Bose K · ArcGX TechLabs Private Limited.
17
+
18
+ Quick Start::
19
+
20
+ from vecforge import VecForge
21
+
22
+ db = VecForge("my_vault")
23
+ db.add("Patient admitted with type 2 diabetes", metadata={"ward": "7"})
24
+ results = db.search("diabetic patient")
25
+ print(results[0].text)
26
+ """
27
+
28
from __future__ import annotations

# Public API: the vault itself plus the full exception hierarchy.
from vecforge.core.vault import SearchResult, VecForge
from vecforge.exceptions import (
    DeletionProtectedError,
    EncryptionKeyError,
    IngestError,
    InvalidAlphaError,
    NamespaceNotFoundError,
    VaultEmptyError,
    VecForgeError,
    VecForgePermissionError,
)

# Explicit public surface — everything not listed here is internal.
__all__ = [
    "VecForge",
    "SearchResult",
    "VecForgeError",
    "VaultEmptyError",
    "NamespaceNotFoundError",
    "VecForgePermissionError",
    "InvalidAlphaError",
    "EncryptionKeyError",
    "DeletionProtectedError",
    "IngestError",
]

# Package metadata.
# NOTE(review): the version string is duplicated in cli/main.py's
# click.version_option — keep the two in sync when releasing.
__version__ = "0.2.0"
__author__ = "Suneel Bose K"
__company__ = "ArcGX TechLabs Private Limited"
__license__ = "BSL-1.1"
__copyright__ = "Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs"
@@ -0,0 +1,3 @@
1
+ # VecForge — Universal Local-First Vector Database
2
+ # Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
3
+ # Licensed under BSL 1.1 — see LICENSE for details.
vecforge/cli/main.py ADDED
@@ -0,0 +1,197 @@
1
+ # VecForge — Universal Local-First Vector Database
2
+ # Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
3
+ # Built by Suneel Bose K (Founder & CEO, ArcGX TechLabs)
4
+ #
5
+ # Licensed under the Business Source License 1.1 (BSL 1.1)
6
+ # Free for personal, research, open-source, and non-commercial use.
7
+ # Commercial use requires a separate license from ArcGX TechLabs.
8
+ # See LICENSE file in the project root or contact: suneelbose@arcgx.in
9
+
10
+ """
11
+ VecForge CLI — command-line interface.
12
+
13
+ Provides commands for ingestion, search, statistics, and serving.
14
+
15
+ Built by Suneel Bose K · ArcGX TechLabs Private Limited.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import json
21
+
22
+ import click
23
+
24
+
25
# Root command group: all subcommands (ingest, search, stats, export,
# serve) attach to this via @cli.command().
@click.group()
# NOTE(review): version is hard-coded here and also in
# vecforge/__init__.py (__version__ = "0.2.0") — keep both in sync.
@click.version_option(version="0.2.0", prog_name="VecForge")
def cli() -> None:
    """VecForge — Forge your vector database. Own it forever.

    Built by Suneel Bose K · ArcGX TechLabs Private Limited.
    """
32
+
33
+
34
@cli.command()
@click.argument("path")
@click.option("--vault", required=True, help="Path to vault database")
@click.option("--namespace", default="default", help="Target namespace")
@click.option("--chunk-size", default=1000, help="Chunk size in characters")
@click.option("--chunk-overlap", default=200, help="Chunk overlap in characters")
def ingest(
    path: str,
    vault: str,
    namespace: str,
    chunk_size: int,
    chunk_overlap: int,
) -> None:
    """Ingest documents from PATH into the vault.

    Supports: .txt, .md, .pdf, .docx, .html

    Example: vecforge ingest my_docs/ --vault my.db
    """
    # why: import is deferred so `vecforge --help` stays fast and doesn't
    # pull in the embedding stack.
    from vecforge import VecForge

    click.echo(f"VecForge — Ingesting from {path}...")
    with VecForge(vault) as db:
        chunk_count = db.ingest(
            path,
            namespace=namespace,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        click.echo(f"✅ Ingested {chunk_count} chunks into vault '{vault}'")
64
+
65
+
66
@cli.command()
@click.argument("query")
@click.option("--vault", required=True, help="Path to vault database")
@click.option("--top-k", default=5, help="Number of results")
@click.option("--namespace", default=None, help="Restrict to namespace")
@click.option("--alpha", default=0.5, help="Semantic weight (0.0-1.0)")
@click.option("--rerank", is_flag=True, help="Enable cross-encoder reranking")
def search(
    query: str,
    vault: str,
    top_k: int,
    namespace: str | None,
    alpha: float,
    rerank: bool,
) -> None:
    """Search the vault with a natural language query.

    Example: vecforge search "diabetes treatment" --vault my.db
    """
    # why: deferred import keeps CLI startup fast.
    from vecforge import VecForge

    with VecForge(vault) as db:
        hits = db.search(
            query,
            top_k=top_k,
            namespace=namespace,
            alpha=alpha,
            rerank=rerank,
        )

    if not hits:
        click.echo("No results found.")
        return

    # why: hoist the divider — it is identical for every result row.
    divider = "─" * 60
    for rank, hit in enumerate(hits, 1):
        click.echo(f"\n{divider}")
        click.echo(f"Result {rank} | Score: {hit.score:.4f} | ID: {hit.doc_id[:8]}...")
        click.echo(f"Namespace: {hit.namespace} | Modality: {hit.modality}")
        if hit.metadata:
            click.echo(f"Metadata: {json.dumps(hit.metadata, default=str)}")
        click.echo(f"\n{hit.text[:500]}")
107
+
108
+
109
@cli.command()
@click.argument("vault")
def stats(vault: str) -> None:
    """Show vault statistics.

    Example: vecforge stats my.db
    """
    # why: deferred import keeps CLI startup fast.
    from vecforge import VecForge

    with VecForge(vault) as db:
        info = db.stats()

    bar = "═" * 50
    report = (
        f"\n{bar}",
        "VecForge Vault Statistics",
        f"{bar}",
        f"Path: {info['path']}",
        f"Documents: {info['documents']}",
        f"Encrypted: {info['encrypted']}",
        f"Quantum: {info['quantum']}",
        f"Protection: {info['deletion_protection']}",
        f"Namespaces: {', '.join(info['namespaces'])}",
        f"Index vectors: {info['index_vectors']}",
        f"BM25 docs: {info['bm25_documents']}",
        f"\nBuilt by {info['built_by']}",
    )
    for line in report:
        click.echo(line)
133
+
134
+
135
@cli.command()
@click.argument("vault")
@click.option("--format", "fmt", default="json", help="Export format (json)")
@click.option("--output", "-o", default=None, help="Output file path")
@click.option("--namespace", default=None, help="Export specific namespace")
def export(vault: str, fmt: str, output: str | None, namespace: str | None) -> None:
    """Export vault data to JSON.

    Example: vecforge export my.db -o data.json
    """
    # NOTE(review): `fmt` is accepted but currently unused — only JSON is
    # implemented; kept for interface stability.
    from vecforge.core.storage import StorageBackend

    storage = StorageBackend(path=vault)
    try:
        # fix: close the backend even if reading documents raises — the
        # original skipped storage.close() on any exception in this loop.
        docs = [
            {
                "doc_id": doc.doc_id,
                "text": doc.text,
                "metadata": doc.metadata,
                "namespace": doc.namespace,
                "modality": doc.modality,
                "created_at": doc.created_at,
            }
            for doc in storage.get_all_docs(namespace=namespace)
        ]
    finally:
        storage.close()

    data = {"vault": vault, "documents": docs, "count": len(docs)}
    json_str = json.dumps(data, indent=2, default=str)

    if output:
        with open(output, "w", encoding="utf-8") as f:
            f.write(json_str)
        click.echo(f"✅ Exported {len(docs)} documents to {output}")
    else:
        click.echo(json_str)
173
+
174
+
175
@cli.command()
@click.option("--vault", required=True, help="Path to vault database")
@click.option("--port", default=8080, help="Server port")
@click.option("--host", default="0.0.0.0", help="Server host")
def serve(vault: str, port: int, host: str) -> None:
    """Start VecForge REST API server.

    Example: vecforge serve --vault my.db --port 8080
    """
    for line in (
        f"VecForge REST Server — {vault}",
        f"Listening on {host}:{port}",
        "Built by Suneel Bose K · ArcGX TechLabs\n",
    ):
        click.echo(line)

    # why: heavyweight server dependencies are imported only when serving.
    import uvicorn

    from vecforge.server.app import create_app

    uvicorn.run(create_app(vault), host=host, port=port)
194
+
195
+
196
+ if __name__ == "__main__":
197
+ cli()
@@ -0,0 +1,3 @@
1
+ # VecForge — Universal Local-First Vector Database
2
+ # Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
3
+ # Licensed under BSL 1.1 — see LICENSE for details.
vecforge/core/bm25.py ADDED
@@ -0,0 +1,187 @@
1
+ # VecForge — Universal Local-First Vector Database
2
+ # Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
3
+ # Built by Suneel Bose K (Founder & CEO, ArcGX TechLabs)
4
+ #
5
+ # Licensed under the Business Source License 1.1 (BSL 1.1)
6
+ # Free for personal, research, open-source, and non-commercial use.
7
+ # Commercial use requires a separate license from ArcGX TechLabs.
8
+ # See LICENSE file in the project root or contact: suneelbose@arcgx.in
9
+
10
+ """
11
+ BM25 keyword search engine for VecForge.
12
+
13
+ Provides sparse keyword-based retrieval using BM25Okapi. Used alongside
14
+ FAISS dense retrieval for hybrid search.
15
+
16
+ Built by Suneel Bose K · ArcGX TechLabs Private Limited.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import logging
22
+ import re
23
+ from dataclasses import dataclass
24
+
25
+ import numpy as np
26
+ from rank_bm25 import BM25Okapi
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
@dataclass
class BM25Result:
    """A single BM25 search result.

    Attributes:
        doc_index: Index of the document in the corpus.
        score: BM25 relevance score (higher = more relevant).
    """

    # Position of the matched document within BM25Engine's corpus order.
    doc_index: int
    # Raw Okapi BM25 score; only comparable among results of one query.
    score: float
42
+
43
+
44
class BM25Engine:
    """BM25 keyword search engine using Okapi BM25.

    Maintains an in-memory inverted index for fast keyword retrieval.
    Rebuilt on each add operation (efficient for batch ingestion).

    Built by Suneel Bose K · ArcGX TechLabs Private Limited.

    Performance:
        Build: O(N * L) where N = docs, L = avg doc length
        Search: O(V * N) where V = query terms
        Typical: <2ms search at 100k docs

    Example:
        >>> engine = BM25Engine()
        >>> engine.add_documents(["patient with diabetes", "hip fracture case"])
        >>> results = engine.search("diabetes", top_k=1)
        >>> results[0].doc_index
        0
    """

    def __init__(self) -> None:
        # Tokenized documents in insertion order; BM25Result.doc_index
        # refers to positions in this list.
        self._corpus: list[list[str]] = []
        # Rebuilt from scratch on every add; None while the corpus is empty.
        self._bm25: BM25Okapi | None = None

    @property
    def count(self) -> int:
        """Return number of documents in the corpus.

        Performance:
            Time: O(1)
        """
        return len(self._corpus)

    @staticmethod
    def _tokenize(text: str) -> list[str]:
        """Tokenize text into lowercase words.

        Simple whitespace + punctuation tokenizer. Adequate for BM25
        where exact matching matters more than linguistic analysis.

        Args:
            text: Raw text string to tokenize.

        Returns:
            List of lowercase word tokens.

        Performance:
            Time: O(L) where L = length of text
        """
        # why: Simple regex tokenizer — BM25 doesn't need stemming for v0.1
        return re.findall(r"\b\w+\b", text.lower())

    def add_documents(self, texts: list[str]) -> None:
        """Add documents to the BM25 index.

        Rebuilds the internal BM25 index after adding. For best
        performance, batch all documents into a single call.

        Args:
            texts: List of document texts to add. An empty list is a no-op.

        Performance:
            Time: O(N * L) where N = total docs, L = avg doc length

        Example:
            >>> engine = BM25Engine()
            >>> engine.add_documents(["doc one", "doc two", "doc three"])
            >>> engine.count
            3
        """
        # fix: guard the empty batch — BM25Okapi divides by the corpus size
        # when computing average document length, so constructing it over an
        # empty corpus raises ZeroDivisionError.
        if not texts:
            return

        self._corpus.extend(self._tokenize(text) for text in texts)

        # why: Rebuild entire index — BM25Okapi doesn't support incremental add
        self._bm25 = BM25Okapi(self._corpus)
        logger.debug("BM25 index rebuilt with %d documents", len(self._corpus))

    def add_document(self, text: str) -> None:
        """Add a single document to the BM25 index.

        Args:
            text: Document text to add.

        Performance:
            Time: O(N * L) — rebuilds entire index
        """
        self.add_documents([text])

    def search(self, query: str, top_k: int = 10) -> list[BM25Result]:
        """Search for documents matching the query keywords.

        Args:
            query: Search query string.
            top_k: Number of top results to return.

        Returns:
            List of BM25Result sorted by descending score.
            Empty list if no documents in corpus.

        Performance:
            Time: O(V * N) where V = query terms, N = corpus size
            Typical: <2ms at 100k docs

        Example:
            >>> results = engine.search("diabetes treatment", top_k=5)
            >>> for r in results:
            ...     print(f"Doc {r.doc_index}: score={r.score:.4f}")
        """
        if self._bm25 is None or not self._corpus:
            return []

        query_tokens = self._tokenize(query)
        if not query_tokens:
            return []

        # perf: BM25Okapi.get_scores returns all scores in one pass
        scores = self._bm25.get_scores(query_tokens)

        # perf: Use argpartition for O(N) top-k instead of O(N log N) sort
        effective_k = min(top_k, len(scores))
        if effective_k == 0:
            return []

        top_indices = np.argpartition(scores, -effective_k)[-effective_k:]
        # why: Sort the top-k by score descending
        top_indices = top_indices[np.argsort(scores[top_indices])[::-1]]

        return [
            BM25Result(doc_index=int(idx), score=float(scores[idx]))
            for idx in top_indices
            if scores[idx] > 0.0  # why: Filter zero-score matches
        ]

    def reset(self) -> None:
        """Reset the BM25 index, removing all documents.

        Performance:
            Time: O(1)
        """
        self._corpus = []
        self._bm25 = None
@@ -0,0 +1,152 @@
1
+ # VecForge — Universal Local-First Vector Database
2
+ # Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
3
+ # Built by Suneel Bose K (Founder & CEO, ArcGX TechLabs)
4
+ #
5
+ # Licensed under the Business Source License 1.1 (BSL 1.1)
6
+ # Free for personal, research, open-source, and non-commercial use.
7
+ # Commercial use requires a separate license from ArcGX TechLabs.
8
+ # See LICENSE file in the project root or contact: suneelbose@arcgx.in
9
+
10
+ """
11
+ Embedding engine for VecForge.
12
+
13
+ Wraps sentence-transformers for local text embedding. No internet
14
+ required — models are downloaded once and cached locally.
15
+
16
+ Built by Suneel Bose K · ArcGX TechLabs Private Limited.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import logging
22
+ from typing import Any
23
+
24
+ import numpy as np
25
+ from numpy.typing import NDArray
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
# perf: Default model balances quality and speed for most use cases
_DEFAULT_MODEL = "all-MiniLM-L6-v2"


class Embedder:
    """Local text embedding engine using sentence-transformers.

    Lazily loads the model on first use to keep VecForge init fast.
    All processing runs locally — zero cloud dependency.

    Built by Suneel Bose K · ArcGX TechLabs Private Limited.

    Args:
        model_name: Name of the sentence-transformers model.
            Defaults to 'all-MiniLM-L6-v2' (384-dim, fast, good quality).
        device: Device to run on ('cpu', 'cuda'). Auto-detected if None.

    Performance:
        Time: O(n * d) where n = number of texts, d = model dimension
        Typical: ~5ms per text on CPU, ~0.5ms on GPU

    Example:
        >>> embedder = Embedder()
        >>> vectors = embedder.encode(["hello world", "vector search"])
        >>> vectors.shape
        (2, 384)
    """

    def __init__(
        self,
        model_name: str = _DEFAULT_MODEL,
        device: str | None = None,
    ) -> None:
        self._model_name = model_name
        self._device = device
        self._model: Any = None  # Lazy-loaded SentenceTransformer
        self._dimension: int | None = None

    @property
    def dimension(self) -> int:
        """Return embedding dimension, loading model if needed.

        Returns:
            Integer dimension of the embedding vectors.

        Raises:
            RuntimeError: If the loaded model reports no dimension.

        Performance:
            Time: O(1) after first call
        """
        if self._dimension is None:
            self._load_model()
        # fix: explicit check instead of `assert` — asserts are stripped
        # under `python -O`, which would let a None leak out typed as int.
        if self._dimension is None:
            raise RuntimeError(
                f"Model '{self._model_name}' reported no embedding dimension"
            )
        return self._dimension

    def _load_model(self) -> None:
        """Lazily load the sentence-transformer model.

        Raises:
            ImportError: If sentence-transformers is not installed.

        Performance:
            Time: O(1) — one-time cost of ~1-3 seconds for model loading
        """
        if self._model is not None:
            return

        try:
            from sentence_transformers import SentenceTransformer
        except ImportError as e:
            raise ImportError(
                "sentence-transformers is required for VecForge embeddings.\n"
                "Install with: pip install sentence-transformers\n"
                "VecForge by Suneel Bose K · ArcGX TechLabs"
            ) from e

        logger.info("Loading embedding model: %s", self._model_name)
        self._model = SentenceTransformer(self._model_name, device=self._device)
        self._dimension = self._model.get_sentence_embedding_dimension()
        logger.info(
            "Embedding model loaded: %s (dim=%d)",
            self._model_name,
            self._dimension,
        )

    def encode(
        self,
        texts: list[str] | str,
        batch_size: int = 64,
        normalize: bool = True,
        show_progress: bool = False,
    ) -> NDArray[np.float32]:
        """Encode texts into dense embedding vectors.

        Args:
            texts: Single string or list of strings to embed.
            batch_size: Batch size for encoding. Defaults to 64.
            normalize: If True, L2-normalize vectors for cosine similarity.
                Defaults to True.
            show_progress: Show progress bar for large batches.

        Returns:
            NumPy array of shape (n_texts, dimension) with float32 vectors.

        Performance:
            Time: O(n * d) where n = len(texts), d = model dimension
            Typical: ~5ms per text on CPU with default model

        Example:
            >>> embedder = Embedder()
            >>> vec = embedder.encode("patient with diabetes")
            >>> vec.shape
            (1, 384)
        """
        self._load_model()

        if isinstance(texts, str):
            texts = [texts]

        # perf: sentence-transformers handles batching internally
        vectors: NDArray[np.float32] = self._model.encode(
            texts,
            batch_size=batch_size,
            normalize_embeddings=normalize,
            show_progress_bar=show_progress,
            convert_to_numpy=True,
        )

        # fix: copy=False avoids a redundant full-array copy when the model
        # already returns float32 (the common case).
        return vectors.astype(np.float32, copy=False)