vecforge 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,493 @@
1
+ # VecForge — Universal Local-First Vector Database
2
+ # Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
3
+ # Built by Suneel Bose K (Founder & CEO, ArcGX TechLabs)
4
+ #
5
+ # Licensed under the Business Source License 1.1 (BSL 1.1)
6
+ # Free for personal, research, open-source, and non-commercial use.
7
+ # Commercial use requires a separate license from ArcGX TechLabs.
8
+ # See LICENSE file in the project root or contact: suneelbose@arcgx.in
9
+
10
+ """
11
+ SQLite + SQLCipher persistence layer for VecForge.
12
+
13
+ Provides encrypted document storage with namespace scoping. All queries
14
+ are scoped to a namespace to ensure multi-tenant isolation. Uses WAL
15
+ mode for concurrent read performance.
16
+
17
+ Built by Suneel Bose K · ArcGX TechLabs Private Limited.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import json
23
+ import logging
24
+ import sqlite3
25
+ import time
26
+ import uuid
27
+ from dataclasses import dataclass
28
+ from typing import Any
29
+
30
+ import numpy as np
31
+ from numpy.typing import NDArray
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+ _SCHEMA_VERSION = 1
36
+
37
+
38
@dataclass(eq=False)
class StoredDocument:
    """A document stored in the VecForge vault.

    Equality is implemented by hand because the dataclass-generated
    ``__eq__`` compares fields as a tuple, which raises ``ValueError`` for
    multi-element numpy arrays (the truth value of an element-wise
    comparison is ambiguous). Instances remain unhashable, exactly as with
    the default ``@dataclass`` behaviour.

    Attributes:
        doc_id: Unique document identifier (UUID).
        text: Original document text.
        embedding: Dense embedding vector as numpy array.
        metadata: User-provided metadata dictionary.
        namespace: Namespace this document belongs to.
        modality: Content modality (text, image, audio, etc.).
        created_at: Unix timestamp of creation.
    """

    doc_id: str
    text: str
    embedding: NDArray[np.float32]
    metadata: dict[str, Any]
    namespace: str
    modality: str
    created_at: float

    def __eq__(self, other: object) -> bool:
        """Return True when every field matches, comparing embeddings by value.

        Args:
            other: Object to compare against.

        Returns:
            True if ``other`` is the same class and all fields are equal
            (embeddings via ``np.array_equal``); ``NotImplemented`` for
            objects of a different class.
        """
        if not isinstance(other, StoredDocument) or other.__class__ is not self.__class__:
            return NotImplemented

        # Compare the cheap scalar/dict fields first so the array comparison
        # only runs when everything else already matches.
        mine = (
            self.doc_id,
            self.text,
            self.metadata,
            self.namespace,
            self.modality,
            self.created_at,
        )
        theirs = (
            other.doc_id,
            other.text,
            other.metadata,
            other.namespace,
            other.modality,
            other.created_at,
        )
        return mine == theirs and bool(np.array_equal(self.embedding, other.embedding))
59
+
60
+
61
+ class StorageBackend:
62
+ """SQLite/SQLCipher persistence backend for VecForge.
63
+
64
+ All queries are namespace-scoped to prevent cross-tenant data leaks.
65
+ Uses WAL mode for concurrent read performance. Supports optional
66
+ SQLCipher AES-256 encryption at rest.
67
+
68
+ Built by Suneel Bose K · ArcGX TechLabs Private Limited.
69
+
70
+ Args:
71
+ path: Database file path, or ':memory:' for in-memory storage.
72
+ encryption_key: Optional SQLCipher encryption key. If provided
73
+ and SQLCipher is available, AES-256 encryption is enabled.
74
+
75
+ Performance:
76
+ Insert: O(1) amortized per document
77
+ Lookup: O(log N) with indexed columns
78
+ Namespace-scoped queries: O(log N) using composite index
79
+
80
+ Example:
81
+ >>> storage = StorageBackend(":memory:")
82
+ >>> doc = storage.insert_doc("hello world", embedding, {}, "default")
83
+ >>> retrieved = storage.get_doc(doc.doc_id)
84
+ """
85
+
86
+ def __init__(
87
+ self,
88
+ path: str = ":memory:",
89
+ encryption_key: str | None = None,
90
+ ) -> None:
91
+ self._path = path
92
+ self._encryption_key = encryption_key
93
+ self._conn: sqlite3.Connection | None = None
94
+ self._encrypted = False
95
+
96
+ self._connect()
97
+ self._init_schema()
98
+
99
+ def _connect(self) -> None:
100
+ """Establish database connection, optionally with encryption.
101
+
102
+ Performance:
103
+ Time: O(1) — connection setup
104
+ """
105
+ # security: Try SQLCipher first if encryption key is provided
106
+ if self._encryption_key:
107
+ try:
108
+ import sqlcipher3
109
+
110
+ self._conn = sqlcipher3.connect(self._path)
111
+ self._conn.execute(f"PRAGMA key = '{self._encryption_key}'")
112
+ self._encrypted = True
113
+ logger.info("SQLCipher encryption enabled (AES-256)")
114
+ except ImportError:
115
+ logger.warning(
116
+ "sqlcipher3 not installed — falling back to unencrypted "
117
+ "SQLite. Install sqlcipher3 for AES-256 encryption."
118
+ )
119
+ self._conn = sqlite3.connect(self._path, check_same_thread=False)
120
+ else:
121
+ self._conn = sqlite3.connect(self._path, check_same_thread=False)
122
+
123
+ # perf: WAL mode for concurrent reads
124
+ self._conn.execute("PRAGMA journal_mode=WAL")
125
+ self._conn.execute("PRAGMA synchronous=NORMAL")
126
+ self._conn.row_factory = sqlite3.Row
127
+
128
+ def _init_schema(self) -> None:
129
+ """Initialize database schema if not exists.
130
+
131
+ Performance:
132
+ Time: O(1)
133
+ """
134
+ assert self._conn is not None
135
+
136
+ self._conn.executescript("""
137
+ CREATE TABLE IF NOT EXISTS schema_version (
138
+ version INTEGER PRIMARY KEY
139
+ );
140
+
141
+ CREATE TABLE IF NOT EXISTS documents (
142
+ doc_id TEXT PRIMARY KEY,
143
+ text TEXT NOT NULL,
144
+ embedding BLOB NOT NULL,
145
+ metadata_json TEXT NOT NULL DEFAULT '{}',
146
+ namespace TEXT NOT NULL DEFAULT 'default',
147
+ modality TEXT NOT NULL DEFAULT 'text',
148
+ created_at REAL NOT NULL,
149
+ updated_at REAL
150
+ );
151
+
152
+ CREATE INDEX IF NOT EXISTS idx_documents_namespace
153
+ ON documents(namespace);
154
+
155
+ CREATE INDEX IF NOT EXISTS idx_documents_namespace_created
156
+ ON documents(namespace, created_at);
157
+
158
+ CREATE TABLE IF NOT EXISTS namespaces (
159
+ name TEXT PRIMARY KEY,
160
+ created_at REAL NOT NULL,
161
+ metadata_json TEXT NOT NULL DEFAULT '{}'
162
+ );
163
+
164
+ CREATE TABLE IF NOT EXISTS faiss_index (
165
+ id INTEGER PRIMARY KEY CHECK (id = 1),
166
+ index_data BLOB NOT NULL,
167
+ dimension INTEGER NOT NULL,
168
+ count INTEGER NOT NULL,
169
+ updated_at REAL NOT NULL
170
+ );
171
+ """)
172
+
173
+ # Ensure default namespace exists
174
+ self._conn.execute(
175
+ """
176
+ INSERT OR IGNORE INTO namespaces (name, created_at)
177
+ VALUES ('default', ?)
178
+ """,
179
+ (time.time(),),
180
+ )
181
+
182
+ self._conn.commit()
183
+ logger.debug("Database schema initialized (version %d)", _SCHEMA_VERSION)
184
+
185
+ def insert_doc(
186
+ self,
187
+ text: str,
188
+ embedding: NDArray[np.float32],
189
+ metadata: dict[str, Any],
190
+ namespace: str = "default",
191
+ modality: str = "text",
192
+ doc_id: str | None = None,
193
+ ) -> StoredDocument:
194
+ """Insert a document into the vault.
195
+
196
+ Args:
197
+ text: Document text content.
198
+ embedding: Dense embedding vector.
199
+ metadata: User-provided metadata.
200
+ namespace: Target namespace. Defaults to 'default'.
201
+ modality: Content type. Defaults to 'text'.
202
+ doc_id: Optional custom document ID. Auto-generated if None.
203
+
204
+ Returns:
205
+ StoredDocument with the inserted document's details.
206
+
207
+ Raises:
208
+ sqlite3.IntegrityError: If doc_id already exists.
209
+
210
+ Performance:
211
+ Time: O(1) amortized
212
+
213
+ Example:
214
+ >>> doc = storage.insert_doc("hello", embedding, {"source": "test"})
215
+ >>> print(doc.doc_id)
216
+ 'a1b2c3d4...'
217
+ """
218
+ assert self._conn is not None
219
+
220
+ if doc_id is None:
221
+ doc_id = str(uuid.uuid4())
222
+
223
+ now = time.time()
224
+
225
+ # security: Always scope to namespace
226
+ self._conn.execute(
227
+ """
228
+ INSERT INTO documents (doc_id, text, embedding, metadata_json,
229
+ namespace, modality, created_at)
230
+ VALUES (?, ?, ?, ?, ?, ?, ?)
231
+ """,
232
+ (
233
+ doc_id,
234
+ text,
235
+ embedding.tobytes(),
236
+ json.dumps(metadata),
237
+ namespace,
238
+ modality,
239
+ now,
240
+ ),
241
+ )
242
+ self._conn.commit()
243
+
244
+ return StoredDocument(
245
+ doc_id=doc_id,
246
+ text=text,
247
+ embedding=embedding,
248
+ metadata=metadata,
249
+ namespace=namespace,
250
+ modality=modality,
251
+ created_at=now,
252
+ )
253
+
254
+ def get_doc(self, doc_id: str) -> StoredDocument | None:
255
+ """Retrieve a document by ID.
256
+
257
+ Args:
258
+ doc_id: Document identifier.
259
+
260
+ Returns:
261
+ StoredDocument if found, None otherwise.
262
+
263
+ Performance:
264
+ Time: O(log N) — indexed lookup
265
+ """
266
+ assert self._conn is not None
267
+
268
+ row = self._conn.execute(
269
+ "SELECT * FROM documents WHERE doc_id = ?", (doc_id,)
270
+ ).fetchone()
271
+
272
+ if row is None:
273
+ return None
274
+
275
+ return self._row_to_document(row)
276
+
277
+ def get_docs_by_namespace(
278
+ self, namespace: str, limit: int = 1000, offset: int = 0
279
+ ) -> list[StoredDocument]:
280
+ """Retrieve documents within a specific namespace.
281
+
282
+ Args:
283
+ namespace: Namespace to query.
284
+ limit: Maximum documents to return.
285
+ offset: Pagination offset.
286
+
287
+ Returns:
288
+ List of StoredDocument in creation order.
289
+
290
+ Performance:
291
+ Time: O(log N + limit) — index scan
292
+
293
+ Example:
294
+ >>> docs = storage.get_docs_by_namespace("ward_7", limit=50)
295
+ """
296
+ assert self._conn is not None
297
+
298
+ # security: CRITICAL — always scope to namespace
299
+ rows = self._conn.execute(
300
+ """
301
+ SELECT * FROM documents
302
+ WHERE namespace = ?
303
+ ORDER BY created_at ASC
304
+ LIMIT ? OFFSET ?
305
+ """,
306
+ (namespace, limit, offset),
307
+ ).fetchall()
308
+
309
+ return [self._row_to_document(row) for row in rows]
310
+
311
+ def get_all_docs(self, namespace: str | None = None) -> list[StoredDocument]:
312
+ """Retrieve all documents, optionally filtered by namespace.
313
+
314
+ Args:
315
+ namespace: If provided, only docs from this namespace.
316
+ If None, returns docs from ALL namespaces.
317
+
318
+ Returns:
319
+ List of all matching StoredDocument.
320
+
321
+ Performance:
322
+ Time: O(N) — full table scan
323
+ """
324
+ assert self._conn is not None
325
+
326
+ if namespace is not None:
327
+ # security: Namespace-scoped query
328
+ rows = self._conn.execute(
329
+ "SELECT * FROM documents WHERE namespace = ? ORDER BY created_at",
330
+ (namespace,),
331
+ ).fetchall()
332
+ else:
333
+ rows = self._conn.execute(
334
+ "SELECT * FROM documents ORDER BY created_at"
335
+ ).fetchall()
336
+
337
+ return [self._row_to_document(row) for row in rows]
338
+
339
+ def delete_doc(self, doc_id: str) -> bool:
340
+ """Delete a document by ID.
341
+
342
+ Args:
343
+ doc_id: Document identifier.
344
+
345
+ Returns:
346
+ True if document was deleted, False if not found.
347
+
348
+ Performance:
349
+ Time: O(log N)
350
+ """
351
+ assert self._conn is not None
352
+
353
+ cursor = self._conn.execute("DELETE FROM documents WHERE doc_id = ?", (doc_id,))
354
+ self._conn.commit()
355
+ return cursor.rowcount > 0
356
+
357
+ def count_docs(self, namespace: str | None = None) -> int:
358
+ """Count documents in the vault.
359
+
360
+ Args:
361
+ namespace: If provided, count only this namespace.
362
+
363
+ Returns:
364
+ Number of documents.
365
+
366
+ Performance:
367
+ Time: O(1) with index
368
+ """
369
+ assert self._conn is not None
370
+
371
+ if namespace is not None:
372
+ row = self._conn.execute(
373
+ "SELECT COUNT(*) FROM documents WHERE namespace = ?",
374
+ (namespace,),
375
+ ).fetchone()
376
+ else:
377
+ row = self._conn.execute("SELECT COUNT(*) FROM documents").fetchone()
378
+
379
+ return int(row[0]) if row else 0
380
+
381
+ def list_namespaces(self) -> list[str]:
382
+ """List all namespace names.
383
+
384
+ Returns:
385
+ Sorted list of namespace names.
386
+
387
+ Performance:
388
+ Time: O(K) where K = number of namespaces
389
+ """
390
+ assert self._conn is not None
391
+
392
+ rows = self._conn.execute(
393
+ "SELECT name FROM namespaces ORDER BY name"
394
+ ).fetchall()
395
+
396
+ return [row["name"] for row in rows]
397
+
398
+ def create_namespace(self, name: str) -> None:
399
+ """Create a new namespace.
400
+
401
+ Args:
402
+ name: Namespace name. Must be unique.
403
+
404
+ Performance:
405
+ Time: O(1)
406
+ """
407
+ assert self._conn is not None
408
+
409
+ self._conn.execute(
410
+ "INSERT OR IGNORE INTO namespaces (name, created_at) VALUES (?, ?)",
411
+ (name, time.time()),
412
+ )
413
+ self._conn.commit()
414
+
415
+ def save_faiss_index(self, index_data: bytes, dimension: int, count: int) -> None:
416
+ """Persist the FAISS index to storage.
417
+
418
+ Args:
419
+ index_data: Serialized FAISS index bytes.
420
+ dimension: Embedding dimension.
421
+ count: Number of vectors in the index.
422
+
423
+ Performance:
424
+ Time: O(N * d) — proportional to index size
425
+ """
426
+ assert self._conn is not None
427
+
428
+ self._conn.execute(
429
+ """
430
+ INSERT OR REPLACE INTO faiss_index
431
+ (id, index_data, dimension, count, updated_at)
432
+ VALUES (1, ?, ?, ?, ?)
433
+ """,
434
+ (index_data, dimension, count, time.time()),
435
+ )
436
+ self._conn.commit()
437
+
438
+ def load_faiss_index(self) -> tuple[bytes, int, int] | None:
439
+ """Load the persisted FAISS index.
440
+
441
+ Returns:
442
+ Tuple of (index_data, dimension, count) or None if not saved.
443
+
444
+ Performance:
445
+ Time: O(N * d) — proportional to index size
446
+ """
447
+ assert self._conn is not None
448
+
449
+ row = self._conn.execute(
450
+ "SELECT index_data, dimension, count FROM faiss_index WHERE id = 1"
451
+ ).fetchone()
452
+
453
+ if row is None:
454
+ return None
455
+
456
+ return (bytes(row["index_data"]), int(row["dimension"]), int(row["count"]))
457
+
458
+ def _row_to_document(self, row: sqlite3.Row) -> StoredDocument:
459
+ """Convert a database row to a StoredDocument.
460
+
461
+ Performance:
462
+ Time: O(d) where d = embedding dimension
463
+ """
464
+ embedding = np.frombuffer(row["embedding"], dtype=np.float32)
465
+
466
+ return StoredDocument(
467
+ doc_id=row["doc_id"],
468
+ text=row["text"],
469
+ embedding=embedding,
470
+ metadata=json.loads(row["metadata_json"]),
471
+ namespace=row["namespace"],
472
+ modality=row["modality"],
473
+ created_at=row["created_at"],
474
+ )
475
+
476
+ @property
477
+ def is_encrypted(self) -> bool:
478
+ """Return whether the storage backend is using encryption.
479
+
480
+ Performance:
481
+ Time: O(1)
482
+ """
483
+ return self._encrypted
484
+
485
+ def close(self) -> None:
486
+ """Close the database connection.
487
+
488
+ Performance:
489
+ Time: O(1)
490
+ """
491
+ if self._conn:
492
+ self._conn.close()
493
+ self._conn = None