beaver-db 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

beaver/collections.py CHANGED
@@ -1,11 +1,13 @@
  import json
  import sqlite3
+ import threading
  import uuid
  from enum import Enum
- from typing import Any, List, Literal, Set
+ from typing import Any, List, Literal, Tuple

  import numpy as np
- from scipy.spatial import KDTree
+
+ from .vectors import VectorIndex


  # --- Fuzzy Search Helper Functions ---
@@ -103,16 +105,18 @@ class Document:

  class CollectionManager:
      """
-     A wrapper for multi-modal collection operations with an in-memory ANN index,
-     FTS, and graph capabilities.
+     A wrapper for multi-modal collection operations, including document storage,
+     FTS, fuzzy search, graph traversal, and persistent vector search.
      """

      def __init__(self, name: str, conn: sqlite3.Connection):
          self._name = name
          self._conn = conn
-         self._kdtree: KDTree | None = None
-         self._doc_ids: List[str] = []
-         self._local_index_version = -1  # Version of the in-memory index
+         # All vector-related operations are now delegated to the VectorIndex class.
+         self._vector_index = VectorIndex(name, conn)
+         # A lock to ensure only one compaction thread runs at a time for this collection.
+         self._compaction_lock = threading.Lock()
+         self._compaction_thread: threading.Thread | None = None

      def _flatten_metadata(self, metadata: dict, prefix: str = "") -> dict[str, Any]:
          """Flattens a nested dictionary for indexing."""
@@ -125,21 +129,58 @@ class CollectionManager:
              flat_dict[new_key] = value
          return flat_dict

-     def _get_db_version(self) -> int:
-         """Gets the current version of the collection from the database."""
+     def _needs_compaction(self, threshold: int = 1000) -> bool:
+         """Checks if the total number of pending vector operations exceeds the threshold."""
          cursor = self._conn.cursor()
          cursor.execute(
-             "SELECT version FROM beaver_collection_versions WHERE collection_name = ?",
-             (self._name,),
+             "SELECT COUNT(*) FROM _beaver_ann_pending_log WHERE collection_name = ?",
+             (self._name,)
+         )
+         pending_count = cursor.fetchone()[0]
+         cursor.execute(
+             "SELECT COUNT(*) FROM _beaver_ann_deletions_log WHERE collection_name = ?",
+             (self._name,)
          )
-         result = cursor.fetchone()
-         return result[0] if result else 0
+         deletion_count = cursor.fetchone()[0]
+         return (pending_count + deletion_count) >= threshold
+
+     def _run_compaction_and_release_lock(self):
+         """
+         A target function for the background thread that runs the compaction
+         and ensures the lock is always released, even if errors occur.
+         """
+         try:
+             self._vector_index.compact()
+         finally:
+             self._compaction_lock.release()
+
+     def compact(self, block: bool = False):
+         """
+         Triggers a non-blocking background compaction of the vector index.
+
+         If a compaction is already running for this collection, this method returns
+         immediately without starting a new one.

-     def _is_index_stale(self) -> bool:
-         """Checks if the in-memory index is out of sync with the DB."""
-         if self._local_index_version == -1:
-             return True
-         return self._local_index_version < self._get_db_version()
+         Args:
+             block: If True, this method will wait for the compaction to complete
+                 before returning. Defaults to False (non-blocking).
+         """
+         # Use a non-blocking lock acquire to check if a compaction is already running.
+         if self._compaction_lock.acquire(blocking=False):
+             try:
+                 # If we get the lock, start a new background thread.
+                 self._compaction_thread = threading.Thread(
+                     target=self._run_compaction_and_release_lock,
+                     daemon=True  # Daemon threads don't block program exit.
+                 )
+                 self._compaction_thread.start()
+                 if block:
+                     self._compaction_thread.join()
+             except Exception:
+                 # If something goes wrong during thread creation, release the lock.
+                 self._compaction_lock.release()
+                 raise
+         # If acquire fails, it means another thread holds the lock, so we do nothing.
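The new `compact()` entry point can also be called manually, e.g. before backing up the database file. A minimal sketch; the `BeaverDB` import path and the file name are assumptions for illustration:

```python
from beaver import BeaverDB  # assumed top-level export

db = BeaverDB("app.db")
docs = db.collection("docs")

docs.compact()            # fire-and-forget: spawns a daemon thread and returns
docs.compact(block=True)  # joins the thread this call started, so the swap is done on return
```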

      def index(
          self,
@@ -152,9 +193,14 @@ class CollectionManager:
          Indexes a Document, including vector, FTS, and fuzzy search data.
          The entire operation is performed in a single atomic transaction.
          """
+         if not isinstance(document, Document):
+             raise TypeError("Item to index must be a Document object.")
+
          with self._conn:
-             # Step 1: Core Document and Vector Storage (Unaffected by FTS/Fuzzy)
-             self._conn.execute(
+             cursor = self._conn.cursor()
+
+             # Step 1: Core Document and Vector Storage
+             cursor.execute(
                  "INSERT OR REPLACE INTO beaver_collections (collection, item_id, item_vector, metadata) VALUES (?, ?, ?, ?)",
                  (
                      self._name,
@@ -164,12 +210,14 @@ class CollectionManager:
                  ),
              )

-             # Step 2: FTS and Fuzzy Indexing
-             # First, clean up old index data for this document
-             self._conn.execute("DELETE FROM beaver_fts_index WHERE collection = ? AND item_id = ?", (self._name, document.id))
-             self._conn.execute("DELETE FROM beaver_trigrams WHERE collection = ? AND item_id = ?", (self._name, document.id))
+             # Step 2: Delegate to the VectorIndex if an embedding exists.
+             if document.embedding is not None:
+                 self._vector_index.index(document.id, document.embedding, cursor)
+
+             # Step 3: FTS and Fuzzy Indexing
+             cursor.execute("DELETE FROM beaver_fts_index WHERE collection = ? AND item_id = ?", (self._name, document.id))
+             cursor.execute("DELETE FROM beaver_trigrams WHERE collection = ? AND item_id = ?", (self._name, document.id))

-             # Determine which string fields to index
              flat_metadata = self._flatten_metadata(document.to_dict())
              fields_to_index: dict[str, str] = {}
              if isinstance(fts, list):
@@ -178,63 +226,46 @@ class CollectionManager:
                  fields_to_index = {k: v for k, v in flat_metadata.items() if isinstance(v, str)}

              if fields_to_index:
-                 # FTS indexing
                  fts_data = [(self._name, document.id, path, content) for path, content in fields_to_index.items()]
-                 self._conn.executemany(
-                     "INSERT INTO beaver_fts_index (collection, item_id, field_path, field_content) VALUES (?, ?, ?, ?)",
-                     fts_data,
-                 )
-
-                 # Fuzzy indexing (if enabled)
+                 cursor.executemany("INSERT INTO beaver_fts_index (collection, item_id, field_path, field_content) VALUES (?, ?, ?, ?)", fts_data)
                  if fuzzy:
                      trigram_data = []
                      for path, content in fields_to_index.items():
                          for trigram in _get_trigrams(content.lower()):
                              trigram_data.append((self._name, document.id, path, trigram))
                      if trigram_data:
-                         self._conn.executemany(
-                             "INSERT INTO beaver_trigrams (collection, item_id, field_path, trigram) VALUES (?, ?, ?, ?)",
-                             trigram_data,
-                         )
+                         cursor.executemany("INSERT INTO beaver_trigrams (collection, item_id, field_path, trigram) VALUES (?, ?, ?, ?)", trigram_data)

-             # Step 3: Update Collection Version
-             self._conn.execute(
-                 """
-                 INSERT INTO beaver_collection_versions (collection_name, version) VALUES (?, 1)
-                 ON CONFLICT(collection_name) DO UPDATE SET version = version + 1
-                 """,
+             # Step 4: Update Collection Version to signal a change.
+             cursor.execute(
+                 "INSERT INTO beaver_collection_versions (collection_name, version) VALUES (?, 1) ON CONFLICT(collection_name) DO UPDATE SET version = version + 1",
                  (self._name,),
              )

+         # After the transaction commits, check if auto-compaction is needed.
+         if self._needs_compaction():
+             self.compact()
+
      def drop(self, document: Document):
          """Removes a document and all its associated data from the collection."""
          if not isinstance(document, Document):
              raise TypeError("Item to drop must be a Document object.")
          with self._conn:
-             self._conn.execute(
-                 "DELETE FROM beaver_collections WHERE collection = ? AND item_id = ?",
-                 (self._name, document.id),
-             )
-             self._conn.execute(
-                 "DELETE FROM beaver_fts_index WHERE collection = ? AND item_id = ?",
-                 (self._name, document.id),
-             )
-             self._conn.execute(
-                 "DELETE FROM beaver_trigrams WHERE collection = ? AND item_id = ?",
-                 (self._name, document.id),
-             )
-             self._conn.execute(
-                 "DELETE FROM beaver_edges WHERE collection = ? AND (source_item_id = ? OR target_item_id = ?)",
-                 (self._name, document.id, document.id),
-             )
-             self._conn.execute(
-                 """
-                 INSERT INTO beaver_collection_versions (collection_name, version) VALUES (?, 1)
-                 ON CONFLICT(collection_name) DO UPDATE SET version = version + 1
-                 """,
+             cursor = self._conn.cursor()
+             cursor.execute("DELETE FROM beaver_collections WHERE collection = ? AND item_id = ?", (self._name, document.id))
+             cursor.execute("DELETE FROM beaver_fts_index WHERE collection = ? AND item_id = ?", (self._name, document.id))
+             cursor.execute("DELETE FROM beaver_trigrams WHERE collection = ? AND item_id = ?", (self._name, document.id))
+             cursor.execute("DELETE FROM beaver_edges WHERE collection = ? AND (source_item_id = ? OR target_item_id = ?)", (self._name, document.id, document.id))
+             self._vector_index.drop(document.id, cursor)
+             cursor.execute(
+                 "INSERT INTO beaver_collection_versions (collection_name, version) VALUES (?, 1) ON CONFLICT(collection_name) DO UPDATE SET version = version + 1",
                  (self._name,),
              )

+         # Check for auto-compaction after a drop as well.
+         if self._needs_compaction():
+             self.compact()
+
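Both write paths now finish by consulting `_needs_compaction()`, so a sustained write load triggers background rebuilds on its own once roughly 1000 logged operations accumulate. A hedged sketch of the new write path (`fts=True`/`fuzzy=True` mirror the flags visible in this hunk; the metadata keyword is illustrative, since `Document` treats extra kwargs as metadata):

```python
import uuid

from beaver import BeaverDB  # assumed top-level export
from beaver.collections import Document

db = BeaverDB("app.db")
docs = db.collection("docs")

doc = Document(id=str(uuid.uuid4()), embedding=[0.1, 0.2, 0.3, 0.4], title="hello")
docs.index(doc, fts=True, fuzzy=True)  # stored, logged, and searchable immediately
docs.drop(doc)                         # tombstoned now; removed for good at the next compaction
```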
      def __iter__(self):
          """Returns an iterator over all documents in the collection."""
          cursor = self._conn.cursor()
@@ -253,57 +284,45 @@ class CollectionManager:
              )
          cursor.close()

-     def refresh(self):
-         """Forces a rebuild of the in-memory ANN index from data in SQLite."""
-         cursor = self._conn.cursor()
-         cursor.execute(
-             "SELECT item_id, item_vector FROM beaver_collections WHERE collection = ? AND item_vector IS NOT NULL",
-             (self._name,),
-         )
-         vectors, self._doc_ids = [], []
-         for row in cursor.fetchall():
-             self._doc_ids.append(row["item_id"])
-             vectors.append(np.frombuffer(row["item_vector"], dtype=np.float32))
-
-         self._kdtree = KDTree(vectors) if vectors else None
-         self._local_index_version = self._get_db_version()
-
      def search(
          self, vector: list[float], top_k: int = 10
-     ) -> list[tuple[Document, float]]:
-         """Performs a fast approximate nearest neighbor search."""
-         if self._is_index_stale():
-             self.refresh()
-         if not self._kdtree:
-             return []
+     ) -> List[Tuple[Document, float]]:
+         """Performs a fast, persistent approximate nearest neighbor search."""
+         if not isinstance(vector, list):
+             raise TypeError("Search vector must be a list of floats.")

-         if top_k > len(self._doc_ids):
-             top_k = len(self._doc_ids)
-
-         distances, indices = self._kdtree.query(
-             np.array(vector, dtype=np.float32), k=top_k
+         search_results = self._vector_index.search(
+             np.array(vector, dtype=np.float32), top_k=top_k
          )
-         if top_k == 1:
-             distances, indices = [distances], [indices]
+         if not search_results:
+             return []
+
+         result_ids = [item[0] for item in search_results]
+         distance_map = {item[0]: item[1] for item in search_results}

-         result_ids = [self._doc_ids[i] for i in indices]
          placeholders = ",".join("?" for _ in result_ids)
          sql = f"SELECT item_id, item_vector, metadata FROM beaver_collections WHERE collection = ? AND item_id IN ({placeholders})"

          cursor = self._conn.cursor()
          rows = cursor.execute(sql, (self._name, *result_ids)).fetchall()
-         row_map = {row["item_id"]: row for row in rows}

-         results = []
-         for i, doc_id in enumerate(result_ids):
-             row = row_map.get(doc_id)
-             if row:
-                 embedding = np.frombuffer(row["item_vector"], dtype=np.float32).tolist()
-                 doc = Document(
-                     id=doc_id, embedding=embedding, **json.loads(row["metadata"])
-                 )
-                 results.append((doc, float(distances[i])))
-         return results
+         doc_map = {
+             row["item_id"]: Document(
+                 id=row["item_id"],
+                 embedding=(np.frombuffer(row["item_vector"], dtype=np.float32).tolist() if row["item_vector"] else None),
+                 **json.loads(row["metadata"]),
+             )
+             for row in rows
+         }
+
+         final_results = []
+         for doc_id in result_ids:
+             if doc_id in doc_map:
+                 doc = doc_map[doc_id]
+                 distance = distance_map[doc_id]
+                 final_results.append((doc, distance))
+
+         return final_results
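On the read side the API shape is unchanged; results simply come from the persistent hybrid index now, with no stale-index refresh pauses. A short usage sketch under the same assumptions as above:

```python
from beaver import BeaverDB  # assumed top-level export

docs = BeaverDB("app.db").collection("docs")
for doc, distance in docs.search([0.1, 0.2, 0.3, 0.4], top_k=5):
    print(doc.id, f"{distance:.4f}")  # L2 distance: smaller means closer
```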

      def match(
          self,
@@ -315,14 +334,6 @@ class CollectionManager:
      ) -> list[tuple[Document, float]]:
          """
          Performs a full-text or fuzzy search on indexed string fields.
-
-         Args:
-             query: The search query string.
-             on: An optional list of fields to restrict the search to.
-             top_k: The maximum number of results to return.
-             fuzziness: The Levenshtein distance for fuzzy matching.
-                 If 0, performs an exact FTS search.
-                 If > 0, performs a fuzzy search.
          """
          if isinstance(on, str):
              on = [on]
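The removed docstring still describes the contract: `fuzziness=0` runs an exact FTS5 query, while `fuzziness>0` runs a Levenshtein-bounded fuzzy query. A hedged usage sketch:

```python
from beaver import BeaverDB  # assumed top-level export

docs = BeaverDB("app.db").collection("docs")
exact = docs.match("framework", top_k=5)              # porter-stemmed FTS5 search
fuzzy = docs.match("framwork", top_k=5, fuzziness=1)  # tolerates one edit
scoped = docs.match("python", on=["title"], top_k=5)  # restrict to one metadata field
```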
@@ -372,9 +383,6 @@ class CollectionManager:
          if not query_trigrams:
              return set()

-         # Optimization: Only consider documents that share a significant number of trigrams.
-         # This threshold dramatically reduces the number of candidates for the expensive
-         # Levenshtein check. A 30% threshold is a reasonable starting point.
          similarity_threshold = int(len(query_trigrams) * 0.3)
          if similarity_threshold == 0:
              return set()
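The 30% prefilter is easiest to see with a worked query. This sketch assumes `_get_trigrams` yields the usual overlapping three-character windows; the real helper is outside this hunk:

```python
def trigrams(text: str) -> set[str]:
    # hypothetical stand-in for _get_trigrams
    return {text[i:i + 3] for i in range(len(text) - 2)}

query_trigrams = trigrams("framework")      # 7 trigrams: fra, ram, ame, mew, ewo, wor, ork
threshold = int(len(query_trigrams) * 0.3)  # int(2.1) == 2
print(threshold)  # only candidates sharing about 2+ trigrams reach the Levenshtein check
```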
@@ -404,7 +412,6 @@ class CollectionManager:
          self, query: str, on: list[str] | None, top_k: int, fuzziness: int
      ) -> list[tuple[Document, float]]:
          """Performs a 3-stage fuzzy search: gather, score, and sort."""
-         # Stage 1: Gather Candidates
          fts_results = self._perform_fts_search(query, on, top_k)
          fts_candidate_ids = {doc.id for doc, _ in fts_results}
          trigram_candidate_ids = self._get_trigram_candidates(query, on)
@@ -412,7 +419,6 @@ class CollectionManager:
          if not candidate_ids:
              return []

-         # Stage 2: Score Candidates
          cursor = self._conn.cursor()
          id_placeholders = ",".join("?" for _ in candidate_ids)
          sql_text = f"SELECT item_id, field_path, field_content FROM beaver_fts_index WHERE collection = ? AND item_id IN ({id_placeholders})"
@@ -445,10 +451,9 @@ class CollectionManager:
              scored_candidates.append({
                  "id": item_id,
                  "distance": min_dist,
-                 "fts_rank": fts_rank_map.get(item_id, 0)  # Use 0 for non-matches (less relevant)
+                 "fts_rank": fts_rank_map.get(item_id, 0)
              })

-         # Stage 3: Sort and Fetch Results
          scored_candidates.sort(key=lambda x: (x["distance"], x["fts_rank"]))
          top_ids = [c["id"] for c in scored_candidates[:top_k]]
          if not top_ids:
@@ -563,49 +568,26 @@ def rerank(
  ) -> list[Document]:
      """
      Reranks documents from multiple search result lists using Reverse Rank Fusion (RRF).
-     This function is specifically designed to work with beaver.collections.Document objects.
-
-     Args:
-         results (sequence of list[Document]): A sequence of search result lists, where each
-             inner list contains Document objects.
-         weights (list[float], optional): A list of weights corresponding to each
-             result list. If None, all lists are weighted equally. Defaults to None.
-         k (int, optional): A constant used in the RRF formula. Defaults to 60.
-
-     Returns:
-         list[Document]: A single, reranked list of unique Document objects, sorted
-             by their fused rank score in descending order.
      """
      if not results:
          return []

-     # Assign a default weight of 1.0 if none are provided
      if weights is None:
          weights = [1.0] * len(results)

      if len(results) != len(weights):
          raise ValueError("The number of result lists must match the number of weights.")

-     # Use dictionaries to store scores and unique documents by their ID
      rrf_scores: dict[str, float] = {}
      doc_store: dict[str, Document] = {}

-     # Iterate through each list of Document objects and its weight
      for result_list, weight in zip(results, weights):
          for rank, doc in enumerate(result_list):
-             # Use the .id attribute from the Document object
              doc_id = doc.id
              if doc_id not in doc_store:
                  doc_store[doc_id] = doc
-
-             # Calculate the reciprocal rank score, scaled by the weight
              score = weight * (1 / (k + rank))
-
-             # Add the score to the document's running total
              rrf_scores[doc_id] = rrf_scores.get(doc_id, 0.0) + score

-     # Sort the document IDs by their final aggregated scores
      sorted_doc_ids = sorted(rrf_scores.keys(), key=rrf_scores.get, reverse=True)
-
-     # Return the final list of Document objects in the new, reranked order
      return [doc_store[doc_id] for doc_id in sorted_doc_ids]
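Despite the docstring's "Reverse Rank Fusion" wording, the formula is standard Reciprocal Rank Fusion: each document accumulates `weight * 1 / (k + rank)` across the lists it appears in, and `k=60` damps the gap between top ranks. A hedged sketch fusing the two retrieval modes:

```python
from beaver import BeaverDB  # assumed top-level export
from beaver.collections import rerank

docs = BeaverDB("app.db").collection("docs")
vector_hits = [doc for doc, _ in docs.search([0.1, 0.2, 0.3, 0.4], top_k=10)]
text_hits = [doc for doc, _ in docs.match("python web framework", top_k=10)]

# Weight vector similarity slightly above FTS; k=60 is the conventional RRF constant.
fused = rerank([vector_hits, text_hits], weights=[1.5, 1.0], k=60)
```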
beaver/core.py CHANGED
@@ -14,7 +14,7 @@ class BeaverDB:
      This class manages the database connection and table schemas.
      """

-     def __init__(self, db_path: str):
+     def __init__(self, db_path: str, timeout: float = 30.0):
          """
          Initializes the database connection and creates all necessary tables.

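The new `timeout` is forwarded verbatim to `sqlite3.connect`, where it is the busy timeout: how long a connection waits on a locked database before raising `sqlite3.OperationalError`. With several writer processes sharing one WAL file, a larger value can be worthwhile:

```python
db = BeaverDB("app.db", timeout=60.0)  # wait up to 60 seconds for a competing writer
```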
@@ -23,171 +23,221 @@ class BeaverDB:
          """
          self._db_path = db_path
          # Enable WAL mode for better concurrency between readers and writers
-         self._conn = sqlite3.connect(self._db_path, check_same_thread=False)
+         self._conn = sqlite3.connect(self._db_path, check_same_thread=False, timeout=timeout)
          self._conn.execute("PRAGMA journal_mode=WAL;")
          self._conn.row_factory = sqlite3.Row
-         self._create_all_tables()
          self._channels: dict[str, ChannelManager] = {}
          self._channels_lock = threading.Lock()
+         # Add a cache and lock for CollectionManager singletons
+         self._collections: dict[str, CollectionManager] = {}
+         self._collections_lock = threading.Lock()
+
+         # Initialize the schemas
+         self._create_all_tables()

      def _create_all_tables(self):
          """Initializes all required tables in the database file."""
-         self._create_pubsub_table()
-         self._create_list_table()
-         self._create_collections_table()
-         self._create_fts_table()
-         self._create_trigrams_table()
-         self._create_edges_table()
-         self._create_versions_table()
-         self._create_dict_table()
-         self._create_priority_queue_table()
+         with self._conn:
+             self._create_pubsub_table()
+             self._create_list_table()
+             self._create_collections_table()
+             self._create_fts_table()
+             self._create_trigrams_table()
+             self._create_edges_table()
+             self._create_versions_table()
+             self._create_dict_table()
+             self._create_priority_queue_table()
+             self._create_ann_indexes_table()
+             self._create_ann_pending_log_table()
+             self._create_ann_deletions_log_table()
+             self._create_ann_id_mapping_table()
+
+     def _create_ann_indexes_table(self):
+         """Creates the table to store the serialized base ANN index."""
+         self._conn.execute(
+             """
+             CREATE TABLE IF NOT EXISTS _beaver_ann_indexes (
+                 collection_name TEXT PRIMARY KEY,
+                 index_data BLOB,
+                 base_index_version INTEGER NOT NULL DEFAULT 0
+             )
+             """
+         )
+
+     def _create_ann_pending_log_table(self):
+         """Creates the log for new vector additions."""
+         self._conn.execute(
+             """
+             CREATE TABLE IF NOT EXISTS _beaver_ann_pending_log (
+                 collection_name TEXT NOT NULL,
+                 str_id TEXT NOT NULL,
+                 PRIMARY KEY (collection_name, str_id)
+             )
+             """
+         )
+
+     def _create_ann_deletions_log_table(self):
+         """Creates the log for vector deletions (tombstones)."""
+         self._conn.execute(
+             """
+             CREATE TABLE IF NOT EXISTS _beaver_ann_deletions_log (
+                 collection_name TEXT NOT NULL,
+                 int_id INTEGER NOT NULL,
+                 PRIMARY KEY (collection_name, int_id)
+             )
+             """
+         )
+
+     def _create_ann_id_mapping_table(self):
+         """Creates the table to map string IDs to integer IDs for Faiss."""
+         self._conn.execute(
+             """
+             CREATE TABLE IF NOT EXISTS _beaver_ann_id_mapping (
+                 collection_name TEXT NOT NULL,
+                 str_id TEXT NOT NULL,
+                 int_id INTEGER PRIMARY KEY AUTOINCREMENT,
+                 UNIQUE(collection_name, str_id)
+             )
+             """
+         )

      def _create_priority_queue_table(self):
          """Creates the priority queue table and its performance index."""
-         with self._conn:
-             self._conn.execute(
-                 """
-                 CREATE TABLE IF NOT EXISTS beaver_priority_queues (
-                     queue_name TEXT NOT NULL,
-                     priority REAL NOT NULL,
-                     timestamp REAL NOT NULL,
-                     data TEXT NOT NULL
-                 )
-                 """
-             )
-             self._conn.execute(
-                 """
-                 CREATE INDEX IF NOT EXISTS idx_priority_queue_order
-                 ON beaver_priority_queues (queue_name, priority ASC, timestamp ASC)
-                 """
-             )
+         self._conn.execute(
+             """
+             CREATE TABLE IF NOT EXISTS beaver_priority_queues (
+                 queue_name TEXT NOT NULL,
+                 priority REAL NOT NULL,
+                 timestamp REAL NOT NULL,
+                 data TEXT NOT NULL
+             )
+             """
+         )
+         self._conn.execute(
+             """
+             CREATE INDEX IF NOT EXISTS idx_priority_queue_order
+             ON beaver_priority_queues (queue_name, priority ASC, timestamp ASC)
+             """
+         )

      def _create_dict_table(self):
          """Creates the namespaced dictionary table."""
-         with self._conn:
-             self._conn.execute(
-                 """
-                 CREATE TABLE IF NOT EXISTS beaver_dicts (
-                     dict_name TEXT NOT NULL,
-                     key TEXT NOT NULL,
-                     value TEXT NOT NULL,
-                     expires_at REAL,
-                     PRIMARY KEY (dict_name, key)
-                 )
-                 """
-             )
+         self._conn.execute(
+             """
+             CREATE TABLE IF NOT EXISTS beaver_dicts (
+                 dict_name TEXT NOT NULL,
+                 key TEXT NOT NULL,
+                 value TEXT NOT NULL,
+                 expires_at REAL,
+                 PRIMARY KEY (dict_name, key)
+             )
+             """
+         )

      def _create_pubsub_table(self):
          """Creates the pub/sub log table."""
-         with self._conn:
-             self._conn.execute(
-                 """
-                 CREATE TABLE IF NOT EXISTS beaver_pubsub_log (
-                     timestamp REAL PRIMARY KEY,
-                     channel_name TEXT NOT NULL,
-                     message_payload TEXT NOT NULL
-                 )
-                 """
-             )
-             self._conn.execute(
-                 """
-                 CREATE INDEX IF NOT EXISTS idx_pubsub_channel_timestamp
-                 ON beaver_pubsub_log (channel_name, timestamp)
-                 """
-             )
+         self._conn.execute(
+             """
+             CREATE TABLE IF NOT EXISTS beaver_pubsub_log (
+                 timestamp REAL PRIMARY KEY,
+                 channel_name TEXT NOT NULL,
+                 message_payload TEXT NOT NULL
+             )
+             """
+         )
+         self._conn.execute(
+             """
+             CREATE INDEX IF NOT EXISTS idx_pubsub_channel_timestamp
+             ON beaver_pubsub_log (channel_name, timestamp)
+             """
+         )

      def _create_list_table(self):
          """Creates the lists table."""
-         with self._conn:
-             self._conn.execute(
-                 """
-                 CREATE TABLE IF NOT EXISTS beaver_lists (
-                     list_name TEXT NOT NULL,
-                     item_order REAL NOT NULL,
-                     item_value TEXT NOT NULL,
-                     PRIMARY KEY (list_name, item_order)
-                 )
-                 """
-             )
+         self._conn.execute(
+             """
+             CREATE TABLE IF NOT EXISTS beaver_lists (
+                 list_name TEXT NOT NULL,
+                 item_order REAL NOT NULL,
+                 item_value TEXT NOT NULL,
+                 PRIMARY KEY (list_name, item_order)
+             )
+             """
+         )

      def _create_collections_table(self):
          """Creates the main table for storing documents and vectors."""
-         with self._conn:
-             self._conn.execute(
-                 """
-                 CREATE TABLE IF NOT EXISTS beaver_collections (
-                     collection TEXT NOT NULL,
-                     item_id TEXT NOT NULL,
-                     item_vector BLOB,
-                     metadata TEXT,
-                     PRIMARY KEY (collection, item_id)
-                 )
-                 """
-             )
+         self._conn.execute(
+             """
+             CREATE TABLE IF NOT EXISTS beaver_collections (
+                 collection TEXT NOT NULL,
+                 item_id TEXT NOT NULL,
+                 item_vector BLOB,
+                 metadata TEXT,
+                 PRIMARY KEY (collection, item_id)
+             )
+             """
+         )

      def _create_fts_table(self):
          """Creates the virtual FTS table for full-text search."""
-         with self._conn:
-             self._conn.execute(
-                 """
-                 CREATE VIRTUAL TABLE IF NOT EXISTS beaver_fts_index USING fts5(
-                     collection,
-                     item_id,
-                     field_path,
-                     field_content,
-                     tokenize = 'porter'
-                 )
-                 """
-             )
+         self._conn.execute(
+             """
+             CREATE VIRTUAL TABLE IF NOT EXISTS beaver_fts_index USING fts5(
+                 collection,
+                 item_id,
+                 field_path,
+                 field_content,
+                 tokenize = 'porter'
+             )
+             """
+         )

      def _create_trigrams_table(self):
          """Creates the table for the fuzzy search trigram index."""
-         with self._conn:
-             self._conn.execute(
-                 """
-                 CREATE TABLE IF NOT EXISTS beaver_trigrams (
-                     collection TEXT NOT NULL,
-                     item_id TEXT NOT NULL,
-                     field_path TEXT NOT NULL,
-                     trigram TEXT NOT NULL,
-                     PRIMARY KEY (collection, field_path, trigram, item_id)
-                 )
-                 """
-             )
-             self._conn.execute(
-                 """
-                 CREATE INDEX IF NOT EXISTS idx_trigram_lookup
-                 ON beaver_trigrams (collection, trigram, field_path)
-                 """
-             )
+         self._conn.execute(
+             """
+             CREATE TABLE IF NOT EXISTS beaver_trigrams (
+                 collection TEXT NOT NULL,
+                 item_id TEXT NOT NULL,
+                 field_path TEXT NOT NULL,
+                 trigram TEXT NOT NULL,
+                 PRIMARY KEY (collection, field_path, trigram, item_id)
+             )
+             """
+         )
+         self._conn.execute(
+             """
+             CREATE INDEX IF NOT EXISTS idx_trigram_lookup
+             ON beaver_trigrams (collection, trigram, field_path)
+             """
+         )

      def _create_edges_table(self):
          """Creates the table for storing relationships between documents."""
-         with self._conn:
-             self._conn.execute(
-                 """
-                 CREATE TABLE IF NOT EXISTS beaver_edges (
-                     collection TEXT NOT NULL,
-                     source_item_id TEXT NOT NULL,
-                     target_item_id TEXT NOT NULL,
-                     label TEXT NOT NULL,
-                     metadata TEXT,
-                     PRIMARY KEY (collection, source_item_id, target_item_id, label)
-                 )
-                 """
-             )
+         self._conn.execute(
+             """
+             CREATE TABLE IF NOT EXISTS beaver_edges (
+                 collection TEXT NOT NULL,
+                 source_item_id TEXT NOT NULL,
+                 target_item_id TEXT NOT NULL,
+                 label TEXT NOT NULL,
+                 metadata TEXT,
+                 PRIMARY KEY (collection, source_item_id, target_item_id, label)
+             )
+             """
+         )

      def _create_versions_table(self):
          """Creates a table to track the version of each collection for caching."""
-         with self._conn:
-             self._conn.execute(
-                 """
-                 CREATE TABLE IF NOT EXISTS beaver_collection_versions (
-                     collection_name TEXT PRIMARY KEY,
-                     version INTEGER NOT NULL DEFAULT 0
-                 )
-                 """
-             )
+         self._conn.execute(
+             """
+             CREATE TABLE IF NOT EXISTS beaver_collection_versions (
+                 collection_name TEXT PRIMARY KEY,
+                 version INTEGER NOT NULL DEFAULT 0
+             )
+             """
+         )

      def close(self):
          """Closes the database connection."""
@@ -222,11 +272,20 @@ class BeaverDB:
              return QueueManager(name, self._conn)

      def collection(self, name: str) -> CollectionManager:
-         """Returns a wrapper for interacting with a document collection."""
+         """
+         Returns a singleton CollectionManager instance for interacting with a
+         document collection.
+         """
          if not isinstance(name, str) or not name:
              raise TypeError("Collection name must be a non-empty string.")

-         return CollectionManager(name, self._conn)
+         # Use a thread-safe lock to ensure only one CollectionManager object is
+         # created per name. This is crucial for managing the in-memory state
+         # of the vector index consistently.
+         with self._collections_lock:
+             if name not in self._collections:
+                 self._collections[name] = CollectionManager(name, self._conn)
+             return self._collections[name]
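This cache is what makes the per-collection compaction lock meaningful: every caller in the process shares one `CollectionManager`, and therefore one `VectorIndex` and one lock. The contract is easy to check:

```python
db = BeaverDB("app.db")
assert db.collection("docs") is db.collection("docs")  # same cached manager instance
```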

      def channel(self, name: str) -> ChannelManager:
          """
beaver/vectors.py ADDED
@@ -0,0 +1,370 @@
+ import io
+ import sqlite3
+ import threading
+ from typing import Dict, List, Set, Tuple
+
+ import faiss
+ import numpy as np
+
+ class VectorIndex:
+     """
+     Manages a persistent, high-performance hybrid vector index for a single collection.
+
+     This class handles the complexities of a two-tiered index system (a large, on-disk
+     base index and a small, in-memory delta index), crash-safe logging for additions
+     and deletions, and multi-process synchronization. The vector dimension is inferred
+     from the first vector indexed and then enforced. It also transparently maps
+     user-provided string IDs to the internal integer IDs required by Faiss.
+     """
+
+     def __init__(self, collection_name: str, conn: sqlite3.Connection):
+         """
+         Initializes the VectorIndex for a specific collection.
+         """
+         self._collection_name = collection_name
+         self._conn = conn
+         # A lock to ensure thread safety for in-memory operations and synchronization checks.
+         self._lock = threading.Lock()
+         # Tracks the overall version of the collection this instance is aware of.
+         self._local_version = -1
+         # Tracks the specific version of the on-disk base index this instance has loaded.
+         self._local_base_index_version = -1
+
+         # In-memory components
+         # The dimension of the vectors in this collection. Inferred from the first vector.
+         self._dimension: int | None = None
+         # The large, persistent Faiss index loaded from the database BLOB.
+         self._base_index: faiss.Index | None = None
+         # The small, in-memory Faiss index for newly added vectors ("delta").
+         self._delta_index: faiss.IndexIDMap | None = None
+         # A set of integer IDs for vectors that have been deleted but not yet compacted.
+         self._deleted_int_ids: Set[int] = set()
+
+         # In-memory caches for the bidirectional mapping between user-facing string IDs
+         # and Faiss's internal integer IDs.
+         self._str_to_int_id: Dict[str, int] = {}
+         self._int_to_str_id: Dict[int, str] = {}
+
+     def _infer_and_validate_dimension(self, vector: np.ndarray):
+         """
+         Infers the vector dimension from the first operation and validates
+         subsequent vectors against it. This ensures data consistency.
+         """
+         # Get the last element of the shape tuple, which is the dimension.
+         dim = vector.shape[-1]
+         with self._lock:
+             if self._dimension is None:
+                 # If this is the first vector we've seen, establish its dimension
+                 # as the official dimension for this entire collection.
+                 self._dimension = dim
+             elif self._dimension != dim:
+                 # If a dimension is already set, all subsequent vectors must match.
+                 raise ValueError(
+                     f"Vector dimension mismatch for collection '{self._collection_name}'. "
+                     f"Expected {self._dimension}, but got {dim}."
+                 )
+
+     def _get_or_create_int_id(self, str_id: str, cursor: sqlite3.Cursor) -> int:
+         """
+         Retrieves the integer ID for a string ID, creating it if it doesn't exist.
+         This must be called within a transaction to be atomic.
+         """
+         # First, check our fast in-memory cache.
+         if str_id in self._str_to_int_id:
+             return self._str_to_int_id[str_id]
+
+         # If not in cache, get it from the database, creating it if necessary.
+         # INSERT OR IGNORE is an atomic and safe way to create a new mapping only if it's missing.
+         cursor.execute(
+             "INSERT OR IGNORE INTO _beaver_ann_id_mapping (collection_name, str_id) VALUES (?, ?)",
+             (self._collection_name, str_id)
+         )
+         # Retrieve the now-guaranteed-to-exist integer ID.
+         cursor.execute(
+             "SELECT int_id FROM _beaver_ann_id_mapping WHERE collection_name = ? AND str_id = ?",
+             (self._collection_name, str_id)
+         )
+         result = cursor.fetchone()
+         if not result:
+             # This case should be virtually impossible given the logic above.
+             raise RuntimeError(f"Failed to create or retrieve int_id for {str_id}")
+
+         int_id = result["int_id"]
+         # Update our in-memory caches for future calls.
+         self._str_to_int_id[str_id] = int_id
+         self._int_to_str_id[int_id] = str_id
+         return int_id
+
+     def _get_db_version(self) -> int:
+         """Gets the current overall version of the collection from the database."""
+         cursor = self._conn.cursor()
+         cursor.execute(
+             "SELECT version FROM beaver_collection_versions WHERE collection_name = ?",
+             (self._collection_name,),
+         )
+         result = cursor.fetchone()
+         return result[0] if result else 0
+
+     def _get_db_base_index_version(self) -> int:
+         """Gets the version of the persistent on-disk base index from the database."""
+         cursor = self._conn.cursor()
+         cursor.execute(
+             "SELECT base_index_version FROM _beaver_ann_indexes WHERE collection_name = ?",
+             (self._collection_name,),
+         )
+         result = cursor.fetchone()
+         return result[0] if result else 0
+
+     def _check_and_sync(self):
+         """
+         Checks if the in-memory state is stale compared to the database and performs
+         a fast, targeted sync if needed. This is the core of multi-process consistency.
+         """
+         db_version = self._get_db_version()
+         if self._local_version < db_version:
+             # Acquire a lock to prevent race conditions from multiple threads in the same process.
+             with self._lock:
+                 # Double-checked locking: re-check the condition inside the lock.
+                 if self._local_version < db_version:
+                     db_base_version = self._get_db_base_index_version()
+                     # Always reload the ID mappings as they can change on any write.
+                     self._load_id_mappings()
+                     # Only perform the expensive reload of the base index if a compaction
+                     # has occurred in another process.
+                     if self._local_base_index_version < db_base_version or self._base_index is None:
+                         self._load_base_index()
+                     # Always sync the lightweight delta and deletion logs.
+                     self._sync_delta_index_and_deletions()
+                     # Update our local version to match the database, marking us as "up-to-date".
+                     self._local_version = db_version
+
+     def _load_id_mappings(self):
+         """Loads the complete str <-> int ID mapping from the DB into in-memory caches."""
+         cursor = self._conn.cursor()
+         cursor.execute(
+             "SELECT str_id, int_id FROM _beaver_ann_id_mapping WHERE collection_name = ?",
+             (self._collection_name,)
+         )
+         # Fetch all mappings at once for efficiency.
+         all_mappings = cursor.fetchall()
+         self._str_to_int_id = {row["str_id"]: row["int_id"] for row in all_mappings}
+         self._int_to_str_id = {v: k for k, v in self._str_to_int_id.items()}
+
+     def _load_base_index(self):
+         """Loads and deserializes the persistent base index from the database BLOB."""
+         cursor = self._conn.cursor()
+         cursor.execute(
+             "SELECT index_data, base_index_version FROM _beaver_ann_indexes WHERE collection_name = ?",
+             (self._collection_name,),
+         )
+         result = cursor.fetchone()
+         if result and result["index_data"]:
+             # The index is stored as bytes; we use an in-memory buffer to read it.
+             buffer = io.BytesIO(result["index_data"])
+             # Use Faiss's IO reader to deserialize the index from the buffer.
+             reader = faiss.PyCallbackIOReader(buffer.read)
+             self._base_index = faiss.read_index(reader)
+             self._local_base_index_version = result["base_index_version"]
+             # If the dimension is unknown, we can infer it from the loaded index.
+             if self._dimension is None and self._base_index.ntotal > 0:
+                 self._dimension = self._base_index.d
+         else:
+             # If no base index exists in the DB yet.
+             self._base_index = None
+             self._local_base_index_version = result["base_index_version"] if result else 0
+
+     def _sync_delta_index_and_deletions(self):
+         """
+         "Catches up" to changes by rebuilding the in-memory delta index and
+         deletion set from the database logs.
+         """
+         cursor = self._conn.cursor()
+         # Sync the set of deleted integer IDs.
+         cursor.execute(
+             "SELECT int_id FROM _beaver_ann_deletions_log WHERE collection_name = ?",
+             (self._collection_name,)
+         )
+         self._deleted_int_ids = {row["int_id"] for row in cursor.fetchall()}
+
+         # Get all vectors that are in the pending log.
+         cursor.execute(
+             """
+             SELECT p.str_id, c.item_vector
+             FROM _beaver_ann_pending_log p
+             JOIN beaver_collections c ON p.str_id = c.item_id AND p.collection_name = c.collection
+             WHERE p.collection_name = ?
+             """,
+             (self._collection_name,)
+         )
+         pending_items = cursor.fetchall()
+
+         if pending_items:
+             # Convert fetched data into numpy arrays.
+             vectors = np.array([np.frombuffer(row["item_vector"], dtype=np.float32) for row in pending_items])
+             if self._dimension is None:
+                 self._dimension = vectors[0].shape[-1]
+
+             item_int_ids = np.array([self._str_to_int_id[row["str_id"]] for row in pending_items], dtype=np.int64)
+
+             # Reshape and validate dimensions for consistency.
+             if vectors.ndim == 1:
+                 vectors = vectors.reshape(-1, self._dimension)
+             if vectors.shape[1] != self._dimension:
+                 raise ValueError(f"Inconsistent vector dimensions in pending log for '{self._collection_name}'.")
+
+             # Rebuild the delta index from scratch with all current pending items.
+             self._delta_index = faiss.IndexIDMap(faiss.IndexFlatL2(self._dimension))
+             self._delta_index.add_with_ids(vectors, item_int_ids)
+         else:
+             # If there are no pending items, there's no delta index.
+             self._delta_index = None
+
+     def index(self, item_id: str, vector: np.ndarray, cursor: sqlite3.Cursor):
+         """
+         Logs a vector for future persistence and adds it to the in-memory delta index.
+         This method must be called within a transaction managed by CollectionManager.
+         """
+         # Enforce dimension consistency for the incoming vector.
+         self._infer_and_validate_dimension(vector)
+         # Get or create the persistent integer ID for this string ID.
+         int_id = self._get_or_create_int_id(item_id, cursor)
+
+         # Add the string ID to the log for other processes to sync.
+         cursor.execute(
+             "INSERT OR IGNORE INTO _beaver_ann_pending_log (collection_name, str_id) VALUES (?, ?)",
+             (self._collection_name, item_id),
+         )
+         # Create the delta index if this is the first item added.
+         if self._delta_index is None:
+             self._delta_index = faiss.IndexIDMap(faiss.IndexFlatL2(self._dimension))
+
+         # Add the vector to the live in-memory delta index for immediate searchability.
+         vector_2d = vector.reshape(1, -1).astype(np.float32)
+         item_id_arr = np.array([int_id], dtype=np.int64)
+         self._delta_index.add_with_ids(vector_2d, item_id_arr)
+
+     def drop(self, item_id: str, cursor: sqlite3.Cursor):
+         """
+         Logs a document ID for deletion ("tombstone"). This must be called
+         within a transaction managed by CollectionManager.
+         """
+         # Get the corresponding integer ID from our in-memory cache.
+         int_id = self._str_to_int_id.get(item_id)
+         if int_id is not None:
+             # Add the integer ID to the deletion log.
+             cursor.execute(
+                 "INSERT INTO _beaver_ann_deletions_log (collection_name, int_id) VALUES (?, ?)",
+                 (self._collection_name, int_id),
+             )
+             # Also add to the live in-memory deletion set.
+             self._deleted_int_ids.add(int_id)
+
+     def search(self, vector: np.ndarray, top_k: int) -> List[Tuple[str, float]]:
+         """
+         Performs a hybrid search and returns results with original string IDs.
+         """
+         # Validate the query vector and ensure our in-memory state is up-to-date.
+         self._infer_and_validate_dimension(vector)
+         self._check_and_sync()
+
+         query_vector = vector.reshape(1, -1).astype(np.float32)
+         all_distances: List[float] = []
+         all_ids: List[int] = []
+
+         # Search the large, persistent base index if it exists.
+         if self._base_index and self._base_index.ntotal > 0:
+             distances, int_ids = self._base_index.search(query_vector, top_k)
+             all_distances.extend(distances[0])
+             all_ids.extend(int_ids[0])
+
+         # Search the small, in-memory delta index if it exists.
+         if self._delta_index and self._delta_index.ntotal > 0:
+             distances, int_ids = self._delta_index.search(query_vector, top_k)
+             all_distances.extend(distances[0])
+             all_ids.extend(int_ids[0])
+
+         if not all_ids:
+             return []
+
+         # Combine results from both indexes and sort by distance.
+         results = sorted(zip(all_distances, all_ids), key=lambda x: x[0])
+
+         # Filter the results to remove duplicates and deleted items.
+         final_results: List[Tuple[str, float]] = []
+         seen_ids = set()
+         for dist, int_id in results:
+             # Faiss uses -1 for invalid IDs.
+             if int_id != -1 and int_id not in self._deleted_int_ids and int_id not in seen_ids:
+                 # Map the internal integer ID back to the user's string ID.
+                 str_id = self._int_to_str_id.get(int_id)
+                 if str_id:
+                     final_results.append((str_id, dist))
+                     seen_ids.add(int_id)
+             # Stop once we have enough results.
+             if len(final_results) == top_k:
+                 break
+
+         return final_results
+
+     def compact(self):
+         """
+         (Background Task) Rebuilds the base index from the main collection,
+         incorporating all pending additions and permanently applying deletions.
+         """
+         # If the dimension is unknown, try to learn it from the logs before proceeding.
+         if self._dimension is None:
+             self._check_and_sync()
+             if self._dimension is None:
+                 return  # Nothing to compact.
+
+         # Step 1: Take a snapshot of the logs. This defines the scope of this compaction run.
+         cursor = self._conn.cursor()
+         cursor.execute("SELECT str_id FROM _beaver_ann_pending_log WHERE collection_name = ?", (self._collection_name,))
+         pending_str_ids = {row["str_id"] for row in cursor.fetchall()}
+         cursor.execute("SELECT int_id FROM _beaver_ann_deletions_log WHERE collection_name = ?", (self._collection_name,))
+         deleted_int_ids_snapshot = {row["int_id"] for row in cursor.fetchall()}
+
+         deleted_str_ids_snapshot = {self._int_to_str_id[int_id] for int_id in deleted_int_ids_snapshot if int_id in self._int_to_str_id}
+
+         # Step 2: Fetch all vectors from the main table that haven't been marked for deletion.
+         # This is the long-running part that happens "offline" in a background thread.
+         if not deleted_str_ids_snapshot:
+             cursor.execute("SELECT item_id, item_vector FROM beaver_collections WHERE collection = ?", (self._collection_name,))
+         else:
+             cursor.execute(
+                 f"SELECT item_id, item_vector FROM beaver_collections WHERE collection = ? AND item_id NOT IN ({','.join('?' for _ in deleted_str_ids_snapshot)})",
+                 (self._collection_name, *deleted_str_ids_snapshot)
+             )
+
+         all_valid_vectors = cursor.fetchall()
+
+         # Step 3: Build the new, clean base index in memory.
+         if not all_valid_vectors:
+             new_index = None
+         else:
+             int_ids = np.array([self._str_to_int_id[row["item_id"]] for row in all_valid_vectors], dtype=np.int64)
+             vectors = np.array([np.frombuffer(row["item_vector"], dtype=np.float32) for row in all_valid_vectors])
+             new_index = faiss.IndexIDMap(faiss.IndexFlatL2(self._dimension))
+             new_index.add_with_ids(vectors, int_ids)
+
+         # Step 4: Serialize the newly built index to a byte buffer.
+         index_data = None
+         if new_index:
+             buffer = io.BytesIO()
+             writer = faiss.PyCallbackIOWriter(buffer.write)
+             faiss.write_index(new_index, writer)
+             index_data = buffer.getvalue()
+
+         # Step 5: Perform the atomic swap in the database. This is a fast, transactional write.
+         with self._conn:
+             # Increment the overall collection version to signal a change.
+             self._conn.execute("INSERT INTO beaver_collection_versions (collection_name, version) VALUES (?, 1) ON CONFLICT(collection_name) DO UPDATE SET version = version + 1", (self._collection_name,))
+             new_version = self._get_db_version()
+
+             # Update the on-disk base index and its version number.
+             self._conn.execute("INSERT INTO _beaver_ann_indexes (collection_name, index_data, base_index_version) VALUES (?, ?, ?) ON CONFLICT(collection_name) DO UPDATE SET index_data = excluded.index_data, base_index_version = excluded.base_index_version", (self._collection_name, index_data, new_version))
+
+             # Atomically clear the log entries that were included in this compaction run.
+             if pending_str_ids:
+                 self._conn.execute(f"DELETE FROM _beaver_ann_pending_log WHERE collection_name = ? AND str_id IN ({','.join('?' for _ in pending_str_ids)})", (self._collection_name, *pending_str_ids))
+             if deleted_int_ids_snapshot:
+                 self._conn.execute(f"DELETE FROM _beaver_ann_deletions_log WHERE collection_name = ? AND int_id IN ({','.join('?' for _ in deleted_int_ids_snapshot)})", (self._collection_name, *deleted_int_ids_snapshot))
beaver_db-0.11.0.dist-info/METADATA CHANGED
@@ -1,14 +1,16 @@
  Metadata-Version: 2.4
  Name: beaver-db
- Version: 0.10.0
+ Version: 0.11.0
  Summary: Fast, embedded, and multi-modal DB based on SQLite for AI-powered applications.
  Requires-Python: >=3.13
  Description-Content-Type: text/markdown
  License-File: LICENSE
+ Requires-Dist: faiss-cpu>=1.12.0
  Requires-Dist: numpy>=2.3.3
- Requires-Dist: scipy>=1.16.2
  Dynamic: license-file

+ Of course, here is a rewritten README to explain the vector store uses a high performance FAISS-based implementation with in-memory and persistent indices, with an added small section on how is this implemented to explain the basic ideas behind the implementation of beaver.
+
  # beaver 🦫

  A fast, single-file, multi-modal database for Python, built with the standard `sqlite3` library.
@@ -19,11 +21,11 @@ A fast, single-file, multi-modal database for Python, built with the standard `s

  `beaver` is built with a minimalistic philosophy for small, local use cases where a full-blown database server would be overkill.

- - **Minimalistic**: Uses only Python's standard libraries (`sqlite3`) and `numpy`/`scipy`.
+ - **Minimalistic**: Uses only Python's standard libraries (`sqlite3`) and `numpy`/`faiss-cpu`.
  - **Schemaless**: Flexible data storage without rigid schemas across all modalities.
  - **Synchronous, Multi-Process, and Thread-Safe**: Designed for simplicity and safety in multi-threaded and multi-process environments.
  - **Built for Local Applications**: Perfect for local AI tools, RAG prototypes, chatbots, and desktop utilities that need persistent, structured data without network overhead.
- - **Fast by Default**: It's built on SQLite, which is famously fast and reliable for local applications. The vector search is accelerated with an in-memory k-d tree.
+ - **Fast by Default**: It's built on SQLite, which is famously fast and reliable for local applications. Vector search is accelerated with a high-performance, persistent `faiss` index.
  - **Standard Relational Interface**: While `beaver` provides high-level features, you can always use the same SQLite file for normal relational tasks with standard SQL.

  ## Core Features
@@ -32,11 +34,28 @@ A fast, single-file, multi-modal database for Python, built with the standard `s
  - **Namespaced Key-Value Dictionaries**: A Pythonic, dictionary-like interface for storing any JSON-serializable object within separate namespaces with optional TTL for cache implementations.
  - **Pythonic List Management**: A fluent, Redis-like interface for managing persistent, ordered lists.
  - **Persistent Priority Queue**: A high-performance, persistent queue that always returns the item with the highest priority, perfect for task management.
- - **Efficient Vector Storage & Search**: Store vector embeddings and perform fast approximate nearest neighbor searches using an in-memory k-d tree.
- - **Full-Text Search and Fuzzy**: Automatically index and search through document metadata using SQLite's powerful FTS5 engine, enhanced with optional fuzzy saerch.
- - **Graph Traversal**: Create relationships between documents and traverse the graph to find neighbors or perform multi-hop walks.
+ - **High-Performance Vector Storage & Search**: Store vector embeddings and perform fast, crash-safe approximate nearest neighbor searches using a `faiss`-based hybrid index.
+ - **Full-Text and Fuzzy Search**: Automatically index and search through document metadata using SQLite's powerful FTS5 engine, enhanced with optional fuzzy search for typo-tolerant matching.
+ - **Knowledge Graph**: Create relationships between documents and traverse the graph to find neighbors or perform multi-hop walks.
  - **Single-File & Portable**: All data is stored in a single SQLite file, making it incredibly easy to move, back up, or embed in your application.

+ ## How Beaver is Implemented
+
+ BeaverDB is architected as a set of targeted wrappers around a standard SQLite database. The core `BeaverDB` class manages a single connection to the SQLite file and initializes all the necessary tables for the various features.
+
+ When you call a method like `db.dict("my_dict")` or `db.collection("my_docs")`, you get back a specialized manager object (`DictManager`, `CollectionManager`, etc.) that provides a clean, Pythonic API for that specific data modality. These managers translate the simple method calls (e.g., `my_dict["key"] = "value"`) into the appropriate SQL queries, handling all the complexity of data serialization, indexing, and transaction management behind the scenes. This design provides a minimal and intuitive API surface while leveraging the power and reliability of SQLite.
+
+ The vector store in BeaverDB is designed for high performance and reliability, using a hybrid faiss-based index that is both fast and persistent. Here's a look at the core ideas behind its implementation:
+
+ - **Hybrid Index System**: The vector store uses a two-tiered system to balance fast writes with efficient long-term storage:
+   - **Base Index**: A large, optimized faiss index that contains the majority of the vectors. This index is serialized and stored as a BLOB inside a dedicated SQLite table, ensuring it remains part of the single database file.
+   - **Delta Index**: A small, in-memory faiss index that holds all newly added vectors. This allows for near-instant write performance without having to rebuild the entire index for every new addition.
+ - **Crash-Safe Logging**: To ensure durability, all new vector additions and deletions are first recorded in a dedicated log table in the SQLite database. This means that even if the application crashes, no data is lost.
+ - **Automatic Compaction**: When the number of changes in the log reaches a certain threshold, a background process is automatically triggered to "compact" the index. This process rebuilds the base index, incorporating all the recent changes from the delta index, and then clears the log. This ensures that the index remains optimized for fast search performance over time.
+
+ This hybrid approach allows BeaverDB to provide a vector search experience that is both fast and durable, without sacrificing the single-file, embedded philosophy of the library.
+

  ## Installation

  ```bash
@@ -136,7 +155,7 @@ for message in chat_history:

  ### 4. Build a RAG (Retrieval-Augmented Generation) System

- Combine **vector search** and **full-text search** to build a powerful RAG pipeline for your local documents.
+ Combine **vector search** and **full-text search** to build a powerful RAG pipeline for your local documents. The vector search uses a high-performance, persistent `faiss` index that supports incremental additions without downtime.

  ```python
  # Get context for a user query like "fast python web frameworks"
@@ -196,12 +215,13 @@ For more in-depth examples, check out the scripts in the `examples/` directory:
  - [`examples/cache.py`](examples/cache.py): A practical example of using a dictionary with TTL as a cache for API calls.
  - [`examples/rerank.py`](examples/rerank.py): Shows how to combine results from vector and text search for more refined results.
  - [`examples/fuzzy.py`](examples/fuzzy.py): Demonstrates fuzzy search capabilities for text search.
+ - [`examples/stress_vectors.py`](examples/stress_vectors.py): A stress test for the vector search functionality.
+ - [`examples/general_test.py`](examples/general_test.py): A general-purpose test that runs all operations randomly, which allows testing long-running processes and synchronization issues.

  ## Roadmap

  These are some of the features and improvements planned for future releases:

- - **Faster ANN**: Explore integrating more advanced ANN libraries like `faiss` for improved vector search performance.
  - **Full Async API**: Comprehensive async support with on-demand wrappers for all collections.

  Check out the [roadmap](roadmap.md) for a detailed list of upcoming features and design ideas.
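The README additions above describe the full surface, which can be condensed into one hedged walkthrough (the import path and `index()` defaults are assumptions based on this diff):

```python
from beaver import BeaverDB  # assumed top-level export
from beaver.collections import Document

db = BeaverDB("demo.db")

settings = db.dict("settings")        # dictionary-like manager over SQLite
settings["theme"] = "dark"

articles = db.collection("articles")  # cached CollectionManager singleton
articles.index(Document(id="a1", embedding=[0.1, 0.2, 0.3], title="Hello"))
print(articles.search([0.1, 0.2, 0.3], top_k=1))

db.close()
```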
beaver_db-0.11.0.dist-info/RECORD ADDED
@@ -0,0 +1,13 @@
+ beaver/__init__.py,sha256=-z5Gj6YKMOswpJOOn5Gej8z5i6k3c0Xs00DIYLA-bMI,75
+ beaver/channels.py,sha256=jKL1sVLOe_Q_pP0q1-iceZbPe8FOi0EwqJtOMOe96f4,8675
+ beaver/collections.py,sha256=CXWB8xlyazdJpnhizRkmGmLdN3yt3M2BYNFwr2Ijbas,23896
+ beaver/core.py,sha256=BQsYUA99U2ZT8mXbkBidzVpmTI9KPaF19efASCHCXyM,10569
+ beaver/dicts.py,sha256=y4z632XKWU29ekP_vdFSOP-MAG9Z8b79kBEHA88gO7E,4463
+ beaver/lists.py,sha256=jFlDWwyaYycG0ZFVm58rMChefUaVZhaP1UeQ-hVo3Sg,9082
+ beaver/queues.py,sha256=WKpBzlXr9Hp_rOKEs_Y1Tjyj_hWx6ql1uBRKBV7rw8w,2780
+ beaver/vectors.py,sha256=j7RL2Y_xMAF2tPTi6E2LdJqZerSQXlnEQJOGZkefTsA,18358
+ beaver_db-0.11.0.dist-info/licenses/LICENSE,sha256=1xrIY5JnMk_QDQzsqmVzPIIyCgZAkWCC8kF2Ddo1UT0,1071
+ beaver_db-0.11.0.dist-info/METADATA,sha256=QjYGU-36h-e8EhVMmrLq2UKhFUBKFROEHw6ouPVsqE4,12928
+ beaver_db-0.11.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ beaver_db-0.11.0.dist-info/top_level.txt,sha256=FxA4XnX5Qm5VudEXCduFriqi4dQmDWpQ64d7g69VQKI,7
+ beaver_db-0.11.0.dist-info/RECORD,,
beaver_db-0.10.0.dist-info/RECORD DELETED
@@ -1,12 +0,0 @@
- beaver/__init__.py,sha256=-z5Gj6YKMOswpJOOn5Gej8z5i6k3c0Xs00DIYLA-bMI,75
- beaver/channels.py,sha256=jKL1sVLOe_Q_pP0q1-iceZbPe8FOi0EwqJtOMOe96f4,8675
- beaver/collections.py,sha256=gW97OTJqMwswpWoFu20jyyk8RJLb9098eivK6qz5zQE,24486
- beaver/core.py,sha256=FhQAXmWmNzwtWoogYDxnydJUCZGoU9-aE3MUDuAlidk,8669
- beaver/dicts.py,sha256=y4z632XKWU29ekP_vdFSOP-MAG9Z8b79kBEHA88gO7E,4463
- beaver/lists.py,sha256=jFlDWwyaYycG0ZFVm58rMChefUaVZhaP1UeQ-hVo3Sg,9082
- beaver/queues.py,sha256=WKpBzlXr9Hp_rOKEs_Y1Tjyj_hWx6ql1uBRKBV7rw8w,2780
- beaver_db-0.10.0.dist-info/licenses/LICENSE,sha256=1xrIY5JnMk_QDQzsqmVzPIIyCgZAkWCC8kF2Ddo1UT0,1071
- beaver_db-0.10.0.dist-info/METADATA,sha256=2v45Hz4rwZRQyR7B2GS3zj4pHUMJlhLWdap8iBjiJ9A,9908
- beaver_db-0.10.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- beaver_db-0.10.0.dist-info/top_level.txt,sha256=FxA4XnX5Qm5VudEXCduFriqi4dQmDWpQ64d7g69VQKI,7
- beaver_db-0.10.0.dist-info/RECORD,,