beaver-db 2.0rc2 (beaver_db-2.0rc2-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- beaver/__init__.py +16 -0
- beaver/blobs.py +223 -0
- beaver/bridge.py +167 -0
- beaver/cache.py +274 -0
- beaver/channels.py +249 -0
- beaver/cli/__init__.py +133 -0
- beaver/cli/blobs.py +225 -0
- beaver/cli/channels.py +166 -0
- beaver/cli/collections.py +500 -0
- beaver/cli/dicts.py +171 -0
- beaver/cli/lists.py +244 -0
- beaver/cli/locks.py +202 -0
- beaver/cli/logs.py +248 -0
- beaver/cli/queues.py +215 -0
- beaver/client.py +392 -0
- beaver/core.py +646 -0
- beaver/dicts.py +314 -0
- beaver/docs.py +459 -0
- beaver/events.py +155 -0
- beaver/graphs.py +212 -0
- beaver/lists.py +337 -0
- beaver/locks.py +186 -0
- beaver/logs.py +187 -0
- beaver/manager.py +203 -0
- beaver/queries.py +66 -0
- beaver/queues.py +215 -0
- beaver/security.py +144 -0
- beaver/server.py +452 -0
- beaver/sketches.py +307 -0
- beaver/types.py +32 -0
- beaver/vectors.py +198 -0
- beaver_db-2.0rc2.dist-info/METADATA +149 -0
- beaver_db-2.0rc2.dist-info/RECORD +36 -0
- beaver_db-2.0rc2.dist-info/WHEEL +4 -0
- beaver_db-2.0rc2.dist-info/entry_points.txt +2 -0
- beaver_db-2.0rc2.dist-info/licenses/LICENSE +21 -0
beaver/sketches.py
ADDED
@@ -0,0 +1,307 @@
+import math
+import hashlib
+import struct
+import asyncio
+from typing import (
+    Any,
+    Iterator,
+    Optional,
+    Protocol,
+    runtime_checkable,
+    TYPE_CHECKING,
+    Self,
+)
+
+from pydantic import BaseModel
+
+from .manager import AsyncBeaverBase, atomic, emits
+from .locks import AsyncBeaverLock
+
+if TYPE_CHECKING:
+    from .core import AsyncBeaverDB
+
+
+def _calculate_hll_precision(error_rate: float) -> int:
+    """Derives the HyperLogLog precision 'p' from a desired error rate."""
+    if not (0 < error_rate < 1):
+        raise ValueError("Error rate must be between 0 and 1")
+    p = 2 * math.log2(1.04 / error_rate)
+    return max(4, min(int(math.ceil(p)), 18))
+
+
+def _calculate_bloom_params(capacity: int, error_rate: float) -> tuple[int, int]:
+    """Calculates optimal Bloom Filter size (bits) and hash count (k)."""
+    if capacity <= 0:
+        raise ValueError("Capacity must be positive")
+    if not (0 < error_rate < 1):
+        raise ValueError("Error rate must be between 0 and 1")
+
+    m_bits = -(capacity * math.log(error_rate)) / (math.log(2) ** 2)
+    k = (m_bits / capacity) * math.log(2)
+    return int(math.ceil(m_bits)), int(math.ceil(k))
+
+
+class ApproximateSet:
+    """
+    A unified probabilistic data structure combining HyperLogLog and Bloom Filter.
+    Pure Python implementation (CPU-bound).
+    """
+
+    def __init__(
+        self,
+        capacity: int = 1_000_000,
+        error_rate: float = 0.01,
+        data: bytes | None = None,
+    ):
+        self.capacity = capacity
+        self.error_rate = error_rate
+
+        # 1. Configure HyperLogLog
+        self.p = _calculate_hll_precision(error_rate)
+        self.m = 1 << self.p
+        self.alpha = self._get_alpha(self.m)
+
+        # 2. Configure Bloom Filter
+        self.bloom_bits, self.bloom_k = _calculate_bloom_params(capacity, error_rate)
+        self.bloom_bytes_len = (self.bloom_bits + 7) // 8
+
+        # 3. Initialize or Load Storage
+        expected_size = self.m + self.bloom_bytes_len
+
+        if data:
+            if len(data) != expected_size:
+                raise ValueError(
+                    f"Corrupted sketch data. Expected {expected_size} bytes, got {len(data)}"
+                )
+            self._data = bytearray(data)
+        else:
+            self._data = bytearray(expected_size)
+
+    def _get_alpha(self, m: int) -> float:
+        if m == 16:
+            return 0.673
+        elif m == 32:
+            return 0.697
+        elif m == 64:
+            return 0.709
+        return 0.7213 / (1 + 1.079 / m)
+
+    def add(self, item_bytes: bytes):
+        self._add_hll(item_bytes)
+        self._add_bloom(item_bytes)
+
+    def _add_hll(self, item_bytes: bytes):
+        h = hashlib.sha1(item_bytes).digest()
+        x = struct.unpack("<Q", h[:8])[0]
+        j = x & (self.m - 1)
+        w = x >> self.p
+        rank = 1
+        while w & 1 == 0 and rank <= (64 - self.p):
+            rank += 1
+            w >>= 1
+        if rank > self._data[j]:
+            self._data[j] = rank
+
+    def _add_bloom(self, item_bytes: bytes):
+        h = hashlib.md5(item_bytes).digest()
+        h1, h2 = struct.unpack("<QQ", h)
+        offset = self.m
+        for i in range(self.bloom_k):
+            bit_idx = (h1 + i * h2) % self.bloom_bits
+            byte_idx = offset + (bit_idx // 8)
+            mask = 1 << (bit_idx % 8)
+            self._data[byte_idx] |= mask
+
+    def __contains__(self, item_bytes: bytes) -> bool:
+        h = hashlib.md5(item_bytes).digest()
+        h1, h2 = struct.unpack("<QQ", h)
+        offset = self.m
+        for i in range(self.bloom_k):
+            bit_idx = (h1 + i * h2) % self.bloom_bits
+            byte_idx = offset + (bit_idx // 8)
+            mask = 1 << (bit_idx % 8)
+            if not (self._data[byte_idx] & mask):
+                return False
+        return True
+
+    def __len__(self) -> int:
+        zeros = 0
+        sum_inv = 0.0
+        for i in range(self.m):
+            val = self._data[i]
+            if val == 0:
+                zeros += 1
+            sum_inv += 2.0 ** (-val)
+        E = self.alpha * (self.m**2) / sum_inv
+        if E <= 2.5 * self.m:
+            if zeros > 0:
+                E = self.m * math.log(self.m / zeros)
+        return int(E)
+
+    def to_bytes(self) -> bytes:
+        return bytes(self._data)
+
+
+class AsyncSketchBatch[T: BaseModel]:
+    """Async Context manager for batched updates to an ApproximateSet."""
+
+    def __init__(self, manager: "AsyncBeaverSketch[T]"):
+        self._manager = manager
+        self._pending_items: list[Any] = []
+
+    def add(self, item: Any):
+        """Adds an item to the pending batch buffer."""
+        self._pending_items.append(item)
+
+    async def __aenter__(self):
+        if self._manager._sketch is None:
+            await self._manager._ensure_sketch()
+
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        if not self._pending_items:
+            return
+
+        # Atomic Bulk Update: Lock -> Reload -> Modify -> Save
+        async with self._manager._internal_lock:
+            async with self._manager._db.transaction():
+                # 1. Reload latest state from DB
+                await self._manager._reload()
+
+                # 2. Update in-memory (CPU bound, could offload to thread if huge)
+                for item in self._pending_items:
+                    serialized_item = self._manager._serialize(item)
+                    item_bytes = serialized_item.encode("utf-8")
+                    self._manager._sketch.add(item_bytes)
+
+                # 3. Save back to DB
+                await self._manager._save()
+
+        self._pending_items.clear()
+
+
+@runtime_checkable
+class IBeaverSketch[T: BaseModel](Protocol):
+    """Protocol exposed to the user via BeaverBridge."""
+
+    def add(self, item: T) -> None: ...
+    def contains(self, item: T) -> bool: ...
+    def count(self) -> int: ...
+    def clear(self) -> None: ...
+    def batched(self) -> AsyncSketchBatch[T]: ...
+    def __len__(self) -> int: ...
+    def __contains__(self, item: T) -> bool: ...
+
+
+class AsyncBeaverSketch[T: BaseModel](AsyncBeaverBase[T]):
+    """
+    Manages a persistent ApproximateSet (Bloom + HLL).
+    """
+
+    def __init__(
+        self,
+        name: str,
+        db: "AsyncBeaverDB",
+        capacity: int = 1_000_000,
+        error_rate: float = 0.01,
+        model: type[T] | None = None,
+    ):
+        super().__init__(name, db, model=model)
+        self._capacity = capacity
+        self._error_rate = error_rate
+        self._sketch: ApproximateSet | None = None
+
+    async def _ensure_sketch(self):
+        """Loads the sketch from DB or creates it if it doesn't exist."""
+        cursor = await self.connection.execute(
+            "SELECT capacity, error_rate, data FROM __beaver_sketches__ WHERE name = ?",
+            (self._name,),
+        )
+        row = await cursor.fetchone()
+
+        if row:
+            db_cap, db_err, db_data = row["capacity"], row["error_rate"], row["data"]
+            # Allow small float tolerance
+            if db_cap != self._capacity or abs(db_err - self._error_rate) > 1e-9:
+                raise ValueError(
+                    f"Sketch '{self._name}' exists with capacity={db_cap}, error={db_err}. "
+                    f"Cannot load with requested capacity={self._capacity}, error={self._error_rate}."
+                )
+            self._sketch = ApproximateSet(
+                self._capacity, self._error_rate, data=db_data
+            )
+        else:
+            self._sketch = ApproximateSet(self._capacity, self._error_rate)
+            await self._save()
+
+    async def _reload(self):
+        """Reloads the binary data from the database."""
+        cursor = await self.connection.execute(
+            "SELECT data FROM __beaver_sketches__ WHERE name = ?", (self._name,)
+        )
+        row = await cursor.fetchone()
+        if row:
+            self._sketch._data = bytearray(row["data"])
+
+    async def _save(self):
+        """Persists the current in-memory sketch to the database."""
+        if self._sketch:
+            await self.connection.execute(
+                """
+                INSERT OR REPLACE INTO __beaver_sketches__ (name, type, capacity, error_rate, data)
+                VALUES (?, 'approx_set', ?, ?, ?)
+                """,
+                (self._name, self._capacity, self._error_rate, self._sketch.to_bytes()),
+            )
+
+    @atomic
+    async def add(self, item: T):
+        """
+        Adds a single item to the sketch atomically.
+        """
+        if self._sketch is None:
+            await self._ensure_sketch()
+
+        serialized_item = self._serialize(item)
+        item_bytes = serialized_item.encode("utf-8")
+
+        await self._reload()
+        self._sketch.add(item_bytes)
+        await self._save()
+
+    async def contains(self, item: T) -> bool:
+        """
+        Checks membership using the local cached state.
+        Note: Does not strictly reload from DB for performance reasons.
+        """
+        if self._sketch is None:
+            await self._ensure_sketch()
+
+        serialized_item = self._serialize(item)
+        item_bytes = serialized_item.encode("utf-8")
+        return item_bytes in self._sketch
+
+    async def count(self) -> int:
+        """Returns approximate cardinality using local cached state."""
+        if self._sketch is None:
+            await self._ensure_sketch()
+
+        return len(self._sketch)
+
+    def batched(self) -> AsyncSketchBatch[T]:
+        """Returns an async context manager for batched updates."""
+        # Initialize lazily if needed
+        if self._sketch is None:
+            # We can't await here in a sync method, so we rely on _init or first usage
+            pass
+
+        return AsyncSketchBatch(self)
+
+    async def clear(self):
+        """Resets the sketch to empty."""
+        if self._sketch is None:
+            await self._ensure_sketch()
+
+        self._sketch = ApproximateSet(self._capacity, self._error_rate)
+        await self._save()
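The two helpers above encode the standard sizing rules: the Bloom filter uses `m = -n * ln(e) / (ln 2)^2` bits and `k = (m / n) * ln 2` hash functions for capacity `n` and error rate `e`, while the HyperLogLog precision `p` inverts the error bound `e ≈ 1.04 / sqrt(2^p)` (clamped to 4..18). Since `ApproximateSet` is a self-contained, pure-Python class, its behavior can be sketched standalone; the outputs noted below are illustrative, as both structures are approximate by design.

```python
# Hedged usage sketch of ApproximateSet (assumes the wheel is installed).
from beaver.sketches import ApproximateSet

s = ApproximateSet(capacity=100_000, error_rate=0.01)
for i in range(10_000):
    s.add(f"user-{i}".encode("utf-8"))  # items are added as raw bytes

print(b"user-42" in s)    # True: the Bloom filter has no false negatives
print(b"ghost" in s)      # almost certainly False (~1% worst-case false positives)
print(len(s))             # HyperLogLog cardinality estimate, close to 10_000
print(len(s.to_bytes()))  # serialized size is fixed by capacity and error_rate
```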
beaver/types.py
ADDED
@@ -0,0 +1,32 @@
+import json
+import sqlite3
+from typing import Any, Callable, Optional, Protocol, Type, Self, runtime_checkable
+
+from .cache import ICache
+
+
+class IDatabase(Protocol):
+    @property
+    def connection(self) -> sqlite3.Connection: ...
+    def cache(self, key: str) -> "ICache": ...
+    def singleton[T, M](
+        self, cls: Type[M], name: str, model: Type[T] | None = None, **kwargs
+    ) -> M: ...
+    def emit(self, topic: str, event: str, payload: dict) -> bool: ...
+    def on(
+        self,
+        topic: str,
+        event: str,
+        callback: Callable,
+    ): ...
+    def off(
+        self,
+        topic: str,
+        event: str,
+        callback: Callable,
+    ): ...
+
+
+@runtime_checkable
+class IResourceManager(Protocol):
+    def close(self): ...
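Of these protocols, only `IResourceManager` is `@runtime_checkable`, so it supports structural `isinstance` checks (which verify that the named methods exist, not their signatures); an `isinstance` check against the undecorated `IDatabase` would raise `TypeError`. A minimal sketch, where `PooledConnection` is a hypothetical class, not part of beaver:

```python
from beaver.types import IResourceManager

class PooledConnection:  # hypothetical example class
    def close(self):
        print("connection closed")

resource = PooledConnection()
if isinstance(resource, IResourceManager):  # structural match on close()
    resource.close()
```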
beaver/vectors.py
ADDED
@@ -0,0 +1,198 @@
+import json
+import math
+import struct
+from typing import (
+    List,
+    Tuple,
+    Iterator,
+    AsyncIterator,
+    Protocol,
+    runtime_checkable,
+    TYPE_CHECKING,
+    Any,
+)
+
+from pydantic import BaseModel
+
+from .manager import AsyncBeaverBase, atomic, emits
+
+if TYPE_CHECKING:
+    from .core import AsyncBeaverDB
+
+
+class VectorItem[T](BaseModel):
+    """Represents a stored vector with metadata."""
+
+    id: str
+    vector: List[float]
+    metadata: T | None = None
+    score: float = 0
+
+
+@runtime_checkable
+class IBeaverVectors[T](Protocol):
+    """Protocol exposed to the user via BeaverBridge."""
+
+    def set(self, id: str, vector: List[float], metadata: T | None = None) -> None: ...
+    def get(self, id: str) -> VectorItem[T] | None: ...
+    def delete(self, id: str) -> None: ...
+
+    def search(self, vector: List[float], k: int = 10) -> List[VectorItem[T]]: ...
+
+    def count(self) -> int: ...
+    def clear(self) -> None: ...
+    def __iter__(self) -> Iterator[VectorItem[T]]: ...
+
+
+class AsyncBeaverVectors[T: BaseModel](AsyncBeaverBase[T]):
+    """
+    A simple, persistent vector store.
+
+    Performs exact Nearest Neighbor search by doing a full scan
+    and computing distances in memory.
+
+    Table managed:
+    - __beaver_vectors__ (collection, item_id, vector, metadata)
+    """
+
+    def __init__(self, name: str, db: "AsyncBeaverDB", model: type[T] | None = None):
+        super().__init__(name, db, model)
+        # T is the metadata model
+        self._meta_model = model
+
+    def _serialize_vector(self, vector: List[float]) -> bytes:
+        """Packs a list of floats into binary data."""
+        # Use 'f' for float (4 bytes) or 'd' for double (8 bytes).
+        # 'f' is standard for most embeddings.
+        return struct.pack(f"{len(vector)}f", *vector)
+
+    def _deserialize_vector(self, data: bytes) -> List[float]:
+        """Unpacks binary data into a list of floats."""
+        count = len(data) // 4
+        return list(struct.unpack(f"{count}f", data))
+
+    def _cosine_similarity(self, v1: List[float], v2: List[float]) -> float:
+        """Computes Cosine Similarity between two vectors."""
+        if len(v1) != len(v2):
+            return -1.0  # Dimension mismatch punishment
+
+        dot_product = sum(a * b for a, b in zip(v1, v2))
+        norm_v1 = math.sqrt(sum(a * a for a in v1))
+        norm_v2 = math.sqrt(sum(b * b for b in v2))
+
+        if norm_v1 == 0 or norm_v2 == 0:
+            return 0.0
+
+        return dot_product / (norm_v1 * norm_v2)
+
+    @emits("set", payload=lambda id, *args, **kwargs: dict(id=id))
+    @atomic
+    async def set(self, id: str, vector: List[float], metadata: T | None = None):
+        """
+        Stores a vector and optional metadata.
+        """
+        vec_blob = self._serialize_vector(vector)
+
+        # Serialize metadata using base manager logic
+        meta_json = self._serialize(metadata) if metadata else None
+
+        await self.connection.execute(
+            """
+            INSERT OR REPLACE INTO __beaver_vectors__
+            (collection, item_id, vector, metadata)
+            VALUES (?, ?, ?, ?)
+            """,
+            (self._name, id, vec_blob, meta_json),
+        )
+
+    @atomic
+    async def get(self, id: str) -> VectorItem[T]:
+        """Retrieves a vector item by ID."""
+        cursor = await self.connection.execute(
+            "SELECT vector, metadata FROM __beaver_vectors__ WHERE collection = ? AND item_id = ?",
+            (self._name, id),
+        )
+        row = await cursor.fetchone()
+
+        if not row:
+            raise KeyError(id)
+
+        vector = self._deserialize_vector(row["vector"])
+        meta_val = self._deserialize(row["metadata"]) if row["metadata"] else None
+
+        return VectorItem(id=id, vector=vector, metadata=meta_val)
+
+    @emits("delete", payload=lambda id, *args, **kwargs: dict(id=id))
+    @atomic
+    async def delete(self, id: str):
+        """Deletes a vector item."""
+        await self.connection.execute(
+            "DELETE FROM __beaver_vectors__ WHERE collection = ? AND item_id = ?",
+            (self._name, id),
+        )
+
+    async def search(self, vector: List[float], k: int = 10) -> List[VectorItem[T]]:
+        """
+        Performs exact KNN search using Cosine Similarity.
+        Scans the entire table for this collection.
+        """
+        query_vec = vector
+
+        # 1. Fetch ALL vectors (Full Scan)
+        # Optimization: We could stream this if memory is an issue,
+        # but for a simple store, fetching all is fine.
+        cursor = await self.connection.execute(
+            "SELECT item_id, vector, metadata FROM __beaver_vectors__ WHERE collection = ?",
+            (self._name,),
+        )
+
+        candidates = []
+        async for row in cursor:
+            # CPU Bound work inside the loop
+            row_vec = self._deserialize_vector(row["vector"])
+            score = self._cosine_similarity(query_vec, row_vec)
+
+            candidates.append((score, row))
+
+        # 2. Sort by Score Descending
+        candidates.sort(key=lambda x: x[0], reverse=True)
+
+        # 3. Take Top K and Hydrate
+        top_k = candidates[:k]
+        results = []
+
+        for score, row in top_k:
+            # Reconstruct item
+            vec = self._deserialize_vector(row["vector"])
+            meta_val = self._deserialize(row["metadata"]) if row["metadata"] else None
+
+            item = VectorItem(
+                id=row["item_id"], vector=vec, metadata=meta_val, score=score
+            )
+            results.append(item)
+
+        return results
+
+    async def count(self) -> int:
+        cursor = await self.connection.execute(
+            "SELECT COUNT(*) FROM __beaver_vectors__ WHERE collection = ?",
+            (self._name,),
+        )
+        result = await cursor.fetchone()
+        return result[0] if result else 0
+
+    @atomic
+    async def clear(self):
+        await self.connection.execute(
+            "DELETE FROM __beaver_vectors__ WHERE collection = ?", (self._name,)
+        )
+
+    async def __aiter__(self):
+        cursor = await self.connection.execute(
+            "SELECT item_id, vector, metadata FROM __beaver_vectors__ WHERE collection = ?",
+            (self._name,),
+        )
+        async for row in cursor:
+            vec = self._deserialize_vector(row["vector"])
+            meta = self._deserialize(row["metadata"]) if row["metadata"] else None
+            yield VectorItem(id=row["item_id"], vector=vec, metadata=meta)
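`search()` above ranks every row by the normalized dot product computed in `_cosine_similarity`, so scores fall in [-1, 1] (with -1.0 also used as a sentinel for dimension mismatches). A standalone re-implementation of the scoring, for illustration only:

```python
import math

def cosine(v1: list[float], v2: list[float]) -> float:
    # Normalized dot product, 0.0 for zero-magnitude vectors,
    # mirroring _cosine_similarity above.
    dot = sum(a * b for a, b in zip(v1, v2))
    n1 = math.sqrt(sum(a * a for a in v1))
    n2 = math.sqrt(sum(b * b for b in v2))
    return dot / (n1 * n2) if n1 and n2 else 0.0

print(cosine([1.0, 0.0], [1.0, 0.0]))   # 1.0  (same direction)
print(cosine([1.0, 0.0], [0.0, 1.0]))   # 0.0  (orthogonal)
print(cosine([2.0, 0.0], [-1.0, 0.0]))  # -1.0 (opposite direction)
```

Note also that vectors are persisted as packed 32-bit floats (`struct.pack(f"{len(vector)}f", ...)`), so stored values round-trip with float32 precision, not float64.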
beaver_db-2.0rc2.dist-info/METADATA
ADDED
@@ -0,0 +1,149 @@
+Metadata-Version: 2.4
+Name: beaver-db
+Version: 2.0rc2
+Summary: Fast, async-native, embedded, and multi-modal DB based on SQLite for AI-powered applications.
+License-File: LICENSE
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Database
+Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
+Requires-Python: >=3.12
+Requires-Dist: aiosqlite>=0.21.0
+Requires-Dist: numpy>=2.3.4
+Requires-Dist: pydantic>=2.12.3
+Requires-Dist: rich>=14.2.0
+Requires-Dist: typer>=0.20.0
+Provides-Extra: full
+Requires-Dist: cryptography>=46.0.3; extra == 'full'
+Requires-Dist: fastapi[standard]>=0.118.0; extra == 'full'
+Provides-Extra: remote
+Requires-Dist: fastapi[standard]>=0.118.0; extra == 'remote'
+Provides-Extra: security
+Requires-Dist: cryptography>=46.0.3; extra == 'security'
+Description-Content-Type: text/markdown
+
+<div style="text-align: center;">
+<img src="https://github.com/syalia-srl/beaver/blob/main/logo.png?raw=true" width="256px">
+</div>
+
+---
+
+<!-- Project badges -->
+![](https://img.shields.io/pypi/v/beaver-db.svg)
+![](https://img.shields.io/pypi/dm/beaver-db.svg)
+![](https://img.shields.io/github/license/syalia-srl/beaver.svg)
+![](https://img.shields.io/badge/python-3.12+-blue.svg)
+![](https://github.com/syalia-srl/beaver/actions/workflows/test.yml/badge.svg)
+
+-----
+
+`beaver` is a simple, local, and embedded database designed to manage complex, modern data types without requiring a database server, built on top of SQLite.
+
+## Design Philosophy
+
+`beaver` is built with a minimalistic philosophy for small, local use cases where a full-blown database server would be overkill.
+
+* **Minimal Dependencies**: The core library has minimal dependencies (`numpy`, `pydantic`, `rich`, `typer`). Advanced features (like the REST server) are optional extras.
+* **Safe Concurrency**: Thread-safe and multi-process-safe by default, with robust inter-process locking.
+* **Local-First**: A single, portable SQLite file is the default.
+* **Fast & Performant**: Zero network latency for local operations and an optional, in-memory read cache.
+* **Standard SQLite**: The database file is 100% compatible with any standard SQLite tool, ensuring data portability.
+* **Pythonic API**: Designed to feel like a natural extension of your code, using standard Python data structures and Pydantic models.
+
+## Installation
+
+Install the core library:
+
+```bash
+pip install beaver-db
+```
+
+To include optional features, you can install them as extras:
+
+```bash
+# For the REST API server and client
+pip install "beaver-db[remote]"
+
+# To install all optional features at once
+pip install "beaver-db[full]"
+```
+
+### Docker
+
+You can also run the BeaverDB REST API server using Docker.
+
+```bash
+docker pull ghcr.io/syalia-srl/beaver:latest
+docker run -p 8000:8000 -v $(pwd)/data:/app ghcr.io/syalia-srl/beaver
+```
+
+## Quickstart
+
+Get up and running in 30 seconds. This example showcases a dictionary, a list, and full-text search in a single script.
+
+```python
+from beaver import BeaverDB, Document
+
+# 1. Initialize the database
+db = BeaverDB("data.db")
+
+# 2. Use a namespaced dictionary for app configuration
+config = db.dict("app_config")
+config["theme"] = "dark"
+print(f"Theme set to: {config['theme']}")
+
+# 3. Use a persistent list to manage a task queue
+tasks = db.list("daily_tasks")
+tasks.push("Write the project report")
+tasks.push("Deploy the new feature")
+print(f"First task is: {tasks[0]}")
+
+# 4. Use a collection for document storage and search
+articles = db.collection("articles")
+doc = Document(
+    id="sqlite-001",
+    body="SQLite is a powerful embedded database ideal for local apps.",
+)
+articles.index(doc)
+
+# Perform a full-text search
+results = articles.match(query="database")
+top_doc, rank = results[0]
+print(f"FTS Result: '{top_doc.body}'")
+
+db.close()
+```
+
+## Features
+
+* [**Key-Value Dictionaries**](https://syalia.com/beaver/guide-dicts-blobs.html): A Pythonic, dictionary-like interface for storing any JSON-serializable object or Pydantic model within separate namespaces. Includes TTL support for caching.
+* [**Blob Storage**](https://syalia.com/beaver/guide-dicts-blobs.html): A dictionary-like interface for storing binary data (e.g., images, PDFs) with associated JSON metadata.
+* [**Persistent Lists**](https://syalia.com/beaver/guide-lists-queues.html): A full-featured, persistent Python list supporting `push`, `pop`, `prepend`, `deque`, slicing, and in-place updates.
+* [**Persistent Priority Queue**](https://syalia.com/beaver/guide-lists-queues.html): A high-performance, persistent priority queue perfect for task orchestration across multiple processes.
+* **Probabilistic Sketches**: Track cardinality and membership for millions of items in constant space using HyperLogLog and Bloom Filters.
+* [**Document Collections**](https://syalia.com/beaver/guide-collections.html): Store rich documents combining a vector embedding and Pydantic-based metadata.
+* [**Vector Search**](https://syalia.com/beaver/guide-collections.html#vector-search): Fast, multi-process-safe linear vector search using an in-memory `numpy`-based index.
+* [**Full-Text & Fuzzy Search**](https://syalia.com/beaver/guide-collections.html#full-text-fuzzy-search): Automatically index and search through document metadata using SQLite's FTS5 engine, with optional fuzzy search for typo-tolerant matching.
+* [**Knowledge Graph**](https://syalia.com/beaver/guide-collections.html#knowledge-graph): Create directed, labeled relationships between documents and traverse the graph to find neighbors or perform multi-hop walks.
+* [**Pub/Sub System**](https://syalia.com/beaver/guide-realtime.html): A powerful, thread- and process-safe publish-subscribe system for real-time messaging with a fan-out architecture.
+* [**Time-Indexed Logs**](https://syalia.com/beaver/guide-realtime.html): A specialized data structure for structured, time-series logs. Query historical data by time range or create a live, aggregated view.
+* [**Event-Driven Callbacks**](https://syalia.com/beaver/guide-realtime.html): Listen for database changes in real-time. Subscribe to events on specific managers to trigger workflows or update UIs.
+* [**Inter-Process Locking**](https://syalia.com/beaver/guide-concurrency.html): Robust, deadlock-proof locks. Use `db.lock('task_name')` to coordinate arbitrary scripts, or `with db.list('my_list') as l:` to perform atomic, multi-step operations (see the sketch after this list).
+* [**Pydantic Support**](https://syalia.com/beaver/dev-architecture.html#type-safe-models): Optionally associate `pydantic.BaseModel`s with any data structure for automatic, recursive data validation and (de)serialization.
+* [**Deployment**](https://syalia.com/beaver/guide-deployment.html): Instantly serve your database over a RESTful API with `beaver serve` and interact with it via the `beaver` CLI.
+* [**Data Export & Backups**](https://syalia.com/beaver/guide-deployment.html): Dump any data structure to a portable JSON file with a single `.dump()` command.
+
+## Documentation
+
+For a complete API reference, in-depth guides, and more examples, please visit the official documentation at:
+
+[**https://syalia.com/beaver**](https://syalia.com/beaver)
+
+## Contributing
+
+Contributions are welcome! If you think of something that would make `beaver` more useful for your use case, please open an issue or submit a pull request.
+
+## License
+
+This project is licensed under the MIT License.