kitedb 0.2.5__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kitedb/__init__.py +273 -0
- kitedb/_kitedb.cp313-win_amd64.pyd +0 -0
- kitedb/_kitedb.pyi +677 -0
- kitedb/builders.py +901 -0
- kitedb/fluent.py +850 -0
- kitedb/schema.py +327 -0
- kitedb/traversal.py +1523 -0
- kitedb/vector_index.py +472 -0
- kitedb-0.2.5.dist-info/METADATA +217 -0
- kitedb-0.2.5.dist-info/RECORD +12 -0
- kitedb-0.2.5.dist-info/WHEEL +4 -0
- kitedb-0.2.5.dist-info/licenses/LICENSE +21 -0
kitedb/vector_index.py
ADDED
|
@@ -0,0 +1,472 @@
|
|
|
1
|
+
"""High-level vector index API mirroring the TypeScript VectorIndex."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import math
|
|
7
|
+
from collections import OrderedDict
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import Callable, Dict, Iterable, List, Optional, Sequence, Tuple
|
|
10
|
+
|
|
11
|
+
from kitedb._kitedb import IvfConfig, IvfIndex, SearchOptions, brute_force_search
|
|
12
|
+
from kitedb.builders import NodeRef
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Maps user-facing metric aliases (lowercased by VectorIndex) to the
# backend enum names used in the manifest; several spellings of the
# dot-product metric are accepted.
_METRIC_MAP = {
    "cosine": "Cosine",
    "euclidean": "Euclidean",
    "dot": "DotProduct",
    "dot_product": "DotProduct",
    "dotproduct": "DotProduct",
}
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _validate_vector(vector: Sequence[float]) -> Optional[str]:
|
|
25
|
+
if len(vector) == 0:
|
|
26
|
+
return "Vector is empty"
|
|
27
|
+
for value in vector:
|
|
28
|
+
if not math.isfinite(value):
|
|
29
|
+
return "Vector contains NaN or infinity"
|
|
30
|
+
return None
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class _LRUCache:
|
|
34
|
+
def __init__(self, max_size: int = 10_000):
|
|
35
|
+
self._max_size = max_size
|
|
36
|
+
self._data: OrderedDict[int, NodeRef] = OrderedDict()
|
|
37
|
+
|
|
38
|
+
def get(self, key: int) -> Optional[NodeRef]:
|
|
39
|
+
value = self._data.get(key)
|
|
40
|
+
if value is not None:
|
|
41
|
+
self._data.move_to_end(key)
|
|
42
|
+
return value
|
|
43
|
+
|
|
44
|
+
def set(self, key: int, value: NodeRef) -> None:
|
|
45
|
+
if key in self._data:
|
|
46
|
+
self._data.move_to_end(key)
|
|
47
|
+
self._data[key] = value
|
|
48
|
+
if len(self._data) > self._max_size:
|
|
49
|
+
self._data.popitem(last=False)
|
|
50
|
+
|
|
51
|
+
def delete(self, key: int) -> None:
|
|
52
|
+
self._data.pop(key, None)
|
|
53
|
+
|
|
54
|
+
def clear(self) -> None:
|
|
55
|
+
self._data.clear()
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass
class VectorIndexOptions:
    """Construction-time configuration for :class:`VectorIndex`."""

    dimensions: int  # dimensionality every stored and query vector must match
    metric: str = "cosine"  # alias looked up in _METRIC_MAP (case-insensitive); unknown names fall back to Cosine
    row_group_size: int = 1024  # vectors per row group in the serialized manifest
    fragment_target_size: int = 100_000  # passed through into the manifest config
    normalize: Optional[bool] = None  # None => normalize exactly when metric == "cosine"
    ivf: Optional[Dict[str, object]] = None  # raw IVF overrides; "n_clusters" and "n_probe" keys are read
    training_threshold: int = 1000  # below this many vectors, searches stay brute-force
    cache_max_size: int = 10_000  # capacity of the NodeRef LRU cache used by search()
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@dataclass
class SimilarOptions:
    """Per-call options for :meth:`VectorIndex.search`."""

    k: int  # maximum number of hits to return
    threshold: Optional[float] = None  # hits with similarity below this are dropped
    n_probe: Optional[int] = None  # IVF probe count; None leaves the index default
    filter: Optional[Callable[[int], bool]] = None  # node_id predicate; a raising or falsy filter excludes the node
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@dataclass
class VectorSearchHit:
    """One result from :meth:`VectorIndex.search`."""

    node: NodeRef  # resolved from the NodeRef LRU cache
    distance: float  # distance value reported by the search backend
    similarity: float  # similarity value reported by the search backend (used for thresholding)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class VectorIndex:
    """In-memory vector store with lazily trained IVF acceleration.

    Vectors are keyed by the owning node's id (``NodeRef.id`` — assumed
    to be a stable integer; confirm against ``kitedb.builders``).  Below
    ``training_threshold`` live vectors, searches run brute force; once
    past it, :meth:`build_index` trains a backend :class:`IvfIndex` and
    subsequent inserts/deletes are mirrored into it.
    """

    def __init__(self, options: VectorIndexOptions):
        self._dimensions = options.dimensions
        # Metric aliases are case-insensitive; unknown names fall back to Cosine.
        self._metric = options.metric.lower()
        self._metric_enum = _METRIC_MAP.get(self._metric, "Cosine")
        self._row_group_size = options.row_group_size
        self._fragment_target_size = options.fragment_target_size
        self._normalize = options.normalize
        if self._normalize is None:
            # Default: normalize inputs only for the cosine metric.
            self._normalize = self._metric == "cosine"

        self._ivf_config = options.ivf or {}
        self._training_threshold = options.training_threshold
        self._node_ref_cache = _LRUCache(options.cache_max_size)

        # node_id -> stored vector (already normalized when enabled).
        self._vectors: Dict[int, List[float]] = {}
        # Bidirectional node_id <-> vector_id bookkeeping.  Vector ids are
        # monotonically increasing and never reused (see set()).
        self._node_to_vector: Dict[int, int] = {}
        self._vector_to_node: Dict[int, int] = {}
        self._next_vector_id = 0

        self._index: Optional[IvfIndex] = None
        self._needs_training = True
        self._is_building = False  # guards mutation during build_index()

        # Serialized manifest is cached until the vector set changes.
        self._manifest_cache: Optional[str] = None
        self._manifest_dirty = True

    def set(self, node_ref: NodeRef, vector: Sequence[float]) -> None:
        """Insert or replace the vector associated with *node_ref*.

        Raises:
            ValueError: while an index build is in progress, on a
                dimension mismatch, or for an empty/non-finite vector.
        """
        if self._is_building:
            raise ValueError("Cannot modify vectors while index is being built")

        vec = self._coerce_vector(vector)
        if len(vec) != self._dimensions:
            raise ValueError(
                f"Vector dimension mismatch: expected {self._dimensions}, got {len(vec)}"
            )

        validation_error = _validate_vector(vec)
        if validation_error is not None:
            raise ValueError(f"Invalid vector: {validation_error}")

        if self._normalize:
            vec = self._normalize_vector(vec)

        node_id = node_ref.id

        if node_id in self._node_to_vector:
            # Replacing: unhook the node's previous vector.  _vectors is
            # keyed by node id, so .get(node_id) still yields the OLD
            # vector at this point (the overwrite happens below).
            old_vector_id = self._node_to_vector[node_id]
            old_vector = self._vectors.get(node_id)
            if old_vector is not None and self._index is not None and self._index.trained:
                self._index.delete(old_vector_id, old_vector)
            self._node_to_vector.pop(node_id, None)
            self._vector_to_node.pop(old_vector_id, None)

        # Fresh vector id on every write; ids are never recycled.
        vector_id = self._next_vector_id
        self._next_vector_id += 1

        self._vectors[node_id] = vec
        self._node_to_vector[node_id] = vector_id
        self._vector_to_node[vector_id] = node_id
        self._manifest_dirty = True

        self._node_ref_cache.set(node_id, node_ref)

        if self._index is not None and self._index.trained:
            self._index.insert(vector_id, vec)
        else:
            # No trained index yet: defer training to the next search/build.
            self._needs_training = True

    def get(self, node_ref: NodeRef) -> Optional[List[float]]:
        """Return the stored (possibly normalized) vector, or ``None``."""
        return self._vectors.get(node_ref.id)

    def delete(self, node_ref: NodeRef) -> bool:
        """Remove *node_ref*'s vector; return ``True`` if one was removed.

        Raises:
            ValueError: while an index build is in progress.
        """
        if self._is_building:
            raise ValueError("Cannot modify vectors while index is being built")

        node_id = node_ref.id
        vector_id = self._node_to_vector.get(node_id)
        if vector_id is None:
            return False

        vector = self._vectors.get(node_id)
        if vector is None:
            return False

        if self._index is not None and self._index.trained:
            self._index.delete(vector_id, vector)

        self._vectors.pop(node_id, None)
        self._node_to_vector.pop(node_id, None)
        self._vector_to_node.pop(vector_id, None)
        self._node_ref_cache.delete(node_id)
        self._manifest_dirty = True
        return True

    def has(self, node_ref: NodeRef) -> bool:
        """Return ``True`` if a vector is stored for *node_ref*."""
        return node_ref.id in self._node_to_vector

    def build_index(self) -> None:
        """(Re)train the IVF index over all live vectors.

        Below ``training_threshold`` vectors this clears any existing
        index, leaving searches on the brute-force path.

        Raises:
            ValueError: if a build is already in progress.
        """
        if self._is_building:
            raise ValueError("Index build already in progress")

        self._is_building = True
        try:
            live_vectors = len(self._vectors)
            if live_vectors < self._training_threshold:
                self._index = None
                self._needs_training = False
                return

            n_clusters = self._ivf_config.get("n_clusters")
            if n_clusters is None:
                # Heuristic: sqrt(N) clusters, clamped to [16, 1024].
                n_clusters = min(1024, max(16, int(math.sqrt(live_vectors))))
            n_probe = self._ivf_config.get("n_probe")
            ivf_config = IvfConfig(
                n_clusters=n_clusters,
                n_probe=int(n_probe) if n_probe is not None else None,
                metric=self._metric,
            )
            index = IvfIndex(self._dimensions, ivf_config)

            training_data, count = self._collect_training_vectors()
            index.add_training_vectors(training_data, num_vectors=count)
            index.train()

            # Populate the freshly trained index with every live vector.
            for node_id, vector in self._vectors.items():
                vector_id = self._node_to_vector[node_id]
                index.insert(vector_id, vector)

            self._index = index
            self._needs_training = False
        finally:
            self._is_building = False

    def search(
        self,
        query: Sequence[float],
        options: Optional[SimilarOptions] = None,
        **kwargs: object,
    ) -> List[VectorSearchHit]:
        """Return up to ``k`` nearest hits for *query*.

        Accepts either a :class:`SimilarOptions` instance or its fields
        as keyword arguments (``k`` is required in that case).  Lazily
        triggers :meth:`build_index` when the store has changed.

        Raises:
            ValueError: on dimension mismatch, invalid query vector, or
                missing ``k``.
        """
        opts = self._coerce_similar_options(options, kwargs)

        query_vec = self._coerce_vector(query)
        if len(query_vec) != self._dimensions:
            raise ValueError(
                f"Query dimension mismatch: expected {self._dimensions}, got {len(query_vec)}"
            )

        validation_error = _validate_vector(query_vec)
        if validation_error is not None:
            raise ValueError(f"Invalid query vector: {validation_error}")

        if self._normalize:
            query_vec = self._normalize_vector(query_vec)

        if self._needs_training:
            self.build_index()

        if self._index is not None and self._index.trained:
            results = self._search_ivf(query_vec, opts)
        else:
            results = self._search_brute_force(query_vec, opts)

        hits: List[VectorSearchHit] = []
        for node_id, distance, similarity in results:
            # NOTE(review): a node whose NodeRef was evicted from the LRU
            # cache is silently skipped here — confirm this is intended.
            node_ref = self._node_ref_cache.get(node_id)
            if node_ref is None:
                continue
            hits.append(VectorSearchHit(node=node_ref, distance=distance, similarity=similarity))
            if len(hits) >= opts.k:
                break

        return hits

    def stats(self) -> Dict[str, object]:
        """Return a summary dict (camelCase keys mirror the TS API)."""
        total_vectors = len(self._node_to_vector)
        live_vectors = len(self._vectors)
        return {
            "totalVectors": total_vectors,
            "liveVectors": live_vectors,
            "dimensions": self._dimensions,
            "metric": self._metric,
            "indexTrained": self._index.trained if self._index is not None else False,
            "indexClusters": self._index.config.n_clusters if self._index is not None else None,
        }

    def clear(self) -> None:
        """Drop all vectors, id mappings, caches, and any trained index."""
        self._vectors.clear()
        self._node_to_vector.clear()
        self._vector_to_node.clear()
        self._node_ref_cache.clear()
        self._next_vector_id = 0
        self._index = None
        self._needs_training = True
        self._manifest_cache = None
        self._manifest_dirty = True

    def buildIndex(self) -> None:  # noqa: N802
        """camelCase alias for :meth:`build_index` (TypeScript API parity)."""
        self.build_index()

    def _coerce_vector(self, vector: Sequence[float]) -> List[float]:
        """Copy *vector* into a plain list of floats."""
        return [float(v) for v in vector]

    def _normalize_vector(self, vector: List[float]) -> List[float]:
        """Return *vector* scaled to unit L2 norm (near-zero vectors pass through)."""
        norm = math.sqrt(sum(v * v for v in vector))
        if norm <= 1e-12:
            return vector
        return [v / norm for v in vector]

    def _collect_training_vectors(self) -> Tuple[List[float], int]:
        """Flatten all live vectors into one row-major list plus a count."""
        flat: List[float] = []
        count = 0
        for vec in self._vectors.values():
            flat.extend(vec)
            count += 1
        return flat, count

    def _search_brute_force(
        self,
        query_vec: List[float],
        options: SimilarOptions,
    ) -> List[Tuple[int, float, float]]:
        """Exact scan over all live vectors (used when no index is trained).

        Returns (node_id, distance, similarity) triples.
        """
        vectors: List[List[float]] = []
        node_ids: List[int] = []

        # Apply the user filter up front so brute_force_search only scores
        # surviving candidates; a filter that raises excludes the node.
        for node_id, vec in self._vectors.items():
            if options.filter is not None:
                try:
                    if not options.filter(node_id):
                        continue
                except Exception:
                    continue
            node_ids.append(node_id)
            vectors.append(vec)

        if not vectors:
            return []

        # Over-fetch 2x k so threshold filtering can still fill k hits.
        results = brute_force_search(
            vectors=vectors,
            node_ids=node_ids,
            query=query_vec,
            k=max(options.k * 2, options.k),
            metric=self._metric,
        )

        hits: List[Tuple[int, float, float]] = []
        for result in results:
            if options.threshold is not None and result.similarity < options.threshold:
                continue
            hits.append((int(result.node_id), float(result.distance), float(result.similarity)))
        return hits

    def _search_ivf(
        self,
        query_vec: List[float],
        options: SimilarOptions,
    ) -> List[Tuple[int, float, float]]:
        """Query the trained IVF index; filter/threshold applied post-hoc.

        Returns (node_id, distance, similarity) triples.
        """
        if self._index is None:
            return []

        # Only build SearchOptions when the caller overrides a default.
        search_options = None
        if options.n_probe is not None or options.threshold is not None:
            search_options = SearchOptions(
                n_probe=options.n_probe,
                threshold=options.threshold,
            )

        manifest_json = self._build_manifest_json()
        # Over-fetch 2x k so post-filtering can still fill k hits.
        results = self._index.search(
            manifest_json=manifest_json,
            query=query_vec,
            k=max(options.k * 2, options.k),
            options=search_options,
        )

        hits: List[Tuple[int, float, float]] = []
        for result in results:
            node_id = int(result.node_id)
            if options.filter is not None:
                try:
                    if not options.filter(node_id):
                        continue
                except Exception:
                    continue
            if options.threshold is not None and result.similarity < options.threshold:
                continue
            hits.append((node_id, float(result.distance), float(result.similarity)))
        return hits

    def _build_manifest_json(self) -> str:
        """Serialize the store into the manifest JSON the IVF search expects.

        The layout is a single active fragment (id 0) whose vectors are
        chunked into row groups of ``row_group_size``.  The result is
        cached until the vector set changes (``_manifest_dirty``).
        """
        if not self._manifest_dirty and self._manifest_cache is not None:
            return self._manifest_cache

        # Iterate in vector-id order so local indices are deterministic.
        vector_ids = sorted(self._vector_to_node.keys())
        row_groups: List[Dict[str, object]] = []
        vector_locations: Dict[str, Dict[str, int]] = {}

        current_data: List[float] = []
        current_count = 0
        row_group_id = 0
        local_index = 0

        for vector_id in vector_ids:
            node_id = self._vector_to_node[vector_id]
            vector = self._vectors.get(node_id)
            if vector is None:
                continue

            # Flush a full row group before appending to the next one.
            if current_count >= self._row_group_size:
                row_groups.append(
                    {
                        "id": row_group_id,
                        "count": current_count,
                        "data": current_data,
                    }
                )
                row_group_id += 1
                current_data = []
                current_count = 0

            current_data.extend(vector)
            current_count += 1
            vector_locations[str(vector_id)] = {
                "fragment_id": 0,  # everything lives in the single fragment
                "local_index": local_index,
            }
            local_index += 1

        # Always emit at least one (possibly empty) row group.
        if current_count > 0 or not row_groups:
            row_groups.append(
                {
                    "id": row_group_id,
                    "count": current_count,
                    "data": current_data,
                }
            )

        fragment = {
            "id": 0,
            "state": "Active",
            "row_groups": row_groups,
            "total_vectors": local_index,
            "deletion_bitmap": [],
            "deleted_count": 0,
        }

        manifest = {
            "config": {
                "dimensions": self._dimensions,
                "metric": self._metric_enum,
                "row_group_size": self._row_group_size,
                "fragment_target_size": self._fragment_target_size,
                "normalize_on_insert": self._normalize,
            },
            "fragments": [fragment],
            "active_fragment_id": 0,
            "total_vectors": local_index,
            "total_deleted": 0,
            "next_vector_id": self._next_vector_id,
            "node_to_vector": {str(k): v for k, v in self._node_to_vector.items()},
            "vector_to_node": {str(k): v for k, v in self._vector_to_node.items()},
            "vector_locations": vector_locations,
        }

        self._manifest_cache = json.dumps(manifest)
        self._manifest_dirty = False
        return self._manifest_cache

    def _coerce_similar_options(
        self,
        options: Optional[SimilarOptions],
        kwargs: Dict[str, object],
    ) -> SimilarOptions:
        """Build a :class:`SimilarOptions` from kwargs when none was given.

        Raises:
            ValueError: when neither ``options`` nor a ``k`` kwarg is supplied.
        """
        if options is None:
            if "k" not in kwargs:
                raise ValueError("search requires k or SimilarOptions")
            # NOTE(review): only k is coerced to int; threshold/n_probe/filter
            # kwargs are passed through untyped — confirm callers supply the
            # expected types.
            return SimilarOptions(
                k=int(kwargs["k"]),
                threshold=kwargs.get("threshold"),
                n_probe=kwargs.get("n_probe"),
                filter=kwargs.get("filter"),
            )
        return options
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
def create_vector_index(options: VectorIndexOptions) -> VectorIndex:
    """Factory helper mirroring the TypeScript ``createVectorIndex`` API."""
    index = VectorIndex(options)
    return index
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: kitedb
|
|
3
|
+
Version: 0.2.5
|
|
4
|
+
Classifier: Development Status :: 4 - Beta
|
|
5
|
+
Classifier: Intended Audience :: Developers
|
|
6
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Rust
|
|
14
|
+
Classifier: Topic :: Database
|
|
15
|
+
Classifier: Topic :: Database :: Database Engines/Servers
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Requires-Dist: pytest>=7.0 ; extra == 'dev'
|
|
18
|
+
Requires-Dist: pytest-benchmark>=4.0 ; extra == 'dev'
|
|
19
|
+
Requires-Dist: numpy>=1.20 ; extra == 'dev'
|
|
20
|
+
Requires-Dist: numpy>=1.20 ; extra == 'numpy'
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Provides-Extra: numpy
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Summary: High-performance embedded graph database with vector search
|
|
25
|
+
Keywords: graph,database,embedded,vector,search,pathfinding
|
|
26
|
+
Home-Page: https://github.com/mask-software/kitedb
|
|
27
|
+
Author: KiteDB Contributors
|
|
28
|
+
License: MIT
|
|
29
|
+
Requires-Python: >=3.8
|
|
30
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
31
|
+
Project-URL: Documentation, https://github.com/mask-software/kitedb#readme
|
|
32
|
+
Project-URL: Homepage, https://github.com/mask-software/kitedb
|
|
33
|
+
Project-URL: Repository, https://github.com/mask-software/kitedb
|
|
34
|
+
|
|
35
|
+
# KiteDB for Python
|
|
36
|
+
|
|
37
|
+
KiteDB is a high-performance embedded graph database with built-in vector search.
|
|
38
|
+
This package provides the Python bindings to the Rust core.
|
|
39
|
+
|
|
40
|
+
## Features
|
|
41
|
+
|
|
42
|
+
- ACID transactions with commit/rollback
|
|
43
|
+
- Node and edge CRUD operations with properties
|
|
44
|
+
- Labels, edge types, and property keys
|
|
45
|
+
- Fluent traversal and pathfinding (BFS, Dijkstra, A\*)
|
|
46
|
+
- Vector embeddings with IVF and IVF-PQ indexes
|
|
47
|
+
- Single-file storage format
|
|
48
|
+
|
|
49
|
+
## Install
|
|
50
|
+
|
|
51
|
+
### From PyPI
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install kitedb
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### From source
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
# Install maturin (Rust extension build tool)
|
|
61
|
+
python -m pip install -U maturin
|
|
62
|
+
|
|
63
|
+
# Build and install in development mode
|
|
64
|
+
cd ray-rs
|
|
65
|
+
maturin develop --features python
|
|
66
|
+
|
|
67
|
+
# Or build a wheel
|
|
68
|
+
maturin build --features python --release
|
|
69
|
+
pip install target/wheels/kitedb-*.whl
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Quick start (fluent API)
|
|
73
|
+
|
|
74
|
+
The fluent API provides a high-level, type-safe interface:
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from kitedb import ray, node, edge, prop, optional
|
|
78
|
+
|
|
79
|
+
# Define your schema
|
|
80
|
+
User = node("user",
|
|
81
|
+
key=lambda id: f"user:{id}",
|
|
82
|
+
props={
|
|
83
|
+
"name": prop.string("name"),
|
|
84
|
+
"email": prop.string("email"),
|
|
85
|
+
"age": optional(prop.int("age")),
|
|
86
|
+
}
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
Knows = edge("knows", {
|
|
90
|
+
"since": prop.int("since"),
|
|
91
|
+
})
|
|
92
|
+
|
|
93
|
+
# Open database
|
|
94
|
+
with ray("./social.kitedb", nodes=[User], edges=[Knows]) as db:
|
|
95
|
+
# Insert nodes
|
|
96
|
+
alice = db.insert(User).values(key="alice", name="Alice", email="alice@example.com").returning()
|
|
97
|
+
bob = db.insert(User).values(key="bob", name="Bob", email="bob@example.com").returning()
|
|
98
|
+
|
|
99
|
+
# Create edges
|
|
100
|
+
db.link(alice, Knows, bob, since=2024)
|
|
101
|
+
|
|
102
|
+
# Traverse
|
|
103
|
+
friends = db.from_(alice).out(Knows).nodes().to_list()
|
|
104
|
+
|
|
105
|
+
# Pathfinding
|
|
106
|
+
path = db.shortest_path(alice).via(Knows).to(bob).dijkstra()
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Quick start (low-level API)
|
|
110
|
+
|
|
111
|
+
For direct control, use the low-level `Database` class:
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
from kitedb import Database, PropValue
|
|
115
|
+
|
|
116
|
+
with Database("my_graph.kitedb") as db:
|
|
117
|
+
db.begin()
|
|
118
|
+
|
|
119
|
+
alice = db.create_node("user:alice")
|
|
120
|
+
bob = db.create_node("user:bob")
|
|
121
|
+
|
|
122
|
+
name_key = db.get_or_create_propkey("name")
|
|
123
|
+
db.set_node_prop(alice, name_key, PropValue.string("Alice"))
|
|
124
|
+
db.set_node_prop(bob, name_key, PropValue.string("Bob"))
|
|
125
|
+
|
|
126
|
+
knows = db.get_or_create_etype("knows")
|
|
127
|
+
db.add_edge(alice, knows, bob)
|
|
128
|
+
|
|
129
|
+
db.commit()
|
|
130
|
+
|
|
131
|
+
print("nodes:", db.count_nodes())
|
|
132
|
+
print("edges:", db.count_edges())
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## Fluent traversal
|
|
136
|
+
|
|
137
|
+
```python
|
|
138
|
+
from kitedb import TraverseOptions
|
|
139
|
+
|
|
140
|
+
friends = db.from_(alice).out(knows).to_list()
|
|
141
|
+
|
|
142
|
+
results = db.from_(alice).traverse(
|
|
143
|
+
knows,
|
|
144
|
+
TraverseOptions(max_depth=3, min_depth=1, direction="out", unique=True),
|
|
145
|
+
).to_list()
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
## Concurrent Access
|
|
149
|
+
|
|
150
|
+
KiteDB supports concurrent read operations from multiple threads. Read operations don't block each other:
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
import threading
|
|
154
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
155
|
+
|
|
156
|
+
# Multiple threads can read concurrently
|
|
157
|
+
def read_user(key):
|
|
158
|
+
return db.get_node_by_key(key)
|
|
159
|
+
|
|
160
|
+
with ThreadPoolExecutor(max_workers=4) as executor:
|
|
161
|
+
futures = [executor.submit(read_user, f"user:{i}") for i in range(100)]
|
|
162
|
+
results = [f.result() for f in futures]
|
|
163
|
+
|
|
164
|
+
# Or with asyncio (reads run concurrently)
|
|
165
|
+
import asyncio
|
|
166
|
+
|
|
167
|
+
async def read_users():
|
|
168
|
+
loop = asyncio.get_event_loop()
|
|
169
|
+
tasks = [
|
|
170
|
+
loop.run_in_executor(None, db.get_node_by_key, f"user:{i}")
|
|
171
|
+
for i in range(100)
|
|
172
|
+
]
|
|
173
|
+
return await asyncio.gather(*tasks)
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
**Concurrency model:**
|
|
177
|
+
|
|
178
|
+
- **Reads are concurrent**: Multiple `get_node_by_key()`, `get_neighbors()`, traversals, etc. can run in parallel
|
|
179
|
+
- **Writes are exclusive**: Write operations (`create_node()`, `add_edge()`, etc.) require exclusive access
|
|
180
|
+
- **Thread safety**: The `Database` object is safe to share across threads
|
|
181
|
+
|
|
182
|
+
Note: Python's GIL is released during Rust operations, allowing true parallelism for I/O-bound database access.
|
|
183
|
+
|
|
184
|
+
## Vector search
|
|
185
|
+
|
|
186
|
+
```python
|
|
187
|
+
from kitedb import IvfIndex, IvfConfig, SearchOptions
|
|
188
|
+
|
|
189
|
+
index = IvfIndex(dimensions=128, config=IvfConfig(n_clusters=100))
|
|
190
|
+
|
|
191
|
+
training_data = [0.1] * (128 * 1000)
|
|
192
|
+
index.add_training_vectors(training_data, num_vectors=1000)
|
|
193
|
+
index.train()
|
|
194
|
+
|
|
195
|
+
index.insert(vector_id=1, vector=[0.1] * 128)
|
|
196
|
+
|
|
197
|
+
results = index.search(
|
|
198
|
+
manifest_json='{"vectors": {...}}',
|
|
199
|
+
query=[0.1] * 128,
|
|
200
|
+
k=10,
|
|
201
|
+
options=SearchOptions(n_probe=20),
|
|
202
|
+
)
|
|
203
|
+
|
|
204
|
+
for result in results:
|
|
205
|
+
print(result.node_id, result.distance)
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
## Documentation
|
|
209
|
+
|
|
210
|
+
```text
|
|
211
|
+
https://ray-kwaf.vercel.app/docs
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
## License
|
|
215
|
+
|
|
216
|
+
MIT License - see the main project LICENSE file for details.
|
|
217
|
+
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
kitedb\__init__.py,sha256=Bv89fIrMT104bfqygV_wtHiUp9IACTJLGGyRtxAhprc,6191
|
|
2
|
+
kitedb\_kitedb.cp313-win_amd64.pyd,sha256=BkCqrfpZLDbT3cNIPTEx8d8a1gq_UgHfMb3YyfG7CNA,3515392
|
|
3
|
+
kitedb\_kitedb.pyi,sha256=Jyw5ujVUc1jBh5Kkgl75vO5m2xiod9TaUXaSMusE534,23624
|
|
4
|
+
kitedb\builders.py,sha256=Pg7e4sClo7km50bitPaeuuyGDkNgmKRkbtjNb1IOru4,28552
|
|
5
|
+
kitedb\fluent.py,sha256=vMcmbnve57gmg_hj99z6hWoQrLya1tOq6GdA9IcWuxM,27545
|
|
6
|
+
kitedb\schema.py,sha256=h_H0S_Ti-PdI_34Cv-Y3NL2t9CtZqT3-h6D9sZb-SMY,8974
|
|
7
|
+
kitedb\traversal.py,sha256=OzenuP2KFQWjeq_M7pXHzNkSz8yB-1-Bxvr_XXzvApc,54624
|
|
8
|
+
kitedb\vector_index.py,sha256=IF411qHuPp926FfoPglUF1DbydhtbKJtFVoB4zEryjY,15995
|
|
9
|
+
kitedb-0.2.5.dist-info\METADATA,sha256=BdxvCKkt6peNr954jBpKOoBeIp8fxrLFSV2iapZAeq4,6110
|
|
10
|
+
kitedb-0.2.5.dist-info\WHEEL,sha256=n_BmF69IyGtioVWE9c3M_zsEfe6-xMZy1v5HCL_6qE0,97
|
|
11
|
+
kitedb-0.2.5.dist-info\licenses\LICENSE,sha256=hOXoX-b3Si1uhntZcUxKoeyHDL7TIec1lCdmv1ib24c,1092
|
|
12
|
+
kitedb-0.2.5.dist-info\RECORD,,
|