leanvec 1.0.3__cp38-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- leanvec/__init__.py +5 -0
- leanvec/leanvec.abi3.so +0 -0
- leanvec-1.0.3.dist-info/METADATA +15 -0
- leanvec-1.0.3.dist-info/RECORD +6 -0
- leanvec-1.0.3.dist-info/WHEEL +4 -0
- leanvecdb.py +169 -0
leanvec/__init__.py
ADDED
leanvec/leanvec.abi3.so
ADDED
|
Binary file
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: leanvec
|
|
3
|
+
Version: 1.0.3
|
|
4
|
+
Classifier: Programming Language :: Rust
|
|
5
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
6
|
+
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
10
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
11
|
+
Classifier: Operating System :: MacOS
|
|
12
|
+
Summary: Testing
|
|
13
|
+
Author-email: Carlo Moro <cnmoro@gmail.com>
|
|
14
|
+
Requires-Python: >=3.8
|
|
15
|
+
Project-URL: Repository, https://github.com/cnmoro
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
leanvec/__init__.py,sha256=PDFpm2X0hT9o0Sac0fIPL8aAFZZljH6smas4MX99GZE,111
|
|
2
|
+
leanvec/leanvec.abi3.so,sha256=qGCVk6vLy0mMiSYKxUozKIqBWchXHi1cDQywl5H7lXQ,901536
|
|
3
|
+
leanvec-1.0.3.dist-info/METADATA,sha256=D8zqFTq0QzqNMn7_leSPRwGApeCPWNP8gI10gUsAOZk,605
|
|
4
|
+
leanvec-1.0.3.dist-info/WHEEL,sha256=j5d2PbTbu8ET4WNBayOJZzjkZHOJYftE_TabKmsnXJc,103
|
|
5
|
+
leanvecdb.py,sha256=U8Kveh4f2jFidK7oPeSokwUVfizPSyipoFzYUfsJmT8,6782
|
|
6
|
+
leanvec-1.0.3.dist-info/RECORD,,
|
leanvecdb.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import uuid
|
|
3
|
+
import os
|
|
4
|
+
import leanvec
|
|
5
|
+
import atexit
|
|
6
|
+
import gc
|
|
7
|
+
import time
|
|
8
|
+
import threading
|
|
9
|
+
import copy
|
|
10
|
+
from typing import List, Dict, Any, Optional
|
|
11
|
+
|
|
12
|
+
class LeanVecDB:
    """Multi-collection wrapper around the native ``leanvec.LeanDB`` store.

    Each collection lives in its own sub-directory of *base_path*, with a
    ``config.json`` recording the embedding dimension pinned by the first
    write. A daemon thread periodically persists all collections once the
    instance has been both long-lived and idle.
    """

    def __init__(self, base_path: str = 'leanvec_root', auto_persist: bool = True):
        """Create (or reopen) a database rooted at *base_path*.

        :param base_path: directory holding one sub-directory per collection.
        :param auto_persist: when True, register an ``atexit`` hook that
            flushes every open collection on interpreter shutdown.
        """
        self.base_path = base_path
        # exist_ok avoids a TOCTOU race if two processes create the root
        # concurrently (the original exists()/makedirs() pair could raise).
        os.makedirs(base_path, exist_ok=True)

        # Lazily-opened native handles, keyed by collection name.
        self.collections: Dict[str, "leanvec.LeanDB"] = {}
        # Embedding dimension per collection (from config.json or first write).
        self.dimensions: Dict[str, int] = {}
        self._load_existing_collections()

        self.start_time = time.time()
        self.last_access_time = time.time()
        self.maintenance_interval = 86400  # min uptime (s) before a forced persist
        self.idle_threshold = 3600         # required idle time (s) before persisting
        self.stop_maintenance = False

        self.m_thread = threading.Thread(target=self._maintenance_loop, daemon=True)
        self.m_thread.start()

        if auto_persist:
            atexit.register(self.persist_all)

    def _get_col_path(self, name: str) -> str:
        """Return the on-disk directory for collection *name*."""
        return os.path.join(self.base_path, name)

    def _load_existing_collections(self):
        """Populate ``self.dimensions`` from ``config.json`` files on disk.

        Collections themselves are opened lazily; only dimensions are read
        here. A corrupt or unreadable config is skipped, but unrelated
        errors (e.g. ``KeyboardInterrupt``) are no longer swallowed by a
        bare ``except``.
        """
        if not os.path.exists(self.base_path):
            return
        for name in os.listdir(self.base_path):
            path = self._get_col_path(name)
            if not os.path.isdir(path):
                continue
            cfg_path = os.path.join(path, 'config.json')
            if not os.path.exists(cfg_path):
                continue
            try:
                with open(cfg_path, 'r') as f:
                    self.dimensions[name] = json.load(f).get('dimension')
            except (OSError, ValueError):
                # ValueError covers json.JSONDecodeError; ignore bad configs.
                pass

    def _ensure_collection(self, name: str) -> "leanvec.LeanDB":
        """Open (creating the directory if needed) and cache collection *name*.

        Also refreshes ``last_access_time`` so the maintenance loop treats
        any collection access as activity.
        """
        self.last_access_time = time.time()
        if name not in self.collections:
            path = self._get_col_path(name)
            os.makedirs(path, exist_ok=True)
            self.collections[name] = leanvec.LeanDB(path)
        return self.collections[name]

    def list_collections(self) -> List[str]:
        """Return names of all known collections (opened or found on disk)."""
        return list(set(list(self.collections.keys()) + list(self.dimensions.keys())))

    def store_embedding(self, embedding: List[float], metadata_dict: Optional[Dict[str, Any]] = None, collection: str = "default", ttl: Optional[int] = None) -> str:
        """Store a single embedding with optional TTL (in seconds).

        :return: the id assigned to the stored document.
        """
        metadatas = [metadata_dict] if metadata_dict is not None else None
        return self.store_embeddings_batch([embedding], metadatas, collection=collection, ttl=ttl)[0]

    def store_embeddings_batch(self, embeddings: List[List[float]], metadatas: Optional[List[Dict[str, Any]]] = None, collection: str = "default", ttl: Optional[int] = None) -> List[str]:
        """Store many embeddings at once.

        :param embeddings: vectors; all must share the collection's dimension.
        :param metadatas: optional per-vector metadata dicts, parallel to
            *embeddings*. A metadata ``id``/``_id`` key is honoured as the
            document id; otherwise a UUID4 is minted.
        :param ttl: optional time-to-live in seconds, forwarded to the store.
        :raises ValueError: on a dimension mismatch, or when *metadatas* is
            given but its length differs from *embeddings*.
        :return: list of document ids, parallel to *embeddings*.
        """
        if not embeddings:
            return []

        db = self._ensure_collection(collection)
        input_dim = len(embeddings[0])

        # The first write to a collection pins its dimension on disk; every
        # later write must match it.
        if collection not in self.dimensions:
            self.dimensions[collection] = input_dim
            with open(os.path.join(self._get_col_path(collection), 'config.json'), 'w') as f:
                json.dump({'dimension': input_dim}, f)
        elif input_dim != self.dimensions[collection]:
            raise ValueError(f"Dimension Mismatch: Expected {self.dimensions[collection]}, Got {input_dim}")

        if metadatas is None:
            metadatas = [{} for _ in range(len(embeddings))]
        elif len(metadatas) != len(embeddings):
            # zip() below would silently drop the tail; make the bug loud.
            raise ValueError(f"metadatas length {len(metadatas)} != embeddings length {len(embeddings)}")

        ids = []
        for vec, meta_orig in zip(embeddings, metadatas):
            # Deep-copy so callers reusing one metadata dict across the batch
            # don't have the injected 'id' leak between entries.
            meta = copy.deepcopy(meta_orig)

            # ID generation logic (supports explicit 'id' or '_id').
            doc_id = str(meta.get("id") or meta.get("_id") or uuid.uuid4())
            meta["id"] = doc_id
            ids.append(doc_id)

            # Add to DB including the TTL parameter.
            db.add(doc_id, vec, json.dumps(meta), ttl)

        return ids

    def search(self, query_embedding: List[float], k: int = 5, filters: Optional[Dict[str, Any]] = None, collection: str = "default", autocut: bool = False) -> List[Dict[str, Any]]:
        """Nearest-neighbour search in *collection*.

        :param filters: optional metadata filter, JSON-encoded for the store.
        :param autocut: when True, truncate results at the first large
            relative jump in score (see :meth:`_calculate_autocut`).
        :return: list of ``{"id", "score", "metadata"}`` dicts.
        """
        db = self._ensure_collection(collection)
        filter_str = json.dumps(filters) if filters else None
        raw_results = db.search(query_embedding, k, filter_str)

        results = []
        for doc_id, score, meta_str in raw_results:
            try:
                meta = json.loads(meta_str)
            except (TypeError, json.JSONDecodeError):
                # Unparseable/missing metadata degrades to an empty dict.
                meta = {}

            results.append({
                "id": doc_id,
                "score": score,
                "metadata": meta
            })

        if autocut and len(results) > 1:
            scores = [r["score"] for r in results]
            cut_idx = self._calculate_autocut(scores)
            if cut_idx:
                results = results[:cut_idx]
        return results

    def _calculate_autocut(self, scores: List[float]) -> Optional[int]:
        """Return the index of the first >20% relative score increase.

        Assumes scores are ordered as returned by the store (presumably
        ascending distance — TODO confirm against the native search).
        Returns None when no such jump exists or scores are non-positive.
        """
        for i in range(1, len(scores)):
            if scores[i - 1] > 0 and (scores[i] - scores[i - 1]) / scores[i - 1] > 0.2:
                return i
        return None

    def delete(self, metadata_filter: Dict[str, Any], collection: str = "default") -> int:
        """Delete documents matching *metadata_filter*; return deleted count.

        Unknown collections return 0 without being created on disk.
        """
        if collection not in self.collections and collection not in self.dimensions:
            return 0
        db = self._ensure_collection(collection)
        return db.delete_by_filter(json.dumps(metadata_filter))

    def count(self, collection: str = "default") -> int:
        """Return the number of documents in *collection* (0 if unknown)."""
        if collection not in self.collections and collection not in self.dimensions:
            return 0
        return self._ensure_collection(collection).count()

    def persist_all(self):
        """Flush every open collection to disk (best effort, never raises)."""
        # Base path may already be gone during interpreter teardown or test
        # cleanup; avoid OSError(ENOENT) in the atexit hook.
        if not os.path.exists(self.base_path):
            return

        # One collection pass is enough; the original ran it per-collection.
        gc.collect()
        for name, db in list(self.collections.items()):
            try:
                db.persist()
            except Exception:
                # Best-effort at shutdown: never raise from an atexit hook.
                pass

    def persist(self, collection: str = "default"):
        """Persist a specific collection (no-op if it is not open)."""
        if collection in self.collections:
            self.collections[collection].persist()

    def _maintenance_loop(self):
        """Daemon loop: persist everything after long uptime plus idleness.

        Polls once per second; fires only when uptime exceeds
        ``maintenance_interval`` AND the instance has been idle for at
        least ``idle_threshold``, then resets the uptime clock.
        """
        while not self.stop_maintenance:
            time.sleep(1)
            uptime = time.time() - self.start_time
            idle_time = time.time() - self.last_access_time
            if uptime > self.maintenance_interval and idle_time > self.idle_threshold:
                self.persist_all()
                self.start_time = time.time()

    def vacuum(self, collection: str = "default"):
        """Stop-the-world compaction for a specific collection."""
        db = self._ensure_collection(collection)
        db.vacuum()