leanvec 1.0.3__cp38-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
leanvec/__init__.py ADDED
# Re-export the public API of the native (maturin-built) extension module
# ``leanvec.leanvec`` at the package top level.
from .leanvec import *

# Importing the submodule above also binds the attribute ``leanvec`` on this
# package (whose namespace is this module's globals), so we can mirror its
# docstring and, when present, its explicit ``__all__``.
__doc__ = leanvec.__doc__
if hasattr(leanvec, "__all__"):
    __all__ = leanvec.__all__
Binary file
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.4
2
+ Name: leanvec
3
+ Version: 1.0.3
4
+ Classifier: Programming Language :: Rust
5
+ Classifier: Programming Language :: Python :: Implementation :: CPython
6
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: POSIX :: Linux
10
+ Classifier: Operating System :: Microsoft :: Windows
11
+ Classifier: Operating System :: MacOS
12
+ Summary: Testing
13
+ Author-email: Carlo Moro <cnmoro@gmail.com>
14
+ Requires-Python: >=3.8
15
+ Project-URL: Repository, https://github.com/cnmoro
@@ -0,0 +1,6 @@
1
+ leanvec/__init__.py,sha256=PDFpm2X0hT9o0Sac0fIPL8aAFZZljH6smas4MX99GZE,111
2
+ leanvec/leanvec.abi3.so,sha256=qGCVk6vLy0mMiSYKxUozKIqBWchXHi1cDQywl5H7lXQ,901536
3
+ leanvec-1.0.3.dist-info/METADATA,sha256=D8zqFTq0QzqNMn7_leSPRwGApeCPWNP8gI10gUsAOZk,605
4
+ leanvec-1.0.3.dist-info/WHEEL,sha256=j5d2PbTbu8ET4WNBayOJZzjkZHOJYftE_TabKmsnXJc,103
5
+ leanvecdb.py,sha256=U8Kveh4f2jFidK7oPeSokwUVfizPSyipoFzYUfsJmT8,6782
6
+ leanvec-1.0.3.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: maturin (1.11.4)
3
+ Root-Is-Purelib: false
4
+ Tag: cp38-abi3-macosx_11_0_arm64
leanvecdb.py ADDED
@@ -0,0 +1,169 @@
1
+ import json
2
+ import uuid
3
+ import os
4
+ import leanvec
5
+ import atexit
6
+ import gc
7
+ import time
8
+ import threading
9
+ import copy
10
+ from typing import List, Dict, Any, Optional
11
+
12
class LeanVecDB:
    """Persistence/management layer over the native ``leanvec`` engine.

    Manages named collections stored under ``base_path``, records each
    collection's embedding dimension in a ``config.json`` side file, and runs
    a daemon maintenance thread that persists data after long idle periods.

    NOTE(review): ``leanvec.LeanDB`` is an opaque native handle; its
    ``add``/``search``/``persist``/``delete_by_filter``/``count``/``vacuum``
    contracts are assumed from call sites here — confirm against the Rust API.
    """

    def __init__(self, base_path: str = 'leanvec_root', auto_persist: bool = True):
        """Open (or create) the database root and start the maintenance thread.

        Args:
            base_path: Directory holding one sub-directory per collection.
            auto_persist: When True, register ``persist_all`` with ``atexit``.
        """
        self.base_path = base_path
        # exist_ok avoids a TOCTOU race between an existence check and mkdir.
        os.makedirs(base_path, exist_ok=True)

        # Open native handles, keyed by collection name (opened lazily).
        self.collections: Dict[str, "leanvec.LeanDB"] = {}
        # Embedding dimension per collection, loaded from config.json on disk.
        self.dimensions: Dict[str, int] = {}
        self._load_existing_collections()

        self.start_time = time.time()
        self.last_access_time = time.time()
        self.maintenance_interval = 86400  # seconds between maintenance passes
        self.idle_threshold = 3600         # required idle seconds before persisting
        self.stop_maintenance = False      # set True (see close()) to end the loop

        self.m_thread = threading.Thread(target=self._maintenance_loop, daemon=True)
        self.m_thread.start()

        if auto_persist:
            atexit.register(self.persist_all)

    def _get_col_path(self, name: str) -> str:
        """Return the on-disk directory for collection *name*."""
        return os.path.join(self.base_path, name)

    def _load_existing_collections(self):
        """Scan ``base_path`` and record the stored dimension of each collection."""
        if not os.path.exists(self.base_path):
            return
        for name in os.listdir(self.base_path):
            path = self._get_col_path(name)
            if not os.path.isdir(path):
                continue
            cfg_path = os.path.join(path, 'config.json')
            if not os.path.exists(cfg_path):
                continue
            try:
                with open(cfg_path, 'r') as f:
                    self.dimensions[name] = json.load(f).get('dimension')
            except (OSError, ValueError):
                # Unreadable or corrupt config: skip it; the dimension is
                # re-learned on the next insert into that collection.
                pass

    def _ensure_collection(self, name: str) -> "leanvec.LeanDB":
        """Return the native DB handle for *name*, creating it on first use."""
        self.last_access_time = time.time()
        if name not in self.collections:
            path = self._get_col_path(name)
            os.makedirs(path, exist_ok=True)
            self.collections[name] = leanvec.LeanDB(path)
        return self.collections[name]

    def list_collections(self) -> List[str]:
        """Return all known collection names (open handles plus on-disk ones)."""
        return list(set(self.collections) | set(self.dimensions))

    def store_embedding(self, embedding: List[float], metadata_dict: Optional[Dict[str, Any]] = None, collection: str = "default", ttl: Optional[int] = None) -> str:
        """Store a single embedding with optional TTL (in seconds); return its id."""
        metadatas = [metadata_dict] if metadata_dict is not None else None
        return self.store_embeddings_batch([embedding], metadatas, collection=collection, ttl=ttl)[0]

    def store_embeddings_batch(self, embeddings: List[List[float]], metadatas: Optional[List[Dict[str, Any]]] = None, collection: str = "default", ttl: Optional[int] = None) -> List[str]:
        """Store many embeddings; return one document id per embedding.

        Raises:
            ValueError: on embedding-dimension mismatch with the collection,
                or when ``metadatas`` is given with a different length than
                ``embeddings`` (previously this silently truncated via zip).
        """
        if not embeddings:
            return []
        if metadatas is not None and len(metadatas) != len(embeddings):
            raise ValueError(
                f"Got {len(embeddings)} embeddings but {len(metadatas)} metadatas"
            )

        db = self._ensure_collection(collection)
        input_dim = len(embeddings[0])

        # Dimension validation and persistence: the first insert pins the
        # collection's dimension in config.json for future sessions.
        if collection not in self.dimensions:
            self.dimensions[collection] = input_dim
            with open(os.path.join(self._get_col_path(collection), 'config.json'), 'w') as f:
                json.dump({'dimension': input_dim}, f)
        elif input_dim != self.dimensions[collection]:
            raise ValueError(f"Dimension Mismatch: Expected {self.dimensions[collection]}, Got {input_dim}")

        if metadatas is None:
            metadatas = [{} for _ in range(len(embeddings))]

        ids = []
        for vec, meta_orig in zip(embeddings, metadatas):
            # Copy metadata so callers passing a shared dict (or a batch list
            # of the same dict object) are not mutated by the "id" insertion.
            meta = copy.deepcopy(meta_orig)

            # Honor an explicit 'id' or '_id' key, else mint a UUID.
            doc_id = str(meta.get("id") or meta.get("_id") or uuid.uuid4())
            meta["id"] = doc_id
            ids.append(doc_id)

            db.add(doc_id, vec, json.dumps(meta), ttl)

        return ids

    def search(self, query_embedding: List[float], k: int = 5, filters: Optional[Dict[str, Any]] = None, collection: str = "default", autocut: bool = False) -> List[Dict[str, Any]]:
        """Return up to *k* nearest neighbors as dicts with id/score/metadata.

        When *autocut* is True, truncate results at the first large relative
        jump in score (see ``_calculate_autocut``).
        """
        db = self._ensure_collection(collection)
        filter_str = json.dumps(filters) if filters else None
        raw_results = db.search(query_embedding, k, filter_str)

        results = []
        for doc_id, score, meta_str in raw_results:
            try:
                meta = json.loads(meta_str)
            except (TypeError, json.JSONDecodeError):
                meta = {}  # tolerate missing/corrupt metadata rather than failing the search

            results.append({
                "id": doc_id,
                "score": score,
                "metadata": meta
            })

        if autocut and len(results) > 1:
            scores = [r["score"] for r in results]
            cut_idx = self._calculate_autocut(scores)
            if cut_idx is not None:
                results = results[:cut_idx]
        return results

    def _calculate_autocut(self, scores: List[float]) -> Optional[int]:
        """Return the index of the first >20% relative score jump, or None.

        Assumes scores are ascending distances (lower is better) — a large
        relative gap marks where relevance drops off.
        """
        for i in range(1, len(scores)):
            if scores[i - 1] > 0 and (scores[i] - scores[i - 1]) / scores[i - 1] > 0.2:
                return i
        return None

    def delete(self, metadata_filter: Dict[str, Any], collection: str = "default") -> int:
        """Delete documents matching *metadata_filter*; return the count removed."""
        # Avoid creating an empty collection directory just to delete nothing.
        if collection not in self.collections and collection not in self.dimensions:
            return 0
        db = self._ensure_collection(collection)
        return db.delete_by_filter(json.dumps(metadata_filter))

    def count(self, collection: str = "default") -> int:
        """Return the number of documents in *collection* (0 if unknown)."""
        if collection not in self.collections and collection not in self.dimensions:
            return 0
        return self._ensure_collection(collection).count()

    def persist_all(self):
        """Best-effort persist of every open collection (used at exit)."""
        # Base path may already be gone during interpreter shutdown/cleanup;
        # bail out to avoid OS Error 2.
        if not os.path.exists(self.base_path):
            return

        gc.collect()  # drop dangling references once, before the native persists
        for name, db in list(self.collections.items()):
            try:
                db.persist()
            except Exception:
                # A failed persist of one collection must not abort the rest.
                pass

    def persist(self, collection: str = "default"):
        """Persist a specific collection."""
        if collection in self.collections:
            self.collections[collection].persist()

    def close(self):
        """Stop the maintenance thread and persist all collections."""
        self.stop_maintenance = True
        self.persist_all()

    def _maintenance_loop(self):
        """Daemon loop: after long uptime and sufficient idleness, persist all."""
        while not self.stop_maintenance:
            time.sleep(1)
            uptime = time.time() - self.start_time
            idle_time = time.time() - self.last_access_time
            if uptime > self.maintenance_interval and idle_time > self.idle_threshold:
                self.persist_all()
                self.start_time = time.time()  # restart the maintenance clock

    def vacuum(self, collection: str = "default"):
        """Stop-the-world compaction for a specific collection."""
        db = self._ensure_collection(collection)
        db.vacuum()