opencode-semantic-memory 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,199 @@
1
+ """Vector storage using LanceDB for semantic search."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, Any
7
+
8
+ if TYPE_CHECKING:
9
+ import lancedb
10
+
11
+
12
class VectorStorage:
    """LanceDB-based vector storage for semantic search.

    The LanceDB connection is opened lazily the first time ``db`` is read.
    Every record lives in a single table named ``memories`` with the columns
    ``id``, ``memory_id``, ``text`` and ``vector``.
    """

    def __init__(self, vectors_path: Path, embedding_dim: int = 384):
        """Store the configuration; nothing is opened or created yet.

        Args:
            vectors_path: Directory passed to ``lancedb.connect``. It is
                created on first use of ``db`` if it does not exist.
            embedding_dim: Length of the placeholder vector used to create
                the ``memories`` table.
        """
        self.vectors_path = vectors_path
        self.embedding_dim = embedding_dim
        self._db: lancedb.DBConnection | None = None

    @staticmethod
    def _sql_string(value: str) -> str:
        """Return *value* as a single-quoted SQL string literal."""
        # Doubling embedded quotes keeps an id such as "o'brien" from ending
        # the literal early and changing the meaning of the predicate.
        return "'" + str(value).replace("'", "''") + "'"

    @staticmethod
    def _like_prefix_predicate(id_prefix: str) -> str:
        """Build the ``id LIKE '<prefix>%'`` predicate for *id_prefix*."""
        # "_" and "%" are LIKE wildcards, so they are backslash-escaped to be
        # matched literally; only the trailing "%" acts as a wildcard.
        escaped = id_prefix.replace("'", "''").replace("%", "\\%").replace("_", "\\_")
        return f"id LIKE '{escaped}%'"

    @property
    def db(self) -> Any:
        """Return the LanceDB connection, opening it on first access.

        The first access creates ``vectors_path``, connects, and makes sure
        the ``memories`` table exists.

        Raises:
            ImportError: If ``lancedb`` is not installed.
            Exception: Whatever connecting or creating the table raises.
        """
        if self._db is None:
            # Imported here so that importing this module does not require
            # lancedb to be installed.
            import lancedb

            self.vectors_path.mkdir(parents=True, exist_ok=True)
            self._db = lancedb.connect(str(self.vectors_path))
            try:
                self._init_table()
            except Exception:
                # Do not cache a connection whose table setup failed;
                # otherwise every later access would skip the setup and
                # fail when opening "memories".
                self._db = None
                raise
        return self._db

    def _init_table(self) -> None:
        """Create the ``memories`` table if it does not exist yet."""
        list_tables = getattr(self._db, "list_tables", None)
        if callable(list_tables):
            listing = list_tables()
        else:
            # Fallback for connections that do not offer ``list_tables``.
            listing = self._db.table_names()
        # ``list_tables`` may return either a plain sequence of names or a
        # response object carrying them in ``.tables``.
        table_names = listing.tables if hasattr(listing, "tables") else listing

        if "memories" not in table_names:
            # The table schema is inferred from a placeholder row, which is
            # removed again right after creation.
            self._db.create_table(
                "memories",
                data=[
                    {
                        "id": "__init__",
                        "memory_id": 0,
                        "text": "",
                        "vector": [0.0] * self.embedding_dim,
                    }
                ],
            )
            self._db["memories"].delete("id = '__init__'")

    def add(self, record_id: str, memory_id: int, text: str, vector: list[float]) -> None:
        """Add a single vector record to the store.

        Args:
            record_id: Value stored in the ``id`` column.
            memory_id: Value stored in the ``memory_id`` column.
            text: Value stored in the ``text`` column.
            vector: Embedding stored in the ``vector`` column.
        """
        row = {
            "id": record_id,
            "memory_id": memory_id,
            "text": text,
            "vector": vector,
        }
        self.db["memories"].add([row])

    def search(self, query_vector: list[float], limit: int = 20) -> list[dict[str, Any]]:
        """Return up to *limit* records most similar to *query_vector*.

        Args:
            query_vector: Embedding to search with.
            limit: Maximum number of rows to return.

        Returns:
            The rows produced by LanceDB's vector search, as dictionaries.
        """
        table = self.db["memories"]
        return table.search(query_vector).limit(limit).to_list()

    def delete(self, record_id: str) -> None:
        """Delete the record whose ``id`` equals *record_id*."""
        table = self.db["memories"]
        table.delete(f"id = {self._sql_string(record_id)}")

    def delete_by_memory_id(self, memory_id: int) -> None:
        """Delete all vectors belonging to *memory_id*.

        Raises:
            ValueError: If *memory_id* cannot be converted to an integer.
        """
        table = self.db["memories"]
        # int() guarantees that only a number is spliced into the predicate.
        table.delete(f"memory_id = {int(memory_id)}")

    def delete_by_prefix(self, id_prefix: str) -> int:
        """Delete all vectors whose ``id`` starts with *id_prefix*.

        A ``LIKE`` predicate is used so the filtering happens inside LanceDB.
        If that fails for any reason, the table is loaded through pandas and
        the matching ids are deleted one by one.

        Args:
            id_prefix: Non-empty prefix made only of ASCII letters, digits
                and the characters ``_ : / -``.

        Returns:
            The number of matching records, or 0 when the count could not be
            determined or both deletion strategies failed (best effort).

        Raises:
            ValueError: If *id_prefix* is empty or has other characters.
        """
        import re

        # fullmatch (unlike match with "$") also rejects a trailing newline.
        if not re.fullmatch(r"[a-zA-Z0-9_:/-]+", id_prefix):
            raise ValueError(f"Invalid id_prefix: {id_prefix}")

        table = self.db["memories"]
        predicate = self._like_prefix_predicate(id_prefix)
        try:
            # The count is only needed for the return value, so a failure
            # here must not prevent the deletion itself.
            try:
                count = table.count_rows(predicate)
            except Exception:
                count = 0

            table.delete(predicate)
            return count
        except Exception:
            # Fallback when the predicate-based delete is not usable.
            try:
                ids = table.to_pandas()["id"]
                matching_ids = ids[ids.str.startswith(id_prefix)].tolist()
                for record_id in matching_ids:
                    table.delete(f"id = {self._sql_string(record_id)}")
                return len(matching_ids)
            except Exception:
                return 0

    def cleanup_old_versions(self, keep_versions: int = 10) -> dict[str, Any]:
        """Clean up old LanceDB table versions to reclaim disk space.

        LanceDB prunes versions by age, so the age of the oldest version to
        keep is used as the threshold. Data files are compacted afterwards.

        Args:
            keep_versions: Number of most recent versions to keep (default
                10). Values below 1 are treated as 1.

        Returns:
            A dict with ``status`` set to ``"skipped"``, ``"success"`` or
            ``"error"`` plus details about the operation. Errors are
            reported through the dict instead of being raised.
        """
        from datetime import datetime, timedelta, timezone

        # Index ``keep - 1`` below must never wrap around to the oldest
        # version, which is what a value of 0 would do.
        keep = max(1, keep_versions)

        try:
            table = self.db.open_table("memories")
            versions = table.list_versions()

            if len(versions) <= keep:
                return {
                    "status": "skipped",
                    "reason": f"Only {len(versions)} versions exist, keeping all",
                    "versions_before": len(versions),
                }

            newest_first = sorted(versions, key=lambda v: v["version"], reverse=True)
            cutoff_time = newest_first[keep - 1].get("timestamp")

            # Default threshold when no usable timestamp is available.
            older_than = timedelta(days=1)
            if cutoff_time:
                # astimezone() interprets a naive datetime as local time.
                # NOTE(review): LanceDB appears to report naive local
                # timestamps from list_versions(); confirm for the pinned
                # version.
                cutoff_utc = cutoff_time.astimezone(timezone.utc)
                age_seconds = (datetime.now(timezone.utc) - cutoff_utc).total_seconds()
                if age_seconds > 0:
                    older_than = timedelta(seconds=max(1, age_seconds))

            # NOTE(review): delete_unverified=True presumably also removes
            # files LanceDB cannot attribute to a finished write; confirm no
            # other process writes to the table while this runs.
            stats = table.cleanup_old_versions(older_than=older_than, delete_unverified=True)

            # Also compact files to optimize storage.
            compact_stats = table.compact_files()

            versions_after = len(table.list_versions())

            return {
                "status": "success",
                "versions_before": len(versions),
                "versions_after": versions_after,
                "versions_removed": len(versions) - versions_after,
                "bytes_freed": getattr(stats, "bytes_removed", 0),
                "files_compacted": getattr(compact_stats, "files_removed", 0),
            }
        except ImportError:
            return {
                "status": "error",
                "error": "pylance not installed - run: uv pip install pylance",
            }
        except Exception as e:
            return {
                "status": "error",
                "error": str(e),
            }