opencode-semantic-memory 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opencode_memory/__init__.py +3 -0
- opencode_memory/cache.py +261 -0
- opencode_memory/cli.py +794 -0
- opencode_memory/config.py +89 -0
- opencode_memory/daemon.py +879 -0
- opencode_memory/enrichment/__init__.py +0 -0
- opencode_memory/enrichment/gitlab.py +237 -0
- opencode_memory/extraction.py +225 -0
- opencode_memory/historical_ingest.py +142 -0
- opencode_memory/http_server.py +464 -0
- opencode_memory/ingestion/__init__.py +7 -0
- opencode_memory/ingestion/embeddings.py +211 -0
- opencode_memory/ingestion/extractors.py +287 -0
- opencode_memory/ingestion/opencode_db.py +448 -0
- opencode_memory/ingestion/parser.py +344 -0
- opencode_memory/ingestion/watcher.py +88 -0
- opencode_memory/linking/__init__.py +5 -0
- opencode_memory/linking/linker.py +323 -0
- opencode_memory/metrics.py +273 -0
- opencode_memory/models.py +171 -0
- opencode_memory/project.py +86 -0
- opencode_memory/query/__init__.py +5 -0
- opencode_memory/query/hybrid.py +196 -0
- opencode_memory/server.py +2795 -0
- opencode_memory/session/__init__.py +5 -0
- opencode_memory/session/registry.py +57 -0
- opencode_memory/storage/__init__.py +6 -0
- opencode_memory/storage/sqlite.py +1608 -0
- opencode_memory/storage/vectors.py +199 -0
- opencode_semantic_memory-0.1.0.dist-info/METADATA +531 -0
- opencode_semantic_memory-0.1.0.dist-info/RECORD +33 -0
- opencode_semantic_memory-0.1.0.dist-info/WHEEL +4 -0
- opencode_semantic_memory-0.1.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
"""Vector storage using LanceDB for semantic search."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING, Any
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
import lancedb
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class VectorStorage:
    """LanceDB-based vector storage for semantic search.

    All vectors live in a single table (``TABLE_NAME``) with the columns
    ``id`` (string), ``memory_id`` (integer), ``text`` (string) and
    ``vector`` (list of ``embedding_dim`` floats).  The connection is opened
    lazily on first access of :attr:`db`, so constructing an instance does
    not touch the filesystem.
    """

    # Name of the single LanceDB table that holds every stored vector.
    TABLE_NAME = "memories"

    def __init__(self, vectors_path: Path, embedding_dim: int = 384):
        """Remember where the database lives; nothing is opened yet.

        Args:
            vectors_path: Directory that holds (or will hold) the LanceDB data.
            embedding_dim: Length of every stored vector.
        """
        self.vectors_path = vectors_path
        self.embedding_dim = embedding_dim
        self._db: lancedb.DBConnection | None = None

    @staticmethod
    def _sql_literal(value: str) -> str:
        """Return *value* as a single-quoted SQL string literal."""
        # Doubling embedded quotes is the standard SQL escape; without it an
        # id containing "'" would terminate the literal early.
        return "'" + str(value).replace("'", "''") + "'"

    def _table(self) -> Any:
        """Open the vectors table (connecting first if necessary)."""
        return self.db.open_table(self.TABLE_NAME)

    @property
    def db(self) -> Any:
        """Lazy-load the LanceDB connection and make sure the table exists."""
        if self._db is None:
            import lancedb

            self.vectors_path.mkdir(parents=True, exist_ok=True)
            self._db = lancedb.connect(str(self.vectors_path))
            try:
                self._init_table()
            except Exception:
                # Do not cache a connection whose table was never set up;
                # otherwise later accesses would skip initialization.
                self._db = None
                raise
        return self._db

    def _init_table(self) -> None:
        """Create the vectors table if it doesn't exist yet."""
        existing_tables = self._db.list_tables()
        # list_tables() may return either a plain sequence of names or a
        # response object exposing them as ``.tables``.
        table_names = (
            existing_tables.tables if hasattr(existing_tables, "tables") else existing_tables
        )
        if self.TABLE_NAME in table_names:
            return

        # The table is created from one placeholder row so the schema is
        # inferred from real values; the placeholder is removed right after.
        self._db.create_table(
            self.TABLE_NAME,
            data=[
                {
                    "id": "__init__",
                    "memory_id": 0,
                    "text": "",
                    "vector": [0.0] * self.embedding_dim,
                }
            ],
        )
        self._db.open_table(self.TABLE_NAME).delete("id = '__init__'")

    def add(self, record_id: str, memory_id: int, text: str, vector: list[float]) -> None:
        """Add a single vector record to the store."""
        self._table().add(
            [
                {
                    "id": record_id,
                    "memory_id": memory_id,
                    "text": text,
                    "vector": vector,
                }
            ]
        )

    def search(self, query_vector: list[float], limit: int = 20) -> list[dict[str, Any]]:
        """Return up to *limit* records most similar to *query_vector*."""
        return self._table().search(query_vector).limit(limit).to_list()

    def delete(self, record_id: str) -> None:
        """Delete the vector whose ``id`` equals *record_id*."""
        # record_id is escaped so that quotes inside it cannot alter the
        # predicate (and thereby delete unrelated rows).
        self._table().delete(f"id = {self._sql_literal(record_id)}")

    def delete_by_memory_id(self, memory_id: int) -> None:
        """Delete all vectors for a memory.

        Raises:
            ValueError: If *memory_id* cannot be interpreted as an integer.
            TypeError: If *memory_id* is of a type ``int()`` does not accept.
        """
        # int() guarantees that only a number is interpolated into the
        # predicate, whatever the caller passed.
        self._table().delete(f"memory_id = {int(memory_id)}")

    def delete_by_prefix(self, id_prefix: str) -> int:
        """Delete all vectors with IDs starting with the given prefix.

        Uses a SQL ``LIKE`` predicate so matching rows are filtered by the
        database; if that fails, falls back to loading the ids with pandas
        and deleting the matches one by one.

        Args:
            id_prefix: Non-empty prefix made of letters, digits and ``_:/-``.

        Returns:
            The number of matching rows (0 when it could not be determined
            or when both strategies failed).

        Raises:
            ValueError: If *id_prefix* contains any other character.
        """
        import logging
        import re

        # fullmatch (rather than match with ``$``) also rejects a trailing
        # newline, which ``$`` would silently accept.
        if not re.fullmatch(r"[a-zA-Z0-9_:/-]+", id_prefix):
            raise ValueError(f"Invalid id_prefix: {id_prefix}")

        logger = logging.getLogger(__name__)
        table = self._table()

        # After validation "_" is the only LIKE wildcard the prefix can
        # contain ("%" and "'" are rejected above), so it is the only
        # character that needs escaping.
        like_prefix = id_prefix.replace("_", "\\_")
        predicate = f"id LIKE '{like_prefix}%'"

        try:
            # The count is informational only; failing to obtain it must not
            # prevent the delete itself.
            try:
                count = table.count_rows(predicate)
            except Exception:
                count = 0
            table.delete(predicate)
            return count
        except Exception:
            logger.warning(
                "LIKE-based delete failed for prefix %r; falling back to per-id delete",
                id_prefix,
                exc_info=True,
            )

        # Fallback: scan the ids client-side and delete matches individually.
        deleted = 0
        try:
            ids = table.to_pandas()["id"]
            matching_ids = ids[ids.str.startswith(id_prefix)].tolist()
            for record_id in matching_ids:
                table.delete(f"id = {self._sql_literal(record_id)}")
                deleted += 1
        except Exception:
            logger.warning(
                "Fallback delete failed for prefix %r after %d deletions",
                id_prefix,
                deleted,
                exc_info=True,
            )
        return deleted

    def cleanup_old_versions(self, keep_versions: int = 10) -> dict[str, Any]:
        """Clean up old LanceDB versions to reclaim disk space.

        Args:
            keep_versions: Number of recent versions to keep (default 10).
                Values below 1 are treated as 1.

        Returns:
            Stats about the cleanup operation.  ``status`` is one of
            ``"skipped"``, ``"success"`` or ``"error"``; errors are reported
            in the dict rather than raised.
        """
        # Imported outside the try block so that a problem here can never be
        # misreported as a missing pylance installation.
        from datetime import datetime, timedelta, timezone

        # At least one version must survive; this also keeps the index below
        # from wrapping around to the oldest version.
        keep = max(1, keep_versions)

        try:
            table = self.db.open_table(self.TABLE_NAME)
            versions = table.list_versions()

            if len(versions) <= keep:
                return {
                    "status": "skipped",
                    "reason": f"Only {len(versions)} versions exist, keeping all",
                    "versions_before": len(versions),
                }

            # The cleanup API takes an age threshold, not a count, so the age
            # of the oldest version we want to keep is used as the threshold.
            versions_sorted = sorted(versions, key=lambda v: v["version"], reverse=True)
            cutoff_version = versions_sorted[keep - 1]

            older_than = timedelta(days=1)  # default when no usable timestamp
            cutoff_time = cutoff_version.get("timestamp")
            if cutoff_time:
                # NOTE(review): naive timestamps are assumed to be UTC here;
                # confirm what list_versions() actually returns.
                if cutoff_time.tzinfo is None:
                    cutoff_time = cutoff_time.replace(tzinfo=timezone.utc)
                age_seconds = (datetime.now(timezone.utc) - cutoff_time).total_seconds()
                if age_seconds > 0:
                    older_than = timedelta(seconds=max(1, age_seconds))

            stats = table.cleanup_old_versions(older_than=older_than, delete_unverified=True)

            # Also compact files to optimize storage.
            compact_stats = table.compact_files()

            versions_after = len(table.list_versions())

            return {
                "status": "success",
                "versions_before": len(versions),
                "versions_after": versions_after,
                "versions_removed": len(versions) - versions_after,
                "bytes_freed": getattr(stats, "bytes_removed", 0),
                "files_compacted": getattr(compact_stats, "files_removed", 0),
            }
        except ImportError:
            return {
                "status": "error",
                "error": "pylance not installed - run: uv pip install pylance",
            }
        except Exception as e:
            return {
                "status": "error",
                "error": str(e),
            }
|