flurryx-code-memory 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_memory/__init__.py +1 -0
- code_memory/claims/__init__.py +32 -0
- code_memory/claims/extractor.py +325 -0
- code_memory/claims/indexer.py +258 -0
- code_memory/claims/resolver.py +186 -0
- code_memory/claims/store.py +424 -0
- code_memory/cli.py +1192 -0
- code_memory/config.py +268 -0
- code_memory/embed/__init__.py +224 -0
- code_memory/embed/cache.py +204 -0
- code_memory/embed/m3.py +174 -0
- code_memory/embed/ollama.py +92 -0
- code_memory/embed/tei.py +106 -0
- code_memory/episodic/__init__.py +3 -0
- code_memory/episodic/sqlite_store.py +278 -0
- code_memory/extractor/__init__.py +3 -0
- code_memory/extractor/csproj.py +166 -0
- code_memory/extractor/dll.py +385 -0
- code_memory/extractor/gitignore.py +162 -0
- code_memory/extractor/nuget.py +275 -0
- code_memory/extractor/sanity.py +124 -0
- code_memory/extractor/sln.py +108 -0
- code_memory/extractor/treesitter.py +1172 -0
- code_memory/graph/__init__.py +3 -0
- code_memory/graph/falkor_store.py +740 -0
- code_memory/mcp_server.py +1816 -0
- code_memory/metrics.py +260 -0
- code_memory/orchestrator/__init__.py +13 -0
- code_memory/orchestrator/git_delta.py +211 -0
- code_memory/orchestrator/ingest_state.py +71 -0
- code_memory/orchestrator/pipeline.py +1478 -0
- code_memory/orchestrator/reset.py +130 -0
- code_memory/orchestrator/resolver.py +825 -0
- code_memory/orchestrator/retrieve.py +505 -0
- code_memory/resilience.py +73 -0
- code_memory/sync/__init__.py +20 -0
- code_memory/sync/autostart/__init__.py +42 -0
- code_memory/sync/autostart/base.py +106 -0
- code_memory/sync/autostart/launchd.py +115 -0
- code_memory/sync/autostart/schtasks.py +155 -0
- code_memory/sync/autostart/systemd.py +113 -0
- code_memory/sync/hooks.py +164 -0
- code_memory/sync/safety.py +65 -0
- code_memory/sync/snapshot.py +461 -0
- code_memory/sync/store.py +399 -0
- code_memory/sync/sync.py +405 -0
- code_memory/sync/watcher.py +320 -0
- code_memory/vector/__init__.py +3 -0
- code_memory/vector/qdrant_store.py +302 -0
- flurryx_code_memory-0.4.0.dist-info/METADATA +26 -0
- flurryx_code_memory-0.4.0.dist-info/RECORD +53 -0
- flurryx_code_memory-0.4.0.dist-info/WHEEL +4 -0
- flurryx_code_memory-0.4.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
"""Cross-platform filesystem watcher.
|
|
2
|
+
|
|
3
|
+
Uses ``watchdog`` (FSEvents on macOS, inotify on Linux, ReadDirectoryChangesW
|
|
4
|
+
on Windows) when available; otherwise falls back to mtime polling so the
|
|
5
|
+
feature degrades gracefully even when the optional dep is missing.
|
|
6
|
+
|
|
7
|
+
The watcher debounces bursts (e.g. ``git checkout`` touches many files at
|
|
8
|
+
once) and triggers a single ``sync_repo`` per quiet period. Excluded paths
|
|
9
|
+
include ``.git`` (apart from HEAD / branch-ref changes, which still trigger a sync) and the other root-level directories listed in ``EXCLUDED_ROOT_DIRS``.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import logging
|
|
15
|
+
import threading
|
|
16
|
+
import time
|
|
17
|
+
from collections.abc import Callable
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
from .sync import SyncResult, sync_repo
|
|
21
|
+
|
|
22
|
+
log = logging.getLogger("codememory.watcher")

# Quiet period, in seconds, that must elapse after the last change before a
# sync is fired.
DEFAULT_DEBOUNCE = 2.0
# Pause, in seconds, between two scans of the mtime-polling fallback.
DEFAULT_POLL_INTERVAL = 5.0

# Predicate over a path; ``True`` means "ignore this path".
ExcludeFn = Callable[[Path], bool]


# Directory names that are ignored when they sit directly under the repo root.
# Order is kept stable; entries are grouped by where they come from.
EXCLUDED_ROOT_DIRS: tuple[str, ...] = (
    # version control and project state
    # ("data" is presumably the tool's own state directory -- TODO confirm)
    ".git", "data",
    # virtualenvs and package roots
    ".venv", "node_modules",
    # build outputs
    "dist", "out-tsc", "build", "target", "coverage",
    # framework / bundler caches
    ".angular", ".next", ".nuxt", ".svelte-kit", ".turbo", ".parcel-cache", ".cache",
    # Python caches
    "__pycache__", ".pytest_cache", ".mypy_cache", ".ruff_cache", ".tox",
    # editor metadata
    ".idea", ".vscode",
    # agentic tool caches (high write churn, no source value)
    ".opencode", ".serena", ".claude", ".cursor", ".windsurf", ".clavix",
)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _default_exclude(repo: Path) -> ExcludeFn:
|
|
71
|
+
repo_root = repo.resolve()
|
|
72
|
+
blocked = tuple((repo_root / name).resolve() for name in EXCLUDED_ROOT_DIRS)
|
|
73
|
+
|
|
74
|
+
def exclude(p: Path) -> bool:
|
|
75
|
+
try:
|
|
76
|
+
r = p.resolve()
|
|
77
|
+
except OSError:
|
|
78
|
+
return True
|
|
79
|
+
for b in blocked:
|
|
80
|
+
try:
|
|
81
|
+
r.relative_to(b)
|
|
82
|
+
return True
|
|
83
|
+
except ValueError:
|
|
84
|
+
continue
|
|
85
|
+
return False
|
|
86
|
+
|
|
87
|
+
return exclude
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class Debouncer:
    """Coalesce bursts; fire ``flush`` once activity quiets for ``window`` seconds.

    Every :meth:`bump` re-arms a one-shot daemon ``threading.Timer``. When a
    timer finally expires without having been superseded, ``flush`` is called
    once on the timer thread. Exceptions raised by ``flush`` are logged and
    swallowed.

    Args:
        window: Quiet period in seconds.
        flush: Zero-argument callback invoked after the quiet period.
    """

    def __init__(
        self,
        window: float,
        flush: Callable[[], None],
    ) -> None:
        self.window = window
        self.flush = flush
        self._lock = threading.Lock()
        self._timer: threading.Timer | None = None
        self._dirty = False
        # Incremented on every re-arm and cancel. ``Timer.cancel()`` cannot
        # stop a timer whose wait has already elapsed, so each timer carries
        # the generation it was armed with and stands down if it is stale.
        self._generation = 0

    def bump(self) -> None:
        """Record activity and restart the quiet-period countdown."""
        with self._lock:
            self._dirty = True
            self._generation += 1
            if self._timer is not None:
                self._timer.cancel()
            timer = threading.Timer(self.window, self._fire, args=(self._generation,))
            timer.daemon = True
            self._timer = timer
            timer.start()

    def _fire(self, generation: int | None = None) -> None:
        """Run ``flush`` if there is pending activity.

        Args:
            generation: Generation the calling timer was armed with. ``None``
                (a direct call) skips the staleness check.
        """
        with self._lock:
            if generation is not None and generation != self._generation:
                # Superseded by a later bump() or by cancel().
                return
            if not self._dirty:
                return
            self._dirty = False
        # Run the callback outside the lock so bump() never blocks on it.
        try:
            self.flush()
        except Exception:  # noqa: BLE001
            log.exception("debounced flush failed")

    def cancel(self) -> None:
        """Drop any pending flush and stop the current timer."""
        with self._lock:
            # Invalidate in-flight timers and forget pending activity so
            # nothing flushes after the owner asked us to stop.
            self._generation += 1
            self._dirty = False
            if self._timer is not None:
                self._timer.cancel()
                self._timer = None
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class Watcher:
    """Long-running watcher for a single repo.

    Relevant changes under ``repo`` are funnelled through a
    :class:`Debouncer`, which calls ``sync_repo`` once per quiet period.
    Native events come from ``watchdog`` when it can be imported; otherwise
    the tree is polled for mtime / git HEAD changes every ``poll_interval``
    seconds.

    Args:
        repo: Repository root to watch (resolved on construction).
        project: Passed through to ``sync_repo`` unchanged.
        debounce: Quiet period in seconds before a sync fires.
        poll_interval: Seconds between scans in polling mode.
        on_sync: Optional callback receiving each ``SyncResult``; exceptions
            it raises are logged and swallowed.
    """

    # watchdog event types that only report a read of a file. They never
    # change content, and reacting to them means any reader of the tree can
    # re-arm the debouncer (presumably including the sync itself -- verify
    # against sync_repo).
    _READ_ONLY_EVENT_TYPES = frozenset({"opened", "closed_no_write"})

    def __init__(
        self,
        repo: Path,
        *,
        project: str | None = None,
        debounce: float = DEFAULT_DEBOUNCE,
        poll_interval: float = DEFAULT_POLL_INTERVAL,
        on_sync: Callable[[SyncResult], None] | None = None,
    ) -> None:
        self.repo = Path(repo).resolve()
        self.project = project
        self.debounce_window = debounce
        self.poll_interval = poll_interval
        self.on_sync = on_sync
        self.exclude = _default_exclude(self.repo)
        self._stop = threading.Event()
        self._thread: threading.Thread | None = None
        self._debouncer = Debouncer(debounce, self._trigger_sync)

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def start(self, *, blocking: bool = False) -> None:
        """Start watching; no-op when the background thread is already alive.

        Args:
            blocking: Run the watch loop on the calling thread and return
                only once :meth:`stop` has been called from elsewhere.
        """
        if self._thread is not None and self._thread.is_alive():
            return
        self._stop.clear()
        target = self._run_watchdog if _watchdog_available() else self._run_poll
        if blocking:
            target()
            return
        self._thread = threading.Thread(target=target, name="cm-watch", daemon=True)
        self._thread.start()

    def stop(self) -> None:
        """Signal the watch loop to exit, cancel any pending sync, and join."""
        self._stop.set()
        self._debouncer.cancel()
        if self._thread is not None:
            # Bounded join: the thread is a daemon, so a stuck loop cannot
            # keep the process alive.
            self._thread.join(timeout=5)
            self._thread = None

    def is_running(self) -> bool:
        """Return ``True`` while the background watch thread is alive."""
        return self._thread is not None and self._thread.is_alive()

    # ------------------------------------------------------------------
    # Implementations
    # ------------------------------------------------------------------

    def _run_watchdog(self) -> None:
        """Watch loop backed by a recursive ``watchdog`` observer."""
        from watchdog.events import FileSystemEventHandler
        from watchdog.observers import Observer

        watcher = self

        class _Handler(FileSystemEventHandler):
            def on_any_event(self, event):  # noqa: ANN001 - lib type
                if event.is_directory:
                    return
                if getattr(event, "event_type", None) in watcher._READ_ONLY_EVENT_TYPES:
                    return
                # A move touches two locations. Look at both so that a file
                # moved *into* an excluded directory still counts as a
                # change at its old location.
                for raw in (event.src_path, getattr(event, "dest_path", None)):
                    if raw:
                        watcher._handle_path(Path(raw))

        observer = Observer()
        observer.schedule(_Handler(), str(self.repo), recursive=True)
        observer.start()
        log.info("watcher started (watchdog) on %s", self.repo)
        try:
            # Wake up twice a second purely to notice the stop flag.
            while not self._stop.wait(0.5):
                pass
        finally:
            observer.stop()
            observer.join(timeout=3)
            log.info("watcher stopped (watchdog)")

    def _run_poll(self) -> None:
        """Fallback watch loop: compare newest mtime and git HEAD each interval."""
        log.info(
            "watchdog not installed; falling back to mtime polling on %s "
            "(install `watchdog` for native events)",
            self.repo,
        )
        last_mtime = self._max_mtime()
        last_head = self._git_head()
        while not self._stop.wait(self.poll_interval):
            mtime = self._max_mtime()
            head = self._git_head()
            if mtime != last_mtime or head != last_head:
                last_mtime = mtime
                last_head = head
                self._debouncer.bump()

    def _max_mtime(self) -> float:
        """Return the newest ``st_mtime`` among non-excluded files (0.0 if none)."""
        latest = 0.0
        for root, dirs, files in _safe_walk(self.repo):
            r = Path(root)
            # Prune in place so the walk never descends into excluded dirs.
            dirs[:] = [d for d in dirs if not self.exclude(r / d)]
            for name in files:
                p = r / name
                if self.exclude(p):
                    continue
                try:
                    mt = p.stat().st_mtime
                except OSError:
                    # The file vanished between listing and stat.
                    continue
                if mt > latest:
                    latest = mt
        return latest

    def _git_head(self) -> str | None:
        """Return the current HEAD sha, or ``None`` outside git or on any error."""
        try:
            from ..orchestrator import git_delta

            return git_delta.head_sha(self.repo) if git_delta.is_git_repo(self.repo) else None
        except Exception:  # noqa: BLE001
            return None

    # ------------------------------------------------------------------
    # Event routing
    # ------------------------------------------------------------------

    def _is_ref_event(self, path: Path) -> bool:
        """True when ``path`` is a git ref whose change should re-sync."""
        try:
            rel = path.resolve().relative_to(self.repo)
        except (OSError, ValueError):
            return False
        parts = rel.parts
        if not parts or parts[0] != ".git":
            return False
        if parts == (".git", "HEAD"):
            return True
        # .git/refs/heads/<branch...>
        if len(parts) >= 4 and parts[1:3] == ("refs", "heads"):
            return True
        return False

    def _handle_path(self, path: Path) -> None:
        """Decide whether ``path`` should trigger a debounced sync."""
        # Ref changes are checked first because ``.git`` itself is excluded.
        if self._is_ref_event(path):
            self._debouncer.bump()
            return
        if self.exclude(path):
            return
        self._debouncer.bump()

    # ------------------------------------------------------------------
    # Sync
    # ------------------------------------------------------------------

    def _trigger_sync(self) -> None:
        """Run one sync and hand the result to ``on_sync``; never raises."""
        log.debug("watcher firing sync for %s", self.repo)
        try:
            result = sync_repo(self.repo, project=self.project, trigger="watcher")
        except Exception:  # noqa: BLE001
            log.exception("watcher sync failed")
            return
        if self.on_sync:
            try:
                self.on_sync(result)
            except Exception:  # noqa: BLE001
                log.exception("on_sync callback raised")
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def _watchdog_available() -> bool:
|
|
294
|
+
try:
|
|
295
|
+
import watchdog # noqa: F401
|
|
296
|
+
import watchdog.observers # noqa: F401
|
|
297
|
+
|
|
298
|
+
return True
|
|
299
|
+
except ImportError:
|
|
300
|
+
return False
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def _safe_walk(root: Path):
|
|
304
|
+
import os
|
|
305
|
+
|
|
306
|
+
for entry in os.walk(root):
|
|
307
|
+
yield entry
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def run_foreground(repo: Path, *, project: str | None = None) -> None:
    """Blocking CLI entry: run the watcher until Ctrl-C.

    Also returns if the watcher thread exits on its own, after logging an
    error, so the process does not sit idle watching nothing.

    Args:
        repo: Repository root to watch.
        project: Passed through to :class:`Watcher`.
    """
    w = Watcher(repo, project=project)
    w.start(blocking=False)
    try:
        # Poll liveness instead of sleeping unconditionally: if the watch
        # loop died, there is nothing left to wait for.
        while w.is_running():
            time.sleep(1)
        log.error("watcher thread for %s exited unexpectedly", w.repo)
    except KeyboardInterrupt:
        pass
    finally:
        w.stop()
|
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable, Sequence
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from qdrant_client import QdrantClient
|
|
8
|
+
from qdrant_client.http import models as qm
|
|
9
|
+
|
|
10
|
+
from ..config import CONFIG
|
|
11
|
+
from ..embed import HybridVec
|
|
12
|
+
|
|
13
|
+
# Names of the two vector slots stored on every Qdrant point. Treat them as
# frozen: renaming either one means rebuilding the collection.
DENSE = "dense"
SPARSE = "sparse"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class VectorRecord:
    """One point to write: id, hybrid embedding and optional payload."""

    # Point id handed to Qdrant unchanged.
    id: str
    # Embedding carrying the dense and sparse views.
    vector: HybridVec
    # Arbitrary metadata; each record gets its own empty dict by default.
    payload: dict[str, Any] = field(default_factory=dict)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class VectorHit:
    """One search result: point id, score and stored payload."""

    # Point id, stringified by the store.
    id: str
    # Score reported by Qdrant for this point.
    score: float
    # Payload stored with the point (empty dict when there is none).
    payload: dict[str, Any]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class QdrantStore:
    """Hybrid dense + sparse vector store with server-side fusion.

    Every collection uses Qdrant's named-vector layout: a ``dense`` slot
    (cosine distance, ``self.dim`` wide) next to a ``sparse`` slot with the
    IDF modifier. Hybrid queries prefetch from both slots and let Qdrant fuse
    the two rankings.
    """

    def __init__(
        self,
        url: str | None = None,
        dim: int | None = None,
    ) -> None:
        from ..config import resolve_embed_dim

        self.url = url or CONFIG.qdrant_url
        self.client = QdrantClient(url=self.url)
        # An explicit ``dim`` always wins. Otherwise the dimension is derived
        # from the configured model, with ``CONFIG.embed_dim`` (0 = "auto")
        # passed along as the operator override.
        if dim is None:
            dim = resolve_embed_dim(CONFIG.embed_model, CONFIG.embed_dim)
        self.dim = dim

    # --------------------------------------------------------- collection

    def ensure_collection(self, name: str) -> None:
        """Make sure ``name`` exists with the hybrid layout.

        A hybrid collection is left alone after its dense dimension has been
        validated. A legacy collection is dropped and re-created. A missing
        one is created.

        Raises:
            ValueError: The existing dense slot has a different dimension
                than ``self.dim``.
        """
        status = self._inspect_collection(name)
        if status == "hybrid":
            self._assert_dense_dim_matches(name)
            return
        if status == "legacy":
            # Only the ingest path calls this method, so dropping the old
            # single-vector layout here is intentional; readers go through
            # ``_inspect_collection`` and never reach this branch.
            try:
                self.client.delete_collection(collection_name=name)
            except Exception:  # noqa: BLE001
                pass
        self._create_hybrid(name)

    def _assert_dense_dim_matches(self, name: str) -> None:
        """Raise ``ValueError`` when the stored dense size differs from ``self.dim``.

        Failing here gives a readable message instead of an opaque Qdrant
        error at upsert time.
        """
        info = self.client.get_collection(collection_name=name)
        vectors = getattr(info.config.params, "vectors", None)
        if not (isinstance(vectors, dict) and DENSE in vectors):
            return
        existing_dim = getattr(vectors[DENSE], "size", None)
        if existing_dim is None or existing_dim == self.dim:
            return
        raise ValueError(
            f"Collection '{name}' exists with dimension {existing_dim:,}d, "
            f"but embedding model produces {self.dim:,}d. "
            f"Re-ingest (code-memory ingest --full) or delete the collection and re-create."
        )

    def recreate_collection(self, name: str) -> None:
        """Drop ``name`` (ignoring any failure) and create it afresh."""
        try:
            self.client.delete_collection(collection_name=name)
        except Exception:
            pass
        self._create_hybrid(name)

    def _inspect_collection(self, name: str) -> str:
        """Classify the collection's schema without modifying anything.

        Returns:
            ``"missing"`` when no such collection exists, ``"hybrid"`` when
            both the named dense and sparse slots are present, ``"legacy"``
            for any other layout.
        """
        known_names = {c.name for c in self.client.get_collections().collections}
        if name not in known_names:
            return "missing"
        info = self.client.get_collection(collection_name=name)
        dense_cfg = getattr(info.config.params, "vectors", None)
        sparse_cfg = getattr(info.config.params, "sparse_vectors", None)
        if not (isinstance(dense_cfg, dict) and DENSE in dense_cfg):
            return "legacy"
        if not (isinstance(sparse_cfg, dict) and SPARSE in sparse_cfg):
            return "legacy"
        return "hybrid"

    def _create_hybrid(self, name: str) -> None:
        """Create ``name`` with one dense and one sparse named vector slot."""
        dense_params = qm.VectorParams(size=self.dim, distance=qm.Distance.COSINE)
        sparse_params = qm.SparseVectorParams(modifier=qm.Modifier.IDF)
        self.client.create_collection(
            collection_name=name,
            vectors_config={DENSE: dense_params},
            sparse_vectors_config={SPARSE: sparse_params},
        )

    # ------------------------------------------------------------- upsert

    def upsert(self, collection: str, records: Iterable[VectorRecord]) -> None:
        """Write ``records`` to ``collection`` in one request; no-op when empty."""
        points = [self._to_point(record) for record in records]
        if points:
            self.client.upsert(collection_name=collection, points=points)

    @staticmethod
    def _to_point(record: VectorRecord) -> qm.PointStruct:
        """Convert one record into a Qdrant point."""
        slots: dict[str, Any] = {DENSE: record.vector.dense}
        sparse = record.vector.sparse
        # Dense-only embedders produce an empty sparse part. The slot is
        # left out in that case because Qdrant rejects sparse vectors with
        # zero indices.
        if sparse.indices:
            slots[SPARSE] = qm.SparseVector(indices=sparse.indices, values=sparse.values)
        return qm.PointStruct(id=record.id, vector=slots, payload=record.payload)

    # ------------------------------------------------------------- search

    def search(
        self,
        collection: str,
        vector: HybridVec | Sequence[float],
        top_k: int = 10,
        filt: dict[str, Any] | None = None,
        *,
        prefetch_multiplier: int = 4,
        mode: str = "hybrid",
    ) -> list[VectorHit]:
        """Query ``collection`` and return up to ``top_k`` hits.

        Args:
            collection: Collection to query.
            vector: A :class:`HybridVec` for dense+sparse fusion, or a plain
                dense sequence. Queries without a non-empty sparse part run
                dense-only.
            top_k: Maximum number of hits.
            filt: Optional exact-match payload filter (field -> value).
            prefetch_multiplier: Each fusion branch fetches
                ``top_k * prefetch_multiplier`` candidates before fusing.
            mode: ``"hybrid"`` fuses with RRF, ``"hybrid_dbsf"`` fuses with
                DBSF; any other value (e.g. ``"dense"``) queries the dense
                slot only. Intended as a benchmarking seam.

        Raises:
            LookupError: The collection does not exist.
            RuntimeError: The collection still uses the legacy layout.
        """
        status = self._inspect_collection(collection)
        if status == "missing":
            raise LookupError(
                f"Qdrant collection '{collection}' does not exist. "
                f"Run `code-memory ingest <path> --project <slug>` first."
            )
        if status == "legacy":
            raise RuntimeError(
                f"Qdrant collection '{collection}' uses the legacy "
                f"single-vector layout from before the hybrid migration. "
                f"Drop it and re-ingest:\n"
                f"  curl -X DELETE {self.url}/collections/{collection}\n"
                f"  code-memory ingest <path> --full"
            )
        qfilter = _to_filter(filt) if filt else None

        # Fusion needs a non-empty sparse query vector; everything else
        # takes the dense path so callers need not know which embedder
        # backend produced the vector.
        hybrid_vec = vector if isinstance(vector, HybridVec) else None
        fuse = (
            hybrid_vec is not None
            and bool(hybrid_vec.sparse.indices)
            and mode in ("hybrid", "hybrid_dbsf")
        )

        if fuse:
            branch_limit = top_k * prefetch_multiplier
            dense_branch = qm.Prefetch(
                query=hybrid_vec.dense,
                using=DENSE,
                limit=branch_limit,
                filter=qfilter,
            )
            sparse_branch = qm.Prefetch(
                query=qm.SparseVector(
                    indices=hybrid_vec.sparse.indices,
                    values=hybrid_vec.sparse.values,
                ),
                using=SPARSE,
                limit=branch_limit,
                filter=qfilter,
            )
            if mode == "hybrid_dbsf":
                fusion = qm.Fusion.DBSF
            else:
                fusion = qm.Fusion.RRF
            res = self.client.query_points(
                collection_name=collection,
                prefetch=[dense_branch, sparse_branch],
                query=qm.FusionQuery(fusion=fusion),
                limit=top_k,
                with_payload=True,
                query_filter=qfilter,
            )
        else:
            # Dense-only: plain-sequence callers, sparse-less embeddings and
            # the benchmark's "dense" mode.
            if hybrid_vec is not None:
                dense_vec = hybrid_vec.dense
            else:
                dense_vec = list(vector)
            res = self.client.query_points(
                collection_name=collection,
                query=dense_vec,
                using=DENSE,
                limit=top_k,
                query_filter=qfilter,
                with_payload=True,
            )

        hits: list[VectorHit] = []
        for point in res.points:
            hits.append(
                VectorHit(id=str(point.id), score=float(point.score), payload=point.payload or {})
            )
        return hits

    def delete_by_path(self, collection: str, path: str) -> None:
        """Delete every point whose ``path`` payload field equals ``path``."""
        by_path = qm.Filter(
            must=[qm.FieldCondition(key="path", match=qm.MatchValue(value=path))]
        )
        self.client.delete(
            collection_name=collection,
            points_selector=qm.FilterSelector(filter=by_path),
        )

    def delete_by_ids(self, collection: str, ids: Sequence[str]) -> None:
        """Delete the points named by ``ids``; empty input is a no-op."""
        if ids:
            self.client.delete(
                collection_name=collection,
                points_selector=qm.PointIdsList(points=list(ids)),
            )

    def set_payload(
        self,
        collection: str,
        ids: Sequence[str],
        payload: dict[str, Any],
    ) -> None:
        """Merge ``payload`` into the points named by ``ids``.

        Lets a caller update payload fields without re-embedding. Empty
        ``ids`` is a no-op because Qdrant rejects empty selectors.
        """
        if ids:
            self.client.set_payload(
                collection_name=collection,
                payload=payload,
                points=list(ids),
            )

    def count(self, collection: str) -> int:
        """Return the approximate point count, or ``0`` if the collection is missing.

        Reporting a missing collection as ``0`` lets callers probe for "do I
        need to backfill?" without wrapping the call in try/except.
        """
        if self._inspect_collection(collection) == "missing":
            return 0
        return int(self.client.count(collection_name=collection, exact=False).count)
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def _to_filter(filt: dict[str, Any]) -> qm.Filter:
    """Turn a ``field -> value`` mapping into an all-must-match Qdrant filter."""
    conditions: list[qm.FieldCondition] = []
    for key, value in filt.items():
        conditions.append(
            qm.FieldCondition(key=key, match=qm.MatchValue(value=value))
        )
    return qm.Filter(must=conditions)
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: flurryx-code-memory
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: Local lightweight memory layer for coding agents: FalkorDB + Qdrant + Ollama (BGE-M3) + tree-sitter
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: anyio>=4.4
|
|
7
|
+
Requires-Dist: falkordb>=1.0.10
|
|
8
|
+
Requires-Dist: httpx>=0.27
|
|
9
|
+
Requires-Dist: mcp>=1.0
|
|
10
|
+
Requires-Dist: pydantic>=2.8
|
|
11
|
+
Requires-Dist: qdrant-client>=1.12
|
|
12
|
+
Requires-Dist: rich>=13.7
|
|
13
|
+
Requires-Dist: tree-sitter-language-pack>=0.7
|
|
14
|
+
Requires-Dist: tree-sitter>=0.23
|
|
15
|
+
Requires-Dist: typer>=0.12
|
|
16
|
+
Requires-Dist: watchdog>=4.0
|
|
17
|
+
Provides-Extra: dev
|
|
18
|
+
Requires-Dist: mypy>=1.11; extra == 'dev'
|
|
19
|
+
Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
|
|
20
|
+
Requires-Dist: pytest>=8.3; extra == 'dev'
|
|
21
|
+
Requires-Dist: ruff>=0.6; extra == 'dev'
|
|
22
|
+
Provides-Extra: dotnet
|
|
23
|
+
Requires-Dist: dnfile>=0.15; extra == 'dotnet'
|
|
24
|
+
Provides-Extra: hybrid
|
|
25
|
+
Requires-Dist: flagembedding>=1.3; extra == 'hybrid'
|
|
26
|
+
Requires-Dist: torch>=2.1; extra == 'hybrid'
|