flurryx-code-memory 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_memory/__init__.py +1 -0
- code_memory/claims/__init__.py +32 -0
- code_memory/claims/extractor.py +325 -0
- code_memory/claims/indexer.py +258 -0
- code_memory/claims/resolver.py +186 -0
- code_memory/claims/store.py +424 -0
- code_memory/cli.py +1192 -0
- code_memory/config.py +268 -0
- code_memory/embed/__init__.py +224 -0
- code_memory/embed/cache.py +204 -0
- code_memory/embed/m3.py +174 -0
- code_memory/embed/ollama.py +92 -0
- code_memory/embed/tei.py +106 -0
- code_memory/episodic/__init__.py +3 -0
- code_memory/episodic/sqlite_store.py +278 -0
- code_memory/extractor/__init__.py +3 -0
- code_memory/extractor/csproj.py +166 -0
- code_memory/extractor/dll.py +385 -0
- code_memory/extractor/gitignore.py +162 -0
- code_memory/extractor/nuget.py +275 -0
- code_memory/extractor/sanity.py +124 -0
- code_memory/extractor/sln.py +108 -0
- code_memory/extractor/treesitter.py +1172 -0
- code_memory/graph/__init__.py +3 -0
- code_memory/graph/falkor_store.py +740 -0
- code_memory/mcp_server.py +1816 -0
- code_memory/metrics.py +260 -0
- code_memory/orchestrator/__init__.py +13 -0
- code_memory/orchestrator/git_delta.py +211 -0
- code_memory/orchestrator/ingest_state.py +71 -0
- code_memory/orchestrator/pipeline.py +1478 -0
- code_memory/orchestrator/reset.py +130 -0
- code_memory/orchestrator/resolver.py +825 -0
- code_memory/orchestrator/retrieve.py +505 -0
- code_memory/resilience.py +73 -0
- code_memory/sync/__init__.py +20 -0
- code_memory/sync/autostart/__init__.py +42 -0
- code_memory/sync/autostart/base.py +106 -0
- code_memory/sync/autostart/launchd.py +115 -0
- code_memory/sync/autostart/schtasks.py +155 -0
- code_memory/sync/autostart/systemd.py +113 -0
- code_memory/sync/hooks.py +164 -0
- code_memory/sync/safety.py +65 -0
- code_memory/sync/snapshot.py +461 -0
- code_memory/sync/store.py +399 -0
- code_memory/sync/sync.py +405 -0
- code_memory/sync/watcher.py +320 -0
- code_memory/vector/__init__.py +3 -0
- code_memory/vector/qdrant_store.py +302 -0
- flurryx_code_memory-0.4.0.dist-info/METADATA +26 -0
- flurryx_code_memory-0.4.0.dist-info/RECORD +53 -0
- flurryx_code_memory-0.4.0.dist-info/WHEEL +4 -0
- flurryx_code_memory-0.4.0.dist-info/entry_points.txt +3 -0
code_memory/config.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import re
|
|
5
|
+
import subprocess
|
|
6
|
+
from dataclasses import dataclass, replace
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Config file name (project-local and global). KEY=VALUE per line, '#'
|
|
11
|
+
# starts a comment. Real shell env always wins; project file beats
|
|
12
|
+
# global file. Layering exists so users can pin defaults once
|
|
13
|
+
# (~/.config/code-memory/config) and override per repo
|
|
14
|
+
# (./.code-memoryrc) without polluting the shell rc.
|
|
15
|
+
_RC_BASENAME = ".code-memoryrc"
|
|
16
|
+
_GLOBAL_RC = (
|
|
17
|
+
Path(os.environ.get("XDG_CONFIG_HOME", str(Path.home() / ".config")))
|
|
18
|
+
/ "code-memory"
|
|
19
|
+
/ "config"
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _parse_rc(path: Path) -> dict[str, str]:
|
|
24
|
+
try:
|
|
25
|
+
text = path.read_text(encoding="utf-8")
|
|
26
|
+
except OSError:
|
|
27
|
+
return {}
|
|
28
|
+
out: dict[str, str] = {}
|
|
29
|
+
for raw in text.splitlines():
|
|
30
|
+
line = raw.strip()
|
|
31
|
+
if not line or line.startswith("#"):
|
|
32
|
+
continue
|
|
33
|
+
if line.startswith("export "):
|
|
34
|
+
line = line[7:].lstrip()
|
|
35
|
+
key, sep, val = line.partition("=")
|
|
36
|
+
if not sep:
|
|
37
|
+
continue
|
|
38
|
+
key = key.strip()
|
|
39
|
+
val = val.strip()
|
|
40
|
+
# Strip matching surrounding quotes if any.
|
|
41
|
+
if len(val) >= 2 and val[0] == val[-1] and val[0] in ("'", '"'):
|
|
42
|
+
val = val[1:-1]
|
|
43
|
+
if key:
|
|
44
|
+
out[key] = val
|
|
45
|
+
return out
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _project_rc() -> Path | None:
    """Find the project-level rc file, if one exists.

    Two locations are checked, in order: the current working directory,
    then the git toplevel reported for that directory. Directories in
    between are not searched. Returns ``None`` when neither holds a
    regular file named ``_RC_BASENAME``.
    """
    here = Path.cwd()
    local = here / _RC_BASENAME
    if local.is_file():
        return local
    # Only ask git when the cwd itself has no rc file.
    repo_root = _git_toplevel(here)
    if repo_root is None:
        return None
    shared = repo_root / _RC_BASENAME
    return shared if shared.is_file() else None
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _load_rc_into_environ() -> None:
    """Populate ``os.environ`` with rc-file values without overriding
    the real shell. Project rc beats global rc.

    Precedence (highest → lowest):
      real shell env  >  ./.code-memoryrc  >  ~/.config/code-memory/config
    """
    # Merge the two files first (global, then project so the project
    # value overwrites the global one) and only then touch the
    # environment. Writing the global values into ``os.environ`` before
    # reading the project file would make them indistinguishable from
    # real shell variables, and the project file could never shadow them.
    merged: dict[str, str] = {}
    for source in (_GLOBAL_RC, _project_rc()):
        if source is not None:
            merged.update(_parse_rc(source))
    for key, value in merged.items():
        # Anything already present came from the real shell and wins.
        os.environ.setdefault(key, value)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _env(key: str, default: str) -> str:
|
|
80
|
+
return os.environ.get(key, default)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# Any run of characters outside ``a-z0-9`` collapses to a single dash.
_SLUG_RE = re.compile(r"[^a-z0-9]+")

# ``CODE_MEMORY_PROJECT`` values that mean "work the project out from
# the cwd" instead of naming a project. Treating them specially keeps
# nobody from accidentally indexing into a namespace literally called
# ``auto``.
_AUTO_SENTINELS = frozenset({"", "auto", "default"})


def slugify(name: str) -> str:
    """Turn ``name`` into a lowercase, dash-separated slug.

    Lowercases the input, replaces every run of non ``a-z0-9``
    characters with one ``-`` and trims dashes from both ends. If
    nothing is left the slug is ``"default"``.
    """
    lowered = name.lower()
    dashed = _SLUG_RE.sub("-", lowered)
    trimmed = dashed.strip("-")
    if not trimmed:
        return "default"
    return trimmed
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _git_toplevel(start: Path) -> Path | None:
|
|
97
|
+
try:
|
|
98
|
+
out = subprocess.run(
|
|
99
|
+
["git", "-C", str(start), "rev-parse", "--show-toplevel"],
|
|
100
|
+
capture_output=True,
|
|
101
|
+
text=True,
|
|
102
|
+
check=False,
|
|
103
|
+
timeout=2,
|
|
104
|
+
)
|
|
105
|
+
except (FileNotFoundError, subprocess.SubprocessError):
|
|
106
|
+
return None
|
|
107
|
+
if out.returncode != 0:
|
|
108
|
+
return None
|
|
109
|
+
top = out.stdout.strip()
|
|
110
|
+
return Path(top) if top else None
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# Populate os.environ from rc files *before* the ``Config`` dataclass
|
|
114
|
+
# defaults are evaluated (those are computed at module import via
|
|
115
|
+
# ``_env(...)`` calls in field defaults). Real shell env still wins.
|
|
116
|
+
_load_rc_into_environ()
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# Vector dimensionality of the embedding models we ship recipes for.
|
|
120
|
+
# Used to default ``EMBED_DIM`` when the operator only sets
|
|
121
|
+
# ``EMBED_MODEL``. Saves the silent-mismatch footgun where the model
|
|
122
|
+
# emits 768-d vectors but the Qdrant collection was created for 1024.
|
|
123
|
+
# Keys are matched case-insensitively against the leading model name
|
|
124
|
+
# (anything before ``:``), so ``bge-m3:latest``, ``bge-m3:567m-fp16``,
|
|
125
|
+
# and ``BAAI/bge-m3`` all resolve to the same dim.
|
|
126
|
+
_KNOWN_MODEL_DIMS: dict[str, int] = {
    # BAAI bge models
    "bge-m3": 1024,
    "baai/bge-m3": 1024,
    "bge-large-en": 1024,
    "bge-base-en": 768,
    "bge-small-en": 384,
    # mixedbread
    "mxbai-embed-large": 1024,
    # snowflake arctic: the size tag is part of the key
    "snowflake-arctic-embed:s": 384,
    "snowflake-arctic-embed:m": 768,
    "snowflake-arctic-embed:l": 1024,
}


def resolve_embed_dim(model_name: str, override: int = 0) -> int:
    """Work out the embedding dimension for ``model_name``.

    A positive ``override`` is returned as-is, which keeps operators of
    custom models in control. Otherwise the stripped, lowercased name is
    looked up in ``_KNOWN_MODEL_DIMS``: first in full, then with
    everything from the first ``:`` dropped. If both lookups miss, a
    warning is written to stderr and ``1024`` (the bge-m3 dimension) is
    returned.
    """
    if override > 0:
        return override
    wanted = model_name.strip().lower()
    # Full name goes first so ``snowflake-arctic-embed:s`` hits its own
    # entry; the bare family name second so ``bge-m3:latest`` resolves
    # through ``bge-m3``.
    for candidate in (wanted, wanted.split(":", 1)[0]):
        dim = _KNOWN_MODEL_DIMS.get(candidate)
        if dim is not None:
            return dim
    # Nothing matched: guess, but say so loudly enough that a dimension
    # mismatch gets noticed before vectors are written.
    import sys as _sys

    _sys.stderr.write(
        f"[code-memory] WARNING: embed model {model_name!r} not in "
        f"known-dim table; defaulting to 1024. Set EMBED_DIM=<n> to silence.\n"
    )
    return 1024
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def detect_project_slug(root: str | Path | None = None) -> str:
    """Work out the slug identifying the current project.

    Sources, highest priority first:
      1. an explicit ``root`` path: basename of its git toplevel, or of
         the resolved path itself when it is not inside a repository
      2. the ``CODE_MEMORY_PROJECT`` environment variable, unless it is
         empty or one of the "auto" sentinels
      3. the cwd: basename of its git toplevel, else of the cwd

    The chosen name is always passed through ``slugify``.
    """
    if root is None:
        configured = os.environ.get("CODE_MEMORY_PROJECT", "").strip()
        if configured and configured.lower() not in _AUTO_SENTINELS:
            return slugify(configured)
        anchor = Path.cwd()
        probe = anchor
    else:
        anchor = Path(root).resolve()
        # git needs a directory to start from; for a file use its parent.
        probe = anchor if anchor.is_dir() else anchor.parent
    repo_root = _git_toplevel(probe)
    return slugify((repo_root or anchor).name)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
@dataclass(frozen=True)
class Config:
    """Immutable settings bundle; every default is read from the
    environment once, when this class body is executed at import.
    """

    # --- embedding backends -------------------------------------------
    ollama_url: str = _env("OLLAMA_URL", "http://localhost:11434")
    # Address of a TEI (text-embeddings-inference) server; only relevant
    # with ``EMBED_BACKEND=tei``. Author's deployment note: on a Linux +
    # CUDA host TEI is reported to give a 5-10× cold-ingest speedup over
    # Ollama with the same bge-m3 weights, while on a Mac Ollama's Metal
    # path is faster, so keep the default backend there.
    tei_url: str = _env("TEI_URL", "http://localhost:8080")
    embed_model: str = _env("EMBED_MODEL", "bge-m3")
    # ``0`` means "EMBED_DIM not set". The intent is that the dimension
    # then follows the configured model so two env vars need not be kept
    # in sync (``resolve_embed_dim`` ignores a non-positive override).
    # Set ``EMBED_DIM`` explicitly for models missing from the
    # known-dim table.
    embed_dim: int = int(_env("EMBED_DIM", "0"))

    # --- Qdrant -------------------------------------------------------
    qdrant_url: str = _env("QDRANT_URL", "http://localhost:6333")
    qdrant_code: str = _env("QDRANT_COLLECTION_CODE", "code_chunks")
    qdrant_episodes: str = _env("QDRANT_COLLECTION_EPISODES", "episodes")
    qdrant_claim_entities: str = _env(
        "QDRANT_COLLECTION_CLAIM_ENTITIES", "claim_entities"
    )
    # Per-claim vectors (subject + predicate + object + evidence_span) so
    # retrieval can surface matching claims next to code and episodes.
    # Not the same as ``qdrant_claim_entities``, which holds canonical
    # entity vectors used for resolver dedup. SQLite (``claims.db``) is
    # the source of truth; this collection can be rebuilt from it.
    qdrant_claims: str = _env("QDRANT_COLLECTION_CLAIMS", "claims")

    # --- FalkorDB -----------------------------------------------------
    falkor_host: str = _env("FALKOR_HOST", "localhost")
    falkor_port: int = int(_env("FALKOR_PORT", "6379"))
    falkor_graph: str = _env("FALKOR_GRAPH", "code_graph")

    # --- local storage ------------------------------------------------
    # Paths are made absolute right here, at import. Resolving lazily
    # against ``Path.cwd()`` would let a long-lived process (the MCP
    # server) and shell commands started from another directory end up
    # on different files without anyone noticing.
    episodic_db: Path = Path(_env("EPISODIC_DB", "./data/episodic.db")).resolve()
    claims_db: Path = Path(_env("CLAIMS_DB", "./data/claims.db")).resolve()
    data_dir: Path = Path(_env("DATA_DIR", "./data")).resolve()

    # --- claim extraction (Graphiti-style user-prompt facts) ----------
    # On unless ``CLAIMS_EXTRACTION`` is set to something other than
    # 1/true/yes/on (case-insensitive), e.g. ``CLAIMS_EXTRACTION=false``.
    claims_enabled: bool = _env("CLAIMS_EXTRACTION", "true").strip().lower() in (
        "1",
        "true",
        "yes",
        "on",
    )
    claims_llm_model: str = _env("CLAIMS_LLM_MODEL", "gemma2:9b")
    claims_llm_timeout: float = float(_env("CLAIMS_LLM_TIMEOUT", "30"))
    claims_min_confidence: float = float(_env("CLAIMS_MIN_CONFIDENCE", "0.6"))
    # Cosine similarity from which a newly embedded subject/object is
    # merged into an existing entity. 0.85 is deliberately conservative:
    # a wrong merge propagates to every downstream claim, whereas an
    # extra entity costs little.
    claims_entity_threshold: float = float(
        _env("CLAIMS_ENTITY_THRESHOLD", "0.85")
    )

    def for_project(self, slug: str) -> Config:
        """Return a copy of this config namespaced to one project.

        ``slug`` is normalised with ``slugify``. The Qdrant collection
        names and the Falkor graph name get a ``__<slug>`` suffix, and
        the two SQLite databases move to ``<data_dir>/<slug>/``. All
        other fields are carried over unchanged.
        """
        ns = slugify(slug)
        project_dir = self.data_dir / ns
        suffixed = {
            field_name: f"{getattr(self, field_name)}__{ns}"
            for field_name in (
                "qdrant_code",
                "qdrant_episodes",
                "qdrant_claim_entities",
                "qdrant_claims",
                "falkor_graph",
            )
        }
        return replace(
            self,
            episodic_db=project_dir / "episodic.db",
            claims_db=project_dir / "claims.db",
            **suffixed,
        )
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
CONFIG = Config()
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
"""Embedding backends.
|
|
2
|
+
|
|
3
|
+
Three backends, same :class:`HybridVec` shape:
|
|
4
|
+
|
|
5
|
+
* :class:`OllamaEmbedder` (default) — dense-only via the Ollama daemon.
|
|
6
|
+
Keeps the model warm across short-lived CLI processes (per-save
|
|
7
|
+
reingest hooks, git hooks). Returns ``sparse`` as an empty vector.
|
|
8
|
+
* :class:`M3Embedder` (opt-in via ``EMBED_BACKEND=flagembed``) — dense
|
|
9
|
+
+ sparse from one in-process FlagEmbedding forward pass. Best for
|
|
10
|
+
long-lived processes (watcher, MCP server) where the cold-load cost
|
|
11
|
+
is paid once.
|
|
12
|
+
* :class:`TEIEmbedder` (opt-in via ``EMBED_BACKEND=tei``) — dense-only
|
|
13
|
+
via HuggingFace's `text-embeddings-inference` GPU server. **5-10×
|
|
14
|
+
cold-ingest speedup vs Ollama on Linux + NVIDIA**, same weights, no
|
|
15
|
+
recall loss. Set ``TEI_URL`` to point at the TEI daemon (default
|
|
16
|
+
``http://localhost:8080``).
|
|
17
|
+
|
|
18
|
+
All backends are transparently wrapped in :class:`CachedEmbedder` so
|
|
19
|
+
content-hash cache hits skip the model entirely on re-ingest.
|
|
20
|
+
|
|
21
|
+
Use :func:`get_embedder` for the process-singleton; it reads
|
|
22
|
+
``EMBED_BACKEND`` and dispatches accordingly.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import logging
|
|
28
|
+
import os
|
|
29
|
+
from collections.abc import Sequence
|
|
30
|
+
from pathlib import Path
|
|
31
|
+
from typing import Protocol
|
|
32
|
+
|
|
33
|
+
from ..config import CONFIG
|
|
34
|
+
from .cache import EmbedCache, hash_chunk
|
|
35
|
+
from .m3 import HybridVec, M3Embedder, SparseVec
|
|
36
|
+
from .ollama import OllamaEmbedder
|
|
37
|
+
from .tei import TEIEmbedder
|
|
38
|
+
|
|
39
|
+
log = logging.getLogger(__name__)
|
|
40
|
+
|
|
41
|
+
ENV_BACKEND = "EMBED_BACKEND"
|
|
42
|
+
ENV_DISABLE_CACHE = "EMBED_CACHE_DISABLED"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class Embedder(Protocol):
    """Structural interface shared by all embedding backends.

    Anything with a batch ``embed`` and a single-text ``embed_one``
    returning :class:`HybridVec` qualifies.
    """

    def embed(self, texts):  # type: ignore[no-untyped-def]
        """Embed a batch of texts (signature left untyped here)."""

    def embed_one(self, text: str) -> HybridVec:
        """Embed a single text."""
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class CachedEmbedder:
    """Embedder that consults a content-hash cache before the inner backend.

    Same ``embed`` / ``embed_one`` shape as the underlying embedder, so
    callers don't see the cache. The wrapper:

    1. Hashes every requested chunk.
    2. Asks the cache for all hashes in one ``get_many`` call.
    3. Forwards the misses to the inner embedder, each distinct text
       once even if it occurs several times in the batch.
    4. Writes the new vectors back to the cache before returning.
    5. Reassembles the output in the original input order.

    When every chunk is already cached the inner embedder is not called
    at all.
    """

    def __init__(
        self,
        inner: Embedder,
        cache: EmbedCache,
        model_id: str,
    ) -> None:
        """Wrap ``inner`` with ``cache``; ``model_id`` namespaces cache rows."""
        self._inner = inner
        self._cache = cache
        self._model_id = model_id

    @property
    def cache(self) -> EmbedCache:
        """The cache instance backing this embedder."""
        return self._cache

    @property
    def model_id(self) -> str:
        """The model identifier used as the cache namespace."""
        return self._model_id

    def embed(self, texts: Sequence[str]) -> list[HybridVec]:
        """Return one vector per entry of ``texts``, in input order.

        Raises ``ValueError`` if the inner embedder returns a different
        number of vectors than it was given texts.
        """
        if not texts:
            return []
        hashes = [hash_chunk(t) for t in texts]
        cached = self._cache.get_many(hashes, self._model_id)

        # Misses keyed by hash: a dict keeps first-seen order and drops
        # repeats, so identical chunks in one batch are embedded once.
        pending: dict[str, str] = {}
        for digest, text in zip(hashes, texts, strict=True):
            if digest not in cached and digest not in pending:
                pending[digest] = text

        resolved = dict(cached)
        if pending:
            fresh = self._inner.embed(list(pending.values()))
            # strict=True turns a short/long reply from the backend into
            # an error instead of silently misaligned vectors.
            computed = dict(zip(pending, fresh, strict=True))
            self._cache.put_many(computed.items(), model=self._model_id)
            resolved.update(computed)

        # Reassemble in original order.
        return [resolved[digest] for digest in hashes]

    def embed_one(self, text: str) -> HybridVec:
        """Embed a single text via :meth:`embed`."""
        return self.embed([text])[0]

    def close(self) -> None:
        """Close the inner embedder (if it has ``close``) and the cache.

        The cache is closed even when the inner ``close`` raises, so its
        connection is never leaked; the inner error still propagates.
        """
        try:
            inner_close = getattr(self._inner, "close", None)
            if callable(inner_close):
                inner_close()
        finally:
            self._cache.close()
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
_SINGLETON: Embedder | None = None
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _resolve_backend() -> str:
    """Map ``EMBED_BACKEND`` to one of ``flagembed``, ``tei``, ``ollama``.

    The value is stripped and lowercased and a few aliases are accepted.
    Unset, empty or ``ollama`` selects Ollama. Any other unrecognised
    value also selects Ollama, but logs a warning so a typo does not
    silently change the backend.
    """
    raw = os.environ.get(ENV_BACKEND, "ollama").strip().lower()
    if raw in ("flagembed", "flag", "m3", "fastembed"):
        return "flagembed"
    if raw in ("tei", "text-embeddings-inference"):
        return "tei"
    if raw not in ("", "ollama"):
        log.warning(
            "embed: unrecognised %s=%r; falling back to ollama",
            ENV_BACKEND,
            raw,
        )
    return "ollama"
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _cache_enabled() -> bool:
    """Report whether the embedding cache should be used.

    The cache is on unless ``EMBED_CACHE_DISABLED`` is set to one of
    ``1`` / ``true`` / ``yes`` / ``on`` (case-insensitive, surrounding
    whitespace ignored).
    """
    flag = os.environ.get(ENV_DISABLE_CACHE, "")
    disabled = flag.strip().lower() in ("1", "true", "yes", "on")
    return not disabled
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _build_inner_embedder(backend: str) -> tuple[Embedder, str]:
    """Instantiate the backend and return ``(embedder, model_id)``.

    ``model_id`` is the cache namespace. For the two HTTP backends it is
    built from the model name only (``model:<name>``), so cache rows are
    shared between Ollama and TEI. The author's rationale is that both
    serve the same weights and produce matching vectors within
    floating-point tolerance, which spares a cold cache when switching
    backends. The in-process FlagEmbedding backend gets a separate
    ``flagembed:<name>`` namespace because its vectors carry a sparse
    part that the dense-only backends lack.

    Any ``backend`` other than ``"flagembed"`` or ``"tei"`` yields the
    Ollama embedder.
    """
    if backend == "flagembed":
        log.info("embed: backend=flagembed (in-process m3, dense+sparse)")
        flag_embedder = M3Embedder()
        flag_model = getattr(flag_embedder, "model_name", "bge-m3")
        return flag_embedder, f"flagembed:{flag_model}"

    if backend == "tei":
        log.info("embed: backend=tei (HTTP @ %s, dense-only)", CONFIG.tei_url)
        dense_embedder = TEIEmbedder()
    else:
        log.info("embed: backend=ollama (HTTP, dense-only)")
        dense_embedder = OllamaEmbedder()
    return dense_embedder, f"model:{getattr(dense_embedder, 'model', 'bge-m3')}"
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def get_embedder() -> Embedder:
    """Return the process-wide embedder, creating it on first use.

    The backend is chosen once, by the first call. Unless the cache is
    switched off through ``EMBED_CACHE_DISABLED``, the backend is wrapped
    in :class:`CachedEmbedder` so cached chunks never reach the model.
    """
    global _SINGLETON
    if _SINGLETON is not None:
        return _SINGLETON

    inner, model_id = _build_inner_embedder(_resolve_backend())
    if _cache_enabled():
        cache_path = _cache_db_path()
        log.info("embed: cache at %s (model=%s)", cache_path, model_id)
        _SINGLETON = CachedEmbedder(
            inner=inner,
            cache=EmbedCache(cache_path),
            model_id=model_id,
        )
    else:
        log.info("embed: cache disabled via %s", ENV_DISABLE_CACHE)
        _SINGLETON = inner
    return _SINGLETON
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _cache_db_path() -> Path:
    """Return the embedding-cache file path, creating its directory.

    The file sits directly in ``CONFIG.data_dir``, not in a per-project
    subdirectory. Per the original author this is so it survives
    ``code-memory reset`` (said to clear only the project namespace) and
    so identical content embedded in different projects shares one
    cached vector.
    """
    cache_dir = Path(CONFIG.data_dir)
    cache_dir.mkdir(parents=True, exist_ok=True)
    return cache_dir.joinpath("embed_cache.sqlite")
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def set_embedder_for_tests(embedder: Embedder | None) -> None:
    """Replace the module-level embedder singleton.

    Passing ``None`` clears it, so the next ``get_embedder`` call builds
    a fresh one.
    """
    globals()["_SINGLETON"] = embedder
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
__all__ = [
|
|
213
|
+
"CachedEmbedder",
|
|
214
|
+
"EmbedCache",
|
|
215
|
+
"Embedder",
|
|
216
|
+
"HybridVec",
|
|
217
|
+
"M3Embedder",
|
|
218
|
+
"OllamaEmbedder",
|
|
219
|
+
"SparseVec",
|
|
220
|
+
"TEIEmbedder",
|
|
221
|
+
"get_embedder",
|
|
222
|
+
"hash_chunk",
|
|
223
|
+
"set_embedder_for_tests",
|
|
224
|
+
]
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
"""Persistent content-hash embedding cache.
|
|
2
|
+
|
|
3
|
+
Most enterprise workflows re-ingest the same repo daily after small
|
|
4
|
+
diffs: a few changed files, the rest stable. Without a cache, every
|
|
5
|
+
ingest re-embeds 100% of the corpus — for a 134k-chunk monorepo on
|
|
6
|
+
``bge-m3``/Ollama that's ~1.5 hours of pure inference each run.
|
|
7
|
+
|
|
8
|
+
This cache fingerprints each chunk's text (SHA-256) plus the embedding
|
|
9
|
+
model name and keys a dense / sparse vector pair on the result. On
|
|
10
|
+
re-ingest, unchanged chunks short-circuit the embedder entirely. Only
|
|
11
|
+
new or modified chunks pay the model cost.
|
|
12
|
+
|
|
13
|
+
Design choices:
|
|
14
|
+
|
|
15
|
+
- **SQLite single-file store** so it shares the same lifecycle as
|
|
16
|
+
``EpisodicStore`` (one persistent state directory per project). No
|
|
17
|
+
separate daemon.
|
|
18
|
+
- **Per-model namespacing.** Switching between ``bge-m3`` and
|
|
19
|
+
``bge-small-en`` must not pollute results — they live in different
|
|
20
|
+
rows. Same hash + different model = different cache entries.
|
|
21
|
+
- **Raw float32 BLOBs.** Lighter than JSON; deserialises with a single
|
|
22
|
+
``struct.unpack`` call.
|
|
23
|
+
- **Insert-only by default.** Cache is treated as monotonic; a separate
|
|
24
|
+
``vacuum`` clears stale entries that haven't been written in N days.
|
|
25
|
+
- **No locking beyond SQLite's default.** Concurrent watch + manual
|
|
26
|
+
ingest are rare and the upsert path uses ``INSERT OR REPLACE`` so
|
|
27
|
+
the latest write wins without explicit serialisation.
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
from __future__ import annotations
|
|
31
|
+
|
|
32
|
+
import hashlib
|
|
33
|
+
import logging
|
|
34
|
+
import sqlite3
|
|
35
|
+
import struct
|
|
36
|
+
import time
|
|
37
|
+
from collections.abc import Iterable, Sequence
|
|
38
|
+
from pathlib import Path
|
|
39
|
+
|
|
40
|
+
from .m3 import HybridVec, SparseVec
|
|
41
|
+
|
|
42
|
+
log = logging.getLogger(__name__)
|
|
43
|
+
|
|
44
|
+
_SCHEMA = """
|
|
45
|
+
CREATE TABLE IF NOT EXISTS embed_cache (
|
|
46
|
+
chunk_hash TEXT NOT NULL,
|
|
47
|
+
model TEXT NOT NULL,
|
|
48
|
+
dense BLOB NOT NULL,
|
|
49
|
+
sparse_idx BLOB,
|
|
50
|
+
sparse_val BLOB,
|
|
51
|
+
ts REAL NOT NULL,
|
|
52
|
+
PRIMARY KEY (chunk_hash, model)
|
|
53
|
+
);
|
|
54
|
+
CREATE INDEX IF NOT EXISTS idx_embed_cache_ts ON embed_cache(ts);
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def hash_chunk(text: str) -> str:
    """Return the hex SHA-256 digest of ``text`` encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _pack_floats(values: Sequence[float]) -> bytes:
|
|
64
|
+
return struct.pack(f"<{len(values)}f", *values)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _unpack_floats(blob: bytes) -> list[float]:
|
|
68
|
+
n = len(blob) // 4
|
|
69
|
+
return list(struct.unpack(f"<{n}f", blob))
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _pack_ints(values: Sequence[int]) -> bytes:
|
|
73
|
+
return struct.pack(f"<{len(values)}i", *values)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _unpack_ints(blob: bytes) -> list[int]:
|
|
77
|
+
n = len(blob) // 4
|
|
78
|
+
return list(struct.unpack(f"<{n}i", blob))
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class EmbedCache:
    """SQLite-backed content-hash cache for embedding vectors.

    Rows are keyed by ``(chunk_hash, model)``. ``get_many`` reads in
    batched ``SELECT … WHERE chunk_hash IN (…)`` queries; ``put_many``
    writes all rows in one transaction with ``INSERT OR REPLACE``, so
    the last write for a key wins. ``hits`` / ``misses`` count the
    hashes requested through ``get_many`` over the life of the instance.
    """

    def __init__(self, path: Path | str) -> None:
        """Open (creating if needed) the cache database at ``path``."""
        self.path = Path(path)
        self.path.parent.mkdir(parents=True, exist_ok=True)
        # check_same_thread=False lets one instance be used from more
        # than one thread (the author cites pipeline + watcher).
        self.conn = sqlite3.connect(self.path, check_same_thread=False)
        self.conn.executescript(_SCHEMA)
        self.conn.commit()
        # Stats so callers can log hit/miss ratios.
        self.hits = 0
        self.misses = 0

    # ------------------------------------------------------------ read

    def get_many(
        self, hashes: Iterable[str], model: str
    ) -> dict[str, HybridVec]:
        """Return ``{hash: HybridVec}`` for every cached hash in ``hashes``.

        Hashes without a row for ``model`` are absent from the result;
        the caller decides what to do with them (typically: embed them).
        Every requested hash, repeats included, is counted once in
        either ``hits`` or ``misses``.
        """
        hash_list = list(hashes)
        if not hash_list:
            return {}
        # Stay well below SQLite's bound-parameter limit (999 in older
        # builds): at most 800 hashes plus the model per statement.
        out: dict[str, HybridVec] = {}
        for i in range(0, len(hash_list), 800):
            batch = hash_list[i : i + 800]
            placeholders = ",".join("?" * len(batch))
            rows = self.conn.execute(
                f"""
                SELECT chunk_hash, dense, sparse_idx, sparse_val
                FROM embed_cache
                WHERE model = ? AND chunk_hash IN ({placeholders})
                """,
                (model, *batch),
            ).fetchall()
            for chunk_hash, dense_blob, idx_blob, val_blob in rows:
                # NULL sparse columns mean a dense-only vector.
                indices = _unpack_ints(idx_blob) if idx_blob else []
                values = _unpack_floats(val_blob) if val_blob else []
                out[chunk_hash] = HybridVec(
                    dense=_unpack_floats(dense_blob),
                    sparse=SparseVec(indices=indices, values=values),
                )
        # Count per requested hash, not per distinct row: ``out`` holds
        # each hash once, so ``len(out)`` would book a repeated cached
        # hash as one hit plus spurious misses.
        found = sum(1 for h in hash_list if h in out)
        self.hits += found
        self.misses += len(hash_list) - found
        return out

    # ------------------------------------------------------------ write

    def put_many(
        self,
        items: Iterable[tuple[str, HybridVec]],
        model: str,
    ) -> int:
        """Insert (hash, vec) pairs for ``model``. Returns count written.

        Existing rows with the same ``(chunk_hash, model)`` are replaced.
        An empty sparse part is stored as NULL.
        """
        now = time.time()
        rows = [
            (
                chunk_hash,
                model,
                _pack_floats(vec.dense),
                _pack_ints(vec.sparse.indices) if vec.sparse.indices else None,
                _pack_floats(vec.sparse.values) if vec.sparse.values else None,
                now,
            )
            for chunk_hash, vec in items
        ]
        if not rows:
            return 0
        # ``with conn`` commits on success and rolls back on error.
        with self.conn:
            self.conn.executemany(
                """
                INSERT OR REPLACE INTO embed_cache
                    (chunk_hash, model, dense, sparse_idx, sparse_val, ts)
                VALUES (?, ?, ?, ?, ?, ?)
                """,
                rows,
            )
        return len(rows)

    # ----------------------------------------------------------- admin

    def stats(self) -> dict[str, int]:
        """Return the total row count plus this instance's hit/miss counters."""
        rows = self.conn.execute(
            "SELECT COUNT(*) FROM embed_cache"
        ).fetchone()
        return {
            "total_entries": int(rows[0]),
            "hits": self.hits,
            "misses": self.misses,
        }

    def vacuum_older_than(self, seconds: float) -> int:
        """Drop entries last *written* before ``now - seconds``.

        ``ts`` is set by ``put_many`` only; reads do not refresh it, so
        frequently read but never rewritten rows are removed as well.
        Returns the number of rows deleted.
        """
        cutoff = time.time() - seconds
        cur = self.conn.execute(
            "DELETE FROM embed_cache WHERE ts < ?", (cutoff,)
        )
        self.conn.commit()
        return cur.rowcount

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self.conn.close()

    def __enter__(self) -> EmbedCache:
        return self

    def __exit__(self, *exc: object) -> None:
        self.close()
|