flurryx-code-memory 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. code_memory/__init__.py +1 -0
  2. code_memory/claims/__init__.py +32 -0
  3. code_memory/claims/extractor.py +325 -0
  4. code_memory/claims/indexer.py +258 -0
  5. code_memory/claims/resolver.py +186 -0
  6. code_memory/claims/store.py +424 -0
  7. code_memory/cli.py +1192 -0
  8. code_memory/config.py +268 -0
  9. code_memory/embed/__init__.py +224 -0
  10. code_memory/embed/cache.py +204 -0
  11. code_memory/embed/m3.py +174 -0
  12. code_memory/embed/ollama.py +92 -0
  13. code_memory/embed/tei.py +106 -0
  14. code_memory/episodic/__init__.py +3 -0
  15. code_memory/episodic/sqlite_store.py +278 -0
  16. code_memory/extractor/__init__.py +3 -0
  17. code_memory/extractor/csproj.py +166 -0
  18. code_memory/extractor/dll.py +385 -0
  19. code_memory/extractor/gitignore.py +162 -0
  20. code_memory/extractor/nuget.py +275 -0
  21. code_memory/extractor/sanity.py +124 -0
  22. code_memory/extractor/sln.py +108 -0
  23. code_memory/extractor/treesitter.py +1172 -0
  24. code_memory/graph/__init__.py +3 -0
  25. code_memory/graph/falkor_store.py +740 -0
  26. code_memory/mcp_server.py +1816 -0
  27. code_memory/metrics.py +260 -0
  28. code_memory/orchestrator/__init__.py +13 -0
  29. code_memory/orchestrator/git_delta.py +211 -0
  30. code_memory/orchestrator/ingest_state.py +71 -0
  31. code_memory/orchestrator/pipeline.py +1478 -0
  32. code_memory/orchestrator/reset.py +130 -0
  33. code_memory/orchestrator/resolver.py +825 -0
  34. code_memory/orchestrator/retrieve.py +505 -0
  35. code_memory/resilience.py +73 -0
  36. code_memory/sync/__init__.py +20 -0
  37. code_memory/sync/autostart/__init__.py +42 -0
  38. code_memory/sync/autostart/base.py +106 -0
  39. code_memory/sync/autostart/launchd.py +115 -0
  40. code_memory/sync/autostart/schtasks.py +155 -0
  41. code_memory/sync/autostart/systemd.py +113 -0
  42. code_memory/sync/hooks.py +164 -0
  43. code_memory/sync/safety.py +65 -0
  44. code_memory/sync/snapshot.py +461 -0
  45. code_memory/sync/store.py +399 -0
  46. code_memory/sync/sync.py +405 -0
  47. code_memory/sync/watcher.py +320 -0
  48. code_memory/vector/__init__.py +3 -0
  49. code_memory/vector/qdrant_store.py +302 -0
  50. flurryx_code_memory-0.4.0.dist-info/METADATA +26 -0
  51. flurryx_code_memory-0.4.0.dist-info/RECORD +53 -0
  52. flurryx_code_memory-0.4.0.dist-info/WHEEL +4 -0
  53. flurryx_code_memory-0.4.0.dist-info/entry_points.txt +3 -0
code_memory/config.py ADDED
@@ -0,0 +1,268 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import re
5
+ import subprocess
6
+ from dataclasses import dataclass, replace
7
+ from pathlib import Path
8
+
9
+
10
# Config file name (project-local and global). KEY=VALUE per line, '#'
# starts a comment. Real shell env always wins; project file beats
# global file. Layering exists so users can pin defaults once
# (~/.config/code-memory/config) and override per repo
# (./.code-memoryrc) without polluting the shell rc.
_RC_BASENAME = ".code-memoryrc"
# The XDG Base Directory spec treats an *empty* XDG_CONFIG_HOME the same
# as an unset one, so fall back to ~/.config in both cases. Using ``or``
# (rather than a ``.get`` default) also avoids resolving the home
# directory when the variable is set.
_GLOBAL_RC = (
    Path(os.environ.get("XDG_CONFIG_HOME") or str(Path.home() / ".config"))
    / "code-memory"
    / "config"
)
21
+
22
+
23
def _parse_rc(path: Path) -> dict[str, str]:
    """Parse a ``KEY=VALUE`` rc file into a dict.

    Blank lines and lines starting with ``#`` are ignored, a leading
    ``export `` is dropped, lines without ``=`` or with an empty key are
    skipped, and one pair of matching surrounding quotes is stripped from
    the value. Later assignments to the same key win.

    Returns an empty dict when the file cannot be read *or decoded*: this
    runs at module import, so a stray non-UTF-8 rc file must not take the
    whole package down.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # UnicodeDecodeError is a ValueError, not an OSError, so it has to
        # be listed explicitly.
        return {}
    out: dict[str, str] = {}
    for raw in text.splitlines():
        line = raw.strip()
        if not line or line.startswith("#"):
            continue
        if line.startswith("export "):
            line = line[7:].lstrip()
        key, sep, val = line.partition("=")
        if not sep:
            continue
        key = key.strip()
        val = val.strip()
        # Strip matching surrounding quotes if any.
        if len(val) >= 2 and val[0] == val[-1] and val[0] in ("'", '"'):
            val = val[1:-1]
        if key:
            out[key] = val
    return out
46
+
47
+
48
def _project_rc() -> Path | None:
    """Return the project rc file, or ``None`` when there is none.

    Two locations are probed, in order: the current working directory,
    then the git toplevel of the current working directory. Directories
    in between are not inspected. Git is only consulted when the cwd
    probe misses.
    """
    here = Path.cwd()
    local = here / _RC_BASENAME
    if local.is_file():
        return local

    repo_top = _git_toplevel(here)
    if repo_top is None:
        return None

    at_top = repo_top / _RC_BASENAME
    return at_top if at_top.is_file() else None
60
+
61
+
62
def _load_rc_into_environ() -> None:
    """Populate os.environ with rc-file values without overriding the
    real shell. Project rc beats global rc.

    Precedence (highest → lowest):
    real shell env > ./.code-memoryrc > ~/.config/code-memory/config
    """
    # Snapshot the keys the real shell provided *before* either rc file
    # is applied. Checking against the live ``os.environ`` instead would
    # make the global pass's own writes look like shell variables and
    # block the project pass from shadowing them.
    shell_keys = frozenset(os.environ)
    # Apply global first so the project pass can shadow it. Neither
    # pass overrides anything that came from the shell environment.
    for source in (_GLOBAL_RC, _project_rc()):
        if source is None:
            continue
        for key, value in _parse_rc(source).items():
            if key not in shell_keys:
                os.environ[key] = value
77
+
78
+
79
def _env(key: str, default: str) -> str:
    """Return environment variable ``key``, or ``default`` when it is unset.

    A variable that is set to the empty string is returned as ``""``.
    """
    try:
        return os.environ[key]
    except KeyError:
        return default
81
+
82
+
83
_SLUG_RE = re.compile(r"[^a-z0-9]+")

# Sentinel values for ``CODE_MEMORY_PROJECT`` that mean "infer from cwd"
# rather than "use a project literally named this". Recognising these
# avoids the silent footgun of indexing into a namespace called ``auto``.
_AUTO_SENTINELS = frozenset({"", "auto", "default"})


def slugify(name: str) -> str:
    """Turn ``name`` into a lowercase ``a-z0-9`` slug joined by hyphens.

    Every run of other characters collapses to a single ``-`` and
    leading/trailing hyphens are trimmed. If nothing is left the slug is
    ``"default"``.
    """
    collapsed = _SLUG_RE.sub("-", name.lower())
    trimmed = collapsed.strip("-")
    if not trimmed:
        return "default"
    return trimmed
94
+
95
+
96
def _git_toplevel(start: Path) -> Path | None:
    """Return the git work-tree root containing ``start``, or ``None``.

    Runs ``git -C <start> rev-parse --show-toplevel`` with a 2 second
    timeout. ``None`` is returned when git cannot be launched, times out,
    exits non-zero (for example outside a repository), or prints nothing.
    """
    try:
        out = subprocess.run(
            ["git", "-C", str(start), "rev-parse", "--show-toplevel"],
            capture_output=True,
            text=True,
            check=False,
            timeout=2,
        )
    except (OSError, subprocess.SubprocessError):
        # OSError covers a missing binary (FileNotFoundError) as well as a
        # present-but-unrunnable one (PermissionError etc.). This helper is
        # called during module import, so "no usable git" must degrade to
        # "not a repo" instead of raising.
        return None
    if out.returncode != 0:
        return None
    top = out.stdout.strip()
    return Path(top) if top else None
111
+
112
+
113
# Populate os.environ from rc files *before* the ``Config`` dataclass
# defaults are evaluated (those are computed at module import via
# ``_env(...)`` calls in field defaults). Real shell env still wins.
# NOTE: this is an import-time side effect — importing this module
# mutates ``os.environ`` for the whole process.
_load_rc_into_environ()
117
+
118
+
119
# Vector dimensionality of the embedding models we ship recipes for.
# Used to default ``EMBED_DIM`` when the operator only sets
# ``EMBED_MODEL``. Saves the silent-mismatch footgun where the model
# emits 768-d vectors but the Qdrant collection was created for 1024.
# Keys are lowercase. ``resolve_embed_dim`` matches them
# case-insensitively against the full name, the name without its
# ``:tag``, and the name without its ``org/`` prefix, so
# ``bge-m3:latest``, ``bge-m3:567m-fp16`` and ``BAAI/bge-m3`` all
# resolve to the same dim.
_KNOWN_MODEL_DIMS: dict[str, int] = {
    # bge family
    "bge-m3": 1024,
    "baai/bge-m3": 1024,
    "bge-large-en": 1024,
    "bge-base-en": 768,
    "bge-small-en": 384,
    # mixedbread
    "mxbai-embed-large": 1024,
    # snowflake
    "snowflake-arctic-embed:s": 384,
    "snowflake-arctic-embed:m": 768,
    "snowflake-arctic-embed:l": 1024,
}


def resolve_embed_dim(model_name: str, override: int = 0) -> int:
    """Return the vector dim for ``model_name``, honouring ``override``.

    ``override > 0`` wins (operators with a custom model stay in
    control). Otherwise the name is looked up in ``_KNOWN_MODEL_DIMS``,
    most specific form first:

    1. the full lowercase name (``snowflake-arctic-embed:s``),
    2. the name without its ``:tag`` (``bge-m3:latest`` -> ``bge-m3``),
    3. either of the above without an ``org/`` prefix
       (``BAAI/bge-small-en`` -> ``bge-small-en``).

    Falls back to ``1024`` (bge-m3 default) with a warning on stderr so
    the operator notices we're guessing.
    """
    if override > 0:
        return override
    lower = model_name.strip().lower()
    base = lower.split(":", 1)[0]
    # Exact forms first so behaviour for already-known names is unchanged;
    # the org-stripped forms only help names that used to be unknown.
    candidates = [lower, base]
    candidates.extend(
        name.rsplit("/", 1)[-1] for name in (lower, base) if "/" in name
    )
    for candidate in candidates:
        dim = _KNOWN_MODEL_DIMS.get(candidate)
        if dim is not None:
            return dim
    # Unknown model — fall back to the bge-m3 default but warn so the
    # operator notices a mismatch before it produces broken vectors.
    import sys as _sys

    _sys.stderr.write(
        f"[code-memory] WARNING: embed model {model_name!r} not in "
        f"known-dim table; defaulting to 1024. Set EMBED_DIM=<n> to silence.\n"
    )
    return 1024
169
+
170
+
171
def detect_project_slug(root: str | Path | None = None) -> str:
    """Work out the slug that namespaces a project's storage.

    Resolution order:

    1. ``root`` given: resolve it and ask git for the toplevel (starting
       from ``root`` itself when it is a directory, otherwise from its
       parent). The slug is the toplevel's basename, or the basename of
       the resolved ``root`` when git has no answer.
    2. ``CODE_MEMORY_PROJECT`` is set to something other than an "auto"
       sentinel: that value, slugified.
    3. Otherwise the git toplevel of the cwd, or the cwd's own basename.
    """
    if root is None:
        configured = os.environ.get("CODE_MEMORY_PROJECT", "").strip()
        if configured and configured.lower() not in _AUTO_SENTINELS:
            return slugify(configured)
        anchor = Path.cwd()
        git_start = anchor
    else:
        anchor = Path(root).resolve()
        git_start = anchor if anchor.is_dir() else anchor.parent

    repo_top = _git_toplevel(git_start)
    return slugify((repo_top or anchor).name)
191
+
192
+
193
@dataclass(frozen=True)
class Config:
    """Immutable bundle of service URLs, collection names and file paths.

    Every field default is an ``_env(...)`` lookup evaluated once, when
    this class body runs at module import. Changing the environment
    afterwards has no effect on new ``Config()`` instances. A malformed
    numeric variable (e.g. ``FALKOR_PORT=abc``) raises ``ValueError`` at
    import time because the ``int``/``float`` conversions are not guarded.
    """

    ollama_url: str = _env("OLLAMA_URL", "http://localhost:11434")
    # TEI (text-embeddings-inference) server URL. Used only when
    # ``EMBED_BACKEND=tei``. The enterprise-deploy story: stand TEI up
    # on a GPU host (Linux + CUDA), point ``TEI_URL`` at it, get a
    # 5-10× cold-ingest speedup over Ollama with the same bge-m3
    # weights. On Mac there's no GPU advantage and Ollama's Metal path
    # is faster — leave on the default backend there.
    tei_url: str = _env("TEI_URL", "http://localhost:8080")
    embed_model: str = _env("EMBED_MODEL", "bge-m3")
    # ``embed_dim`` defaults to the dimension of the configured model
    # so users don't have to keep two env vars in sync. Override with
    # ``EMBED_DIM`` when running a model not in the known-dim table.
    # The stored default is ``0``; presumably callers pass it as the
    # ``override`` of ``resolve_embed_dim`` to get the real dimension —
    # TODO confirm at the call sites.
    embed_dim: int = int(_env("EMBED_DIM", "0"))

    qdrant_url: str = _env("QDRANT_URL", "http://localhost:6333")
    qdrant_code: str = _env("QDRANT_COLLECTION_CODE", "code_chunks")
    qdrant_episodes: str = _env("QDRANT_COLLECTION_EPISODES", "episodes")
    qdrant_claim_entities: str = _env(
        "QDRANT_COLLECTION_CLAIM_ENTITIES", "claim_entities"
    )
    # Semantic index over user-claim triples (subject + predicate + object
    # + evidence_span). Distinct from ``qdrant_claim_entities`` — that
    # one stores canonical entity vectors for resolver dedup; this one
    # stores per-claim vectors so retrieve can return semantically
    # matching claims alongside code + episodes. SQLite (``claims.db``)
    # remains source of truth; this collection is rebuildable.
    qdrant_claims: str = _env("QDRANT_COLLECTION_CLAIMS", "claims")

    falkor_host: str = _env("FALKOR_HOST", "localhost")
    falkor_port: int = int(_env("FALKOR_PORT", "6379"))
    falkor_graph: str = _env("FALKOR_GRAPH", "code_graph")

    # Resolved once at import time. Late-binding against `Path.cwd()` would
    # diverge whenever a long-lived process (MCP server) shares storage
    # with shell invocations launched from a different cwd, silently
    # routing writes and reads to different files.
    episodic_db: Path = Path(_env("EPISODIC_DB", "./data/episodic.db")).resolve()
    claims_db: Path = Path(_env("CLAIMS_DB", "./data/claims.db")).resolve()
    data_dir: Path = Path(_env("DATA_DIR", "./data")).resolve()

    # Claim extraction (Graphiti-style user-prompt facts).
    # Enabled by default. Set CLAIMS_EXTRACTION=false to disable.
    # Only the four spellings below count as "on"; anything else is off.
    claims_enabled: bool = _env("CLAIMS_EXTRACTION", "true").strip().lower() in {
        "1",
        "true",
        "yes",
        "on",
    }
    claims_llm_model: str = _env("CLAIMS_LLM_MODEL", "gemma2:9b")
    claims_llm_timeout: float = float(_env("CLAIMS_LLM_TIMEOUT", "30"))
    claims_min_confidence: float = float(_env("CLAIMS_MIN_CONFIDENCE", "0.6"))
    # Cosine similarity at or above which a freshly embedded
    # subject/object reuses an existing entity instead of creating a new
    # one. 0.85 is a conservative default — false-merges hurt more than
    # extra entities (they propagate to every downstream claim).
    claims_entity_threshold: float = float(
        _env("CLAIMS_ENTITY_THRESHOLD", "0.85")
    )

    def for_project(self, slug: str) -> Config:
        """Return a copy of this config namespaced to project ``slug``.

        ``slug`` is passed through ``slugify`` first. The four Qdrant
        collection names and the Falkor graph name get a ``__<slug>``
        suffix, and both SQLite paths are rebuilt as
        ``data_dir/<slug>/<file>.db`` — so any ``EPISODIC_DB`` /
        ``CLAIMS_DB`` override is not carried into the per-project copy.
        Calling this on an already-namespaced config appends a second
        suffix.
        """
        slug = slugify(slug)
        return replace(
            self,
            qdrant_code=f"{self.qdrant_code}__{slug}",
            qdrant_episodes=f"{self.qdrant_episodes}__{slug}",
            qdrant_claim_entities=f"{self.qdrant_claim_entities}__{slug}",
            qdrant_claims=f"{self.qdrant_claims}__{slug}",
            falkor_graph=f"{self.falkor_graph}__{slug}",
            episodic_db=self.data_dir / slug / "episodic.db",
            claims_db=self.data_dir / slug / "claims.db",
        )


# Process-wide default configuration, built once at import.
CONFIG = Config()
@@ -0,0 +1,224 @@
1
+ """Embedding backends.
2
+
3
+ Three backends, same :class:`HybridVec` shape:
4
+
5
+ * :class:`OllamaEmbedder` (default) — dense-only via the Ollama daemon.
6
+ Keeps the model warm across short-lived CLI processes (per-save
7
+ reingest hooks, git hooks). Returns ``sparse`` as an empty vector.
8
+ * :class:`M3Embedder` (opt-in via ``EMBED_BACKEND=flagembed``) — dense
9
+ + sparse from one in-process FlagEmbedding forward pass. Best for
10
+ long-lived processes (watcher, MCP server) where the cold-load cost
11
+ is paid once.
12
+ * :class:`TEIEmbedder` (opt-in via ``EMBED_BACKEND=tei``) — dense-only
13
+ via HuggingFace's `text-embeddings-inference` GPU server. **5-10×
14
+ cold-ingest speedup vs Ollama on Linux + NVIDIA**, same weights, no
15
+ recall loss. Set ``TEI_URL`` to point at the TEI daemon (default
16
+ ``http://localhost:8080``).
17
+
18
+ All backends are transparently wrapped in :class:`CachedEmbedder` so
19
+ content-hash cache hits skip the model entirely on re-ingest.
20
+
21
+ Use :func:`get_embedder` for the process-singleton; it reads
22
+ ``EMBED_BACKEND`` and dispatches accordingly.
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import logging
28
+ import os
29
+ from collections.abc import Sequence
30
+ from pathlib import Path
31
+ from typing import Protocol
32
+
33
+ from ..config import CONFIG
34
+ from .cache import EmbedCache, hash_chunk
35
+ from .m3 import HybridVec, M3Embedder, SparseVec
36
+ from .ollama import OllamaEmbedder
37
+ from .tei import TEIEmbedder
38
+
39
log = logging.getLogger(__name__)

# Name of the environment variable that selects the embedding backend
# (read by ``_resolve_backend``).
ENV_BACKEND = "EMBED_BACKEND"
# Name of the environment variable that turns the content-hash cache
# off when set to 1/true/yes/on (read by ``_cache_enabled``).
ENV_DISABLE_CACHE = "EMBED_CACHE_DISABLED"
43
+
44
+
45
class Embedder(Protocol):
    """Common shape: produce :class:`HybridVec` per text.

    Structural interface only — any object with these two methods
    qualifies; no inheritance is required.
    """

    def embed(self, texts):  # type: ignore[no-untyped-def]
        """Embed a batch of texts.

        Deliberately left unannotated here. ``CachedEmbedder.embed`` takes
        a sequence of strings and returns one ``HybridVec`` per input, in
        input order; presumably every backend does the same — confirm
        against the concrete embedders.
        """
        ...

    # Single-text convenience form of ``embed``.
    def embed_one(self, text: str) -> HybridVec: ...
52
+
53
+
54
class CachedEmbedder:
    """Embedder that consults a content-hash cache before the inner backend.

    Same ``embed`` / ``embed_one`` shape as the underlying embedder, so
    callers don't see the cache. The wrapper:

    1. Hashes every requested chunk.
    2. Pulls cached vectors with one ``get_many`` call.
    3. Forwards the *unique* misses to the inner embedder (a text that
       appears several times in one batch is embedded once).
    4. Writes the new vectors back to the cache before returning.
    5. Reassembles the output in the original input order.

    On a re-ingest where every chunk is unchanged, the inner embedder is
    not called at all.
    """

    def __init__(
        self,
        inner: Embedder,
        cache: EmbedCache,
        model_id: str,
    ) -> None:
        """Wrap ``inner`` with ``cache``; ``model_id`` namespaces cache rows."""
        self._inner = inner
        self._cache = cache
        self._model_id = model_id

    @property
    def cache(self) -> EmbedCache:
        """The backing cache (exposed e.g. for hit/miss statistics)."""
        return self._cache

    @property
    def model_id(self) -> str:
        """Namespace under which this wrapper reads and writes cache rows."""
        return self._model_id

    def embed(self, texts: Sequence[str]) -> list[HybridVec]:
        """Return one vector per entry of ``texts``, in input order.

        Raises:
            ValueError: if the inner embedder returns a different number
                of vectors than the number of texts it was given.
        """
        if not texts:
            return []
        hashes = [hash_chunk(t) for t in texts]
        cached = self._cache.get_many(hashes, self._model_id)

        # Unique misses keyed by hash, in first-occurrence order. Repeated
        # chunks (license headers, boilerplate) cost one model call, not
        # one per occurrence.
        miss_texts: dict[str, str] = {}
        for chunk_hash, text in zip(hashes, texts):
            if chunk_hash not in cached and chunk_hash not in miss_texts:
                miss_texts[chunk_hash] = text

        fresh: dict[str, HybridVec] = {}
        if miss_texts:
            new_vecs = self._inner.embed(list(miss_texts.values()))
            # strict=True turns a short/long reply from the backend into
            # a ValueError instead of silently misaligned vectors.
            fresh = dict(zip(miss_texts, new_vecs, strict=True))
            if fresh:
                self._cache.put_many(fresh.items(), model=self._model_id)

        # Reassemble in original order.
        return [cached[h] if h in cached else fresh[h] for h in hashes]

    def embed_one(self, text: str) -> HybridVec:
        """Embed a single text via :meth:`embed`."""
        return self.embed([text])[0]

    def close(self) -> None:
        """Close the inner embedder (if it has ``close``) and the cache.

        The cache is closed even when the inner ``close`` raises, so a
        failing backend shutdown cannot leak the cache connection. The
        inner exception still propagates.
        """
        try:
            inner_close = getattr(self._inner, "close", None)
            if callable(inner_close):
                inner_close()
        finally:
            self._cache.close()
131
+
132
+
133
# Process-wide embedder instance. ``None`` until ``get_embedder`` builds
# it; ``set_embedder_for_tests`` can replace or reset it.
_SINGLETON: Embedder | None = None
134
+
135
+
136
def _resolve_backend() -> str:
    """Map the ``EMBED_BACKEND`` env value to a canonical backend name.

    The value is trimmed and lowercased, then matched against the known
    aliases. Unset, empty, or unrecognised values all resolve to
    ``"ollama"``.
    """
    requested = os.environ.get(ENV_BACKEND, "ollama").strip().lower()
    aliases = {
        "flagembed": "flagembed",
        "flag": "flagembed",
        "m3": "flagembed",
        "fastembed": "flagembed",
        "tei": "tei",
        "text-embeddings-inference": "tei",
    }
    return aliases.get(requested, "ollama")
143
+
144
+
145
def _cache_enabled() -> bool:
    """Return ``False`` only when ``EMBED_CACHE_DISABLED`` is switched on.

    "On" means 1/true/yes/on after trimming and lowercasing; any other
    value, including unset, leaves the cache enabled.
    """
    flag = os.environ.get(ENV_DISABLE_CACHE, "")
    disabled = flag.strip().lower() in ("1", "true", "yes", "on")
    return not disabled
148
+
149
+
150
def _build_inner_embedder(backend: str) -> tuple[Embedder, str]:
    """Construct the raw embedder for ``backend`` plus its cache namespace.

    Returns ``(embedder, model_id)``. The two dense-only backends (TEI and
    Ollama) share the ``model:<name>`` namespace — the original author's
    rationale is that the same weights give interchangeable vectors, so
    switching between them keeps the cache warm. FlagEmbed gets its own
    ``flagembed:<name>`` namespace because its rows also carry a sparse
    vector. Any ``backend`` other than ``"flagembed"`` or ``"tei"`` builds
    the Ollama embedder.
    """
    if backend == "flagembed":
        log.info("embed: backend=flagembed (in-process m3, dense+sparse)")
        hybrid = M3Embedder()
        hybrid_name = getattr(hybrid, "model_name", "bge-m3")
        return hybrid, f"flagembed:{hybrid_name}"

    dense: Embedder
    if backend == "tei":
        log.info("embed: backend=tei (HTTP @ %s, dense-only)", CONFIG.tei_url)
        dense = TEIEmbedder()
    else:
        log.info("embed: backend=ollama (HTTP, dense-only)")
        dense = OllamaEmbedder()
    return dense, f"model:{getattr(dense, 'model', 'bge-m3')}"
173
+
174
+
175
def get_embedder() -> Embedder:
    """Return the process-wide embedder, building it on first use.

    The backend is chosen from the environment on the first call and
    then stays fixed. Unless ``EMBED_CACHE_DISABLED`` is switched on, the
    backend is wrapped in :class:`CachedEmbedder` backed by the SQLite
    file from ``_cache_db_path``.
    """
    global _SINGLETON
    if _SINGLETON is not None:
        return _SINGLETON

    inner, model_id = _build_inner_embedder(_resolve_backend())
    if _cache_enabled():
        cache_path = _cache_db_path()
        log.info("embed: cache at %s (model=%s)", cache_path, model_id)
        _SINGLETON = CachedEmbedder(
            inner=inner,
            cache=EmbedCache(cache_path),
            model_id=model_id,
        )
    else:
        log.info("embed: cache disabled via %s", ENV_DISABLE_CACHE)
        _SINGLETON = inner
    return _SINGLETON
195
+
196
+
197
def _cache_db_path() -> Path:
    """Return ``<CONFIG.data_dir>/embed_cache.sqlite``, creating the directory.

    The file sits directly in the data directory rather than under a
    per-project subfolder; per the original author's note this lets it
    outlive a project reset and be shared across projects.
    """
    data_root = Path(CONFIG.data_dir)
    data_root.mkdir(parents=True, exist_ok=True)
    return data_root.joinpath("embed_cache.sqlite")
205
+
206
+
207
def set_embedder_for_tests(embedder: Embedder | None) -> None:
    """Replace the process-wide embedder returned by ``get_embedder``.

    Pass ``None`` to clear it so the next ``get_embedder`` call builds a
    fresh one. The previous instance is not closed.
    """
    global _SINGLETON
    _SINGLETON = embedder
210
+
211
+
212
+ __all__ = [
213
+ "CachedEmbedder",
214
+ "EmbedCache",
215
+ "Embedder",
216
+ "HybridVec",
217
+ "M3Embedder",
218
+ "OllamaEmbedder",
219
+ "SparseVec",
220
+ "TEIEmbedder",
221
+ "get_embedder",
222
+ "hash_chunk",
223
+ "set_embedder_for_tests",
224
+ ]
@@ -0,0 +1,204 @@
1
+ """Persistent content-hash embedding cache.
2
+
3
+ Most enterprise workflows re-ingest the same repo daily after small
4
+ diffs: a few changed files, the rest stable. Without a cache, every
5
+ ingest re-embeds 100% of the corpus — for a 134k-chunk monorepo on
6
+ ``bge-m3``/Ollama that's ~1.5 hours of pure inference each run.
7
+
8
+ This cache fingerprints each chunk's text (SHA-256) plus the embedding
9
+ model name and keys a dense / sparse vector pair on the result. On
10
+ re-ingest, unchanged chunks short-circuit the embedder entirely. Only
11
+ new or modified chunks pay the model cost.
12
+
13
+ Design choices:
14
+
15
+ - **SQLite single-file store** so it shares the same lifecycle as
16
+ ``EpisodicStore`` (one persistent state directory per project). No
17
+ separate daemon.
18
+ - **Per-model namespacing.** Switching between ``bge-m3`` and
19
+ ``bge-small-en`` must not pollute results — they live in different
20
+ rows. Same hash + different model = different cache entries.
21
+ - **Raw float32 BLOBs.** Lighter than JSON; deserialises with a single
22
+ ``struct.unpack`` call.
23
+ - **Insert-only by default.** Cache is treated as monotonic; a separate
24
+ ``vacuum_older_than`` pass drops entries whose last write is older than the cutoff (reads do not refresh the timestamp).
25
+ - **No locking beyond SQLite's default.** Concurrent watch + manual
26
+ ingest are rare and the upsert path uses ``INSERT OR REPLACE`` so
27
+ the latest write wins without explicit serialisation.
28
+ """
29
+
30
+ from __future__ import annotations
31
+
32
+ import hashlib
33
+ import logging
34
+ import sqlite3
35
+ import struct
36
+ import time
37
+ from collections.abc import Iterable, Sequence
38
+ from pathlib import Path
39
+
40
+ from .m3 import HybridVec, SparseVec
41
+
42
+ log = logging.getLogger(__name__)
43
+
44
+ _SCHEMA = """
45
+ CREATE TABLE IF NOT EXISTS embed_cache (
46
+ chunk_hash TEXT NOT NULL,
47
+ model TEXT NOT NULL,
48
+ dense BLOB NOT NULL,
49
+ sparse_idx BLOB,
50
+ sparse_val BLOB,
51
+ ts REAL NOT NULL,
52
+ PRIMARY KEY (chunk_hash, model)
53
+ );
54
+ CREATE INDEX IF NOT EXISTS idx_embed_cache_ts ON embed_cache(ts);
55
+ """
56
+
57
+
58
def hash_chunk(text: str) -> str:
    """Return the hex SHA-256 digest of ``text`` encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()
61
+
62
+
63
def _pack_floats(values: Sequence[float]) -> bytes:
    """Serialise ``values`` as consecutive little-endian float32s."""
    layout = "<%df" % len(values)
    return struct.pack(layout, *values)
65
+
66
+
67
def _unpack_floats(blob: bytes) -> list[float]:
    """Decode a blob of little-endian float32s into a list.

    Raises ``struct.error`` when the blob length is not a multiple of 4.
    """
    count = len(blob) // 4
    layout = "<%df" % count
    return [*struct.unpack(layout, blob)]
70
+
71
+
72
def _pack_ints(values: Sequence[int]) -> bytes:
    """Serialise ``values`` as consecutive little-endian signed int32s."""
    layout = "<%di" % len(values)
    return struct.pack(layout, *values)
74
+
75
+
76
def _unpack_ints(blob: bytes) -> list[int]:
    """Decode a blob of little-endian signed int32s into a list.

    Raises ``struct.error`` when the blob length is not a multiple of 4.
    """
    count = len(blob) // 4
    layout = "<%di" % count
    return [*struct.unpack(layout, blob)]
79
+
80
+
81
class EmbedCache:
    """SQLite-backed content-hash cache for embedding vectors.

    Open once per process. Rows are keyed by ``(chunk_hash, model)``.
    The hot path (``get_many``) issues parameterised
    ``SELECT … WHERE chunk_hash IN (…)`` queries in batches and rebuilds
    the in-memory mapping; the cold path (``put_many``) writes all rows
    in one transaction with ``INSERT OR REPLACE``, so the last write for
    a key wins.

    Attributes:
        path: Location of the SQLite file.
        conn: The open ``sqlite3`` connection.
        hits / misses: Running per-request counters updated by
            ``get_many``; ``hits + misses`` equals the number of hashes
            requested so far.
    """

    def __init__(self, path: Path | str) -> None:
        """Open (creating if needed) the cache database at ``path``."""
        self.path = Path(path)
        self.path.parent.mkdir(parents=True, exist_ok=True)
        # check_same_thread=False so the pipeline + watcher can share
        # the same instance from different threads.
        # NOTE(review): Python's sqlite3 docs advise that writes on a
        # connection shared across threads be serialised by the caller —
        # confirm no two threads call put_many/vacuum concurrently.
        self.conn = sqlite3.connect(self.path, check_same_thread=False)
        self.conn.executescript(_SCHEMA)
        self.conn.commit()
        # Stats so callers can log hit/miss ratios.
        self.hits = 0
        self.misses = 0

    # ------------------------------------------------------------ read

    def get_many(
        self, hashes: Iterable[str], model: str
    ) -> dict[str, HybridVec]:
        """Return ``{hash: HybridVec}`` for every cached hash in ``hashes``.

        Missing entries are simply absent from the result dict — the
        caller decides what to do (typically: build a miss-list and
        send it to the embedder). Rows without sparse data come back
        with an empty ``SparseVec``.
        """
        hash_list = list(hashes)
        if not hash_list:
            return {}
        # SQLite's parameter limit is 999 by default; chunk to stay safe
        # (800 hashes + 1 model parameter per statement).
        out: dict[str, HybridVec] = {}
        for i in range(0, len(hash_list), 800):
            batch = hash_list[i : i + 800]
            placeholders = ",".join("?" * len(batch))
            rows = self.conn.execute(
                f"""
                SELECT chunk_hash, dense, sparse_idx, sparse_val
                FROM embed_cache
                WHERE model = ? AND chunk_hash IN ({placeholders})
                """,
                (model, *batch),
            ).fetchall()
            for chunk_hash, dense_blob, idx_blob, val_blob in rows:
                indices = _unpack_ints(idx_blob) if idx_blob else []
                values = _unpack_floats(val_blob) if val_blob else []
                out[chunk_hash] = HybridVec(
                    dense=_unpack_floats(dense_blob),
                    sparse=SparseVec(indices=indices, values=values),
                )
        # Count per *requested* hash. ``out`` holds unique keys only, so
        # using ``len(out)`` would book a repeated, cached hash as one hit
        # plus one miss.
        found = sum(1 for h in hash_list if h in out)
        self.hits += found
        self.misses += len(hash_list) - found
        return out

    # ------------------------------------------------------------ write

    def put_many(
        self,
        items: Iterable[tuple[str, HybridVec]],
        model: str,
    ) -> int:
        """Insert (hash, vec) pairs for ``model``. Returns count written.

        Existing rows for the same ``(hash, model)`` are replaced and
        their timestamp is reset to now. Empty sparse components are
        stored as NULL.
        """
        rows = []
        now = time.time()
        for chunk_hash, vec in items:
            rows.append(
                (
                    chunk_hash,
                    model,
                    _pack_floats(vec.dense),
                    _pack_ints(vec.sparse.indices) if vec.sparse.indices else None,
                    _pack_floats(vec.sparse.values) if vec.sparse.values else None,
                    now,
                )
            )
        if not rows:
            return 0
        # ``with self.conn`` commits on success and rolls back on error.
        with self.conn:
            self.conn.executemany(
                """
                INSERT OR REPLACE INTO embed_cache
                (chunk_hash, model, dense, sparse_idx, sparse_val, ts)
                VALUES (?, ?, ?, ?, ?, ?)
                """,
                rows,
            )
        return len(rows)

    # ----------------------------------------------------------- admin

    def stats(self) -> dict[str, int]:
        """Return the total row count plus this instance's hit/miss counters."""
        rows = self.conn.execute(
            "SELECT COUNT(*) FROM embed_cache"
        ).fetchone()
        return {
            "total_entries": int(rows[0]),
            "hits": self.hits,
            "misses": self.misses,
        }

    def vacuum_older_than(self, seconds: float) -> int:
        """Drop entries last *written* before ``now - seconds``.

        ``ts`` is set by ``put_many`` only; reads do not refresh it, so an
        entry that is hit on every run is still removed once its last
        write is older than the cutoff. Returns the number of rows
        deleted. This deletes rows; it does not run SQLite ``VACUUM``.
        """
        cutoff = time.time() - seconds
        cur = self.conn.execute(
            "DELETE FROM embed_cache WHERE ts < ?", (cutoff,)
        )
        self.conn.commit()
        return cur.rowcount

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self.conn.close()

    def __enter__(self) -> EmbedCache:
        return self

    def __exit__(self, *exc: object) -> None:
        self.close()