flurryx-code-memory 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. code_memory/__init__.py +1 -0
  2. code_memory/claims/__init__.py +32 -0
  3. code_memory/claims/extractor.py +325 -0
  4. code_memory/claims/indexer.py +258 -0
  5. code_memory/claims/resolver.py +186 -0
  6. code_memory/claims/store.py +424 -0
  7. code_memory/cli.py +1192 -0
  8. code_memory/config.py +268 -0
  9. code_memory/embed/__init__.py +224 -0
  10. code_memory/embed/cache.py +204 -0
  11. code_memory/embed/m3.py +174 -0
  12. code_memory/embed/ollama.py +92 -0
  13. code_memory/embed/tei.py +106 -0
  14. code_memory/episodic/__init__.py +3 -0
  15. code_memory/episodic/sqlite_store.py +278 -0
  16. code_memory/extractor/__init__.py +3 -0
  17. code_memory/extractor/csproj.py +166 -0
  18. code_memory/extractor/dll.py +385 -0
  19. code_memory/extractor/gitignore.py +162 -0
  20. code_memory/extractor/nuget.py +275 -0
  21. code_memory/extractor/sanity.py +124 -0
  22. code_memory/extractor/sln.py +108 -0
  23. code_memory/extractor/treesitter.py +1172 -0
  24. code_memory/graph/__init__.py +3 -0
  25. code_memory/graph/falkor_store.py +740 -0
  26. code_memory/mcp_server.py +1816 -0
  27. code_memory/metrics.py +260 -0
  28. code_memory/orchestrator/__init__.py +13 -0
  29. code_memory/orchestrator/git_delta.py +211 -0
  30. code_memory/orchestrator/ingest_state.py +71 -0
  31. code_memory/orchestrator/pipeline.py +1478 -0
  32. code_memory/orchestrator/reset.py +130 -0
  33. code_memory/orchestrator/resolver.py +825 -0
  34. code_memory/orchestrator/retrieve.py +505 -0
  35. code_memory/resilience.py +73 -0
  36. code_memory/sync/__init__.py +20 -0
  37. code_memory/sync/autostart/__init__.py +42 -0
  38. code_memory/sync/autostart/base.py +106 -0
  39. code_memory/sync/autostart/launchd.py +115 -0
  40. code_memory/sync/autostart/schtasks.py +155 -0
  41. code_memory/sync/autostart/systemd.py +113 -0
  42. code_memory/sync/hooks.py +164 -0
  43. code_memory/sync/safety.py +65 -0
  44. code_memory/sync/snapshot.py +461 -0
  45. code_memory/sync/store.py +399 -0
  46. code_memory/sync/sync.py +405 -0
  47. code_memory/sync/watcher.py +320 -0
  48. code_memory/vector/__init__.py +3 -0
  49. code_memory/vector/qdrant_store.py +302 -0
  50. flurryx_code_memory-0.4.0.dist-info/METADATA +26 -0
  51. flurryx_code_memory-0.4.0.dist-info/RECORD +53 -0
  52. flurryx_code_memory-0.4.0.dist-info/WHEEL +4 -0
  53. flurryx_code_memory-0.4.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,174 @@
1
+ """BGE-M3 hybrid embedder: dense + sparse from one forward pass.
2
+
3
+ Opt-in backend (``EMBED_BACKEND=flagembed``). Loads m3 in-process via
4
+ FlagEmbedding, which means each Python process pays a ~5-15 s
5
+ cold-load. Worth it for long-lived processes (watcher, MCP server)
6
+ that want the sparse signal; not worth it for hook-driven per-save
7
+ CLI invocations — :class:`code_memory.embed.OllamaEmbedder` is the
8
+ default for that reason.
9
+
10
+ m3 emits three views per input:
11
+
12
+ * Dense (1024-d float) — semantic similarity (cosine).
13
+ * Sparse (token-id -> weight) — lexical/identifier signal akin to BM25
14
+ but learned. Used for code search where exact symbol names matter.
15
+ * ColBERT multi-vec — not used here; cross-encoder rerank covers the
16
+ late-interaction case.
17
+
18
+ Fusion happens server-side in Qdrant (RRF / DBSF), so both views are
19
+ stored alongside each chunk and combined at query time.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import logging
25
+ import platform
26
+ from collections.abc import Sequence
27
+ from dataclasses import dataclass
28
+ from typing import Any
29
+
30
+ from ..config import CONFIG
31
+
32
+ log = logging.getLogger(__name__)
33
+
34
+ # FlagEmbedding requires a HF repo id. The legacy ``EMBED_MODEL`` env
35
+ # var used the Ollama short name (``bge-m3``), which HF rejects, so we
36
+ # remap it here. Other models pass through unchanged.
37
+ _OLLAMA_TO_HF = {"bge-m3": "BAAI/bge-m3"}
38
+ DEFAULT_MODEL = "BAAI/bge-m3"
39
+
40
+
41
def _resolve_model(name: str | None) -> str:
    """Return the model id that FlagEmbedding should load.

    The first truthy value among ``name``, ``CONFIG.embed_model`` and
    ``DEFAULT_MODEL`` is stripped of surrounding whitespace.  If the
    stripped value is a key of ``_OLLAMA_TO_HF`` the mapped value is
    returned; any other value is returned as-is.
    """
    candidate = name or CONFIG.embed_model or DEFAULT_MODEL
    candidate = candidate.strip()
    if candidate in _OLLAMA_TO_HF:
        return _OLLAMA_TO_HF[candidate]
    return candidate
44
+
45
+
46
@dataclass(frozen=True)
class SparseVec:
    """Immutable sparse vector held as two parallel lists.

    This is the (indices, values) layout Qdrant uses: ``indices[k]`` and
    ``values[k]`` together describe one entry.
    """

    # Position of each stored entry.
    indices: list[int]
    # Weight of the entry at the same offset in ``indices``.
    values: list[float]
52
+
53
+
54
@dataclass(frozen=True)
class HybridVec:
    """Immutable pairing of the dense and sparse embedding of one input."""

    # Dense embedding as plain Python floats.
    dense: list[float]
    # Sparse embedding in the (indices, values) layout.
    sparse: SparseVec
58
+
59
+
60
+ def _detect_device() -> str:
61
+ """Best available accelerator; falls back to CPU."""
62
+ try:
63
+ import torch
64
+ except ImportError:
65
+ return "cpu"
66
+ if platform.system() == "Darwin" and torch.backends.mps.is_available():
67
+ return "mps"
68
+ if torch.cuda.is_available():
69
+ return "cuda"
70
+ return "cpu"
71
+
72
+
73
class M3Embedder:
    """Stateful BGE-M3 wrapper producing dense + sparse vectors.

    Heavy to construct (downloads + loads ~2.3GB on first use), so keep
    one instance alive for the process lifetime instead of building one
    per call.  Usable as a context manager; leaving the ``with`` block
    calls :meth:`close`, after which :meth:`embed` raises
    ``RuntimeError`` for non-empty input.
    """

    def __init__(
        self,
        model: str | None = None,
        device: str | None = None,
        use_fp16: bool | None = None,
        batch_size: int = 12,
    ) -> None:
        """Load the model.

        Args:
            model: Model name, resolved through ``_resolve_model``.
            device: Device string; chosen by ``_detect_device`` when falsy.
            use_fp16: Half-precision switch.  ``None`` enables it only
                when the device is ``"cuda"`` or ``"mps"``.
            batch_size: Batch size passed to ``encode`` by :meth:`embed`.
        """
        # Imported here rather than at module level, so importing this
        # module does not itself require FlagEmbedding.
        from FlagEmbedding import BGEM3FlagModel

        self.model_name = _resolve_model(model)
        self.device = device or _detect_device()
        # fp16 only safe on CUDA/MPS; CPU stays at fp32 for numerical
        # stability + because some BLAS kernels don't support fp16.
        if use_fp16 is None:
            use_fp16 = self.device in ("cuda", "mps")
        self.batch_size = batch_size
        log.info(
            "m3: loading %s (device=%s fp16=%s)",
            self.model_name,
            self.device,
            use_fp16,
        )
        self._impl = BGEM3FlagModel(
            self.model_name,
            use_fp16=use_fp16,
            devices=self.device,
        )

    # ----------------------------------------------------------- batch

    def embed(self, texts: Sequence[str]) -> list[HybridVec]:
        """Embed ``texts`` and return one :class:`HybridVec` per input, in order.

        Returns an empty list for empty input without touching the model.

        Raises:
            RuntimeError: if the embedder has been closed, or the model
                returned a different number of rows than inputs.
        """
        if not texts:
            return []
        if self._impl is None:
            # close() dropped the model; fail with a clear message
            # instead of an AttributeError on None.
            raise RuntimeError("M3Embedder is closed")
        out = self._impl.encode(
            list(texts),
            batch_size=self.batch_size,
            return_dense=True,
            return_sparse=True,
            return_colbert_vecs=False,
        )
        dense = out["dense_vecs"]
        sparse = out["lexical_weights"]
        if len(dense) != len(texts) or len(sparse) != len(texts):
            raise RuntimeError(
                f"m3 returned {len(dense)} dense / {len(sparse)} sparse rows "
                f"for {len(texts)} inputs"
            )
        return [
            HybridVec(
                dense=[float(x) for x in dense_row],
                sparse=_to_qdrant_sparse(weights),
            )
            for dense_row, weights in zip(dense, sparse)
        ]

    def embed_one(self, text: str) -> HybridVec:
        """Embed a single string."""
        return self.embed([text])[0]

    # ------------------------------------------------------------ misc

    def close(self) -> None:
        """Release the model reference.  Safe to call more than once."""
        # FlagEmbedding has no explicit close; drop the reference to free
        # GPU mem on next gc cycle.
        self._impl = None

    def __enter__(self) -> M3Embedder:
        return self

    def __exit__(self, *exc: object) -> None:
        self.close()
146
+
147
+
148
def _to_qdrant_sparse(weights: dict[Any, Any]) -> SparseVec:
    """Convert m3 ``{token_id: weight}`` mapping to Qdrant sparse format.

    m3 returns numpy floats keyed by string token IDs. Qdrant wants
    plain ints and floats, so every entry is converted explicitly and
    misbehaving entries are dropped rather than poisoning the index.

    An entry is dropped when:

    * the key cannot be converted with ``int()`` or is negative,
    * the weight cannot be converted with ``float()``,
    * the weight is NaN, infinite, zero or negative.

    Surviving entries keep the iteration order of ``weights``.
    """
    import math

    indices: list[int] = []
    values: list[float] = []
    for tok, w in weights.items():
        try:
            idx = int(tok)
            val = float(w)
        except (TypeError, ValueError):
            # Non-numeric token id or weight: skip just this entry.
            continue
        if idx < 0:
            continue
        # isfinite() rejects NaN and +/-inf in one check.
        if not math.isfinite(val) or val <= 0.0:
            continue
        indices.append(idx)
        values.append(val)
    return SparseVec(indices=indices, values=values)
169
+
170
+
171
+ # Factory + singleton live in ``code_memory.embed.__init__`` so the
172
+ # Ollama and M3 backends share one selection mechanism. ``M3Embedder``
173
+ # itself remains directly constructible for tests and for users who
174
+ # want to bypass the env-var dispatch.
@@ -0,0 +1,92 @@
1
+ """Ollama-backed dense embedder (default backend).
2
+
3
+ Runs `bge-m3` (or any Ollama-served model) over HTTP. Ollama keeps the
4
+ model loaded in its own daemon, so short-lived CLI processes (e.g.
5
+ ``code-memory reingest <file>`` invoked from a save-file hook) reuse
6
+ the warm model instead of paying a ~5-15 s cold load every call.
7
+
8
+ Trade-off vs the in-process FlagEmbedding path: Ollama only exposes the
9
+ dense head of m3 — no sparse, no ColBERT. Sparse is returned as an
10
+ empty :class:`SparseVec` so the Qdrant hybrid layout still upserts
11
+ cleanly; queries through the hybrid slot then degrade to dense-only at
12
+ RRF time. Users who want true m3 hybrid (dense + sparse from one
13
+ forward pass) can flip ``EMBED_BACKEND=flagembed`` and accept the
14
+ cold-load cost.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ from collections.abc import Sequence
20
+
21
+ import logging
22
+
23
+ import httpx
24
+
25
+ from ..config import CONFIG
26
+ from ..resilience import with_retry
27
+ from .m3 import HybridVec, SparseVec
28
+
29
+ log_ = logging.getLogger(__name__)
30
+
31
+
32
class OllamaEmbedder:
    """Thin sync wrapper over Ollama /api/embed.

    Returns :class:`HybridVec` with an empty sparse component so the
    shape matches :class:`M3Embedder`. The empty sparse vector is a
    deliberate signal to :class:`QdrantStore` that hybrid fusion will
    degrade to dense-only for this point.

    Usable as a context manager; leaving the ``with`` block closes the
    underlying HTTP client.
    """

    def __init__(
        self,
        url: str | None = None,
        model: str | None = None,
        timeout: float = 300.0,
    ) -> None:
        """Create the HTTP client.

        Args:
            url: Base URL of the Ollama server; ``CONFIG.ollama_url``
                when falsy.  Trailing slashes are removed.
            model: Model name sent with each request;
                ``CONFIG.embed_model`` when falsy.
            timeout: Timeout passed to ``httpx.Client``.
        """
        self.url = (url or CONFIG.ollama_url).rstrip("/")
        self.model = model or CONFIG.embed_model
        self._client = httpx.Client(timeout=timeout)

    def embed(self, texts: Sequence[str]) -> list[HybridVec]:
        """Embed ``texts`` and return one :class:`HybridVec` per input.

        Returns an empty list for empty input without making a request.
        The HTTP call runs through ``with_retry`` (``max_retries=3``,
        ``backoff_s=1.0``) and each retry is logged as a warning.

        Raises:
            RuntimeError: if the response has no ``embeddings`` key, or
                the number of vectors differs from the number of inputs.
        """
        if not texts:
            return []

        def _call():
            res = self._client.post(
                f"{self.url}/api/embed",
                json={"model": self.model, "input": list(texts)},
            )
            res.raise_for_status()
            data = res.json()
            embeddings = data.get("embeddings")
            if embeddings is None:
                raise RuntimeError(f"Ollama returned no embeddings: {data}")
            return embeddings

        embeddings = with_retry(
            _call,
            max_retries=3,
            backoff_s=1.0,
            on_retry=lambda attempt, exc: log_.warning(
                "ollama embed retry %d/3 after %s", attempt, exc
            ),
        )

        # Same guard TEIEmbedder applies: a short or long response would
        # otherwise silently pair vectors with the wrong inputs.
        if len(embeddings) != len(texts):
            raise RuntimeError(
                f"Ollama returned {len(embeddings)} vectors for {len(texts)} inputs"
            )

        empty = SparseVec(indices=[], values=[])
        return [
            HybridVec(dense=[float(x) for x in vec], sparse=empty)
            for vec in embeddings
        ]

    def embed_one(self, text: str) -> HybridVec:
        """Embed a single string."""
        return self.embed([text])[0]

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()

    def __enter__(self) -> OllamaEmbedder:
        return self

    def __exit__(self, *exc: object) -> None:
        self.close()
@@ -0,0 +1,106 @@
1
+ """text-embeddings-inference (TEI) backend.
2
+
3
+ `HuggingFace TEI <https://github.com/huggingface/text-embeddings-inference>`_
4
+ is a purpose-built embedding server. On a Linux + NVIDIA host with the
5
+ same ``BAAI/bge-m3`` weights, it serves embeddings at **5-10× the
6
+ throughput** of Ollama because:
7
+
8
+ * Built on ONNX Runtime / candle-rs with native CUDA batching.
9
+ * Streams + dynamically batches concurrent requests instead of
10
+ serialising one-at-a-time.
11
+ * No HTTP-to-llama.cpp daemon hop per call.
12
+
13
+ For enterprise CI / staging where the cold ingest of a large monorepo
14
+ matters, this is the way to break the ``bge-m3`` throughput floor
15
+ without changing models or losing semantic recall.
16
+
17
+ Trade-off vs ``OllamaEmbedder``:
18
+
19
+ * Same shape (dense-only ``HybridVec`` with empty sparse) so callers
20
+ swap backends transparently.
21
+ * TEI must be running before code-memory ingests; Ollama-style "I
22
+ brought my own daemon" still applies.
23
+ * On Mac (no NVIDIA), TEI's CPU path is roughly on par with Ollama's
24
+ Metal path — there's no advantage. Stay on Ollama there.
25
+
26
+ Activated via ``EMBED_BACKEND=tei``; URL via ``TEI_URL``.
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ from collections.abc import Sequence
32
+
33
+ import httpx
34
+
35
+ from ..config import CONFIG
36
+ from .m3 import HybridVec, SparseVec
37
+
38
+
39
class TEIEmbedder:
    """Synchronous client for TEI's ``/embed`` endpoint.

    Produces :class:`HybridVec` objects whose sparse part is empty, the
    same shape :class:`OllamaEmbedder` and :class:`M3Embedder` return,
    so callers (pipeline, retrieve) need no branching on backend type.

    How the TEI request differs from Ollama's:

    * Endpoint: ``POST /embed``
    * Body: ``{"inputs": [...]}``
    * Response: ``[[float, ...], [float, ...]]`` (a bare list of
      vectors, with no wrapping object).

    ``truncate=true`` is sent with every request so over-length chunks
    are truncated to the model's max sequence length instead of failing
    the whole batch, matching the forgiving semantics Ollama applies.
    """

    def __init__(
        self,
        url: str | None = None,
        timeout: float = 300.0,
    ) -> None:
        # No model id travels with TEI requests: the daemon is started
        # with a single ``--model-id``.  ``self.model`` is kept only for
        # parity with :class:`OllamaEmbedder` and comes from
        # ``EMBED_MODEL`` so cache keys line up across backends that
        # point at the same weights.
        base = url or CONFIG.tei_url
        self.url = base.rstrip("/")
        self.model = CONFIG.embed_model
        self._client = httpx.Client(timeout=timeout)

    def embed(self, texts: Sequence[str]) -> list[HybridVec]:
        """Embed ``texts``; returns one :class:`HybridVec` per input.

        Empty input returns ``[]`` without a request.  Raises
        ``RuntimeError`` when the response is not a list or its length
        differs from ``len(texts)``.
        """
        if not texts:
            return []

        body = {"inputs": list(texts), "truncate": True}
        response = self._client.post(f"{self.url}/embed", json=body)
        response.raise_for_status()

        # Expected payload: a bare list of vectors, one per input and in
        # input order, with no wrapper key.
        data = response.json()
        if not isinstance(data, list):
            raise RuntimeError(f"TEI returned unexpected shape: {type(data).__name__}")
        if len(data) != len(texts):
            raise RuntimeError(
                f"TEI returned {len(data)} vectors for {len(texts)} inputs"
            )

        no_sparse = SparseVec(indices=[], values=[])
        result: list[HybridVec] = []
        for vec in data:
            result.append(HybridVec(dense=list(map(float, vec)), sparse=no_sparse))
        return result

    def embed_one(self, text: str) -> HybridVec:
        """Embed a single string."""
        (only,) = self.embed([text])[:1] or (None,)
        if only is None:
            # Mirrors indexing an empty result.
            raise IndexError("list index out of range")
        return only

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()

    def __enter__(self) -> TEIEmbedder:
        return self

    def __exit__(self, *exc: object) -> None:
        self.close()
@@ -0,0 +1,3 @@
1
+ from .sqlite_store import Episode, EpisodicStore
2
+
3
+ __all__ = ["Episode", "EpisodicStore"]
@@ -0,0 +1,278 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ import sqlite3
6
+ import time
7
+ import uuid
8
+ from dataclasses import asdict, dataclass, field
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ from ..config import CONFIG
13
+
14
+ # Base table — kept minimal so a legacy DB opens without errors. Every
15
+ # additional column lives in ``_MIGRATIONS`` so loading an old database
16
+ # transparently catches it up to the latest schema.
17
+ _BASE_SCHEMA = """
18
+ CREATE TABLE IF NOT EXISTS episodes (
19
+ id TEXT PRIMARY KEY,
20
+ ts REAL NOT NULL,
21
+ prompt TEXT NOT NULL,
22
+ plan TEXT,
23
+ patch TEXT,
24
+ verdict TEXT,
25
+ tags TEXT,
26
+ meta TEXT
27
+ );
28
+ """
29
+
30
+ # Idempotent migrations. Each statement is run independently; failures
31
+ # (e.g. "duplicate column" when the migration has already been applied)
32
+ # are swallowed because that's the success path for an idempotent
33
+ # migration. Indexes that reference migration-added columns must come
34
+ # AFTER the corresponding ADD COLUMN, hence interleaved here.
35
+ _MIGRATIONS = (
36
+ "ALTER TABLE episodes ADD COLUMN head_sha TEXT",
37
+ "CREATE INDEX IF NOT EXISTS idx_episodes_ts ON episodes(ts)",
38
+ "CREATE INDEX IF NOT EXISTS idx_episodes_verdict ON episodes(verdict)",
39
+ "CREATE INDEX IF NOT EXISTS idx_episodes_head_sha ON episodes(head_sha)",
40
+ # Content hash for dedup. Same user prompt re-asserted across turns
41
+ # produced one row per assertion before; now the existing row gets
42
+ # its ``ts`` refreshed and the new insert is a no-op. Non-unique by
43
+ # design so legacy rows (NULL hash) still load without conflict.
44
+ "ALTER TABLE episodes ADD COLUMN content_hash TEXT",
45
+ "CREATE INDEX IF NOT EXISTS idx_episodes_content_hash ON episodes(content_hash)",
46
+ )
47
+
48
+
49
+ def _content_hash(prompt: str) -> str:
50
+ """SHA-256 over the user prompt, normalized.
51
+
52
+ Dedup key is prompt-only on purpose: the same prompt typed twice
53
+ represents the same intent, regardless of which plan/patch/verdict
54
+ the agent eventually produced. Whitespace is normalized so a
55
+ trailing newline doesn't split otherwise-identical rows.
56
+ """
57
+ return hashlib.sha256(prompt.strip().encode("utf-8")).hexdigest()
58
+
59
+
60
@dataclass
class Episode:
    """One recorded prompt plus whatever the agent produced for it.

    Only ``prompt`` is required.  ``id`` defaults to a fresh UUID4
    string and ``ts`` to ``time.time()`` at construction.
    """

    prompt: str
    plan: str | None = None
    patch: str | None = None
    # Outcome label: pass | fail | partial
    verdict: str | None = None
    tags: list[str] = field(default_factory=list)
    meta: dict[str, Any] = field(default_factory=dict)
    id: str = field(default_factory=lambda: str(uuid.uuid4()))
    ts: float = field(default_factory=time.time)
    # Git HEAD when the episode was recorded; ties the agent's work to
    # the code state the graph was indexing at that moment.
    head_sha: str | None = None
73
+
74
+
75
class EpisodicStore:
    """SQLite-backed persistence for :class:`Episode` records.

    Opening a store creates the ``episodes`` table if needed and replays
    ``_MIGRATIONS`` so an older database file gains the newer columns
    and indexes.  Rows are deduplicated on a hash of the prompt (see
    :meth:`add`).  Usable as a context manager; leaving the ``with``
    block closes the connection.
    """

    # Column list shared by every query that feeds ``_row_to_episode``.
    _COLUMNS = "id, ts, prompt, plan, patch, verdict, tags, meta, head_sha"
    # Ids bound per ``IN (...)`` query, so by_ids stays under SQLite's
    # bound-parameter limit however many ids the caller passes.
    _ID_CHUNK = 500

    def __init__(self, path: Path | None = None) -> None:
        """Open (creating if necessary) the database at ``path``.

        Args:
            path: Database file; ``CONFIG.episodic_db`` when falsy.  The
                parent directory is created if missing.

        Raises:
            sqlite3.OperationalError: if a migration fails for any
                reason other than its column already existing.
        """
        self.path = path or CONFIG.episodic_db
        self.path.parent.mkdir(parents=True, exist_ok=True)
        self.conn = sqlite3.connect(self.path)
        self.conn.executescript(_BASE_SCHEMA)
        for stmt in _MIGRATIONS:
            try:
                self.conn.execute(stmt)
            except sqlite3.OperationalError as exc:
                # "duplicate column name" means the ADD COLUMN was
                # already applied, which is the success path for an
                # idempotent migration.  Anything else (locked database,
                # I/O error, bad SQL) is a real failure and must surface.
                if "duplicate column name" not in str(exc).lower():
                    self.conn.close()
                    raise
        self.conn.commit()

    def add(self, ep: Episode) -> str:
        """Insert an episode, deduping on prompt content.

        If an existing row has the same ``content_hash``, refresh its
        ``ts`` to ``ep.ts`` and fill any previously-NULL fields from
        the new episode (plan/patch/verdict/head_sha). Tags are unioned
        and meta is merged with new values winning on key collision.
        Returns the existing row's id so vector upserts stay idempotent;
        otherwise inserts a new row and returns ``ep.id``.
        """
        hash_ = _content_hash(ep.prompt)
        existing = self.conn.execute(
            "SELECT id, plan, patch, verdict, head_sha, tags, meta "
            "FROM episodes WHERE content_hash = ? LIMIT 1",
            (hash_,),
        ).fetchone()
        if existing is not None:
            return self._refresh_existing(existing, ep)

        self.conn.execute(
            "INSERT INTO episodes(id, ts, prompt, plan, patch, verdict, tags, meta, head_sha, content_hash) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
            (
                ep.id,
                ep.ts,
                ep.prompt,
                ep.plan,
                ep.patch,
                ep.verdict,
                json.dumps(ep.tags),
                json.dumps(ep.meta),
                ep.head_sha,
                hash_,
            ),
        )
        self.conn.commit()
        return ep.id

    def _refresh_existing(
        self, existing: tuple[Any, ...], ep: Episode
    ) -> str:
        """Merge ``ep`` into the row described by ``existing``; return its id.

        ``existing`` is the ``(id, plan, patch, verdict, head_sha, tags,
        meta)`` row selected by :meth:`add`.  Only id, tags and meta are
        read here: the scalar columns are merged in SQL with
        ``COALESCE``, which keeps a stored non-NULL value and otherwise
        takes the new one.
        """
        existing_id = str(existing[0])
        old_tags = json.loads(existing[5]) if existing[5] else []
        old_meta = json.loads(existing[6]) if existing[6] else {}

        # dict.fromkeys de-duplicates while preserving first-seen order.
        merged_tags = list(dict.fromkeys([*old_tags, *ep.tags]))
        merged_meta = {**old_meta, **ep.meta}

        self.conn.execute(
            "UPDATE episodes SET "
            "  ts = ?, "
            "  plan = COALESCE(plan, ?), "
            "  patch = COALESCE(patch, ?), "
            "  verdict = COALESCE(verdict, ?), "
            "  head_sha = COALESCE(head_sha, ?), "
            "  tags = ?, "
            "  meta = ? "
            "WHERE id = ?",
            (
                ep.ts,
                # Empty strings are stored as NULL so they never count
                # as a filled-in value.
                ep.plan or None,
                ep.patch or None,
                ep.verdict or None,
                ep.head_sha,
                json.dumps(merged_tags),
                json.dumps(merged_meta),
                existing_id,
            ),
        )
        self.conn.commit()
        return existing_id

    def dedupe(self) -> dict[str, list[str]]:
        """Compact pre-existing duplicates in the table.

        For each ``content_hash`` group with >1 row, keep the row with
        the oldest ``ts`` (first observation), update its ``ts`` to
        ``MAX(ts)`` of the group so retrieval still surfaces it as
        recent, and delete the rest. Returns ``{kept_id: [removed_ids]}``
        so callers (e.g. the orchestrator) can prune matching vectors.

        Backfills ``content_hash`` for legacy NULL rows on the fly.  The
        whole pass runs in one transaction: it is committed on success
        and rolled back if any statement raises.
        """
        removed: dict[str, list[str]] = {}
        with self.conn:
            null_rows = self.conn.execute(
                "SELECT id, prompt FROM episodes WHERE content_hash IS NULL"
            ).fetchall()
            self.conn.executemany(
                "UPDATE episodes SET content_hash = ? WHERE id = ?",
                [(_content_hash(prompt), ep_id) for ep_id, prompt in null_rows],
            )

            groups = self.conn.execute(
                "SELECT content_hash FROM episodes "
                "WHERE content_hash IS NOT NULL "
                "GROUP BY content_hash HAVING COUNT(*) > 1"
            ).fetchall()

            for (hash_,) in groups:
                rows = self.conn.execute(
                    "SELECT id, ts FROM episodes WHERE content_hash = ? "
                    "ORDER BY ts ASC",
                    (hash_,),
                ).fetchall()
                keep_id = str(rows[0][0])
                max_ts = max(float(r[1]) for r in rows)
                del_ids = [str(r[0]) for r in rows[1:]]
                self.conn.execute(
                    "UPDATE episodes SET ts = ? WHERE id = ?", (max_ts, keep_id)
                )
                self.conn.executemany(
                    "DELETE FROM episodes WHERE id = ?", [(d,) for d in del_ids]
                )
                removed[keep_id] = del_ids
        return removed

    def get(self, ep_id: str) -> Episode | None:
        """Return the episode with id ``ep_id``, or ``None`` if absent."""
        row = self.conn.execute(
            f"SELECT {self._COLUMNS} FROM episodes WHERE id = ?",
            (ep_id,),
        ).fetchone()
        if row is None:
            return None
        return _row_to_episode(row)

    def recent(self, limit: int = 20) -> list[Episode]:
        """Return up to ``limit`` episodes, newest ``ts`` first."""
        rows = self.conn.execute(
            f"SELECT {self._COLUMNS} FROM episodes ORDER BY ts DESC LIMIT ?",
            (limit,),
        ).fetchall()
        return [_row_to_episode(r) for r in rows]

    def by_ids(self, ids: list[str]) -> list[Episode]:
        """Return the episodes whose id is in ``ids``.

        Unknown ids are ignored and each matching episode is returned
        once.  Result order is not tied to the order of ``ids``.
        """
        if not ids:
            return []
        # De-duplicate first so an id that would land in two chunks
        # cannot yield the same episode twice.
        unique_ids = list(dict.fromkeys(ids))
        episodes: list[Episode] = []
        for start in range(0, len(unique_ids), self._ID_CHUNK):
            chunk = unique_ids[start : start + self._ID_CHUNK]
            placeholders = ",".join("?" for _ in chunk)
            rows = self.conn.execute(
                f"SELECT {self._COLUMNS} "
                f"FROM episodes WHERE id IN ({placeholders})",
                chunk,
            ).fetchall()
            episodes.extend(_row_to_episode(r) for r in rows)
        return episodes

    def close(self) -> None:
        """Close the database connection."""
        self.conn.close()

    def __enter__(self) -> EpisodicStore:
        return self

    def __exit__(self, *exc: object) -> None:
        self.close()
246
+
247
+
248
def _row_to_episode(row: tuple[Any, ...]) -> Episode:
    """Build an :class:`Episode` from a positional database row.

    Expected column order: id, ts, prompt, plan, patch, verdict, tags,
    meta and, optionally, head_sha.  ``tags`` and ``meta`` are decoded
    from JSON; a NULL or empty value becomes ``[]`` / ``{}``.  A row
    without a ninth column yields ``head_sha=None``.
    """
    raw_tags = row[6]
    raw_meta = row[7]
    tags = json.loads(raw_tags) if raw_tags else []
    meta = json.loads(raw_meta) if raw_meta else {}
    head_sha = None
    if len(row) > 8:
        head_sha = row[8]
    return Episode(
        prompt=row[2],
        plan=row[3],
        patch=row[4],
        verdict=row[5],
        tags=tags,
        meta=meta,
        id=row[0],
        ts=row[1],
        head_sha=head_sha,
    )
260
+
261
+
262
def episode_text(ep: Episode) -> str:
    """Render an episode as the composite text used for embedding.

    The prompt section is always present.  Plan, patch and verdict
    sections follow, in that order, each only when its value is truthy.
    Sections are separated by a blank line.
    """
    optional_sections = [
        f"PLAN:\n{ep.plan}" if ep.plan else None,
        f"PATCH:\n{ep.patch}" if ep.patch else None,
        f"VERDICT: {ep.verdict}" if ep.verdict else None,
    ]
    sections = [f"PROMPT:\n{ep.prompt}"]
    sections.extend(s for s in optional_sections if s is not None)
    return "\n\n".join(sections)
272
+
273
+
274
def episode_payload(ep: Episode) -> dict[str, Any]:
    """Return the episode's fields as a dict without ``plan`` and ``patch``.

    The dict comes from ``dataclasses.asdict``, so the remaining keys
    keep field-declaration order.
    """
    omitted = ("plan", "patch")
    return {key: value for key, value in asdict(ep).items() if key not in omitted}
@@ -0,0 +1,3 @@
1
+ from .treesitter import ExtractedFile, Extractor, Symbol
2
+
3
+ __all__ = ["Extractor", "ExtractedFile", "Symbol"]