flurryx-code-memory 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_memory/__init__.py +1 -0
- code_memory/claims/__init__.py +32 -0
- code_memory/claims/extractor.py +325 -0
- code_memory/claims/indexer.py +258 -0
- code_memory/claims/resolver.py +186 -0
- code_memory/claims/store.py +424 -0
- code_memory/cli.py +1192 -0
- code_memory/config.py +268 -0
- code_memory/embed/__init__.py +224 -0
- code_memory/embed/cache.py +204 -0
- code_memory/embed/m3.py +174 -0
- code_memory/embed/ollama.py +92 -0
- code_memory/embed/tei.py +106 -0
- code_memory/episodic/__init__.py +3 -0
- code_memory/episodic/sqlite_store.py +278 -0
- code_memory/extractor/__init__.py +3 -0
- code_memory/extractor/csproj.py +166 -0
- code_memory/extractor/dll.py +385 -0
- code_memory/extractor/gitignore.py +162 -0
- code_memory/extractor/nuget.py +275 -0
- code_memory/extractor/sanity.py +124 -0
- code_memory/extractor/sln.py +108 -0
- code_memory/extractor/treesitter.py +1172 -0
- code_memory/graph/__init__.py +3 -0
- code_memory/graph/falkor_store.py +740 -0
- code_memory/mcp_server.py +1816 -0
- code_memory/metrics.py +260 -0
- code_memory/orchestrator/__init__.py +13 -0
- code_memory/orchestrator/git_delta.py +211 -0
- code_memory/orchestrator/ingest_state.py +71 -0
- code_memory/orchestrator/pipeline.py +1478 -0
- code_memory/orchestrator/reset.py +130 -0
- code_memory/orchestrator/resolver.py +825 -0
- code_memory/orchestrator/retrieve.py +505 -0
- code_memory/resilience.py +73 -0
- code_memory/sync/__init__.py +20 -0
- code_memory/sync/autostart/__init__.py +42 -0
- code_memory/sync/autostart/base.py +106 -0
- code_memory/sync/autostart/launchd.py +115 -0
- code_memory/sync/autostart/schtasks.py +155 -0
- code_memory/sync/autostart/systemd.py +113 -0
- code_memory/sync/hooks.py +164 -0
- code_memory/sync/safety.py +65 -0
- code_memory/sync/snapshot.py +461 -0
- code_memory/sync/store.py +399 -0
- code_memory/sync/sync.py +405 -0
- code_memory/sync/watcher.py +320 -0
- code_memory/vector/__init__.py +3 -0
- code_memory/vector/qdrant_store.py +302 -0
- flurryx_code_memory-0.4.0.dist-info/METADATA +26 -0
- flurryx_code_memory-0.4.0.dist-info/RECORD +53 -0
- flurryx_code_memory-0.4.0.dist-info/WHEEL +4 -0
- flurryx_code_memory-0.4.0.dist-info/entry_points.txt +3 -0
code_memory/metrics.py
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
import sqlite3
|
|
2
|
+
import time
|
|
3
|
+
import json
|
|
4
|
+
from dataclasses import dataclass, asdict
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
@dataclass
class RetrieveTiming:
    """Timing breakdown and hit counts for a single retrieve call.

    ``MetricsStore.record_retrieve`` writes every field of this record into
    the ``retrieves`` table. The ``*_ms`` suffix suggests the durations are
    milliseconds; nothing in this class enforces a unit.
    """

    # Query text associated with the measurement.
    query: str
    # Per-stage durations, followed by the overall duration.
    embed_ms: float
    code_search_ms: float
    eps_search_ms: float
    claims_ms: float
    total_ms: float
    # Hit counts per source; each defaults to zero when not supplied.
    code_hit_count: int = 0
    eps_hit_count: int = 0
    claims_hit_count: int = 0
|
|
19
|
+
|
|
20
|
+
class MetricsStore:
|
|
21
|
+
def __init__(self, path: Path):
|
|
22
|
+
self.path = Path(path)
|
|
23
|
+
self.path.parent.mkdir(parents=True, exist_ok=True)
|
|
24
|
+
self._init_db()
|
|
25
|
+
|
|
26
|
+
def _init_db(self):
|
|
27
|
+
with sqlite3.connect(str(self.path)) as conn:
|
|
28
|
+
conn.execute("""
|
|
29
|
+
CREATE TABLE IF NOT EXISTS retrieves (
|
|
30
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
31
|
+
query TEXT,
|
|
32
|
+
embed_ms REAL,
|
|
33
|
+
code_search_ms REAL,
|
|
34
|
+
eps_search_ms REAL,
|
|
35
|
+
claims_ms REAL,
|
|
36
|
+
total_ms REAL,
|
|
37
|
+
code_hit_count INTEGER DEFAULT 0,
|
|
38
|
+
eps_hit_count INTEGER DEFAULT 0,
|
|
39
|
+
claims_hit_count INTEGER DEFAULT 0,
|
|
40
|
+
ts REAL NOT NULL
|
|
41
|
+
)
|
|
42
|
+
""")
|
|
43
|
+
conn.execute("""
|
|
44
|
+
CREATE TABLE IF NOT EXISTS backend_health (
|
|
45
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
46
|
+
backend TEXT NOT NULL,
|
|
47
|
+
status TEXT NOT NULL,
|
|
48
|
+
latency_ms REAL,
|
|
49
|
+
ts REAL NOT NULL
|
|
50
|
+
)
|
|
51
|
+
""")
|
|
52
|
+
conn.execute("""
|
|
53
|
+
CREATE TABLE IF NOT EXISTS cache_stats (
|
|
54
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
55
|
+
hits INTEGER DEFAULT 0,
|
|
56
|
+
misses INTEGER DEFAULT 0,
|
|
57
|
+
ts REAL NOT NULL
|
|
58
|
+
)
|
|
59
|
+
""")
|
|
60
|
+
conn.execute("""
|
|
61
|
+
CREATE TABLE IF NOT EXISTS ingest_stats (
|
|
62
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
63
|
+
files INTEGER DEFAULT 0,
|
|
64
|
+
symbols INTEGER DEFAULT 0,
|
|
65
|
+
duration_s REAL,
|
|
66
|
+
ts REAL NOT NULL
|
|
67
|
+
)
|
|
68
|
+
""")
|
|
69
|
+
conn.execute("""
|
|
70
|
+
CREATE TABLE IF NOT EXISTS tool_calls (
|
|
71
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
72
|
+
tool TEXT NOT NULL,
|
|
73
|
+
project TEXT NOT NULL,
|
|
74
|
+
query_text TEXT,
|
|
75
|
+
output_chars INTEGER DEFAULT 0,
|
|
76
|
+
result_count INTEGER DEFAULT 0,
|
|
77
|
+
session_id TEXT,
|
|
78
|
+
ts REAL NOT NULL
|
|
79
|
+
)
|
|
80
|
+
""")
|
|
81
|
+
conn.execute("""
|
|
82
|
+
CREATE TABLE IF NOT EXISTS fs_reads (
|
|
83
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
84
|
+
tool TEXT NOT NULL,
|
|
85
|
+
path TEXT,
|
|
86
|
+
output_chars INTEGER DEFAULT 0,
|
|
87
|
+
session_id TEXT,
|
|
88
|
+
project TEXT NOT NULL,
|
|
89
|
+
ts REAL NOT NULL
|
|
90
|
+
)
|
|
91
|
+
""")
|
|
92
|
+
conn.commit()
|
|
93
|
+
|
|
94
|
+
def record_retrieve(self, m: RetrieveTiming):
|
|
95
|
+
with sqlite3.connect(str(self.path)) as conn:
|
|
96
|
+
conn.execute(
|
|
97
|
+
"INSERT INTO retrieves (query, embed_ms, code_search_ms, eps_search_ms, claims_ms, total_ms, code_hit_count, eps_hit_count, claims_hit_count, ts) VALUES (?,?,?,?,?,?,?,?,?,?)",
|
|
98
|
+
(m.query, m.embed_ms, m.code_search_ms, m.eps_search_ms, m.claims_ms, m.total_ms, m.code_hit_count, m.eps_hit_count, m.claims_hit_count, time.time())
|
|
99
|
+
)
|
|
100
|
+
conn.commit()
|
|
101
|
+
|
|
102
|
+
def record_backend_health(self, backend: str, status: str, latency_ms: float):
|
|
103
|
+
with sqlite3.connect(str(self.path)) as conn:
|
|
104
|
+
conn.execute(
|
|
105
|
+
"INSERT INTO backend_health (backend, status, latency_ms, ts) VALUES (?,?,?,?)",
|
|
106
|
+
(backend, status, latency_ms, time.time())
|
|
107
|
+
)
|
|
108
|
+
conn.commit()
|
|
109
|
+
|
|
110
|
+
def record_cache_stats(self, hits: int, misses: int):
|
|
111
|
+
with sqlite3.connect(str(self.path)) as conn:
|
|
112
|
+
conn.execute(
|
|
113
|
+
"INSERT INTO cache_stats (hits, misses, ts) VALUES (?,?,?)",
|
|
114
|
+
(hits, misses, time.time())
|
|
115
|
+
)
|
|
116
|
+
conn.commit()
|
|
117
|
+
|
|
118
|
+
def record_ingest(self, files: int, symbols: int, duration_s: float):
|
|
119
|
+
with sqlite3.connect(str(self.path)) as conn:
|
|
120
|
+
conn.execute(
|
|
121
|
+
"INSERT INTO ingest_stats (files, symbols, duration_s, ts) VALUES (?,?,?,?)",
|
|
122
|
+
(files, symbols, duration_s, time.time())
|
|
123
|
+
)
|
|
124
|
+
conn.commit()
|
|
125
|
+
|
|
126
|
+
def record_tool_call(self, tool: str, project: str, *, query_text: str = "", output_chars: int = 0, result_count: int = 0, session_id: str = ""):
|
|
127
|
+
with sqlite3.connect(str(self.path)) as conn:
|
|
128
|
+
conn.execute(
|
|
129
|
+
"INSERT INTO tool_calls (tool, project, query_text, output_chars, result_count, session_id, ts) VALUES (?,?,?,?,?,?,?)",
|
|
130
|
+
(tool, project, query_text or None, output_chars, result_count, session_id or None, time.time())
|
|
131
|
+
)
|
|
132
|
+
conn.commit()
|
|
133
|
+
|
|
134
|
+
def record_fs_read(self, tool: str, path: str, project: str, *, output_chars: int = 0, session_id: str = ""):
|
|
135
|
+
with sqlite3.connect(str(self.path)) as conn:
|
|
136
|
+
conn.execute(
|
|
137
|
+
"INSERT INTO fs_reads (tool, path, output_chars, session_id, project, ts) VALUES (?,?,?,?,?,?)",
|
|
138
|
+
(tool, path or None, output_chars, session_id or None, project, time.time())
|
|
139
|
+
)
|
|
140
|
+
conn.commit()
|
|
141
|
+
|
|
142
|
+
def tool_usage_summary(self, project: str | None = None) -> dict:
|
|
143
|
+
with sqlite3.connect(str(self.path)) as conn:
|
|
144
|
+
conn.row_factory = sqlite3.Row
|
|
145
|
+
where = " WHERE project=?" if project else ""
|
|
146
|
+
params = (project,) if project else ()
|
|
147
|
+
rows = conn.execute(
|
|
148
|
+
f"SELECT tool, COUNT(*) as calls, COALESCE(SUM(output_chars),0) as total_chars, COALESCE(AVG(output_chars),0) as avg_chars FROM tool_calls{where} GROUP BY tool ORDER BY calls DESC",
|
|
149
|
+
params
|
|
150
|
+
).fetchall()
|
|
151
|
+
return {
|
|
152
|
+
"tools": [dict(r) for r in rows],
|
|
153
|
+
"total_calls": sum(r["calls"] for r in rows),
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
def efficiency_summary(self, project: str | None = None) -> dict:
|
|
157
|
+
with sqlite3.connect(str(self.path)) as conn:
|
|
158
|
+
conn.row_factory = sqlite3.Row
|
|
159
|
+
# Aggregate tool_calls
|
|
160
|
+
tc_where = " WHERE project=?" if project else ""
|
|
161
|
+
tc_params = (project,) if project else ()
|
|
162
|
+
tc = conn.execute(
|
|
163
|
+
f"SELECT COUNT(*) as calls, COALESCE(SUM(output_chars),0) as total_chars FROM tool_calls{tc_where}",
|
|
164
|
+
tc_params
|
|
165
|
+
).fetchone()
|
|
166
|
+
|
|
167
|
+
# Aggregate fs_reads
|
|
168
|
+
fs_where = " WHERE project=?" if project else ""
|
|
169
|
+
fs_params = (project,) if project else ()
|
|
170
|
+
fs = conn.execute(
|
|
171
|
+
f"SELECT COUNT(*) as reads, COALESCE(SUM(output_chars),0) as total_chars FROM fs_reads{fs_where}",
|
|
172
|
+
fs_params
|
|
173
|
+
).fetchone()
|
|
174
|
+
|
|
175
|
+
# Per-session breakdown
|
|
176
|
+
session_where = " WHERE project=?" if project else ""
|
|
177
|
+
session_params = (project,) if project else ()
|
|
178
|
+
tc_sessions = conn.execute(
|
|
179
|
+
f"SELECT COALESCE(session_id,'') as session_id, SUM(output_chars) as mcp_chars FROM tool_calls{session_where} GROUP BY session_id",
|
|
180
|
+
session_params
|
|
181
|
+
).fetchall()
|
|
182
|
+
fs_sessions = conn.execute(
|
|
183
|
+
f"SELECT COALESCE(session_id,'') as session_id, SUM(output_chars) as fs_chars FROM fs_reads{session_where} GROUP BY session_id",
|
|
184
|
+
session_params
|
|
185
|
+
).fetchall()
|
|
186
|
+
|
|
187
|
+
# Merge per-session
|
|
188
|
+
session_map: dict[str, dict] = {}
|
|
189
|
+
for r in tc_sessions:
|
|
190
|
+
sid = r["session_id"]
|
|
191
|
+
session_map.setdefault(sid, {"session_id": sid, "mcp_chars": 0, "fs_chars": 0})
|
|
192
|
+
session_map[sid]["mcp_chars"] = r["mcp_chars"]
|
|
193
|
+
for r in fs_sessions:
|
|
194
|
+
sid = r["session_id"]
|
|
195
|
+
session_map.setdefault(sid, {"session_id": sid, "mcp_chars": 0, "fs_chars": 0})
|
|
196
|
+
session_map[sid]["fs_chars"] = r["fs_chars"]
|
|
197
|
+
|
|
198
|
+
return {
|
|
199
|
+
"total_mcp_chars": tc["total_chars"],
|
|
200
|
+
"total_fs_chars": fs["total_chars"],
|
|
201
|
+
"mcp_calls": tc["calls"],
|
|
202
|
+
"fs_reads": fs["reads"],
|
|
203
|
+
"sessions": list(session_map.values()),
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
def summary(self) -> dict[str, Any]:
|
|
207
|
+
with sqlite3.connect(str(self.path)) as conn:
|
|
208
|
+
conn.row_factory = sqlite3.Row
|
|
209
|
+
# Retrieve stats
|
|
210
|
+
r = conn.execute(
|
|
211
|
+
"SELECT COUNT(*) as count, AVG(total_ms) as avg_total_ms, AVG(embed_ms) as avg_embed_ms, AVG(code_search_ms) as avg_code_search_ms FROM retrieves"
|
|
212
|
+
).fetchone()
|
|
213
|
+
|
|
214
|
+
# Cache stats
|
|
215
|
+
c = conn.execute(
|
|
216
|
+
"SELECT COALESCE(SUM(hits),0) as total_hits, COALESCE(SUM(misses),0) as total_misses FROM cache_stats"
|
|
217
|
+
).fetchone()
|
|
218
|
+
total_cache = c["total_hits"] + c["total_misses"]
|
|
219
|
+
hit_ratio = c["total_hits"] / total_cache if total_cache > 0 else 0.0
|
|
220
|
+
|
|
221
|
+
# Ingest stats
|
|
222
|
+
i = conn.execute(
|
|
223
|
+
"SELECT COUNT(*) as count, COALESCE(SUM(symbols),0) as total_symbols, MAX(ts) as last_ingest_ts FROM ingest_stats"
|
|
224
|
+
).fetchone()
|
|
225
|
+
|
|
226
|
+
# Backend health - latest per backend
|
|
227
|
+
b = conn.execute(
|
|
228
|
+
"SELECT backend, status, MAX(ts) as last_ts FROM backend_health GROUP BY backend"
|
|
229
|
+
).fetchall()
|
|
230
|
+
|
|
231
|
+
return {
|
|
232
|
+
"retrieves": {
|
|
233
|
+
"count": r["count"],
|
|
234
|
+
"avg_total_ms": round(r["avg_total_ms"] or 0, 1),
|
|
235
|
+
"avg_embed_ms": round(r["avg_embed_ms"] or 0, 1),
|
|
236
|
+
"avg_code_search_ms": round(r["avg_code_search_ms"] or 0, 1),
|
|
237
|
+
},
|
|
238
|
+
"cache": {
|
|
239
|
+
"total_hits": c["total_hits"],
|
|
240
|
+
"total_misses": c["total_misses"],
|
|
241
|
+
"hit_ratio": round(hit_ratio, 3),
|
|
242
|
+
},
|
|
243
|
+
"ingest": {
|
|
244
|
+
"count": i["count"],
|
|
245
|
+
"total_symbols": i["total_symbols"],
|
|
246
|
+
"last_ingest_ts": i["last_ingest_ts"],
|
|
247
|
+
},
|
|
248
|
+
"backends": [{"backend": row["backend"], "status": row["status"]} for row in b],
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
def recent_retrieves(self, limit: int = 10) -> list[dict]:
|
|
252
|
+
with sqlite3.connect(str(self.path)) as conn:
|
|
253
|
+
conn.row_factory = sqlite3.Row
|
|
254
|
+
rows = conn.execute(
|
|
255
|
+
"SELECT * FROM retrieves ORDER BY ts DESC LIMIT ?", (limit,)
|
|
256
|
+
).fetchall()
|
|
257
|
+
return [dict(r) for r in rows]
|
|
258
|
+
|
|
259
|
+
def close(self):
|
|
260
|
+
pass
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from .pipeline import Pipeline
|
|
2
|
+
from .reset import ResetResult, list_projects, reset_all, reset_project
|
|
3
|
+
from .retrieve import ContextPack, Retriever
|
|
4
|
+
|
|
5
|
+
__all__ = [
|
|
6
|
+
"Pipeline",
|
|
7
|
+
"Retriever",
|
|
8
|
+
"ContextPack",
|
|
9
|
+
"ResetResult",
|
|
10
|
+
"list_projects",
|
|
11
|
+
"reset_project",
|
|
12
|
+
"reset_all",
|
|
13
|
+
]
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
"""Git-aware delta detection for incremental ingestion.
|
|
2
|
+
|
|
3
|
+
Given a repo root and a base commit, produce three lists:
|
|
4
|
+
- changed (added / modified / renamed-new) -> reingest
|
|
5
|
+
- deleted (removed / renamed-old) -> drop from index
|
|
6
|
+
- dirty (uncommitted worktree changes) -> reingest
|
|
7
|
+
|
|
8
|
+
All paths returned are absolute, resolved.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import subprocess
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class GitError(RuntimeError):
    """Raised when a git command cannot be executed or exits with a failure."""
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _run(repo: Path, *args: str, check: bool = True, timeout: float = 30.0) -> str:
    """Run ``git -C <repo> <args...>`` and return its standard output.

    Args:
        repo: Directory passed to ``git -C``.
        *args: Git sub-command and its arguments.
        check: When true, a non-zero exit status raises ``GitError``.
        timeout: Seconds to wait before the invocation is abandoned.

    Returns:
        The captured stdout, decoded as UTF-8.

    Raises:
        GitError: If git is not on PATH, the process could not be started or
            timed out, or (with ``check``) git exited with a non-zero status.
    """
    command = ["git", "-C", str(repo), *args]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            # Decode explicitly instead of relying on the locale's preferred
            # encoding, and replace undecodable bytes so a stray byte in a
            # path or message cannot surface as a raw UnicodeDecodeError.
            encoding="utf-8",
            errors="replace",
            check=False,
            timeout=timeout,
        )
    except FileNotFoundError as e:
        raise GitError("git executable not found on PATH") from e
    except (subprocess.SubprocessError, OSError) as e:
        # SubprocessError covers timeouts; OSError covers launch failures other
        # than a missing executable (e.g. permission denied).
        raise GitError(f"git invocation failed: {e}") from e
    if check and completed.returncode != 0:
        raise GitError(
            f"git {' '.join(args)} failed (exit {completed.returncode}): {completed.stderr.strip()}"
        )
    return completed.stdout
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def is_git_repo(root: str | Path) -> bool:
    """Tell whether ``root`` is inside a git work tree.

    Returns ``False`` both when git answers anything other than ``true`` and
    when git could not be invoked at all.
    """
    try:
        answer = _run(Path(root), "rev-parse", "--is-inside-work-tree", check=False)
    except GitError:
        return False
    else:
        return answer.strip() == "true"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def head_sha(root: str | Path) -> str:
    """Return what ``git rev-parse HEAD`` prints for ``root``, stripped.

    Raises ``GitError`` when the command fails.
    """
    stdout = _run(Path(root), "rev-parse", "HEAD")
    return stdout.strip()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def current_branch(root: str | Path) -> str | None:
    """Return the abbreviated name of ``HEAD``, or ``None`` if unavailable.

    ``None`` is returned when git prints nothing or prints the literal
    ``HEAD`` (which is what it reports when no branch name applies).
    """
    name = _run(Path(root), "rev-parse", "--abbrev-ref", "HEAD", check=False).strip()
    if not name or name == "HEAD":
        return None
    return name
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def is_reachable(root: str | Path, sha: str) -> bool:
    """Tell whether ``sha`` names a commit object present in the repository.

    Implemented with ``git cat-file -e <sha>^{commit}``; any ``GitError``
    is reported as ``False``.
    """
    commit_spec = f"{sha}^{{commit}}"
    try:
        _run(Path(root), "cat-file", "-e", commit_spec, check=True)
    except GitError:
        return False
    return True
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def resolve_ref(root: str | Path, ref: str) -> str:
    """Turn a branch, tag or SHA into the SHA of the commit it points to.

    Raises ``GitError`` when ``git rev-parse --verify`` rejects the ref.
    """
    commit_spec = f"{ref}^{{commit}}"
    resolved = _run(Path(root), "rev-parse", "--verify", commit_spec)
    return resolved.strip()
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def commit_ordinal(root: str | Path, sha: str) -> int | None:
    """Return the first-parent ancestor count of ``sha``.

    The value comes from ``git rev-list --count --first-parent <sha>``. Along
    a first-parent chain a parent always has a smaller count than its child,
    so the number serves as a cheap "before / after" comparator between SHAs
    without loading full topology into the graph store.

    Returns ``None`` when git fails for ``sha`` (for example an unknown or
    unreachable commit) or when its output is not an integer.
    """
    try:
        raw = _run(
            Path(root), "rev-list", "--count", "--first-parent", sha, check=True
        )
    except GitError:
        return None

    try:
        ordinal = int(raw.strip())
    except ValueError:
        return None
    return ordinal
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
@dataclass
class Delta:
    """Path-level change set: what to reingest and what to drop."""

    # Paths to reingest because of committed changes.
    changed: list[Path] = field(default_factory=list)
    # Paths to remove from the index.
    deleted: list[Path] = field(default_factory=list)
    # Paths to reingest because of uncommitted worktree changes.
    dirty: list[Path] = field(default_factory=list)

    @property
    def is_empty(self) -> bool:
        """True when all three lists are empty."""
        return not any((self.changed, self.deleted, self.dirty))

    def reingest_paths(self) -> list[Path]:
        """Return ``changed`` followed by ``dirty``, first occurrences only."""
        # dict keys are unique and keep insertion order, which yields an
        # order-preserving de-duplication.
        return list(dict.fromkeys([*self.changed, *self.dirty]))
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def diff(root: str | Path, base_sha: str, head: str = "HEAD") -> Delta:
    """Compute the path-level delta between ``base_sha`` and ``head``.

    Only committed changes are considered. Paths are resolved against the
    resolved ``root``.

    Classification by git status letter:
      - ``A`` / ``M`` / ``T``: path goes to ``changed``.
      - ``D``: path goes to ``deleted``.
      - ``R``: old path goes to ``deleted``, new path to ``changed``.
      - ``C``: new path goes to ``changed``; the copy source still exists and
        is therefore NOT reported as deleted.
      - anything else (``U`` / ``X`` / ``B``): ignored.

    Raises:
        GitError: If the underlying ``git diff`` invocation fails.
    """
    repo = Path(root).resolve()
    # -z: NUL-separated fields with raw (unquoted) path names, so names with
    # tabs, quotes or non-ASCII characters are parsed correctly.
    # Trailing "--": the two preceding arguments are revisions, never paths.
    out = _run(repo, "diff", "--name-status", "-M", "-z", base_sha, head, "--")
    delta = Delta()
    fields = iter(out.split("\0"))
    for status in fields:
        if not status:
            continue  # empty chunk after the final NUL terminator
        code = status[0]
        if code in ("R", "C"):
            # Rename / copy records carry two paths: source, then destination.
            old_rel = next(fields, "")
            new_rel = next(fields, "")
            if not (old_rel and new_rel):
                break  # truncated record; nothing reliable follows
            if code == "R":
                delta.deleted.append((repo / old_rel).resolve())
            delta.changed.append((repo / new_rel).resolve())
            continue
        # Every other status is followed by exactly one path; it is consumed
        # even for ignored statuses so the stream stays aligned.
        rel = next(fields, "")
        if not rel:
            break
        if code == "D":
            delta.deleted.append((repo / rel).resolve())
        elif code in ("A", "M", "T"):
            delta.changed.append((repo / rel).resolve())
        # anything else (U/X/B) -> skip silently
    return delta
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def dirty_files(root: str | Path) -> list[Path]:
    """Return absolute paths of files with uncommitted *content* changes
    (modified, added, untracked).

    Entries whose status contains ``D`` are excluded here; worktree deletions
    are reported by ``dirty_deleted_files`` so callers can route them to a
    delete path instead of trying to reingest a missing file. For a rename
    entry the new path is the one considered. Only paths that currently
    exist as regular files are returned.

    Raises:
        GitError: If the underlying ``git status`` invocation fails.
    """
    repo = Path(root).resolve()
    # -z gives NUL-terminated entries with raw path names. Without it git
    # C-quotes names containing spaces or special characters, and the quoted
    # text would not resolve to a real file.
    out = _run(repo, "status", "--porcelain=v1", "-z", "--untracked-files=all")
    paths: list[Path] = []
    entries = iter(out.split("\0"))
    for entry in entries:
        if len(entry) < 4:  # needs "XY", a space, and at least one path char
            continue
        xy, rel = entry[:2], entry[3:]
        if "R" in xy or "C" in xy:
            # In -z output a rename/copy entry is "XY <new>" followed by a
            # separate NUL-terminated field holding the original path.
            next(entries, None)
        # ignored, or deleted in index / worktree -> skip
        if xy == "!!" or "D" in xy:
            continue
        path = (repo / rel).resolve()
        if path.is_file():
            paths.append(path)
    return paths
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def dirty_deleted_files(root: str | Path) -> list[Path]:
    """Return absolute paths of files deleted in the worktree but not yet
    committed.

    Catches both ``git rm`` (index column ``D``) and a plain ``rm`` against a
    tracked file (worktree column ``D``), so the index can be pruned before
    any commit lands. Without this, deletes only propagate after the next
    commit, leaving the graph and vector index claiming the file still
    exists.

    Rename and copy entries are skipped entirely.

    Raises:
        GitError: If the underlying ``git status`` invocation fails.
    """
    repo = Path(root).resolve()
    # -z gives NUL-terminated entries with raw path names, so names containing
    # spaces, quotes or "->" are read verbatim instead of C-quoted.
    out = _run(repo, "status", "--porcelain=v1", "-z")
    paths: list[Path] = []
    entries = iter(out.split("\0"))
    for entry in entries:
        if len(entry) < 4:  # needs "XY", a space, and at least one path char
            continue
        xy, rel = entry[:2], entry[3:]
        if "R" in xy or "C" in xy:
            # The original path follows as its own NUL-terminated field;
            # consume it so it is not misread as a status entry.
            # NOTE(review): the old path of a staged rename is not reported
            # here, and ``diff`` only compares two commits, so it is
            # presumably pruned only once the rename is committed - confirm
            # this is intended.
            next(entries, None)
            continue
        if xy == "!!":
            continue
        if "D" in xy:
            paths.append((repo / rel).resolve())
    return paths
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def changed_since(root: str | Path, base_sha: str, *, include_dirty: bool = True) -> Delta:
    """Return the delta from ``base_sha`` to ``HEAD``, optionally with the
    uncommitted worktree state folded in.

    With ``include_dirty`` the result's ``dirty`` list gains the output of
    ``dirty_files`` and its ``deleted`` list gains the output of
    ``dirty_deleted_files``.
    """
    delta = diff(root, base_sha, "HEAD")
    if not include_dirty:
        return delta

    delta.dirty.extend(dirty_files(root))
    # Feed uncommitted deletions into ``deleted`` as well, so that graph
    # nodes and vectors of vanished files are torn down by the pipeline's
    # delete loop even before the removal is committed.
    delta.deleted.extend(dirty_deleted_files(root))
    return delta
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""Per-repo ingest state: track the last commit successfully ingested.
|
|
2
|
+
|
|
3
|
+
Lives in the same SQLite DB as episodes (per-project namespaced) so it
|
|
4
|
+
inherits project isolation automatically.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import sqlite3
|
|
10
|
+
import time
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
# Single table keyed by the resolved repository root.
SCHEMA = """
CREATE TABLE IF NOT EXISTS ingest_state (
    repo_root TEXT PRIMARY KEY,
    last_sha  TEXT NOT NULL,
    last_ts   REAL NOT NULL,
    branch    TEXT
);
"""


@dataclass(frozen=True)
class IngestState:
    """Immutable checkpoint row: the last commit ingested for one repo."""

    # Repository root as stored in the table (resolved path string).
    repo_root: str
    # SHA recorded by the most recent ``IngestStateStore.set`` call.
    last_sha: str
    # ``time.time()`` value captured when the checkpoint was written.
    last_ts: float
    # Branch recorded with the checkpoint, if any.
    branch: str | None = None


class IngestStateStore:
    """Thin SQLite wrapper for per-repo ingest checkpoints.

    One connection is opened in the constructor and kept until ``close``.
    """

    def __init__(self, db_path: Path) -> None:
        """Open (creating if needed) the database and ensure the table exists.

        Args:
            db_path: Database file location. A plain string is accepted too,
                matching the ``str | Path`` arguments of the other methods.
        """
        self.path = Path(db_path)
        self.path.parent.mkdir(parents=True, exist_ok=True)
        self.conn = sqlite3.connect(self.path)
        self.conn.executescript(SCHEMA)
        self.conn.commit()

    @staticmethod
    def _key(repo_root: str | Path) -> str:
        """Normalise a repo root to the resolved path string used as key."""
        return str(Path(repo_root).resolve())

    def get(self, repo_root: str | Path) -> IngestState | None:
        """Return the checkpoint stored for ``repo_root``, or ``None``."""
        row = self.conn.execute(
            "SELECT repo_root, last_sha, last_ts, branch FROM ingest_state WHERE repo_root = ?",
            (self._key(repo_root),),
        ).fetchone()
        if row is None:
            return None
        stored_root, last_sha, last_ts, branch = row
        return IngestState(
            repo_root=stored_root, last_sha=last_sha, last_ts=last_ts, branch=branch
        )

    def set(self, repo_root: str | Path, sha: str, branch: str | None = None) -> None:
        """Insert or overwrite the checkpoint for ``repo_root``.

        The timestamp is taken from ``time.time()`` at call time; an existing
        row has its sha, timestamp and branch replaced.
        """
        with self.conn:  # commit on success, roll back on error
            self.conn.execute(
                "INSERT INTO ingest_state(repo_root, last_sha, last_ts, branch) "
                "VALUES (?, ?, ?, ?) "
                "ON CONFLICT(repo_root) DO UPDATE SET "
                "  last_sha = excluded.last_sha, "
                "  last_ts = excluded.last_ts, "
                "  branch = excluded.branch",
                (self._key(repo_root), sha, time.time(), branch),
            )

    def clear(self, repo_root: str | Path) -> None:
        """Delete the checkpoint for ``repo_root``; a missing row is a no-op."""
        with self.conn:  # commit on success, roll back on error
            self.conn.execute(
                "DELETE FROM ingest_state WHERE repo_root = ?",
                (self._key(repo_root),),
            )

    def close(self) -> None:
        """Close the underlying connection."""
        self.conn.close()
|