orgraph-mcp 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orgraph/__init__.py +3 -0
- orgraph/_vendor/__init__.py +0 -0
- orgraph/_vendor/cache.py +475 -0
- orgraph/_vendor/cluster.py +272 -0
- orgraph/_vendor/extract.py +12390 -0
- orgraph/_vendor/mcp_ingest.py +402 -0
- orgraph/cli.py +348 -0
- orgraph/eval/__init__.py +0 -0
- orgraph/eval/fixtures/codewiki_gt.json +156 -0
- orgraph/eval/ground_truth.py +44 -0
- orgraph/eval/metrics.py +88 -0
- orgraph/eval/runner.py +116 -0
- orgraph/extract/__init__.py +0 -0
- orgraph/extract/manifest.py +84 -0
- orgraph/extract/scip.py +298 -0
- orgraph/extract/scip_pb2.py +2456 -0
- orgraph/extract/treesitter.py +207 -0
- orgraph/extract/types.py +49 -0
- orgraph/graph/__init__.py +0 -0
- orgraph/graph/builder.py +224 -0
- orgraph/graph/kuzu.py +40 -0
- orgraph/graph/schema.py +158 -0
- orgraph/installer/__init__.py +0 -0
- orgraph/installer/agents.py +136 -0
- orgraph/installer/config.py +123 -0
- orgraph/installer/installer.py +181 -0
- orgraph/mcp/__init__.py +0 -0
- orgraph/mcp/server.py +62 -0
- orgraph/mcp/tools.py +564 -0
- orgraph/search/__init__.py +0 -0
- orgraph/search/index.py +36 -0
- orgraph/topology/__init__.py +0 -0
- orgraph/topology/call_graph.py +193 -0
- orgraph/topology/cluster.py +196 -0
- orgraph/topology/context.py +150 -0
- orgraph/topology/serialise.py +87 -0
- orgraph/topology/topology.py +348 -0
- orgraph_mcp-0.1.0.dist-info/METADATA +85 -0
- orgraph_mcp-0.1.0.dist-info/RECORD +42 -0
- orgraph_mcp-0.1.0.dist-info/WHEEL +4 -0
- orgraph_mcp-0.1.0.dist-info/entry_points.txt +2 -0
- orgraph_mcp-0.1.0.dist-info/licenses/LICENSE +21 -0
orgraph/__init__.py
ADDED
|
File without changes
|
orgraph/_vendor/cache.py
ADDED
|
@@ -0,0 +1,475 @@
|
|
|
1
|
+
# per-file extraction cache - skip unchanged files on re-run
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import atexit
|
|
5
|
+
import hashlib
|
|
6
|
+
import json
|
|
7
|
+
import os
|
|
8
|
+
import re
|
|
9
|
+
import tempfile
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
# Name of the output directory. Set the GRAPHIFY_OUT environment variable to
# redirect it (handy for worktrees or a shared output location). The value may
# be a relative name such as "graphify-out-feature" or an absolute path such
# as "/shared/graphify-out".
_GRAPHIFY_OUT = os.getenv("GRAPHIFY_OUT", "graphify-out")
|
|
16
|
+
|
|
17
|
+
# AST cache entries are the output of the extractor code itself, so they are
# only valid for the release that wrote them: keying purely on file content
# would let extractor fixes shipped in a new release keep serving stale
# pre-fix results. The AST cache is therefore namespaced by package version
# (cache/ast/v{version}/), with entries from other versions removed on first
# use. The semantic cache is deliberately NOT versioned -- its entries are
# produced by the LLM from file contents, and invalidating them on every
# release would re-bill extraction for unchanged files.
#
# This copy is vendored inside the orgraph-mcp distribution, so that
# distribution's version is the one that identifies the extractor code and is
# tried first. "graphifyy" (the name the original lookup used) is kept as a
# fallback; if neither distribution is installed the version is "unknown".
_EXTRACTOR_VERSION = "unknown"
try:
    from importlib.metadata import version as _pkg_version
except Exception:
    # importlib.metadata is unavailable; keep the "unknown" namespace.
    _pkg_version = None

if _pkg_version is not None:
    for _dist_name in ("orgraph-mcp", "graphifyy"):
        try:
            _found_version = _pkg_version(_dist_name)
        except Exception:
            # Not installed (or metadata unreadable): try the next candidate.
            continue
        if _found_version:
            _EXTRACTOR_VERSION = str(_found_version)
            break
|
|
31
|
+
|
|
32
|
+
# Current-version AST directories whose siblings were already swept in this
# process -- the sweep runs at most once per directory.
_cleaned_ast_dirs: set[str] = set()


def _cleanup_stale_ast_entries(ast_base: Path, current_dir: Path) -> None:
    """Delete AST cache data that was not written into ``current_dir``.

    Under ``ast_base`` every sibling directory whose name starts with ``v``
    and every ``*.json`` file (the layout used before per-version
    directories) is removed; ``current_dir`` itself and anything else is left
    alone. The sweep is best-effort: ``OSError`` is ignored, so leftovers are
    simply picked up by a later run. Repeated calls for the same
    ``current_dir`` within one process are no-ops.
    """
    marker = str(current_dir)
    already_swept = marker in _cleaned_ast_dirs
    _cleaned_ast_dirs.add(marker)
    if already_swept or not ast_base.is_dir():
        return

    from shutil import rmtree

    stale_candidates = (item for item in ast_base.iterdir() if item != current_dir)
    for item in stale_candidates:
        try:
            if item.is_dir() and item.name.startswith("v"):
                rmtree(item, ignore_errors=True)
            elif item.suffix == ".json":
                item.unlink()
        except OSError:
            # Best-effort cleanup; a straggler is retried on the next run.
            pass
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# Frontmatter is fenced by lines made of exactly three dashes, optionally
# followed by spaces or tabs. Looser substring tests -- startswith("---") or
# find("\n---") -- also fire on `----` thematic breaks and `--- text` prose,
# which silently dropped everything above them from the hash (#1259).
_FRONTMATTER_DELIM = re.compile(r"^---[ \t]*\r?$", re.MULTILINE)


def _body_content(content: bytes) -> bytes:
    """Return the Markdown body of ``content`` without its YAML frontmatter.

    Frontmatter is recognised only when the very first line is a delimiter
    and a second delimiter line follows; otherwise ``content`` is returned
    untouched. The returned body starts immediately after the three dashes of
    the closing delimiter (the rest of that line is kept), which keeps the
    output identical to the earlier implementation so existing semantic-cache
    hashes stay valid.
    """
    decoded = content.decode(errors="replace")
    opening = _FRONTMATTER_DELIM.match(decoded)
    closing = opening and _FRONTMATTER_DELIM.search(decoded, opening.end())
    if not closing:
        return content
    body_start = closing.start() + len("---")
    return decoded[body_start:].encode()
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# Stat-based index: maps absolute path → {size, mtime_ns, hash}.
# Loaded once per process, flushed via atexit. Skips full file reads when
# size+mtime_ns are unchanged — same trade-off as make(1).
# Correctness risks: `touch` causes a harmless extra re-hash; same-size edits
# within NFS second-resolution mtime have a 1-second window (same as make).
# Use `graphify extract --force` to bypass when needed.

# In-memory copy of the index (resolved path string → entry dict).
_stat_index: dict[str, dict] = {}
# Resolved root the index was loaded for; stays None until the first load.
_stat_index_root: Path | None = None
# Set when an entry is added, so the exit-time flush only writes when needed.
_stat_index_dirty: bool = False
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _stat_index_file(root: Path) -> Path:
    """Return the location of the on-disk stat index for ``root``.

    The file is ``cache/stat-index.json`` inside the output directory. An
    absolute ``_GRAPHIFY_OUT`` is used as-is; a relative one is anchored at
    the resolved ``root``.
    """
    out_dir = Path(_GRAPHIFY_OUT)
    if not out_dir.is_absolute():
        out_dir = Path(root).resolve() / out_dir
    return out_dir / "cache" / "stat-index.json"
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _ensure_stat_index(root: Path) -> None:
    """Load the stat index from disk once per process and schedule its flush.

    The first call pins ``root`` (resolved) as the index root, reads
    ``stat-index.json`` for it and registers :func:`_flush_stat_index` with
    ``atexit``. Later calls return immediately, whatever ``root`` they pass.

    A missing, unreadable, undecodable or malformed index file is treated as
    an empty index. "Malformed" includes syntactically valid JSON that is not
    an object (for example ``[]`` or ``null``): accepting it would make the
    ``.get`` lookup in ``file_hash`` fail later on.
    """
    global _stat_index, _stat_index_root
    if _stat_index_root is not None:
        return
    _stat_index_root = Path(root).resolve()
    index_path = _stat_index_file(_stat_index_root)
    try:
        loaded = json.loads(index_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: file absent or unreadable. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError (invalid UTF-8 bytes).
        loaded = None
    _stat_index = loaded if isinstance(loaded, dict) else {}
    atexit.register(_flush_stat_index)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _flush_stat_index() -> None:
    """Write the in-memory stat index to disk if it has unsaved changes.

    Does nothing when the index is clean or was never loaded. The data is
    written to a temporary file in the destination directory and then moved
    into place with ``os.replace`` so readers never observe a partial file.
    The flush is best-effort: any failure is swallowed, the temporary file is
    removed, and the dirty flag is cleared either way.
    """
    global _stat_index_dirty
    if not _stat_index_dirty or _stat_index_root is None:
        return
    target = _stat_index_file(_stat_index_root)
    try:
        target.parent.mkdir(parents=True, exist_ok=True)
        fd, tmp_name = tempfile.mkstemp(
            dir=target.parent, prefix="stat-index.", suffix=".tmp"
        )
        try:
            # The file object owns the descriptor: it is closed exactly once
            # (never a second time after a failed replace) and write() loops
            # until the whole payload is on disk, unlike a single os.write().
            with os.fdopen(fd, "wb") as handle:
                handle.write(
                    json.dumps(_stat_index, separators=(",", ":")).encode()
                )
            os.replace(tmp_name, target)
        except Exception:
            # Best-effort at interpreter exit: drop the temp file, move on.
            try:
                os.unlink(tmp_name)
            except OSError:
                pass
    except OSError:
        # Could not create the directory or the temporary file.
        pass
    _stat_index_dirty = False
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _normalize_path(path: Path) -> Path:
    """Return ``path`` in a canonical spelling for use in cache keys.

    On Windows the extended-length prefix (``\\\\?\\``) is dropped and the
    result is passed through ``os.path.normcase``. On every other platform
    ``path`` is returned unchanged.
    """
    import sys

    if sys.platform == "win32":
        raw = str(path)
        extended_prefix = "\\\\?\\"
        if raw.startswith(extended_prefix):
            raw = raw[4:]  # drop the 4-character \\?\ prefix
        return Path(os.path.normcase(raw))
    return path
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def file_hash(path: Path, root: Path = Path(".")) -> str:
    """Return the SHA256 hex digest of a file's contents plus its path.

    The digest covers the file contents, a NUL separator, and the lower-cased
    POSIX form of the path relative to ``root``. Using the relative path
    keeps cache entries portable across machines and checkout directories;
    files outside ``root`` fall back to their resolved absolute path.

    For Markdown files (``.md``) only the body below the YAML frontmatter is
    hashed, so metadata-only edits do not invalidate the cache.

    A stat-based fast path (size + ``mtime_ns``) returns the digest recorded
    in the stat index when the file looks unchanged; otherwise the file is
    read and hashed and the index entry is refreshed. Index entries that are
    malformed (not a dict, or without a string ``hash``) are ignored and
    rebuilt rather than raising.

    Raises:
        IsADirectoryError: if ``path`` is not an existing regular file.
        OSError: if the file cannot be read.
    """
    global _stat_index_dirty
    p = _normalize_path(Path(path))
    root = _normalize_path(Path(root))
    if not p.is_file():
        raise IsADirectoryError(f"file_hash requires a file, got: {p}")

    _ensure_stat_index(root)
    resolved = p.resolve()
    abs_key = str(resolved)
    st: "os.stat_result | None" = None
    try:
        st = p.stat()
    except OSError:
        # No stat available: hash the file, but do not record an index entry.
        pass
    else:
        entry = _stat_index.get(abs_key)
        if (isinstance(entry, dict)
                and entry.get("size") == st.st_size
                and entry.get("mtime_ns") == st.st_mtime_ns
                and isinstance(entry.get("hash"), str)):
            return entry["hash"]

    raw = p.read_bytes()
    content = _body_content(raw) if p.suffix.lower() == ".md" else raw
    h = hashlib.sha256()
    h.update(content)
    h.update(b"\x00")  # separator between contents and path
    try:
        key_path = resolved.relative_to(Path(root).resolve())
    except ValueError:
        key_path = resolved  # file lives outside root
    # surrogateescape yields the same bytes as a plain encode() for ordinary
    # names, and additionally lets undecodable file names be hashed.
    h.update(key_path.as_posix().lower().encode("utf-8", "surrogateescape"))
    digest = h.hexdigest()

    if st is not None:
        _stat_index[abs_key] = {
            "size": st.st_size,
            "mtime_ns": st.st_mtime_ns,
            "hash": digest,
        }
        _stat_index_dirty = True

    return digest
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _relativize_source_files_in(payload: dict, root: Path) -> None:
    """Rewrite absolute ``source_file`` values in ``payload`` relative to ``root``.

    Walks the ``nodes``, ``edges`` and ``hyperedges`` lists in place. Each
    absolute ``source_file`` located under ``root`` becomes a forward-slash
    relative path, so cached fragments are stored in portable form (#777).
    Values that are already relative, that lie outside ``root``, or that
    cannot be related to it (e.g. another Windows drive) are left as they
    are, as are non-dict items and items without a ``source_file``.

    Only ``root`` is resolved. The ``source_file`` itself is relativized
    textually, so a symlink inside ``root`` keeps its own name instead of
    being replaced by its target.
    """
    try:
        anchor = Path(root).resolve()
    except OSError:
        return

    escape_prefixes = (".." + os.sep, "../")
    for section in ("nodes", "edges", "hyperedges"):
        for record in payload.get(section, []):
            if not isinstance(record, dict):
                continue
            original = record.get("source_file")
            if not original:
                continue
            original_path = Path(original)
            if not original_path.is_absolute():
                continue
            try:
                relative = os.path.relpath(original_path, anchor)
            except (ValueError, OSError):
                continue  # out-of-root (e.g. Windows cross-drive)
            if relative == ".." or relative.startswith(escape_prefixes):
                continue  # climbs out of root -- keep the absolute form
            record["source_file"] = relative.replace(os.sep, "/")
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _absolutize_source_files_in(payload: dict, root: Path) -> None:
    """Re-anchor relative ``source_file`` values in ``payload`` at ``root``.

    Counterpart of :func:`_relativize_source_files_in`: every relative
    ``source_file`` in the ``nodes``, ``edges`` and ``hyperedges`` lists is
    replaced, in place, by the string form of ``root.resolve() / value``, so
    a fragment loaded from the cache has the same absolute-path shape as a
    freshly extracted one. Values that are already absolute (legacy cache
    entries), non-dict items and items without a ``source_file`` are left
    unchanged.
    """
    try:
        anchor = Path(root).resolve()
    except OSError:
        return

    records = (
        record
        for section in ("nodes", "edges", "hyperedges")
        for record in payload.get(section, [])
        if isinstance(record, dict)
    )
    for record in records:
        stored = record.get("source_file")
        if not stored:
            continue
        stored_path = Path(stored)
        if stored_path.is_absolute():
            continue
        try:
            record["source_file"] = str(anchor / stored_path)
        except (TypeError, OSError):
            continue
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def cache_dir(root: Path = Path("."), kind: str = "ast") -> Path:
    """Return the cache directory for ``kind``, creating it when absent.

    ``kind`` is "ast" or "semantic"; keeping the two in separate
    subdirectories stops semantic entries from overwriting AST entries for
    the same source_file (#582).

    AST entries go to ``<out>/cache/ast/v{version}/`` because they depend on
    the extractor code as well as on file contents; asking for that directory
    also triggers the sweep of entries left by other versions. Semantic
    entries go to the unversioned ``<out>/cache/semantic/`` (re-extraction
    costs LLM calls). ``<out>`` is ``_GRAPHIFY_OUT``, anchored at the
    resolved ``root`` unless it is absolute.
    """
    out_dir = Path(_GRAPHIFY_OUT)
    if not out_dir.is_absolute():
        out_dir = Path(root).resolve() / out_dir
    target = out_dir / "cache" / kind
    if kind == "ast":
        target = target / f"v{_EXTRACTOR_VERSION}"
        _cleanup_stale_ast_entries(target.parent, target)
    target.mkdir(parents=True, exist_ok=True)
    return target
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def load_cached(path: Path, root: Path = Path("."), kind: str = "ast") -> dict | None:
    """Return the cached extraction for ``path``, or None when there is none.

    Cache key: the digest from :func:`file_hash` (file contents plus the path
    relative to ``root``).
    Cache value: ``graphify-out/cache/{kind}/{hash}.json`` (AST entries under
    the per-version subdirectory, see :func:`cache_dir`).

    AST entries written by other graphify versions — including the legacy
    flat cache/ layout (pre-0.5.3) and the unversioned cache/ast/ layout —
    are deliberately not consulted: they were produced by a different
    extractor and may be stale.

    Returns None when the file cannot be hashed, when no entry exists, or
    when the entry is unreadable, not valid UTF-8 JSON, or not a JSON object.
    Rejecting non-object entries keeps the ``dict | None`` contract, so
    callers can use ``.get`` on the result without checking its type.
    """
    try:
        digest = file_hash(path, root)
    except OSError:
        return None
    entry = cache_dir(root, kind) / f"{digest}.json"
    try:
        result = json.loads(entry.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: entry missing or unreadable. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError (corrupt bytes).
        return None
    if not isinstance(result, dict):
        return None
    # Re-anchor relative source_file fields so callers see the same
    # absolute-path shape that a fresh in-process extraction produces
    # (#777). Legacy entries with absolute source_file pass through.
    _absolutize_source_files_in(result, root)
    return result
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def save_cached(path: Path, result: dict, root: Path = Path("."), kind: str = "ast") -> None:
    """Save the extraction ``result`` for ``path`` in the ``kind`` cache.

    The entry is written to ``graphify-out/cache/{kind}/{hash}.json`` (AST
    entries under the per-version subdirectory), where ``hash`` is the digest
    from :func:`file_hash` for the file's current contents. ``result`` should
    be a dict with 'nodes' and 'edges' lists.

    No-ops if ``path`` is not a regular file. Subagent-produced semantic
    fragments occasionally carry a directory path in ``source_file``;
    skipping them prevents IsADirectoryError from aborting the whole batch.

    The entry is written to a temporary file in the cache directory and then
    moved into place. If writing or moving fails, the temporary file is
    removed and the exception is re-raised.
    """
    p = Path(path)
    if not p.is_file():
        return

    # Relativize source_file fields against ``root`` before writing so the
    # cache file on disk is portable across machines and checkout directories
    # (#777).
    #
    # A relativized deep copy is serialized rather than mutating the caller's
    # dict: downstream pipeline steps (notably extract.py's AST prefix remap,
    # which looks up Path(source_file).resolve() in a prefix table) depend on
    # the original absolute form of source_file.
    on_disk = result
    if isinstance(result, dict) and any(
        result.get(k) for k in ("nodes", "edges", "hyperedges")
    ):
        import copy as _copy

        on_disk = _copy.deepcopy(result)
        _relativize_source_files_in(on_disk, root)

    h = file_hash(p, root)
    target_dir = cache_dir(root, kind)
    entry = target_dir / f"{h}.json"
    fd, tmp_path = tempfile.mkstemp(dir=target_dir, prefix=f"{h}.", suffix=".tmp")
    try:
        # The file object owns the descriptor: it is closed exactly once,
        # even on failure, and write() loops until the whole payload has been
        # written (a bare os.write() may stop short).
        with os.fdopen(fd, "wb") as handle:
            handle.write(json.dumps(on_disk).encode())
        try:
            os.replace(tmp_path, entry)
        except PermissionError:
            # Windows: os.replace can fail with WinError 5 if the target is
            # briefly locked. Fall back to copy-then-delete.
            import shutil

            shutil.copy2(tmp_path, entry)
            os.unlink(tmp_path)
    except Exception:
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
def cached_files(root: Path = Path(".")) -> set[str]:
    """Return the hashes (file stems) of every cache entry found on disk.

    Three locations under ``<out>/cache`` are scanned, each only if it
    exists: the directory itself for legacy flat entries, ``ast/``
    recursively (covering per-version subdirectories), and ``semantic/``.
    """
    cache_root = Path(root).resolve() / _GRAPHIFY_OUT / "cache"
    searches = (
        (cache_root, "*.json"),                # legacy flat entries
        (cache_root / "ast", "**/*.json"),     # per-version subdirectories
        (cache_root / "semantic", "*.json"),
    )
    return {
        entry.stem
        for directory, pattern in searches
        if directory.is_dir()
        for entry in directory.glob(pattern)
    }
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
def clear_cache(root: Path = Path(".")) -> None:
    """Delete every ``*.json`` cache entry under ``<out>/cache``.

    Covers legacy flat entries directly in the cache directory, everything
    below ``ast/`` (recursively, so all per-version subdirectories) and the
    entries in ``semantic/``. Directories that do not exist are skipped.
    """
    cache_root = Path(root).resolve() / _GRAPHIFY_OUT / "cache"
    targets = (
        (cache_root, "*.json"),                # legacy flat entries
        (cache_root / "ast", "**/*.json"),     # per-version subdirectories
        (cache_root / "semantic", "*.json"),
    )
    for directory, pattern in targets:
        if not directory.is_dir():
            continue
        for stale in directory.glob(pattern):
            stale.unlink()
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
def check_semantic_cache(
    files: list[str],
    root: Path = Path("."),
) -> tuple[list[dict], list[dict], list[dict], list[str]]:
    """Split ``files`` into semantic-cache hits and misses.

    Each path is looked up in the semantic cache; relative paths are joined
    onto ``root`` first. Returns
    ``(cached_nodes, cached_edges, cached_hyperedges, uncached_files)``:
    the first three are the concatenated lists from all cache hits, the last
    holds the original path strings that had no entry and still need
    extraction.
    """
    collected: dict[str, list[dict]] = {"nodes": [], "edges": [], "hyperedges": []}
    misses: list[str] = []

    for fpath in files:
        candidate = Path(fpath)
        if not candidate.is_absolute():
            candidate = Path(root) / candidate
        hit = load_cached(candidate, root, kind="semantic")
        if hit is None:
            misses.append(fpath)
            continue
        for bucket, sink in collected.items():
            sink.extend(hit.get(bucket, []))

    return collected["nodes"], collected["edges"], collected["hyperedges"], misses
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
def save_semantic_cache(
    nodes: list[dict],
    edges: list[dict],
    hyperedges: list[dict] | None = None,
    root: Path = Path("."),
) -> int:
    """Store semantic extraction results in the cache, one entry per file.

    Nodes, edges and hyperedges are grouped by their ``source_file`` (items
    without one are dropped) and each group is saved under cache/semantic/,
    which is kept apart from the AST entries in cache/ast/ to prevent
    hash-key collisions (#582). Relative ``source_file`` values are joined
    onto ``root``; groups whose path is not an existing regular file are
    skipped.

    Returns the number of files cached.
    """
    from collections import defaultdict

    grouped: dict[str, dict] = defaultdict(
        lambda: {"nodes": [], "edges": [], "hyperedges": []}
    )
    buckets = (
        ("nodes", nodes),
        ("edges", edges),
        ("hyperedges", hyperedges or []),
    )
    for bucket, items in buckets:
        for item in items:
            source = item.get("source_file", "")
            if source:
                grouped[source][bucket].append(item)

    written = 0
    for source, fragment in grouped.items():
        target = Path(source)
        if not target.is_absolute():
            target = Path(root) / target
        if not target.is_file():
            continue
        save_cached(target, fragment, root, kind="semantic")
        written += 1
    return written
|