orgraph-mcp 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. orgraph/__init__.py +3 -0
  2. orgraph/_vendor/__init__.py +0 -0
  3. orgraph/_vendor/cache.py +475 -0
  4. orgraph/_vendor/cluster.py +272 -0
  5. orgraph/_vendor/extract.py +12390 -0
  6. orgraph/_vendor/mcp_ingest.py +402 -0
  7. orgraph/cli.py +348 -0
  8. orgraph/eval/__init__.py +0 -0
  9. orgraph/eval/fixtures/codewiki_gt.json +156 -0
  10. orgraph/eval/ground_truth.py +44 -0
  11. orgraph/eval/metrics.py +88 -0
  12. orgraph/eval/runner.py +116 -0
  13. orgraph/extract/__init__.py +0 -0
  14. orgraph/extract/manifest.py +84 -0
  15. orgraph/extract/scip.py +298 -0
  16. orgraph/extract/scip_pb2.py +2456 -0
  17. orgraph/extract/treesitter.py +207 -0
  18. orgraph/extract/types.py +49 -0
  19. orgraph/graph/__init__.py +0 -0
  20. orgraph/graph/builder.py +224 -0
  21. orgraph/graph/kuzu.py +40 -0
  22. orgraph/graph/schema.py +158 -0
  23. orgraph/installer/__init__.py +0 -0
  24. orgraph/installer/agents.py +136 -0
  25. orgraph/installer/config.py +123 -0
  26. orgraph/installer/installer.py +181 -0
  27. orgraph/mcp/__init__.py +0 -0
  28. orgraph/mcp/server.py +62 -0
  29. orgraph/mcp/tools.py +564 -0
  30. orgraph/search/__init__.py +0 -0
  31. orgraph/search/index.py +36 -0
  32. orgraph/topology/__init__.py +0 -0
  33. orgraph/topology/call_graph.py +193 -0
  34. orgraph/topology/cluster.py +196 -0
  35. orgraph/topology/context.py +150 -0
  36. orgraph/topology/serialise.py +87 -0
  37. orgraph/topology/topology.py +348 -0
  38. orgraph_mcp-0.1.0.dist-info/METADATA +85 -0
  39. orgraph_mcp-0.1.0.dist-info/RECORD +42 -0
  40. orgraph_mcp-0.1.0.dist-info/WHEEL +4 -0
  41. orgraph_mcp-0.1.0.dist-info/entry_points.txt +2 -0
  42. orgraph_mcp-0.1.0.dist-info/licenses/LICENSE +21 -0
orgraph/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """orgraph — codebase knowledge graph for coding agents."""
2
+
3
+ __version__ = "0.1.0"
File without changes
@@ -0,0 +1,475 @@
1
+ # per-file extraction cache - skip unchanged files on re-run
2
+ from __future__ import annotations
3
+
4
+ import atexit
5
+ import hashlib
6
+ import json
7
+ import os
8
+ import re
9
+ import tempfile
10
+ from pathlib import Path
11
+
12
# Name of the output directory. Set the GRAPHIFY_OUT environment variable to
# redirect it for worktrees or shared-output setups; both a relative name
# ("graphify-out-feature") and an absolute path ("/shared/graphify-out") are
# accepted.
_GRAPHIFY_OUT = os.getenv("GRAPHIFY_OUT", "graphify-out")

# AST cache entries are produced by graphify's own extractor code, so they are
# only trustworthy for the version that wrote them: a key derived purely from
# file content would keep serving pre-fix results after an extractor fix ships.
# The AST cache is therefore namespaced by package version
# (cache/ast/v{version}/) and entries from other versions are removed on first
# use. The semantic cache is deliberately NOT versioned: its entries come from
# the LLM given the file contents, and invalidating them on every release
# would re-bill extraction for unchanged files.
try:
    from importlib.metadata import version as _pkg_version
except Exception:
    _EXTRACTOR_VERSION = "unknown"
else:
    try:
        _EXTRACTOR_VERSION = _pkg_version("graphifyy")
    except Exception:
        _EXTRACTOR_VERSION = "unknown"

# Version directories already swept in this process; the cleanup runs once per
# (base, version) pair.
_cleaned_ast_dirs: set[str] = set()
34
+
35
+
36
def _cleanup_stale_ast_entries(ast_base: Path, current_dir: Path) -> None:
    """Remove AST cache entries left behind by other graphify versions.

    Sweeps sibling ``v*/`` directories and unversioned ``*.json`` files (the
    pre-versioning layout) found directly under ``ast_base``, leaving
    ``current_dir`` untouched. The sweep is attempted at most once per process
    for a given ``current_dir``. Best-effort: filesystem failures are ignored
    and stragglers are retried on the next run.

    Args:
        ast_base: Directory that holds the per-version cache directories.
        current_dir: Version directory currently in use; never removed.
    """
    key = str(current_dir)
    if key in _cleaned_ast_dirs:
        return
    # Record the attempt first so a failing sweep is not repeated on every
    # cache_dir() call within this process.
    _cleaned_ast_dirs.add(key)
    import shutil

    try:
        if not ast_base.is_dir():
            return
        # iterdir() is lazy: listing errors (directory removed or unreadable
        # after the is_dir() check) would otherwise surface from the ``for``
        # statement, outside the per-entry handler below.
        children = list(ast_base.iterdir())
    except OSError:
        return

    for child in children:
        if child == current_dir:
            continue
        try:
            if child.is_dir() and child.name.startswith("v"):
                shutil.rmtree(child, ignore_errors=True)
            elif child.suffix == ".json":
                child.unlink()
        except OSError:
            # Best-effort: leave the entry for the next run.
            pass
61
+
62
+
63
# A frontmatter delimiter is an entire line made of exactly three dashes,
# optionally followed by trailing whitespace. Substring tests such as
# startswith("---") or find("\n---") would also accept `----` thematic breaks
# and `--- text` prose, silently dropping everything above them from the hash
# (#1259).
_FRONTMATTER_DELIM = re.compile(r"^---[ \t]*\r?$", re.MULTILINE)


def _body_content(content: bytes) -> bytes:
    """Return the Markdown body of ``content`` without its YAML frontmatter.

    The input is returned unchanged unless it opens with a delimiter line and
    a second delimiter line follows later in the text.
    """
    decoded = content.decode(errors="replace")
    first = _FRONTMATTER_DELIM.match(decoded)
    second = None
    if first is not None:
        second = _FRONTMATTER_DELIM.search(decoded, first.end())
    if second is None:
        return content
    # Cut immediately after the three closing dashes rather than after the
    # whole delimiter line: this keeps the result byte-identical with the
    # historical implementation for well-formed frontmatter, so existing
    # semantic-cache hashes do not churn.
    body_start = second.start() + 3
    return decoded[body_start:].encode()
83
+
84
+
85
# Stat-based index mapping an absolute path to {size, mtime_ns, hash}.
# It is loaded once per process and flushed through atexit. When size and
# mtime_ns are unchanged the full file read is skipped -- the same trade-off
# make(1) takes.
# Correctness risks: `touch` only triggers a harmless extra re-hash; same-size
# edits within NFS second-resolution mtime have a 1-second window (as with
# make). Use `graphify extract --force` to bypass the index when needed.
_stat_index: dict[str, dict] = {}
_stat_index_root: Path | None = None
_stat_index_dirty: bool = False


def _stat_index_file(root: Path) -> Path:
    """Return the location of the persisted stat index for ``root``.

    An absolute output directory is used as-is; a relative one is anchored at
    the resolved ``root``.
    """
    out_dir = Path(_GRAPHIFY_OUT)
    if not out_dir.is_absolute():
        out_dir = Path(root).resolve() / out_dir
    return out_dir / "cache" / "stat-index.json"
100
+
101
+
102
def _ensure_stat_index(root: Path) -> None:
    """Load the persisted stat index into memory, once per process.

    The first call fixes the index root to the resolved ``root``, reads the
    index file if it can be read, and registers ``_flush_stat_index`` with
    ``atexit``. Later calls return immediately, whatever ``root`` they pass.

    A missing, unreadable, undecodable, or malformed index file -- including
    valid JSON whose top level is not an object -- yields an empty index
    instead of an error, so hashing simply falls back to full reads.
    """
    global _stat_index, _stat_index_root
    if _stat_index_root is not None:
        return
    _stat_index_root = Path(root).resolve()
    index_path = _stat_index_file(_stat_index_root)
    try:
        loaded = json.loads(index_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: file absent or unreadable. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError from a corrupt file.
        loaded = None
    # Lookups call .get() on the index, so anything but a dict is unusable.
    _stat_index = loaded if isinstance(loaded, dict) else {}
    atexit.register(_flush_stat_index)
116
+
117
+
118
def _flush_stat_index() -> None:
    """Persist the in-memory stat index if it changed since the last flush.

    The index is serialised to a temporary file in the destination directory
    and then moved over the index file with ``os.replace``. Failures are
    swallowed; the dirty flag is cleared whether or not the write succeeded.
    """
    global _stat_index_dirty, _stat_index_root
    if _stat_index_root is None or not _stat_index_dirty:
        return
    destination = _stat_index_file(_stat_index_root)
    try:
        destination.parent.mkdir(parents=True, exist_ok=True)
        handle, scratch = tempfile.mkstemp(
            dir=destination.parent, prefix="stat-index.", suffix=".tmp"
        )
        try:
            serialised = json.dumps(_stat_index, separators=(",", ":")).encode()
            os.write(handle, serialised)
            os.close(handle)
            os.replace(scratch, destination)
        except Exception:
            # Release the descriptor and drop the temp file; either step may
            # fail when the descriptor is already closed or the file is gone.
            for cleanup, target in ((os.close, handle), (os.unlink, scratch)):
                try:
                    cleanup(target)
                except OSError:
                    pass
    except OSError:
        pass
    _stat_index_dirty = False
142
+
143
+
144
def _normalize_path(path: Path) -> Path:
    """Return ``path`` in a form that gives stable cache keys on Windows.

    On every platform other than win32 the argument is returned untouched.
    On win32 the extended-length prefix ``\\\\?\\`` is dropped and the
    remainder is passed through ``os.path.normcase``.
    """
    import sys

    if sys.platform == "win32":
        raw = str(path)
        extended_prefix = "\\\\?\\"
        if raw.startswith(extended_prefix):
            raw = raw[len(extended_prefix):]
        return Path(os.path.normcase(raw))
    return path
153
+
154
+
155
def file_hash(path: Path, root: Path = Path(".")) -> str:
    """Return the SHA256 hex digest of a file's contents plus its relative path.

    A stat-based fast path (size + mtime_ns) returns the previously recorded
    digest without reading the file when neither value changed. Otherwise the
    file is read and hashed, and the stat index is updated and marked dirty.

    The digest covers the content, a NUL separator, and the lower-cased POSIX
    path relative to the resolved ``root``. The resolved absolute path is used
    instead when the file lies outside ``root``.

    For Markdown files (``.md``) only the body below the YAML frontmatter is
    hashed, so metadata-only edits do not change the digest.

    Args:
        path: File to hash.
        root: Directory the path is made relative to.

    Returns:
        The hex digest string.

    Raises:
        IsADirectoryError: If ``path`` is not a regular file; this includes
            paths that do not exist.
        OSError: If the file cannot be read.
    """
    global _stat_index_dirty
    p = _normalize_path(Path(path))
    root = _normalize_path(Path(root))
    if not p.is_file():
        raise IsADirectoryError(f"file_hash requires a file, got: {p}")

    _ensure_stat_index(root)
    resolved = p.resolve()
    abs_key = str(resolved)
    try:
        st = p.stat()
    except OSError:
        st = None

    if st is not None:
        entry = _stat_index.get(abs_key)
        # The index is read from disk, so its shape is not trusted: anything
        # other than a dict carrying a string digest is treated as a miss and
        # the file is re-hashed instead of raising from the lookup.
        if (
            isinstance(entry, dict)
            and isinstance(entry.get("hash"), str)
            and entry.get("size") == st.st_size
            and entry.get("mtime_ns") == st.st_mtime_ns
        ):
            return entry["hash"]

    raw = p.read_bytes()
    content = _body_content(raw) if p.suffix.lower() == ".md" else raw
    h = hashlib.sha256()
    h.update(content)
    h.update(b"\x00")
    try:
        rel = resolved.relative_to(Path(root).resolve())
        h.update(rel.as_posix().lower().encode())
    except ValueError:
        # File lies outside root: fall back to its absolute path.
        h.update(resolved.as_posix().lower().encode())
    digest = h.hexdigest()

    if st is not None:
        _stat_index[abs_key] = {
            "size": st.st_size,
            "mtime_ns": st.st_mtime_ns,
            "hash": digest,
        }
        _stat_index_dirty = True

    return digest
205
+
206
+
207
def _relativize_source_files_in(payload: dict, root: Path) -> None:
    """Rewrite absolute ``source_file`` fields in ``payload`` in place as
    forward-slash paths relative to ``root``.

    Mirror of :func:`graphify.watch._relativize_source_files`, so cached
    extraction fragments are stored in portable form (#777). Fields that are
    already relative, and paths that fall outside ``root``, are left alone.

    Only ``root`` is resolved; each ``source_file`` is relativized
    symbolically so an in-root symlink keeps its own name instead of the name
    of its target. Same reasoning as
    :func:`graphify.detect._to_relative_for_storage`.
    """
    try:
        anchor = Path(root).resolve()
    except OSError:
        return
    escaping_prefixes = (".." + os.sep, "../")
    for bucket in ("nodes", "edges", "hyperedges"):
        for record in payload.get(bucket, []):
            source = record.get("source_file") if isinstance(record, dict) else None
            if not source or not Path(source).is_absolute():
                continue
            try:
                relative = os.path.relpath(Path(source), anchor)
            except (ValueError, OSError):
                # Not expressible relative to root (e.g. Windows cross-drive).
                continue
            if relative == ".." or relative.startswith(escaping_prefixes):
                # Escapes root: keep the absolute form.
                continue
            record["source_file"] = relative.replace(os.sep, "/")
241
+
242
+
243
def _absolutize_source_files_in(payload: dict, root: Path) -> None:
    """Undo :func:`_relativize_source_files_in` on ``payload`` in place.

    Relative ``source_file`` fields are re-anchored at the resolved ``root``
    so a fragment loaded from the cache has the same absolute-path shape as a
    fresh in-process extraction. Legacy entries whose ``source_file`` is
    already absolute are left unchanged.
    """
    try:
        anchor = Path(root).resolve()
    except OSError:
        return
    for bucket in ("nodes", "edges", "hyperedges"):
        for record in payload.get(bucket, []):
            if isinstance(record, dict) and record.get("source_file"):
                stored = Path(record["source_file"])
                if not stored.is_absolute():
                    try:
                        record["source_file"] = str(anchor / stored)
                    except (TypeError, OSError):
                        pass
269
+
270
+
271
def cache_dir(root: Path = Path("."), kind: str = "ast") -> Path:
    """Return the cache directory for ``kind``, creating it when missing.

    ``kind`` is "ast" or "semantic". Keeping them in separate subdirectories
    stops semantic entries from overwriting AST entries for the same
    source_file (#582).

    AST entries live in graphify-out/cache/ast/v{version}/ because they depend
    on the extractor code as well as on file contents; asking for the AST
    directory also triggers the sweep of entries from other versions.
    Semantic entries live unversioned in graphify-out/cache/semantic/
    (re-extraction costs LLM calls).
    """
    out_dir = Path(_GRAPHIFY_OUT)
    if not out_dir.is_absolute():
        out_dir = Path(root).resolve() / out_dir
    kind_dir = out_dir / "cache" / kind
    if kind != "ast":
        kind_dir.mkdir(parents=True, exist_ok=True)
        return kind_dir
    versioned = kind_dir / f"v{_EXTRACTOR_VERSION}"
    _cleanup_stale_ast_entries(kind_dir, versioned)
    versioned.mkdir(parents=True, exist_ok=True)
    return versioned
290
+
291
+
292
def load_cached(path: Path, root: Path = Path("."), kind: str = "ast") -> dict | None:
    """Return the cached extraction for ``path``, or None on a cache miss.

    Cache key: the digest from :func:`file_hash` (file contents plus the path
    relative to ``root``).
    Cache value: graphify-out/cache/{kind}/{hash}.json (AST entries under the
    per-version subdirectory, see :func:`cache_dir`).

    AST entries written by other graphify versions -- including the legacy
    flat cache/ layout (pre-0.5.3) and the unversioned cache/ast/ layout --
    are deliberately not consulted: they were produced by a different
    extractor and may be stale.

    Returns None when the file cannot be hashed, when no entry exists, or when
    the entry is unreadable, undecodable, or not a JSON object.
    """
    try:
        h = file_hash(path, root)
    except OSError:
        return None
    entry = cache_dir(root, kind) / f"{h}.json"
    try:
        result = json.loads(entry.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: entry absent or unreadable. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError from a corrupt entry.
        return None
    if not isinstance(result, dict):
        # A non-object payload cannot be a valid fragment; report a miss so
        # callers that call .get() on the result never see it.
        return None
    # Re-anchor relative source_file fields so callers see the same
    # absolute-path shape that a fresh in-process extraction produces (#777).
    # Legacy entries with absolute source_file pass through.
    _absolutize_source_files_in(result, root)
    return result
322
+
323
+
324
def save_cached(path: Path, result: dict, root: Path = Path("."), kind: str = "ast") -> None:
    """Write the extraction ``result`` for ``path`` into the cache.

    The entry is stored as graphify-out/cache/{kind}/{hash}.json, where hash is
    the current :func:`file_hash` of the file. ``result`` should be a dict
    with 'nodes' and 'edges' lists.

    Does nothing when ``path`` is not a regular file. Subagent-produced
    semantic fragments occasionally carry a directory path in ``source_file``;
    skipping them keeps an IsADirectoryError from aborting the whole batch.

    Errors raised while writing the entry are re-raised after the temporary
    file has been cleaned up.
    """
    source = Path(path)
    if not source.is_file():
        return
    # source_file fields are relativized against ``root`` before the write so
    # the file on disk is portable across machines and checkout directories
    # (#777). The relativized form is built on a deep copy: downstream pipeline
    # steps (notably extract.py's AST prefix remap, which looks up
    # Path(source_file).resolve() in a prefix table) rely on the caller's dict
    # keeping its original absolute paths.
    payload = result
    has_records = isinstance(result, dict) and any(
        result.get(bucket) for bucket in ("nodes", "edges", "hyperedges")
    )
    if has_records:
        import copy as _copy

        payload = _copy.deepcopy(result)
        _relativize_source_files_in(payload, root)
    digest = file_hash(source, root)
    destination_dir = cache_dir(root, kind)
    destination = destination_dir / f"{digest}.json"
    handle, scratch = tempfile.mkstemp(
        dir=destination_dir, prefix=f"{digest}.", suffix=".tmp"
    )
    try:
        os.write(handle, json.dumps(payload).encode())
        os.close(handle)
        try:
            os.replace(scratch, destination)
        except PermissionError:
            # Windows: os.replace can fail with WinError 5 while the target
            # is briefly locked. Fall back to copy-then-delete.
            import shutil

            shutil.copy2(scratch, destination)
            os.unlink(scratch)
    except Exception:
        # Release the descriptor and drop the temp file before re-raising;
        # either step may fail if it already happened.
        for cleanup, target in ((os.close, handle), (os.unlink, scratch)):
            try:
                cleanup(target)
            except OSError:
                pass
        raise
377
+
378
+
379
def cached_files(root: Path = Path(".")) -> set[str]:
    """Return the set of file hashes that have a cache entry (any kind).

    Collects the stems of ``*.json`` files from the legacy flat ``cache/``
    layout, from ``cache/ast/`` (recursively, covering per-version
    subdirectories) and from ``cache/semantic/``. The stat index, which is
    stored alongside the legacy entries, is not a cache entry and is skipped.
    """
    base = Path(root).resolve() / _GRAPHIFY_OUT / "cache"
    # The stat index is a *.json file directly under cache/, so the legacy
    # glob would otherwise report its stem as if it were a file hash.
    index_name = _stat_index_file(root).name
    hashes: set[str] = set()
    # Legacy flat entries
    if base.is_dir():
        hashes.update(
            p.stem for p in base.glob("*.json") if p.name != index_name
        )
    # Namespaced entries (ast/ recursively, covering per-version subdirs)
    for kind, pattern in (("ast", "**/*.json"), ("semantic", "*.json")):
        d = base / kind
        if d.is_dir():
            hashes.update(p.stem for p in d.glob(pattern))
    return hashes
392
+
393
+
394
def clear_cache(root: Path = Path(".")) -> None:
    """Delete every ``*.json`` cache file: legacy flat entries directly under
    cache/, then ast/ (recursively, covering per-version subdirectories), then
    semantic/.
    """
    cache_root = Path(root).resolve() / _GRAPHIFY_OUT / "cache"
    sweep_plan = (
        (cache_root, "*.json"),  # legacy flat entries
        (cache_root / "ast", "**/*.json"),  # namespaced, per-version subdirs
        (cache_root / "semantic", "*.json"),
    )
    for directory, pattern in sweep_plan:
        if not directory.is_dir():
            continue
        for stale in directory.glob(pattern):
            stale.unlink()
407
+
408
+
409
def check_semantic_cache(
    files: list[str],
    root: Path = Path("."),
) -> tuple[list[dict], list[dict], list[dict], list[str]]:
    """Split ``files`` into semantic-cache hits and misses.

    Relative paths are anchored at ``root`` before the lookup.

    Returns (cached_nodes, cached_edges, cached_hyperedges, uncached_files).
    Uncached files need Claude extraction; the contents of cached files are
    merged directly into the three lists. ``uncached_files`` holds the paths
    exactly as they were passed in.
    """
    hits: dict[str, list[dict]] = {"nodes": [], "edges": [], "hyperedges": []}
    misses: list[str] = []

    for fpath in files:
        candidate = Path(fpath)
        if not candidate.is_absolute():
            candidate = Path(root) / candidate
        cached = load_cached(candidate, root, kind="semantic")
        if cached is None:
            misses.append(fpath)
            continue
        for bucket, collected in hits.items():
            collected.extend(cached.get(bucket, []))

    return hits["nodes"], hits["edges"], hits["hyperedges"], misses
436
+
437
+
438
def save_semantic_cache(
    nodes: list[dict],
    edges: list[dict],
    hyperedges: list[dict] | None = None,
    root: Path = Path("."),
) -> int:
    """Store semantic extraction results in the cache, one entry per file.

    Nodes, edges and hyperedges are grouped by their ``source_file``; items
    without one are ignored. Each group whose path (anchored at ``root`` when
    relative) is an existing regular file is saved under cache/semantic/,
    separate from the AST entries in cache/ast/ to prevent hash-key collisions
    (#582).

    Returns the number of files cached.
    """
    from collections import defaultdict

    grouped: dict[str, dict] = defaultdict(
        lambda: {"nodes": [], "edges": [], "hyperedges": []}
    )
    buckets = (
        ("nodes", nodes),
        ("edges", edges),
        ("hyperedges", hyperedges or []),
    )
    for bucket, records in buckets:
        for record in records:
            origin = record.get("source_file", "")
            if origin:
                grouped[origin][bucket].append(record)

    written = 0
    for origin, fragment in grouped.items():
        target = Path(origin)
        if not target.is_absolute():
            target = Path(root) / target
        if target.is_file():
            save_cached(target, fragment, root, kind="semantic")
            written += 1
    return written