code-review-graph-codeblackwell 2.3.6.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. code_review_graph/__init__.py +20 -0
  2. code_review_graph/__main__.py +4 -0
  3. code_review_graph/analysis.py +410 -0
  4. code_review_graph/changes.py +409 -0
  5. code_review_graph/cli.py +1255 -0
  6. code_review_graph/communities.py +874 -0
  7. code_review_graph/constants.py +23 -0
  8. code_review_graph/context_savings.py +317 -0
  9. code_review_graph/custom_languages.py +322 -0
  10. code_review_graph/daemon.py +1009 -0
  11. code_review_graph/daemon_cli.py +320 -0
  12. code_review_graph/docs/LLM-OPTIMIZED-REFERENCE.md +71 -0
  13. code_review_graph/embeddings.py +1006 -0
  14. code_review_graph/enrich.py +303 -0
  15. code_review_graph/eval/__init__.py +33 -0
  16. code_review_graph/eval/benchmarks/__init__.py +1 -0
  17. code_review_graph/eval/benchmarks/agent_baseline.py +193 -0
  18. code_review_graph/eval/benchmarks/build_performance.py +60 -0
  19. code_review_graph/eval/benchmarks/flow_completeness.py +36 -0
  20. code_review_graph/eval/benchmarks/impact_accuracy.py +220 -0
  21. code_review_graph/eval/benchmarks/multi_hop_retrieval.py +125 -0
  22. code_review_graph/eval/benchmarks/search_quality.py +59 -0
  23. code_review_graph/eval/benchmarks/token_efficiency.py +143 -0
  24. code_review_graph/eval/configs/code-review-graph.yaml +50 -0
  25. code_review_graph/eval/configs/express.yaml +45 -0
  26. code_review_graph/eval/configs/fastapi.yaml +48 -0
  27. code_review_graph/eval/configs/flask.yaml +50 -0
  28. code_review_graph/eval/configs/gin.yaml +51 -0
  29. code_review_graph/eval/configs/httpx.yaml +48 -0
  30. code_review_graph/eval/reporter.py +301 -0
  31. code_review_graph/eval/runner.py +211 -0
  32. code_review_graph/eval/scorer.py +85 -0
  33. code_review_graph/eval/token_benchmark.py +182 -0
  34. code_review_graph/exports.py +409 -0
  35. code_review_graph/flows.py +698 -0
  36. code_review_graph/graph.py +1427 -0
  37. code_review_graph/graph_diff.py +122 -0
  38. code_review_graph/hints.py +384 -0
  39. code_review_graph/incremental.py +1245 -0
  40. code_review_graph/jedi_resolver.py +303 -0
  41. code_review_graph/main.py +1079 -0
  42. code_review_graph/memory.py +142 -0
  43. code_review_graph/migrations.py +284 -0
  44. code_review_graph/parser.py +6957 -0
  45. code_review_graph/postprocessing.py +134 -0
  46. code_review_graph/prompts.py +159 -0
  47. code_review_graph/refactor.py +852 -0
  48. code_review_graph/registry.py +319 -0
  49. code_review_graph/rescript_resolver.py +206 -0
  50. code_review_graph/search.py +447 -0
  51. code_review_graph/skills.py +1481 -0
  52. code_review_graph/spring_resolver.py +200 -0
  53. code_review_graph/temporal_resolver.py +199 -0
  54. code_review_graph/token_benchmark.py +125 -0
  55. code_review_graph/tools/__init__.py +156 -0
  56. code_review_graph/tools/_common.py +176 -0
  57. code_review_graph/tools/analysis_tools.py +184 -0
  58. code_review_graph/tools/build.py +541 -0
  59. code_review_graph/tools/community_tools.py +246 -0
  60. code_review_graph/tools/context.py +152 -0
  61. code_review_graph/tools/docs.py +274 -0
  62. code_review_graph/tools/flows_tools.py +176 -0
  63. code_review_graph/tools/query.py +692 -0
  64. code_review_graph/tools/refactor_tools.py +168 -0
  65. code_review_graph/tools/registry_tools.py +125 -0
  66. code_review_graph/tools/review.py +477 -0
  67. code_review_graph/tsconfig_resolver.py +257 -0
  68. code_review_graph/visualization.py +2184 -0
  69. code_review_graph/wiki.py +305 -0
  70. code_review_graph_codeblackwell-2.3.6.post1.dist-info/METADATA +718 -0
  71. code_review_graph_codeblackwell-2.3.6.post1.dist-info/RECORD +74 -0
  72. code_review_graph_codeblackwell-2.3.6.post1.dist-info/WHEEL +4 -0
  73. code_review_graph_codeblackwell-2.3.6.post1.dist-info/entry_points.txt +3 -0
  74. code_review_graph_codeblackwell-2.3.6.post1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,1245 @@
1
+ """Incremental graph update logic.
2
+
3
+ Detects changed files via git diff, re-parses only changed + impacted files,
4
+ and updates the graph accordingly. Also supports CLI invocation for hooks.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import concurrent.futures
10
+ import fnmatch
11
+ import hashlib
12
+ import logging
13
+ import os
14
+ import re
15
+ import subprocess
16
+ import sys
17
+ import threading
18
+ import time
19
+ from pathlib import Path, PurePosixPath
20
+ from typing import Callable, Optional
21
+
22
+ from .graph import GraphStore
23
+ from .parser import CodeParser
24
+
25
# Upper bound on parallel parse workers. Defaults to the CPU count (4 when it
# cannot be determined) capped at 8; override with CRG_PARSE_WORKERS.
# NOTE(review): a non-integer CRG_PARSE_WORKERS raises ValueError at import time.
_MAX_PARSE_WORKERS = int(os.environ.get("CRG_PARSE_WORKERS", str(min(os.cpu_count() or 4, 8))))
26
+
27
+
28
+ def _select_executor_kind() -> str:
29
+ """Return 'process' or 'thread' for parallel parsing.
30
+
31
+ Defaults to ``process`` (the original behavior, fastest on Linux/macOS).
32
+ Auto-switches to ``thread`` when running on Windows with stdin not
33
+ attached to a TTY — that combination indicates an MCP/stdio host, where
34
+ ``ProcessPoolExecutor`` workers inherit the parent's pipe handles and
35
+ leak as zombies after the pool closes (issues #46, #136).
36
+
37
+ Override explicitly with ``CRG_PARSE_EXECUTOR={process,thread}``.
38
+
39
+ Tree-sitter parsing in the worker releases the GIL during native
40
+ parsing, so the speedup loss for falling back to threads is small
41
+ (typically <30% on the full-build path) and the trade is worth it
42
+ to avoid the deadlock + zombie process accumulation.
43
+ """
44
+ explicit = os.environ.get("CRG_PARSE_EXECUTOR", "").strip().lower()
45
+ if explicit in ("process", "thread"):
46
+ return explicit
47
+ if sys.platform == "win32" and not sys.stdin.isatty():
48
+ return "thread"
49
+ return "process"
50
+
51
+
52
def _make_executor(max_workers: int):
    """Build the pool used for parallel parsing.

    The pool class follows :func:`_select_executor_kind`: a thread pool when
    it returns ``"thread"``, otherwise a process pool.
    """
    futures = concurrent.futures
    pool_cls = (
        futures.ThreadPoolExecutor
        if _select_executor_kind() == "thread"
        else futures.ProcessPoolExecutor
    )
    return pool_cls(max_workers=max_workers)
57
+
58
+ logger = logging.getLogger(__name__)
59
+
60
+
61
def _run_rescript_resolver(store: GraphStore) -> Optional[dict]:
    """Best-effort ReScript cross-module resolution pass.

    Any failure (including a failed import of the resolver module) is logged
    as a warning and swallowed so the build never fails because of it.

    Returns:
        The resolver's stats, or ``None`` if it raised.
    """
    stats: Optional[dict] = None
    try:
        from .rescript_resolver import resolve_rescript_cross_module as resolve
        stats = resolve(store)
    except Exception as err:  # noqa: BLE001 - best-effort post-pass
        logger.warning("ReScript cross-module resolver failed: %s", err)
    return stats
71
+
72
+
73
def _run_spring_resolver(store: GraphStore) -> Optional[dict]:
    """Best-effort Spring DI call resolution pass.

    Any failure (including a failed import of the resolver module) is logged
    as a warning and swallowed so the build never fails because of it.

    Returns:
        The resolver's stats, or ``None`` if it raised.
    """
    stats: Optional[dict] = None
    try:
        from .spring_resolver import resolve_spring_di_calls as resolve
        stats = resolve(store)
    except Exception as err:  # noqa: BLE001 - best-effort post-pass
        logger.warning("Spring DI resolver failed: %s", err)
    return stats
83
+
84
+
85
def _run_temporal_resolver(store: GraphStore) -> Optional[dict]:
    """Best-effort Temporal workflow/activity call resolution pass.

    Any failure (including a failed import of the resolver module) is logged
    as a warning and swallowed so the build never fails because of it.

    Returns:
        The resolver's stats, or ``None`` if it raised.
    """
    stats: Optional[dict] = None
    try:
        from .temporal_resolver import resolve_temporal_calls as resolve
        stats = resolve(store)
    except Exception as err:  # noqa: BLE001 - best-effort post-pass
        logger.warning("Temporal resolver failed: %s", err)
    return stats
95
+
96
+ # Default ignore patterns (in addition to .gitignore).
97
+ #
98
+ # `<dir>/**` patterns are matched at any depth by _should_ignore, so
99
+ # `node_modules/**` also excludes `packages/app/node_modules/react/index.js`
100
+ # inside monorepos. See: #91
101
# Built-in fnmatch-style ignore patterns; _load_ignore_patterns copies this
# list and appends the entries from a repo's .code-review-graphignore file.
DEFAULT_IGNORE_PATTERNS = [
    # Tool data, VCS metadata and common dependency/build output directories
    ".code-review-graph/**",
    "node_modules/**",
    ".git/**",
    ".svn/**",
    "__pycache__/**",
    "*.pyc",
    ".venv/**",
    "venv/**",
    "dist/**",
    "build/**",
    ".next/**",
    "target/**",
    # PHP / Laravel / Composer
    "vendor/**",
    "bootstrap/cache/**",
    "public/build/**",
    # Ruby / Bundler
    ".bundle/**",
    # Java / Kotlin / Gradle
    ".gradle/**",
    "*.jar",
    # Dart / Flutter
    ".dart_tool/**",
    ".pub-cache/**",
    # General
    "coverage/**",
    ".cache/**",
    "*.min.js",
    "*.min.css",
    "*.map",
    "*.lock",
    "package-lock.json",
    "yarn.lock",  # also matched by "*.lock" above
    # SQLite databases and their side-files
    "*.db",
    "*.sqlite",
    "*.db-journal",
    "*.db-wal",
]
140
+
141
+
142
def find_svn_root(start: Path | None = None) -> Optional[Path]:
    """Locate the SVN working-copy root at or above ``start``.

    SVN 1.7+ keeps one ``.svn`` at the working-copy root, while older
    versions put one in every directory. Returning the *topmost* directory
    that contains ``.svn`` identifies the root in both layouts.

    Args:
        start: Directory to begin from. Defaults to ``Path.cwd()``.

    Returns:
        The highest ancestor (or ``start`` itself) containing ``.svn``, or
        ``None`` when no directory on the way up has one.
    """
    origin = start if start else Path.cwd()
    topmost: Optional[Path] = None
    # origin followed by each ancestor, innermost first; later hits win.
    for directory in (origin, *origin.parents):
        if (directory / ".svn").exists():
            topmost = directory
    return topmost
158
+
159
+
160
def find_repo_root(
    start: Path | None = None,
    stop_at: Path | None = None,
) -> Optional[Path]:
    """Find the nearest enclosing Git repository, falling back to SVN.

    Args:
        start: Directory to begin from. Defaults to ``Path.cwd()``.
        stop_at: Optional upper boundary. ``stop_at`` itself is still
            checked for ``.git``; if it has none the search ends there and
            ``None`` is returned without climbing further. Handy for tests
            working under ``tmp_path`` and for callers that must not wander
            into unrelated parent repositories. See #241.

    Returns:
        The closest ancestor (or ``start``) containing ``.git``. If the walk
        reaches the top without finding one, the result of
        :func:`find_svn_root` for ``start``. ``None`` when the ``stop_at``
        boundary is hit first or nothing is found.
    """
    origin = start if start else Path.cwd()
    for directory in (origin, *origin.parents):
        if (directory / ".git").exists():
            return directory
        at_top = directory.parent == directory
        # The boundary check is not applied to the topmost directory.
        if not at_top and stop_at is not None and directory == stop_at:
            return None
    # No Git root found — try SVN
    return find_svn_root(start)
193
+
194
+
195
def detect_vcs(root: Path) -> str:
    """Classify *root* as ``'git'``, ``'svn'`` or ``'none'``.

    Only markers directly inside *root* are inspected; ``.git`` takes
    precedence over ``.svn`` when both exist.
    """
    for marker, label in ((".git", "git"), (".svn", "svn")):
        if (root / marker).exists():
            return label
    return "none"
202
+
203
+
204
def find_project_root(
    start: Path | None = None,
    stop_at: Path | None = None,
) -> Path:
    """Resolve the project root.

    Candidates, in order of precedence:

    1. ``CRG_REPO_ROOT`` — explicit override for scripted use from outside
       the repo (CI jobs, daemons, multi-repo orchestrators). Used only
       when the expanded, resolved path exists. See: #155
    2. The repository root reported by :func:`find_repo_root` for ``start``
       (bounded by ``stop_at`` when given; see #241).
    3. ``start`` itself, or the current working directory.
    """
    override = os.environ.get("CRG_REPO_ROOT", "").strip()
    if override:
        candidate = Path(override).expanduser().resolve()
        if candidate.exists():
            return candidate
    detected = find_repo_root(start, stop_at=stop_at)
    return detected or start or Path.cwd()
232
+
233
+
234
+ def _write_data_dir_gitignore(data_dir: Path) -> None:
235
+ """Write .gitignore file in data directory if it doesn't exist.
236
+
237
+ The gitignore contains a single '*' to prevent accidental commits.
238
+ """
239
+ inner_gitignore = data_dir / ".gitignore"
240
+ if not inner_gitignore.exists():
241
+ try:
242
+ # `encoding="utf-8"` is REQUIRED — the em-dash in the header is
243
+ # U+2014 which falls outside cp1252. On Windows, calling
244
+ # write_text without an encoding silently uses the system default
245
+ # codepage, producing a file that subsequently fails to decode as
246
+ # UTF-8 (see issue #239).
247
+ inner_gitignore.write_text(
248
+ "# Auto-generated by code-review-graph — do not commit database files.\n"
249
+ "# The graph.db contains absolute paths and code structure metadata.\n"
250
+ "*\n",
251
+ encoding="utf-8",
252
+ )
253
+ except OSError:
254
+ # Data dir might be read-only (rare); that's OK, it's a best-effort guard.
255
+ pass
256
+
257
+
258
def get_data_dir(repo_root: Path) -> Path:
    """Return (creating it if needed) the directory holding this repo's graph data.

    Resolution priority:
        1. Registry entry for this repo (set via --data-dir)
        2. ``CRG_DATA_DIR`` environment variable (global override, with
           ``~`` expanded and the path resolved) — lets graphs live outside
           the working tree (ephemeral workspaces, Docker volumes, shared
           caches). See: #155
        3. Default: ``<repo_root>/.code-review-graph``

    The chosen directory is created when missing and receives an inner
    ``.gitignore`` (``*``) so nested files never get committed. Both steps
    are idempotent. Any failure while consulting or preparing the registry
    location is logged at debug level and the next option is tried.
    """

    def _prepare(directory: Path) -> Path:
        directory.mkdir(parents=True, exist_ok=True)
        _write_data_dir_gitignore(directory)
        return directory

    # 1. Registry
    try:
        from .registry import Registry
        registered = Registry().get_data_dir_for_repo(str(repo_root))
        if registered:
            return _prepare(Path(registered).resolve())
    except Exception as exc:
        logger.debug("Registry lookup failed for %s: %s", repo_root, exc)

    # 2. Environment override, else 3. in-repo default
    override = os.environ.get("CRG_DATA_DIR", "").strip()
    chosen = (
        Path(override).expanduser().resolve()
        if override
        else repo_root / ".code-review-graph"
    )
    return _prepare(chosen)
299
+
300
+
301
def get_db_path(repo_root: Path) -> Path:
    """Determine the database path for a repository.

    The database lives at ``graph.db`` inside :func:`get_data_dir`. When a
    legacy top-level ``.code-review-graph.db`` exists and the new database
    does not, the legacy file is moved into place and its stale
    WAL/SHM/journal side-files are deleted.

    The migration is best-effort. If the move fails — e.g. the data
    directory is on a different filesystem, where ``rename`` cannot work —
    a warning is logged, the legacy file is left untouched, and the new
    path is still returned.

    Args:
        repo_root: Repository root directory.

    Returns:
        Path of the (possibly not yet existing) ``graph.db``.
    """
    crg_dir = get_data_dir(repo_root)
    new_db = crg_dir / "graph.db"

    legacy_db = repo_root / ".code-review-graph.db"
    if legacy_db.exists() and not new_db.exists():
        try:
            legacy_db.rename(new_db)
        except OSError as exc:
            logger.warning(
                "Could not migrate legacy database %s to %s: %s",
                legacy_db, new_db, exc,
            )
            return new_db
        # Discard stale WAL/SHM side-files from the old location
        for suffix in ("-wal", "-shm", "-journal"):
            side = repo_root / f".code-review-graph.db{suffix}"
            try:
                if side.exists():
                    side.unlink()
            except OSError as exc:
                logger.debug("Could not remove stale side-file %s: %s", side, exc)

    return new_db
325
+
326
+
327
def ensure_repo_gitignore_excludes_crg(repo_root: Path) -> str:
    """Ensure repo-level .gitignore excludes ``.code-review-graph/``.

    An existing entry is recognised with or without a leading ``/``
    (``.code-review-graph``, ``.code-review-graph/...``,
    ``/.code-review-graph``, ``/.code-review-graph/...``), so a root-anchored
    entry does not cause a duplicate block to be appended. Blank lines and
    comments are skipped.

    Returns one of:
        - ``created``: there was no existing content, so the file was
          written with just the entry
        - ``updated``: entry was appended to existing content
        - ``already-present``: no changes were needed
    """
    gitignore_path = repo_root / ".gitignore"
    existing = gitignore_path.read_text(encoding="utf-8") if gitignore_path.exists() else ""

    for raw_line in existing.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        # A leading "/" only anchors the pattern to the repo root.
        entry = line[1:] if line.startswith("/") else line
        if entry == ".code-review-graph" or entry.startswith(".code-review-graph/"):
            return "already-present"

    block = "# Added by code-review-graph\n.code-review-graph/\n"
    # Make sure the block starts on its own line.
    prefix = "\n" if existing and not existing.endswith("\n") else ""
    gitignore_path.write_text(existing + prefix + block, encoding="utf-8")

    return "updated" if existing else "created"
352
+
353
+
354
def _load_ignore_patterns(repo_root: Path) -> list[str]:
    """Return the default ignore patterns plus any from ``.code-review-graphignore``.

    The optional file sits at the repo root and holds one pattern per line.
    Surrounding whitespace is stripped, and blank lines and ``#`` comments
    are skipped. Undecodable bytes are replaced rather than raising.
    """
    combined = list(DEFAULT_IGNORE_PATTERNS)
    custom_file = repo_root / ".code-review-graphignore"
    if not custom_file.exists():
        return combined
    text = custom_file.read_text(encoding="utf-8", errors="replace")
    stripped = (raw.strip() for raw in text.splitlines())
    combined.extend(entry for entry in stripped if entry and not entry.startswith("#"))
    return combined
364
+
365
+
366
+ def _should_ignore(path: str, patterns: list[str]) -> bool:
367
+ """Check if a path matches any ignore pattern.
368
+
369
+ Handles nested occurrences of ``<dir>/**`` patterns: for example,
370
+ ``node_modules/**`` also matches ``packages/app/node_modules/foo.js``
371
+ inside monorepos. ``fnmatch`` alone treats ``*`` as not crossing ``/``
372
+ and only matches the prefix, so we additionally test each path segment
373
+ against the bare prefix of ``<dir>/**`` patterns. See: #91
374
+ """
375
+ # Direct fnmatch first (cheap)
376
+ if any(fnmatch.fnmatch(path, p) for p in patterns):
377
+ return True
378
+ # Then: treat simple single-segment "dir/**" patterns as
379
+ # "this directory at any depth".
380
+ parts = PurePosixPath(path).parts
381
+ for p in patterns:
382
+ if not p.endswith("/**"):
383
+ continue
384
+ prefix = p[:-3]
385
+ # Only single-segment dir patterns (no "/" inside the prefix)
386
+ # qualify for nested matching.
387
+ if "/" in prefix or not prefix:
388
+ continue
389
+ if prefix in parts:
390
+ return True
391
+ return False
392
+
393
+
394
+ def _is_binary(path: Path) -> bool:
395
+ """Quick heuristic: check if file appears to be binary."""
396
+ try:
397
+ chunk = path.read_bytes()[:8192]
398
+ return b"\x00" in chunk
399
+ except (OSError, PermissionError):
400
+ return True
401
+
402
+
403
# Timeout applied to the git/svn subprocess calls in this module; override
# with CRG_GIT_TIMEOUT.
# NOTE(review): a non-integer CRG_GIT_TIMEOUT raises ValueError at import time.
_GIT_TIMEOUT = int(os.environ.get("CRG_GIT_TIMEOUT", "30"))  # seconds, configurable

# When True, `git ls-files --recurse-submodules` is used so that files
# inside git submodules are included in the graph. Opt-in via env var
# (accepted values, case-insensitive: "1", "true", "yes");
# can also be overridden per-call through function parameters.
_RECURSE_SUBMODULES = os.environ.get("CRG_RECURSE_SUBMODULES", "").lower() in ("1", "true", "yes")
409
+
410
+
411
def _git_branch_info(repo_root: Path) -> tuple[str, str]:
    """Return ``(branch_name, head_sha)`` for the repo's current state.

    Each value comes from its own ``git rev-parse`` call and is ``""`` when
    that call times out, git is not installed, or git exits non-zero.
    """

    def _rev_parse(*rev_args: str) -> str:
        try:
            proc = subprocess.run(
                ["git", "rev-parse", *rev_args],
                capture_output=True,
                text=True, encoding='utf-8', cwd=str(repo_root),
                timeout=_GIT_TIMEOUT,
                stdin=subprocess.DEVNULL,
            )
        except (subprocess.TimeoutExpired, FileNotFoundError):
            return ""
        return proc.stdout.strip() if proc.returncode == 0 else ""

    branch_name = _rev_parse("--abbrev-ref", "HEAD")
    head_sha = _rev_parse("HEAD")
    return branch_name, head_sha
440
+
441
+
442
def _svn_revision_info(repo_root: Path) -> tuple[str, str]:
    """Return ``(branch_path, revision_str)`` for the current SVN working copy.

    Both values are parsed from ``svn info``. The branch is the part of the
    URL starting at ``branches/``, ``tags/`` or ``trunk`` (checked in that
    order), or the last URL segment when none is present. Both are ``""``
    when svn is missing, times out, or exits non-zero.
    """
    try:
        proc = subprocess.run(
            ["svn", "info", "--non-interactive"],
            capture_output=True, text=True, encoding="utf-8", errors="replace",
            cwd=str(repo_root), timeout=_GIT_TIMEOUT,
            stdin=subprocess.DEVNULL,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError):
        return "", ""

    branch = ""
    rev = ""
    if proc.returncode != 0:
        return branch, rev

    layout_markers = ("/branches/", "/tags/", "/trunk")
    for info_line in proc.stdout.splitlines():
        if info_line.startswith("URL: "):
            url = info_line[5:].strip()
            # Extract trunk/branches/tags segment from SVN URL
            hit = next((m for m in layout_markers if m in url), None)
            if hit is not None:
                branch = url[url.index(hit):].lstrip("/")
            if not branch and url:
                branch = url.rstrip("/").split("/")[-1]
        elif info_line.startswith("Revision: "):
            rev = info_line[10:].strip()
    return branch, rev
470
+
471
+
472
# Allow-list for git revision arguments: letters, digits and _ . ~ ^ / @ { } -
# NOTE(review): "-" is allowed anywhere, including as the first character.
_SAFE_GIT_REF = re.compile(r"^[A-Za-z0-9_.~^/@{}\-]+$")
# SVN revision or range: N / rN, optionally followed by :N, :rN, :HEAD, :BASE
# or :COMMITTED (case-insensitive).
_SAFE_SVN_REV = re.compile(r"^r?\d+(:r?\d+|:HEAD|:BASE|:COMMITTED)?$", re.IGNORECASE)
474
+
475
+
476
def _store_vcs_metadata(repo_root: Path, store: "GraphStore") -> None:
    """Record the current branch and revision in the graph metadata.

    Git repos write ``git_branch`` / ``git_head_sha`` and SVN working copies
    write ``svn_branch`` / ``svn_revision``. Empty values are not written,
    and nothing is written when no VCS is detected.
    """
    vcs = detect_vcs(repo_root)
    if vcs == "git":
        keys = ("git_branch", "git_head_sha")
        values = _git_branch_info(repo_root)
    elif vcs == "svn":
        keys = ("svn_branch", "svn_revision")
        values = _svn_revision_info(repo_root)
    else:
        return
    for key, value in zip(keys, values):
        if value:
            store.set_metadata(key, value)
491
+
492
+
493
def get_changed_files(repo_root: Path, base: str = "HEAD~1") -> list[str]:
    """Get list of changed files via git diff or svn.

    Git: runs ``git diff --name-only <base> --``. If that fails (for
    example when *base* does not exist yet), falls back to the staged
    changes from ``git diff --name-only --cached``.

    SVN: when *base* looks like an SVN revision or range (e.g.
    ``"r100:HEAD"``) it is used with ``svn diff --summarize``. Any other
    value (including the git-style default) is ignored and working-copy
    modifications are taken from ``svn status``.

    Args:
        repo_root: Repository / working-copy root.
        base: Git revision, or SVN revision range, to compare against.

    Returns:
        Changed paths as reported by the VCS. Empty when *base* is rejected
        as unsafe, or when the VCS binary is missing or times out.
    """
    if detect_vcs(repo_root) == "svn":
        return _get_svn_changed_files(repo_root, base if _SAFE_SVN_REV.match(base) else None)
    # Git path. A value starting with "-" would be parsed by git as an
    # option rather than a revision, so reject it even though the
    # character allow-list permits "-".
    if base.startswith("-") or not _SAFE_GIT_REF.match(base):
        logger.warning("Invalid git ref rejected: %s", base)
        return []
    try:
        result = subprocess.run(
            ["git", "diff", "--name-only", base, "--"],
            capture_output=True,
            text=True, encoding='utf-8', cwd=str(repo_root),
            timeout=_GIT_TIMEOUT,
            stdin=subprocess.DEVNULL,
        )
        if result.returncode != 0:
            # Fallback: list staged changes instead (e.g. base is missing
            # because the repo has only its initial commit).
            result = subprocess.run(
                ["git", "diff", "--name-only", "--cached"],
                capture_output=True,
                text=True, encoding='utf-8', cwd=str(repo_root),
                timeout=_GIT_TIMEOUT,
                stdin=subprocess.DEVNULL,
            )
        return [f.strip() for f in result.stdout.splitlines() if f.strip()]
    except (FileNotFoundError, subprocess.TimeoutExpired):
        return []
528
+
529
+
530
def _get_svn_changed_files(repo_root: Path, rev_range: str | None = None) -> list[str]:
    """List changed files in an SVN working copy.

    With *rev_range* (e.g. ``"r100:HEAD"``) the files changed between those
    revisions are taken from ``svn diff --summarize`` (statuses M/A/D).
    Without it, working-copy modifications come from ``svn status``
    (statuses M/A/D/R/C). Returns ``[]`` when svn is missing, times out, or
    the summarize command exits non-zero.
    """
    try:
        if rev_range:
            proc = subprocess.run(
                ["svn", "diff", "--summarize", "--non-interactive", "-r", rev_range],
                capture_output=True, text=True, encoding="utf-8", errors="replace",
                cwd=str(repo_root), timeout=_GIT_TIMEOUT,
                stdin=subprocess.DEVNULL,
            )
            if proc.returncode != 0:
                logger.warning("svn diff --summarize failed (rc=%d): %s",
                               proc.returncode, proc.stderr[:200])
                return []
            # Format: "M path/to/file" (first char is status)
            return [
                row[1:].strip()
                for row in proc.stdout.splitlines()
                if len(row) >= 2 and row[0] in ("M", "A", "D")
            ]

        proc = subprocess.run(
            ["svn", "status", "--non-interactive"],
            capture_output=True, text=True, encoding="utf-8", errors="replace",
            cwd=str(repo_root), timeout=_GIT_TIMEOUT,
            stdin=subprocess.DEVNULL,
        )
        changed = []
        for row in proc.stdout.splitlines():
            # M=modified, A=added, D=deleted, R=replaced, C=conflicted
            if len(row) < 2 or row[0] not in ("M", "A", "D", "R", "C"):
                continue
            # SVN status: 8 fixed-width columns then the path
            changed.append(row[8:].strip() if len(row) > 8 else row[1:].strip())
        return changed
    except (FileNotFoundError, subprocess.TimeoutExpired):
        return []
575
+
576
+
577
def get_staged_and_unstaged(repo_root: Path) -> list[str]:
    """List every path reported as modified in the working tree.

    Git: each entry of ``git status --porcelain`` (for renames the new name
    is used). SVN: delegates to :func:`_get_svn_changed_files`. Returns
    ``[]`` when git is missing or times out.
    """
    if detect_vcs(repo_root) == "svn":
        return _get_svn_changed_files(repo_root)
    try:
        proc = subprocess.run(
            ["git", "status", "--porcelain"],
            capture_output=True,
            text=True, encoding='utf-8', cwd=str(repo_root),
            timeout=_GIT_TIMEOUT,
            stdin=subprocess.DEVNULL,
        )
    except (FileNotFoundError, subprocess.TimeoutExpired):
        return []

    entries = []
    for row in proc.stdout.splitlines():
        if len(row) <= 3:
            continue
        # Columns 0-2 hold the two status letters and a space.
        name = row[3:].strip()
        # Handle renamed files: "R old -> new"
        _old, arrow, renamed = name.partition(" -> ")
        if arrow:
            name = renamed
        entries.append(name)
    return entries
600
+
601
+
602
def get_all_tracked_files(
    repo_root: Path,
    recurse_submodules: bool | None = None,
) -> list[str]:
    """List the files tracked by the repo's VCS.

    Args:
        repo_root: Repository root directory.
        recurse_submodules: When true, ``git ls-files`` is run with
            ``--recurse-submodules`` so submodule files are included.
            ``None`` (default) defers to the ``CRG_RECURSE_SUBMODULES``
            environment setting. Ignored for SVN working copies.

    Returns:
        Tracked paths, or ``[]`` when git is missing or times out.
    """
    if detect_vcs(repo_root) == "svn":
        return _get_svn_all_tracked_files(repo_root)

    use_submodules = _RECURSE_SUBMODULES if recurse_submodules is None else recurse_submodules
    command = ["git", "ls-files"] + (["--recurse-submodules"] if use_submodules else [])

    try:
        proc = subprocess.run(
            command,
            capture_output=True,
            text=True, encoding='utf-8', cwd=str(repo_root),
            timeout=_GIT_TIMEOUT,
            stdin=subprocess.DEVNULL,
        )
    except (FileNotFoundError, subprocess.TimeoutExpired):
        return []
    names = (row.strip() for row in proc.stdout.splitlines())
    return [name for name in names if name]
637
+
638
+
639
+ def _get_svn_all_tracked_files(repo_root: Path) -> list[str]:
640
+ """Return SVN-versioned files by walking the working copy.
641
+
642
+ Uses ``svn list -R`` to get the server-side file list, falling back to
643
+ a filesystem walk (which is also the fallback in :func:`collect_all_files`).
644
+ """
645
+ try:
646
+ result = subprocess.run(
647
+ ["svn", "list", "--recursive", "--non-interactive"],
648
+ capture_output=True, text=True, encoding="utf-8", errors="replace",
649
+ cwd=str(repo_root), timeout=60, # svn list queries the server
650
+ stdin=subprocess.DEVNULL,
651
+ )
652
+ if result.returncode == 0:
653
+ # svn list returns paths relative to the WC URL; directories end with "/"
654
+ files = [
655
+ f.strip()
656
+ for f in result.stdout.splitlines()
657
+ if f.strip() and not f.strip().endswith("/")
658
+ ]
659
+ if files:
660
+ return files
661
+ except (FileNotFoundError, subprocess.TimeoutExpired):
662
+ pass
663
+ # Fallback: let collect_all_files do a filesystem walk
664
+ return []
665
+
666
+
667
def collect_all_files(
    repo_root: Path,
    recurse_submodules: bool | None = None,
) -> list[str]:
    """Gather every parseable file in the repo, honouring ignore patterns.

    Candidates are the VCS-tracked files when available, otherwise every
    file found by walking ``repo_root``. A candidate is kept only if it is
    not ignored, has a usable path length, is a regular non-symlink file,
    is in a language the parser recognises, and does not look binary.

    Args:
        repo_root: Repository root directory.
        recurse_submodules: If True, include files from git submodules.
            When *None*, falls back to ``CRG_RECURSE_SUBMODULES`` env var.

    Returns:
        Repo-relative paths of the files to parse.
    """
    patterns = _load_ignore_patterns(repo_root)
    parser = CodeParser(repo_root)

    # Prefer the VCS listing and walk the directory only as a fallback.
    tracked = get_all_tracked_files(repo_root, recurse_submodules)
    candidates = tracked or [
        str(p.relative_to(repo_root)) for p in repo_root.rglob("*") if p.is_file()
    ]

    selected = []
    for rel in candidates:
        if _should_ignore(rel, patterns):
            continue
        # Skip paths that would exceed OS filename limits (macOS: 255 bytes
        # per component, ~1024 total; Windows: 260 total).
        try:
            absolute = repo_root / rel
        except (OSError, ValueError):
            logger.debug("Skipping path that cannot be constructed: %s", rel)
            continue
        overlong = len(str(absolute)) > 1000 or any(
            len(part.encode()) > 255 for part in absolute.parts
        )
        if overlong:
            logger.debug("Skipping overlong path: %s", rel[:120])
            continue
        if not absolute.is_file() or absolute.is_symlink():
            continue
        if parser.detect_language(absolute) is None or _is_binary(absolute):
            continue
        selected.append(rel)

    return selected
714
+
715
+
716
# Default number of expansion hops for find_dependents; override with
# CRG_DEPENDENT_HOPS.
# NOTE(review): a non-integer CRG_DEPENDENT_HOPS raises ValueError at import time.
_MAX_DEPENDENT_HOPS = int(os.environ.get("CRG_DEPENDENT_HOPS", "2"))
# Size above which find_dependents truncates its result and flags it.
_MAX_DEPENDENT_FILES = 500
718
+
719
+
720
+ def _single_hop_dependents(store: GraphStore, file_path: str) -> set[str]:
721
+ """Find files that directly depend on *file_path* (single hop)."""
722
+ dependents: set[str] = set()
723
+ edges = store.get_edges_by_target(file_path)
724
+ for e in edges:
725
+ if e.kind == "IMPORTS_FROM":
726
+ dependents.add(e.file_path)
727
+
728
+ nodes = store.get_nodes_by_file(file_path)
729
+ for node in nodes:
730
+ for e in store.get_edges_by_target(node.qualified_name):
731
+ if e.kind in ("CALLS", "IMPORTS_FROM", "INHERITS", "IMPLEMENTS"):
732
+ dependents.add(e.file_path)
733
+
734
+ dependents.discard(file_path)
735
+ return dependents
736
+
737
+
738
class DependentList(list):
    """``list[str]`` of dependent files that also records whether it was capped.

    :func:`find_dependents` sets ``truncated = True`` when it cuts the
    result at ``_MAX_DEPENDENT_FILES``, so callers can tell a complete
    expansion from a partial one. See issue #261.

    It subclasses ``list`` directly, so iteration, ``len()`` and slicing
    work as on a plain list. Callers unaware of the flag are unaffected.
    """

    # True when the list was cut short and more dependents may exist.
    truncated: bool

    def __init__(self, items: list, *, truncated: bool = False) -> None:
        list.__init__(self, items)
        self.truncated = truncated
755
+
756
+
757
def find_dependents(
    store: GraphStore,
    file_path: str,
    max_hops: int = _MAX_DEPENDENT_HOPS,
) -> DependentList:
    """Find files that import from or depend on the given file.

    Expands outward hop by hop: each hop adds the direct dependents of the
    files discovered in the previous hop. Expansion ends after *max_hops*
    hops, when a hop discovers nothing new, or when the running total
    exceeds ``_MAX_DEPENDENT_FILES``.

    Args:
        store: Graph store to query.
        file_path: File whose dependents are wanted (never in the result).
        max_hops: Maximum number of expansion hops.

    Returns:
        A :class:`DependentList` — a regular ``list[str]`` that also
        carries a ``.truncated`` flag. When ``truncated is True`` the list
        holds the first ``_MAX_DEPENDENT_FILES`` paths in sorted order (so
        the kept subset is the same from run to run) and the full set of
        dependents was not explored. See issue #261.
    """
    all_dependents: set[str] = set()
    visited: set[str] = {file_path}
    frontier: set[str] = {file_path}
    for _hop in range(max_hops):
        next_frontier: set[str] = set()
        for fp in frontier:
            new_deps = _single_hop_dependents(store, fp) - visited
            all_dependents.update(new_deps)
            next_frontier.update(new_deps)
        visited.update(next_frontier)
        frontier = next_frontier
        if not frontier:
            break
        if len(all_dependents) > _MAX_DEPENDENT_FILES:
            logger.warning(
                "Dependent expansion capped at %d files (%d found so far) for %s",
                _MAX_DEPENDENT_FILES,
                len(all_dependents),
                file_path,
            )
            # Sort before slicing: set iteration order is arbitrary, so an
            # unsorted slice would keep a different subset on every run.
            return DependentList(
                sorted(all_dependents)[:_MAX_DEPENDENT_FILES],
                truncated=True,
            )
    return DependentList(list(all_dependents))
797
+
798
+
799
def _parse_single_file(
    args: tuple[str, str],
) -> tuple[str, list, list, str | None, str]:
    """Parse one file in a worker process.

    Must be a module-level function so ``ProcessPoolExecutor`` can
    serialise it across processes.

    Args:
        args: ``(rel_path, repo_root_str)`` — the file path relative to the
            repository root, and the root itself as a string.

    Returns:
        ``(rel_path, nodes, edges, error_or_none, file_hash)``. On success
        the error slot is ``None`` and ``file_hash`` is the SHA-256 hex
        digest of the file's bytes. On failure ``nodes`` and ``edges`` are
        empty, ``file_hash`` is ``""`` and the error slot holds a non-empty
        description of the exception.
    """
    rel_path, repo_root_str = args
    repo_root = Path(repo_root_str)
    abs_path = repo_root / rel_path
    try:
        raw = abs_path.read_bytes()
        fhash = hashlib.sha256(raw).hexdigest()
        parser = CodeParser(repo_root)
        nodes, edges = parser.parse_bytes(abs_path, raw)
    except Exception as e:
        # Callers test the error slot for truthiness. An exception raised
        # without a message stringifies to "", which would be mistaken for
        # success, so fall back to repr() to keep the slot non-empty.
        return (rel_path, [], [], str(e) or repr(e), "")
    return (rel_path, nodes, edges, None, fhash)
818
+
819
+
820
def full_build(
    repo_root: Path,
    store: GraphStore,
    recurse_submodules: bool | None = None,
) -> dict:
    """Full rebuild of the entire graph.

    Collects every parseable file, drops stored data for files that are no
    longer collected, parses each file (serially or through an executor)
    and stores the result, then records build metadata and runs the
    post-build resolvers.

    Args:
        repo_root: Repository root directory.
        store: Graph database store.
        recurse_submodules: If True, include files from git submodules.
            When *None*, falls back to ``CRG_RECURSE_SUBMODULES`` env var.

    Returns:
        A dict with ``files_parsed`` (number of files collected, including
        those that failed), ``total_nodes``, ``total_edges``,
        ``disambiguated_nodes``, ``errors`` (a list of
        ``{"file", "error"}`` dicts) and the ``rescript_resolution`` /
        ``spring_resolution`` / ``temporal_resolution`` resolver results.
    """
    parser = CodeParser(repo_root)
    files = collect_all_files(repo_root, recurse_submodules)

    # Purge stale data from files no longer on disk
    existing_files = set(store.get_all_files())
    current_abs = {str(repo_root / f) for f in files}
    stale_files = existing_files - current_abs
    for stale in stale_files:
        store.remove_file_data(stale)
    # Ensure deletions are persisted before store_file_nodes_edges()
    # starts its own explicit transaction via BEGIN IMMEDIATE.
    if stale_files:
        store.commit()

    total_nodes = 0
    total_edges = 0
    errors = []
    file_count = len(files)

    use_serial = os.environ.get("CRG_SERIAL_PARSE", "") == "1"

    if use_serial or file_count < 8:
        # Serial fallback (for debugging or tiny repos)
        for i, rel_path in enumerate(files, 1):
            full_path = repo_root / rel_path
            try:
                source = full_path.read_bytes()
                fhash = hashlib.sha256(source).hexdigest()
                nodes, edges = parser.parse_bytes(full_path, source)
                store.store_file_nodes_edges(str(full_path), nodes, edges, fhash)
                total_nodes += len(nodes)
                total_edges += len(edges)
            except OSError as e:
                # Covers PermissionError too (it is an OSError subclass).
                errors.append({"file": rel_path, "error": str(e)})
            except Exception as e:
                logger.warning("Error parsing %s: %s", rel_path, e)
                errors.append({"file": rel_path, "error": str(e)})
            if i % 50 == 0 or i == file_count:
                logger.info("Progress: %d/%d files parsed", i, file_count)
    else:
        # Parallel parsing — store calls remain serial (SQLite single-writer).
        # Executor kind auto-selected: process on Linux/macOS/Windows-TTY,
        # thread on Windows-MCP-stdio to avoid pipe-handle inheritance
        # deadlock (issues #46, #136). Override via CRG_PARSE_EXECUTOR env.
        args_list = [(rel_path, str(repo_root)) for rel_path in files]
        with _make_executor(_MAX_PARSE_WORKERS) as executor:
            for i, (rel_path, nodes, edges, error, fhash) in enumerate(
                executor.map(_parse_single_file, args_list, chunksize=20),
                1,
            ):
                if error:
                    logger.warning("Error parsing %s: %s", rel_path, error)
                    errors.append({"file": rel_path, "error": error})
                else:
                    # Match the serial branch: one file failing to store is
                    # recorded and the build carries on with the rest.
                    try:
                        store.store_file_nodes_edges(
                            str(repo_root / rel_path),
                            nodes,
                            edges,
                            fhash,
                        )
                    except Exception as e:
                        logger.warning("Error storing %s: %s", rel_path, e)
                        errors.append({"file": rel_path, "error": str(e)})
                    else:
                        total_nodes += len(nodes)
                        total_edges += len(edges)
                # Report progress for failed files as well, so the final
                # "N/N" line is not lost when the last file errors out.
                if i % 200 == 0 or i == file_count:
                    logger.info("Progress: %d/%d files parsed", i, file_count)

    store.set_metadata("last_updated", time.strftime("%Y-%m-%dT%H:%M:%S"))
    store.set_metadata("last_build_type", "full")
    _store_vcs_metadata(repo_root, store)
    store.commit()

    rescript_stats = _run_rescript_resolver(store)
    spring_stats = _run_spring_resolver(store)
    temporal_stats = _run_temporal_resolver(store)

    disambiguated = store.find_disambiguated_nodes()
    if disambiguated:
        logger.info(
            "Disambiguated %d duplicate qualified_name(s): %s",
            len(disambiguated), ", ".join(disambiguated),
        )

    return {
        "files_parsed": len(files),
        "total_nodes": total_nodes,
        "total_edges": total_edges,
        "disambiguated_nodes": disambiguated,
        "errors": errors,
        "rescript_resolution": rescript_stats,
        "spring_resolution": spring_stats,
        "temporal_resolution": temporal_stats,
    }
925
+
926
+
927
def incremental_update(
    repo_root: Path,
    store: GraphStore,
    base: str = "HEAD~1",
    changed_files: list[str] | None = None,
) -> dict:
    """Incremental update: re-parse changed + dependent files only.

    Args:
        repo_root: Repository root directory.
        store: Graph database store.
        base: Revision passed to ``get_changed_files`` when *changed_files*
            is not supplied.
        changed_files: Repository-relative paths to treat as changed. When
            *None*, the list is obtained from ``get_changed_files``.

    Returns:
        A dict with ``files_updated`` (size of the changed + dependent set),
        ``total_nodes``, ``total_edges``, ``disambiguated_nodes``,
        ``changed_files``, ``dependent_files``, ``errors`` and the
        ``rescript_resolution`` / ``spring_resolution`` /
        ``temporal_resolution`` resolver results (``None`` when the resolver
        was not run). The same keys are present when nothing changed.
    """
    # Determine changed files
    if changed_files is None:
        changed_files = get_changed_files(repo_root, base)

    if not changed_files:
        # Same keys as the regular return value so callers can index the
        # result without first checking whether anything changed.
        return {
            "files_updated": 0,
            "total_nodes": 0,
            "total_edges": 0,
            "disambiguated_nodes": [],
            "changed_files": [],
            "dependent_files": [],
            "errors": [],
            "rescript_resolution": None,
            "spring_resolution": None,
            "temporal_resolution": None,
        }

    # Built only once there is work to do.
    parser = CodeParser(repo_root)
    ignore_patterns = _load_ignore_patterns(repo_root)

    # Find dependent files (files that import from changed files)
    dependent_files: set[str] = set()
    for rel_path in changed_files:
        full_path = str(repo_root / rel_path)
        for d in find_dependents(store, full_path):
            # Convert back to relative path if needed
            try:
                dependent_files.add(str(Path(d).relative_to(repo_root)))
            except ValueError:
                dependent_files.add(d)

    # Combine changed + dependent
    all_files = set(changed_files) | dependent_files

    total_nodes = 0
    total_edges = 0
    errors = []

    # Separate deleted/unparseable files from files that need re-parsing
    to_parse: list[str] = []
    removed_any = False
    for rel_path in all_files:
        if _should_ignore(rel_path, ignore_patterns):
            continue
        abs_path = repo_root / rel_path
        if not abs_path.is_file():
            store.remove_file_data(str(abs_path))
            removed_any = True
            continue
        if parser.detect_language(abs_path) is None:
            continue
        # Quick hash check to skip unchanged files
        try:
            raw = abs_path.read_bytes()
            fhash = hashlib.sha256(raw).hexdigest()
            existing_nodes = store.get_nodes_by_file(str(abs_path))
            if existing_nodes and existing_nodes[0].file_hash == fhash:
                continue
        except OSError:
            # Unreadable right now: keep it queued so the parse stage below
            # records the failure in ``errors``.
            pass
        to_parse.append(rel_path)

    # Persist deletions before store_file_nodes_edges() opens its own
    # explicit transaction — avoids nested transaction errors.
    if removed_any:
        store.commit()

    use_serial = os.environ.get("CRG_SERIAL_PARSE", "") == "1"

    if use_serial or len(to_parse) < 8:
        for rel_path in to_parse:
            abs_path = repo_root / rel_path
            try:
                source = abs_path.read_bytes()
                fhash = hashlib.sha256(source).hexdigest()
                nodes, edges = parser.parse_bytes(abs_path, source)
                store.store_file_nodes_edges(str(abs_path), nodes, edges, fhash)
                total_nodes += len(nodes)
                total_edges += len(edges)
            except OSError as e:
                # Covers PermissionError too (it is an OSError subclass).
                errors.append({"file": rel_path, "error": str(e)})
            except Exception as e:
                logger.warning("Error parsing %s: %s", rel_path, e)
                errors.append({"file": rel_path, "error": str(e)})
    else:
        # Parallel parsing through the executor; store calls stay in this
        # thread, one file at a time.
        args_list = [(rel_path, str(repo_root)) for rel_path in to_parse]
        with _make_executor(_MAX_PARSE_WORKERS) as executor:
            for rel_path, nodes, edges, error, fhash in executor.map(
                _parse_single_file,
                args_list,
                chunksize=20,
            ):
                if error:
                    logger.warning("Error parsing %s: %s", rel_path, error)
                    errors.append({"file": rel_path, "error": error})
                    continue
                # Match the serial branch: a store failure for one file is
                # recorded and the update carries on with the rest.
                try:
                    store.store_file_nodes_edges(
                        str(repo_root / rel_path),
                        nodes,
                        edges,
                        fhash,
                    )
                except Exception as e:
                    logger.warning("Error storing %s: %s", rel_path, e)
                    errors.append({"file": rel_path, "error": str(e)})
                    continue
                total_nodes += len(nodes)
                total_edges += len(edges)

    store.set_metadata("last_updated", time.strftime("%Y-%m-%dT%H:%M:%S"))
    store.set_metadata("last_build_type", "incremental")
    _store_vcs_metadata(repo_root, store)
    store.commit()

    # Only re-run language-specific resolvers when the relevant files changed.
    rescript_changed = any(
        rp.endswith((".res", ".resi")) for rp in all_files
    )
    rescript_stats = (
        _run_rescript_resolver(store) if rescript_changed else None
    )

    # The Spring and temporal resolvers are both gated on Java files.
    spring_changed = any(rp.endswith(".java") for rp in all_files)
    spring_stats = _run_spring_resolver(store) if spring_changed else None
    temporal_stats = _run_temporal_resolver(store) if spring_changed else None

    disambiguated = store.find_disambiguated_nodes()

    return {
        "files_updated": len(all_files),
        "total_nodes": total_nodes,
        "total_edges": total_edges,
        "disambiguated_nodes": disambiguated,
        "changed_files": list(changed_files),
        "dependent_files": list(dependent_files),
        "errors": errors,
        "rescript_resolution": rescript_stats,
        "spring_resolution": spring_stats,
        "temporal_resolution": temporal_stats,
    }
1068
+
1069
+
1070
+ # ---------------------------------------------------------------------------
1071
+ # Watch mode
1072
+ # ---------------------------------------------------------------------------
1073
+
1074
+
1075
# Delay, in seconds, handed to the threading.Timer that flushes pending file
# events in watch(); every new event cancels and restarts that timer.
_DEBOUNCE_SECONDS = 0.3
1076
+
1077
+
1078
def watch(
    repo_root: Path,
    store: GraphStore,
    on_files_updated: Optional[Callable] = None,
) -> None:
    """Watch for file changes and auto-update the graph.

    Uses a 300ms debounce to batch rapid-fire saves into a single update.
    Blocks until interrupted with Ctrl+C; the observer is stopped and
    joined on the way out, whatever the reason for leaving the loop.

    Created and modified files are re-parsed after the debounce window,
    deleted files have their stored data removed immediately, and a moved
    file is treated as a deletion of the old path plus a creation of the
    new one.

    Args:
        repo_root: Repository root to watch.
        store: Graph database to update.
        on_files_updated: Optional callback invoked after each debounced
            batch of file updates completes. Receives the store as its
            only argument. Used by the CLI to run post-processing
            (FTS, flows, communities) after watch updates.
    """
    import threading

    from watchdog.events import FileSystemEventHandler
    from watchdog.observers import Observer

    parser = CodeParser(repo_root)
    ignore_patterns = _load_ignore_patterns(repo_root)

    class GraphUpdateHandler(FileSystemEventHandler):
        """Turns filesystem events into debounced graph updates."""

        def __init__(self):
            super().__init__()
            # Absolute paths waiting for the next flush.
            self._pending: set[str] = set()
            # Guards ``_pending`` and ``_timer``.
            self._lock = threading.Lock()
            self._timer: threading.Timer | None = None

        def _should_handle(self, path: str) -> bool:
            """Return True for non-symlink, non-ignored, parseable paths under the repo."""
            if Path(path).is_symlink():
                return False
            try:
                rel = str(Path(path).relative_to(repo_root))
            except ValueError:
                return False
            if _should_ignore(rel, ignore_patterns):
                return False
            if parser.detect_language(Path(path)) is None:
                return False
            return True

        def on_modified(self, event):
            if event.is_directory:
                return
            if self._should_handle(event.src_path):
                self._schedule(event.src_path)

        def on_created(self, event):
            if event.is_directory:
                return
            if self._should_handle(event.src_path):
                self._schedule(event.src_path)

        def on_deleted(self, event):
            if event.is_directory:
                return
            self._remove_file(event.src_path)

        def on_moved(self, event):
            # A rename is a delete of the old path plus a create of the new
            # one; without this the old path's data would stay in the graph
            # and the new path would never be indexed.
            if event.is_directory:
                return
            self._remove_file(event.src_path)
            if self._should_handle(event.dest_path):
                self._schedule(event.dest_path)

        def _remove_file(self, abs_path: str):
            """Drop stored data for a path that no longer exists."""
            # Only handle files we would normally track
            try:
                rel = str(Path(abs_path).relative_to(repo_root))
            except ValueError:
                return
            if _should_ignore(rel, ignore_patterns):
                return
            try:
                store.remove_file_data(abs_path)
                store.commit()
                logger.info("Removed: %s", rel)
            except Exception as e:
                logger.error("Error removing %s: %s", rel, e)

        def _schedule(self, abs_path: str):
            """Add file to pending set and reset the debounce timer."""
            with self._lock:
                self._pending.add(abs_path)
                if self._timer is not None:
                    self._timer.cancel()
                self._timer = threading.Timer(_DEBOUNCE_SECONDS, self._flush)
                self._timer.start()

        def _flush(self):
            """Process all pending files after the debounce window."""
            # Take a snapshot under the lock, then do the slow work outside
            # it so new events are not blocked while files are parsed.
            with self._lock:
                paths = list(self._pending)
                self._pending.clear()
                self._timer = None

            updated = 0
            for abs_path in paths:
                if self._update_file(abs_path):
                    updated += 1

            if updated > 0 and on_files_updated is not None:
                try:
                    on_files_updated(store)
                except Exception as e:
                    logger.error("Post-update callback failed: %s", e)

        def _update_file(self, abs_path: str) -> bool:
            """Re-parse and store one file; return True when it was updated."""
            path = Path(abs_path)
            if not path.is_file():
                return False
            if path.is_symlink():
                return False
            if _is_binary(path):
                return False
            try:
                source = path.read_bytes()
                fhash = hashlib.sha256(source).hexdigest()
                nodes, edges = parser.parse_bytes(path, source)
                store.store_file_nodes_edges(abs_path, nodes, edges, fhash)
                store.set_metadata("last_updated", time.strftime("%Y-%m-%dT%H:%M:%S"))
                store.commit()
                rel = str(path.relative_to(repo_root))
                logger.info(
                    "Updated: %s (%d nodes, %d edges)",
                    rel,
                    len(nodes),
                    len(edges),
                )
                return True
            except Exception as e:
                logger.error("Error updating %s: %s", abs_path, e)
                return False

    handler = GraphUpdateHandler()
    observer = Observer()
    observer.schedule(handler, str(repo_root), recursive=True)
    observer.start()

    logger.info("Watching %s for changes... (Ctrl+C to stop)", repo_root)
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        # Ctrl+C is the advertised way to end watch mode; leave quietly.
        pass
    finally:
        # Always shut the observer down so no watcher thread is left
        # running if the loop exits for any other reason.
        observer.stop()
        observer.join()
        logger.info("Watch stopped.")
1220
+
1221
+
1222
def start_watch_thread(
    repo_root: Path,
    store: GraphStore,
    daemon: bool = True,
    on_files_updated: Optional[Callable] = None,
) -> threading.Thread | None:
    """Start watch mode in a background thread.

    Args:
        repo_root: Repository root to watch.
        store: Graph database to update.
        daemon: Whether the watch thread is created as a daemon thread.
        on_files_updated: Optional callback forwarded to :func:`watch`,
            which invokes it with the store after each batch of updates.
            Defaults to ``None`` (no callback), matching the previous
            behaviour.

    Returns:
        The started thread, or None if watchdog is unavailable.
    """
    # Probe for the optional dependency here so the caller gets None back
    # instead of an ImportError surfacing inside the background thread.
    try:
        import watchdog  # noqa: F401
    except ImportError:
        logger.warning("watchdog not installed; auto-watch disabled")
        return None

    thread = threading.Thread(
        target=watch,
        args=(repo_root, store),
        kwargs={"on_files_updated": on_files_updated},
        daemon=daemon,
        name="crg-watch",
    )
    thread.start()
    logger.info("Auto-watch started for %s", repo_root)
    return thread