code-review-graph-codeblackwell 2.3.6.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_review_graph/__init__.py +20 -0
- code_review_graph/__main__.py +4 -0
- code_review_graph/analysis.py +410 -0
- code_review_graph/changes.py +409 -0
- code_review_graph/cli.py +1255 -0
- code_review_graph/communities.py +874 -0
- code_review_graph/constants.py +23 -0
- code_review_graph/context_savings.py +317 -0
- code_review_graph/custom_languages.py +322 -0
- code_review_graph/daemon.py +1009 -0
- code_review_graph/daemon_cli.py +320 -0
- code_review_graph/docs/LLM-OPTIMIZED-REFERENCE.md +71 -0
- code_review_graph/embeddings.py +1006 -0
- code_review_graph/enrich.py +303 -0
- code_review_graph/eval/__init__.py +33 -0
- code_review_graph/eval/benchmarks/__init__.py +1 -0
- code_review_graph/eval/benchmarks/agent_baseline.py +193 -0
- code_review_graph/eval/benchmarks/build_performance.py +60 -0
- code_review_graph/eval/benchmarks/flow_completeness.py +36 -0
- code_review_graph/eval/benchmarks/impact_accuracy.py +220 -0
- code_review_graph/eval/benchmarks/multi_hop_retrieval.py +125 -0
- code_review_graph/eval/benchmarks/search_quality.py +59 -0
- code_review_graph/eval/benchmarks/token_efficiency.py +143 -0
- code_review_graph/eval/configs/code-review-graph.yaml +50 -0
- code_review_graph/eval/configs/express.yaml +45 -0
- code_review_graph/eval/configs/fastapi.yaml +48 -0
- code_review_graph/eval/configs/flask.yaml +50 -0
- code_review_graph/eval/configs/gin.yaml +51 -0
- code_review_graph/eval/configs/httpx.yaml +48 -0
- code_review_graph/eval/reporter.py +301 -0
- code_review_graph/eval/runner.py +211 -0
- code_review_graph/eval/scorer.py +85 -0
- code_review_graph/eval/token_benchmark.py +182 -0
- code_review_graph/exports.py +409 -0
- code_review_graph/flows.py +698 -0
- code_review_graph/graph.py +1427 -0
- code_review_graph/graph_diff.py +122 -0
- code_review_graph/hints.py +384 -0
- code_review_graph/incremental.py +1245 -0
- code_review_graph/jedi_resolver.py +303 -0
- code_review_graph/main.py +1079 -0
- code_review_graph/memory.py +142 -0
- code_review_graph/migrations.py +284 -0
- code_review_graph/parser.py +6957 -0
- code_review_graph/postprocessing.py +134 -0
- code_review_graph/prompts.py +159 -0
- code_review_graph/refactor.py +852 -0
- code_review_graph/registry.py +319 -0
- code_review_graph/rescript_resolver.py +206 -0
- code_review_graph/search.py +447 -0
- code_review_graph/skills.py +1481 -0
- code_review_graph/spring_resolver.py +200 -0
- code_review_graph/temporal_resolver.py +199 -0
- code_review_graph/token_benchmark.py +125 -0
- code_review_graph/tools/__init__.py +156 -0
- code_review_graph/tools/_common.py +176 -0
- code_review_graph/tools/analysis_tools.py +184 -0
- code_review_graph/tools/build.py +541 -0
- code_review_graph/tools/community_tools.py +246 -0
- code_review_graph/tools/context.py +152 -0
- code_review_graph/tools/docs.py +274 -0
- code_review_graph/tools/flows_tools.py +176 -0
- code_review_graph/tools/query.py +692 -0
- code_review_graph/tools/refactor_tools.py +168 -0
- code_review_graph/tools/registry_tools.py +125 -0
- code_review_graph/tools/review.py +477 -0
- code_review_graph/tsconfig_resolver.py +257 -0
- code_review_graph/visualization.py +2184 -0
- code_review_graph/wiki.py +305 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/METADATA +718 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/RECORD +74 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/WHEEL +4 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/entry_points.txt +3 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,1245 @@
|
|
|
1
|
+
"""Incremental graph update logic.
|
|
2
|
+
|
|
3
|
+
Detects changed files via git diff, re-parses only changed + impacted files,
|
|
4
|
+
and updates the graph accordingly. Also supports CLI invocation for hooks.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import concurrent.futures
|
|
10
|
+
import fnmatch
|
|
11
|
+
import hashlib
|
|
12
|
+
import logging
|
|
13
|
+
import os
|
|
14
|
+
import re
|
|
15
|
+
import subprocess
|
|
16
|
+
import sys
|
|
17
|
+
import threading
|
|
18
|
+
import time
|
|
19
|
+
from pathlib import Path, PurePosixPath
|
|
20
|
+
from typing import Callable, Optional
|
|
21
|
+
|
|
22
|
+
from .graph import GraphStore
|
|
23
|
+
from .parser import CodeParser
|
|
24
|
+
|
|
25
|
+
# Upper bound on parallel parse workers. Overridable via ``CRG_PARSE_WORKERS``;
# the default is the CPU count (4 when it cannot be determined) capped at 8.
# A malformed value falls back to the default instead of crashing at import
# time, and the result is clamped to >= 1 because the concurrent.futures
# executors reject ``max_workers <= 0``.
_DEFAULT_PARSE_WORKERS = min(os.cpu_count() or 4, 8)
try:
    _MAX_PARSE_WORKERS = max(
        1, int(os.environ.get("CRG_PARSE_WORKERS", str(_DEFAULT_PARSE_WORKERS)))
    )
except ValueError:
    _MAX_PARSE_WORKERS = _DEFAULT_PARSE_WORKERS
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _select_executor_kind() -> str:
    """Return ``'process'`` or ``'thread'`` for parallel parsing.

    Resolution order:

    1. ``CRG_PARSE_EXECUTOR`` when it is ``process`` or ``thread`` (case and
       surrounding whitespace are ignored). Any other value is ignored.
    2. ``thread`` on Windows when stdin is not attached to a TTY — that
       combination indicates an MCP/stdio host, where
       ``ProcessPoolExecutor`` workers inherit the parent's pipe handles and
       leak as zombies after the pool closes (issues #46, #136).
    3. ``process`` otherwise (the original behavior, fastest on Linux/macOS).

    Tree-sitter parsing in the worker releases the GIL during native
    parsing, so the speedup loss for falling back to threads is small
    (typically <30% on the full-build path) and the trade is worth it
    to avoid the deadlock + zombie process accumulation.
    """
    explicit = os.environ.get("CRG_PARSE_EXECUTOR", "").strip().lower()
    if explicit in ("process", "thread"):
        return explicit
    if sys.platform == "win32":
        # sys.stdin is None under pythonw / detached hosts, and isatty()
        # raises ValueError on a closed stream. Neither is an interactive
        # console, so both take the thread path instead of crashing.
        stdin = sys.stdin
        try:
            interactive = stdin is not None and stdin.isatty()
        except (AttributeError, ValueError):
            interactive = False
        if not interactive:
            return "thread"
    return "process"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _make_executor(max_workers: int):
    """Build the parallel-parse executor chosen by :func:`_select_executor_kind`.

    Args:
        max_workers: Forwarded unchanged to the executor constructor.

    Returns:
        A ``ThreadPoolExecutor`` when the selected kind is ``"thread"``,
        otherwise a ``ProcessPoolExecutor``.
    """
    pool_cls = (
        concurrent.futures.ThreadPoolExecutor
        if _select_executor_kind() == "thread"
        else concurrent.futures.ProcessPoolExecutor
    )
    return pool_cls(max_workers=max_workers)
|
|
57
|
+
|
|
58
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _run_rescript_resolver(store: GraphStore) -> Optional[dict]:
    """Best-effort ReScript cross-module resolution pass.

    Any exception (including a failed import of the resolver module) is
    logged as a warning and swallowed so the build never fails because of
    this pass.

    Returns:
        Whatever ``resolve_rescript_cross_module(store)`` returns, or
        ``None`` when the pass failed.
    """
    try:
        from .rescript_resolver import resolve_rescript_cross_module as _resolve

        outcome = _resolve(store)
    except Exception as exc:  # noqa: BLE001 - best-effort post-pass
        logger.warning("ReScript cross-module resolver failed: %s", exc)
        outcome = None
    return outcome
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _run_spring_resolver(store: GraphStore) -> Optional[dict]:
    """Best-effort Spring DI call resolution pass.

    Any exception (including a failed import of the resolver module) is
    logged as a warning and swallowed so the build never fails because of
    this pass.

    Returns:
        Whatever ``resolve_spring_di_calls(store)`` returns, or ``None``
        when the pass failed.
    """
    try:
        from .spring_resolver import resolve_spring_di_calls as _resolve

        outcome = _resolve(store)
    except Exception as exc:  # noqa: BLE001 - best-effort post-pass
        logger.warning("Spring DI resolver failed: %s", exc)
        outcome = None
    return outcome
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _run_temporal_resolver(store: GraphStore) -> Optional[dict]:
    """Best-effort Temporal workflow/activity call resolution pass.

    Any exception (including a failed import of the resolver module) is
    logged as a warning and swallowed so the build never fails because of
    this pass.

    Returns:
        Whatever ``resolve_temporal_calls(store)`` returns, or ``None``
        when the pass failed.
    """
    try:
        from .temporal_resolver import resolve_temporal_calls as _resolve

        outcome = _resolve(store)
    except Exception as exc:  # noqa: BLE001 - best-effort post-pass
        logger.warning("Temporal resolver failed: %s", exc)
        outcome = None
    return outcome
|
|
95
|
+
|
|
96
|
+
# Built-in ignore patterns; entries from `.code-review-graphignore` are
# appended to a copy of this list by _load_ignore_patterns.
#
# Single-segment `<dir>/**` entries are matched at any depth by
# _should_ignore, so `node_modules/**` also excludes
# `packages/app/node_modules/react/index.js` inside monorepos (see #91).
# Multi-segment entries such as `bootstrap/cache/**` do not get that nested
# treatment and are only checked with fnmatch against the whole path.
DEFAULT_IGNORE_PATTERNS = [
    # Tooling, VCS metadata, virtualenvs and build output
    ".code-review-graph/**",
    "node_modules/**",
    ".git/**",
    ".svn/**",
    "__pycache__/**",
    "*.pyc",
    ".venv/**",
    "venv/**",
    "dist/**",
    "build/**",
    ".next/**",
    "target/**",
    # PHP / Laravel / Composer
    "vendor/**",
    "bootstrap/cache/**",
    "public/build/**",
    # Ruby / Bundler
    ".bundle/**",
    # Java / Kotlin / Gradle
    ".gradle/**",
    "*.jar",
    # Dart / Flutter
    ".dart_tool/**",
    ".pub-cache/**",
    # General: coverage, caches, minified/generated assets, lock files, databases
    "coverage/**",
    ".cache/**",
    "*.min.js",
    "*.min.css",
    "*.map",
    "*.lock",
    "package-lock.json",
    "yarn.lock",
    "*.db",
    "*.sqlite",
    "*.db-journal",
    "*.db-wal",
]
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def find_svn_root(start: Path | None = None) -> Optional[Path]:
    """Return the outermost ancestor of ``start`` that contains ``.svn``.

    SVN 1.7+ keeps a single ``.svn`` at the working-copy root, while older
    clients put one in every directory. Keeping the topmost hit identifies
    the working-copy root in both layouts.

    Args:
        start: Directory to start from (checked itself, too). Defaults to
            ``Path.cwd()``.

    Returns:
        The topmost directory containing ``.svn``, or ``None`` if there is none.
    """
    origin = start or Path.cwd()
    topmost: Optional[Path] = None
    # ``parents`` is ordered nearest-first, so the last hit is the outermost.
    for directory in (origin, *origin.parents):
        if (directory / ".svn").exists():
            topmost = directory
    return topmost
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def find_repo_root(
    start: Path | None = None,
    stop_at: Path | None = None,
) -> Optional[Path]:
    """Walk up from ``start`` to find a Git repository or SVN working copy root.

    Git takes precedence: the nearest ancestor containing ``.git`` is
    returned as soon as it is seen. If the walk ends without finding one,
    the topmost visited directory containing ``.svn`` is returned instead
    (topmost, because pre-1.7 SVN puts ``.svn`` in every directory).

    Args:
        start: Starting directory. Defaults to ``Path.cwd()``.
        stop_at: Optional boundary — if provided, the walk examines
            ``stop_at`` itself and then stops without crossing above it.
            The boundary applies to the SVN lookup as well as the Git one.
            Useful for tests that create a synthetic repo under
            ``tmp_path`` (so the walk does not climb into a developer's
            home-directory dotfiles repo) and for callers that want to
            bound the ancestor walk. See #241.

    Returns:
        The repository root, or ``None`` if no directory from ``start`` up
        to and including ``stop_at`` (when set) or the filesystem root
        (when ``stop_at is None``) contains ``.git`` or ``.svn``.
    """
    current = start or Path.cwd()
    svn_root: Optional[Path] = None
    while True:
        if (current / ".git").exists():
            return current
        if (current / ".svn").exists():
            # Keep overwriting: the last hit on the way up is the outermost.
            svn_root = current
        reached_boundary = stop_at is not None and current == stop_at
        if reached_boundary or current == current.parent:
            # No Git root inside the searched range — fall back to SVN.
            return svn_root
        current = current.parent
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def detect_vcs(root: Path) -> str:
    """Classify *root* as ``'git'``, ``'svn'`` or ``'none'``.

    Only markers directly inside *root* are inspected; ``.git`` wins when
    both ``.git`` and ``.svn`` are present.
    """
    for marker, vcs_name in ((".git", "git"), (".svn", "svn")):
        if (root / marker).exists():
            return vcs_name
    return "none"
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def find_project_root(
    start: Path | None = None,
    stop_at: Path | None = None,
) -> Path:
    """Find the project root.

    Resolution order (highest precedence first):

    1. ``CRG_REPO_ROOT`` environment variable, expanded and resolved —
       explicit override for anyone scripting the CLI from outside the
       repo (CI jobs, daemons, multi-repo orchestrators). It is ignored
       when the resulting path does not exist. See: #155
    2. The repository root reported by :func:`find_repo_root` for
       ``start``, honoring ``stop_at`` if provided.
    3. ``start`` itself (or the current working directory if no start
       was given).

    ``stop_at`` is forwarded to :func:`find_repo_root` so callers that
    want to bound the ancestor walk (typically tests; see #241) can do so
    without having to call ``find_repo_root`` directly.
    """
    configured = os.environ.get("CRG_REPO_ROOT", "").strip()
    if configured:
        override = Path(configured).expanduser().resolve()
        if override.exists():
            return override
    return find_repo_root(start, stop_at=stop_at) or start or Path.cwd()
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _write_data_dir_gitignore(data_dir: Path) -> None:
    """Create ``<data_dir>/.gitignore`` unless one is already there.

    The file ends with a single ``*`` rule so nothing inside the data
    directory is committed by accident. An existing file is never
    overwritten, and a failed write is ignored (best-effort guard).
    """
    target = data_dir / ".gitignore"
    if target.exists():
        return

    contents = (
        "# Auto-generated by code-review-graph — do not commit database files.\n"
        "# The graph.db contains absolute paths and code structure metadata.\n"
        "*\n"
    )
    try:
        # `encoding="utf-8"` is REQUIRED — the em-dash in the header is
        # U+2014, which falls outside cp1252. On Windows, write_text without
        # an encoding uses the system default codepage and produces a file
        # that later fails to decode as UTF-8 (see issue #239).
        target.write_text(contents, encoding="utf-8")
    except OSError:
        # Data dir might be read-only (rare); that's OK for a best-effort guard.
        pass
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def get_data_dir(repo_root: Path) -> Path:
    """Return the directory where this project's graph data lives.

    Resolution priority:

    1. Registry entry for this repo (set via ``--data-dir``), resolved.
    2. ``CRG_DATA_DIR`` environment variable (global override), with ``~``
       expanded and the path resolved — lets you keep graphs outside the
       working tree (useful for ephemeral workspaces, Docker volumes, or
       shared caches). See: #155
    3. Default: ``<repo_root>/.code-review-graph``.

    Any exception raised while handling the registry entry (the lookup
    itself or preparing the registered directory) is logged at debug level
    and resolution continues with steps 2 and 3.

    The chosen directory is created if it does not already exist, and an
    inner ``.gitignore`` is written via :func:`_write_data_dir_gitignore`
    so accidentally-nested files never get committed.
    """
    # 1. Registry entry
    try:
        from .registry import Registry

        registered = Registry().get_data_dir_for_repo(str(repo_root))
        if registered:
            target = Path(registered).resolve()
            target.mkdir(parents=True, exist_ok=True)
            _write_data_dir_gitignore(target)
            return target
    except Exception as exc:
        # If the registry path fails, log and fall through to other methods
        logger.debug("Registry lookup failed for %s: %s", repo_root, exc)

    # 2. Environment override, else 3. in-repo default
    override = os.environ.get("CRG_DATA_DIR", "").strip()
    target = (
        Path(override).expanduser().resolve()
        if override
        else repo_root / ".code-review-graph"
    )
    target.mkdir(parents=True, exist_ok=True)
    _write_data_dir_gitignore(target)
    return target
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def get_db_path(repo_root: Path) -> Path:
    """Determine the database path for a repository.

    The database is ``graph.db`` inside the directory returned by
    :func:`get_data_dir` (which honors ``CRG_DATA_DIR``).

    If a legacy top-level ``<repo_root>/.code-review-graph.db`` exists and
    the new database does not, the legacy file is moved to the new
    location and its WAL/SHM/journal side-files are discarded.

    Raises:
        OSError: If the legacy database cannot be moved or a side-file
            cannot be removed.
    """
    import shutil  # local import: only needed for the one-off legacy migration

    crg_dir = get_data_dir(repo_root)
    new_db = crg_dir / "graph.db"

    legacy_db = repo_root / ".code-review-graph.db"
    if legacy_db.exists() and not new_db.exists():
        # The data dir may live on another filesystem (e.g. CRG_DATA_DIR on a
        # Docker volume), where a plain rename fails with EXDEV. shutil.move
        # renames when it can and otherwise copies and removes the source.
        shutil.move(str(legacy_db), str(new_db))
        # Discard stale WAL/SHM side-files from the old location
        for suffix in ("-wal", "-shm", "-journal"):
            side = repo_root / f".code-review-graph.db{suffix}"
            if side.exists():
                side.unlink()

    return new_db
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def ensure_repo_gitignore_excludes_crg(repo_root: Path) -> str:
    """Ensure the repo-level ``.gitignore`` excludes ``.code-review-graph/``.

    An existing entry is recognised with or without a leading ``/``
    (``.code-review-graph``, ``/.code-review-graph/``,
    ``.code-review-graph/*`` ...). Blank lines and ``#`` comments are
    ignored. When no entry is found, a short commented block is appended,
    preceded by a newline if the file does not already end with one.

    Returns one of:
    - ``created``: .gitignore did not exist and was created with the entry
    - ``updated``: entry was appended to an existing .gitignore
    - ``already-present``: no changes were needed
    """
    gitignore_path = repo_root / ".gitignore"
    existed = gitignore_path.exists()
    existing = gitignore_path.read_text(encoding="utf-8") if existed else ""

    for raw_line in existing.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        # A leading "/" only anchors the pattern to the repo root; it still
        # excludes the data directory, so do not append a duplicate entry.
        entry = line.lstrip("/")
        if entry == ".code-review-graph" or entry.startswith(".code-review-graph/"):
            return "already-present"

    block = "# Added by code-review-graph\n.code-review-graph/\n"
    prefix = "\n" if existing and not existing.endswith("\n") else ""
    gitignore_path.write_text(existing + prefix + block, encoding="utf-8")

    # Report on the file's prior existence, not on whether it had content.
    return "updated" if existed else "created"
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def _load_ignore_patterns(repo_root: Path) -> list[str]:
    """Return the default ignore patterns plus any project-specific ones.

    Project patterns come from ``<repo_root>/.code-review-graphignore``:
    one pattern per line, surrounding whitespace stripped, blank lines and
    lines starting with ``#`` skipped. Undecodable bytes in the file are
    replaced rather than raising. The returned list is a fresh copy, so
    callers may mutate it without touching ``DEFAULT_IGNORE_PATTERNS``.
    """
    patterns = list(DEFAULT_IGNORE_PATTERNS)
    ignore_file = repo_root / ".code-review-graphignore"
    if not ignore_file.exists():
        return patterns

    text = ignore_file.read_text(encoding="utf-8", errors="replace")
    stripped_lines = (raw.strip() for raw in text.splitlines())
    patterns.extend(
        entry for entry in stripped_lines if entry and not entry.startswith("#")
    )
    return patterns
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
def _should_ignore(path: str, patterns: list[str]) -> bool:
    """Check if a path matches any ignore pattern.

    Two checks are applied:

    1. ``fnmatch`` of the whole path against every pattern. ``fnmatch``
       gives ``/`` no special meaning, so ``node_modules/**`` matches any
       path that *starts with* ``node_modules/`` but not one where the
       directory appears deeper in the tree.
    2. For single-segment ``<dir>/**`` patterns, a match of ``<dir>``
       against every path segment, so ``node_modules/**`` also matches
       ``packages/app/node_modules/foo.js`` inside monorepos. Patterns
       whose prefix is empty or contains ``/`` are skipped here. See: #91

    Segments are split on ``/`` (``PurePosixPath``).
    """
    # Whole-path fnmatch first (cheap)
    if any(fnmatch.fnmatch(path, pattern) for pattern in patterns):
        return True

    # Then: "dir/**" with a single-segment dir means "this directory at any depth".
    segments = PurePosixPath(path).parts
    dir_names = (pattern[:-3] for pattern in patterns if pattern.endswith("/**"))
    return any(
        name and "/" not in name and name in segments for name in dir_names
    )
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
def _is_binary(path: Path) -> bool:
    """Quick heuristic: report whether a file appears to be binary.

    Only the first 8192 bytes are read; the file counts as binary when
    that prefix contains a NUL byte.

    Returns:
        ``True`` for files with a NUL byte in the sampled prefix and for
        files that cannot be read at all (unreadable files are treated as
        binary so callers skip them); ``False`` otherwise.
    """
    try:
        # Read just the sample instead of loading the whole file and slicing.
        with path.open("rb") as handle:
            chunk = handle.read(8192)
    except OSError:  # includes PermissionError
        return True
    return b"\x00" in chunk
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
# Timeout in seconds applied to VCS subprocess calls; configurable through
# the ``CRG_GIT_TIMEOUT`` environment variable (default 30).
_GIT_TIMEOUT = int(os.getenv("CRG_GIT_TIMEOUT", "30"))

# Opt-in switch for `git ls-files --recurse-submodules`, so that files
# inside git submodules are included in the graph. Enabled when
# ``CRG_RECURSE_SUBMODULES`` is "1", "true" or "yes" (case-insensitive);
# can also be overridden per-call through function parameters.
_RECURSE_SUBMODULES = os.getenv("CRG_RECURSE_SUBMODULES", "").lower() in {"1", "true", "yes"}
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
def _git_branch_info(repo_root: Path) -> tuple[str, str]:
    """Return ``(branch_name, head_sha)`` for the current repo state.

    Both values come from ``git rev-parse`` run in ``repo_root``
    (``--abbrev-ref HEAD`` for the branch, ``HEAD`` for the SHA). Each
    element is ``""`` when its command exits non-zero, times out, or
    cannot be started; the two lookups fail independently.
    """

    def _rev_parse(*args: str) -> str:
        """Run ``git rev-parse <args>`` and return stripped stdout, or ``""`` on failure."""
        try:
            result = subprocess.run(
                ["git", "rev-parse", *args],
                capture_output=True,
                # errors="replace" matches the SVN helpers: odd bytes in a ref
                # name must not abort the build with UnicodeDecodeError.
                text=True, encoding="utf-8", errors="replace",
                cwd=str(repo_root),
                timeout=_GIT_TIMEOUT,
                stdin=subprocess.DEVNULL,
            )
        except (subprocess.TimeoutExpired, OSError):
            # OSError covers a missing git binary (FileNotFoundError) as well
            # as an unusable working directory.
            return ""
        return result.stdout.strip() if result.returncode == 0 else ""

    branch = _rev_parse("--abbrev-ref", "HEAD")
    sha = _rev_parse("HEAD")
    return branch, sha
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
def _svn_revision_info(repo_root: Path) -> tuple[str, str]:
    """Return ``(branch_path, revision_str)`` for the current SVN working copy.

    Parses the ``URL:`` and ``Revision:`` lines of ``svn info``. The branch
    is the URL from its ``/branches/``, ``/tags/`` or ``/trunk`` marker
    onwards (markers are tried in that order, leading ``/`` removed); when
    no marker is present, the last URL path segment is used. Either element
    is ``""`` when it could not be determined, e.g. because ``svn`` is
    missing, timed out, or exited non-zero.
    """
    branch, rev = "", ""
    try:
        result = subprocess.run(
            ["svn", "info", "--non-interactive"],
            capture_output=True, text=True, encoding="utf-8", errors="replace",
            cwd=str(repo_root), timeout=_GIT_TIMEOUT,
            stdin=subprocess.DEVNULL,
        )
        if result.returncode != 0:
            return branch, rev
        for line in result.stdout.splitlines():
            if line.startswith("URL: "):
                url = line[5:].strip()
                # Extract trunk/branches/tags segment from SVN URL
                marker = next(
                    (m for m in ("/branches/", "/tags/", "/trunk") if m in url),
                    None,
                )
                if marker is not None:
                    branch = url[url.index(marker):].lstrip("/")
                if not branch and url:
                    branch = url.rstrip("/").split("/")[-1]
            elif line.startswith("Revision: "):
                rev = line[10:].strip()
    except (subprocess.TimeoutExpired, FileNotFoundError):
        pass
    return branch, rev
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
# Allow-lists for caller-supplied revision strings that end up on a VCS
# command line. ``(?!-)`` rejects values starting with a dash so they can
# never be parsed as an option, and ``\Z`` (unlike ``$``) refuses a trailing
# newline.
_SAFE_GIT_REF = re.compile(r"^(?!-)[A-Za-z0-9_.~^/@{}\-]+\Z")
# A single revision or an ``N:M`` range whose end may also be HEAD, BASE or
# COMMITTED; the ``r`` prefix is optional and case is ignored.
_SAFE_SVN_REV = re.compile(r"^r?\d+(:r?\d+|:HEAD|:BASE|:COMMITTED)?\Z", re.IGNORECASE)
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
def _store_vcs_metadata(repo_root: Path, store: "GraphStore") -> None:
    """Persist VCS branch/revision info through ``store.set_metadata``.

    Git repositories write ``git_branch`` and ``git_head_sha``; SVN working
    copies write ``svn_branch`` and ``svn_revision``. Empty values are not
    written, and nothing happens when no VCS is detected at ``repo_root``.
    """
    vcs = detect_vcs(repo_root)
    if vcs == "git":
        branch, sha = _git_branch_info(repo_root)
        entries = (("git_branch", branch), ("git_head_sha", sha))
    elif vcs == "svn":
        branch, rev = _svn_revision_info(repo_root)
        entries = (("svn_branch", branch), ("svn_revision", rev))
    else:
        return

    for key, value in entries:
        if value:
            store.set_metadata(key, value)
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
def get_changed_files(repo_root: Path, base: str = "HEAD~1") -> list[str]:
    """Get list of changed files via git diff or svn.

    Git: returns the paths reported by ``git diff --name-only <base>``.
    If that command fails (for example ``HEAD~1`` does not exist yet on an
    initial commit), the staged changes from ``git diff --name-only
    --cached`` are returned instead. A *base* that starts with ``-`` or
    contains characters outside the safe-ref allow-list is rejected with a
    warning.

    SVN: *base* is only used when it looks like an SVN revision or range
    (e.g. ``"r100:HEAD"``), in which case changes for that range are
    listed. Any other value — including the default ``"HEAD~1"`` — is
    ignored and working-copy modifications from ``svn status`` are
    returned.

    Returns:
        Paths as reported by the VCS, or ``[]`` when the ref is rejected,
        the VCS binary cannot be run, or the call times out.
    """
    if detect_vcs(repo_root) == "svn":
        return _get_svn_changed_files(repo_root, base if _SAFE_SVN_REV.match(base) else None)
    # Git path. A leading dash is checked explicitly so the value can never
    # be interpreted as an option by `git diff`.
    if base.startswith("-") or not _SAFE_GIT_REF.match(base):
        logger.warning("Invalid git ref rejected: %s", base)
        return []

    run_kwargs = dict(
        capture_output=True,
        text=True, encoding="utf-8", errors="replace",
        cwd=str(repo_root),
        timeout=_GIT_TIMEOUT,
        stdin=subprocess.DEVNULL,
    )
    try:
        # -z: NUL-terminated, unquoted paths. Without it git C-quotes names
        # containing non-ASCII or special characters, and those quoted
        # strings do not correspond to files on disk.
        result = subprocess.run(
            ["git", "diff", "--name-only", "-z", base, "--"], **run_kwargs
        )
        if result.returncode != 0:
            # Fallback: base could not be resolved — report staged changes.
            result = subprocess.run(
                ["git", "diff", "--name-only", "-z", "--cached"], **run_kwargs
            )
    except (OSError, subprocess.TimeoutExpired):
        return []
    return [name for name in result.stdout.split("\0") if name]
|
|
528
|
+
|
|
529
|
+
|
|
530
|
+
def _get_svn_changed_files(repo_root: Path, rev_range: str | None = None) -> list[str]:
    """Return changed files in an SVN working copy.

    When *rev_range* is given (e.g. ``"r100:HEAD"``), ``svn diff --summarize``
    is used to list files changed between those revisions; only entries
    whose first status column is ``M``, ``A`` or ``D`` are returned.
    Otherwise ``svn status`` reports working-copy modifications (status
    ``M``, ``A``, ``D``, ``R`` or ``C``).

    Returns:
        The reported paths, or ``[]`` when ``svn`` cannot be run, times
        out, or ``svn diff --summarize`` exits non-zero.
    """
    run_kwargs = dict(
        capture_output=True, text=True, encoding="utf-8", errors="replace",
        cwd=str(repo_root), timeout=_GIT_TIMEOUT,
        stdin=subprocess.DEVNULL,
    )
    try:
        if rev_range:
            result = subprocess.run(
                ["svn", "diff", "--summarize", "--non-interactive", "-r", rev_range],
                **run_kwargs,
            )
            if result.returncode != 0:
                logger.warning("svn diff --summarize failed (rc=%d): %s",
                               result.returncode, result.stderr[:200])
                return []
            files = []
            for line in result.stdout.splitlines():
                # Two status columns (item, then properties) precede the
                # path. Skipping only the first one would leave a property
                # flag in the path ("MM      f" -> "M      f").
                if len(line) >= 2 and line[0] in ("M", "A", "D"):
                    path = line[2:].strip()
                    if path:
                        files.append(path)
            return files

        result = subprocess.run(["svn", "status", "--non-interactive"], **run_kwargs)
    except (OSError, subprocess.TimeoutExpired):
        return []

    files = []
    for line in result.stdout.splitlines():
        if len(line) < 2:
            continue
        # M=modified, A=added, D=deleted, R=replaced, C=conflicted
        if line[0] in ("M", "A", "D", "R", "C"):
            # SVN status: 8 fixed-width columns then the path
            path = line[8:].strip() if len(line) > 8 else line[1:].strip()
            files.append(path)
    return files
|
|
575
|
+
|
|
576
|
+
|
|
577
|
+
def get_staged_and_unstaged(repo_root: Path) -> list[str]:
    """Get all modified files (staged + unstaged + untracked).

    Git: parses ``git status --porcelain -z``. For renamed or copied
    entries the new path is returned and the original path is skipped.
    SVN: delegates to ``svn status`` via :func:`_get_svn_changed_files`.

    Returns:
        Paths as reported by the VCS, or ``[]`` when the VCS binary cannot
        be run or the call times out.
    """
    if detect_vcs(repo_root) == "svn":
        return _get_svn_changed_files(repo_root)
    try:
        # -z gives NUL-terminated, unquoted paths. The newline format quotes
        # any path containing spaces or special characters and marks renames
        # with " -> ", which is ambiguous for file names that contain it.
        result = subprocess.run(
            ["git", "status", "--porcelain", "-z"],
            capture_output=True,
            text=True, encoding="utf-8", errors="replace",
            cwd=str(repo_root),
            timeout=_GIT_TIMEOUT,
            stdin=subprocess.DEVNULL,
        )
    except (OSError, subprocess.TimeoutExpired):
        return []

    files = []
    records = iter(result.stdout.split("\0"))
    for record in records:
        # Record layout: two status characters, a space, then the path.
        if len(record) <= 3:
            continue
        status, path = record[:2], record[3:]
        if "R" in status or "C" in status:
            # In -z mode a rename/copy is "XY <new>\0<old>\0": consume the
            # source path so it is not mistaken for a separate record.
            next(records, None)
        files.append(path)
    return files
|
|
600
|
+
|
|
601
|
+
|
|
602
|
+
def get_all_tracked_files(
    repo_root: Path,
    recurse_submodules: bool | None = None,
) -> list[str]:
    """Get all files tracked by git or svn.

    Args:
        repo_root: Repository root directory.
        recurse_submodules: If True, pass ``--recurse-submodules`` to
            ``git ls-files`` so that files inside git submodules are
            included. When *None* (default), falls back to the
            ``CRG_RECURSE_SUBMODULES`` environment variable.
            (Ignored for SVN working copies.)

    Returns:
        Paths as reported by the VCS, or ``[]`` when the VCS binary cannot
        be run or the call times out.
    """
    if detect_vcs(repo_root) == "svn":
        return _get_svn_all_tracked_files(repo_root)

    if recurse_submodules is None:
        recurse_submodules = _RECURSE_SUBMODULES

    # -z: NUL-terminated, unquoted paths. Without it git C-quotes names that
    # contain non-ASCII or special characters, and the quoted form does not
    # correspond to a file on disk.
    cmd = ["git", "ls-files", "-z"]
    if recurse_submodules:
        cmd.append("--recurse-submodules")

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True, encoding="utf-8", errors="replace",
            cwd=str(repo_root),
            timeout=_GIT_TIMEOUT,
            stdin=subprocess.DEVNULL,
        )
    except (OSError, subprocess.TimeoutExpired):
        return []
    return [name for name in result.stdout.split("\0") if name]
|
|
637
|
+
|
|
638
|
+
|
|
639
|
+
def _get_svn_all_tracked_files(repo_root: Path) -> list[str]:
    """Return the SVN-versioned files reported by ``svn list --recursive``.

    Directory entries (names ending in ``/``) are dropped. An empty list is
    returned when the command is unavailable, times out, exits non-zero, or
    lists no files; :func:`collect_all_files` treats an empty result as the
    cue to walk the filesystem instead.
    """
    try:
        result = subprocess.run(
            ["svn", "list", "--recursive", "--non-interactive"],
            capture_output=True, text=True, encoding="utf-8", errors="replace",
            cwd=str(repo_root), timeout=60,  # svn list queries the server
            stdin=subprocess.DEVNULL,
        )
    except (FileNotFoundError, subprocess.TimeoutExpired):
        return []

    if result.returncode != 0:
        # Fallback: let collect_all_files do a filesystem walk
        return []

    # svn list returns paths relative to the WC URL; directories end with "/"
    names = (raw.strip() for raw in result.stdout.splitlines())
    return [name for name in names if name and not name.endswith("/")]
|
|
665
|
+
|
|
666
|
+
|
|
667
|
+
def collect_all_files(
    repo_root: Path,
    recurse_submodules: bool | None = None,
) -> list[str]:
    """Collect all parseable files in the repo, respecting ignore patterns.

    Candidates come from :func:`get_all_tracked_files`; when that yields
    nothing, every regular file found by a recursive walk of *repo_root* is
    considered instead. A candidate is kept only if it is not ignored, its
    path is not overlong, it is a regular non-symlink file, the parser
    recognises its language, and it is not binary.

    Args:
        repo_root: Repository root directory.
        recurse_submodules: If True, include files from git submodules.
            When *None*, falls back to ``CRG_RECURSE_SUBMODULES`` env var.

    Returns:
        Paths relative to *repo_root*, in candidate order.
    """
    ignore_patterns = _load_ignore_patterns(repo_root)
    parser = CodeParser(repo_root)

    # Prefer the VCS listing of tracked files; fall back to a directory walk.
    candidates = get_all_tracked_files(repo_root, recurse_submodules)
    if not candidates:
        candidates = [
            str(entry.relative_to(repo_root))
            for entry in repo_root.rglob("*")
            if entry.is_file()
        ]

    def _is_parseable(rel_path: str) -> bool:
        """Return True when *rel_path* passes every inclusion check."""
        if _should_ignore(rel_path, ignore_patterns):
            return False
        # Skip paths that would exceed OS filename limits (macOS: 255 bytes
        # per component, ~1024 total; Windows: 260 total).
        try:
            full_path = repo_root / rel_path
        except (OSError, ValueError):
            logger.debug("Skipping path that cannot be constructed: %s", rel_path)
            return False
        overlong = len(str(full_path)) > 1000 or any(
            len(part.encode()) > 255 for part in full_path.parts
        )
        if overlong:
            logger.debug("Skipping overlong path: %s", rel_path[:120])
            return False
        return (
            full_path.is_file()
            and not full_path.is_symlink()
            and parser.detect_language(full_path) is not None
            and not _is_binary(full_path)
        )

    return [rel_path for rel_path in candidates if _is_parseable(rel_path)]
|
|
714
|
+
|
|
715
|
+
|
|
716
|
+
# Default number of expansion hops used by find_dependents(); can be
# overridden through the CRG_DEPENDENT_HOPS environment variable.
try:
    _MAX_DEPENDENT_HOPS = int(os.environ.get("CRG_DEPENDENT_HOPS", "2"))
except ValueError:
    # A malformed override must not make the whole module unimportable.
    _MAX_DEPENDENT_HOPS = 2
# Cap on the number of dependent files returned by find_dependents().
_MAX_DEPENDENT_FILES = 500
|
|
718
|
+
|
|
719
|
+
|
|
720
|
+
def _single_hop_dependents(store: GraphStore, file_path: str) -> set[str]:
    """Return the files one edge away that depend on *file_path*.

    Two sources are combined:

    * edges that target the file itself, of kind ``IMPORTS_FROM``;
    * edges that target any node stored for the file, of kind ``CALLS``,
      ``IMPORTS_FROM``, ``INHERITS`` or ``IMPLEMENTS``.

    *file_path* itself is never part of the result.
    """
    symbol_edge_kinds = ("CALLS", "IMPORTS_FROM", "INHERITS", "IMPLEMENTS")

    file_importers = {
        edge.file_path
        for edge in store.get_edges_by_target(file_path)
        if edge.kind == "IMPORTS_FROM"
    }
    symbol_users = {
        edge.file_path
        for node in store.get_nodes_by_file(file_path)
        for edge in store.get_edges_by_target(node.qualified_name)
        if edge.kind in symbol_edge_kinds
    }
    return (file_importers | symbol_users) - {file_path}
|
|
736
|
+
|
|
737
|
+
|
|
738
|
+
class DependentList(list):
    """A ``list[str]`` with a ``.truncated`` flag.

    When :func:`find_dependents` hits ``_MAX_DEPENDENT_FILES`` it truncates
    the result and sets ``truncated = True`` so callers can distinguish a
    complete expansion from a capped one.  See issue #261.

    This is a transparent ``list`` subclass — existing callers that iterate,
    ``len()``, or slice continue to work unchanged; only callers that
    specifically check ``.truncated`` benefit from the signal.  Like
    ``list()``, it can be constructed without arguments to get an empty,
    non-truncated list.
    """

    # True when the contents were cut short at the cap; False otherwise.
    truncated: bool

    def __init__(self, items: list | None = None, *, truncated: bool = False) -> None:
        """Initialise the list.

        Args:
            items: Initial contents; ``None`` (the default) means empty.
            truncated: Whether *items* is a capped, incomplete result.
        """
        super().__init__(() if items is None else items)
        self.truncated = truncated
|
|
755
|
+
|
|
756
|
+
|
|
757
|
+
def find_dependents(
    store: GraphStore,
    file_path: str,
    max_hops: int = _MAX_DEPENDENT_HOPS,
) -> DependentList:
    """Find files that import from or depend on the given file.

    Performs up to *max_hops* rounds of expansion; each round looks up the
    single-hop dependents of every file discovered in the previous round.
    Stops early once the total exceeds ``_MAX_DEPENDENT_FILES``.

    Args:
        store: Graph store queried for edges and nodes.
        file_path: File whose dependents are wanted; never in the result.
        max_hops: Maximum number of expansion rounds.

    Returns:
        A :class:`DependentList` — a regular ``list[str]`` that also carries
        a ``.truncated`` flag.  When ``truncated is True`` the returned list
        is capped at ``_MAX_DEPENDENT_FILES`` and the full set of dependents
        was not explored.  See issue #261.  The list is sorted so that the
        result (and, when capped, the retained subset) is reproducible
        across runs instead of following set iteration order.
    """
    all_dependents: set[str] = set()
    visited: set[str] = {file_path}
    frontier: set[str] = {file_path}
    for _hop in range(max_hops):
        next_frontier: set[str] = set()
        for fp in frontier:
            new_deps = _single_hop_dependents(store, fp) - visited
            all_dependents |= new_deps
            next_frontier |= new_deps
        visited |= next_frontier
        frontier = next_frontier
        if not frontier:
            break
        if len(all_dependents) > _MAX_DEPENDENT_FILES:
            # Report the real cap; the number actually found goes alongside it.
            logger.warning(
                "Dependent expansion capped at %d files for %s (%d found)",
                _MAX_DEPENDENT_FILES,
                file_path,
                len(all_dependents),
            )
            return DependentList(
                sorted(all_dependents)[:_MAX_DEPENDENT_FILES],
                truncated=True,
            )
    return DependentList(sorted(all_dependents))
|
|
797
|
+
|
|
798
|
+
|
|
799
|
+
def _parse_single_file(
    args: tuple[str, str],
) -> tuple[str, list, list, str | None, str]:
    """Parse one file in a worker process.

    Must be a module-level function so ``ProcessPoolExecutor`` can
    serialise it across processes.

    Args:
        args: ``(rel_path, repo_root_str)`` — the file's path relative to
            the repository root, and the root itself as a string.

    Returns:
        ``(rel_path, nodes, edges, error_or_none, file_hash)``.  On success
        ``error_or_none`` is ``None`` and ``file_hash`` is the SHA-256 hex
        digest of the file's bytes.  On failure ``nodes`` and ``edges`` are
        empty, ``file_hash`` is ``""`` and the error is a non-empty string.
    """
    rel_path, repo_root_str = args
    repo_root = Path(repo_root_str)
    abs_path = repo_root / rel_path
    try:
        raw = abs_path.read_bytes()
        fhash = hashlib.sha256(raw).hexdigest()
        parser = CodeParser(repo_root)
        nodes, edges = parser.parse_bytes(abs_path, raw)
    except Exception as e:
        # Worker boundary: report every failure to the parent instead of
        # raising.  ``str(e)`` is empty for message-less exceptions, and
        # callers treat a falsy error as success, so fall back to ``repr``.
        return (rel_path, [], [], str(e) or repr(e), "")
    return (rel_path, nodes, edges, None, fhash)
|
|
818
|
+
|
|
819
|
+
|
|
820
|
+
def full_build(
    repo_root: Path,
    store: GraphStore,
    recurse_submodules: bool | None = None,
) -> dict:
    """Full rebuild of the entire graph.

    Removes stored data for files that are no longer collected, parses every
    collected file (serially for small repos or when ``CRG_SERIAL_PARSE=1``,
    otherwise through an executor), stores the results, records build
    metadata and runs the post-build resolvers.

    Args:
        repo_root: Repository root directory.
        store: Graph database store.
        recurse_submodules: If True, include files from git submodules.
            When *None*, falls back to ``CRG_RECURSE_SUBMODULES`` env var.

    Returns:
        A dict with ``files_parsed``, ``total_nodes``, ``total_edges``,
        ``disambiguated_nodes``, ``errors`` (a list of
        ``{"file": ..., "error": ...}`` dicts) and the three resolver results
        (``rescript_resolution``, ``spring_resolution``,
        ``temporal_resolution``).
    """
    parser = CodeParser(repo_root)
    files = collect_all_files(repo_root, recurse_submodules)

    # Purge stale data from files no longer on disk
    existing_files = set(store.get_all_files())
    current_abs = {str(repo_root / f) for f in files}
    stale_files = existing_files - current_abs
    for stale in stale_files:
        store.remove_file_data(stale)
    # Ensure deletions are persisted before store_file_nodes_edges()
    # starts its own explicit transaction via BEGIN IMMEDIATE.
    if stale_files:
        store.commit()

    total_nodes = 0
    total_edges = 0
    errors = []
    file_count = len(files)

    use_serial = os.environ.get("CRG_SERIAL_PARSE", "") == "1"

    if use_serial or file_count < 8:
        # Serial fallback (for debugging or tiny repos)
        for i, rel_path in enumerate(files, 1):
            full_path = repo_root / rel_path
            try:
                source = full_path.read_bytes()
                fhash = hashlib.sha256(source).hexdigest()
                nodes, edges = parser.parse_bytes(full_path, source)
                store.store_file_nodes_edges(str(full_path), nodes, edges, fhash)
                total_nodes += len(nodes)
                total_edges += len(edges)
            except Exception as e:
                # One bad file must not abort the build.  I/O errors are
                # logged like parse errors, matching the parallel path.
                logger.warning("Error parsing %s: %s", rel_path, e)
                errors.append({"file": rel_path, "error": str(e)})
            if i % 50 == 0 or i == file_count:
                logger.info("Progress: %d/%d files parsed", i, file_count)
    else:
        # Parallel parsing — store calls remain serial (SQLite single-writer).
        # Executor kind auto-selected: process on Linux/macOS/Windows-TTY,
        # thread on Windows-MCP-stdio to avoid pipe-handle inheritance
        # deadlock (issues #46, #136). Override via CRG_PARSE_EXECUTOR env.
        args_list = [(rel_path, str(repo_root)) for rel_path in files]
        with _make_executor(_MAX_PARSE_WORKERS) as executor:
            results = executor.map(_parse_single_file, args_list, chunksize=20)
            for i, (rel_path, nodes, edges, error, fhash) in enumerate(results, 1):
                # Compare against None: an empty error string is still a
                # failure and must not be stored as a successful parse.
                if error is not None:
                    logger.warning("Error parsing %s: %s", rel_path, error)
                    errors.append({"file": rel_path, "error": error})
                else:
                    full_path = repo_root / rel_path
                    store.store_file_nodes_edges(
                        str(full_path),
                        nodes,
                        edges,
                        fhash,
                    )
                    total_nodes += len(nodes)
                    total_edges += len(edges)
                # Report progress for failed files too, so the final
                # N/N line is always emitted.
                if i % 200 == 0 or i == file_count:
                    logger.info("Progress: %d/%d files parsed", i, file_count)

    store.set_metadata("last_updated", time.strftime("%Y-%m-%dT%H:%M:%S"))
    store.set_metadata("last_build_type", "full")
    _store_vcs_metadata(repo_root, store)
    store.commit()

    rescript_stats = _run_rescript_resolver(store)
    spring_stats = _run_spring_resolver(store)
    temporal_stats = _run_temporal_resolver(store)

    disambiguated = store.find_disambiguated_nodes()
    if disambiguated:
        logger.info(
            "Disambiguated %d duplicate qualified_name(s): %s",
            len(disambiguated), ", ".join(disambiguated),
        )

    return {
        "files_parsed": len(files),
        "total_nodes": total_nodes,
        "total_edges": total_edges,
        "disambiguated_nodes": disambiguated,
        "errors": errors,
        "rescript_resolution": rescript_stats,
        "spring_resolution": spring_stats,
        "temporal_resolution": temporal_stats,
    }
|
|
925
|
+
|
|
926
|
+
|
|
927
|
+
def incremental_update(
    repo_root: Path,
    store: GraphStore,
    base: str = "HEAD~1",
    changed_files: list[str] | None = None,
) -> dict:
    """Incremental update: re-parse changed + dependent files only.

    Args:
        repo_root: Repository root directory.
        store: Graph database store.
        base: Revision passed to :func:`get_changed_files` when
            *changed_files* is not supplied.
        changed_files: Paths relative to *repo_root* to treat as changed.
            When *None*, they are obtained from :func:`get_changed_files`.

    Returns:
        A dict describing the update.  When nothing changed it contains only
        ``files_updated``, ``total_nodes``, ``total_edges``,
        ``changed_files`` and ``dependent_files``; otherwise it additionally
        carries ``disambiguated_nodes``, ``errors`` and the resolver results
        (``None`` for resolvers that were not re-run).
    """
    parser = CodeParser(repo_root)
    ignore_patterns = _load_ignore_patterns(repo_root)

    # Determine changed files
    if changed_files is None:
        changed_files = get_changed_files(repo_root, base)

    if not changed_files:
        return {
            "files_updated": 0,
            "total_nodes": 0,
            "total_edges": 0,
            "changed_files": [],
            "dependent_files": [],
        }

    # Find dependent files (files that import from changed files)
    dependent_files: set[str] = set()
    for rel_path in changed_files:
        full_path = str(repo_root / rel_path)
        deps = find_dependents(store, full_path)
        for d in deps:
            # Convert back to relative path if needed
            try:
                dependent_files.add(str(Path(d).relative_to(repo_root)))
            except ValueError:
                dependent_files.add(d)

    # Combine changed + dependent
    all_files = set(changed_files) | dependent_files

    total_nodes = 0
    total_edges = 0
    errors = []

    # Separate deleted/unparseable files from files that need re-parsing
    to_parse: list[str] = []
    removed_any = False
    for rel_path in all_files:
        if _should_ignore(rel_path, ignore_patterns):
            continue
        abs_path = repo_root / rel_path
        if not abs_path.is_file():
            store.remove_file_data(str(abs_path))
            removed_any = True
            continue
        if parser.detect_language(abs_path) is None:
            continue
        # Quick hash check to skip unchanged files
        try:
            raw = abs_path.read_bytes()
            fhash = hashlib.sha256(raw).hexdigest()
            existing_nodes = store.get_nodes_by_file(str(abs_path))
            if existing_nodes and existing_nodes[0].file_hash == fhash:
                continue
        except OSError:
            # Unreadable here: still queue the file so the parse step below
            # records the failure in ``errors``.
            pass
        to_parse.append(rel_path)

    # Persist deletions before store_file_nodes_edges() opens its own
    # explicit transaction — avoids nested transaction errors.
    if removed_any:
        store.commit()

    use_serial = os.environ.get("CRG_SERIAL_PARSE", "") == "1"

    if use_serial or len(to_parse) < 8:
        for rel_path in to_parse:
            abs_path = repo_root / rel_path
            try:
                source = abs_path.read_bytes()
                fhash = hashlib.sha256(source).hexdigest()
                nodes, edges = parser.parse_bytes(abs_path, source)
                store.store_file_nodes_edges(str(abs_path), nodes, edges, fhash)
                total_nodes += len(nodes)
                total_edges += len(edges)
            except Exception as e:
                # One bad file must not abort the update.  I/O errors are
                # logged like parse errors, matching the parallel path.
                logger.warning("Error parsing %s: %s", rel_path, e)
                errors.append({"file": rel_path, "error": str(e)})
    else:
        # Parallel parsing through the executor chosen by _make_executor();
        # store calls stay in this thread.
        args_list = [(rel_path, str(repo_root)) for rel_path in to_parse]
        with _make_executor(_MAX_PARSE_WORKERS) as executor:
            for rel_path, nodes, edges, error, fhash in executor.map(
                _parse_single_file,
                args_list,
                chunksize=20,
            ):
                # Compare against None: an empty error string is still a
                # failure and must not be stored as a successful parse.
                if error is not None:
                    logger.warning("Error parsing %s: %s", rel_path, error)
                    errors.append({"file": rel_path, "error": error})
                    continue
                store.store_file_nodes_edges(
                    str(repo_root / rel_path),
                    nodes,
                    edges,
                    fhash,
                )
                total_nodes += len(nodes)
                total_edges += len(edges)

    store.set_metadata("last_updated", time.strftime("%Y-%m-%dT%H:%M:%S"))
    store.set_metadata("last_build_type", "incremental")
    _store_vcs_metadata(repo_root, store)
    store.commit()

    # Only re-run language-specific resolvers when the relevant files changed.
    rescript_changed = any(
        rp.endswith((".res", ".resi")) for rp in all_files
    )
    rescript_stats = (
        _run_rescript_resolver(store) if rescript_changed else None
    )

    # The Spring and Temporal resolvers are both gated on ``.java`` files.
    spring_changed = any(rp.endswith(".java") for rp in all_files)
    spring_stats = _run_spring_resolver(store) if spring_changed else None
    temporal_stats = _run_temporal_resolver(store) if spring_changed else None

    disambiguated = store.find_disambiguated_nodes()

    return {
        "files_updated": len(all_files),
        "total_nodes": total_nodes,
        "total_edges": total_edges,
        "disambiguated_nodes": disambiguated,
        "changed_files": list(changed_files),
        "dependent_files": list(dependent_files),
        "errors": errors,
        "rescript_resolution": rescript_stats,
        "spring_resolution": spring_stats,
        "temporal_resolution": temporal_stats,
    }
|
|
1068
|
+
|
|
1069
|
+
|
|
1070
|
+
# ---------------------------------------------------------------------------
|
|
1071
|
+
# Watch mode
|
|
1072
|
+
# ---------------------------------------------------------------------------
|
|
1073
|
+
|
|
1074
|
+
|
|
1075
|
+
_DEBOUNCE_SECONDS = 0.3
|
|
1076
|
+
|
|
1077
|
+
|
|
1078
|
+
def watch(
    repo_root: Path,
    store: GraphStore,
    on_files_updated: Optional[Callable] = None,
) -> None:
    """Watch for file changes and auto-update the graph.

    Uses a 300ms debounce to batch rapid-fire saves into a single update.
    Blocks until interrupted with Ctrl+C; the observer is stopped and joined
    on the way out regardless of what ends the loop.

    Args:
        repo_root: Repository root to watch.
        store: Graph database to update.
        on_files_updated: Optional callback invoked after each debounced
            batch of file updates completes.  Receives the store as its
            only argument.  Used by the CLI to run post-processing
            (FTS, flows, communities) after watch updates.
    """
    import threading

    from watchdog.events import FileSystemEventHandler
    from watchdog.observers import Observer

    parser = CodeParser(repo_root)
    ignore_patterns = _load_ignore_patterns(repo_root)

    class GraphUpdateHandler(FileSystemEventHandler):
        """Translate filesystem events into debounced graph updates."""

        def __init__(self):
            super().__init__()
            self._pending: set[str] = set()
            self._lock = threading.Lock()
            self._timer: threading.Timer | None = None

        def _should_handle(self, path: str) -> bool:
            """Return True if *path* is a non-ignored, parseable repo file."""
            if Path(path).is_symlink():
                return False
            try:
                rel = str(Path(path).relative_to(repo_root))
            except ValueError:
                return False
            if _should_ignore(rel, ignore_patterns):
                return False
            if parser.detect_language(Path(path)) is None:
                return False
            return True

        def _remove(self, abs_path: str) -> None:
            """Drop stored data for a path that no longer exists."""
            # Only handle files we would normally track
            try:
                rel = str(Path(abs_path).relative_to(repo_root))
            except ValueError:
                return
            if _should_ignore(rel, ignore_patterns):
                return
            try:
                store.remove_file_data(abs_path)
                store.commit()
                logger.info("Removed: %s", rel)
            except Exception as e:
                logger.error("Error removing %s: %s", rel, e)

        def on_modified(self, event):
            if event.is_directory:
                return
            if self._should_handle(event.src_path):
                self._schedule(event.src_path)

        def on_created(self, event):
            if event.is_directory:
                return
            if self._should_handle(event.src_path):
                self._schedule(event.src_path)

        def on_deleted(self, event):
            if event.is_directory:
                return
            self._remove(event.src_path)

        def on_moved(self, event):
            # Renames (including editors that save by writing a temp file
            # and renaming it over the target) arrive as "moved" events, not
            # as created/deleted pairs: forget the old path, index the new.
            if event.is_directory:
                return
            self._remove(event.src_path)
            if self._should_handle(event.dest_path):
                self._schedule(event.dest_path)

        def _schedule(self, abs_path: str):
            """Add file to pending set and reset the debounce timer."""
            with self._lock:
                self._pending.add(abs_path)
                if self._timer is not None:
                    self._timer.cancel()
                self._timer = threading.Timer(_DEBOUNCE_SECONDS, self._flush)
                self._timer.start()

        def _flush(self):
            """Process all pending files after the debounce window."""
            with self._lock:
                paths = list(self._pending)
                self._pending.clear()
                self._timer = None

            updated = 0
            for abs_path in paths:
                if self._update_file(abs_path):
                    updated += 1

            if updated > 0 and on_files_updated is not None:
                try:
                    on_files_updated(store)
                except Exception as e:
                    logger.error("Post-update callback failed: %s", e)

        def _update_file(self, abs_path: str) -> bool:
            """Re-parse one file and store it; return True on success."""
            path = Path(abs_path)
            if not path.is_file():
                return False
            if path.is_symlink():
                return False
            if _is_binary(path):
                return False
            try:
                source = path.read_bytes()
                fhash = hashlib.sha256(source).hexdigest()
                nodes, edges = parser.parse_bytes(path, source)
                store.store_file_nodes_edges(abs_path, nodes, edges, fhash)
                store.set_metadata("last_updated", time.strftime("%Y-%m-%dT%H:%M:%S"))
                store.commit()
                rel = str(path.relative_to(repo_root))
                logger.info(
                    "Updated: %s (%d nodes, %d edges)",
                    rel,
                    len(nodes),
                    len(edges),
                )
                return True
            except Exception as e:
                logger.error("Error updating %s: %s", abs_path, e)
                return False

    handler = GraphUpdateHandler()
    observer = Observer()
    observer.schedule(handler, str(repo_root), recursive=True)
    observer.start()

    logger.info("Watching %s for changes... (Ctrl+C to stop)", repo_root)
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        pass
    finally:
        # Always shut the observer down, not only on Ctrl+C, so its thread
        # is not left running if the loop ends some other way.
        observer.stop()
        observer.join()
        logger.info("Watch stopped.")
|
|
1220
|
+
|
|
1221
|
+
|
|
1222
|
+
def start_watch_thread(
    repo_root: Path,
    store: GraphStore,
    daemon: bool = True,
    on_files_updated: Optional[Callable] = None,
) -> threading.Thread | None:
    """Start watch mode in a background thread.

    Args:
        repo_root: Repository root to watch.
        store: Graph database to update.
        daemon: Whether the thread is created as a daemon thread.
        on_files_updated: Optional callback forwarded to :func:`watch`; it
            is invoked with the store after each batch of updates.

    Returns:
        The started thread, or None if watchdog is unavailable.
    """
    try:
        import watchdog  # noqa: F401
    except ImportError:
        logger.warning("watchdog not installed; auto-watch disabled")
        return None

    thread = threading.Thread(
        target=watch,
        args=(repo_root, store, on_files_updated),
        daemon=daemon,
        name="crg-watch",
    )
    thread.start()
    logger.info("Auto-watch started for %s", repo_root)
    return thread
|