code-review-graph-codeblackwell 2.3.6.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_review_graph/__init__.py +20 -0
- code_review_graph/__main__.py +4 -0
- code_review_graph/analysis.py +410 -0
- code_review_graph/changes.py +409 -0
- code_review_graph/cli.py +1255 -0
- code_review_graph/communities.py +874 -0
- code_review_graph/constants.py +23 -0
- code_review_graph/context_savings.py +317 -0
- code_review_graph/custom_languages.py +322 -0
- code_review_graph/daemon.py +1009 -0
- code_review_graph/daemon_cli.py +320 -0
- code_review_graph/docs/LLM-OPTIMIZED-REFERENCE.md +71 -0
- code_review_graph/embeddings.py +1006 -0
- code_review_graph/enrich.py +303 -0
- code_review_graph/eval/__init__.py +33 -0
- code_review_graph/eval/benchmarks/__init__.py +1 -0
- code_review_graph/eval/benchmarks/agent_baseline.py +193 -0
- code_review_graph/eval/benchmarks/build_performance.py +60 -0
- code_review_graph/eval/benchmarks/flow_completeness.py +36 -0
- code_review_graph/eval/benchmarks/impact_accuracy.py +220 -0
- code_review_graph/eval/benchmarks/multi_hop_retrieval.py +125 -0
- code_review_graph/eval/benchmarks/search_quality.py +59 -0
- code_review_graph/eval/benchmarks/token_efficiency.py +143 -0
- code_review_graph/eval/configs/code-review-graph.yaml +50 -0
- code_review_graph/eval/configs/express.yaml +45 -0
- code_review_graph/eval/configs/fastapi.yaml +48 -0
- code_review_graph/eval/configs/flask.yaml +50 -0
- code_review_graph/eval/configs/gin.yaml +51 -0
- code_review_graph/eval/configs/httpx.yaml +48 -0
- code_review_graph/eval/reporter.py +301 -0
- code_review_graph/eval/runner.py +211 -0
- code_review_graph/eval/scorer.py +85 -0
- code_review_graph/eval/token_benchmark.py +182 -0
- code_review_graph/exports.py +409 -0
- code_review_graph/flows.py +698 -0
- code_review_graph/graph.py +1427 -0
- code_review_graph/graph_diff.py +122 -0
- code_review_graph/hints.py +384 -0
- code_review_graph/incremental.py +1245 -0
- code_review_graph/jedi_resolver.py +303 -0
- code_review_graph/main.py +1079 -0
- code_review_graph/memory.py +142 -0
- code_review_graph/migrations.py +284 -0
- code_review_graph/parser.py +6957 -0
- code_review_graph/postprocessing.py +134 -0
- code_review_graph/prompts.py +159 -0
- code_review_graph/refactor.py +852 -0
- code_review_graph/registry.py +319 -0
- code_review_graph/rescript_resolver.py +206 -0
- code_review_graph/search.py +447 -0
- code_review_graph/skills.py +1481 -0
- code_review_graph/spring_resolver.py +200 -0
- code_review_graph/temporal_resolver.py +199 -0
- code_review_graph/token_benchmark.py +125 -0
- code_review_graph/tools/__init__.py +156 -0
- code_review_graph/tools/_common.py +176 -0
- code_review_graph/tools/analysis_tools.py +184 -0
- code_review_graph/tools/build.py +541 -0
- code_review_graph/tools/community_tools.py +246 -0
- code_review_graph/tools/context.py +152 -0
- code_review_graph/tools/docs.py +274 -0
- code_review_graph/tools/flows_tools.py +176 -0
- code_review_graph/tools/query.py +692 -0
- code_review_graph/tools/refactor_tools.py +168 -0
- code_review_graph/tools/registry_tools.py +125 -0
- code_review_graph/tools/review.py +477 -0
- code_review_graph/tsconfig_resolver.py +257 -0
- code_review_graph/visualization.py +2184 -0
- code_review_graph/wiki.py +305 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/METADATA +718 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/RECORD +74 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/WHEEL +4 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/entry_points.txt +3 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Shared constants for code-review-graph."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
|
|
7
|
+
# Keyword fragments collected as security-relevant (presumably matched against
# identifier names elsewhere in the package — confirm against callers).
SECURITY_KEYWORDS: frozenset[str] = frozenset(
    (
        "auth",
        "login",
        "password",
        "token",
        "session",
        "crypt",
        "secret",
        "credential",
        "permission",
        "sql",
        "query",
        "execute",
        "connect",
        "socket",
        "request",
        "http",
        "sanitize",
        "validate",
        "encrypt",
        "decrypt",
        "hash",
        "sign",
        "verify",
        "admin",
        "privilege",
    )
)

# ---------------------------------------------------------------------------
# Tunable limits. Each one reads an optional CRG_* environment variable at
# import time and falls back to the default given here; a value that is not
# an integer literal makes int() raise ValueError during import.
# ---------------------------------------------------------------------------
MAX_IMPACT_NODES = int(os.getenv("CRG_MAX_IMPACT_NODES", "500"))
MAX_IMPACT_DEPTH = int(os.getenv("CRG_MAX_IMPACT_DEPTH", "2"))
MAX_BFS_DEPTH = int(os.getenv("CRG_MAX_BFS_DEPTH", "15"))
MAX_SEARCH_RESULTS = int(os.getenv("CRG_MAX_SEARCH_RESULTS", "20"))

# Traversal backend selector: "sql" (SQLite recursive CTE) or "networkx"
# (Python-side BFS). The value is taken verbatim from the environment.
BFS_ENGINE = os.getenv("CRG_BFS_ENGINE", "sql")
|
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
"""Compact estimated context savings helpers.
|
|
2
|
+
|
|
3
|
+
The project intentionally labels these values as estimates: the helper uses a
|
|
4
|
+
conservative character-count approximation instead of model-specific tokenizers.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any, Iterable
|
|
12
|
+
|
|
13
|
+
CHARS_PER_TOKEN = 4
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def estimate_tokens(value: Any) -> int:
    """Estimate a token count using the ``CHARS_PER_TOKEN`` approximation.

    Args:
        value: ``None``, a string, or any object. Non-string values are
            measured by their compact, key-sorted JSON serialisation
            (objects JSON cannot encode are stringified via ``default=str``).

    Returns:
        ``0`` for ``None`` or empty text, otherwise the character count
        divided by ``CHARS_PER_TOKEN``, rounded up (so never less than 1).
    """
    if value is None:
        return 0
    if isinstance(value, str):
        text = value
    else:
        try:
            text = json.dumps(
                value,
                default=str,
                ensure_ascii=True,
                separators=(",", ":"),
                sort_keys=True,
            )
        except (TypeError, ValueError):
            # json.dumps rejects dicts whose keys cannot be sorted or encoded
            # (TypeError) and circular structures (ValueError). This helper
            # only produces an estimate, so measure the plain string form
            # instead of propagating the error to the caller.
            text = str(value)
    if not text:
        return 0
    # Ceiling division: a partial chunk still counts as one token.
    return max(1, (len(text) + CHARS_PER_TOKEN - 1) // CHARS_PER_TOKEN)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def estimate_file_tokens(repo_root: Path, files: Iterable[str]) -> int:
    """Sum a size-based token estimate over *files* without reading them.

    Relative names are resolved against the resolved ``repo_root``; absolute
    names are used as given. Entries that are not regular files, or whose
    stat fails with ``OSError``, contribute nothing. Every existing file
    counts for at least one token, including an empty one.
    """
    base = repo_root.resolve()
    running = 0
    for name in files:
        candidate = Path(name)
        if not candidate.is_absolute():
            candidate = base / candidate
        try:
            if not candidate.is_file():
                continue
            size = candidate.stat().st_size
        except OSError:
            continue
        # Round the byte count up to whole tokens.
        running += max(1, (size + CHARS_PER_TOKEN - 1) // CHARS_PER_TOKEN)
    return running
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def estimate_context_savings(
    *,
    original_context: Any | None = None,
    returned_context: Any | None = None,
    original_tokens: int | None = None,
    returned_tokens: int | None = None,
) -> dict[str, int | bool] | None:
    """Return tiny savings metadata, or None when no baseline is available.

    Args:
        original_context: Baseline payload; measured with ``estimate_tokens``
            when ``original_tokens`` is not given.
        returned_context: Payload actually returned; measured with
            ``estimate_tokens`` when ``returned_tokens`` is not given.
        original_tokens: Explicit baseline token count (takes precedence).
        returned_tokens: Explicit returned token count (takes precedence).

    Returns:
        ``{"estimated": True, "saved_tokens": int, "saved_percent": int}``,
        or ``None`` when the baseline is zero or negative. ``saved_tokens``
        is never negative and ``saved_percent`` stays within 0-100.
    """
    baseline = (
        original_tokens
        if original_tokens is not None
        else estimate_tokens(original_context)
    )
    returned = (
        returned_tokens
        if returned_tokens is not None
        else estimate_tokens(returned_context)
    )

    if baseline <= 0:
        return None

    # A negative "returned" count is meaningless and would otherwise report
    # more tokens saved than the baseline contained (percent above 100).
    returned = max(0, returned)
    saved = max(0, baseline - returned)
    # baseline is strictly positive here, so the division is always safe.
    percent = round((saved / baseline) * 100)
    return {
        "estimated": True,
        "saved_tokens": int(saved),
        "saved_percent": int(percent),
    }
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def attach_context_savings(
    result: dict[str, Any],
    *,
    original_context: Any | None = None,
    original_tokens: int | None = None,
    returned_context: Any | None = None,
    returned_tokens: int | None = None,
) -> dict[str, Any]:
    """Add a ``context_savings`` entry to *result* when one can be estimated.

    *result* itself is used as the returned payload unless
    ``returned_context`` is supplied. The dict is mutated in place and also
    returned; it is left untouched when no estimate is available.
    """
    payload = returned_context if returned_context is not None else result
    savings = estimate_context_savings(
        original_context=original_context,
        returned_context=payload,
        original_tokens=original_tokens,
        returned_tokens=returned_tokens,
    )
    if savings is None:
        return result
    result["context_savings"] = savings
    return result
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def format_context_savings(estimate: dict[str, Any] | None) -> str | None:
    """Render the savings estimate as a single human-readable CLI line.

    Returns ``None`` for a missing or empty estimate; absent keys are
    treated as zero.
    """
    if estimate:
        saved = int(estimate.get("saved_tokens", 0))
        percent = int(estimate.get("saved_percent", 0))
        return f"Estimated context saved: ~{saved:,} tokens (~{percent}%)"
    return None
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _fmt_compact(n: int) -> str:
|
|
115
|
+
"""Compact integer formatting: 1234 -> '1.2k', 9876 -> '9.9k', 500 -> '500'."""
|
|
116
|
+
if n >= 10_000:
|
|
117
|
+
return f"{n // 1000:,}k"
|
|
118
|
+
if n >= 1000:
|
|
119
|
+
return f"{n / 1000:.1f}k"
|
|
120
|
+
return str(n)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _breakdown_from_response(response: dict[str, Any]) -> dict[str, int]:
|
|
124
|
+
"""Pull a per-category token estimate from a detect-changes / review response.
|
|
125
|
+
|
|
126
|
+
Only fields that exist and have content are reported, so the breakdown
|
|
127
|
+
line stays meaningful instead of padding with zeros.
|
|
128
|
+
"""
|
|
129
|
+
# Friendly label -> response-dict key
|
|
130
|
+
fields = [
|
|
131
|
+
("Functions", "changed_functions"),
|
|
132
|
+
("Flows", "affected_flows"),
|
|
133
|
+
("Tests", "test_gaps"),
|
|
134
|
+
("Risk", "review_priorities"),
|
|
135
|
+
("Impact", "impacted_nodes"),
|
|
136
|
+
("Edges", "edges"),
|
|
137
|
+
("Source", "source_snippets"),
|
|
138
|
+
("Imports", "imports"),
|
|
139
|
+
]
|
|
140
|
+
out: dict[str, int] = {}
|
|
141
|
+
for label, key in fields:
|
|
142
|
+
value = response.get(key)
|
|
143
|
+
if not value:
|
|
144
|
+
continue
|
|
145
|
+
tokens = estimate_tokens(value)
|
|
146
|
+
if tokens > 0:
|
|
147
|
+
out[label] = tokens
|
|
148
|
+
return out
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def verify_with_tiktoken(
    repo_root: "Path | str",
    changed_files: Iterable[str],
    response: Any,
    encoding_name: str = "cl100k_base",
) -> dict[str, int] | None:
    """Calibrate the chars/4 estimate against a real model tokenizer.

    Reads every changed file's content (unlike the stat-only
    ``estimate_file_tokens``) so the numbers reflect what an agent would
    actually consume.

    Args:
        repo_root: Repository root that relative file names are joined to.
        changed_files: File names to tokenize as the naive baseline; entries
            that are not regular files or cannot be read are skipped.
        response: The graph response. Strings are tokenized as-is; anything
            else is tokenized via its compact, key-sorted JSON form.
        encoding_name: Name passed to ``tiktoken.get_encoding``.

    Returns:
        ``{"verified_baseline": int, "verified_returned": int,
        "verified_saved": int, "verified_percent": int}``, or ``None`` if
        tiktoken is not installed.
    """
    try:
        import tiktoken  # type: ignore[import-untyped]
    except ImportError:
        return None

    enc = tiktoken.get_encoding(encoding_name)
    root = Path(repo_root).resolve()

    naive_real = 0
    for f in changed_files:
        p = root / f
        try:
            if p.is_file():
                # Decode explicitly as UTF-8: without an encoding argument
                # read_text() uses the locale encoding, which makes the
                # token count differ between platforms for the same file.
                content = p.read_text(encoding="utf-8", errors="replace")
                naive_real += len(enc.encode(content))
        except OSError:
            continue

    if isinstance(response, str):
        graph_real = len(enc.encode(response))
    else:
        text = json.dumps(
            response, default=str, ensure_ascii=True,
            separators=(",", ":"), sort_keys=True,
        )
        graph_real = len(enc.encode(text))

    saved = max(0, naive_real - graph_real)
    pct = round(saved * 100 / naive_real) if naive_real > 0 else 0
    return {
        "verified_baseline": naive_real,
        "verified_returned": graph_real,
        "verified_saved": saved,
        "verified_percent": pct,
    }
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def format_context_savings_panel(
    estimate: dict[str, Any] | None,
    *,
    original_tokens: int | None = None,
    returned_tokens: int | None = None,
    response: dict[str, Any] | None = None,
    breakdown: dict[str, int] | None = None,
    verified: dict[str, int] | None = None,
    title: str = "Token Savings",
    width: int = 64,
) -> str | None:
    """Format the savings estimate as a boxed multi-line CLI panel.

    Example output::

        ┌──────────────── Token Savings ────────────────┐
        │ Full context would be:    12,932 tokens       │
        │ Graph context used:       773 tokens          │
        │ Saved:    12,159 tokens (~94%)                │
        │ Breakdown: Functions 580 · Tests 120 · ...    │
        └───────────────────────────────────────────────┘

    All numbers are labelled as estimates upstream (``estimated: true`` in the
    metadata dict) because the project uses a 4-chars-per-token approximation,
    not model-specific tokenization.

    Args:
        estimate: The ``context_savings`` dict from a tool response.
        original_tokens: Optional override for the naive baseline. When
            omitted it is reconstructed from ``saved_tokens`` and
            ``saved_percent``.
        returned_tokens: Optional override for the graph response size. When
            omitted it is ``original_tokens - saved_tokens`` (floored at 0).
        response: When provided, breakdown is auto-derived from common keys
            (``changed_functions``, ``affected_flows``, ``test_gaps``,
            ``review_priorities``, ``impacted_nodes``, ``edges``,
            ``source_snippets``, ``imports``).
        breakdown: Explicit ``{label: tokens}`` map; takes precedence over
            ``response``-derived breakdown when both are provided.
        verified: Optional dict with the keys ``verified_baseline``,
            ``verified_returned``, ``verified_saved`` and
            ``verified_percent``; adds a "Verified (tiktoken)" line.
        title: Title centered in the top border.
        width: Minimum total panel width. The panel grows beyond this when
            the longest content line or the title needs more room.

    Returns:
        The panel as a single newline-joined string, or ``None`` when there
        is nothing meaningful to display.
    """
    if not estimate:
        return None

    saved = int(estimate.get("saved_tokens", 0))
    percent = int(estimate.get("saved_percent", 0))

    # Derive baseline + returned from saved+percent if not provided
    if original_tokens is None:
        if percent > 0:
            original_tokens = int(round(saved * 100 / percent))
        else:
            original_tokens = saved
    if returned_tokens is None:
        returned_tokens = max(0, (original_tokens or 0) - saved)

    if breakdown is None and response is not None:
        breakdown = _breakdown_from_response(response)

    # Top up the breakdown with an "Other" bucket so the parts sum to
    # ``returned_tokens`` exactly. "Other" covers fields the breakdown
    # doesn't enumerate (status, summary, risk_score, context_savings
    # metadata, JSON envelope chars). Skip when there's no positive
    # remainder — the breakdown already accounts for the whole response.
    if breakdown and returned_tokens is not None:
        remainder = returned_tokens - sum(breakdown.values())
        if remainder > 0:
            breakdown = dict(breakdown)  # copy so the caller's dict is not mutated
            breakdown["Other"] = remainder

    # Lines that go inside the box (without borders)
    inner_lines: list[str] = [
        f"Full context would be: {original_tokens:>9,} tokens",
        f"Graph context used: {returned_tokens:>9,} tokens",
        f"Saved: {saved:>9,} tokens (~{percent}%)",
    ]
    if verified:
        vb = verified["verified_baseline"]
        vr = verified["verified_returned"]
        vs = verified["verified_saved"]
        vp = verified["verified_percent"]
        inner_lines.append(
            f"Verified (tiktoken): {vs:>9,} tokens (~{vp}%) "
            f"[{vb:,} → {vr:,}]"
        )
    if breakdown:
        parts = [f"{label} {_fmt_compact(tok)}" for label, tok in breakdown.items()]
        inner_lines.append("Breakdown: " + " · ".join(parts))

    # Inner width (between the corner glyphs) must fit three things: the
    # requested width, the longest content line plus one space of padding on
    # each side, and the title plus at least four border dashes. Including
    # the title here keeps the top border the same length as every other
    # row even when the title is wider than the content.
    title_str = f" {title} "
    content_width = max(len(s) for s in inner_lines)
    inner_w = max(width - 2, content_width + 2, len(title_str) + 4)

    dash_total = inner_w - len(title_str)
    left_dash = dash_total // 2
    right_dash = dash_total - left_dash
    top = "┌" + "─" * left_dash + title_str + "─" * right_dash + "┐"
    bottom = "└" + "─" * inner_w + "┘"

    def _box_line(content: str) -> str:
        # inner_w >= content_width + 2, so the padding is never negative.
        return f"│ {content.ljust(inner_w - 2)} │"

    lines = [top]
    lines.extend(_box_line(s) for s in inner_lines)
    lines.append(bottom)
    return "\n".join(lines)
|
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
"""Config-driven custom language support ("bring your own language").
|
|
2
|
+
|
|
3
|
+
Repos can teach the parser new tree-sitter languages without forking by
|
|
4
|
+
dropping a ``languages.toml`` file into ``.code-review-graph/``::
|
|
5
|
+
|
|
6
|
+
[languages.erlang]
|
|
7
|
+
extensions = [".erl", ".hrl"]
|
|
8
|
+
grammar = "erlang" # tree_sitter_language_pack name
|
|
9
|
+
function_node_types = ["function_clause"]
|
|
10
|
+
class_node_types = ["record_decl"]
|
|
11
|
+
import_node_types = ["import_attribute"]
|
|
12
|
+
call_node_types = ["call"]
|
|
13
|
+
comment = "Erlang via the bundled tree-sitter-erlang grammar"
|
|
14
|
+
|
|
15
|
+
The loader is deliberately defensive: a broken config must never crash a
|
|
16
|
+
build. Invalid entries are skipped with a ``logger.warning``, and built-in
|
|
17
|
+
languages always win — custom entries can neither override built-in file
|
|
18
|
+
extensions nor reuse built-in language names. At most
|
|
19
|
+
``MAX_CUSTOM_LANGUAGES`` entries are honoured per repo.
|
|
20
|
+
|
|
21
|
+
See docs/CUSTOM_LANGUAGES.md for the full schema reference (answers #320).
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import logging
|
|
27
|
+
import re
|
|
28
|
+
import sys
|
|
29
|
+
import threading
|
|
30
|
+
from collections.abc import Mapping
|
|
31
|
+
from dataclasses import dataclass, field
|
|
32
|
+
from pathlib import Path
|
|
33
|
+
from typing import Optional
|
|
34
|
+
|
|
35
|
+
import tree_sitter_language_pack as tslp
|
|
36
|
+
|
|
37
|
+
if sys.version_info >= (3, 11):
|
|
38
|
+
import tomllib
|
|
39
|
+
else:
|
|
40
|
+
try:
|
|
41
|
+
import tomli as tomllib # type: ignore[no-redef]
|
|
42
|
+
except ImportError:
|
|
43
|
+
tomllib = None # type: ignore[assignment]
|
|
44
|
+
|
|
45
|
+
logger = logging.getLogger(__name__)
|
|
46
|
+
|
|
47
|
+
#: Location of the config file, relative to the repo root.
|
|
48
|
+
CONFIG_RELATIVE_PATH = Path(".code-review-graph") / "languages.toml"
|
|
49
|
+
|
|
50
|
+
#: Hard cap on the number of custom languages loaded from a single config.
|
|
51
|
+
MAX_CUSTOM_LANGUAGES = 20
|
|
52
|
+
|
|
53
|
+
#: Custom language names: short lowercase identifiers. The name becomes the
|
|
54
|
+
#: ``language`` field on every node parsed from matching files.
|
|
55
|
+
_NAME_RE = re.compile(r"^[a-z][a-z0-9_-]{0,31}$")
|
|
56
|
+
|
|
57
|
+
#: Extensions: a leading dot followed by 1-15 safe characters (".erl",
|
|
58
|
+
#: ".cls", ".4gl"). Uppercase input is normalised to lowercase because the
|
|
59
|
+
#: parser lowercases file suffixes before lookup.
|
|
60
|
+
_EXTENSION_RE = re.compile(r"^\.[a-z0-9_+-]{1,15}$")
|
|
61
|
+
|
|
62
|
+
#: The four node-type lists recognised in each ``[languages.<name>]`` table.
|
|
63
|
+
_NODE_TYPE_KEYS = (
|
|
64
|
+
"function_node_types",
|
|
65
|
+
"class_node_types",
|
|
66
|
+
"import_node_types",
|
|
67
|
+
"call_node_types",
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@dataclass(frozen=True)
class CustomLanguage:
    """One validated ``[languages.<name>]`` entry from languages.toml.

    Instances are immutable (frozen dataclass). They are built by
    ``_validate_entry`` after every field has passed its checks, so the
    values held here are already stripped and normalised.
    """

    # Table name from the config; a short lowercase identifier.
    name: str
    # Grammar name handed to tree_sitter_language_pack (whitespace-stripped).
    grammar: str
    # Lowercased file extensions, each with a leading dot, de-duplicated.
    extensions: tuple[str, ...]
    # Node-type names configured under the four recognised list keys. Each
    # defaults to an empty tuple when the key is absent from the table.
    function_node_types: tuple[str, ...] = ()
    class_node_types: tuple[str, ...] = ()
    import_node_types: tuple[str, ...] = ()
    call_node_types: tuple[str, ...] = ()
    # Free-form note copied from the config; empty when missing or not a string.
    comment: str = ""
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@dataclass(frozen=True)
class _CacheEntry:
    """Cached loader result together with the file stat it was built from."""

    # st_mtime_ns and st_size of the config file when it was loaded; a cached
    # entry is reused only while both still match the file on disk.
    mtime_ns: int
    size: int
    # Validated languages keyed by language name.
    languages: dict[str, CustomLanguage] = field(default_factory=dict)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# Config files are re-read only when their mtime/size changes. This matters
|
|
93
|
+
# because full builds construct one CodeParser per worker task, and probing
|
|
94
|
+
# tree-sitter grammars on every file parse would be wasteful.
|
|
95
|
+
_cache_lock = threading.Lock()
|
|
96
|
+
_cache: dict[str, _CacheEntry] = {}
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def clear_cache() -> None:
    """Empty the module-level loader cache while holding its lock.

    Intended for tests that need the next load to re-read the config file.
    """
    with _cache_lock:
        _cache.clear()
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def load_custom_languages(
    repo_root: Path,
    *,
    builtin_extensions: Mapping[str, str],
    builtin_languages: frozenset[str],
) -> dict[str, CustomLanguage]:
    """Return the validated custom languages configured for a repository.

    The config file is ``<repo_root>/.code-review-graph/languages.toml``.
    Results are cached per config path and reused while the file's
    modification time and size are unchanged. A missing file yields an empty
    dict without logging.

    Args:
        repo_root: Repository root containing ``.code-review-graph/``.
        builtin_extensions: The parser's built-in extension map; custom
            entries colliding with these are skipped (built-ins win).
        builtin_languages: All built-in language identifiers; custom names
            shadowing these are skipped.

    Returns:
        A mapping of custom language name -> :class:`CustomLanguage`,
        possibly empty. The caller receives its own dict, so mutating it
        does not affect the cache.
    """
    config_path = Path(repo_root) / CONFIG_RELATIVE_PATH
    try:
        info = config_path.stat()
    except OSError:
        # No config file is the common case and is not worth a log line.
        return {}

    cache_key = str(config_path)
    with _cache_lock:
        hit = _cache.get(cache_key)
        unchanged = (
            hit is not None
            and hit.mtime_ns == info.st_mtime_ns
            and hit.size == info.st_size
        )
        if unchanged:
            return dict(hit.languages)

    # Parse and validate outside the lock, then publish a copy of the result.
    fresh = _load_uncached(config_path, builtin_extensions, builtin_languages)
    entry = _CacheEntry(info.st_mtime_ns, info.st_size, dict(fresh))
    with _cache_lock:
        _cache[cache_key] = entry
    return fresh
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _load_uncached(
    config_path: Path,
    builtin_extensions: Mapping[str, str],
    builtin_languages: frozenset[str],
) -> dict[str, CustomLanguage]:
    """Read, parse and validate the config file, bypassing the cache.

    Every failure (no TOML parser, unreadable file, malformed TOML, wrong
    shape for ``[languages]``) is logged as a warning and produces an empty
    result. Entries are validated in file order, and at most
    ``MAX_CUSTOM_LANGUAGES`` valid ones are kept.
    """
    if tomllib is None:
        logger.warning(
            "%s found but TOML parsing requires the 'tomli' package on "
            "Python < 3.11 — no custom languages loaded",
            config_path,
        )
        return {}

    try:
        payload = config_path.read_bytes()
    except OSError as exc:  # also covers PermissionError, an OSError subclass
        logger.warning("Cannot read %s: %s — no custom languages loaded", config_path, exc)
        return {}

    try:
        document = tomllib.loads(payload.decode("utf-8", errors="replace"))
    except tomllib.TOMLDecodeError as exc:
        logger.warning("Malformed TOML in %s: %s — no custom languages loaded", config_path, exc)
        return {}

    tables = document.get("languages")
    if tables is None:
        return {}
    if not isinstance(tables, dict):
        logger.warning(
            "%s: [languages] must be a table of tables — no custom languages loaded",
            config_path,
        )
        return {}

    accepted: dict[str, CustomLanguage] = {}
    # Extensions already taken by earlier entries; later duplicates are rejected.
    taken_extensions: set[str] = set()
    for name, table in tables.items():
        if len(accepted) >= MAX_CUSTOM_LANGUAGES:
            logger.warning(
                "%s defines more than %d custom languages — ignoring the rest",
                config_path, MAX_CUSTOM_LANGUAGES,
            )
            break
        entry = _validate_entry(
            name, table, builtin_extensions, builtin_languages,
            taken_extensions, config_path,
        )
        if entry is not None:
            accepted[entry.name] = entry
            taken_extensions.update(entry.extensions)
    return accepted
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _validate_entry(
    name: object,
    table: object,
    builtin_extensions: Mapping[str, str],
    builtin_languages: frozenset[str],
    claimed_extensions: set[str],
    config_path: Path,
) -> Optional[CustomLanguage]:
    """Validate one ``[languages.<name>]`` table.

    Returns ``None`` (after logging a warning) on any problem, so a bad
    entry can never break a build.

    Args:
        name: The table key as parsed from TOML.
        table: The table value; must be a dict.
        builtin_extensions: Built-in extension -> language map; a custom
            extension present here rejects the entry.
        builtin_languages: Built-in language names; a custom name present
            here rejects the entry.
        claimed_extensions: Extensions already taken by earlier custom
            entries. Read only; the caller updates it on success.
        config_path: Config file path, used only in log messages.

    Returns:
        The validated :class:`CustomLanguage`, or ``None`` if any check fails.
    """
    label = name if isinstance(name, str) else repr(name)
    if not isinstance(table, dict):
        logger.warning("%s: [languages.%s] is not a table — skipping", config_path, label)
        return None
    if not isinstance(name, str) or not _NAME_RE.match(name):
        logger.warning(
            "%s: invalid custom language name %r (expected lowercase "
            "letters/digits/_/-, max 32 chars) — skipping",
            config_path, label,
        )
        return None
    if name in builtin_languages:
        logger.warning(
            "%s: custom language %r shadows a built-in language — skipping "
            "(built-ins cannot be overridden)",
            config_path, name,
        )
        return None

    grammar = table.get("grammar")
    if not isinstance(grammar, str) or not grammar.strip():
        logger.warning(
            "%s: custom language %r needs a non-empty 'grammar' string — skipping",
            config_path, name,
        )
        return None
    grammar = grammar.strip()

    raw_extensions = table.get("extensions")
    if not isinstance(raw_extensions, list) or not raw_extensions:
        logger.warning(
            "%s: custom language %r needs a non-empty 'extensions' list — skipping",
            config_path, name,
        )
        return None
    extensions: list[str] = []
    for ext in raw_extensions:
        # Non-string items normalise to "" and fail the leading-dot check.
        normalized = ext.strip().lower() if isinstance(ext, str) else ""
        if not normalized.startswith("."):
            logger.warning(
                "%s: custom language %r: extension %r must start with a dot — skipping",
                config_path, name, ext,
            )
            return None
        if not _EXTENSION_RE.match(normalized):
            logger.warning(
                "%s: custom language %r: extension %r is not a valid file "
                "extension — skipping",
                config_path, name, ext,
            )
            return None
        if normalized in builtin_extensions:
            logger.warning(
                "%s: custom language %r: extension %r is already handled by "
                "the built-in %r parser — skipping (built-ins cannot be overridden)",
                config_path, name, normalized, builtin_extensions[normalized],
            )
            return None
        if normalized in claimed_extensions:
            logger.warning(
                "%s: custom language %r: extension %r is already claimed by "
                "an earlier custom language — skipping",
                config_path, name, normalized,
            )
            return None
        # Drop duplicates within the same entry while keeping first-seen order.
        if normalized not in extensions:
            extensions.append(normalized)

    node_types: dict[str, tuple[str, ...]] = {}
    for key in _NODE_TYPE_KEYS:
        value = table.get(key, [])
        if not isinstance(value, list) or any(
            not isinstance(item, str) or not item.strip() for item in value
        ):
            logger.warning(
                "%s: custom language %r: %s must be a list of non-empty "
                "strings — skipping",
                config_path, name, key,
            )
            return None
        node_types[key] = tuple(item.strip() for item in value)
    if not any(node_types.values()):
        logger.warning(
            "%s: custom language %r defines no node types — nothing to "
            "extract, skipping",
            config_path, name,
        )
        return None

    comment = table.get("comment", "")
    if not isinstance(comment, str):
        comment = ""

    # Probe the grammar last (it is the expensive check). Parser objects
    # themselves are created lazily by CodeParser._get_parser.
    try:
        tslp.get_language(grammar)  # type: ignore[arg-type]
    except Exception as exc:  # noqa: BLE001
        # Deliberately broad: the grammar loader is third-party code and may
        # raise types other than lookup/import/OS errors. Anything it raises
        # means the grammar is unusable, and this function must skip the
        # entry rather than let the error abort the build.
        logger.warning(
            "%s: custom language %r: grammar %r is not available in "
            "tree_sitter_language_pack (%s) — skipping",
            config_path, name, grammar, exc,
        )
        return None

    return CustomLanguage(
        name=name,
        grammar=grammar,
        extensions=tuple(extensions),
        function_node_types=node_types["function_node_types"],
        class_node_types=node_types["class_node_types"],
        import_node_types=node_types["import_node_types"],
        call_node_types=node_types["call_node_types"],
        comment=comment,
    )
|