code-review-graph-codeblackwell 2.3.6.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. code_review_graph/__init__.py +20 -0
  2. code_review_graph/__main__.py +4 -0
  3. code_review_graph/analysis.py +410 -0
  4. code_review_graph/changes.py +409 -0
  5. code_review_graph/cli.py +1255 -0
  6. code_review_graph/communities.py +874 -0
  7. code_review_graph/constants.py +23 -0
  8. code_review_graph/context_savings.py +317 -0
  9. code_review_graph/custom_languages.py +322 -0
  10. code_review_graph/daemon.py +1009 -0
  11. code_review_graph/daemon_cli.py +320 -0
  12. code_review_graph/docs/LLM-OPTIMIZED-REFERENCE.md +71 -0
  13. code_review_graph/embeddings.py +1006 -0
  14. code_review_graph/enrich.py +303 -0
  15. code_review_graph/eval/__init__.py +33 -0
  16. code_review_graph/eval/benchmarks/__init__.py +1 -0
  17. code_review_graph/eval/benchmarks/agent_baseline.py +193 -0
  18. code_review_graph/eval/benchmarks/build_performance.py +60 -0
  19. code_review_graph/eval/benchmarks/flow_completeness.py +36 -0
  20. code_review_graph/eval/benchmarks/impact_accuracy.py +220 -0
  21. code_review_graph/eval/benchmarks/multi_hop_retrieval.py +125 -0
  22. code_review_graph/eval/benchmarks/search_quality.py +59 -0
  23. code_review_graph/eval/benchmarks/token_efficiency.py +143 -0
  24. code_review_graph/eval/configs/code-review-graph.yaml +50 -0
  25. code_review_graph/eval/configs/express.yaml +45 -0
  26. code_review_graph/eval/configs/fastapi.yaml +48 -0
  27. code_review_graph/eval/configs/flask.yaml +50 -0
  28. code_review_graph/eval/configs/gin.yaml +51 -0
  29. code_review_graph/eval/configs/httpx.yaml +48 -0
  30. code_review_graph/eval/reporter.py +301 -0
  31. code_review_graph/eval/runner.py +211 -0
  32. code_review_graph/eval/scorer.py +85 -0
  33. code_review_graph/eval/token_benchmark.py +182 -0
  34. code_review_graph/exports.py +409 -0
  35. code_review_graph/flows.py +698 -0
  36. code_review_graph/graph.py +1427 -0
  37. code_review_graph/graph_diff.py +122 -0
  38. code_review_graph/hints.py +384 -0
  39. code_review_graph/incremental.py +1245 -0
  40. code_review_graph/jedi_resolver.py +303 -0
  41. code_review_graph/main.py +1079 -0
  42. code_review_graph/memory.py +142 -0
  43. code_review_graph/migrations.py +284 -0
  44. code_review_graph/parser.py +6957 -0
  45. code_review_graph/postprocessing.py +134 -0
  46. code_review_graph/prompts.py +159 -0
  47. code_review_graph/refactor.py +852 -0
  48. code_review_graph/registry.py +319 -0
  49. code_review_graph/rescript_resolver.py +206 -0
  50. code_review_graph/search.py +447 -0
  51. code_review_graph/skills.py +1481 -0
  52. code_review_graph/spring_resolver.py +200 -0
  53. code_review_graph/temporal_resolver.py +199 -0
  54. code_review_graph/token_benchmark.py +125 -0
  55. code_review_graph/tools/__init__.py +156 -0
  56. code_review_graph/tools/_common.py +176 -0
  57. code_review_graph/tools/analysis_tools.py +184 -0
  58. code_review_graph/tools/build.py +541 -0
  59. code_review_graph/tools/community_tools.py +246 -0
  60. code_review_graph/tools/context.py +152 -0
  61. code_review_graph/tools/docs.py +274 -0
  62. code_review_graph/tools/flows_tools.py +176 -0
  63. code_review_graph/tools/query.py +692 -0
  64. code_review_graph/tools/refactor_tools.py +168 -0
  65. code_review_graph/tools/registry_tools.py +125 -0
  66. code_review_graph/tools/review.py +477 -0
  67. code_review_graph/tsconfig_resolver.py +257 -0
  68. code_review_graph/visualization.py +2184 -0
  69. code_review_graph/wiki.py +305 -0
  70. code_review_graph_codeblackwell-2.3.6.post1.dist-info/METADATA +718 -0
  71. code_review_graph_codeblackwell-2.3.6.post1.dist-info/RECORD +74 -0
  72. code_review_graph_codeblackwell-2.3.6.post1.dist-info/WHEEL +4 -0
  73. code_review_graph_codeblackwell-2.3.6.post1.dist-info/entry_points.txt +3 -0
  74. code_review_graph_codeblackwell-2.3.6.post1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,23 @@
1
+ """Shared constants for code-review-graph."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+
7
# Substrings that mark a symbol as security-sensitive.
SECURITY_KEYWORDS: frozenset[str] = frozenset({
    "auth", "login", "password", "token", "session", "crypt", "secret",
    "credential", "permission", "sql", "query", "execute", "connect",
    "socket", "request", "http", "sanitize", "validate", "encrypt",
    "decrypt", "hash", "sign", "verify", "admin", "privilege",
})


def _env_int(name: str, default: int) -> int:
    """Read an integer override from the environment.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset or not an integer.

    Returns:
        The parsed integer, or ``default`` when the variable is missing or
        cannot be parsed. A malformed value is reported with a warning
        instead of raising, so a typo in the environment cannot make this
        module fail at import time.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError:
        # Imported lazily: this module only imports ``os`` at file level.
        import logging

        logging.getLogger(__name__).warning(
            "Ignoring invalid integer %r for %s; using default %d",
            raw, name, default,
        )
        return default


# ---------------------------------------------------------------------------
# Configurable limits (override via environment variables)
# ---------------------------------------------------------------------------
MAX_IMPACT_NODES = _env_int("CRG_MAX_IMPACT_NODES", 500)
MAX_IMPACT_DEPTH = _env_int("CRG_MAX_IMPACT_DEPTH", 2)
MAX_BFS_DEPTH = _env_int("CRG_MAX_BFS_DEPTH", 15)
MAX_SEARCH_RESULTS = _env_int("CRG_MAX_SEARCH_RESULTS", 20)

# BFS engine: "sql" (SQLite recursive CTE) or "networkx" (Python-side BFS)
BFS_ENGINE = os.environ.get("CRG_BFS_ENGINE", "sql")
@@ -0,0 +1,317 @@
1
+ """Compact estimated context savings helpers.
2
+
3
+ The project intentionally labels these values as estimates: the helper uses a
4
+ conservative character-count approximation instead of model-specific tokenizers.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from pathlib import Path
11
+ from typing import Any, Iterable
12
+
13
# Characters assumed per token by every estimate in this module.
CHARS_PER_TOKEN = 4


def estimate_tokens(value: Any) -> int:
    """Estimate token count with a conservative 4 chars/token approximation.

    Args:
        value: ``None``, a string, or any object. Non-string values are
            measured by their compact, key-sorted JSON serialisation.

    Returns:
        ``0`` for ``None`` or empty text, otherwise the character count
        divided by ``CHARS_PER_TOKEN``, rounded up (minimum ``1``).
    """
    if value is None:
        return 0
    if isinstance(value, str):
        text = value
    else:
        try:
            text = json.dumps(
                value,
                default=str,
                ensure_ascii=True,
                separators=(",", ":"),
                sort_keys=True,
            )
        except (TypeError, ValueError):
            # ``default=str`` does not apply to dict keys: mixed-type keys
            # cannot be sorted and tuple/object keys cannot be encoded
            # (TypeError); circular structures raise ValueError. This is
            # only an estimate, so measure the plain string form instead
            # of failing the caller.
            text = str(value)
    if not text:
        return 0
    return max(1, (len(text) + CHARS_PER_TOKEN - 1) // CHARS_PER_TOKEN)
33
+
34
+
35
def estimate_file_tokens(repo_root: Path, files: Iterable[str]) -> int:
    """Sum a size-based token estimate over the given files.

    Only ``stat`` is used; file contents are never read. Relative names are
    resolved against ``repo_root``. Entries that are not regular files, or
    that raise ``OSError`` while being inspected, contribute nothing. Every
    counted file contributes at least one token.
    """
    base = repo_root.resolve()
    token_sum = 0
    for entry in files:
        candidate = Path(entry)
        if not candidate.is_absolute():
            candidate = base / candidate
        try:
            if not candidate.is_file():
                continue
            byte_count = candidate.stat().st_size
        except OSError:
            continue
        # Ceiling division of the byte size by the chars-per-token ratio.
        token_sum += max(1, (byte_count + CHARS_PER_TOKEN - 1) // CHARS_PER_TOKEN)
    return token_sum
52
+
53
+
54
def estimate_context_savings(
    *,
    original_context: Any | None = None,
    returned_context: Any | None = None,
    original_tokens: int | None = None,
    returned_tokens: int | None = None,
) -> dict[str, int | bool] | None:
    """Build the compact savings metadata dict.

    Explicit token counts take precedence; when a count is ``None`` it is
    estimated from the matching ``*_context`` value.

    Returns:
        ``None`` when the baseline is zero or negative, otherwise a dict
        with ``estimated`` (always ``True``), ``saved_tokens`` (never
        negative) and ``saved_percent`` (rounded to a whole number).
    """
    if original_tokens is None:
        baseline = estimate_tokens(original_context)
    else:
        baseline = original_tokens

    if returned_tokens is None:
        returned = estimate_tokens(returned_context)
    else:
        returned = returned_tokens

    if baseline <= 0:
        return None

    # baseline is strictly positive here, so the division is always safe.
    saved = max(0, baseline - returned)
    percent = round((saved / baseline) * 100)
    return dict(
        estimated=True,
        saved_tokens=int(saved),
        saved_percent=int(percent),
    )
83
+
84
+
85
def attach_context_savings(
    result: dict[str, Any],
    *,
    original_context: Any | None = None,
    original_tokens: int | None = None,
    returned_context: Any | None = None,
    returned_tokens: int | None = None,
) -> dict[str, Any]:
    """Add a ``context_savings`` entry to ``result`` when one can be estimated.

    ``result`` itself is measured as the returned context unless
    ``returned_context`` is supplied. The dict is mutated in place and also
    returned; it is left untouched when no estimate is available.
    """
    measured = returned_context if returned_context is not None else result
    savings = estimate_context_savings(
        original_context=original_context,
        returned_context=measured,
        original_tokens=original_tokens,
        returned_tokens=returned_tokens,
    )
    if savings is None:
        return result
    result["context_savings"] = savings
    return result
103
+
104
+
105
def format_context_savings(estimate: dict[str, Any] | None) -> str | None:
    """Render the estimate as a single CLI summary line.

    Returns ``None`` for a missing or empty estimate; missing keys are
    treated as zero.
    """
    if estimate:
        tokens_saved = int(estimate.get("saved_tokens", 0))
        share = int(estimate.get("saved_percent", 0))
        return f"Estimated context saved: ~{tokens_saved:,} tokens (~{share}%)"
    return None
112
+
113
+
114
def _fmt_compact(n: int) -> str:
    """Abbreviate an integer for display.

    Values below 1000 are printed as-is, values below 10,000 get one
    decimal of thousands (1234 -> '1.2k'), and larger values are floored
    to whole thousands with digit grouping (1234567 -> '1,234k').
    """
    if n < 1000:
        return str(n)
    if n < 10_000:
        return f"{n / 1000:.1f}k"
    return f"{n // 1000:,}k"
121
+
122
+
123
def _breakdown_from_response(response: dict[str, Any]) -> dict[str, int]:
    """Estimate tokens per category of a detect-changes / review response.

    Categories whose response field is missing or empty are left out, so
    the result never contains zero entries. Labels appear in a fixed order.
    """
    # Response-dict key -> friendly label, in display order.
    label_for_key = {
        "changed_functions": "Functions",
        "affected_flows": "Flows",
        "test_gaps": "Tests",
        "review_priorities": "Risk",
        "impacted_nodes": "Impact",
        "edges": "Edges",
        "source_snippets": "Source",
        "imports": "Imports",
    }
    breakdown: dict[str, int] = {}
    for key, label in label_for_key.items():
        payload = response.get(key)
        if payload:
            count = estimate_tokens(payload)
            if count > 0:
                breakdown[label] = count
    return breakdown
149
+
150
+
151
def verify_with_tiktoken(
    repo_root: "Path | str",
    changed_files: Iterable[str],
    response: Any,
    encoding_name: str = "cl100k_base",
) -> dict[str, int] | None:
    """Calibrate the chars/4 estimate against a real model tokenizer.

    Reads every changed file's content (unlike the stat-only
    ``estimate_file_tokens``) so the numbers reflect what an agent would
    actually consume.

    Args:
        repo_root: Directory that relative entries of ``changed_files`` are
            resolved against.
        changed_files: File names whose full contents form the baseline.
            Entries that are not regular files or cannot be read are skipped.
        response: The graph response; strings are tokenized as-is, anything
            else is tokenized via its compact, key-sorted JSON form.
        encoding_name: tiktoken encoding to load.

    Returns:
        ``{"verified_baseline": int, "verified_returned": int,
        "verified_saved": int, "verified_percent": int}`` or ``None`` if
        tiktoken is not installed.
    """
    try:
        import tiktoken  # type: ignore[import-untyped]
    except ImportError:
        return None

    enc = tiktoken.get_encoding(encoding_name)
    root = Path(repo_root).resolve()

    def _count(text: str) -> int:
        # disallowed_special=() makes special-token text such as
        # "<|endoftext|>" count as ordinary text; with the default setting
        # tiktoken raises ValueError when input contains it.
        return len(enc.encode(text, disallowed_special=()))

    naive_real = 0
    for f in changed_files:
        p = root / f
        try:
            if p.is_file():
                # Decode as UTF-8 explicitly so counts do not depend on the
                # platform's locale encoding; undecodable bytes are replaced.
                naive_real += _count(p.read_text(encoding="utf-8", errors="replace"))
        except OSError:
            continue

    if isinstance(response, str):
        graph_real = _count(response)
    else:
        text = json.dumps(
            response, default=str, ensure_ascii=True,
            separators=(",", ":"), sort_keys=True,
        )
        graph_real = _count(text)

    saved = max(0, naive_real - graph_real)
    pct = round(saved * 100 / naive_real) if naive_real > 0 else 0
    return {
        "verified_baseline": naive_real,
        "verified_returned": graph_real,
        "verified_saved": saved,
        "verified_percent": pct,
    }
199
+
200
+
201
def format_context_savings_panel(
    estimate: dict[str, Any] | None,
    *,
    original_tokens: int | None = None,
    returned_tokens: int | None = None,
    response: dict[str, Any] | None = None,
    breakdown: dict[str, int] | None = None,
    verified: dict[str, int] | None = None,
    title: str = "Token Savings",
    width: int = 64,
) -> str | None:
    """Format the savings estimate as a boxed multi-line CLI panel.

    The panel has a titled top border, one row each for the baseline, the
    returned size and the saving, an optional tiktoken-verified row, an
    optional breakdown row, and a bottom border. Every line of the panel
    has the same length.

    All numbers are labelled as estimates upstream (``estimated: true`` in the
    metadata dict) because the project uses a 4-chars-per-token approximation,
    not model-specific tokenization.

    Args:
        estimate: The ``context_savings`` dict from a tool response.
        original_tokens: Optional override for the naive baseline. When
            omitted it is reconstructed from ``saved_tokens`` and
            ``saved_percent``.
        returned_tokens: Optional override for the graph response size.
            When omitted it is the baseline minus the saved tokens.
        response: When provided, breakdown is auto-derived from common keys
            (``changed_functions``, ``affected_flows``, ``test_gaps``,
            ``review_priorities``, ``impacted_nodes``, ``edges``,
            ``source_snippets``, ``imports``).
        breakdown: Explicit ``{label: tokens}`` map; takes precedence over
            ``response``-derived breakdown when both are provided.
        verified: Optional dict with the keys ``verified_baseline``,
            ``verified_returned``, ``verified_saved`` and
            ``verified_percent``; adds a "Verified (tiktoken)" row.
        title: Title centered in the top border.
        width: Minimum total panel width. The panel grows beyond it when
            the longest content row or the title needs more room.

    Returns:
        The panel as a single ``\\n``-joined string, or ``None`` when there
        is nothing meaningful to display.
    """
    if not estimate:
        return None

    saved = int(estimate.get("saved_tokens", 0))
    percent = int(estimate.get("saved_percent", 0))

    # Derive baseline + returned from saved+percent if not provided
    if original_tokens is None:
        if percent > 0:
            original_tokens = int(round(saved * 100 / percent))
        else:
            original_tokens = saved
    if returned_tokens is None:
        returned_tokens = max(0, (original_tokens or 0) - saved)

    if breakdown is None and response is not None:
        breakdown = _breakdown_from_response(response)

    # Top up the breakdown with an "Other" bucket so the parts sum to
    # ``returned_tokens`` exactly. "Other" covers fields the breakdown
    # doesn't enumerate (status, summary, risk_score, context_savings
    # metadata, JSON envelope chars). Skip when there's no positive
    # remainder — the breakdown already accounts for the whole response.
    if breakdown and returned_tokens is not None:
        labelled_sum = sum(breakdown.values())
        remainder = returned_tokens - labelled_sum
        if remainder > 0:
            breakdown = dict(breakdown)  # copy before mutating
            breakdown["Other"] = remainder

    # Lines that go inside the box (without borders)
    inner_lines: list[str] = [
        f"Full context would be: {original_tokens:>9,} tokens",
        f"Graph context used: {returned_tokens:>9,} tokens",
        f"Saved: {saved:>9,} tokens (~{percent}%)",
    ]
    if verified:
        vb = verified["verified_baseline"]
        vr = verified["verified_returned"]
        vs = verified["verified_saved"]
        vp = verified["verified_percent"]
        inner_lines.append(
            f"Verified (tiktoken): {vs:>9,} tokens (~{vp}%) "
            f"[{vb:,} → {vr:,}]"
        )
    if breakdown:
        parts = [f"{label} {_fmt_compact(tok)}" for label, tok in breakdown.items()]
        bd_line = "Breakdown: " + " · ".join(parts)
        inner_lines.append(bd_line)

    title_str = f" {title} "
    content_width = max(len(s) for s in inner_lines)
    # The inner width must fit the requested width, the longest row plus one
    # space of padding per side, and the title plus at least four dashes.
    # Including the title here keeps the top border the same length as every
    # other line even when the title is wider than the content.
    inner_w = max(width - 2, content_width + 2, len(title_str) + 4)

    # Title bar: split the remaining dashes around the title, extra on the right.
    dash_total = inner_w - len(title_str)
    left_dash = dash_total // 2
    right_dash = dash_total - left_dash
    top = "┌" + "─" * left_dash + title_str + "─" * right_dash + "┐"
    bottom = "└" + "─" * inner_w + "┘"

    def _box_line(content: str) -> str:
        # inner_w >= content_width + 2, so the padding is never negative.
        pad = max(0, inner_w - 2 - len(content))
        return f"│ {content}{' ' * pad} │"

    lines = [top]
    lines.extend(_box_line(s) for s in inner_lines)
    lines.append(bottom)
    return "\n".join(lines)
@@ -0,0 +1,322 @@
1
+ """Config-driven custom language support ("bring your own language").
2
+
3
+ Repos can teach the parser new tree-sitter languages without forking by
4
+ dropping a ``languages.toml`` file into ``.code-review-graph/``::
5
+
6
+ [languages.erlang]
7
+ extensions = [".erl", ".hrl"]
8
+ grammar = "erlang" # tree_sitter_language_pack name
9
+ function_node_types = ["function_clause"]
10
+ class_node_types = ["record_decl"]
11
+ import_node_types = ["import_attribute"]
12
+ call_node_types = ["call"]
13
+ comment = "Erlang via the bundled tree-sitter-erlang grammar"
14
+
15
+ The loader is deliberately defensive: a broken config must never crash a
16
+ build. Invalid entries are skipped with a ``logger.warning``, and built-in
17
+ languages always win — custom entries can neither override built-in file
18
+ extensions nor reuse built-in language names. At most
19
+ ``MAX_CUSTOM_LANGUAGES`` entries are honoured per repo.
20
+
21
+ See docs/CUSTOM_LANGUAGES.md for the full schema reference (answers #320).
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import logging
27
+ import re
28
+ import sys
29
+ import threading
30
+ from collections.abc import Mapping
31
+ from dataclasses import dataclass, field
32
+ from pathlib import Path
33
+ from typing import Optional
34
+
35
+ import tree_sitter_language_pack as tslp
36
+
37
+ if sys.version_info >= (3, 11):
38
+ import tomllib
39
+ else:
40
+ try:
41
+ import tomli as tomllib # type: ignore[no-redef]
42
+ except ImportError:
43
+ tomllib = None # type: ignore[assignment]
44
+
45
logger = logging.getLogger(__name__)

#: Location of the config file, relative to the repo root.
CONFIG_RELATIVE_PATH = Path(".code-review-graph") / "languages.toml"

#: Hard cap on the number of custom languages loaded from a single config.
MAX_CUSTOM_LANGUAGES = 20

#: Custom language names: short lowercase identifiers. The name becomes the
#: ``language`` field on every node parsed from matching files.
#: Anchored with ``\Z`` rather than ``$``: ``$`` also matches just before a
#: trailing newline, which would let a quoted TOML key such as ``"erlang\n"``
#: pass validation.
_NAME_RE = re.compile(r"^[a-z][a-z0-9_-]{0,31}\Z")

#: Extensions: a leading dot followed by 1-15 safe characters (".erl",
#: ".cls", ".4gl"). Uppercase input is normalised to lowercase because the
#: parser lowercases file suffixes before lookup.
_EXTENSION_RE = re.compile(r"^\.[a-z0-9_+-]{1,15}\Z")

#: The four node-type lists recognised in each ``[languages.<name>]`` table.
_NODE_TYPE_KEYS = (
    "function_node_types",
    "class_node_types",
    "import_node_types",
    "call_node_types",
)
69
+
70
+
71
+ @dataclass(frozen=True)
72
+ class CustomLanguage:
73
+ """One validated ``[languages.<name>]`` entry from languages.toml."""
74
+
75
+ name: str
76
+ grammar: str
77
+ extensions: tuple[str, ...]
78
+ function_node_types: tuple[str, ...] = ()
79
+ class_node_types: tuple[str, ...] = ()
80
+ import_node_types: tuple[str, ...] = ()
81
+ call_node_types: tuple[str, ...] = ()
82
+ comment: str = ""
83
+
84
+
85
+ @dataclass(frozen=True)
86
+ class _CacheEntry:
87
+ mtime_ns: int
88
+ size: int
89
+ languages: dict[str, CustomLanguage] = field(default_factory=dict)
90
+
91
+
92
# Config files are re-read only when their mtime/size changes. This matters
# because full builds construct one CodeParser per worker task, and probing
# tree-sitter grammars on every file parse would be wasteful.
# ``_cache`` maps the config file path (as a string) to its cached entry;
# every read and write of it happens while holding ``_cache_lock``.
_cache_lock = threading.Lock()
_cache: dict[str, _CacheEntry] = {}


def clear_cache() -> None:
    """Drop the loader cache (used by tests).

    Empties the module-level cache under the lock, so the next load of any
    config file re-reads and re-validates it.
    """
    with _cache_lock:
        _cache.clear()
103
+
104
+
105
+ def load_custom_languages(
106
+ repo_root: Path,
107
+ *,
108
+ builtin_extensions: Mapping[str, str],
109
+ builtin_languages: frozenset[str],
110
+ ) -> dict[str, CustomLanguage]:
111
+ """Load and validate ``<repo_root>/.code-review-graph/languages.toml``.
112
+
113
+ Returns a mapping of custom language name -> :class:`CustomLanguage`.
114
+ Always returns (possibly empty) — a broken config never raises.
115
+
116
+ Args:
117
+ repo_root: Repository root containing ``.code-review-graph/``.
118
+ builtin_extensions: The parser's built-in extension map; custom
119
+ entries colliding with these are skipped (built-ins win).
120
+ builtin_languages: All built-in language identifiers; custom names
121
+ shadowing these are skipped.
122
+ """
123
+ config_path = Path(repo_root) / CONFIG_RELATIVE_PATH
124
+ try:
125
+ stat = config_path.stat()
126
+ except OSError:
127
+ return {} # No config file — the common case; not worth a log line.
128
+
129
+ cache_key = str(config_path)
130
+ with _cache_lock:
131
+ cached = _cache.get(cache_key)
132
+ if (
133
+ cached is not None
134
+ and cached.mtime_ns == stat.st_mtime_ns
135
+ and cached.size == stat.st_size
136
+ ):
137
+ return dict(cached.languages)
138
+
139
+ languages = _load_uncached(config_path, builtin_extensions, builtin_languages)
140
+ with _cache_lock:
141
+ _cache[cache_key] = _CacheEntry(stat.st_mtime_ns, stat.st_size, dict(languages))
142
+ return languages
143
+
144
+
145
+ def _load_uncached(
146
+ config_path: Path,
147
+ builtin_extensions: Mapping[str, str],
148
+ builtin_languages: frozenset[str],
149
+ ) -> dict[str, CustomLanguage]:
150
+ if tomllib is None:
151
+ logger.warning(
152
+ "%s found but TOML parsing requires the 'tomli' package on "
153
+ "Python < 3.11 — no custom languages loaded",
154
+ config_path,
155
+ )
156
+ return {}
157
+ try:
158
+ raw = config_path.read_bytes()
159
+ except (OSError, PermissionError) as exc:
160
+ logger.warning("Cannot read %s: %s — no custom languages loaded", config_path, exc)
161
+ return {}
162
+ try:
163
+ data = tomllib.loads(raw.decode("utf-8", errors="replace"))
164
+ except tomllib.TOMLDecodeError as exc:
165
+ logger.warning("Malformed TOML in %s: %s — no custom languages loaded", config_path, exc)
166
+ return {}
167
+
168
+ tables = data.get("languages")
169
+ if tables is None:
170
+ return {}
171
+ if not isinstance(tables, dict):
172
+ logger.warning(
173
+ "%s: [languages] must be a table of tables — no custom languages loaded",
174
+ config_path,
175
+ )
176
+ return {}
177
+
178
+ result: dict[str, CustomLanguage] = {}
179
+ claimed_extensions: set[str] = set()
180
+ for name, table in tables.items():
181
+ if len(result) >= MAX_CUSTOM_LANGUAGES:
182
+ logger.warning(
183
+ "%s defines more than %d custom languages — ignoring the rest",
184
+ config_path, MAX_CUSTOM_LANGUAGES,
185
+ )
186
+ break
187
+ lang = _validate_entry(
188
+ name, table, builtin_extensions, builtin_languages,
189
+ claimed_extensions, config_path,
190
+ )
191
+ if lang is None:
192
+ continue
193
+ result[lang.name] = lang
194
+ claimed_extensions.update(lang.extensions)
195
+ return result
196
+
197
+
198
+ def _validate_entry(
199
+ name: object,
200
+ table: object,
201
+ builtin_extensions: Mapping[str, str],
202
+ builtin_languages: frozenset[str],
203
+ claimed_extensions: set[str],
204
+ config_path: Path,
205
+ ) -> Optional[CustomLanguage]:
206
+ """Validate one ``[languages.<name>]`` table; None (after a warning) on
207
+ any problem so a bad entry can never break a build."""
208
+ label = name if isinstance(name, str) else repr(name)
209
+ if not isinstance(table, dict):
210
+ logger.warning("%s: [languages.%s] is not a table — skipping", config_path, label)
211
+ return None
212
+ if not isinstance(name, str) or not _NAME_RE.match(name):
213
+ logger.warning(
214
+ "%s: invalid custom language name %r (expected lowercase "
215
+ "letters/digits/_/-, max 32 chars) — skipping",
216
+ config_path, label,
217
+ )
218
+ return None
219
+ if name in builtin_languages:
220
+ logger.warning(
221
+ "%s: custom language %r shadows a built-in language — skipping "
222
+ "(built-ins cannot be overridden)",
223
+ config_path, name,
224
+ )
225
+ return None
226
+
227
+ grammar = table.get("grammar")
228
+ if not isinstance(grammar, str) or not grammar.strip():
229
+ logger.warning(
230
+ "%s: custom language %r needs a non-empty 'grammar' string — skipping",
231
+ config_path, name,
232
+ )
233
+ return None
234
+ grammar = grammar.strip()
235
+
236
+ raw_extensions = table.get("extensions")
237
+ if not isinstance(raw_extensions, list) or not raw_extensions:
238
+ logger.warning(
239
+ "%s: custom language %r needs a non-empty 'extensions' list — skipping",
240
+ config_path, name,
241
+ )
242
+ return None
243
+ extensions: list[str] = []
244
+ for ext in raw_extensions:
245
+ normalized = ext.strip().lower() if isinstance(ext, str) else ""
246
+ if not normalized.startswith("."):
247
+ logger.warning(
248
+ "%s: custom language %r: extension %r must start with a dot — skipping",
249
+ config_path, name, ext,
250
+ )
251
+ return None
252
+ if not _EXTENSION_RE.match(normalized):
253
+ logger.warning(
254
+ "%s: custom language %r: extension %r is not a valid file "
255
+ "extension — skipping",
256
+ config_path, name, ext,
257
+ )
258
+ return None
259
+ if normalized in builtin_extensions:
260
+ logger.warning(
261
+ "%s: custom language %r: extension %r is already handled by "
262
+ "the built-in %r parser — skipping (built-ins cannot be overridden)",
263
+ config_path, name, normalized, builtin_extensions[normalized],
264
+ )
265
+ return None
266
+ if normalized in claimed_extensions:
267
+ logger.warning(
268
+ "%s: custom language %r: extension %r is already claimed by "
269
+ "an earlier custom language — skipping",
270
+ config_path, name, normalized,
271
+ )
272
+ return None
273
+ if normalized not in extensions:
274
+ extensions.append(normalized)
275
+
276
+ node_types: dict[str, tuple[str, ...]] = {}
277
+ for key in _NODE_TYPE_KEYS:
278
+ value = table.get(key, [])
279
+ if not isinstance(value, list) or any(
280
+ not isinstance(item, str) or not item.strip() for item in value
281
+ ):
282
+ logger.warning(
283
+ "%s: custom language %r: %s must be a list of non-empty "
284
+ "strings — skipping",
285
+ config_path, name, key,
286
+ )
287
+ return None
288
+ node_types[key] = tuple(item.strip() for item in value)
289
+ if not any(node_types.values()):
290
+ logger.warning(
291
+ "%s: custom language %r defines no node types — nothing to "
292
+ "extract, skipping",
293
+ config_path, name,
294
+ )
295
+ return None
296
+
297
+ comment = table.get("comment", "")
298
+ if not isinstance(comment, str):
299
+ comment = ""
300
+
301
+ # Probe the grammar last (it is the expensive check). Parser objects
302
+ # themselves are created lazily by CodeParser._get_parser.
303
+ try:
304
+ tslp.get_language(grammar) # type: ignore[arg-type]
305
+ except (LookupError, ValueError, ImportError, OSError) as exc:
306
+ logger.warning(
307
+ "%s: custom language %r: grammar %r is not available in "
308
+ "tree_sitter_language_pack (%s) — skipping",
309
+ config_path, name, grammar, exc,
310
+ )
311
+ return None
312
+
313
+ return CustomLanguage(
314
+ name=name,
315
+ grammar=grammar,
316
+ extensions=tuple(extensions),
317
+ function_node_types=node_types["function_node_types"],
318
+ class_node_types=node_types["class_node_types"],
319
+ import_node_types=node_types["import_node_types"],
320
+ call_node_types=node_types["call_node_types"],
321
+ comment=comment,
322
+ )