vigil-codeintel 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. vigil_codeintel-0.1.0.dist-info/METADATA +780 -0
  2. vigil_codeintel-0.1.0.dist-info/RECORD +131 -0
  3. vigil_codeintel-0.1.0.dist-info/WHEEL +5 -0
  4. vigil_codeintel-0.1.0.dist-info/entry_points.txt +3 -0
  5. vigil_codeintel-0.1.0.dist-info/licenses/LICENSE +21 -0
  6. vigil_codeintel-0.1.0.dist-info/top_level.txt +3 -0
  7. vigil_forensic/__init__.py +224 -0
  8. vigil_forensic/_git_utils.py +178 -0
  9. vigil_forensic/_shared.py +510 -0
  10. vigil_forensic/_stubs.py +156 -0
  11. vigil_forensic/gate_checks/__init__.py +1 -0
  12. vigil_forensic/gate_checks/_ast_helpers.py +629 -0
  13. vigil_forensic/gate_checks/_deployment_detector.py +573 -0
  14. vigil_forensic/gate_checks/atomic_write_checks.py +1143 -0
  15. vigil_forensic/gate_checks/authority_checks.py +95 -0
  16. vigil_forensic/gate_checks/boundary_breach_checks.py +202 -0
  17. vigil_forensic/gate_checks/broad_except_checks.py +301 -0
  18. vigil_forensic/gate_checks/broad_except_hidden_sentinel_checks.py +365 -0
  19. vigil_forensic/gate_checks/common.py +253 -0
  20. vigil_forensic/gate_checks/config_safety_checks.py +704 -0
  21. vigil_forensic/gate_checks/config_ssot_checks.py +78 -0
  22. vigil_forensic/gate_checks/conflict_checks.py +193 -0
  23. vigil_forensic/gate_checks/context_fallback_checks.py +697 -0
  24. vigil_forensic/gate_checks/context_health_checks.py +289 -0
  25. vigil_forensic/gate_checks/contract_shape_drift_checks.py +459 -0
  26. vigil_forensic/gate_checks/dirty_baseline_check.py +274 -0
  27. vigil_forensic/gate_checks/duplication_checks.py +387 -0
  28. vigil_forensic/gate_checks/embedded_string_checks.py +123 -0
  29. vigil_forensic/gate_checks/empty_output_checks.py +87 -0
  30. vigil_forensic/gate_checks/encoding_checks.py +847 -0
  31. vigil_forensic/gate_checks/export_completeness_checks.py +156 -0
  32. vigil_forensic/gate_checks/fallback_checks.py +41 -0
  33. vigil_forensic/gate_checks/file_proliferation_checks.py +171 -0
  34. vigil_forensic/gate_checks/fix_without_test_checks.py +69 -0
  35. vigil_forensic/gate_checks/forensic_cluster_runners/__init__.py +9 -0
  36. vigil_forensic/gate_checks/forensic_cluster_runners/_helpers.py +71 -0
  37. vigil_forensic/gate_checks/forensic_cluster_runners/advanced_checks.py +322 -0
  38. vigil_forensic/gate_checks/forensic_cluster_runners/core.py +273 -0
  39. vigil_forensic/gate_checks/forensic_cluster_runners/integrity_checks.py +203 -0
  40. vigil_forensic/gate_checks/forensic_cluster_runners/quality_checks.py +666 -0
  41. vigil_forensic/gate_checks/forensic_clusters/__init__.py +193 -0
  42. vigil_forensic/gate_checks/forensic_clusters/allowlist.py +426 -0
  43. vigil_forensic/gate_checks/forensic_clusters/allowlist_writer.py +302 -0
  44. vigil_forensic/gate_checks/forensic_clusters/api_protocol.py +231 -0
  45. vigil_forensic/gate_checks/forensic_clusters/async_quality.py +1156 -0
  46. vigil_forensic/gate_checks/forensic_clusters/code_style.py +808 -0
  47. vigil_forensic/gate_checks/forensic_clusters/core.py +319 -0
  48. vigil_forensic/gate_checks/forensic_clusters/data_quality.py +763 -0
  49. vigil_forensic/gate_checks/forensic_clusters/dead_code.py +480 -0
  50. vigil_forensic/gate_checks/forensic_clusters/edit_mutation.py +842 -0
  51. vigil_forensic/gate_checks/forensic_clusters/exception_boundary.py +240 -0
  52. vigil_forensic/gate_checks/forensic_clusters/legacy_debt.py +556 -0
  53. vigil_forensic/gate_checks/forensic_clusters/static_analysis.py +834 -0
  54. vigil_forensic/gate_checks/forensic_clusters/structural_quality.py +298 -0
  55. vigil_forensic/gate_checks/god_object_zones_checks.py +173 -0
  56. vigil_forensic/gate_checks/hallucination_checks.py +566 -0
  57. vigil_forensic/gate_checks/hunter_artifact_completeness_check.py +139 -0
  58. vigil_forensic/gate_checks/implementation_overfit_checks.py +380 -0
  59. vigil_forensic/gate_checks/import_integrity_checks.py +233 -0
  60. vigil_forensic/gate_checks/imports_in_function_checks.py +283 -0
  61. vigil_forensic/gate_checks/ml_checks.py +318 -0
  62. vigil_forensic/gate_checks/performance_checks.py +106 -0
  63. vigil_forensic/gate_checks/project_specific_runner.py +691 -0
  64. vigil_forensic/gate_checks/provider_capability_checks.py +73 -0
  65. vigil_forensic/gate_checks/refactor_completeness_checks.py +274 -0
  66. vigil_forensic/gate_checks/reliability_checks.py +389 -0
  67. vigil_forensic/gate_checks/reporting_checks.py +55 -0
  68. vigil_forensic/gate_checks/runtime_behavior_checks.py +220 -0
  69. vigil_forensic/gate_checks/security_injection_checks.py +332 -0
  70. vigil_forensic/gate_checks/semantic_intent_checks.py +139 -0
  71. vigil_forensic/gate_checks/size_complexity_checks.py +336 -0
  72. vigil_forensic/gate_checks/stuck_feature_flag_checks.py +354 -0
  73. vigil_forensic/gate_checks/syntax_validity_checks.py +217 -0
  74. vigil_forensic/gate_checks/temporal_freshness_checks.py +79 -0
  75. vigil_forensic/gate_checks/test_quality_checks.py +946 -0
  76. vigil_forensic/gate_checks/testing_checks.py +149 -0
  77. vigil_forensic/gate_checks/toctou_checks.py +367 -0
  78. vigil_forensic/gate_checks/type_checking_checks.py +316 -0
  79. vigil_forensic/gate_models.py +392 -0
  80. vigil_forensic/gate_packs/__init__.py +1 -0
  81. vigil_forensic/gate_packs/universal.py +179 -0
  82. vigil_forensic/gate_profile.json +31 -0
  83. vigil_forensic/gate_registry.py +21 -0
  84. vigil_forensic/language_profiles.py +219 -0
  85. vigil_forensic/meta_findings.py +207 -0
  86. vigil_forensic/self_audit.py +725 -0
  87. vigil_forensic/source_analysis.py +175 -0
  88. vigil_mapper/__init__.py +103 -0
  89. vigil_mapper/_ast_helpers_minimal.py +229 -0
  90. vigil_mapper/_extract_imports_impl.py +123 -0
  91. vigil_mapper/_file_count_guard.py +129 -0
  92. vigil_mapper/_git_utils.py +178 -0
  93. vigil_mapper/_runtime_ast.py +438 -0
  94. vigil_mapper/_runtime_dispatch.py +137 -0
  95. vigil_mapper/_seed_helpers.py +82 -0
  96. vigil_mapper/authority_builder.py +1102 -0
  97. vigil_mapper/cli_entry.py +731 -0
  98. vigil_mapper/conflict_builder.py +818 -0
  99. vigil_mapper/data_contract_builder.py +446 -0
  100. vigil_mapper/findings_builder.py +716 -0
  101. vigil_mapper/fingerprint.py +53 -0
  102. vigil_mapper/hotspot_builder.py +539 -0
  103. vigil_mapper/map_common.py +449 -0
  104. vigil_mapper/map_errors.py +55 -0
  105. vigil_mapper/map_models.py +431 -0
  106. vigil_mapper/map_models_ext.py +206 -0
  107. vigil_mapper/map_models_findings.py +130 -0
  108. vigil_mapper/map_storage.py +455 -0
  109. vigil_mapper/parse_cache.py +795 -0
  110. vigil_mapper/refactor_boundary_builder.py +266 -0
  111. vigil_mapper/runtime_builder.py +527 -0
  112. vigil_mapper/runtime_tracer.py +243 -0
  113. vigil_mapper/runtime_tracer_entry.py +199 -0
  114. vigil_mapper/semantic_diff.py +71 -0
  115. vigil_mapper/source_adapters/__init__.py +109 -0
  116. vigil_mapper/source_adapters/_base.py +264 -0
  117. vigil_mapper/source_adapters/_ir.py +156 -0
  118. vigil_mapper/source_adapters/_lexer.py +309 -0
  119. vigil_mapper/source_adapters/_patterns.py +212 -0
  120. vigil_mapper/source_adapters/_treesitter.py +182 -0
  121. vigil_mapper/source_adapters/go.py +553 -0
  122. vigil_mapper/source_adapters/java.py +541 -0
  123. vigil_mapper/source_adapters/javascript.py +626 -0
  124. vigil_mapper/source_adapters/python.py +325 -0
  125. vigil_mapper/source_adapters/typescript.py +749 -0
  126. vigil_mapper/structural_builder.py +586 -0
  127. vigil_mcp/__init__.py +1 -0
  128. vigil_mcp/_jobs.py +587 -0
  129. vigil_mcp/_paths.py +93 -0
  130. vigil_mcp/forensic_server.py +419 -0
  131. vigil_mcp/map_server.py +452 -0
@@ -0,0 +1,795 @@
1
+ """Two-level parse cache for the map builder subsystem.
2
+
3
+ L1 (ParseCacheL1): In-memory cache for a single build session.
4
+ L2 (ParseCacheL2): On-disk persistent cache in <project>/.cortex/.map_cache/.
5
+
6
+ Design:
7
+ - ParsedFile holds per-file signals extracted by AST parsing (no ast.Module —
8
+ not serialisable). Reused by structural, runtime, data_contract, authority
9
+ builders so each file is parsed at most once per build.
10
+ - content_hash = sha256(source_text.encode("utf-8")).hexdigest()[:32] (first 32 hex chars of the digest)
11
+ - adapter_version_hash = sha256(adapter capability strings + extraction-module source hashes)[:16]
12
+ - L2 cache entries live in .cortex/.map_cache/<content_hash>.json
13
+ - Corrupt / wrong-version entries are treated as cache misses, never raised.
14
+ - Thread-safety: L1 is not thread-safe (single-threaded builder loop).
15
+ L2 writes are atomic (tempfile + os.replace).
16
+ """
17
+ from __future__ import annotations
18
+
19
+ import collections
20
+ import hashlib
21
+ import json
22
+ import logging
23
+ import os
24
+ import tempfile
25
+ import time
26
+ from dataclasses import dataclass
27
+ from pathlib import Path
28
+ from typing import TYPE_CHECKING
29
+
30
+ if TYPE_CHECKING:
31
+ pass
32
+
33
+ __all__ = [
34
+ "ParsedFile",
35
+ "ParseCacheL1",
36
+ "ParseCacheL2",
37
+ ]
38
+
39
+ _log = logging.getLogger(__name__)
40
+
41
+ # Bump this when ParsedFile schema changes incompatibly.
42
+ _CACHE_FORMAT_VERSION = 1
43
+
44
+ # Subdirectory inside .cortex for the L2 on-disk cache.
45
+ _CACHE_SUBDIR = ".cortex/.map_cache"
46
+
47
+
48
+ # ---------------------------------------------------------------------------
49
+ # Adapter version hash — invalidates cache when parser logic changes
50
+ # ---------------------------------------------------------------------------
51
+
52
def _compute_adapter_version_hash() -> str:
    """Return a 16-char hex fingerprint of the adapter set and extraction code.

    The fingerprint is a truncated SHA-256 over two kinds of input:

    1. One line per registered adapter: its key in ``ADAPTERS``, the adapter
       class name and its four capability flags (structural, contracts,
       runtime signals, authority writes).
    2. One ``extraction_code:`` line holding short content hashes of the
       builder modules listed below, plus a combined hash of every ``*.py``
       file directly inside the ``source_adapters/`` package directory.

    Files that cannot be read contribute an ``ERROR`` marker and files or
    directories that do not exist contribute ``MISSING``; neither raises.
    """
    from .source_adapters import ADAPTERS  # noqa: PLC0415

    def _text_digest(path: Path) -> str | None:
        """Return the sha256 hex digest of *path*'s UTF-8 text, or None if unreadable."""
        try:
            text = path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            _log.debug("_compute_adapter_version_hash: failed to read %s", path.name)
            return None
        return hashlib.sha256(text.encode("utf-8")).hexdigest()

    fingerprint_lines: list[str] = []

    # --- Part 1: adapter capabilities, in sorted key order ---
    for ext in sorted(ADAPTERS):
        adapter = ADAPTERS[ext]
        fingerprint_lines.append(
            "{!s}|{!s}|structural={!s}|contracts={!s}|runtime={!s}|writes={!s}".format(
                ext,
                adapter.__class__.__name__,
                adapter.supports_structural,
                adapter.supports_contracts,
                adapter.supports_runtime_signals,
                adapter.supports_authority_writes,
            )
        )

    # --- Part 2: content hashes of the modules that implement extraction ---
    package_dir = Path(__file__).parent
    code_markers: list[str] = []
    for file_name in (
        "parse_cache.py",
        "structural_builder.py",
        "runtime_builder.py",
        "data_contract_builder.py",
        "authority_builder.py",
    ):
        candidate = package_dir / file_name
        if not candidate.exists():
            # A module absent from this installation is recorded, not an error.
            code_markers.append(f"{file_name}:MISSING")
            continue
        digest = _text_digest(candidate)
        # An unreadable file gets a fixed marker so the fingerprint stays computable.
        marker = "ERROR" if digest is None else digest[:8]
        code_markers.append(f"{file_name}:{marker}")

    # source_adapters/ is a package directory: fold all of its *.py files into
    # one combined hash so that editing any adapter file changes the result.
    adapters_dir = package_dir / "source_adapters"
    if adapters_dir.is_dir():
        adapter_entries: list[str] = []
        for adapter_path in sorted(adapters_dir.glob("*.py")):
            digest = _text_digest(adapter_path)
            marker = "ERROR" if digest is None else digest
            adapter_entries.append(f"{adapter_path.name}:{marker}")
        combined_adapter_hash = hashlib.sha256(
            "\n".join(adapter_entries).encode("utf-8")
        ).hexdigest()[:8]
        code_markers.append(f"source_adapters_dir:{combined_adapter_hash}")
    else:
        _log.warning(
            "_compute_adapter_version_hash: source_adapters/ directory missing at %s",
            adapters_dir,
        )
        code_markers.append("source_adapters_dir:MISSING")

    fingerprint_lines.append("extraction_code:" + ",".join(code_markers))
    joined = "\n".join(sorted(fingerprint_lines))
    return hashlib.sha256(joined.encode("utf-8")).hexdigest()[:16]
139
+
140
+
141
+ # Computed once per process lifetime (adapters are registered at import time).
142
+ _ADAPTER_VERSION_HASH: str | None = None
143
+
144
+
145
def _get_adapter_version_hash() -> str:
    """Return the adapter version hash, computing it on the first call only.

    The value is memoised in the module-level ``_ADAPTER_VERSION_HASH`` so
    later calls in the same process reuse it.
    """
    global _ADAPTER_VERSION_HASH  # noqa: PLW0603
    current = _ADAPTER_VERSION_HASH
    if current is None:
        current = _ADAPTER_VERSION_HASH = _compute_adapter_version_hash()
    return current
150
+
151
+
152
+ # ---------------------------------------------------------------------------
153
+ # ParsedFile dataclass
154
+ # ---------------------------------------------------------------------------
155
+
156
@dataclass
class ParsedFile:
    """Signals extracted from one source file, kept in a JSON-friendly shape.

    No ``ast.Module`` is stored because it is not serialisable. Every
    collection field is a plain ``list`` (never a tuple) so instances
    round-trip through JSON without conversion; builders that need tuples
    cast on consumption.
    """

    # -- Structural signals --
    # Dotted module names imported by this file.
    imports_out: list[str]
    # Class / function names defined at any scope.
    symbols_defined: list[str]

    # -- Runtime signals --
    # os.environ keys read by this file.
    env_vars: list[str]
    # Import-time side-effect categories detected.
    side_effects: list[str]
    # Write-target paths detected by AST.
    write_calls: list[str]

    # -- Data-contract signals --
    # Names of dataclass / pydantic / NamedTuple / TypedDict classes.
    entity_classes: list[str]

    # -- Meta --
    # False iff the source had a SyntaxError.
    is_parseable: bool
    # sha256(source)[:32]
    content_hash: str
    # Line count of the source.
    size_lines: int
180
+
181
+
182
def _parsed_file_to_dict(pf: ParsedFile) -> dict:
    """Return the fields of *pf* as a plain dict keyed by field name.

    Values are the attribute objects themselves (no copying); keys appear in
    the same order as the ``ParsedFile`` fields.
    """
    field_names = (
        "imports_out",
        "symbols_defined",
        "env_vars",
        "side_effects",
        "write_calls",
        "entity_classes",
        "is_parseable",
        "content_hash",
        "size_lines",
    )
    return {name: getattr(pf, name) for name in field_names}
194
+
195
+
196
def _parsed_file_from_dict(d: dict) -> ParsedFile:
    """Build a ``ParsedFile`` from a dict as produced by ``_parsed_file_to_dict``.

    Missing keys fall back to defaults: an empty list for the list fields,
    ``True`` for ``is_parseable``, ``""`` for ``content_hash`` and ``0`` for
    ``size_lines``. Present values are coerced with ``list`` / ``bool`` /
    ``str`` / ``int``, so a value that cannot be coerced raises from that call.
    """
    list_field_names = (
        "imports_out",
        "symbols_defined",
        "env_vars",
        "side_effects",
        "write_calls",
        "entity_classes",
    )
    # Copy each list so the result never aliases the input dict's values.
    list_values = {name: list(d.get(name, [])) for name in list_field_names}
    parseable = bool(d.get("is_parseable", True))
    digest = str(d.get("content_hash", ""))
    line_total = int(d.get("size_lines", 0))
    return ParsedFile(
        **list_values,
        is_parseable=parseable,
        content_hash=digest,
        size_lines=line_total,
    )
208
+
209
+
210
+ # ---------------------------------------------------------------------------
211
+ # ParseCacheL2 — on-disk persistent cache
212
+ # ---------------------------------------------------------------------------
213
+
214
class ParseCacheL2:
    """On-disk JSON cache stored in <project_dir>/.cortex/.map_cache/.

    The cache key is content_hash (sha256[:32]). Each entry is a JSON file
    named <content_hash>.json holding the parsed signals plus a meta envelope
    (``format_version``, ``adapter_version_hash``, ``content_hash``) used to
    validate the entry on read.

    Failures while reading or writing an entry (missing or unreadable file,
    corrupt JSON, envelope mismatch, ``OSError`` on write) are treated as
    cache misses / skipped writes and are not propagated to callers.
    """

    def __init__(self, project_dir: Path) -> None:
        """Bind the cache to ``<project_dir>/.cortex/.map_cache``.

        The directory is not created here; it is created lazily on the first
        successful write.
        """
        self._cache_dir = project_dir.resolve() / _CACHE_SUBDIR
        self._adapter_hash = _get_adapter_version_hash()
        self._hits = 0
        self._misses = 0

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def get(self, content_hash: str) -> ParsedFile | None:
        """Return the cached ParsedFile for content_hash, or None on miss/error.

        Every None return increments the miss counter; a successful return
        increments the hit counter.
        """
        entry_path = self._cache_dir / (content_hash + ".json")

        # Read directly instead of probing with exists(): the probe can itself
        # raise (e.g. PermissionError) and would race with the read anyway.
        try:
            raw = entry_path.read_text(encoding="utf-8")
            payload = json.loads(raw)
        except FileNotFoundError:
            # Plain miss: no entry has been written for this hash.
            self._misses += 1
            return None
        except (OSError, json.JSONDecodeError, UnicodeDecodeError) as exc:
            _log.debug("ParseCacheL2.get: corrupt entry %s, treating as miss: %s", entry_path.name, exc)
            self._misses += 1
            return None

        if not isinstance(payload, dict):
            _log.debug("ParseCacheL2.get: non-dict payload in %s, treating as miss", entry_path.name)
            self._misses += 1
            return None

        # Entries written under another cache format are unusable.
        if payload.get("format_version") != _CACHE_FORMAT_VERSION:
            _log.debug(
                "ParseCacheL2.get: format_version mismatch in %s (got %r, want %r), miss",
                entry_path.name,
                payload.get("format_version"),
                _CACHE_FORMAT_VERSION,
            )
            self._misses += 1
            return None

        # Entries written under a different adapter/extraction fingerprint are stale.
        if payload.get("adapter_version_hash") != self._adapter_hash:
            _log.debug(
                "ParseCacheL2.get: adapter_version_hash mismatch in %s, miss",
                entry_path.name,
            )
            self._misses += 1
            return None

        signals = payload.get("signals")
        if not isinstance(signals, dict):
            _log.debug("ParseCacheL2.get: missing 'signals' in %s, miss", entry_path.name)
            self._misses += 1
            return None

        try:
            pf = _parsed_file_from_dict(signals)
        except Exception as exc:  # noqa: BLE001 - any malformed entry is just a miss
            _log.debug("ParseCacheL2.get: failed to deserialise %s: %s", entry_path.name, exc)
            self._misses += 1
            return None

        self._hits += 1
        return pf

    def put(self, content_hash: str, parsed_file: ParsedFile) -> None:
        """Atomically write parsed_file to the cache; write errors are logged, not raised."""
        entry_path = self._cache_dir / (content_hash + ".json")
        payload = {
            "format_version": _CACHE_FORMAT_VERSION,
            "adapter_version_hash": self._adapter_hash,
            "content_hash": content_hash,
            "signals": _parsed_file_to_dict(parsed_file),
        }
        try:
            # _atomic_write creates the cache directory itself, so a failure to
            # create it is caught here like any other write error.
            self._atomic_write(entry_path, payload)
        except Exception as exc:  # noqa: BLE001 - cache writes are best-effort
            _log.debug("ParseCacheL2.put: failed to write %s: %s", entry_path.name, exc)

    def flush(self) -> None:
        """No-op — all writes are already atomic. Reserved for future cleanup."""

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _atomic_write(self, path: Path, payload: dict) -> None:
        """Write payload as JSON to path via a sibling temp file + os.replace.

        Creates ``path.parent`` if needed. On any failure the temp file is
        removed (best effort) and the exception is re-raised.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        # The temp file lives in the target directory so os.replace stays on
        # one filesystem.
        fd, tmp_path = tempfile.mkstemp(
            dir=str(path.parent),
            prefix=".pcache_",
            suffix=".tmp",
        )
        try:
            with os.fdopen(fd, "w", encoding="utf-8") as fh:
                fh.write(json.dumps(payload, indent=2, ensure_ascii=False, sort_keys=True))
                fh.write("\n")
            os.replace(tmp_path, str(path))
        except BaseException:
            # Clean up even on KeyboardInterrupt/SystemExit, then re-raise.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
            raise
332
+
333
+
334
+ # ---------------------------------------------------------------------------
335
+ # ParseCacheL1 — in-memory cache for one build session
336
+ # ---------------------------------------------------------------------------
337
+
338
class ParseCacheL1:
    """In-memory parse cache backed by an optional ParseCacheL2.

    Lifetime: one map-build invocation. Entries are keyed by ``str(abs_path)``
    exactly as passed in, so callers should pass a consistent (resolved) path.
    On a get_or_parse() miss the file is read, its content hashed, L2 is
    consulted, and only then is a full AST parse done. A freshly parsed result
    is stored in both L1 and L2.

    Source text is also kept in a bounded in-memory LRU (never written to L2)
    so that several builders consuming the same file do not re-read it.
    """

    # Default cap for the source-text LRU: at most this many file texts are
    # held in memory at once, which bounds growth on repos with thousands of
    # source files.
    _SOURCE_CACHE_MAX_ENTRIES: int = 256

    def __init__(
        self,
        l2: ParseCacheL2 | None = None,
        *,
        max_file_mb: float = 5.0,
        source_cache_max_entries: int | None = None,
    ) -> None:
        """Initialise the L1 in-memory cache.

        Args:
            l2: Optional L2 on-disk cache.
            max_file_mb: Files larger than this threshold (in MiB) are SKIPPED —
                their text is never loaded and an empty ParsedFile is
                returned. The skipped file is recorded in ``oversized_files``.
                Default is 5.0 MiB. Pass ``float('inf')`` to disable.
            source_cache_max_entries: Maximum number of raw source strings to
                retain in the LRU text cache; values below 1 are raised to 1.
                Default: ``_SOURCE_CACHE_MAX_ENTRIES`` (256).
        """
        self._l2 = l2
        self._max_file_bytes: float = max_file_mb * 1024 * 1024
        _cap = source_cache_max_entries if source_cache_max_entries is not None else self._SOURCE_CACHE_MAX_ENTRIES
        self._source_cache_max: int = max(1, _cap)
        self._cache: dict[str, ParsedFile] = {}  # key: str(abs_path)
        # Bounded LRU: OrderedDict keeps insertion order; entries are moved to
        # the end on use and the oldest is popped when the cap is reached.
        self._source_cache: collections.OrderedDict[str, str] = collections.OrderedDict()

        # Oversized-file tracking (consumed by cli_entry to populate meta)
        self.oversized_files: list[dict] = []  # [{path, size_mb}]

        # Counters
        self.hits = 0        # L1 hits
        self.misses = 0      # L1 misses (includes skipped/unreadable files)
        self.l2_hits = 0     # subset of misses served by L2
        self.l2_misses = 0   # subset of misses that required a full parse
        self.total_files = 0  # total get_or_parse() calls
        # Estimated ms saved by L2 hits: estimated parse time minus the
        # measured L2 lookup time. L1 hits are not counted here.
        self.time_saved_ms: float = 0.0

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def get_or_parse(self, abs_path: Path, project_dir: Path) -> ParsedFile:
        """Return ParsedFile for abs_path, computing it if necessary.

        Order of precedence:
          1. L1 in-memory cache
          2. L2 on-disk cache (keyed by content_hash)
          3. Full AST parse (result stored in L1 + L2)

        Oversized and unreadable files yield an empty, non-parseable
        ParsedFile that is cached in L1 only. Source text that was read is
        kept for later retrieval via get_cached_source().
        """
        self.total_files += 1
        key = str(abs_path)

        # --- L1 hit ---
        if key in self._cache:
            self.hits += 1
            _log.debug("ParseCacheL1: L1 hit for %s", abs_path.name)
            return self._cache[key]

        self.misses += 1

        # --- File-size guard (cheap stat before reading) ---
        try:
            file_bytes = abs_path.stat().st_size
        except OSError:
            # Fall through: the read below reports the actual problem.
            file_bytes = 0
        if file_bytes > self._max_file_bytes:
            size_mb = file_bytes / (1024 * 1024)
            _log.warning(
                "ParseCacheL1: skipping oversized file %s (%.1f MiB > %.1f MiB limit)",
                abs_path, size_mb, self._max_file_bytes / (1024 * 1024),
            )
            self.oversized_files.append({"path": str(abs_path), "size_mb": round(size_mb, 3)})
            pf = _empty_parsed_file("")
            self._cache[key] = pf
            return pf

        # --- Read source ---
        try:
            source = abs_path.read_text(encoding="utf-8", errors="replace")
        except OSError as exc:
            _log.warning("ParseCacheL1.get_or_parse: cannot read %s: %s", abs_path, exc)
            pf = _empty_parsed_file("")
            self._cache[key] = pf
            return pf

        # Store source in the bounded LRU (evict the oldest when the cap is reached).
        if key not in self._source_cache and len(self._source_cache) >= self._source_cache_max:
            self._source_cache.popitem(last=False)
        self._source_cache[key] = source
        self._source_cache.move_to_end(key)  # mark as most-recently used

        content_hash = hashlib.sha256(source.encode("utf-8")).hexdigest()[:32]

        # --- L2 hit ---
        if self._l2 is not None:
            # Time only the L2 lookup: stat/read/hash above are paid on a full
            # parse as well, so they must not count against the saving.
            lookup_started = time.perf_counter()
            cached = self._l2.get(content_hash)
            if cached is not None:
                lookup_ms = (time.perf_counter() - lookup_started) * 1000
                self.l2_hits += 1
                self.time_saved_ms += _estimate_parse_time_ms(source) - lookup_ms
                _log.debug("ParseCacheL1: L2 hit for %s (hash=%s)", abs_path.name, content_hash)
                self._cache[key] = cached
                return cached

        # --- Full parse ---
        self.l2_misses += 1
        pf = _parse_file(source, content_hash, abs_path, project_dir)
        _log.debug("ParseCacheL1: parsed %s (%d lines)", abs_path.name, pf.size_lines)

        self._cache[key] = pf
        if self._l2 is not None:
            self._l2.put(content_hash, pf)

        return pf

    def get_cached_source(self, abs_path: Path) -> str | None:
        """Return cached source text if available, else None.

        Lets callers avoid re-reading a file already read by get_or_parse().
        On a hit the entry is promoted to most-recently-used so it survives
        longer in the LRU.
        """
        key = str(abs_path)
        src = self._source_cache.get(key)
        if src is not None:
            self._source_cache.move_to_end(key)
        return src

    def log_stats(self) -> None:
        """Log hit/miss stats at INFO level."""
        total = self.total_files
        l1_rate = (self.hits / total * 100) if total > 0 else 0.0
        l2_rate = (self.l2_hits / self.misses * 100) if self.misses > 0 else 0.0
        _log.info(
            "ParseCacheL1 stats: total=%d L1_hits=%d (%.0f%%) "
            "L2_hits=%d (%.0f%% of L1-misses) full_parses=%d "
            "estimated_saved=%.0fms",
            total,
            self.hits,
            l1_rate,
            self.l2_hits,
            l2_rate,
            self.l2_misses,
            self.time_saved_ms,
        )
506
+
507
+
508
+ # ---------------------------------------------------------------------------
509
+ # Parsing implementation (no ast.Module stored in ParsedFile)
510
+ # ---------------------------------------------------------------------------
511
+
512
def _empty_parsed_file(content_hash: str) -> ParsedFile:
    """Return a ParsedFile with no signals, marked as not parseable.

    All list fields are fresh empty lists, ``size_lines`` is 0 and
    ``content_hash`` is stored as given.
    """
    list_field_names = (
        "imports_out",
        "symbols_defined",
        "env_vars",
        "side_effects",
        "write_calls",
        "entity_classes",
    )
    # One new list per field so the instances never share list objects.
    blank_lists = {name: [] for name in list_field_names}
    return ParsedFile(
        **blank_lists,
        is_parseable=False,
        content_hash=content_hash,
        size_lines=0,
    )
525
+
526
+
527
def _estimate_parse_time_ms(source: str) -> float:
    """Return a rough AST parse-time estimate in milliseconds for *source*.

    The estimate is one millisecond per 200 lines (newline count plus one),
    with a floor of 1.0 ms. It feeds time_saved_ms accounting only and is
    not a measurement.
    """
    line_count = 1 + source.count("\n")
    estimate = line_count / 200.0
    if estimate > 1.0:
        return estimate
    return 1.0
535
+
536
+
537
def _parse_file(
    source: str,
    content_hash: str,
    abs_path: Path,
    project_dir: Path,
) -> ParsedFile:
    """Extract all signals from source via AST.

    Source that ``ast.parse`` rejects yields a ParsedFile with
    ``is_parseable=False``, empty signal lists and the real line count
    instead of raising. Besides ``SyntaxError`` this covers ``ValueError``
    (raised for NUL bytes before Python 3.12) and ``RecursionError`` (very
    deeply nested code).

    Args:
        source: Full text of the file.
        content_hash: Hash of the source, stored on the result unchanged.
        abs_path: Path of the file; used only for the debug log message.
        project_dir: Accepted for interface compatibility; not used here.
    """
    import ast  # noqa: PLC0415

    # Count lines so that a trailing newline adds no phantom line and an
    # empty source counts as zero lines.
    size_lines = source.count("\n") + (1 if source and not source.endswith("\n") else 0)

    # --- Parseability check ---
    try:
        tree = ast.parse(source)
    except (SyntaxError, ValueError, RecursionError) as exc:
        _log.debug("_parse_file: %s is not parseable: %s", abs_path.name, exc)
        unparseable = _empty_parsed_file(content_hash)
        unparseable.size_lines = size_lines
        return unparseable

    return ParsedFile(
        imports_out=_extract_imports_out(tree, source),
        # Names of every class / function / async function at any depth,
        # in ast.walk order.
        symbols_defined=[
            node.name
            for node in ast.walk(tree)
            if isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef))
        ],
        env_vars=_extract_env_vars(tree),
        side_effects=_extract_side_effects(tree),
        write_calls=_extract_write_calls(tree),
        entity_classes=_extract_entity_classes(tree),
        is_parseable=True,
        content_hash=content_hash,
        size_lines=size_lines,
    )
598
+
599
+
600
def _extract_imports_out(tree: "ast.Module", source: str) -> list[str]:
    """Return the de-duplicated import targets found anywhere in *tree*.

    Targets appear in first-seen ``ast.walk`` order:

    - ``import a.b`` contributes ``a.b``.
    - ``from x import y`` contributes ``x`` and the candidate ``x.y``
      (``*`` is never turned into a candidate).
    - ``from ..pkg import q`` contributes ``..pkg`` only.
    - ``from . import m`` contributes ``.m``.

    *source* is accepted for interface compatibility and is not used.
    """
    import ast  # noqa: PLC0415

    # A dict is used as an insertion-ordered set.
    ordered: dict[str, None] = {}

    def _remember(name: str) -> None:
        if name:
            ordered.setdefault(name, None)

    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            for alias in node.names:
                _remember(alias.name)
            continue
        if not isinstance(node, ast.ImportFrom):
            continue

        explicit_names = [alias.name for alias in node.names if alias.name != "*"]
        if node.level > 0:
            # Relative import: keep the leading dots as the prefix.
            prefix = "." * node.level
            if node.module:
                _remember(prefix + node.module)
            else:
                for imported in explicit_names:
                    _remember(prefix + imported)
        elif node.module:
            # Absolute import: the module itself, then 'module.name'
            # candidates for sub-module resolution.
            _remember(node.module)
            for imported in explicit_names:
                _remember(f"{node.module}.{imported}")

    return list(ordered)
635
+
636
+
637
# Attribute chains of the calls treated as environment-variable reads.
_ENV_CALL_PATTERNS = frozenset(
    (
        ("os", "environ", "get"),  # os.environ.get(...)
        ("os", "getenv"),          # os.getenv(...)
    )
)
641
+
642
+
643
def _extract_env_vars(tree: "ast.Module") -> list[str]:
    """Return the literal environment-variable names read through ``os``.

    Three forms are recognised, and only when the object is the bare name
    ``os`` and the key is a string literal:

    - ``os.environ.get("KEY", ...)``
    - ``os.getenv("KEY", ...)``
    - ``os.environ["KEY"]``

    Names are de-duplicated. All call-based reads come first (in
    ``ast.walk`` order), followed by subscript reads not already seen.
    """
    import ast  # noqa: PLC0415

    names: list[str] = []
    recorded: set[str] = set()

    def _record(candidate: object) -> None:
        """Append the string held by *candidate* if it is a new string literal."""
        if not (isinstance(candidate, ast.Constant) and isinstance(candidate.value, str)):
            return
        if candidate.value not in recorded:
            recorded.add(candidate.value)
            names.append(candidate.value)

    def _is_os_attribute(expr: object, attr: str) -> bool:
        """Return True when *expr* is exactly ``os.<attr>``."""
        return (
            isinstance(expr, ast.Attribute)
            and expr.attr == attr
            and isinstance(expr.value, ast.Name)
            and expr.value.id == "os"
        )

    all_nodes = list(ast.walk(tree))

    # Pass 1: os.environ.get("KEY") and os.getenv("KEY") calls.
    for node in all_nodes:
        if not isinstance(node, ast.Call):
            continue
        callee = node.func
        if not isinstance(callee, ast.Attribute):
            continue
        is_environ_get = callee.attr == "get" and _is_os_attribute(callee.value, "environ")
        is_getenv = _is_os_attribute(callee, "getenv")
        # Only the first positional argument is inspected.
        if (is_environ_get or is_getenv) and node.args:
            _record(node.args[0])

    # Pass 2: os.environ["KEY"] subscripts.
    for node in all_nodes:
        if isinstance(node, ast.Subscript) and _is_os_attribute(node.value, "environ"):
            _record(node.slice)

    return names
699
+
700
+
701
def _extract_side_effects(tree: "ast.Module") -> list[str]:
    """Classify bare call statements found directly in the module body.

    Only top-level expression statements that are calls are inspected; calls
    nested in ``if``/``def``/assignments are not.  The callee's simple name
    (``f`` for ``f()``, ``attr`` for ``obj.attr()``) selects the category;
    any name not in the table counts as a generic import-time side effect.

    Returns:
        Distinct category names in order of first appearance.
    """
    import ast  # noqa: PLC0415

    generic = "import_time_side_effects"
    # Callee name -> category.  Unlisted names fall back to *generic*.
    category_by_callee = {
        **dict.fromkeys(
            ("register", "setup", "configure", "init", "bootstrap", "start"),
            generic,
        ),
        **dict.fromkeys(("Thread", "Process", "create_task"), "background_task"),
    }

    found: dict[str, None] = {}
    for stmt in getattr(tree, "body", []):
        if not isinstance(stmt, ast.Expr):
            continue
        call = stmt.value
        if not isinstance(call, ast.Call):
            continue
        callee = call.func
        if isinstance(callee, ast.Name):
            callee_name = callee.id
        elif isinstance(callee, ast.Attribute):
            callee_name = callee.attr
        else:
            callee_name = ""
        found.setdefault(category_by_callee.get(callee_name, generic), None)

    return list(found)
732
+
733
+
734
def _extract_write_calls(tree: "ast.Module") -> list[str]:
    """Collect literal paths written via ``Path("...").write_text()`` and friends.

    A call is reported when its method is ``write_text``, ``write_bytes`` or
    ``save`` and its receiver is itself a call to ``Path`` / ``PurePath`` /
    ``PosixPath`` / ``WindowsPath`` (bare or attribute-qualified) whose first
    positional argument is a string literal.  Receivers held in variables and
    non-literal paths are ignored.

    Returns:
        Distinct literal paths in ``ast.walk`` order of first appearance.
    """
    import ast  # noqa: PLC0415

    write_methods = frozenset({"write_text", "write_bytes", "save"})
    path_ctors = ("Path", "PurePath", "PosixPath", "WindowsPath")

    def _literal_target(call):
        """Return the literal path a write call targets, or None."""
        method = call.func
        if not isinstance(method, ast.Attribute) or method.attr not in write_methods:
            return None
        owner = method.value
        if not isinstance(owner, ast.Call):
            return None
        ctor = owner.func
        ctor_name = ctor.id if isinstance(ctor, ast.Name) else getattr(ctor, "attr", "")
        if ctor_name not in path_ctors or not owner.args:
            return None
        first = owner.args[0]
        if isinstance(first, ast.Constant) and isinstance(first.value, str):
            return first.value
        return None

    targets: dict[str, None] = {}
    for node in ast.walk(tree):
        if isinstance(node, ast.Call):
            path = _literal_target(node)
            if path is not None:
                targets.setdefault(path, None)
    return list(targets)
762
+
763
+
764
def _extract_entity_classes(tree: "ast.Module") -> list[str]:
    """Return names of dataclass / pydantic / NamedTuple / TypedDict classes.

    A class anywhere in *tree* qualifies when it is decorated with
    ``@dataclass`` / ``@dataclasses.dataclass`` -- bare or called with
    arguments, e.g. ``@dataclass(frozen=True)`` -- or when one of its direct
    bases is ``NamedTuple``, ``TypedDict`` or ``BaseModel`` (optionally
    qualified with ``typing.`` / ``pydantic.``).

    Returns:
        Class names in ``ast.walk`` order; a name appears once per matching
        class definition.
    """
    import ast  # noqa: PLC0415

    _DATACLASS_DECS = frozenset({"dataclass", "dataclasses.dataclass"})
    _ENTITY_BASES = frozenset({
        "NamedTuple", "typing.NamedTuple",
        "TypedDict", "typing.TypedDict",
        "BaseModel", "pydantic.BaseModel",
    })

    def _name_of(node: ast.expr) -> str:
        """Render a Name/Attribute chain as dotted text; '' for other nodes."""
        if isinstance(node, ast.Name):
            return node.id
        if isinstance(node, ast.Attribute):
            return "%s.%s" % (_name_of(node.value), node.attr)
        return ""

    def _decorator_name(node: ast.expr) -> str:
        """Dotted name of a decorator, looking through a call's parentheses."""
        # '@dataclass(frozen=True)' parses as ast.Call; the name is its func.
        if isinstance(node, ast.Call):
            node = node.func
        return _name_of(node)

    names: list[str] = []
    for node in ast.walk(tree):
        if not isinstance(node, ast.ClassDef):
            continue
        is_dataclass = any(
            _decorator_name(dec) in _DATACLASS_DECS for dec in node.decorator_list
        )
        has_entity_base = any(_name_of(base) in _ENTITY_BASES for base in node.bases)
        if is_dataclass or has_entity_base:
            names.append(node.name)

    return names