vigil-codeintel 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vigil_codeintel-0.1.0.dist-info/METADATA +780 -0
- vigil_codeintel-0.1.0.dist-info/RECORD +131 -0
- vigil_codeintel-0.1.0.dist-info/WHEEL +5 -0
- vigil_codeintel-0.1.0.dist-info/entry_points.txt +3 -0
- vigil_codeintel-0.1.0.dist-info/licenses/LICENSE +21 -0
- vigil_codeintel-0.1.0.dist-info/top_level.txt +3 -0
- vigil_forensic/__init__.py +224 -0
- vigil_forensic/_git_utils.py +178 -0
- vigil_forensic/_shared.py +510 -0
- vigil_forensic/_stubs.py +156 -0
- vigil_forensic/gate_checks/__init__.py +1 -0
- vigil_forensic/gate_checks/_ast_helpers.py +629 -0
- vigil_forensic/gate_checks/_deployment_detector.py +573 -0
- vigil_forensic/gate_checks/atomic_write_checks.py +1143 -0
- vigil_forensic/gate_checks/authority_checks.py +95 -0
- vigil_forensic/gate_checks/boundary_breach_checks.py +202 -0
- vigil_forensic/gate_checks/broad_except_checks.py +301 -0
- vigil_forensic/gate_checks/broad_except_hidden_sentinel_checks.py +365 -0
- vigil_forensic/gate_checks/common.py +253 -0
- vigil_forensic/gate_checks/config_safety_checks.py +704 -0
- vigil_forensic/gate_checks/config_ssot_checks.py +78 -0
- vigil_forensic/gate_checks/conflict_checks.py +193 -0
- vigil_forensic/gate_checks/context_fallback_checks.py +697 -0
- vigil_forensic/gate_checks/context_health_checks.py +289 -0
- vigil_forensic/gate_checks/contract_shape_drift_checks.py +459 -0
- vigil_forensic/gate_checks/dirty_baseline_check.py +274 -0
- vigil_forensic/gate_checks/duplication_checks.py +387 -0
- vigil_forensic/gate_checks/embedded_string_checks.py +123 -0
- vigil_forensic/gate_checks/empty_output_checks.py +87 -0
- vigil_forensic/gate_checks/encoding_checks.py +847 -0
- vigil_forensic/gate_checks/export_completeness_checks.py +156 -0
- vigil_forensic/gate_checks/fallback_checks.py +41 -0
- vigil_forensic/gate_checks/file_proliferation_checks.py +171 -0
- vigil_forensic/gate_checks/fix_without_test_checks.py +69 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/__init__.py +9 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/_helpers.py +71 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/advanced_checks.py +322 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/core.py +273 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/integrity_checks.py +203 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/quality_checks.py +666 -0
- vigil_forensic/gate_checks/forensic_clusters/__init__.py +193 -0
- vigil_forensic/gate_checks/forensic_clusters/allowlist.py +426 -0
- vigil_forensic/gate_checks/forensic_clusters/allowlist_writer.py +302 -0
- vigil_forensic/gate_checks/forensic_clusters/api_protocol.py +231 -0
- vigil_forensic/gate_checks/forensic_clusters/async_quality.py +1156 -0
- vigil_forensic/gate_checks/forensic_clusters/code_style.py +808 -0
- vigil_forensic/gate_checks/forensic_clusters/core.py +319 -0
- vigil_forensic/gate_checks/forensic_clusters/data_quality.py +763 -0
- vigil_forensic/gate_checks/forensic_clusters/dead_code.py +480 -0
- vigil_forensic/gate_checks/forensic_clusters/edit_mutation.py +842 -0
- vigil_forensic/gate_checks/forensic_clusters/exception_boundary.py +240 -0
- vigil_forensic/gate_checks/forensic_clusters/legacy_debt.py +556 -0
- vigil_forensic/gate_checks/forensic_clusters/static_analysis.py +834 -0
- vigil_forensic/gate_checks/forensic_clusters/structural_quality.py +298 -0
- vigil_forensic/gate_checks/god_object_zones_checks.py +173 -0
- vigil_forensic/gate_checks/hallucination_checks.py +566 -0
- vigil_forensic/gate_checks/hunter_artifact_completeness_check.py +139 -0
- vigil_forensic/gate_checks/implementation_overfit_checks.py +380 -0
- vigil_forensic/gate_checks/import_integrity_checks.py +233 -0
- vigil_forensic/gate_checks/imports_in_function_checks.py +283 -0
- vigil_forensic/gate_checks/ml_checks.py +318 -0
- vigil_forensic/gate_checks/performance_checks.py +106 -0
- vigil_forensic/gate_checks/project_specific_runner.py +691 -0
- vigil_forensic/gate_checks/provider_capability_checks.py +73 -0
- vigil_forensic/gate_checks/refactor_completeness_checks.py +274 -0
- vigil_forensic/gate_checks/reliability_checks.py +389 -0
- vigil_forensic/gate_checks/reporting_checks.py +55 -0
- vigil_forensic/gate_checks/runtime_behavior_checks.py +220 -0
- vigil_forensic/gate_checks/security_injection_checks.py +332 -0
- vigil_forensic/gate_checks/semantic_intent_checks.py +139 -0
- vigil_forensic/gate_checks/size_complexity_checks.py +336 -0
- vigil_forensic/gate_checks/stuck_feature_flag_checks.py +354 -0
- vigil_forensic/gate_checks/syntax_validity_checks.py +217 -0
- vigil_forensic/gate_checks/temporal_freshness_checks.py +79 -0
- vigil_forensic/gate_checks/test_quality_checks.py +946 -0
- vigil_forensic/gate_checks/testing_checks.py +149 -0
- vigil_forensic/gate_checks/toctou_checks.py +367 -0
- vigil_forensic/gate_checks/type_checking_checks.py +316 -0
- vigil_forensic/gate_models.py +392 -0
- vigil_forensic/gate_packs/__init__.py +1 -0
- vigil_forensic/gate_packs/universal.py +179 -0
- vigil_forensic/gate_profile.json +31 -0
- vigil_forensic/gate_registry.py +21 -0
- vigil_forensic/language_profiles.py +219 -0
- vigil_forensic/meta_findings.py +207 -0
- vigil_forensic/self_audit.py +725 -0
- vigil_forensic/source_analysis.py +175 -0
- vigil_mapper/__init__.py +103 -0
- vigil_mapper/_ast_helpers_minimal.py +229 -0
- vigil_mapper/_extract_imports_impl.py +123 -0
- vigil_mapper/_file_count_guard.py +129 -0
- vigil_mapper/_git_utils.py +178 -0
- vigil_mapper/_runtime_ast.py +438 -0
- vigil_mapper/_runtime_dispatch.py +137 -0
- vigil_mapper/_seed_helpers.py +82 -0
- vigil_mapper/authority_builder.py +1102 -0
- vigil_mapper/cli_entry.py +731 -0
- vigil_mapper/conflict_builder.py +818 -0
- vigil_mapper/data_contract_builder.py +446 -0
- vigil_mapper/findings_builder.py +716 -0
- vigil_mapper/fingerprint.py +53 -0
- vigil_mapper/hotspot_builder.py +539 -0
- vigil_mapper/map_common.py +449 -0
- vigil_mapper/map_errors.py +55 -0
- vigil_mapper/map_models.py +431 -0
- vigil_mapper/map_models_ext.py +206 -0
- vigil_mapper/map_models_findings.py +130 -0
- vigil_mapper/map_storage.py +455 -0
- vigil_mapper/parse_cache.py +795 -0
- vigil_mapper/refactor_boundary_builder.py +266 -0
- vigil_mapper/runtime_builder.py +527 -0
- vigil_mapper/runtime_tracer.py +243 -0
- vigil_mapper/runtime_tracer_entry.py +199 -0
- vigil_mapper/semantic_diff.py +71 -0
- vigil_mapper/source_adapters/__init__.py +109 -0
- vigil_mapper/source_adapters/_base.py +264 -0
- vigil_mapper/source_adapters/_ir.py +156 -0
- vigil_mapper/source_adapters/_lexer.py +309 -0
- vigil_mapper/source_adapters/_patterns.py +212 -0
- vigil_mapper/source_adapters/_treesitter.py +182 -0
- vigil_mapper/source_adapters/go.py +553 -0
- vigil_mapper/source_adapters/java.py +541 -0
- vigil_mapper/source_adapters/javascript.py +626 -0
- vigil_mapper/source_adapters/python.py +325 -0
- vigil_mapper/source_adapters/typescript.py +749 -0
- vigil_mapper/structural_builder.py +586 -0
- vigil_mcp/__init__.py +1 -0
- vigil_mcp/_jobs.py +587 -0
- vigil_mcp/_paths.py +93 -0
- vigil_mcp/forensic_server.py +419 -0
- vigil_mcp/map_server.py +452 -0
|
@@ -0,0 +1,795 @@
|
|
|
1
|
+
"""Two-level parse cache for the map builder subsystem.
|
|
2
|
+
|
|
3
|
+
L1 (ParseCacheL1): In-memory cache for a single build session.
|
|
4
|
+
L2 (ParseCacheL2): On-disk persistent cache in <project>/.cortex/.map_cache/.
|
|
5
|
+
|
|
6
|
+
Design:
|
|
7
|
+
- ParsedFile holds per-file signals extracted by AST parsing (no ast.Module —
|
|
8
|
+
not serialisable). Reused by structural, runtime, data_contract, authority
|
|
9
|
+
builders so each file is parsed at most once per build.
|
|
10
|
+
- content_hash = sha256(source_bytes).hexdigest()[:32] (first 32 hex chars of the 64-char digest)
|
|
11
|
+
- adapter_version_hash = sha256(sorted adapter repr strings)[:16]
|
|
12
|
+
- L2 cache entries live in .cortex/.map_cache/<content_hash>.json
|
|
13
|
+
- Corrupt / wrong-version entries are treated as cache misses, never raised.
|
|
14
|
+
- Thread-safety: L1 is not thread-safe (single-threaded builder loop).
|
|
15
|
+
L2 writes are atomic (tempfile + os.replace).
|
|
16
|
+
"""
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import collections
|
|
20
|
+
import hashlib
|
|
21
|
+
import json
|
|
22
|
+
import logging
|
|
23
|
+
import os
|
|
24
|
+
import tempfile
|
|
25
|
+
import time
|
|
26
|
+
from dataclasses import dataclass
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
from typing import TYPE_CHECKING
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
pass
|
|
32
|
+
|
|
33
|
+
__all__ = [
|
|
34
|
+
"ParsedFile",
|
|
35
|
+
"ParseCacheL1",
|
|
36
|
+
"ParseCacheL2",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
_log = logging.getLogger(__name__)
|
|
40
|
+
|
|
41
|
+
# Bump this when ParsedFile schema changes incompatibly.
|
|
42
|
+
_CACHE_FORMAT_VERSION = 1
|
|
43
|
+
|
|
44
|
+
# Subdirectory inside .cortex for the L2 on-disk cache.
|
|
45
|
+
_CACHE_SUBDIR = ".cortex/.map_cache"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# ---------------------------------------------------------------------------
|
|
49
|
+
# Adapter version hash — invalidates cache when parser logic changes
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
51
|
+
|
|
52
|
+
def _compute_adapter_version_hash() -> str:
    """Return a 16-char hex fingerprint of the adapters and extraction code.

    The fingerprint combines two ingredients:

    1. One line per registered adapter: extension, adapter class name and the
       four capability flags (structural, contracts, runtime, writes).
    2. Truncated SHA-256 digests of the extraction modules located next to
       this file, plus one combined digest covering every ``*.py`` file found
       directly inside the ``source_adapters/`` package directory.

    Any change to the adapter registry, to a capability flag, or to the text
    of one of those files yields a different fingerprint, so L2 entries
    written under the old fingerprint stop matching.
    """
    from .source_adapters import ADAPTERS  # noqa: PLC0415

    def _text_digest(path: Path) -> str | None:
        # Full SHA-256 hex digest of the file's decoded text, or None when the
        # file cannot be read/decoded (the caller substitutes an ERROR marker).
        try:
            text = path.read_text(encoding="utf-8")
            return hashlib.sha256(text.encode("utf-8")).hexdigest()
        except (OSError, UnicodeDecodeError):
            _log.debug("_compute_adapter_version_hash: failed to read %s", path.name)
            return None

    # --- Ingredient 1: adapter registry and capability flags ---
    fingerprint_lines: list[str] = []
    for ext in sorted(ADAPTERS):
        adapter = ADAPTERS[ext]
        flags = (
            ("structural", adapter.supports_structural),
            ("contracts", adapter.supports_contracts),
            ("runtime", adapter.supports_runtime_signals),
            ("writes", adapter.supports_authority_writes),
        )
        pieces = [str(ext), str(adapter.__class__.__name__)]
        pieces.extend(f"{label}={value!s}" for label, value in flags)
        fingerprint_lines.append("|".join(pieces))

    # --- Ingredient 2: digests of the extraction code itself ---
    package_dir = Path(__file__).parent
    code_entries: list[str] = []
    for module_name in (
        "parse_cache.py",
        "structural_builder.py",
        "runtime_builder.py",
        "data_contract_builder.py",
        "authority_builder.py",
    ):
        module_path = package_dir / module_name
        if not module_path.exists():
            # An absent module is tolerated; it still leaves a stable marker.
            code_entries.append(f"{module_name}:MISSING")
            continue
        digest = _text_digest(module_path)
        marker = "ERROR" if digest is None else digest[:8]
        code_entries.append(f"{module_name}:{marker}")

    # source_adapters/ is a package directory: hash each *.py file, then hash
    # the list of per-file digests so any adapter edit changes the result.
    adapters_dir = package_dir / "source_adapters"
    if not adapters_dir.is_dir():
        _log.warning(
            "_compute_adapter_version_hash: source_adapters/ directory missing at %s",
            adapters_dir,
        )
        code_entries.append("source_adapters_dir:MISSING")
    else:
        per_file: list[str] = []
        for adapter_file in sorted(adapters_dir.glob("*.py")):
            digest = _text_digest(adapter_file)
            per_file.append(f"{adapter_file.name}:{'ERROR' if digest is None else digest}")
        package_digest = hashlib.sha256("\n".join(per_file).encode("utf-8")).hexdigest()[:8]
        code_entries.append(f"source_adapters_dir:{package_digest}")

    fingerprint_lines.append("extraction_code:" + ",".join(code_entries))
    joined = "\n".join(sorted(fingerprint_lines))
    return hashlib.sha256(joined.encode("utf-8")).hexdigest()[:16]
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
# Computed once per process lifetime (adapters are registered at import time).
|
|
142
|
+
_ADAPTER_VERSION_HASH: str | None = None
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _get_adapter_version_hash() -> str:
    """Return the adapter version hash, computing it on first use.

    The value is stored in the module-level ``_ADAPTER_VERSION_HASH`` so that
    later calls in the same process reuse it instead of re-hashing the files.
    """
    global _ADAPTER_VERSION_HASH  # noqa: PLW0603
    memoised = _ADAPTER_VERSION_HASH
    if memoised is None:
        memoised = _compute_adapter_version_hash()
        _ADAPTER_VERSION_HASH = memoised
    return memoised
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# ---------------------------------------------------------------------------
|
|
153
|
+
# ParsedFile dataclass
|
|
154
|
+
# ---------------------------------------------------------------------------
|
|
155
|
+
|
|
156
|
+
@dataclass
class ParsedFile:
    """Signals extracted from one source file, without the ``ast.Module``.

    Every collection field is a plain ``list`` (never a tuple) so an instance
    survives a JSON round-trip unchanged; builders that want tuples convert
    when they consume the data.
    """

    # --- Structural signals ---
    imports_out: list[str]  # dotted names of the modules this file imports
    symbols_defined: list[str]  # names of classes / functions, at any nesting level

    # --- Runtime signals ---
    env_vars: list[str]  # os.environ keys the file reads
    side_effects: list[str]  # categories of import-time side effects found
    write_calls: list[str]  # write-target paths found by the AST scan

    # --- Data-contract signals ---
    entity_classes: list[str]  # names of dataclass / pydantic / NamedTuple / TypedDict classes

    # --- Meta ---
    is_parseable: bool  # False iff source had a SyntaxError
    content_hash: str  # sha256(source)[:32]
    size_lines: int  # number of lines in the source
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _parsed_file_to_dict(pf: ParsedFile) -> dict:
|
|
183
|
+
return {
|
|
184
|
+
"imports_out": pf.imports_out,
|
|
185
|
+
"symbols_defined": pf.symbols_defined,
|
|
186
|
+
"env_vars": pf.env_vars,
|
|
187
|
+
"side_effects": pf.side_effects,
|
|
188
|
+
"write_calls": pf.write_calls,
|
|
189
|
+
"entity_classes": pf.entity_classes,
|
|
190
|
+
"is_parseable": pf.is_parseable,
|
|
191
|
+
"content_hash": pf.content_hash,
|
|
192
|
+
"size_lines": pf.size_lines,
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _parsed_file_from_dict(d: dict) -> ParsedFile:
|
|
197
|
+
return ParsedFile(
|
|
198
|
+
imports_out=list(d.get("imports_out", [])),
|
|
199
|
+
symbols_defined=list(d.get("symbols_defined", [])),
|
|
200
|
+
env_vars=list(d.get("env_vars", [])),
|
|
201
|
+
side_effects=list(d.get("side_effects", [])),
|
|
202
|
+
write_calls=list(d.get("write_calls", [])),
|
|
203
|
+
entity_classes=list(d.get("entity_classes", [])),
|
|
204
|
+
is_parseable=bool(d.get("is_parseable", True)),
|
|
205
|
+
content_hash=str(d.get("content_hash", "")),
|
|
206
|
+
size_lines=int(d.get("size_lines", 0)),
|
|
207
|
+
)
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
# ---------------------------------------------------------------------------
|
|
211
|
+
# ParseCacheL2 — on-disk persistent cache
|
|
212
|
+
# ---------------------------------------------------------------------------
|
|
213
|
+
|
|
214
|
+
class ParseCacheL2:
    """On-disk JSON cache stored in <project_dir>/.cortex/.map_cache/.

    Cache key is content_hash (sha256[:32]). Each entry is a JSON file
    named <content_hash>.json containing parsed signals + a meta envelope
    for format/adapter-version validation.

    Partial failures (corrupt JSON, schema mismatch, OSError) are treated as
    cache misses on read and are logged-and-dropped on write — they never
    propagate to callers.
    """

    def __init__(self, project_dir: Path) -> None:
        """Bind the cache to ``<project_dir>/.cortex/.map_cache``.

        The directory is not created here; it is created lazily by the first
        successful write.
        """
        self._cache_dir = project_dir.resolve() / _CACHE_SUBDIR
        self._adapter_hash = _get_adapter_version_hash()
        self._hits = 0
        self._misses = 0

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def get(self, content_hash: str) -> ParsedFile | None:
        """Return cached ParsedFile for content_hash, or None on miss/error.

        An entry counts as a hit only when the file exists, parses as a JSON
        object, carries the current format version and adapter version hash,
        and has a ``signals`` object that deserialises cleanly. Every other
        outcome increments the miss counter and returns None.
        """
        entry_path = self._cache_dir / (content_hash + ".json")
        if not entry_path.exists():
            self._misses += 1
            return None

        try:
            raw = entry_path.read_text(encoding="utf-8")
            payload = json.loads(raw)
        except (OSError, json.JSONDecodeError, UnicodeDecodeError) as exc:
            _log.debug("ParseCacheL2.get: corrupt entry %s, treating as miss: %s", entry_path.name, exc)
            self._misses += 1
            return None

        if not isinstance(payload, dict):
            _log.debug("ParseCacheL2.get: non-dict payload in %s, treating as miss", entry_path.name)
            self._misses += 1
            return None

        # Validate format version
        if payload.get("format_version") != _CACHE_FORMAT_VERSION:
            _log.debug(
                "ParseCacheL2.get: format_version mismatch in %s (got %r, want %r), miss",
                entry_path.name,
                payload.get("format_version"),
                _CACHE_FORMAT_VERSION,
            )
            self._misses += 1
            return None

        # Validate adapter version
        if payload.get("adapter_version_hash") != self._adapter_hash:
            _log.debug(
                "ParseCacheL2.get: adapter_version_hash mismatch in %s, miss",
                entry_path.name,
            )
            self._misses += 1
            return None

        signals = payload.get("signals")
        if not isinstance(signals, dict):
            _log.debug("ParseCacheL2.get: missing 'signals' in %s, miss", entry_path.name)
            self._misses += 1
            return None

        try:
            pf = _parsed_file_from_dict(signals)
        except Exception as exc:
            # Deliberately broad: any malformed field value is just a miss.
            _log.debug("ParseCacheL2.get: failed to deserialise %s: %s", entry_path.name, exc)
            self._misses += 1
            return None

        self._hits += 1
        return pf

    def put(self, content_hash: str, parsed_file: ParsedFile) -> None:
        """Atomically write parsed_file to cache. Silently swallows write errors.

        Directory creation happens inside ``_atomic_write`` and therefore
        inside the guarded section: a cache directory that cannot be created
        (read-only tree, a regular file in the way, ...) only disables
        caching for this entry instead of raising into the build.
        """
        entry_path = self._cache_dir / (content_hash + ".json")
        payload = {
            "format_version": _CACHE_FORMAT_VERSION,
            "adapter_version_hash": self._adapter_hash,
            "content_hash": content_hash,
            "signals": _parsed_file_to_dict(parsed_file),
        }
        try:
            self._atomic_write(entry_path, payload)
        except Exception as exc:
            # Best-effort cache: a failed write must never fail the build.
            _log.debug("ParseCacheL2.put: failed to write %s: %s", entry_path.name, exc)

    def flush(self) -> None:
        """No-op — all writes are already atomic. Reserved for future cleanup."""

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _atomic_write(self, path: Path, payload: dict) -> None:
        """Write payload atomically via tempfile + os.replace.

        The temp file is created in the destination directory so that
        ``os.replace`` stays on one filesystem. On any failure the temp file
        is removed (best effort) and the original exception is re-raised.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        fd, tmp_path = tempfile.mkstemp(
            dir=str(path.parent),
            prefix=".pcache_",
            suffix=".tmp",
        )
        try:
            with os.fdopen(fd, "w", encoding="utf-8") as fh:
                fh.write(json.dumps(payload, indent=2, ensure_ascii=False, sort_keys=True))
                fh.write("\n")
            os.replace(tmp_path, str(path))
        except BaseException:
            # BaseException so the temp file is also cleaned up on
            # KeyboardInterrupt/SystemExit; the exception is re-raised below.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
            raise
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
# ---------------------------------------------------------------------------
|
|
335
|
+
# ParseCacheL1 — in-memory cache for one build session
|
|
336
|
+
# ---------------------------------------------------------------------------
|
|
337
|
+
|
|
338
|
+
class ParseCacheL1:
    """Per-build in-memory parse cache with an optional L2 behind it.

    Entries are keyed by ``str(abs_path)``. When ``get_or_parse()`` misses, the
    file is read and hashed, the L2 cache (if any) is consulted by content
    hash, and only then is a full AST parse performed; the result is stored
    in L1 and, after a full parse, in L2 as well.

    The raw source text is additionally kept in a bounded LRU (never written
    to L2) so that several builders can share one read of the same file.
    """

    # Default size of the source-text LRU. Builders normally read each file
    # once, so 256 retained texts cover typical projects while still capping
    # memory use on repositories with thousands of source files.
    _SOURCE_CACHE_MAX_ENTRIES: int = 256

    def __init__(
        self,
        l2: ParseCacheL2 | None = None,
        *,
        max_file_mb: float = 5.0,
        source_cache_max_entries: int | None = None,
    ) -> None:
        """Initialise the L1 in-memory cache.

        Args:
            l2: Optional L2 on-disk cache.
            max_file_mb: Size threshold in MiB. Larger files are SKIPPED:
                their text is never loaded, an empty ParsedFile is returned
                and the file is recorded in ``oversized_files``. Default is
                5.0 MiB; pass ``float('inf')`` to disable the guard.
            source_cache_max_entries: Upper bound on the number of raw source
                strings kept in the LRU text cache (values below 1 are raised
                to 1). Default: ``_SOURCE_CACHE_MAX_ENTRIES`` (256).
        """
        if source_cache_max_entries is None:
            requested_cap = self._SOURCE_CACHE_MAX_ENTRIES
        else:
            requested_cap = source_cache_max_entries

        self._l2 = l2
        self._max_file_bytes: float = max_file_mb * 1024 * 1024
        self._source_cache_max: int = max(1, requested_cap)
        self._cache: dict[str, ParsedFile] = {}  # key: str(abs_path)
        # OrderedDict used as an LRU: hits are moved to the end, and the
        # entry at the front is dropped once the cap is reached.
        self._source_cache: collections.OrderedDict[str, str] = collections.OrderedDict()

        # Files skipped by the size guard, as [{path, size_mb}]
        self.oversized_files: list[dict] = []

        # Counters
        self.total_files = 0  # number of get_or_parse() calls
        self.hits = 0  # served straight from L1
        self.misses = 0  # not in L1 (served by L2, parsed, skipped or unreadable)
        self.l2_hits = 0  # L1 misses answered by L2
        self.l2_misses = 0  # L1 misses that needed a full parse
        self.time_saved_ms: float = 0.0  # estimated parse time saved, accumulated on L2 hits

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def get_or_parse(self, abs_path: Path, project_dir: Path) -> ParsedFile:
        """Return the ParsedFile for abs_path, producing it when needed.

        Lookup order:
          1. L1 in-memory cache
          2. L2 on-disk cache, keyed by content hash
          3. Full AST parse, whose result goes into L1 and L2

        Oversized and unreadable files yield an empty ParsedFile that is
        remembered in L1. For every file that is actually read, the source
        text is kept for later retrieval via get_cached_source().
        """
        self.total_files += 1
        cache_key = str(abs_path)

        # --- L1 hit ---
        if cache_key in self._cache:
            self.hits += 1
            _log.debug("ParseCacheL1: L1 hit for %s", abs_path.name)
            return self._cache[cache_key]

        self.misses += 1
        started = time.perf_counter()

        # --- File-size guard: a cheap stat() before reading anything ---
        try:
            size_bytes = abs_path.stat().st_size
        except OSError:
            # Treat as size 0; the read below reports the real problem.
            size_bytes = 0
        if size_bytes > self._max_file_bytes:
            bytes_per_mib = 1024 * 1024
            size_mb = size_bytes / bytes_per_mib
            _log.warning(
                "ParseCacheL1: skipping oversized file %s (%.1f MiB > %.1f MiB limit)",
                abs_path, size_mb, self._max_file_bytes / bytes_per_mib,
            )
            self.oversized_files.append({"path": str(abs_path), "size_mb": round(size_mb, 3)})
            return self._store_placeholder(cache_key)

        # --- Read source ---
        try:
            source = abs_path.read_text(encoding="utf-8", errors="replace")
        except OSError as exc:
            _log.warning("ParseCacheL1.get_or_parse: cannot read %s: %s", abs_path, exc)
            return self._store_placeholder(cache_key)

        self._remember_source(cache_key, source)
        content_hash = hashlib.sha256(source.encode("utf-8")).hexdigest()[:32]

        # --- L2 hit ---
        from_disk = None if self._l2 is None else self._l2.get(content_hash)
        if from_disk is not None:
            self.l2_hits += 1
            elapsed_ms = (time.perf_counter() - started) * 1000
            self.time_saved_ms += _estimate_parse_time_ms(source) - elapsed_ms
            _log.debug("ParseCacheL1: L2 hit for %s (hash=%s)", abs_path.name, content_hash)
            self._cache[cache_key] = from_disk
            return from_disk

        # --- Full parse ---
        self.l2_misses += 1
        parsed = _parse_file(source, content_hash, abs_path, project_dir)
        _log.debug("ParseCacheL1: parsed %s (%d lines)", abs_path.name, parsed.size_lines)

        self._cache[cache_key] = parsed
        if self._l2 is not None:
            self._l2.put(content_hash, parsed)
        return parsed

    def get_cached_source(self, abs_path: Path) -> str | None:
        """Return the retained source text for abs_path, or None if absent.

        Lets the runtime/data_contract builders reuse text that
        get_or_parse() already read. A hit also marks the entry as
        most-recently-used, which delays its eviction from the LRU.
        """
        cache_key = str(abs_path)
        text = self._source_cache.get(cache_key)
        if text is None:
            return None
        self._source_cache.move_to_end(cache_key)
        return text

    def log_stats(self) -> None:
        """Log hit/miss stats at INFO level."""
        calls = self.total_files
        l1_rate = self.hits / calls * 100 if calls > 0 else 0.0
        l2_rate = self.l2_hits / self.misses * 100 if self.misses > 0 else 0.0
        _log.info(
            "ParseCacheL1 stats: total=%d L1_hits=%d (%.0f%%) "
            "L2_hits=%d (%.0f%% of L1-misses) full_parses=%d "
            "estimated_saved=%.0fms",
            calls,
            self.hits,
            l1_rate,
            self.l2_hits,
            l2_rate,
            self.l2_misses,
            self.time_saved_ms,
        )

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _store_placeholder(self, cache_key: str) -> ParsedFile:
        """Cache and return an empty ParsedFile for a skipped/unreadable file."""
        placeholder = _empty_parsed_file("")
        self._cache[cache_key] = placeholder
        return placeholder

    def _remember_source(self, cache_key: str, source: str) -> None:
        """Insert source into the bounded LRU, evicting the oldest if full."""
        is_new_entry = cache_key not in self._source_cache
        if is_new_entry and len(self._source_cache) >= self._source_cache_max:
            self._source_cache.popitem(last=False)  # drop least-recently used
        self._source_cache[cache_key] = source
        self._source_cache.move_to_end(cache_key)  # now most-recently used
|
|
506
|
+
|
|
507
|
+
|
|
508
|
+
# ---------------------------------------------------------------------------
|
|
509
|
+
# Parsing implementation (no ast.Module stored in ParsedFile)
|
|
510
|
+
# ---------------------------------------------------------------------------
|
|
511
|
+
|
|
512
|
+
def _empty_parsed_file(content_hash: str) -> ParsedFile:
    """Build a signal-free ParsedFile, used for files that could not be read.

    Every list field is a new empty list, ``is_parseable`` is False and
    ``size_lines`` is 0; only ``content_hash`` comes from the caller.
    """
    list_field_names = (
        "imports_out",
        "symbols_defined",
        "env_vars",
        "side_effects",
        "write_calls",
        "entity_classes",
    )
    # A fresh list per field, so the fields never share one list object.
    empty_lists = {name: [] for name in list_field_names}
    return ParsedFile(
        **empty_lists,
        is_parseable=False,
        content_hash=content_hash,
        size_lines=0,
    )
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
def _estimate_parse_time_ms(source: str) -> float:
|
|
528
|
+
"""Rough estimate of AST parse time based on file size.
|
|
529
|
+
|
|
530
|
+
Used only for time_saved_ms accounting — not a hard measurement.
|
|
531
|
+
Empirically: ~1ms per 200 lines on modern hardware.
|
|
532
|
+
"""
|
|
533
|
+
lines = source.count("\n") + 1
|
|
534
|
+
return max(1.0, lines / 200.0)
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
def _parse_file(
    source: str,
    content_hash: str,
    abs_path: Path,
    project_dir: Path,
) -> ParsedFile:
    """Extract all signals from source via AST. Never raises on unparseable input.

    Args:
        source: Full text of the file.
        content_hash: Hash of ``source``; stored on the result unchanged.
        abs_path: Path of the file. Not used by the extraction itself.
        project_dir: Project root. Not used by the extraction itself.

    Returns:
        A ParsedFile with every signal list populated, or — when the source
        cannot be turned into an AST — one with empty lists and
        ``is_parseable=False`` (``size_lines`` is still reported).
    """
    import ast  # noqa: PLC0415

    # A trailing newline terminates the last line instead of opening a new one.
    size_lines = source.count("\n") + (1 if source and not source.endswith("\n") else 0)

    # --- Parseability check ---
    try:
        tree = ast.parse(source)
    except (SyntaxError, ValueError, RecursionError):
        # SyntaxError: invalid code. ValueError: source with NUL bytes on
        # interpreters that reject them that way. RecursionError: nesting too
        # deep for the parser. All three mean "no AST", not "abort the build".
        return ParsedFile(
            imports_out=[],
            symbols_defined=[],
            env_vars=[],
            side_effects=[],
            write_calls=[],
            entity_classes=[],
            is_parseable=False,
            content_hash=content_hash,
            size_lines=size_lines,
        )

    # --- Imports ---
    imports_out: list[str] = _extract_imports_out(tree, source)

    # --- Symbols defined (classes and functions at any nesting level) ---
    symbols_defined: list[str] = [
        node.name
        for node in ast.walk(tree)
        if isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef))
    ]

    # --- Env vars ---
    env_vars: list[str] = _extract_env_vars(tree)

    # --- Side effects (import-time call statements) ---
    side_effects: list[str] = _extract_side_effects(tree)

    # --- Write calls ---
    write_calls: list[str] = _extract_write_calls(tree)

    # --- Entity classes (dataclass / pydantic / NamedTuple / TypedDict) ---
    entity_classes: list[str] = _extract_entity_classes(tree)

    return ParsedFile(
        imports_out=imports_out,
        symbols_defined=symbols_defined,
        env_vars=env_vars,
        side_effects=side_effects,
        write_calls=write_calls,
        entity_classes=entity_classes,
        is_parseable=True,
        content_hash=content_hash,
        size_lines=size_lines,
    )
|
|
598
|
+
|
|
599
|
+
|
|
600
|
+
def _extract_imports_out(tree: "ast.Module", source: str) -> list[str]:
    """Return every distinct import target in *tree*, in first-seen walk order.

    ``import a.b`` contributes ``a.b``.  ``from X import Y`` contributes ``X``
    and the candidate ``X.Y``.  Relative imports are spelled with their leading
    dots: ``from .m import y`` contributes ``.m`` and ``from . import y``
    contributes ``.y``.  Star imports add no ``X.*`` candidate.

    *source* is accepted but not read by this function.
    """
    import ast  # noqa: PLC0415

    def _targets(stmt: ast.AST) -> list[str]:
        # Candidate names contributed by one node (empty for non-imports).
        if isinstance(stmt, ast.Import):
            return [alias.name for alias in stmt.names]
        if not isinstance(stmt, ast.ImportFrom):
            return []
        explicit = [alias.name for alias in stmt.names if alias.name != "*"]
        if stmt.level == 0 and stmt.module:
            # Absolute import: the module, then 'module.name' candidates for
            # sub-module resolution.
            return [stmt.module] + ["%s.%s" % (stmt.module, item) for item in explicit]
        if stmt.level > 0:
            prefix = "." * stmt.level
            if stmt.module:
                return [prefix + stmt.module]
            return [prefix + item for item in explicit]
        return []

    # dict keys give de-duplication while keeping insertion order.
    ordered: dict[str, None] = {}
    for node in ast.walk(tree):
        for candidate in _targets(node):
            if candidate:
                ordered.setdefault(candidate, None)
    return list(ordered)
|
|
635
|
+
|
|
636
|
+
|
|
637
|
+
# Dotted attribute chains that read an environment variable, one tuple per
# call form.  NOTE(review): not referenced by the code visible in this section.
_ENV_CALL_PATTERNS = frozenset(
    [
        ("os", "environ", "get"),  # os.environ.get(...)
        ("os", "getenv"),  # os.getenv(...)
    ]
)
|
|
641
|
+
|
|
642
|
+
|
|
643
|
+
def _extract_env_vars(tree: "ast.Module") -> list[str]:
    """Collect the literal string keys of environment-variable reads in *tree*.

    Recognised forms are ``os.environ.get("KEY")``, ``os.getenv("KEY")`` and
    ``os.environ["KEY"]``, matched only when spelled through the bare name
    ``os``.  Keys from the two call forms come first, in walk order, followed
    by subscript keys not already seen.  Non-literal keys are ignored.
    """
    import ast  # noqa: PLC0415

    def _is_os_attr(expr: ast.AST, attr: str) -> bool:
        # True for the expression ``os.<attr>``.
        return (
            isinstance(expr, ast.Attribute)
            and expr.attr == attr
            and isinstance(expr.value, ast.Name)
            and expr.value.id == "os"
        )

    def _record(expr: ast.AST) -> None:
        # Keep *expr* only when it is a string literal; first sighting wins.
        if isinstance(expr, ast.Constant) and isinstance(expr.value, str):
            keys.setdefault(expr.value, None)

    # dict keys give de-duplication while keeping insertion order.
    keys: dict[str, None] = {}

    # Pass 1: os.environ.get("KEY") and os.getenv("KEY").
    for call in ast.walk(tree):
        if not isinstance(call, ast.Call):
            continue
        callee = call.func
        via_environ_get = (
            isinstance(callee, ast.Attribute)
            and callee.attr == "get"
            and _is_os_attr(callee.value, "environ")
        )
        if (via_environ_get or _is_os_attr(callee, "getenv")) and call.args:
            _record(call.args[0])

    # Pass 2: os.environ["KEY"].  The slice is inspected directly, so a key
    # wrapped in the pre-3.9 ``Index`` node is not matched.
    for sub in ast.walk(tree):
        if isinstance(sub, ast.Subscript) and _is_os_attr(sub.value, "environ"):
            _record(sub.slice)

    return list(keys)
|
|
699
|
+
|
|
700
|
+
|
|
701
|
+
def _extract_side_effects(tree: "ast.Module") -> list[str]:
    """Classify the bare call statements found directly in the module body.

    Only top-level expression statements whose value is a call are considered.
    A call named ``Thread``, ``Process`` or ``create_task`` is labelled
    ``background_task``; every other call is labelled
    ``import_time_side_effects``.  Each label appears at most once, in
    first-seen order.
    """
    import ast  # noqa: PLC0415

    background_callees = ("Thread", "Process", "create_task")
    labels: list[str] = []

    for statement in getattr(tree, "body", []):
        if not isinstance(statement, ast.Expr):
            continue
        call = statement.value
        if not isinstance(call, ast.Call):
            continue

        # Use the bare name, or the last attribute segment of a dotted callee.
        callee = call.func
        if isinstance(callee, ast.Name):
            callee_name = callee.id
        elif isinstance(callee, ast.Attribute):
            callee_name = callee.attr
        else:
            callee_name = ""

        if callee_name in background_callees:
            label = "background_task"
        else:
            label = "import_time_side_effects"
        if label not in labels:
            labels.append(label)

    return labels
|
|
732
|
+
|
|
733
|
+
|
|
734
|
+
def _extract_write_calls(tree: "ast.Module") -> list[str]:
    """List the literal paths written via ``Path("...").write_text()`` and kin.

    A call is reported when the method is ``write_text``, ``write_bytes`` or
    ``save`` and its receiver is itself a call to ``Path``, ``PurePath``,
    ``PosixPath`` or ``WindowsPath`` (bare or attribute-qualified) whose first
    positional argument is a string literal.  Results are de-duplicated and
    kept in walk order.
    """
    import ast  # noqa: PLC0415

    writer_methods = ("write_text", "write_bytes", "save")
    path_constructors = ("Path", "PurePath", "PosixPath", "WindowsPath")

    # dict keys give de-duplication while keeping insertion order.
    targets: dict[str, None] = {}

    for call in ast.walk(tree):
        if not isinstance(call, ast.Call):
            continue
        method = call.func
        if not isinstance(method, ast.Attribute) or method.attr not in writer_methods:
            continue

        # The receiver must be a constructor call, e.g. Path("literal").
        receiver = method.value
        if not isinstance(receiver, ast.Call) or not receiver.args:
            continue
        constructor = receiver.func
        if isinstance(constructor, ast.Name):
            constructor_name = constructor.id
        else:
            constructor_name = getattr(constructor, "attr", "")
        if constructor_name not in path_constructors:
            continue

        first_arg = receiver.args[0]
        if isinstance(first_arg, ast.Constant) and isinstance(first_arg.value, str):
            targets.setdefault(first_arg.value, None)

    return list(targets)
|
|
762
|
+
|
|
763
|
+
|
|
764
|
+
def _extract_entity_classes(tree: "ast.Module") -> list[str]:
    """Return names of dataclass / pydantic / NamedTuple / TypedDict classes.

    A class qualifies when it is decorated with ``dataclass`` or
    ``dataclasses.dataclass`` -- bare or called with arguments, e.g.
    ``@dataclass(frozen=True)`` -- or when one of its direct bases is
    ``NamedTuple``, ``TypedDict`` or ``BaseModel`` (optionally qualified with
    ``typing.`` / ``pydantic.``).  Matching is purely by spelling; aliases are
    not resolved.

    Returns:
        Matching class names in ``ast.walk`` order, at any nesting depth.
    """
    import ast  # noqa: PLC0415

    dataclass_decorators = frozenset({"dataclass", "dataclasses.dataclass"})
    entity_bases = frozenset({
        "NamedTuple", "typing.NamedTuple",
        "TypedDict", "typing.TypedDict",
        "BaseModel", "pydantic.BaseModel",
    })

    def _name_of(node: ast.expr) -> str:
        # Dotted spelling of a Name / Attribute chain; "" for anything else.
        if isinstance(node, ast.Name):
            return node.id
        if isinstance(node, ast.Attribute):
            return "%s.%s" % (_name_of(node.value), node.attr)
        return ""

    def _decorator_name(decorator: ast.expr) -> str:
        # A parameterised decorator is a Call node; its name lives on .func.
        # Unwrapped for decorators only, so call expressions used as bases
        # keep their previous (unmatched) treatment.
        if isinstance(decorator, ast.Call):
            decorator = decorator.func
        return _name_of(decorator)

    names: list[str] = []
    for node in ast.walk(tree):
        if not isinstance(node, ast.ClassDef):
            continue
        is_dataclass = any(
            _decorator_name(decorator) in dataclass_decorators
            for decorator in node.decorator_list
        )
        if is_dataclass or any(_name_of(base) in entity_bases for base in node.bases):
            names.append(node.name)

    return names
|