vigil-codeintel 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vigil_codeintel-0.1.0.dist-info/METADATA +780 -0
- vigil_codeintel-0.1.0.dist-info/RECORD +131 -0
- vigil_codeintel-0.1.0.dist-info/WHEEL +5 -0
- vigil_codeintel-0.1.0.dist-info/entry_points.txt +3 -0
- vigil_codeintel-0.1.0.dist-info/licenses/LICENSE +21 -0
- vigil_codeintel-0.1.0.dist-info/top_level.txt +3 -0
- vigil_forensic/__init__.py +224 -0
- vigil_forensic/_git_utils.py +178 -0
- vigil_forensic/_shared.py +510 -0
- vigil_forensic/_stubs.py +156 -0
- vigil_forensic/gate_checks/__init__.py +1 -0
- vigil_forensic/gate_checks/_ast_helpers.py +629 -0
- vigil_forensic/gate_checks/_deployment_detector.py +573 -0
- vigil_forensic/gate_checks/atomic_write_checks.py +1143 -0
- vigil_forensic/gate_checks/authority_checks.py +95 -0
- vigil_forensic/gate_checks/boundary_breach_checks.py +202 -0
- vigil_forensic/gate_checks/broad_except_checks.py +301 -0
- vigil_forensic/gate_checks/broad_except_hidden_sentinel_checks.py +365 -0
- vigil_forensic/gate_checks/common.py +253 -0
- vigil_forensic/gate_checks/config_safety_checks.py +704 -0
- vigil_forensic/gate_checks/config_ssot_checks.py +78 -0
- vigil_forensic/gate_checks/conflict_checks.py +193 -0
- vigil_forensic/gate_checks/context_fallback_checks.py +697 -0
- vigil_forensic/gate_checks/context_health_checks.py +289 -0
- vigil_forensic/gate_checks/contract_shape_drift_checks.py +459 -0
- vigil_forensic/gate_checks/dirty_baseline_check.py +274 -0
- vigil_forensic/gate_checks/duplication_checks.py +387 -0
- vigil_forensic/gate_checks/embedded_string_checks.py +123 -0
- vigil_forensic/gate_checks/empty_output_checks.py +87 -0
- vigil_forensic/gate_checks/encoding_checks.py +847 -0
- vigil_forensic/gate_checks/export_completeness_checks.py +156 -0
- vigil_forensic/gate_checks/fallback_checks.py +41 -0
- vigil_forensic/gate_checks/file_proliferation_checks.py +171 -0
- vigil_forensic/gate_checks/fix_without_test_checks.py +69 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/__init__.py +9 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/_helpers.py +71 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/advanced_checks.py +322 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/core.py +273 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/integrity_checks.py +203 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/quality_checks.py +666 -0
- vigil_forensic/gate_checks/forensic_clusters/__init__.py +193 -0
- vigil_forensic/gate_checks/forensic_clusters/allowlist.py +426 -0
- vigil_forensic/gate_checks/forensic_clusters/allowlist_writer.py +302 -0
- vigil_forensic/gate_checks/forensic_clusters/api_protocol.py +231 -0
- vigil_forensic/gate_checks/forensic_clusters/async_quality.py +1156 -0
- vigil_forensic/gate_checks/forensic_clusters/code_style.py +808 -0
- vigil_forensic/gate_checks/forensic_clusters/core.py +319 -0
- vigil_forensic/gate_checks/forensic_clusters/data_quality.py +763 -0
- vigil_forensic/gate_checks/forensic_clusters/dead_code.py +480 -0
- vigil_forensic/gate_checks/forensic_clusters/edit_mutation.py +842 -0
- vigil_forensic/gate_checks/forensic_clusters/exception_boundary.py +240 -0
- vigil_forensic/gate_checks/forensic_clusters/legacy_debt.py +556 -0
- vigil_forensic/gate_checks/forensic_clusters/static_analysis.py +834 -0
- vigil_forensic/gate_checks/forensic_clusters/structural_quality.py +298 -0
- vigil_forensic/gate_checks/god_object_zones_checks.py +173 -0
- vigil_forensic/gate_checks/hallucination_checks.py +566 -0
- vigil_forensic/gate_checks/hunter_artifact_completeness_check.py +139 -0
- vigil_forensic/gate_checks/implementation_overfit_checks.py +380 -0
- vigil_forensic/gate_checks/import_integrity_checks.py +233 -0
- vigil_forensic/gate_checks/imports_in_function_checks.py +283 -0
- vigil_forensic/gate_checks/ml_checks.py +318 -0
- vigil_forensic/gate_checks/performance_checks.py +106 -0
- vigil_forensic/gate_checks/project_specific_runner.py +691 -0
- vigil_forensic/gate_checks/provider_capability_checks.py +73 -0
- vigil_forensic/gate_checks/refactor_completeness_checks.py +274 -0
- vigil_forensic/gate_checks/reliability_checks.py +389 -0
- vigil_forensic/gate_checks/reporting_checks.py +55 -0
- vigil_forensic/gate_checks/runtime_behavior_checks.py +220 -0
- vigil_forensic/gate_checks/security_injection_checks.py +332 -0
- vigil_forensic/gate_checks/semantic_intent_checks.py +139 -0
- vigil_forensic/gate_checks/size_complexity_checks.py +336 -0
- vigil_forensic/gate_checks/stuck_feature_flag_checks.py +354 -0
- vigil_forensic/gate_checks/syntax_validity_checks.py +217 -0
- vigil_forensic/gate_checks/temporal_freshness_checks.py +79 -0
- vigil_forensic/gate_checks/test_quality_checks.py +946 -0
- vigil_forensic/gate_checks/testing_checks.py +149 -0
- vigil_forensic/gate_checks/toctou_checks.py +367 -0
- vigil_forensic/gate_checks/type_checking_checks.py +316 -0
- vigil_forensic/gate_models.py +392 -0
- vigil_forensic/gate_packs/__init__.py +1 -0
- vigil_forensic/gate_packs/universal.py +179 -0
- vigil_forensic/gate_profile.json +31 -0
- vigil_forensic/gate_registry.py +21 -0
- vigil_forensic/language_profiles.py +219 -0
- vigil_forensic/meta_findings.py +207 -0
- vigil_forensic/self_audit.py +725 -0
- vigil_forensic/source_analysis.py +175 -0
- vigil_mapper/__init__.py +103 -0
- vigil_mapper/_ast_helpers_minimal.py +229 -0
- vigil_mapper/_extract_imports_impl.py +123 -0
- vigil_mapper/_file_count_guard.py +129 -0
- vigil_mapper/_git_utils.py +178 -0
- vigil_mapper/_runtime_ast.py +438 -0
- vigil_mapper/_runtime_dispatch.py +137 -0
- vigil_mapper/_seed_helpers.py +82 -0
- vigil_mapper/authority_builder.py +1102 -0
- vigil_mapper/cli_entry.py +731 -0
- vigil_mapper/conflict_builder.py +818 -0
- vigil_mapper/data_contract_builder.py +446 -0
- vigil_mapper/findings_builder.py +716 -0
- vigil_mapper/fingerprint.py +53 -0
- vigil_mapper/hotspot_builder.py +539 -0
- vigil_mapper/map_common.py +449 -0
- vigil_mapper/map_errors.py +55 -0
- vigil_mapper/map_models.py +431 -0
- vigil_mapper/map_models_ext.py +206 -0
- vigil_mapper/map_models_findings.py +130 -0
- vigil_mapper/map_storage.py +455 -0
- vigil_mapper/parse_cache.py +795 -0
- vigil_mapper/refactor_boundary_builder.py +266 -0
- vigil_mapper/runtime_builder.py +527 -0
- vigil_mapper/runtime_tracer.py +243 -0
- vigil_mapper/runtime_tracer_entry.py +199 -0
- vigil_mapper/semantic_diff.py +71 -0
- vigil_mapper/source_adapters/__init__.py +109 -0
- vigil_mapper/source_adapters/_base.py +264 -0
- vigil_mapper/source_adapters/_ir.py +156 -0
- vigil_mapper/source_adapters/_lexer.py +309 -0
- vigil_mapper/source_adapters/_patterns.py +212 -0
- vigil_mapper/source_adapters/_treesitter.py +182 -0
- vigil_mapper/source_adapters/go.py +553 -0
- vigil_mapper/source_adapters/java.py +541 -0
- vigil_mapper/source_adapters/javascript.py +626 -0
- vigil_mapper/source_adapters/python.py +325 -0
- vigil_mapper/source_adapters/typescript.py +749 -0
- vigil_mapper/structural_builder.py +586 -0
- vigil_mcp/__init__.py +1 -0
- vigil_mcp/_jobs.py +587 -0
- vigil_mcp/_paths.py +93 -0
- vigil_mcp/forensic_server.py +419 -0
- vigil_mcp/map_server.py +452 -0
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
"""ML/NN correctness forensic checks (static AST, Python-only).
|
|
2
|
+
|
|
3
|
+
Catches machine-learning / quant-trading bugs that generic linters miss and that
|
|
4
|
+
are catastrophic in backtests and live trading:
|
|
5
|
+
|
|
6
|
+
ml.lookahead_negative_shift -- .shift(-N): future data leaks into the present row
|
|
7
|
+
ml.nondeterministic_split -- train_test_split(...) with no random_state
|
|
8
|
+
ml.scaler_fit_on_test -- .fit()/.fit_transform() on a *_test / *_val array
|
|
9
|
+
ml.missing_random_seed -- module uses RNG but never seeds it
|
|
10
|
+
|
|
11
|
+
Pure AST over file snapshots — never executes the model. Conservative by design:
|
|
12
|
+
prefers a missed case over a false alarm (these run on any Python repo, most of
|
|
13
|
+
which is not ML code).
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import ast
|
|
18
|
+
import logging
|
|
19
|
+
|
|
20
|
+
from vigil_forensic._shared import (
|
|
21
|
+
EvidenceReference,
|
|
22
|
+
GateCategory,
|
|
23
|
+
GateImpact,
|
|
24
|
+
GateSeverity,
|
|
25
|
+
RepairKind,
|
|
26
|
+
)
|
|
27
|
+
from vigil_forensic.gate_checks.common import build_check_result, build_finding
|
|
28
|
+
|
|
29
|
+
_log = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# --------------------------------------------------------------------------
|
|
33
|
+
# helpers
|
|
34
|
+
# --------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
def _negative_int(node: ast.AST) -> int | None:
|
|
37
|
+
"""Return the negative int value of *node* if it is a negative int literal."""
|
|
38
|
+
if (
|
|
39
|
+
isinstance(node, ast.UnaryOp)
|
|
40
|
+
and isinstance(node.op, ast.USub)
|
|
41
|
+
and isinstance(node.operand, ast.Constant)
|
|
42
|
+
and isinstance(node.operand.value, int)
|
|
43
|
+
and not isinstance(node.operand.value, bool)
|
|
44
|
+
):
|
|
45
|
+
return -node.operand.value
|
|
46
|
+
if (
|
|
47
|
+
isinstance(node, ast.Constant)
|
|
48
|
+
and isinstance(node.value, int)
|
|
49
|
+
and not isinstance(node.value, bool)
|
|
50
|
+
and node.value < 0
|
|
51
|
+
):
|
|
52
|
+
return node.value
|
|
53
|
+
return None
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _shift_negative_period(call: ast.Call) -> int | None:
    """Extract the negative period from a ``.shift(...)`` call, if there is one.

    The first positional argument is inspected first, followed by every
    ``periods=`` keyword in source order. The first expression that
    ``_negative_int`` resolves to a value is returned; ``None`` means no
    candidate resolved.
    """
    # Positional period (at most one expression) goes ahead of keyword forms.
    candidates = list(call.args[:1])
    candidates.extend(kw.value for kw in call.keywords if kw.arg == "periods")
    for expr in candidates:
        period = _negative_int(expr)
        if period is not None:
            return period
    return None
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _arg_name_lower(node: ast.AST) -> str:
    """Return a lowercase identifier describing an argument expression.

    * ``name``        -> ``"name"``
    * ``obj.attr``    -> ``"attr"`` (only the final attribute is used)
    * ``name[...]``   -> ``"name"`` (only when the subscripted value is a
      plain name)

    Every other expression shape yields the empty string.
    """
    if isinstance(node, ast.Subscript):
        base = node.value
        return base.id.lower() if isinstance(base, ast.Name) else ""
    if isinstance(node, ast.Attribute):
        return node.attr.lower()
    return node.id.lower() if isinstance(node, ast.Name) else ""
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# --------------------------------------------------------------------------
|
|
82
|
+
# check 1: look-ahead via negative shift
|
|
83
|
+
# --------------------------------------------------------------------------
|
|
84
|
+
|
|
85
|
+
def _check_lookahead_shift(path: str, tree: ast.AST) -> list:
    """Report each ``.shift(...)`` method call that uses a negative period.

    Walks *tree* and, for every attribute call named ``shift`` whose period
    resolves through ``_shift_negative_period``, builds one
    ``ml.lookahead_negative_shift`` finding (HIGH severity, REVISE impact,
    allowlistable) pointing at *path* and the call's line number.

    The receiver of ``.shift`` is not inspected, so any object exposing such
    a method is treated alike.

    Returns the findings in the order ``ast.walk`` visits the calls; the list
    is empty when nothing matches.
    """
    shift_calls = (
        candidate
        for candidate in ast.walk(tree)
        if isinstance(candidate, ast.Call)
        and isinstance(candidate.func, ast.Attribute)
        and candidate.func.attr == "shift"
    )
    results = []
    for call in shift_calls:
        period = _shift_negative_period(call)
        if period is None:
            continue
        # Nodes that carry no position info fall back to line 0.
        line_no = int(getattr(call, "lineno", 0) or 0)
        location = f"{path}:{line_no}"
        evidence_ref = EvidenceReference(
            kind="file", path=str(path), detail=f"line:{line_no} shift({period})",
        )
        results.append(build_finding(
            check_id="ml.lookahead_negative_shift",
            category=GateCategory.ML,
            title=f"Negative .shift({period}) leaks future data in {location}",
            severity=GateSeverity.HIGH,
            impact=GateImpact.REVISE,
            summary=(
                f".shift({period}) at {location} moves a series BACKWARD, exposing future "
                "values to the current row. This is look-ahead bias: it inflates "
                "backtest/validation metrics and silently fails in live use."
            ),
            recommendation=(
                "Features must only see past data: use a forward (positive) shift, "
                "or if this is target construction, ensure the model never receives "
                "the shifted future column as an input feature."
            ),
            evidence=[evidence_ref],
            repair_kind=RepairKind.FIX_CONTRACT.value,
            executor_action="Replace negative shift with a causal (positive) shift or isolate target alignment.",
            proof_required="No negative .shift() feeds a feature column; backtest uses only past data.",
            allowlist_allowed=True,
        ))
    return results
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
# --------------------------------------------------------------------------
|
|
124
|
+
# check 2: non-deterministic train_test_split
|
|
125
|
+
# --------------------------------------------------------------------------
|
|
126
|
+
|
|
127
|
+
def _is_named_call(node: ast.Call, name: str) -> bool:
    """Return True when *node* calls ``name(...)`` or ``<expr>.name(...)``.

    Only the final identifier is compared; the object or module the
    attribute is read from is not inspected.
    """
    f = node.func
    return (isinstance(f, ast.Name) and f.id == name) or (
        isinstance(f, ast.Attribute) and f.attr == name
    )


def _split_may_be_reproducible(call: ast.Call) -> bool:
    """Return True when *call* cannot be shown to shuffle without a seed.

    A split is given the benefit of the doubt when any of these hold:

    * a ``random_state=`` keyword is present (its value is not inspected);
    * ``shuffle=False`` is passed literally, in which case no shuffling
      happens and no seed is needed;
    * a ``**mapping`` is unpacked into the call, since ``random_state`` may
      be forwarded through it and this cannot be decided statically.
    """
    for kw in call.keywords:
        if kw.arg is None:  # ``**opts`` unpacking has no keyword name
            return True
        if kw.arg == "random_state":
            return True
        if (
            kw.arg == "shuffle"
            and isinstance(kw.value, ast.Constant)
            and kw.value.value is False
        ):
            return True
    return False


def _check_nondeterministic_split(path: str, tree: ast.AST) -> list:
    """Flag ``train_test_split`` calls that shuffle without a ``random_state``.

    Every call in *tree* whose final identifier is ``train_test_split`` is
    examined. Calls accepted by ``_split_may_be_reproducible`` are skipped,
    in keeping with this module's preference for a missed case over a false
    alarm. Each remaining call yields one ``ml.nondeterministic_split``
    finding (MEDIUM severity, REVISE impact, allowlistable).

    Returns the list of findings, empty when nothing is flagged.
    """
    findings = []
    for node in ast.walk(tree):
        if not isinstance(node, ast.Call) or not _is_named_call(node, "train_test_split"):
            continue
        if _split_may_be_reproducible(node):
            continue
        # Nodes that carry no position info fall back to line 0.
        ln = int(getattr(node, "lineno", 0) or 0)
        findings.append(build_finding(
            check_id="ml.nondeterministic_split",
            category=GateCategory.ML,
            title=f"train_test_split without random_state in {path}:{ln}",
            severity=GateSeverity.MEDIUM,
            impact=GateImpact.REVISE,
            summary=(
                f"train_test_split at {path}:{ln} has no random_state. The split is "
                "non-reproducible: every run shuffles differently, so metrics, "
                "hyperparameter choices, and bug reports cannot be reproduced."
            ),
            recommendation="Pass an explicit random_state=<int> for a reproducible split.",
            evidence=[EvidenceReference(
                kind="file", path=str(path), detail=f"line:{ln}",
            )],
            repair_kind=RepairKind.FIX_CONTRACT.value,
            executor_action="Add random_state=<int> to train_test_split.",
            proof_required="train_test_split carries an explicit random_state.",
            allowlist_allowed=True,
        ))
    return findings
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
# --------------------------------------------------------------------------
|
|
167
|
+
# check 3: scaler / transformer fit on test or validation data (leakage)
|
|
168
|
+
# --------------------------------------------------------------------------
|
|
169
|
+
|
|
170
|
+
# Method names whose first positional argument is inspected for eval-split names.
_FIT_METHODS = frozenset({"fit", "fit_transform"})
# Name pieces that mark an evaluation split. They are compared against whole
# underscore-separated pieces of the argument name, never as raw substrings.
_LEAK_TOKENS = ("test", "val", "valid", "validation", "eval", "holdout", "oot")


def _names_eval_split(argname: str) -> bool:
    """Return True when *argname* reads like a test/validation split name.

    The name is lowercased and split on underscores, and digits are stripped
    from both ends of each piece. A piece matches when it equals one of
    ``_LEAK_TOKENS``, optionally with a single leading ``x`` or ``y`` glued
    on (``xtest``, ``yval``).

    Whole-piece matching is deliberate: a raw substring test would also hit
    ``values`` ("val"), ``latest`` ("test"), ``root`` ("oot") and
    ``interval`` ("val"), none of which name an evaluation split. Glued
    spellings such as ``testset`` are knowingly missed.
    """
    for piece in argname.lower().split("_"):
        piece = piece.strip("0123456789")
        if piece in _LEAK_TOKENS:
            return True
        if piece[:1] in ("x", "y") and piece[1:] in _LEAK_TOKENS:
            return True
    return False


def _check_scaler_fit_on_test(path: str, tree: ast.AST) -> list:
    """Flag ``.fit()`` / ``.fit_transform()`` calls made on eval-split data.

    For every attribute call in *tree* whose method name is in
    ``_FIT_METHODS`` and that has at least one positional argument, the name
    of the first argument is taken via ``_arg_name_lower``. When that name
    satisfies ``_names_eval_split`` one ``ml.scaler_fit_on_test`` finding is
    produced (HIGH severity, REVISE impact, allowlistable).

    Keyword-only calls and arguments with no recoverable name are skipped.

    Returns the list of findings, empty when nothing is flagged.
    """
    findings = []
    for node in ast.walk(tree):
        if not (isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute)):
            continue
        if node.func.attr not in _FIT_METHODS or not node.args:
            continue
        argname = _arg_name_lower(node.args[0])
        if not argname:
            continue
        # Whole-piece match only (x_test, test_x, X_val, xtest); see
        # _names_eval_split for why substrings are not used.
        if not _names_eval_split(argname):
            continue
        # Nodes that carry no position info fall back to line 0.
        ln = int(getattr(node, "lineno", 0) or 0)
        findings.append(build_finding(
            check_id="ml.scaler_fit_on_test",
            category=GateCategory.ML,
            title=f".{node.func.attr}() on '{argname}' (eval data) in {path}:{ln}",
            severity=GateSeverity.HIGH,
            impact=GateImpact.REVISE,
            summary=(
                f"{node.func.attr}() is called on '{argname}' at {path}:{ln}. Fitting a "
                "scaler/transformer/model on test or validation data leaks information "
                "from the eval set into training, producing optimistic, invalid metrics."
            ),
            recommendation=(
                "Fit transforms ONLY on the training split, then .transform() (not "
                "fit_transform) the test/validation split."
            ),
            evidence=[EvidenceReference(
                kind="file", path=str(path), detail=f"line:{ln} {node.func.attr}({argname})",
            )],
            repair_kind=RepairKind.FIX_CONTRACT.value,
            executor_action="Fit on train only; use transform() on eval data.",
            proof_required="No fit/fit_transform on a *_test/*_val array.",
            allowlist_allowed=True,
        ))
    return findings
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
# --------------------------------------------------------------------------
|
|
216
|
+
# check 4: RNG used but never seeded (non-reproducible)
|
|
217
|
+
# --------------------------------------------------------------------------
|
|
218
|
+
|
|
219
|
+
# Calls that *consume* randomness (attribute chains ending in these), e.g.
# np.random.rand / torch.randn / random.shuffle.
_RNG_CONSUMERS = frozenset({
    "rand", "randn", "randint", "random", "choice", "shuffle", "permutation",
    "normal", "uniform", "standard_normal", "sample", "randperm",
})
# Calls that *seed* an RNG.
_SEED_CALLS = frozenset({"seed", "manual_seed", "manual_seed_all", "set_seed"})
# Chain prefixes accepted as an RNG namespace in addition to any owner that
# contains the text "random".
_RNG_ROOT_PREFIXES = ("np.", "numpy.", "torch", "tf.")


def _attr_chain(node: ast.AST) -> str:
    """Return the dotted attribute chain text for a Call.func (best effort).

    Attribute accesses are collected from the outside in and the root is
    included only when it is a plain name, so ``np.random.rand`` gives
    ``"np.random.rand"`` while ``make().rand`` gives just ``"rand"``.
    """
    parts: list[str] = []
    cur = node
    while isinstance(cur, ast.Attribute):
        parts.append(cur.attr)
        cur = cur.value
    if isinstance(cur, ast.Name):
        parts.append(cur.id)
    return ".".join(reversed(parts))


def _check_missing_seed(path: str, tree: ast.AST) -> list:
    """Flag a module that draws random numbers but shows no sign of seeding.

    A call counts as RNG consumption when its method name is in
    ``_RNG_CONSUMERS`` and it sits under a recognised namespace: either the
    part of the chain *before* the method name contains ``"random"``, or the
    chain starts with one of ``_RNG_ROOT_PREFIXES``. The method name itself
    is excluded from the ``"random"`` test; otherwise every ``obj.random()``
    call would qualify regardless of what ``obj`` is.

    The module counts as seeded when any attribute call named in
    ``_SEED_CALLS`` appears, or when any call passes a ``random_state=`` or
    ``seed=`` keyword.

    Returns a single ``ml.missing_random_seed`` finding (MEDIUM severity,
    REVISE impact, allowlistable) that cites the smallest line number among
    the consuming calls, or an empty list.
    """
    seeded = False
    rng_lines: list[int] = []
    for node in ast.walk(tree):
        if isinstance(node, ast.keyword):
            # random_state=/seed= kwargs anywhere count as "seeded".
            if node.arg in ("random_state", "seed"):
                seeded = True
            continue
        if not (isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute)):
            continue
        leaf = node.func.attr
        if leaf in _SEED_CALLS:
            seeded = True
        if leaf not in _RNG_CONSUMERS:
            continue
        # The consumer must be under a random/np.random/torch namespace to
        # avoid flagging unrelated .sample()/.choice() on domain objects.
        chain = _attr_chain(node.func)
        owner = chain.rpartition(".")[0]
        if "random" in owner or chain.startswith(_RNG_ROOT_PREFIXES):
            rng_lines.append(int(getattr(node, "lineno", 0) or 0))
    if not rng_lines or seeded:
        return []
    # ast.walk is breadth-first, so the first node met is not necessarily the
    # earliest in the file; report the smallest known line instead.
    rng_line = min((ln for ln in rng_lines if ln), default=0)
    return [build_finding(
        check_id="ml.missing_random_seed",
        category=GateCategory.ML,
        title=f"RNG used but never seeded in {path}",
        severity=GateSeverity.MEDIUM,
        impact=GateImpact.REVISE,
        summary=(
            f"{path} consumes randomness (first use at line {rng_line}) but never "
            "sets a seed (np.random.seed / torch.manual_seed / random.seed) and "
            "passes no random_state=. Runs are non-reproducible — results, bugs, "
            "and metrics cannot be replicated."
        ),
        recommendation="Seed all RNGs at module/entrypoint start (np.random.seed, torch.manual_seed, random.seed) or pass random_state=.",
        evidence=[EvidenceReference(
            kind="file", path=str(path), detail=f"first RNG use line:{rng_line}",
        )],
        repair_kind=RepairKind.FIX_CONTRACT.value,
        executor_action="Seed all random number generators deterministically.",
        proof_required="A seed is set before any RNG consumption in the module.",
        allowlist_allowed=True,
    )]
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
# --------------------------------------------------------------------------
|
|
293
|
+
# runner
|
|
294
|
+
# --------------------------------------------------------------------------
|
|
295
|
+
|
|
296
|
+
def run_ml_checks(ctx) -> "object":
    """Run all ML/NN correctness checks over the snapshot corpus (static).

    Reads ``ctx.file_snapshots`` (a missing or empty attribute means nothing
    to scan), keeps entries whose path ends in ``.py`` and whose snapshot
    has non-empty ``text``, parses each one, and feeds the tree to the four
    ``_check_*`` functions in this module.

    A file that cannot be parsed is skipped with a debug log line and never
    aborts the run. Besides ``SyntaxError`` this covers ``ValueError`` (for
    example source containing null bytes on interpreters that report it that
    way) and ``RecursionError`` (extremely deep nesting).

    Returns whatever ``build_check_result`` produces for check id
    ``"ml_checks"`` with all collected findings.
    """
    findings: list = []
    snapshots = getattr(ctx, "file_snapshots", None) or {}
    for path, snap in snapshots.items():
        if not str(path).endswith(".py"):
            continue
        content = getattr(snap, "text", None)
        if not content:
            continue
        try:
            tree = ast.parse(content)
        except (SyntaxError, ValueError, RecursionError) as exc:
            # Best effort: one unreadable file must not sink the whole scan.
            _log.debug("ml_checks: skipping unparseable %s (%s)", path, exc)
            continue
        findings.extend(_check_lookahead_shift(path, tree))
        findings.extend(_check_nondeterministic_split(path, tree))
        findings.extend(_check_scaler_fit_on_test(path, tree))
        findings.extend(_check_missing_seed(path, tree))
    return build_check_result(
        check_id="ml_checks",
        category=GateCategory.ML,
        findings=findings,
    )
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import ast
|
|
4
|
+
|
|
5
|
+
from vigil_forensic._shared import EvidenceReference, GateCategory, GateImpact, GateSeverity, RepairKind
|
|
6
|
+
from vigil_forensic.gate_models import PostExecGateContext
|
|
7
|
+
from ..source_analysis import is_source_file
|
|
8
|
+
from .common import build_check_result, build_finding, iter_touched_snapshots, normalize_path
|
|
9
|
+
from ._ast_helpers import parse_python_source_or_emit_finding
|
|
10
|
+
import logging
|
|
11
|
+
_log = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Call names that run_performance_checks reports when they occur inside a loop.
# Each call is compared against this set twice: once using the full string
# returned by _call_name ("subprocess.run", "read_text") and once using only
# the part after the last dot, so a bare entry such as "execute" also matches
# "cursor.execute". Dotted entries only match that exact "<name>.<attr>"
# spelling.
EXPENSIVE_CALL_NAMES = {
    "read_text",
    "read_bytes",
    "subprocess.run",
    "check_output",
    "path_exists",
    "execute",
    "connect",
    "os.system",
    "shutil.copy",
    "shutil.copytree",
    "shutil.move",
}
|
|
27
|
+
|
|
28
|
+
# Files that ARE file-processors by design — reading files in loops is their job.
# Flagging these generates noise without actionable signal.
# Each fragment is matched as a plain substring of the snapshot path after
# backslashes have been converted to forward slashes.
_FILE_PROCESSOR_PATH_FRAGMENTS = (
    "gate_checks/",
    "map_builder/",
    "source_adapters/",
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _expensive_calls_in_loops(tree: ast.AST) -> list[str]:
    """Return the name of every expensive call that runs once per loop iteration.

    Only code that actually repeats is scanned: the body of ``for`` /
    ``async for`` / ``while`` loops, plus the ``while`` condition. The
    iterable of a ``for`` loop and any ``else`` clause are left out because
    they are evaluated a single time. A call nested inside several loops is
    reported once, not once per enclosing loop.
    """
    reported: set[int] = set()
    names: list[str] = []
    for loop in ast.walk(tree):
        if not isinstance(loop, (ast.For, ast.AsyncFor, ast.While)):
            continue
        repeated: list[ast.AST] = list(loop.body)
        if isinstance(loop, ast.While):
            repeated.insert(0, loop.test)
        for part in repeated:
            for child in ast.walk(part):
                if not isinstance(child, ast.Call) or id(child) in reported:
                    continue
                name = _call_name(child)
                # For dotted names also try the part after the last dot.
                bare = name.rpartition(".")[2] if "." in name else None
                if name in EXPENSIVE_CALL_NAMES or (bare and bare in EXPENSIVE_CALL_NAMES):
                    reported.add(id(child))
                    names.append(name)
    return names


def run_performance_checks(ctx: PostExecGateContext):
    """Flag expensive calls made inside loops in touched source files.

    A touched snapshot is skipped when it does not exist, is not a source
    file, is not performance sensitive according to ``ctx.repo_profile``
    (when a profile is present), lives under one of
    ``_FILE_PROCESSOR_PATH_FRAGMENTS``, or is a test file. Remaining files
    are parsed with ``parse_python_source_or_emit_finding`` and each
    expensive call found by ``_expensive_calls_in_loops`` becomes one
    ``performance.expensive_in_loop`` finding (HIGH severity, REVISE impact,
    not allowlistable).

    Returns the ``build_check_result`` value for check id ``"performance"``.
    """
    findings = []
    profile = ctx.repo_profile
    # Sprint C2 (2026-04-23): prefer TestTopology.is_test_path. Legacy
    # basename check preserved as fallback for contexts where
    # ProjectContext.test_topology hasn't been built (older call sites,
    # unit tests constructing a PostExecGateContext by hand).
    # The lookup does not depend on the snapshot, so it is done once.
    topology = getattr(getattr(ctx, "project_context", None), "test_topology", None)
    for snapshot in iter_touched_snapshots(ctx):
        if not snapshot.exists or not is_source_file(snapshot.path):
            continue
        if profile and not profile.is_performance_sensitive(snapshot.path):
            continue
        norm_path = snapshot.path.replace("\\", "/")
        if any(frag in norm_path for frag in _FILE_PROCESSOR_PATH_FRAGMENTS):
            continue
        if topology is not None:
            if topology.is_test_path(norm_path):
                continue
        elif norm_path.split("/")[-1].startswith("test_"):
            continue
        # B4 (2026-04-23): replaces silent `except SyntaxError: continue` —
        # meta.syntax_parse_error is now emitted on broken Python sources.
        tree = parse_python_source_or_emit_finding(
            snapshot.text,
            rel_path=normalize_path(snapshot.path),
            emit_finding=findings.append,
            emitting_gate="performance.expensive_in_loop",
        )
        if tree is None:
            continue
        for name in _expensive_calls_in_loops(tree):
            findings.append(
                build_finding(
                    check_id="performance.expensive_in_loop",
                    category=GateCategory.PERFORMANCE,
                    title="Touched code performs expensive work inside a loop",
                    severity=GateSeverity.HIGH,
                    impact=GateImpact.REVISE,
                    summary=f"{snapshot.path} calls '{name}' inside a loop, which is a likely hot-path anti-pattern.",
                    recommendation="Batch the work, cache repeated reads, or move the expensive call out of the loop.",
                    evidence=[EvidenceReference(kind="file", path=snapshot.path, detail=name)],
                    repair_kind=RepairKind.REFACTOR.value,
                    executor_action="Optimize hot code paths",
                    proof_required="Performance acceptable",
                    allowlist_allowed=False,
                )
            )
    return build_check_result(check_id="performance", category=GateCategory.PERFORMANCE, findings=findings)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _call_name(node: ast.Call) -> str:
    """Describe the callee of *node* as a short string.

    * ``load()``              -> ``"load"``
    * ``subprocess.run()``    -> ``"subprocess.run"`` (attribute read from a
      plain name)
    * ``self.executor.run()`` -> ``"run"`` (anything deeper keeps only the
      final attribute)

    Any other callee expression gives the empty string.
    """
    target = node.func
    if isinstance(target, ast.Name):
        return str(target.id)
    if not isinstance(target, ast.Attribute):
        return ""
    owner = target.value
    if isinstance(owner, ast.Name):
        return f"{owner.id}.{target.attr}"
    # Longer chains collapse to their last attribute.
    return str(target.attr)
|