vigil-codeintel 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vigil_codeintel-0.1.0.dist-info/METADATA +780 -0
- vigil_codeintel-0.1.0.dist-info/RECORD +131 -0
- vigil_codeintel-0.1.0.dist-info/WHEEL +5 -0
- vigil_codeintel-0.1.0.dist-info/entry_points.txt +3 -0
- vigil_codeintel-0.1.0.dist-info/licenses/LICENSE +21 -0
- vigil_codeintel-0.1.0.dist-info/top_level.txt +3 -0
- vigil_forensic/__init__.py +224 -0
- vigil_forensic/_git_utils.py +178 -0
- vigil_forensic/_shared.py +510 -0
- vigil_forensic/_stubs.py +156 -0
- vigil_forensic/gate_checks/__init__.py +1 -0
- vigil_forensic/gate_checks/_ast_helpers.py +629 -0
- vigil_forensic/gate_checks/_deployment_detector.py +573 -0
- vigil_forensic/gate_checks/atomic_write_checks.py +1143 -0
- vigil_forensic/gate_checks/authority_checks.py +95 -0
- vigil_forensic/gate_checks/boundary_breach_checks.py +202 -0
- vigil_forensic/gate_checks/broad_except_checks.py +301 -0
- vigil_forensic/gate_checks/broad_except_hidden_sentinel_checks.py +365 -0
- vigil_forensic/gate_checks/common.py +253 -0
- vigil_forensic/gate_checks/config_safety_checks.py +704 -0
- vigil_forensic/gate_checks/config_ssot_checks.py +78 -0
- vigil_forensic/gate_checks/conflict_checks.py +193 -0
- vigil_forensic/gate_checks/context_fallback_checks.py +697 -0
- vigil_forensic/gate_checks/context_health_checks.py +289 -0
- vigil_forensic/gate_checks/contract_shape_drift_checks.py +459 -0
- vigil_forensic/gate_checks/dirty_baseline_check.py +274 -0
- vigil_forensic/gate_checks/duplication_checks.py +387 -0
- vigil_forensic/gate_checks/embedded_string_checks.py +123 -0
- vigil_forensic/gate_checks/empty_output_checks.py +87 -0
- vigil_forensic/gate_checks/encoding_checks.py +847 -0
- vigil_forensic/gate_checks/export_completeness_checks.py +156 -0
- vigil_forensic/gate_checks/fallback_checks.py +41 -0
- vigil_forensic/gate_checks/file_proliferation_checks.py +171 -0
- vigil_forensic/gate_checks/fix_without_test_checks.py +69 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/__init__.py +9 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/_helpers.py +71 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/advanced_checks.py +322 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/core.py +273 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/integrity_checks.py +203 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/quality_checks.py +666 -0
- vigil_forensic/gate_checks/forensic_clusters/__init__.py +193 -0
- vigil_forensic/gate_checks/forensic_clusters/allowlist.py +426 -0
- vigil_forensic/gate_checks/forensic_clusters/allowlist_writer.py +302 -0
- vigil_forensic/gate_checks/forensic_clusters/api_protocol.py +231 -0
- vigil_forensic/gate_checks/forensic_clusters/async_quality.py +1156 -0
- vigil_forensic/gate_checks/forensic_clusters/code_style.py +808 -0
- vigil_forensic/gate_checks/forensic_clusters/core.py +319 -0
- vigil_forensic/gate_checks/forensic_clusters/data_quality.py +763 -0
- vigil_forensic/gate_checks/forensic_clusters/dead_code.py +480 -0
- vigil_forensic/gate_checks/forensic_clusters/edit_mutation.py +842 -0
- vigil_forensic/gate_checks/forensic_clusters/exception_boundary.py +240 -0
- vigil_forensic/gate_checks/forensic_clusters/legacy_debt.py +556 -0
- vigil_forensic/gate_checks/forensic_clusters/static_analysis.py +834 -0
- vigil_forensic/gate_checks/forensic_clusters/structural_quality.py +298 -0
- vigil_forensic/gate_checks/god_object_zones_checks.py +173 -0
- vigil_forensic/gate_checks/hallucination_checks.py +566 -0
- vigil_forensic/gate_checks/hunter_artifact_completeness_check.py +139 -0
- vigil_forensic/gate_checks/implementation_overfit_checks.py +380 -0
- vigil_forensic/gate_checks/import_integrity_checks.py +233 -0
- vigil_forensic/gate_checks/imports_in_function_checks.py +283 -0
- vigil_forensic/gate_checks/ml_checks.py +318 -0
- vigil_forensic/gate_checks/performance_checks.py +106 -0
- vigil_forensic/gate_checks/project_specific_runner.py +691 -0
- vigil_forensic/gate_checks/provider_capability_checks.py +73 -0
- vigil_forensic/gate_checks/refactor_completeness_checks.py +274 -0
- vigil_forensic/gate_checks/reliability_checks.py +389 -0
- vigil_forensic/gate_checks/reporting_checks.py +55 -0
- vigil_forensic/gate_checks/runtime_behavior_checks.py +220 -0
- vigil_forensic/gate_checks/security_injection_checks.py +332 -0
- vigil_forensic/gate_checks/semantic_intent_checks.py +139 -0
- vigil_forensic/gate_checks/size_complexity_checks.py +336 -0
- vigil_forensic/gate_checks/stuck_feature_flag_checks.py +354 -0
- vigil_forensic/gate_checks/syntax_validity_checks.py +217 -0
- vigil_forensic/gate_checks/temporal_freshness_checks.py +79 -0
- vigil_forensic/gate_checks/test_quality_checks.py +946 -0
- vigil_forensic/gate_checks/testing_checks.py +149 -0
- vigil_forensic/gate_checks/toctou_checks.py +367 -0
- vigil_forensic/gate_checks/type_checking_checks.py +316 -0
- vigil_forensic/gate_models.py +392 -0
- vigil_forensic/gate_packs/__init__.py +1 -0
- vigil_forensic/gate_packs/universal.py +179 -0
- vigil_forensic/gate_profile.json +31 -0
- vigil_forensic/gate_registry.py +21 -0
- vigil_forensic/language_profiles.py +219 -0
- vigil_forensic/meta_findings.py +207 -0
- vigil_forensic/self_audit.py +725 -0
- vigil_forensic/source_analysis.py +175 -0
- vigil_mapper/__init__.py +103 -0
- vigil_mapper/_ast_helpers_minimal.py +229 -0
- vigil_mapper/_extract_imports_impl.py +123 -0
- vigil_mapper/_file_count_guard.py +129 -0
- vigil_mapper/_git_utils.py +178 -0
- vigil_mapper/_runtime_ast.py +438 -0
- vigil_mapper/_runtime_dispatch.py +137 -0
- vigil_mapper/_seed_helpers.py +82 -0
- vigil_mapper/authority_builder.py +1102 -0
- vigil_mapper/cli_entry.py +731 -0
- vigil_mapper/conflict_builder.py +818 -0
- vigil_mapper/data_contract_builder.py +446 -0
- vigil_mapper/findings_builder.py +716 -0
- vigil_mapper/fingerprint.py +53 -0
- vigil_mapper/hotspot_builder.py +539 -0
- vigil_mapper/map_common.py +449 -0
- vigil_mapper/map_errors.py +55 -0
- vigil_mapper/map_models.py +431 -0
- vigil_mapper/map_models_ext.py +206 -0
- vigil_mapper/map_models_findings.py +130 -0
- vigil_mapper/map_storage.py +455 -0
- vigil_mapper/parse_cache.py +795 -0
- vigil_mapper/refactor_boundary_builder.py +266 -0
- vigil_mapper/runtime_builder.py +527 -0
- vigil_mapper/runtime_tracer.py +243 -0
- vigil_mapper/runtime_tracer_entry.py +199 -0
- vigil_mapper/semantic_diff.py +71 -0
- vigil_mapper/source_adapters/__init__.py +109 -0
- vigil_mapper/source_adapters/_base.py +264 -0
- vigil_mapper/source_adapters/_ir.py +156 -0
- vigil_mapper/source_adapters/_lexer.py +309 -0
- vigil_mapper/source_adapters/_patterns.py +212 -0
- vigil_mapper/source_adapters/_treesitter.py +182 -0
- vigil_mapper/source_adapters/go.py +553 -0
- vigil_mapper/source_adapters/java.py +541 -0
- vigil_mapper/source_adapters/javascript.py +626 -0
- vigil_mapper/source_adapters/python.py +325 -0
- vigil_mapper/source_adapters/typescript.py +749 -0
- vigil_mapper/structural_builder.py +586 -0
- vigil_mcp/__init__.py +1 -0
- vigil_mcp/_jobs.py +587 -0
- vigil_mcp/_paths.py +93 -0
- vigil_mcp/forensic_server.py +419 -0
- vigil_mcp/map_server.py +452 -0
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Fingerprint utilities for the map builder subsystem.
|
|
2
|
+
|
|
3
|
+
Provides stable, deterministic identifiers for conflict entries and schema hashes.
|
|
4
|
+
"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import hashlib
|
|
8
|
+
import json
|
|
9
|
+
import logging
|
|
10
|
+
|
|
11
|
+
# Names exported by a star-import of this module.
__all__ = [
    "make_conflict_id",
    "map_schema_hash",
]

# Module-level logger, named after this module's import path.
_log = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def make_conflict_id(domain: str, subject: str, sources: list[dict]) -> str:
    """Compute a stable conflict identifier from domain, subject and sources.

    The identifier is the SHA-256 digest of a canonical JSON document
    (sorted keys, compact separators, non-ASCII kept verbatim, UTF-8
    encoded) built from the three arguments.

    Sources are ordered by ``(map, claim)`` and, when two sources share the
    same pair, by their own canonical JSON form.  Without that tie-breaker
    a stable sort would keep such sources in insertion order and the
    identifier would depend on it.  Inputs whose ``(map, claim)`` pairs are
    all distinct are ordered exactly as by ``(map, claim)`` alone.

    Args:
        domain: Domain the conflict belongs to.
        subject: Subject of the conflict.
        sources: JSON-serialisable source dicts.  The list is not modified.

    Returns:
        ``"conf_"`` followed by the first 12 hex digits of the SHA-256 digest.
    """

    def _source_order(source: dict) -> tuple:
        # Third element only matters when (map, claim) is identical.
        return (
            source.get("map", ""),
            source.get("claim", ""),
            json.dumps(source, sort_keys=True, separators=(",", ":"), ensure_ascii=False),
        )

    canonical = json.dumps(
        {
            "domain": domain,
            "subject": subject,
            "sources": sorted(sources, key=_source_order),
        },
        sort_keys=True,
        separators=(",", ":"),
        ensure_ascii=False,
    )
    digest = hashlib.sha256(canonical.encode("utf-8")).hexdigest()
    conflict_id = "conf_" + digest[:12]
    _log.debug("make_conflict_id: domain=%s subject=%s -> %s", domain, subject, conflict_id)
    return conflict_id
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def map_schema_hash(entries: list[dict]) -> str:
    """Return a short fingerprint of the field names used across *entries*.

    Only dictionary keys take part in the hash; values are ignored.  The
    result therefore changes when a field is added or removed between
    builds, not when the stored data changes.

    Returns:
        The first 16 hex digits of the SHA-256 digest of the JSON-encoded,
        sorted list of distinct field names.
    """
    distinct_fields: set = set()
    for entry in entries:
        distinct_fields.update(entry.keys())
    all_fields = sorted(distinct_fields)

    payload = json.dumps(all_fields).encode("utf-8")
    schema_hash = hashlib.sha256(payload).hexdigest()[:16]
    _log.debug(
        "map_schema_hash: %d entries, %d distinct fields -> %s",
        len(entries),
        len(all_fields),
        schema_hash,
    )
    return schema_hash
|
|
@@ -0,0 +1,539 @@
|
|
|
1
|
+
"""Hotspot map builder -- Map 6.
|
|
2
|
+
|
|
3
|
+
Aggregates multi-dimensional risk factors from all available maps into a
|
|
4
|
+
ranked list of file-level hotspot entries.
|
|
5
|
+
|
|
6
|
+
Generic: operates on any RepoMaps, does not assume Vigil project layout.
|
|
7
|
+
Sanctioned-asset patterns are passed by the caller (resolved from seed at
|
|
8
|
+
the CLI entry layer in Phase 7).
|
|
9
|
+
|
|
10
|
+
Public API:
|
|
11
|
+
build_hotspot_map(repo_maps, sanctioned_patterns=(), churn_data=None) -> list[HotspotEntry]
|
|
12
|
+
compute_hotspot_churn_metadata(project_dir, since_window="90.days") -> tuple[dict, dict]
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import fnmatch
|
|
17
|
+
import json
|
|
18
|
+
import logging
|
|
19
|
+
import math
|
|
20
|
+
from collections.abc import Sequence
|
|
21
|
+
from datetime import datetime, timezone
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
|
|
24
|
+
from .map_common import HOTSPOT_WEIGHTS, hotspot_mode_for_score
|
|
25
|
+
from .map_models import AuthorityDomain, DataContractEntry, RepoMaps, StructuralEntry
|
|
26
|
+
from .map_models_ext import HotspotEntry
|
|
27
|
+
|
|
28
|
+
# Public entry points of this module.
__all__ = ["build_hotspot_map", "compute_hotspot_churn_metadata"]

# Module-level logger, named after this module's import path.
_log = logging.getLogger(__name__)

# Values stamped on every HotspotEntry created by build_hotspot_map
# (passed as its ``source`` and ``confidence`` fields respectively).
_SOURCE = "automated_scoring"
_CONFIDENCE = 0.88
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ---------------------------------------------------------------------------
|
|
37
|
+
# Helpers
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
def _utc_now() -> str:
    """Return the current UTC time in ISO-8601 form with a ``Z`` suffix."""
    stamp = datetime.now(tz=timezone.utc).isoformat()
    # isoformat() spells the UTC offset as "+00:00"; emit the short form.
    return stamp.replace("+00:00", "Z")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _is_sanctioned(file: str, patterns: Sequence[str]) -> bool:
    """Tell whether *file* is matched by at least one fnmatch-style pattern.

    An empty *patterns* sequence never matches.
    """
    return any(fnmatch.fnmatch(file, pattern) for pattern in patterns)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# ---------------------------------------------------------------------------
|
|
53
|
+
# Per-component scoring functions
|
|
54
|
+
# ---------------------------------------------------------------------------
|
|
55
|
+
|
|
56
|
+
def _structural_risk(file: str, structural_map: tuple) -> tuple[int, list[str]]:
    """Sum the weights of the structural tags attached to *file*.

    Weights come from ``HOTSPOT_WEIGHTS["structural_tags"]``; the literal
    table below is used only when that key is absent.  Only the first
    ``StructuralEntry`` whose ``file`` equals *file* is inspected.

    Returns:
        ``(score, reasons)`` where ``score`` is capped at
        ``HOTSPOT_WEIGHTS["structural_risk_max"]`` (default 20) and
        ``reasons`` has one entry per tag with a non-zero weight.
    """
    fallback_weights = {
        "large_file": 10,
        "high_fan_in": 8,
        "high_fan_out": 3,
        "cycle_member": 5,
        "unparseable": 0,
    }
    tag_weights: dict = HOTSPOT_WEIGHTS.get("structural_tags", fallback_weights)

    own_entry = next(
        (
            candidate
            for candidate in structural_map
            if isinstance(candidate, StructuralEntry) and candidate.file == file
        ),
        None,
    )

    total = 0
    reasons: list[str] = []
    if own_entry is not None:
        for tag in own_entry.tags:
            weight = tag_weights.get(tag, 0)
            if not weight:
                continue  # unknown or zero-weight tag contributes nothing
            total += weight
            reasons.append("structural_tag:%s(+%d)" % (tag, weight))

    return min(total, HOTSPOT_WEIGHTS.get("structural_risk_max", 20)), reasons
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _runtime_risk(file: str, runtime_map: tuple) -> tuple[int, list[str]]:
    """Sum the weights of the runtime tags of every node defined in *file*.

    Weights come from ``HOTSPOT_WEIGHTS["runtime_tags"]``; the literal table
    below is used only when that key is absent.  Every node whose
    ``defined_in`` attribute equals *file* contributes; nodes without that
    attribute are ignored.

    Returns:
        ``(score, reasons)`` where ``score`` is capped at
        ``HOTSPOT_WEIGHTS["runtime_risk_max"]`` (default 20) and ``reasons``
        has one entry per tag occurrence with a non-zero weight.
    """
    fallback_weights = {
        "import_time_side_effects": 8,
        "background_task": 5,
        "decorator_registry": 3,
    }
    tag_weights: dict = HOTSPOT_WEIGHTS.get("runtime_tags", fallback_weights)

    # Tags of all nodes that live in this file, in map order.
    own_tags = (
        tag
        for node in runtime_map
        if getattr(node, "defined_in", "") == file
        for tag in getattr(node, "tags", ())
    )

    total = 0
    reasons: list[str] = []
    for tag in own_tags:
        weight = tag_weights.get(tag, 0)
        if not weight:
            continue
        total += weight
        reasons.append("runtime_tag:%s(+%d)" % (tag, weight))

    return min(total, HOTSPOT_WEIGHTS.get("runtime_risk_max", 20)), reasons
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _authority_risk(
    file: str,
    authority_map: tuple,
    conflict_map: tuple = (),
) -> tuple[int, list[str]]:
    """Score *file* by authority ownership and involvement in open conflicts.

    Canonical-owner tier (sets the score; only the first ``AuthorityDomain``
    whose ``canonical_owner`` equals *file* is considered):
      - the domain has an open conflict  -> ``authority_risk_with_conflict`` (default 20)
      - *conflict_map* is empty and the domain has ``last_drift_events``
                                         -> ``authority_risk_with_conflict`` (default 20)
      - otherwise                        -> ``authority_risk_base`` (default 5)

    Writer tier (added on top, at most once): *file* appears as the
    ``file`` of a source of any open conflict
                                         -> ``authority_writer_in_conflict`` (default 10)

    Conflict sources may be dicts or JSON strings.  A source that is not a
    dict, cannot be decoded, or decodes to something other than a dict is
    ignored.

    Args:
        file: Path of the file being scored, as it appears in the maps.
        authority_map: Iterable of authority entries; non-``AuthorityDomain``
            items are skipped.
        conflict_map: Iterable of conflict entries, read via ``getattr``.

    Returns:
        ``(score, reasons)`` with ``score`` capped at
        ``HOTSPOT_WEIGHTS["authority_risk_max"]`` (default 20).
    """
    w_base: int = HOTSPOT_WEIGHTS.get("authority_risk_base", 5)
    w_conflict: int = HOTSPOT_WEIGHTS.get("authority_risk_with_conflict", 20)
    w_writer: int = HOTSPOT_WEIGHTS.get("authority_writer_in_conflict", 10)

    def _source_file(src_raw: object) -> object:
        """Return the ``file`` value of one conflict source, or None."""
        if isinstance(src_raw, str):
            try:
                src_raw = json.loads(src_raw)
            except (ValueError, RecursionError):
                # Undecodable source: treat it as naming no file.
                return None
        # Valid JSON may still be a list/str/number; only dicts carry "file".
        return src_raw.get("file") if isinstance(src_raw, dict) else None

    score = 0
    reasons: list[str] = []

    # --- Canonical-owner tier ---
    for domain in authority_map:
        if not isinstance(domain, AuthorityDomain) or domain.canonical_owner != file:
            continue
        domain_name = domain.authority_domain
        if _domain_has_open_conflict(domain_name, conflict_map):
            score = w_conflict
            reasons.append(
                "canonical_owner_of:%s_with_open_conflict(+%d)" % (domain_name, w_conflict)
            )
        elif not conflict_map and domain.last_drift_events:
            # Legacy fallback: without a conflict map, drift events stand in
            # for open conflicts.
            score = w_conflict
            reasons.append(
                "canonical_owner_of:%s_with_drift_events(+%d)" % (domain_name, w_conflict)
            )
        else:
            score = w_base
            reasons.append("canonical_owner_of:%s_clean(+%d)" % (domain_name, w_base))
        break  # Only count the first matching domain; cap applied below.

    # --- Writer-in-conflict tier (additive, counted once) ---
    for conflict in conflict_map:
        if getattr(conflict, "conflict_status", "") != "open":
            continue
        if any(_source_file(raw) == file for raw in getattr(conflict, "sources", ())):
            score += w_writer
            reasons.append("writer_in_open_conflict(+%d)" % w_writer)
            break

    capped = min(score, HOTSPOT_WEIGHTS.get("authority_risk_max", 20))
    _log.debug("_authority_risk: file=%s raw=%d capped=%d", file, score, capped)
    return capped, reasons
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _domain_has_open_conflict(domain: str, conflict_map: tuple) -> bool:
    """Tell whether *conflict_map* holds an open conflict aimed at *domain*.

    A conflict counts when its ``conflict_status`` is ``"open"`` and either
    its ``domain`` equals *domain* or its ``subject`` starts with
    ``"<domain>::"`` (e.g. ``"my_domain::symbol"``).  Missing attributes are
    read as empty strings.
    """
    return any(
        getattr(conflict, "conflict_status", "") == "open"
        and (
            getattr(conflict, "domain", "") == domain
            or getattr(conflict, "subject", "").startswith(domain + "::")
        )
        for conflict in conflict_map
    )
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _duplication_score(
    file: str,
    contract_map: tuple,
) -> tuple[int, list[str]]:
    """Score *file* when it takes part in a data contract that has drift flags.

    The first ``DataContractEntry`` with non-empty ``drift_flags`` that
    lists *file* among its ``writers`` or ``readers`` contributes 10 points;
    later contracts are not examined.

    Returns:
        ``(score, reasons)`` where ``score`` is 0 or 10, capped at
        ``HOTSPOT_WEIGHTS["duplication_score_max"]`` (default 20).
    """
    drifted = next(
        (
            contract
            for contract in contract_map
            if isinstance(contract, DataContractEntry)
            and contract.drift_flags
            and (file in contract.writers or file in contract.readers)
        ),
        None,
    )

    if drifted is None:
        score, reasons = 0, []
    else:
        score = 10
        reasons = ["drift_flags_in_contract:%s(+10)" % drifted.entity]

    return min(score, HOTSPOT_WEIGHTS.get("duplication_score_max", 20)), reasons
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _test_gap(file: str, structural_map: tuple) -> tuple[int, list[str]]:
    """Score *file* when the structural map holds no matching test file.

    The expected test name is ``"test_" + stem``, where ``stem`` is the
    basename of *file* without a trailing ``.py``.  A structural entry
    counts as a match when its basename starts with ``"test_"`` and contains
    that expected name anywhere in it.

    Returns:
        ``(0, [])`` when a match exists; otherwise ``(score, [reason])`` with
        ``score`` being 10 capped at ``HOTSPOT_WEIGHTS["test_gap_max"]``
        (default 20).
    """
    import posixpath

    # Normalise Windows separators so basename() works on either style.
    basename = posixpath.basename(file.replace("\\", "/"))
    stem = basename[:-3] if basename.endswith(".py") else basename
    expected_test = "test_" + stem

    known_basenames = (
        posixpath.basename(entry.file.replace("\\", "/"))
        for entry in structural_map
        if isinstance(entry, StructuralEntry)
    )
    if any(name.startswith("test_") and expected_test in name for name in known_basenames):
        return 0, []

    score = min(10, HOTSPOT_WEIGHTS.get("test_gap_max", 20))
    return score, ["no_test_file_for:%s(+%d)" % (stem, score)]
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
# ---------------------------------------------------------------------------
|
|
247
|
+
# Public API
|
|
248
|
+
# ---------------------------------------------------------------------------
|
|
249
|
+
|
|
250
|
+
def _populate_hotspot_evidence(
    file: str,
    repo_maps: RepoMaps,
    all_reasons: list[str],
) -> tuple[str, ...]:
    """Build evidence items linking a hotspot to the map data behind it.

    Evidence is collected in this order:
      1. The first open conflict (with a non-empty ``conflict_id``) that
         involves *file*, either as its ``subject`` or as the ``file`` of
         one of its sources (kind="map_entry", map="conflict").
      2. Up to five importers of *file*, in sorted order, taken from the
         ``imports_in`` of its first structural entry
         (kind="source_location", map="structural").
      3. Up to three runtime nodes defined in *file* that carry an
         ``import_time_side_effects`` or ``background_task`` tag
         (kind="source_location", map="runtime").
    The combined list is truncated to eight items.

    Conflict sources may be dicts or JSON strings; a string that cannot be
    decoded is logged at debug level and skipped.

    Args:
        file: Path of the hotspot file, as it appears in the maps.
        repo_maps: Container providing ``conflict``, ``structural`` and
            ``runtime`` maps.
        all_reasons: Not read by this function; kept so existing callers
            keep working.

    Returns:
        Tuple of JSON-serialized EvidenceItem strings.
    """
    from .map_models_findings import EvidenceItem

    evidence_items: list[EvidenceItem] = []

    # 1. First open conflict that involves this file.
    for conflict in repo_maps.conflict:
        if getattr(conflict, "conflict_status", "") != "open":
            continue
        conflict_id = getattr(conflict, "conflict_id", "")
        if not conflict_id:
            continue  # nothing to reference without an id

        involved = file == getattr(conflict, "subject", "")
        if not involved:
            for src_raw in getattr(conflict, "sources", ()):
                src = src_raw
                if isinstance(src_raw, str):
                    try:
                        src = json.loads(src_raw)
                    except (ValueError, RecursionError):
                        _log.debug(
                            "_populate_hotspot_evidence: undecodable source in conflict %s",
                            conflict_id,
                        )
                        continue
                if isinstance(src, dict) and src.get("file") == file:
                    involved = True
                    break

        if involved:
            evidence_items.append(EvidenceItem(
                kind="map_entry",
                map="conflict",
                entry_id=conflict_id,
            ))
            break  # a single conflict reference is enough

    # 2. Importers of this file (first matching structural entry only).
    importers: list[str] = []
    for entry in repo_maps.structural:
        if isinstance(entry, StructuralEntry) and entry.file == file:
            importers = sorted(entry.imports_in)[:5]
            break
    for importer in importers:
        evidence_items.append(EvidenceItem(
            kind="source_location",
            file=importer,
            map="structural",
        ))

    # 3. Runtime nodes in this file that carry a high-risk tag.
    significant_tags = ("import_time_side_effects", "background_task")
    runtime_sources_added = 0
    for node in repo_maps.runtime:
        if runtime_sources_added >= 3:
            break
        if getattr(node, "defined_in", "") != file:
            continue
        if any(tag in significant_tags for tag in getattr(node, "tags", ())):
            evidence_items.append(EvidenceItem(
                kind="source_location",
                file=file,
                map="runtime",
            ))
            runtime_sources_added += 1

    # Serialize, capping the total at 8 items.
    return tuple(
        json.dumps(item.to_dict(), sort_keys=True) for item in evidence_items[:8]
    )
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def build_hotspot_map(
    repo_maps: RepoMaps,
    sanctioned_patterns: Sequence[str] = (),
    *,
    churn_data: dict[str, int] | None = None,
) -> list[HotspotEntry]:
    """Build a hotspot map ranking files by multi-dimensional risk score.

    Scoring formula (per spec Map 6, plan sec.19):
      score = structural_risk[0-20] + runtime_risk[0-20]
            + authority_risk[0-20] + duplication_score[0-20]
            + failure_frequency[0] + test_gap[0-20] + churn[0-20]
            - confidence_penalty[5]
            + test_file_penalty (negative; only for test_*.py / *_test.py)
      clamped to [0, 130].

    The churn component is ``int(log1p(churn_lines) * 4)`` capped at
    ``HOTSPOT_WEIGHTS["churn_cap"]`` (default 20; a non-int value falls back
    to 20).  The recommended mode for each score is chosen by
    ``hotspot_mode_for_score``.

    Candidate files are the distinct ``file`` values of the structural map,
    in first-seen order.  Files matching any of *sanctioned_patterns*
    (fnmatch) are excluded.

    Args:
        repo_maps: Container with all available maps.
        sanctioned_patterns: Glob/fnmatch patterns for files to exclude.
        churn_data: Optional per-file churn line counts (relative paths ->
            total added+deleted lines).  If None (default), the churn
            component is 0 for all files.  Compute via
            :func:`compute_hotspot_churn_metadata`.

    Returns:
        List of HotspotEntry sorted by (-score, target) for deterministic
        tie-breaking.  An empty list is returned (and an info message
        logged) when the structural map is empty; no exception is raised.
    """
    import posixpath

    if not repo_maps.structural:
        _log.info("hotspot: skipping -- structural map is empty (non-Python project or empty source tree)")
        return []

    churn: dict[str, int] = churn_data or {}

    _log.info(
        "build_hotspot_map: starting -- structural=%d runtime=%d "
        "contract=%d authority=%d sanctioned_patterns=%d churn_files=%d",
        len(repo_maps.structural),
        len(repo_maps.runtime),
        len(repo_maps.data_contract),
        len(repo_maps.authority),
        len(sanctioned_patterns),
        len(churn),
    )

    freshness = _utc_now()

    # Per-build constants, resolved once instead of once per file.
    churn_cap_raw = HOTSPOT_WEIGHTS.get("churn_cap", 20)
    churn_cap = churn_cap_raw if isinstance(churn_cap_raw, int) else 20
    test_file_penalty: int = HOTSPOT_WEIGHTS.get("test_file_penalty", -10)
    if test_file_penalty >= 0:
        test_file_penalty = -abs(test_file_penalty)  # ensure negative
    failure_frequency = 0  # Not yet implemented (no historical data).
    confidence_penalty = 5  # Default confidence penalty.

    # Unique file targets from the structural map, first-seen order kept.
    candidate_files = list(dict.fromkeys(
        entry.file for entry in repo_maps.structural if isinstance(entry, StructuralEntry)
    ))

    entries: list[HotspotEntry] = []

    for file in candidate_files:
        # Exclude sanctioned assets.
        if _is_sanctioned(file, sanctioned_patterns):
            _log.debug("build_hotspot_map: skipping sanctioned file %s", file)
            continue

        all_reasons: list[str] = []

        sr, sr_reasons = _structural_risk(file, repo_maps.structural)
        rr, rr_reasons = _runtime_risk(file, repo_maps.runtime)
        ar, ar_reasons = _authority_risk(file, repo_maps.authority, repo_maps.conflict)
        ds, ds_reasons = _duplication_score(file, repo_maps.data_contract)
        tg, tg_reasons = _test_gap(file, repo_maps.structural)

        # Churn component: log-scale dampened, capped at churn_cap.
        churn_raw = churn.get(file, 0)
        churn_component = min(churn_cap, int(math.log1p(churn_raw) * 4))

        raw_score = sr + rr + ar + ds + failure_frequency + tg + churn_component - confidence_penalty

        # Test-file penalty: test_*.py or *_test.py get a score reduction so
        # that production files with comparable structural risk rank higher.
        basename = posixpath.basename(file.replace("\\", "/"))
        if basename.startswith("test_") or basename.endswith("_test.py"):
            all_reasons.append("test_file_penalty(%d)" % test_file_penalty)
            _log.debug("build_hotspot_map: test file penalty applied to %s (%d)", file, test_file_penalty)
            raw_score += test_file_penalty

        score = max(0, min(130, raw_score))

        all_reasons.extend(sr_reasons)
        all_reasons.extend(rr_reasons)
        all_reasons.extend(ar_reasons)
        all_reasons.extend(ds_reasons)
        all_reasons.extend(tg_reasons)
        if churn_raw > 0:
            all_reasons.append("churn_%d(+%d)" % (churn_raw, churn_component))

        mode = hotspot_mode_for_score(score)

        # Populate evidence from contributing sources.
        evidence = _populate_hotspot_evidence(file, repo_maps, all_reasons)

        entries.append(HotspotEntry(
            target=file,
            hotspot_score=score,
            reasons=tuple(all_reasons),
            recommended_mode=mode,
            source=_SOURCE,
            evidence=evidence,
            confidence=_CONFIDENCE,
            freshness=freshness,
            status="observed",
        ))

    # Sort: highest score first, then alphabetically by target for tie-break.
    entries.sort(key=lambda e: (-e.hotspot_score, e.target))

    _log.info(
        "build_hotspot_map: done -- %d entries, top score=%d",
        len(entries),
        entries[0].hotspot_score if entries else 0,
    )
    return entries
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
def compute_hotspot_churn_metadata(
    project_dir: Path,
    since_window: str = "90.days",
) -> tuple[dict[str, int], dict]:
    """Collect per-file churn plus the metadata describing how it was obtained.

    Nothing is cached between calls.  When ``git_has_repo`` reports a
    repository, churn is read through ``git_log_numstat`` and the current
    head through ``git_head_sha`` (both imported from ``_git_utils``; their
    error handling is not visible here).  Otherwise no git helper is called
    and the source is reported as ``"skipped"``.

    Args:
        project_dir: Path to the project root.
        since_window: Passed as ``since`` to ``git_log_numstat``, e.g.
            ``"90.days"``, ``"6.months"``, ``"2025-01-01"``.

    Returns:
        ``(churn_data, metadata)`` where:
          - ``churn_data`` maps paths to churn counts and is suitable as the
            ``churn_data`` kwarg of :func:`build_hotspot_map`; it is empty
            when no repository was found.
          - ``metadata`` has the keys ``churn_source`` (``"git_log_numstat"``,
            ``"git_log_numstat_empty"`` or ``"skipped"``), ``git_head_sha``
            and ``since_window`` (both ``None`` when skipped).
    """
    from ._git_utils import git_head_sha, git_has_repo, git_log_numstat

    if git_has_repo(project_dir):
        churn_data: dict[str, int] = git_log_numstat(project_dir, since=since_window)
        churn_source = "git_log_numstat" if churn_data else "git_log_numstat_empty"
        git_head = git_head_sha(project_dir)
        reported_window = since_window
    else:
        churn_data = {}
        churn_source = "skipped"
        git_head = None
        reported_window = None  # the window is only meaningful when git was queried

    metadata: dict = {
        "churn_source": churn_source,
        "git_head_sha": git_head,
        "since_window": reported_window,
    }

    _log.info(
        "compute_hotspot_churn_metadata: source=%s files=%d git_head=%s",
        churn_source,
        len(churn_data),
        git_head,
    )
    return churn_data, metadata
|