vigil-codeintel 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vigil_codeintel-0.1.0.dist-info/METADATA +780 -0
- vigil_codeintel-0.1.0.dist-info/RECORD +131 -0
- vigil_codeintel-0.1.0.dist-info/WHEEL +5 -0
- vigil_codeintel-0.1.0.dist-info/entry_points.txt +3 -0
- vigil_codeintel-0.1.0.dist-info/licenses/LICENSE +21 -0
- vigil_codeintel-0.1.0.dist-info/top_level.txt +3 -0
- vigil_forensic/__init__.py +224 -0
- vigil_forensic/_git_utils.py +178 -0
- vigil_forensic/_shared.py +510 -0
- vigil_forensic/_stubs.py +156 -0
- vigil_forensic/gate_checks/__init__.py +1 -0
- vigil_forensic/gate_checks/_ast_helpers.py +629 -0
- vigil_forensic/gate_checks/_deployment_detector.py +573 -0
- vigil_forensic/gate_checks/atomic_write_checks.py +1143 -0
- vigil_forensic/gate_checks/authority_checks.py +95 -0
- vigil_forensic/gate_checks/boundary_breach_checks.py +202 -0
- vigil_forensic/gate_checks/broad_except_checks.py +301 -0
- vigil_forensic/gate_checks/broad_except_hidden_sentinel_checks.py +365 -0
- vigil_forensic/gate_checks/common.py +253 -0
- vigil_forensic/gate_checks/config_safety_checks.py +704 -0
- vigil_forensic/gate_checks/config_ssot_checks.py +78 -0
- vigil_forensic/gate_checks/conflict_checks.py +193 -0
- vigil_forensic/gate_checks/context_fallback_checks.py +697 -0
- vigil_forensic/gate_checks/context_health_checks.py +289 -0
- vigil_forensic/gate_checks/contract_shape_drift_checks.py +459 -0
- vigil_forensic/gate_checks/dirty_baseline_check.py +274 -0
- vigil_forensic/gate_checks/duplication_checks.py +387 -0
- vigil_forensic/gate_checks/embedded_string_checks.py +123 -0
- vigil_forensic/gate_checks/empty_output_checks.py +87 -0
- vigil_forensic/gate_checks/encoding_checks.py +847 -0
- vigil_forensic/gate_checks/export_completeness_checks.py +156 -0
- vigil_forensic/gate_checks/fallback_checks.py +41 -0
- vigil_forensic/gate_checks/file_proliferation_checks.py +171 -0
- vigil_forensic/gate_checks/fix_without_test_checks.py +69 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/__init__.py +9 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/_helpers.py +71 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/advanced_checks.py +322 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/core.py +273 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/integrity_checks.py +203 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/quality_checks.py +666 -0
- vigil_forensic/gate_checks/forensic_clusters/__init__.py +193 -0
- vigil_forensic/gate_checks/forensic_clusters/allowlist.py +426 -0
- vigil_forensic/gate_checks/forensic_clusters/allowlist_writer.py +302 -0
- vigil_forensic/gate_checks/forensic_clusters/api_protocol.py +231 -0
- vigil_forensic/gate_checks/forensic_clusters/async_quality.py +1156 -0
- vigil_forensic/gate_checks/forensic_clusters/code_style.py +808 -0
- vigil_forensic/gate_checks/forensic_clusters/core.py +319 -0
- vigil_forensic/gate_checks/forensic_clusters/data_quality.py +763 -0
- vigil_forensic/gate_checks/forensic_clusters/dead_code.py +480 -0
- vigil_forensic/gate_checks/forensic_clusters/edit_mutation.py +842 -0
- vigil_forensic/gate_checks/forensic_clusters/exception_boundary.py +240 -0
- vigil_forensic/gate_checks/forensic_clusters/legacy_debt.py +556 -0
- vigil_forensic/gate_checks/forensic_clusters/static_analysis.py +834 -0
- vigil_forensic/gate_checks/forensic_clusters/structural_quality.py +298 -0
- vigil_forensic/gate_checks/god_object_zones_checks.py +173 -0
- vigil_forensic/gate_checks/hallucination_checks.py +566 -0
- vigil_forensic/gate_checks/hunter_artifact_completeness_check.py +139 -0
- vigil_forensic/gate_checks/implementation_overfit_checks.py +380 -0
- vigil_forensic/gate_checks/import_integrity_checks.py +233 -0
- vigil_forensic/gate_checks/imports_in_function_checks.py +283 -0
- vigil_forensic/gate_checks/ml_checks.py +318 -0
- vigil_forensic/gate_checks/performance_checks.py +106 -0
- vigil_forensic/gate_checks/project_specific_runner.py +691 -0
- vigil_forensic/gate_checks/provider_capability_checks.py +73 -0
- vigil_forensic/gate_checks/refactor_completeness_checks.py +274 -0
- vigil_forensic/gate_checks/reliability_checks.py +389 -0
- vigil_forensic/gate_checks/reporting_checks.py +55 -0
- vigil_forensic/gate_checks/runtime_behavior_checks.py +220 -0
- vigil_forensic/gate_checks/security_injection_checks.py +332 -0
- vigil_forensic/gate_checks/semantic_intent_checks.py +139 -0
- vigil_forensic/gate_checks/size_complexity_checks.py +336 -0
- vigil_forensic/gate_checks/stuck_feature_flag_checks.py +354 -0
- vigil_forensic/gate_checks/syntax_validity_checks.py +217 -0
- vigil_forensic/gate_checks/temporal_freshness_checks.py +79 -0
- vigil_forensic/gate_checks/test_quality_checks.py +946 -0
- vigil_forensic/gate_checks/testing_checks.py +149 -0
- vigil_forensic/gate_checks/toctou_checks.py +367 -0
- vigil_forensic/gate_checks/type_checking_checks.py +316 -0
- vigil_forensic/gate_models.py +392 -0
- vigil_forensic/gate_packs/__init__.py +1 -0
- vigil_forensic/gate_packs/universal.py +179 -0
- vigil_forensic/gate_profile.json +31 -0
- vigil_forensic/gate_registry.py +21 -0
- vigil_forensic/language_profiles.py +219 -0
- vigil_forensic/meta_findings.py +207 -0
- vigil_forensic/self_audit.py +725 -0
- vigil_forensic/source_analysis.py +175 -0
- vigil_mapper/__init__.py +103 -0
- vigil_mapper/_ast_helpers_minimal.py +229 -0
- vigil_mapper/_extract_imports_impl.py +123 -0
- vigil_mapper/_file_count_guard.py +129 -0
- vigil_mapper/_git_utils.py +178 -0
- vigil_mapper/_runtime_ast.py +438 -0
- vigil_mapper/_runtime_dispatch.py +137 -0
- vigil_mapper/_seed_helpers.py +82 -0
- vigil_mapper/authority_builder.py +1102 -0
- vigil_mapper/cli_entry.py +731 -0
- vigil_mapper/conflict_builder.py +818 -0
- vigil_mapper/data_contract_builder.py +446 -0
- vigil_mapper/findings_builder.py +716 -0
- vigil_mapper/fingerprint.py +53 -0
- vigil_mapper/hotspot_builder.py +539 -0
- vigil_mapper/map_common.py +449 -0
- vigil_mapper/map_errors.py +55 -0
- vigil_mapper/map_models.py +431 -0
- vigil_mapper/map_models_ext.py +206 -0
- vigil_mapper/map_models_findings.py +130 -0
- vigil_mapper/map_storage.py +455 -0
- vigil_mapper/parse_cache.py +795 -0
- vigil_mapper/refactor_boundary_builder.py +266 -0
- vigil_mapper/runtime_builder.py +527 -0
- vigil_mapper/runtime_tracer.py +243 -0
- vigil_mapper/runtime_tracer_entry.py +199 -0
- vigil_mapper/semantic_diff.py +71 -0
- vigil_mapper/source_adapters/__init__.py +109 -0
- vigil_mapper/source_adapters/_base.py +264 -0
- vigil_mapper/source_adapters/_ir.py +156 -0
- vigil_mapper/source_adapters/_lexer.py +309 -0
- vigil_mapper/source_adapters/_patterns.py +212 -0
- vigil_mapper/source_adapters/_treesitter.py +182 -0
- vigil_mapper/source_adapters/go.py +553 -0
- vigil_mapper/source_adapters/java.py +541 -0
- vigil_mapper/source_adapters/javascript.py +626 -0
- vigil_mapper/source_adapters/python.py +325 -0
- vigil_mapper/source_adapters/typescript.py +749 -0
- vigil_mapper/structural_builder.py +586 -0
- vigil_mcp/__init__.py +1 -0
- vigil_mcp/_jobs.py +587 -0
- vigil_mcp/_paths.py +93 -0
- vigil_mcp/forensic_server.py +419 -0
- vigil_mcp/map_server.py +452 -0
|
@@ -0,0 +1,573 @@
|
|
|
1
|
+
"""Deployment-target detection for forensic gates (F19).
|
|
2
|
+
|
|
3
|
+
Some gates are only meaningful when code actually runs on a specific platform.
|
|
4
|
+
The canonical example is ``encoding.windows_unsafe_char`` — cp1252 console
|
|
5
|
+
crash risk only exists when the Python/shell/Java source actually executes on
|
|
6
|
+
a Windows host. A pure-Linux trading stack has no such risk, so the 1k+
|
|
7
|
+
findings the gate raises on Linux-deployed code are false positives.
|
|
8
|
+
|
|
9
|
+
This module implements a 3-layer cascade for detecting where a target project
|
|
10
|
+
actually runs:
|
|
11
|
+
|
|
12
|
+
Layer 3 — explicit override (highest precedence)
|
|
13
|
+
* ``<project>/.autoforensics/config.json`` with
|
|
14
|
+
``{"deployment_target": "linux-only" | "windows-only" | "cross-platform" | "auto"}``
|
|
15
|
+
* Environment variable ``AUTOFORENSICS_DEPLOYMENT=<value>``. The CLI flag
|
|
16
|
+
``--deployment-target`` is plumbed through the env var so a single
|
|
17
|
+
reader handles both cases.
|
|
18
|
+
|
|
19
|
+
Layer 2 — project-level signals (cached per project dir)
|
|
20
|
+
* ``pyproject.toml`` ``classifiers`` — "POSIX :: Linux" / "Microsoft ::
|
|
21
|
+
Windows" / "OS Independent".
|
|
22
|
+
* ``setup.py`` ``classifiers`` — same semantics, parsed via AST to avoid
|
|
23
|
+
executing arbitrary setup code.
|
|
24
|
+
* ``Dockerfile`` present in project root → container, usually Linux.
|
|
25
|
+
* ``.github/workflows/*.yml`` — if every job uses ``ubuntu-latest``
|
|
26
|
+
only, that is a Linux-only deployment signal.
|
|
27
|
+
* ``.bat`` / ``.ps1`` / ``.cmd`` / ``.psm1`` files present OUTSIDE dev
|
|
28
|
+
infra paths (``.venv``, ``venv``, ``node_modules``, etc.) → the project
|
|
29
|
+
is Windows-aware. Dev-infra-only Windows scripts do not change
|
|
30
|
+
deployment target.
|
|
31
|
+
|
|
32
|
+
Layer 1 — file-level hints (per-file, content-based)
|
|
33
|
+
* Shebang ``#!/usr/bin/env python`` / ``#!/bin/bash`` / ``#!/bin/sh`` →
|
|
34
|
+
Unix signal.
|
|
35
|
+
* Imports ``winreg`` / ``ctypes.windll`` / ``win32com`` / ``pywin32`` →
|
|
36
|
+
Windows signal.
|
|
37
|
+
* Imports ``fcntl`` / ``pwd`` / ``grp`` / ``resource`` / ``uvloop`` /
|
|
38
|
+
``daemonize`` → Unix signal.
|
|
39
|
+
* AST pattern ``if sys.platform == "win32":`` / ``if os.name == "nt":``
|
|
40
|
+
→ Windows-aware code (Windows signal).
|
|
41
|
+
|
|
42
|
+
File-level signals are score-based with a ±2 threshold: small accidental
|
|
43
|
+
matches (e.g. a docstring containing the word "winreg") do not flip the
|
|
44
|
+
classification.
|
|
45
|
+
|
|
46
|
+
Precedence (strictest → weakest): explicit > file > project > unknown.
|
|
47
|
+
|
|
48
|
+
Conservative default: when no layer decides, callers treat the target as
|
|
49
|
+
"unknown" and **scan** — a false positive is recoverable (suppression); a
|
|
50
|
+
false negative hides a real bug.
|
|
51
|
+
"""
|
|
52
|
+
from __future__ import annotations
|
|
53
|
+
|
|
54
|
+
import ast
|
|
55
|
+
import json
|
|
56
|
+
import logging
|
|
57
|
+
import os
|
|
58
|
+
import re
|
|
59
|
+
from pathlib import Path
|
|
60
|
+
from typing import Literal
|
|
61
|
+
|
|
62
|
+
_log = logging.getLogger(__name__)
|
|
63
|
+
|
|
64
|
+
# Closed set of values used for deployment targets in this module. "auto" is
# only meaningful as user input (it is normalised away before being returned);
# "unknown" is what project-level detection reports when nothing decides.
DeploymentTarget = Literal[
    "linux-only",
    "windows-only",
    "cross-platform",
    "auto",
    "unknown",
]

# Accepted values in config / env. "auto" means "fall through to signals".
_VALID_EXPLICIT: frozenset[str] = frozenset({
    "linux-only", "windows-only", "cross-platform", "auto",
})

# Module-level cache keyed by resolved project-dir string. Rubik has ~1958
# source files and every file triggers the encoding scan; we MUST NOT
# re-scan the project tree for every file.
_PROJECT_CACHE: dict[str, DeploymentTarget] = {}

# ---------------------------------------------------------------------------
# Layer 3 — explicit override
# ---------------------------------------------------------------------------

# Name of the environment variable that carries the explicit override.
_ENV_VAR = "AUTOFORENSICS_DEPLOYMENT"
# Config file location, relative to the project directory.
_CONFIG_REL = Path(".autoforensics") / "config.json"
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _normalize_explicit(value: str | None) -> DeploymentTarget | None:
|
|
91
|
+
"""Coerce a raw string into a valid DeploymentTarget or None.
|
|
92
|
+
|
|
93
|
+
Unknown / empty values return None so the caller falls through to the
|
|
94
|
+
next layer. "auto" returns None as well — the whole point of "auto" is
|
|
95
|
+
"let the detector decide".
|
|
96
|
+
"""
|
|
97
|
+
if not value:
|
|
98
|
+
return None
|
|
99
|
+
normalized = value.strip().lower()
|
|
100
|
+
if normalized == "auto":
|
|
101
|
+
return None
|
|
102
|
+
if normalized in _VALID_EXPLICIT:
|
|
103
|
+
return normalized # type: ignore[return-value]
|
|
104
|
+
_log.warning(
|
|
105
|
+
"AUTOFORENSICS: ignoring invalid deployment_target=%r (expected one of %s)",
|
|
106
|
+
value,
|
|
107
|
+
sorted(_VALID_EXPLICIT),
|
|
108
|
+
)
|
|
109
|
+
return None
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def get_explicit_deployment(project_dir: Path) -> DeploymentTarget | None:
    """Return explicit override (Layer 3) or None.

    The environment variable is consulted first and wins over the config
    file: the CLI flag is plumbed through the env var, so the most recent
    caller intention is honoured. The config file is only read when the env
    var yields nothing usable.

    A config file that cannot be read, is not valid UTF-8, is not valid
    JSON, is not a JSON object, or carries a non-string
    ``deployment_target`` is ignored (None is returned) so detection falls
    through to the signal layers instead of crashing.
    """
    from_env = _normalize_explicit(os.environ.get(_ENV_VAR))
    if from_env is not None:
        return from_env

    config_path = project_dir / _CONFIG_REL
    if not config_path.is_file():
        return None
    try:
        payload = json.loads(config_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError is the common base of json.JSONDecodeError and
        # UnicodeDecodeError; the latter is raised by read_text() for a
        # config saved in a non-UTF-8 encoding.
        _log.warning(
            "AUTOFORENSICS: cannot read %s (%s: %s) — falling through to signal detection",
            config_path, type(exc).__name__, exc,
        )
        return None
    if not isinstance(payload, dict):
        return None

    raw = payload.get("deployment_target")
    if raw is not None and not isinstance(raw, str):
        # JSON allows numbers / lists / booleans here; none of them can be
        # a valid target, and they must not reach string normalisation.
        _log.warning(
            "AUTOFORENSICS: ignoring non-string deployment_target=%r in %s",
            raw, config_path,
        )
        return None
    return _normalize_explicit(raw)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
# ---------------------------------------------------------------------------
|
|
142
|
+
# Layer 1 — file-level hints
|
|
143
|
+
# ---------------------------------------------------------------------------
|
|
144
|
+
|
|
145
|
+
# Minimum absolute score for a verdict: >= +2 means unix, <= -2 means windows.
# Anything in between is too weak a signal and is reported as unknown.
_FILE_THRESHOLD = 2

# All patterns live at module level so they are compiled once per process.
_SHEBANG_UNIX_RE = re.compile(
    r"^#!\s*/(?:usr/bin/env\s+(?:python\d*|bash|sh|zsh)|bin/(?:bash|sh|zsh))\b"
)

# Import patterns are anchored to the start of a line (after optional
# whitespace), so a module name mentioned mid-line does not count.
_UNIX_IMPORT_RE = re.compile(
    r"^\s*(?:from|import)\s+(fcntl|pwd|grp|resource|uvloop|daemonize|"
    r"termios|syslog|posix|spwd|crypt)\b",
    re.MULTILINE,
)
_WINDOWS_IMPORT_RE = re.compile(
    r"^\s*(?:from|import)\s+(winreg|_winreg|win32com|win32api|win32con|"
    r"win32process|win32security|pywin32|msvcrt|winsound)\b",
    re.MULTILINE,
)
# Windows-only attributes of the ctypes module.
_CTYPES_WIN_RE = re.compile(
    r"\bctypes\.(?:windll|WinDLL|oledll|OleDLL)\b"
)
# Explicit platform comparisons against the Windows identifiers.
_SYS_PLATFORM_WIN_RE = re.compile(
    r"\bsys\.platform\s*==\s*['\"]win32['\"]|"
    r"\bos\.name\s*==\s*['\"]nt['\"]"
)
# Explicit platform comparisons against the Linux / POSIX identifiers.
_SYS_PLATFORM_LINUX_RE = re.compile(
    r"\bsys\.platform\s*==\s*['\"]linux['\"]|"
    r"\bsys\.platform\.startswith\(\s*['\"]linux['\"]\s*\)|"
    r"\bos\.name\s*==\s*['\"]posix['\"]"
)


def detect_file_deployment(content: str) -> Literal["unix", "windows", "unknown"]:
    """Classify one file's text as unix-leaning, windows-leaning or undecided.

    Unix evidence and Windows evidence are tallied separately and then
    compared against ``_FILE_THRESHOLD``:

    * Unix shebang on the first line: 2 unix points.
    * Distinct Unix-only imports: 2 points each, at most 3 in total.
    * Distinct Windows-only imports: 2 points each, at most 3 in total.
    * ``ctypes.windll`` and friends: 1 windows point.
    * ``sys.platform == "win32"`` / ``os.name == "nt"``: 1 windows point.
    * Linux / POSIX platform checks: 1 unix point.

    Returns:
        "unix"    — unix points exceed windows points by at least 2
        "windows" — windows points exceed unix points by at least 2
        "unknown" — everything else, including empty content

    Opposing signals cancel out: a file importing both ``fcntl`` and
    ``winreg`` ends at zero and is left for the other layers to decide.
    """
    if not content:
        return "unknown"

    unix_points = 0
    windows_points = 0

    # Only the very first line can be a shebang.
    opening_line = content.split("\n", 1)[0]
    if _SHEBANG_UNIX_RE.match(opening_line):
        unix_points += 2

    # Deduplicate module names so repeated imports of one module count once;
    # the cap keeps import-heavy files from overwhelming other evidence.
    unix_modules = set(_UNIX_IMPORT_RE.findall(content))
    unix_points += min(2 * len(unix_modules), 3)
    windows_modules = set(_WINDOWS_IMPORT_RE.findall(content))
    windows_points += min(2 * len(windows_modules), 3)

    # Supporting hints, one point each: they show platform awareness rather
    # than a hard dependency.
    if _CTYPES_WIN_RE.search(content):
        windows_points += 1
    if _SYS_PLATFORM_WIN_RE.search(content):
        windows_points += 1
    if _SYS_PLATFORM_LINUX_RE.search(content):
        unix_points += 1

    balance = unix_points - windows_points
    if balance >= _FILE_THRESHOLD:
        return "unix"
    if balance <= -_FILE_THRESHOLD:
        return "windows"
    return "unknown"
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
# ---------------------------------------------------------------------------
|
|
244
|
+
# Layer 2 — project-level signals
|
|
245
|
+
# ---------------------------------------------------------------------------
|
|
246
|
+
|
|
247
|
+
# Directories whose contents do NOT represent the project's deployment
|
|
248
|
+
# target — virtualenvs, bundled vendor libs, build output, etc. .bat/.ps1
|
|
249
|
+
# files under these paths are dev/tooling artifacts, not a signal that the
|
|
250
|
+
# project itself targets Windows.
|
|
251
|
+
_IGNORED_DIR_PARTS: frozenset[str] = frozenset({
|
|
252
|
+
".venv", "venv", "env", ".env",
|
|
253
|
+
"node_modules",
|
|
254
|
+
"__pycache__",
|
|
255
|
+
".git", ".hg", ".svn",
|
|
256
|
+
"build", "dist",
|
|
257
|
+
".tox", ".mypy_cache", ".pytest_cache", ".ruff_cache",
|
|
258
|
+
"libs", # SYSTEM/libs vendor tree
|
|
259
|
+
".cortex",
|
|
260
|
+
})
|
|
261
|
+
|
|
262
|
+
_WINDOWS_SCRIPT_EXTS: tuple[str, ...] = (".bat", ".cmd", ".ps1", ".psm1")
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def _path_in_ignored_tree(rel_parts: tuple[str, ...]) -> bool:
|
|
266
|
+
return any(part in _IGNORED_DIR_PARTS for part in rel_parts)
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def _read_pyproject_classifiers(project_dir: Path) -> list[str]:
|
|
270
|
+
"""Return list of classifier strings from pyproject.toml, or [] if absent
|
|
271
|
+
/ unreadable. Uses tomllib (stdlib 3.11+)."""
|
|
272
|
+
path = project_dir / "pyproject.toml"
|
|
273
|
+
if not path.is_file():
|
|
274
|
+
return []
|
|
275
|
+
try:
|
|
276
|
+
import tomllib
|
|
277
|
+
except ImportError: # pragma: no cover — 3.11+ always has it
|
|
278
|
+
return []
|
|
279
|
+
try:
|
|
280
|
+
with path.open("rb") as fh:
|
|
281
|
+
data = tomllib.load(fh)
|
|
282
|
+
except (OSError, tomllib.TOMLDecodeError) as exc:
|
|
283
|
+
_log.debug("AUTOFORENSICS: cannot read %s (%s)", path, exc)
|
|
284
|
+
return []
|
|
285
|
+
project = data.get("project") or {}
|
|
286
|
+
classifiers = project.get("classifiers") or []
|
|
287
|
+
if not isinstance(classifiers, list):
|
|
288
|
+
return []
|
|
289
|
+
return [str(c) for c in classifiers]
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def _read_setuppy_classifiers(project_dir: Path) -> list[str]:
|
|
293
|
+
"""Extract classifiers list from setup.py via AST (no exec).
|
|
294
|
+
|
|
295
|
+
Returns [] if setup.py missing, unparseable, or has no ``classifiers=``
|
|
296
|
+
keyword on a setup() call.
|
|
297
|
+
"""
|
|
298
|
+
path = project_dir / "setup.py"
|
|
299
|
+
if not path.is_file():
|
|
300
|
+
return []
|
|
301
|
+
try:
|
|
302
|
+
source = path.read_text(encoding="utf-8", errors="replace")
|
|
303
|
+
tree = ast.parse(source)
|
|
304
|
+
except (OSError, SyntaxError) as exc:
|
|
305
|
+
_log.debug("AUTOFORENSICS: cannot parse %s (%s)", path, exc)
|
|
306
|
+
return []
|
|
307
|
+
for node in ast.walk(tree):
|
|
308
|
+
if not isinstance(node, ast.Call):
|
|
309
|
+
continue
|
|
310
|
+
func = node.func
|
|
311
|
+
# Match setup(...) — either bare name or <module>.setup.
|
|
312
|
+
if isinstance(func, ast.Name) and func.id == "setup":
|
|
313
|
+
pass
|
|
314
|
+
elif isinstance(func, ast.Attribute) and func.attr == "setup":
|
|
315
|
+
pass
|
|
316
|
+
else:
|
|
317
|
+
continue
|
|
318
|
+
for kw in node.keywords:
|
|
319
|
+
if kw.arg != "classifiers":
|
|
320
|
+
continue
|
|
321
|
+
if not isinstance(kw.value, (ast.List, ast.Tuple)):
|
|
322
|
+
continue
|
|
323
|
+
result: list[str] = []
|
|
324
|
+
for elt in kw.value.elts:
|
|
325
|
+
if isinstance(elt, ast.Constant) and isinstance(elt.value, str):
|
|
326
|
+
result.append(elt.value)
|
|
327
|
+
return result
|
|
328
|
+
return []
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def _classify_from_classifiers(classifiers: list[str]) -> DeploymentTarget | None:
|
|
332
|
+
"""Map Python trove classifiers to a deployment target.
|
|
333
|
+
|
|
334
|
+
Matches Trove strings like:
|
|
335
|
+
* "Operating System :: POSIX :: Linux"
|
|
336
|
+
* "Operating System :: Microsoft :: Windows"
|
|
337
|
+
* "Operating System :: OS Independent"
|
|
338
|
+
"""
|
|
339
|
+
if not classifiers:
|
|
340
|
+
return None
|
|
341
|
+
has_linux = any("POSIX" in c or "Linux" in c for c in classifiers)
|
|
342
|
+
has_windows = any("Microsoft" in c or "Windows" in c for c in classifiers)
|
|
343
|
+
has_independent = any("OS Independent" in c for c in classifiers)
|
|
344
|
+
if has_independent:
|
|
345
|
+
return "cross-platform"
|
|
346
|
+
if has_linux and has_windows:
|
|
347
|
+
return "cross-platform"
|
|
348
|
+
if has_linux:
|
|
349
|
+
return "linux-only"
|
|
350
|
+
if has_windows:
|
|
351
|
+
return "windows-only"
|
|
352
|
+
return None
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
def _has_dockerfile(project_dir: Path) -> bool:
|
|
356
|
+
"""True when a Dockerfile exists at the project root (case-insensitive
|
|
357
|
+
for the basename)."""
|
|
358
|
+
for name in ("Dockerfile", "dockerfile", "Dockerfile.prod", "Dockerfile.ci"):
|
|
359
|
+
if (project_dir / name).is_file():
|
|
360
|
+
return True
|
|
361
|
+
# Some projects put it under build/ or docker/ — accept up to depth 2.
|
|
362
|
+
for pattern in ("**/Dockerfile", "**/Dockerfile.*"):
|
|
363
|
+
for candidate in project_dir.glob(pattern):
|
|
364
|
+
try:
|
|
365
|
+
rel = candidate.relative_to(project_dir)
|
|
366
|
+
except ValueError:
|
|
367
|
+
continue
|
|
368
|
+
if _path_in_ignored_tree(rel.parts):
|
|
369
|
+
continue
|
|
370
|
+
return True
|
|
371
|
+
return False
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
_GHA_JOB_OS_RE = re.compile(
|
|
375
|
+
r"^\s*runs-on\s*:\s*[\"']?([A-Za-z0-9._-]+)[\"']?",
|
|
376
|
+
re.MULTILINE,
|
|
377
|
+
)
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def _gha_runners(project_dir: Path) -> set[str]:
|
|
381
|
+
"""Return the set of all ``runs-on`` values used across GitHub Actions
|
|
382
|
+
workflows. Empty set when no workflows present.
|
|
383
|
+
|
|
384
|
+
We avoid a YAML dependency by matching a simple regex — ``runs-on`` is
|
|
385
|
+
conventionally a single-line scalar. Matrix expressions (``${{...}}``)
|
|
386
|
+
are returned verbatim; callers treat non-ubuntu/windows/macos values as
|
|
387
|
+
unknown runners.
|
|
388
|
+
"""
|
|
389
|
+
wf_dir = project_dir / ".github" / "workflows"
|
|
390
|
+
if not wf_dir.is_dir():
|
|
391
|
+
return set()
|
|
392
|
+
runners: set[str] = set()
|
|
393
|
+
for wf in wf_dir.glob("*.yml"):
|
|
394
|
+
try:
|
|
395
|
+
text = wf.read_text(encoding="utf-8", errors="replace")
|
|
396
|
+
except OSError:
|
|
397
|
+
continue
|
|
398
|
+
for m in _GHA_JOB_OS_RE.finditer(text):
|
|
399
|
+
runners.add(m.group(1).lower())
|
|
400
|
+
for wf in wf_dir.glob("*.yaml"):
|
|
401
|
+
try:
|
|
402
|
+
text = wf.read_text(encoding="utf-8", errors="replace")
|
|
403
|
+
except OSError:
|
|
404
|
+
continue
|
|
405
|
+
for m in _GHA_JOB_OS_RE.finditer(text):
|
|
406
|
+
runners.add(m.group(1).lower())
|
|
407
|
+
return runners
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
def _windows_scripts_outside_dev(project_dir: Path, cap: int = 4) -> bool:
    """True when ≥1 .bat/.ps1/.cmd/.psm1 file exists outside dev-infra trees.

    The answer only depends on whether at least one qualifying file exists,
    so the scan stops at the first hit instead of walking the rest of the
    tree.

    ``cap`` is accepted for backward compatibility with existing callers
    and no longer influences the scan: the result never depended on it,
    only the amount of work done did.
    """
    for ext in _WINDOWS_SCRIPT_EXTS:
        for path in project_dir.rglob(f"*{ext}"):
            try:
                rel = path.relative_to(project_dir)
            except ValueError:
                continue
            if not _path_in_ignored_tree(rel.parts):
                return True
    return False
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
def _linux_only_deps_in_requirements(project_dir: Path) -> bool:
|
|
432
|
+
"""True when requirements*.txt lists a Linux-exclusive package (uvloop,
|
|
433
|
+
daemonize, sdnotify, systemd-python, etc.)."""
|
|
434
|
+
linux_pkgs = ("uvloop", "daemonize", "sdnotify", "systemd-python", "python-systemd")
|
|
435
|
+
for pattern in ("requirements.txt", "requirements-*.txt", "requirements/*.txt"):
|
|
436
|
+
for path in project_dir.glob(pattern):
|
|
437
|
+
try:
|
|
438
|
+
text = path.read_text(encoding="utf-8", errors="replace")
|
|
439
|
+
except OSError:
|
|
440
|
+
continue
|
|
441
|
+
for pkg in linux_pkgs:
|
|
442
|
+
if re.search(rf"(?m)^\s*{re.escape(pkg)}\b", text):
|
|
443
|
+
return True
|
|
444
|
+
return False
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
def _detect_project_uncached(project_dir: Path) -> DeploymentTarget:
    """Run Layer 2 detection (no cache). Returns 'unknown' when no signal
    decides.

    Signals are tried in priority order and the first one that decides wins:
      1. pyproject.toml / setup.py classifiers (authoritative upstream
         metadata; setup.py is only consulted when pyproject.toml yields no
         classifiers).
      2. GitHub Actions runners (deployment-test target).
      3. Dockerfile (containerisation signal) → linux-only.
      4. Linux-exclusive deps in requirements*.txt → linux-only.
      5. Windows scripts outside dev-infra → cross-platform.
    """
    # 1. Classifiers — most authoritative.
    classifiers = _read_pyproject_classifiers(project_dir)
    if not classifiers:
        classifiers = _read_setuppy_classifiers(project_dir)
    decision = _classify_from_classifiers(classifiers)
    if decision is not None:
        _log.debug(
            "AUTOFORENSICS: %s classified %s via pyproject/setup.py classifiers",
            project_dir, decision,
        )
        return decision

    # 2. GitHub Actions runners.
    runners = _gha_runners(project_dir)
    if runners:
        has_ubuntu = any(r.startswith("ubuntu") for r in runners)
        has_windows = any(r.startswith("windows") for r in runners)
        # Unknown runners (matrix expressions, self-hosted) count as
        # "unresolved" — we do not force a linux-only conclusion when they
        # appear alongside ubuntu.
        has_unknown = any(
            not r.startswith(("ubuntu", "windows", "macos")) for r in runners
        )
        if has_windows and not has_ubuntu:
            return "windows-only"
        if has_windows:
            # CI exercises both Linux and Windows. Falling through here
            # would let a Dockerfile or a requirements entry label the
            # project linux-only and hide Windows-specific findings.
            return "cross-platform"
        if has_ubuntu and not has_unknown:
            # Linux-only CI is a deployment signal. macOS runners alongside
            # ubuntu still indicate a Unix-only test matrix — classify as
            # linux-only for encoding-gate purposes (macOS console is UTF-8
            # by default, not cp1252).
            return "linux-only"

    # 3. Dockerfile → container → Linux in the overwhelming majority of
    #    cases. We do not downgrade for the rare Windows-container project;
    #    callers can override via explicit config.
    if _has_dockerfile(project_dir):
        return "linux-only"

    # 4. Linux-exclusive deps.
    if _linux_only_deps_in_requirements(project_dir):
        return "linux-only"

    # 5. Windows scripts outside dev-infra → at least cross-platform.
    #    This only fires when steps 1–4 did not decide. A project with a
    #    start_vigil.bat launcher but no deployment metadata is assumed to be
    #    Windows-aware.
    if _windows_scripts_outside_dev(project_dir):
        return "cross-platform"

    return "unknown"
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
def detect_project_deployment(project_dir: Path) -> DeploymentTarget:
    """Cached entry-point for Layer 2 detection.

    The result of ``_detect_project_uncached`` is memoised in
    ``_PROJECT_CACHE`` under the resolved project path. A rubik-scale
    project (~2000 files) asks this function once per file; we MUST
    amortise.

    The key is normalised with ``os.path.normcase``, which folds case only
    on platforms whose filesystems are case-insensitive. Lower-casing
    unconditionally would make ``/srv/App`` and ``/srv/app`` — two different
    directories on Linux — share one cache entry.
    """
    try:
        resolved = project_dir.resolve()
    except (OSError, RuntimeError):
        # RuntimeError: symlink loop on interpreters that report it that way.
        resolved = project_dir
    key = os.path.normcase(str(resolved))

    cached = _PROJECT_CACHE.get(key)
    if cached is None:
        cached = _detect_project_uncached(project_dir)
        _PROJECT_CACHE[key] = cached
    return cached
|
|
529
|
+
|
|
530
|
+
|
|
531
|
+
def clear_project_cache() -> None:
    """Drop all memoised project-level detections. Intended for tests.

    Empties the module-level ``_PROJECT_CACHE`` dict in place; nothing else
    is reset.
    """
    _PROJECT_CACHE.clear()
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
# ---------------------------------------------------------------------------
|
|
537
|
+
# Cascade — the single entrypoint callers should use
|
|
538
|
+
# ---------------------------------------------------------------------------
|
|
539
|
+
|
|
540
|
+
def resolve_deployment(
    project_dir: Path,
    file_content: str | None = None,
) -> DeploymentTarget:
    """Resolve a deployment target by walking the layers in precedence order.

    The first layer that produces an answer wins:
      1. Explicit override from the env var or config.json.
      2. File-level classification, only when ``file_content`` is given and
         it classifies as 'unix' (→ "linux-only") or 'windows'
         (→ "windows-only").
      3. Cached project-level detection, which itself may return
         'unknown' — the caller decides how to handle that (conservative
         default: scan).
    """
    override = get_explicit_deployment(project_dir)
    if override is not None:
        return override

    if file_content is not None:
        # An 'unknown' file verdict has no entry here and falls through.
        per_file_target = {
            "unix": "linux-only",
            "windows": "windows-only",
        }.get(detect_file_deployment(file_content))
        if per_file_target is not None:
            return per_file_target  # type: ignore[return-value]

    return detect_project_deployment(project_dir)
|
|
564
|
+
|
|
565
|
+
|
|
566
|
+
# Names exported by ``from <module> import *``; the underscore-prefixed
# helpers and constants are deliberately left out.
__all__ = [
    "DeploymentTarget",
    "get_explicit_deployment",
    "detect_file_deployment",
    "detect_project_deployment",
    "clear_project_cache",
    "resolve_deployment",
]
|