vigil-codeintel 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vigil_codeintel-0.1.0.dist-info/METADATA +780 -0
- vigil_codeintel-0.1.0.dist-info/RECORD +131 -0
- vigil_codeintel-0.1.0.dist-info/WHEEL +5 -0
- vigil_codeintel-0.1.0.dist-info/entry_points.txt +3 -0
- vigil_codeintel-0.1.0.dist-info/licenses/LICENSE +21 -0
- vigil_codeintel-0.1.0.dist-info/top_level.txt +3 -0
- vigil_forensic/__init__.py +224 -0
- vigil_forensic/_git_utils.py +178 -0
- vigil_forensic/_shared.py +510 -0
- vigil_forensic/_stubs.py +156 -0
- vigil_forensic/gate_checks/__init__.py +1 -0
- vigil_forensic/gate_checks/_ast_helpers.py +629 -0
- vigil_forensic/gate_checks/_deployment_detector.py +573 -0
- vigil_forensic/gate_checks/atomic_write_checks.py +1143 -0
- vigil_forensic/gate_checks/authority_checks.py +95 -0
- vigil_forensic/gate_checks/boundary_breach_checks.py +202 -0
- vigil_forensic/gate_checks/broad_except_checks.py +301 -0
- vigil_forensic/gate_checks/broad_except_hidden_sentinel_checks.py +365 -0
- vigil_forensic/gate_checks/common.py +253 -0
- vigil_forensic/gate_checks/config_safety_checks.py +704 -0
- vigil_forensic/gate_checks/config_ssot_checks.py +78 -0
- vigil_forensic/gate_checks/conflict_checks.py +193 -0
- vigil_forensic/gate_checks/context_fallback_checks.py +697 -0
- vigil_forensic/gate_checks/context_health_checks.py +289 -0
- vigil_forensic/gate_checks/contract_shape_drift_checks.py +459 -0
- vigil_forensic/gate_checks/dirty_baseline_check.py +274 -0
- vigil_forensic/gate_checks/duplication_checks.py +387 -0
- vigil_forensic/gate_checks/embedded_string_checks.py +123 -0
- vigil_forensic/gate_checks/empty_output_checks.py +87 -0
- vigil_forensic/gate_checks/encoding_checks.py +847 -0
- vigil_forensic/gate_checks/export_completeness_checks.py +156 -0
- vigil_forensic/gate_checks/fallback_checks.py +41 -0
- vigil_forensic/gate_checks/file_proliferation_checks.py +171 -0
- vigil_forensic/gate_checks/fix_without_test_checks.py +69 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/__init__.py +9 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/_helpers.py +71 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/advanced_checks.py +322 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/core.py +273 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/integrity_checks.py +203 -0
- vigil_forensic/gate_checks/forensic_cluster_runners/quality_checks.py +666 -0
- vigil_forensic/gate_checks/forensic_clusters/__init__.py +193 -0
- vigil_forensic/gate_checks/forensic_clusters/allowlist.py +426 -0
- vigil_forensic/gate_checks/forensic_clusters/allowlist_writer.py +302 -0
- vigil_forensic/gate_checks/forensic_clusters/api_protocol.py +231 -0
- vigil_forensic/gate_checks/forensic_clusters/async_quality.py +1156 -0
- vigil_forensic/gate_checks/forensic_clusters/code_style.py +808 -0
- vigil_forensic/gate_checks/forensic_clusters/core.py +319 -0
- vigil_forensic/gate_checks/forensic_clusters/data_quality.py +763 -0
- vigil_forensic/gate_checks/forensic_clusters/dead_code.py +480 -0
- vigil_forensic/gate_checks/forensic_clusters/edit_mutation.py +842 -0
- vigil_forensic/gate_checks/forensic_clusters/exception_boundary.py +240 -0
- vigil_forensic/gate_checks/forensic_clusters/legacy_debt.py +556 -0
- vigil_forensic/gate_checks/forensic_clusters/static_analysis.py +834 -0
- vigil_forensic/gate_checks/forensic_clusters/structural_quality.py +298 -0
- vigil_forensic/gate_checks/god_object_zones_checks.py +173 -0
- vigil_forensic/gate_checks/hallucination_checks.py +566 -0
- vigil_forensic/gate_checks/hunter_artifact_completeness_check.py +139 -0
- vigil_forensic/gate_checks/implementation_overfit_checks.py +380 -0
- vigil_forensic/gate_checks/import_integrity_checks.py +233 -0
- vigil_forensic/gate_checks/imports_in_function_checks.py +283 -0
- vigil_forensic/gate_checks/ml_checks.py +318 -0
- vigil_forensic/gate_checks/performance_checks.py +106 -0
- vigil_forensic/gate_checks/project_specific_runner.py +691 -0
- vigil_forensic/gate_checks/provider_capability_checks.py +73 -0
- vigil_forensic/gate_checks/refactor_completeness_checks.py +274 -0
- vigil_forensic/gate_checks/reliability_checks.py +389 -0
- vigil_forensic/gate_checks/reporting_checks.py +55 -0
- vigil_forensic/gate_checks/runtime_behavior_checks.py +220 -0
- vigil_forensic/gate_checks/security_injection_checks.py +332 -0
- vigil_forensic/gate_checks/semantic_intent_checks.py +139 -0
- vigil_forensic/gate_checks/size_complexity_checks.py +336 -0
- vigil_forensic/gate_checks/stuck_feature_flag_checks.py +354 -0
- vigil_forensic/gate_checks/syntax_validity_checks.py +217 -0
- vigil_forensic/gate_checks/temporal_freshness_checks.py +79 -0
- vigil_forensic/gate_checks/test_quality_checks.py +946 -0
- vigil_forensic/gate_checks/testing_checks.py +149 -0
- vigil_forensic/gate_checks/toctou_checks.py +367 -0
- vigil_forensic/gate_checks/type_checking_checks.py +316 -0
- vigil_forensic/gate_models.py +392 -0
- vigil_forensic/gate_packs/__init__.py +1 -0
- vigil_forensic/gate_packs/universal.py +179 -0
- vigil_forensic/gate_profile.json +31 -0
- vigil_forensic/gate_registry.py +21 -0
- vigil_forensic/language_profiles.py +219 -0
- vigil_forensic/meta_findings.py +207 -0
- vigil_forensic/self_audit.py +725 -0
- vigil_forensic/source_analysis.py +175 -0
- vigil_mapper/__init__.py +103 -0
- vigil_mapper/_ast_helpers_minimal.py +229 -0
- vigil_mapper/_extract_imports_impl.py +123 -0
- vigil_mapper/_file_count_guard.py +129 -0
- vigil_mapper/_git_utils.py +178 -0
- vigil_mapper/_runtime_ast.py +438 -0
- vigil_mapper/_runtime_dispatch.py +137 -0
- vigil_mapper/_seed_helpers.py +82 -0
- vigil_mapper/authority_builder.py +1102 -0
- vigil_mapper/cli_entry.py +731 -0
- vigil_mapper/conflict_builder.py +818 -0
- vigil_mapper/data_contract_builder.py +446 -0
- vigil_mapper/findings_builder.py +716 -0
- vigil_mapper/fingerprint.py +53 -0
- vigil_mapper/hotspot_builder.py +539 -0
- vigil_mapper/map_common.py +449 -0
- vigil_mapper/map_errors.py +55 -0
- vigil_mapper/map_models.py +431 -0
- vigil_mapper/map_models_ext.py +206 -0
- vigil_mapper/map_models_findings.py +130 -0
- vigil_mapper/map_storage.py +455 -0
- vigil_mapper/parse_cache.py +795 -0
- vigil_mapper/refactor_boundary_builder.py +266 -0
- vigil_mapper/runtime_builder.py +527 -0
- vigil_mapper/runtime_tracer.py +243 -0
- vigil_mapper/runtime_tracer_entry.py +199 -0
- vigil_mapper/semantic_diff.py +71 -0
- vigil_mapper/source_adapters/__init__.py +109 -0
- vigil_mapper/source_adapters/_base.py +264 -0
- vigil_mapper/source_adapters/_ir.py +156 -0
- vigil_mapper/source_adapters/_lexer.py +309 -0
- vigil_mapper/source_adapters/_patterns.py +212 -0
- vigil_mapper/source_adapters/_treesitter.py +182 -0
- vigil_mapper/source_adapters/go.py +553 -0
- vigil_mapper/source_adapters/java.py +541 -0
- vigil_mapper/source_adapters/javascript.py +626 -0
- vigil_mapper/source_adapters/python.py +325 -0
- vigil_mapper/source_adapters/typescript.py +749 -0
- vigil_mapper/structural_builder.py +586 -0
- vigil_mcp/__init__.py +1 -0
- vigil_mcp/_jobs.py +587 -0
- vigil_mcp/_paths.py +93 -0
- vigil_mcp/forensic_server.py +419 -0
- vigil_mcp/map_server.py +452 -0
|
@@ -0,0 +1,847 @@
|
|
|
1
|
+
"""Detect non-ASCII characters that crash Windows console (cp1252).
|
|
2
|
+
|
|
3
|
+
Windows default console encoding cannot render emoji, box-drawing, arrows,
|
|
4
|
+
smart quotes. Python print/raise/log crashes with UnicodeEncodeError.
|
|
5
|
+
This is a universal check -- applies to any project running on Windows.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import ast
|
|
10
|
+
import re
|
|
11
|
+
import sys
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
from vigil_forensic._shared import WINDOWS_CLI_RUNTIME_EXTENSIONS as _WINDOWS_CLI_RUNTIME_EXTENSIONS
|
|
15
|
+
from vigil_forensic._shared import EvidenceReference, GateCategory, GateImpact, GateSeverity, RepairKind
|
|
16
|
+
from vigil_forensic.gate_models import PostExecGateContext
|
|
17
|
+
from ..source_analysis import is_source_file
|
|
18
|
+
from .common import build_check_result, build_finding, iter_touched_snapshots
|
|
19
|
+
from ._deployment_detector import (
|
|
20
|
+
detect_file_deployment,
|
|
21
|
+
detect_project_deployment,
|
|
22
|
+
get_explicit_deployment,
|
|
23
|
+
)
|
|
24
|
+
import logging
|
|
25
|
+
_log = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
# subprocess calls that may need encoding=
|
|
28
|
+
_SUBPROCESS_CALL_RE = re.compile(
|
|
29
|
+
r'\bsubprocess\.(run|Popen|check_output|check_call)\s*\(',
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
# Characters above the Latin-1 block (U+0100 and up), INCLUDING the
# supplementary planes (U+10000-U+10FFFF) where most emoji live.  Python
# strings are sequences of code points, so an astral emoji such as U+1F600 is
# a single character; a class that stops at U+FFFF would silently miss it even
# though the high-risk table in this module names an emoji range at
# 0x1F300-0x1F9FF.
_DANGEROUS_RE = re.compile(r'[\u0100-\U0010ffff]')
|
|
34
|
+
|
|
35
|
+
# Pure comment lines — safe, never reach stdout.
|
|
36
|
+
# Python/shell/PowerShell use "#", JS/TS/Go/Java/C* use "//", SQL uses "--".
|
|
37
|
+
# Keyed by file extension (lowercase). Extensions not listed fall back to
|
|
38
|
+
# scanning every line (the textual sink detector then filters out lines that
|
|
39
|
+
# don't contain a recognized output function, so noise stays low).
|
|
40
|
+
_COMMENT_PREFIXES_BY_EXT: dict[str, tuple[str, ...]] = {
|
|
41
|
+
".py": ("#",),
|
|
42
|
+
".ps1": ("#",),
|
|
43
|
+
".sh": ("#",),
|
|
44
|
+
".bash": ("#",),
|
|
45
|
+
".bat": ("rem ", "REM ", "::"),
|
|
46
|
+
".cmd": ("rem ", "REM ", "::"),
|
|
47
|
+
".js": ("//",),
|
|
48
|
+
".mjs": ("//",),
|
|
49
|
+
".cjs": ("//",),
|
|
50
|
+
".ts": ("//",),
|
|
51
|
+
".tsx": ("//",),
|
|
52
|
+
".go": ("//",),
|
|
53
|
+
".java": ("//",),
|
|
54
|
+
".sql": ("--",),
|
|
55
|
+
".ini": (";",),
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
# Legacy Python-comment regex (kept for back-compat with Python-AST path).
|
|
59
|
+
_COMMENT_RE = re.compile(r'^\s*#')
|
|
60
|
+
|
|
61
|
+
# High-risk Unicode ranges with human-readable names
|
|
62
|
+
_HIGH_RISK_RANGES = (
|
|
63
|
+
(0x2500, 0x257F, "box-drawing"),
|
|
64
|
+
(0x2190, 0x21FF, "arrows"),
|
|
65
|
+
(0x2014, 0x2014, "em-dash"),
|
|
66
|
+
(0x2013, 0x2013, "en-dash"),
|
|
67
|
+
(0x201C, 0x201D, "smart-quotes"),
|
|
68
|
+
(0x2018, 0x2019, "smart-apostrophes"),
|
|
69
|
+
(0x2705, 0x2705, "checkmark-emoji"),
|
|
70
|
+
(0x274C, 0x274C, "cross-emoji"),
|
|
71
|
+
(0x1F300, 0x1F9FF, "emoji"),
|
|
72
|
+
(0x2600, 0x26FF, "misc-symbols"),
|
|
73
|
+
(0x2700, 0x27BF, "dingbats"),
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
# Keywords that indicate text reaches stdout/stderr directly — HIGH risk on cp1252
|
|
77
|
+
_STDOUT_SINK_RE = re.compile(
|
|
78
|
+
r'print\s*\(|sys\.stdout\.write\s*\(|sys\.stderr\.write\s*\('
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
# Keywords that indicate logging sinks — bytes go through Python logging, not console codec.
# The method alternation mirrors the method names the AST-based classifier in
# this module accepts as logger calls, including the ``warn`` alias and the
# generic ``log(level, msg)`` entry point, so both detection paths agree.
# The leading ``(?:^|[^a-zA-Z_])`` guard keeps identifiers that merely END in a
# receiver name (e.g. ``catalog.info(``) from matching.
_LOGGING_SINK_RE = re.compile(
    r'(?:^|[^a-zA-Z_])(?:logging|_log|_logger|log|logger)\s*\.\s*'
    r'(?:debug|info|warning|warn|error|critical|exception|log)\s*\('
)
|
|
85
|
+
|
|
86
|
+
# Cross-language console-output substrings — used by the textual fallback
|
|
87
|
+
# sink detector when AST parsing is unavailable (non-Python files, or Python
|
|
88
|
+
# files with a syntax error). Matching is substring-based on the already-
|
|
89
|
+
# uncommented line, so ordering/anchoring is not required.
|
|
90
|
+
# HIGH: anything that writes to stdout/stderr and therefore hits the cp1252
|
|
91
|
+
# console codec on Windows.
|
|
92
|
+
_TEXTUAL_STDOUT_SINKS: tuple[str, ...] = (
|
|
93
|
+
"print(", # Python, also common in JS/TS via CommonJS
|
|
94
|
+
"console.log(", # JS/TS
|
|
95
|
+
"console.error(", # JS/TS — writes to stderr, still hits cp1252 console
|
|
96
|
+
"console.warn(", # JS/TS — writes to stderr on Node; HIGH on Windows Node
|
|
97
|
+
"process.stdout.write(", # Node.js
|
|
98
|
+
"process.stderr.write(", # Node.js
|
|
99
|
+
"stderr.write(", # Generic (covers sys.stderr.write and Go os.Stderr.Write alias-style)
|
|
100
|
+
"sys.stdout.write(", # Python
|
|
101
|
+
"sys.stderr.write(", # Python
|
|
102
|
+
"printf ", # POSIX shell printf builtin (with space)
|
|
103
|
+
"printf(", # C / Go fmt.Printf when imported unqualified
|
|
104
|
+
"fmt.Print", # Go — matches fmt.Print, fmt.Println, fmt.Printf (substring)
|
|
105
|
+
"fmt.Fprint", # Go — matches fmt.Fprint, fmt.Fprintln, fmt.Fprintf
|
|
106
|
+
"log.Print", # Go standard library log package (Print, Println, Printf)
|
|
107
|
+
"System.out.print", # Java (matches print and println)
|
|
108
|
+
"System.err.print", # Java
|
|
109
|
+
"Write-Host", # PowerShell — writes to host, bypasses codec-safe stream
|
|
110
|
+
"Write-Output", # PowerShell — piped, but rendered when terminal sink
|
|
111
|
+
"Write-Error", # PowerShell — writes to error stream, hits cp1252 console
|
|
112
|
+
"echo ", # POSIX shell / batch
|
|
113
|
+
"echo\t", # POSIX shell with tab between echo and args
|
|
114
|
+
"echo(", # Some shells
|
|
115
|
+
)
|
|
116
|
+
# MEDIUM: logger-style sinks — encoding usually handled internally, but stale
|
|
117
|
+
# tooling may still barf; rated MEDIUM (WARN) just like the Python AST path.
|
|
118
|
+
_TEXTUAL_LOGGER_SINKS: tuple[str, ...] = (
|
|
119
|
+
"log.info", "log.debug", "log.warn", "log.warning", "log.error", "log.critical", "log.exception",
|
|
120
|
+
"logger.info", "logger.debug", "logger.warn", "logger.warning", "logger.error", "logger.critical", "logger.exception",
|
|
121
|
+
"console.info(", "console.debug(",
|
|
122
|
+
"Write-Verbose",
|
|
123
|
+
"Write-Warning",
|
|
124
|
+
"Write-Information",
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
# F18b / Sprint C3 (2026-04-23): the canonical whitelist of extensions whose
# runtime output passes through a locale-dependent console lives in
# ``vigil_forensic._shared.WINDOWS_CLI_RUNTIME_EXTENSIONS``. It is imported at
# the top of this module under the private alias
# ``_WINDOWS_CLI_RUNTIME_EXTENSIONS`` so the Layer 1 extension filter below
# continues to resolve the private name.
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _should_scan_for_encoding(
    rel_path: str,
    content: str | None = None,
    project_dir: Path | None = None,
) -> bool:
    """Decide whether *rel_path* is in scope for the encoding gate.

    The decision is made in two stages, evaluated in order:

    1. **Extension whitelist (F18b).** The lower-cased path must end with one
       of ``_WINDOWS_CLI_RUNTIME_EXTENSIONS`` (runtimes whose output passes
       through a locale-dependent console, cp1252 on Windows). Anything else
       is out of scope regardless of deployment.

    2. **Deployment cascade (F19).** Only consulted when *project_dir* is
       given. If the project (or the file) targets Linux/Unix only, a cp1252
       crash cannot happen and the scan is skipped. Precedence is
       explicit > file > project:

       * explicit override via ``get_explicit_deployment`` (documented
         upstream as ``.autoforensics/config.json`` /
         ``AUTOFORENSICS_DEPLOYMENT`` env var);
       * per-file signal via ``detect_file_deployment`` (documented upstream
         as shebang, Unix/Windows imports, ``sys.platform`` guards), used
         only when *content* is non-empty;
       * project-level signal via ``detect_project_deployment`` (documented
         upstream as pyproject classifiers, Dockerfile, CI runners,
         Linux-exclusive dependencies).

    When nothing is conclusive the file IS scanned: a false positive can be
    allowlisted, a false negative hides a real bug.

    Args:
        rel_path: Path of the candidate file; only its suffix is inspected.
        content: File text for the per-file signal; ``None``/empty skips it.
        project_dir: Project root. ``None`` (legacy callers / tests) keeps
            the pre-F19 behaviour: scan whenever the extension matches.

    Returns:
        ``True`` when the file should be scanned, ``False`` otherwise.
    """
    path_lower = rel_path.lower()
    if not any(path_lower.endswith(suffix) for suffix in _WINDOWS_CLI_RUNTIME_EXTENSIONS):
        return False

    # Legacy caller / tests that do not propagate project_dir: extension
    # match alone is enough.
    if project_dir is None:
        return True

    # Explicit override wins over file and project signals; only
    # "linux-only" suppresses the scan.
    override = get_explicit_deployment(project_dir)
    if override is not None:
        return override != "linux-only"

    # Per-file signal: a clearly Unix file is skipped, a clearly Windows file
    # is always scanned; any other answer falls through to the project level.
    if content:
        per_file = detect_file_deployment(content)
        if per_file == "unix":
            return False
        if per_file == "windows":
            return True

    # Project-level signal: everything except "linux-only" scans
    # (conservative default for windows-only / cross-platform / unknown).
    return detect_project_deployment(project_dir) != "linux-only"
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _classify_textual_sink(line: str) -> str | None:
    """Classify a source line as a console sink by plain substring search.

    The line is checked against ``_TEXTUAL_STDOUT_SINKS`` first and
    ``_TEXTUAL_LOGGER_SINKS`` second, so a line containing both kinds of
    marker is reported as ``'stdout'``.

    Returns:
        ``'stdout'``, ``'logger'``, or ``None`` when no marker occurs.
    """
    if any(marker in line for marker in _TEXTUAL_STDOUT_SINKS):
        return "stdout"
    if any(marker in line for marker in _TEXTUAL_LOGGER_SINKS):
        return "logger"
    return None
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def _is_comment_line(line: str, ext: str) -> bool:
    """Tell whether *line* is a whole-line comment for files of type *ext*.

    The extension is lower-cased and looked up in
    ``_COMMENT_PREFIXES_BY_EXT``. Leading whitespace is ignored; the rest of
    the line must start with one of the registered prefixes.

    Returns:
        ``True`` only for a non-blank line that begins with a known comment
        prefix. Unknown extensions and blank lines yield ``False``.
    """
    markers = _COMMENT_PREFIXES_BY_EXT.get(ext.lower())
    if not markers:
        return False
    body = line.lstrip()
    # str.startswith accepts the tuple of prefixes directly.
    return bool(body) and body.startswith(markers)
|
|
240
|
+
|
|
241
|
+
# Safe non-ASCII codepoints that transcode cleanly via every modern codec
|
|
242
|
+
# including Windows cp1252 (they all exist in the cp1252 character table or
|
|
243
|
+
# have a canonical cp1252 equivalent). When a line's entire non-ASCII content
|
|
244
|
+
# falls inside this set, the line is not a crash risk and no finding is emitted.
|
|
245
|
+
_SAFE_UNICODE_CODEPOINTS: frozenset[int] = frozenset({
|
|
246
|
+
0x2013, # en-dash
|
|
247
|
+
0x2014, # em-dash
|
|
248
|
+
0x2018, # left single smart quote
|
|
249
|
+
0x2019, # right single smart quote
|
|
250
|
+
0x201C, # left double smart quote
|
|
251
|
+
0x201D, # right double smart quote
|
|
252
|
+
0x2026, # horizontal ellipsis
|
|
253
|
+
0x00A0, # non-breaking space
|
|
254
|
+
0x00B0, # degree sign
|
|
255
|
+
0x00B5, # micro sign
|
|
256
|
+
0x00AB, # left guillemet
|
|
257
|
+
0x00BB, # right guillemet
|
|
258
|
+
})
|
|
259
|
+
|
|
260
|
+
# Loggers that apply errors='replace' or utf-8 under the hood — MEDIUM risk
|
|
261
|
+
_LOGGER_METHOD_NAMES: frozenset[str] = frozenset({
|
|
262
|
+
"debug", "info", "warning", "warn", "error", "critical", "exception", "log",
|
|
263
|
+
})
|
|
264
|
+
_LOGGER_RECEIVER_NAMES: frozenset[str] = frozenset({
|
|
265
|
+
"_log", "_logger", "log", "logger", "logging",
|
|
266
|
+
})
|
|
267
|
+
|
|
268
|
+
# F9a-tighten (2026-04-23): chained-call logger pattern — a Call whose func is
|
|
269
|
+
# an Attribute whose receiver is ITSELF a Call with ``.getLogger`` as the
|
|
270
|
+
# inner attribute. Matches ``logging.getLogger(...).exception(...)`` and
|
|
271
|
+
# similar one-shot logger-factory chains.
|
|
272
|
+
_LOGGER_FACTORY_INNER_METHODS: frozenset[str] = frozenset({
|
|
273
|
+
"getLogger", "get_logger", "get_child_logger", "getChildLogger",
|
|
274
|
+
})
|
|
275
|
+
|
|
276
|
+
# F9a-tighten: string-transform wrapper methods we traverse through when the
|
|
277
|
+
# nearest enclosing Call is one of these. We then look at the GRANDPARENT
|
|
278
|
+
# Call to classify the sink. Covers ``print(s.format(lit))``,
|
|
279
|
+
# ``sys.stdout.write(' '.join(lits))``, etc.
|
|
280
|
+
_WRAPPER_METHOD_NAMES: frozenset[str] = frozenset({
|
|
281
|
+
"format", "join", "strip", "lstrip", "rstrip", "replace",
|
|
282
|
+
"upper", "lower", "title", "capitalize",
|
|
283
|
+
"encode", "decode",
|
|
284
|
+
"removeprefix", "removesuffix",
|
|
285
|
+
"zfill", "ljust", "rjust", "center",
|
|
286
|
+
})
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _classify_char(ch: str) -> str:
    """Return the human-readable risk label for a single character.

    ``_HIGH_RISK_RANGES`` is scanned in declaration order and the first range
    containing the code point wins (so narrower entries listed before a
    wider overlapping range take precedence). Characters in no listed range
    get the generic label ``"non-cp1252"``.
    """
    codepoint = ord(ch)
    return next(
        (label for first, last, label in _HIGH_RISK_RANGES if first <= codepoint <= last),
        "non-cp1252",
    )
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def _is_test_path(rel_path: str, ctx: object = None) -> bool:
    """Report whether *rel_path* is a test file the encoding gate should skip.

    Resolution order:

    1. An empty path is never a test path.
    2. A file whose basename contains ``"encoding"`` (case-insensitive) is
       never treated as a test path, wherever it lives — such files stay
       scannable on purpose.
    3. If ``ctx.project_context.test_topology`` exists, its
       ``is_test_path`` answer (given the ``/``-normalised path) is returned.
    4. Otherwise the legacy rule applies: the path is a test path when one of
       its components is exactly ``"tests"``.

    Args:
        rel_path: Relative path; backslashes are normalised to ``/``.
        ctx: Optional gate context; attributes are read defensively with
            ``getattr`` so ``None`` or partial objects are fine.
    """
    if not rel_path:
        return False

    posix_path = rel_path.replace("\\", "/")
    segments = posix_path.split("/")

    # Files advertising "encoding" in their name are intentionally scanned.
    if "encoding" in segments[-1].lower():
        return False

    project_context = getattr(ctx, "project_context", None)
    topology = getattr(project_context, "test_topology", None)
    if topology is not None:
        return topology.is_test_path(posix_path)

    # Legacy fallback — original path-fragment rule.
    return "tests" in segments
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def _classify_call_sink(call_node: ast.Call) -> str | None:
    """Classify a Call node as a console sink.

    Returns:
        * ``'stdout'`` for ``print(...)``, ``sys.stdout.write(...)``,
          ``sys.stderr.write(...)`` and ``os.write(1|2, ...)`` with a literal
          file descriptor.
        * ``'logger'`` for a method in ``_LOGGER_METHOD_NAMES`` invoked on a
          receiver that is a bare name in ``_LOGGER_RECEIVER_NAMES``
          (``_log.info``), an attribute with such a name
          (``self.logger.debug``), or the result of a logger-factory call
          named in ``_LOGGER_FACTORY_INNER_METHODS``
          (``logging.getLogger(__name__).exception``).
        * ``None`` for every other call (``json.dumps(...)``,
          ``foo.bar(...)``, ...): not a console sink.
    """
    callee = call_node.func

    # Bare-name calls: only print(...) is a sink.
    if isinstance(callee, ast.Name):
        return "stdout" if callee.id == "print" else None
    if not isinstance(callee, ast.Attribute):
        return None

    method = callee.attr
    receiver = callee.value

    if method == "write":
        if isinstance(receiver, ast.Attribute):
            # sys.stdout.write(...) / sys.stderr.write(...)
            owner = receiver.value
            if (
                receiver.attr in ("stdout", "stderr")
                and isinstance(owner, ast.Name)
                and owner.id == "sys"
            ):
                return "stdout"
        elif isinstance(receiver, ast.Name) and receiver.id == "os" and call_node.args:
            # os.write(1|2, ...): only a literal fd of 1 or 2 counts.
            fd = call_node.args[0]
            if isinstance(fd, ast.Constant) and fd.value in (1, 2):
                return "stdout"

    if method not in _LOGGER_METHOD_NAMES:
        return None

    # Plain receiver: _log / logger / logging.
    if isinstance(receiver, ast.Name):
        return "logger" if receiver.id in _LOGGER_RECEIVER_NAMES else None

    # self.<logger>.method / cls.<logger>.method chains.
    if isinstance(receiver, ast.Attribute):
        return "logger" if receiver.attr in _LOGGER_RECEIVER_NAMES else None

    # Chained factory call: the receiver is itself a Call to getLogger & co.,
    # either attribute-qualified or imported unqualified.
    if isinstance(receiver, ast.Call):
        factory = receiver.func
        if isinstance(factory, ast.Attribute):
            factory_name = factory.attr
        elif isinstance(factory, ast.Name):
            factory_name = factory.id
        else:
            return None
        if factory_name in _LOGGER_FACTORY_INNER_METHODS:
            return "logger"

    return None
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def _collect_string_literal_sinks(source: str) -> dict[int, str]:
    """Map 1-based line number -> sink classification for string literals.

    Every ``str`` constant and f-string (``JoinedStr``) in *source* is
    resolved to the sink that will eventually receive it. Only literals whose
    sink is ``'stdout'`` or ``'logger'`` produce entries; every line the
    literal spans is recorded. A missing line means either no string literal
    reaches a sink there, or the enclosing Call is a non-sink helper
    (``json.dumps`` etc.). No non-ASCII filtering happens here — all string
    literals are considered.

    Resolution rules (first hit wins):

    * Raise detection — a literal with an ``ast.Raise`` ancestor inside the
      same function/class/lambda scope is ``'stdout'`` (the exception message
      lands on stderr, which runs through the console codec on Windows).
    * Nearest enclosing Call — classified by ``_classify_call_sink``.
    * Grandparent walk — when the nearest Call is a string-transform wrapper
      (``.format`` / ``.join`` / ``.replace`` / ...), the walk continues to
      the Call that receives the wrapper's result, so
      ``print(fmt.format('lit'))`` resolves to ``print``.

    When a line carries several literals, ``'stdout'`` dominates ``'logger'``.

    Returns:
        The line -> sink mapping, or an empty dict when *source* cannot be
        parsed (``SyntaxError``, or ``ValueError`` which some interpreter
        versions raise for source containing NUL bytes).
    """
    try:
        tree = ast.parse(source)
    except (SyntaxError, ValueError):
        # Unparseable input: report nothing rather than crash the gate.
        return {}

    # Parent map (keyed by node identity) so ancestors can be walked.
    parent: dict[int, ast.AST] = {
        id(child): node
        for node in ast.walk(tree)
        for child in ast.iter_child_nodes(node)
    }

    # Scopes at which the search for an enclosing ``raise`` stops, so a
    # literal inside a nested def/lambda is not attributed to an outer raise.
    scope_boundaries = (
        ast.FunctionDef,
        ast.AsyncFunctionDef,
        ast.ClassDef,
        ast.Lambda,
        ast.Module,
    )

    def _is_wrapper_call(call: ast.Call) -> bool:
        """True if *call* is a transparent string-transform method call."""
        target = call.func
        return isinstance(target, ast.Attribute) and target.attr in _WRAPPER_METHOD_NAMES

    def _has_raise_ancestor(node: ast.AST) -> bool:
        """True if an ``ast.Raise`` encloses *node* within the same scope."""
        cur: ast.AST | None = parent.get(id(node))
        while cur is not None:
            if isinstance(cur, ast.Raise):
                return True
            if isinstance(cur, scope_boundaries):
                return False
            cur = parent.get(id(cur))
        return False

    def _resolve_sink(node: ast.AST) -> str | None:
        """Walk ancestors, skipping wrapper Calls, and classify the sink."""
        # Priority 1: raise ancestor. Checked before call classification so
        # ``raise ValueError("msg")`` classifies even though ``ValueError``
        # itself is not a known sink.
        if _has_raise_ancestor(node):
            return "stdout"

        cur: ast.AST | None = parent.get(id(node))
        while cur is not None:
            if isinstance(cur, ast.Call):
                if not _is_wrapper_call(cur):
                    # Non-wrapper Call — definitive answer (sink or None).
                    return _classify_call_sink(cur)
                # Safety net: a wrapper that itself classifies as a sink.
                direct = _classify_call_sink(cur)
                if direct is not None:
                    return direct
                # Otherwise look past the wrapper for the real sink.
            cur = parent.get(id(cur))
        return None

    result: dict[int, str] = {}

    def _record(node: ast.AST, sink: str) -> None:
        """Store *sink* for every line *node* spans; never downgrade stdout."""
        start = getattr(node, "lineno", None)
        if start is None:
            return
        end = getattr(node, "end_lineno", None) or start
        for ln in range(start, end + 1):
            if result.get(ln) != "stdout":
                result[ln] = sink

    for node in ast.walk(tree):
        is_str_constant = isinstance(node, ast.Constant) and isinstance(node.value, str)
        if not (is_str_constant or isinstance(node, ast.JoinedStr)):
            continue
        sink = _resolve_sink(node)
        if sink is not None:
            _record(node, sink)

    return result
|
|
506
|
+
|
|
507
|
+
|
|
508
|
+
def _collect_docstring_lines(source: str) -> set[int]:
|
|
509
|
+
"""Return 1-based line numbers belonging to any docstring in `source`.
|
|
510
|
+
|
|
511
|
+
AST-based: identifies module-, class-, and function-body first statement
|
|
512
|
+
when it is ``Expr(Constant(str))``. Docstring strings are compile-time
|
|
513
|
+
constants never written to stdout — we skip them entirely (no finding,
|
|
514
|
+
not even LOW). Returns empty set on SyntaxError so callers fall back to
|
|
515
|
+
standard non-docstring processing for the whole file.
|
|
516
|
+
"""
|
|
517
|
+
try:
|
|
518
|
+
tree = ast.parse(source)
|
|
519
|
+
except SyntaxError:
|
|
520
|
+
return set()
|
|
521
|
+
|
|
522
|
+
docstring_lines: set[int] = set()
|
|
523
|
+
|
|
524
|
+
def _mark(node: ast.AST) -> None:
|
|
525
|
+
if not isinstance(node, ast.Expr):
|
|
526
|
+
return
|
|
527
|
+
val = node.value
|
|
528
|
+
if not isinstance(val, ast.Constant) or not isinstance(val.value, str):
|
|
529
|
+
return
|
|
530
|
+
start = getattr(val, "lineno", None)
|
|
531
|
+
end = getattr(val, "end_lineno", None) or start
|
|
532
|
+
if start is None:
|
|
533
|
+
return
|
|
534
|
+
for ln in range(start, (end or start) + 1):
|
|
535
|
+
docstring_lines.add(ln)
|
|
536
|
+
|
|
537
|
+
# Module docstring: first stmt of tree.body.
|
|
538
|
+
if tree.body:
|
|
539
|
+
_mark(tree.body[0])
|
|
540
|
+
|
|
541
|
+
# Class/function docstrings: first stmt of any Class/FunctionDef/AsyncFunctionDef body.
|
|
542
|
+
for node in ast.walk(tree):
|
|
543
|
+
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
|
|
544
|
+
if node.body:
|
|
545
|
+
_mark(node.body[0])
|
|
546
|
+
|
|
547
|
+
return docstring_lines
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
def run_encoding_checks(ctx: PostExecGateContext):
    """Scan changed files for Windows-unsafe Unicode characters.

    For every snapshot in ``ctx.file_snapshots`` that passes the language /
    deployment filter and is not a test path, each line is searched with
    ``_DANGEROUS_RE``. Characters whose codepoint is in
    ``_SAFE_UNICODE_CODEPOINTS`` are ignored. A finding is emitted per
    distinct remaining character per line, but only when the line feeds a
    recognised output sink:

    * ``"stdout"`` sink  -> HIGH / REVISE
    * ``"logger"`` sink  -> MEDIUM / WARN
    * any other non-None sink label -> MEDIUM / WARN (defensive branch)

    Lines that are comments, docstrings, or carry ``noqa: encoding`` are
    skipped. Findings for one line are emitted in the order the characters
    first appear on that line, so the result is stable across runs.

    Args:
        ctx: Gate context. ``file_snapshots`` (path -> snapshot with
            ``exists`` / ``text``) is read, and the optional ``project_dir``
            attribute is forwarded to the scan filter.

    Returns:
        The value of ``build_check_result`` for check id ``encoding_safety``.
        On non-Windows platforms the check is skipped and only a note is
        returned.
    """
    # Only relevant on Windows
    if sys.platform != "win32":
        return build_check_result(
            check_id="encoding_safety",
            category=GateCategory.RUNTIME_BEHAVIOR,
            notes=["Skipped: not running on Windows"],
        )

    findings = []
    # Resolve project_dir once and reuse it for every snapshot.
    project_dir = getattr(ctx, "project_dir", None)

    for rel_path, snap in ctx.file_snapshots.items():
        if not snap.exists or not snap.text:
            continue
        # F18b: whitelist filter + F19: deployment cascade. Only scan files
        # whose runtime output passes through a locale-dependent console
        # (Python, Java, C#, Go, Rust, shell, PowerShell, batch) AND whose
        # deployment target is not provably Linux-only. TypeScript /
        # JavaScript / markup languages always skip; Linux-only deployments
        # also skip because cp1252 crashes cannot occur there.
        if not _should_scan_for_encoding(rel_path, snap.text, project_dir):
            continue
        # Part 1: skip test paths entirely (pytest captures stdout, not the
        # cp1252 console). Keep files whose basename contains "encoding" so
        # the gate's own test suite can still be scanned. Sprint C2: ctx
        # threaded through so _is_test_path can consult TestTopology.
        if _is_test_path(rel_path, ctx):
            continue

        all_lines = snap.text.splitlines()
        is_python = rel_path.endswith(".py")
        # Extension used for comment-prefix dispatch. We use the raw suffix
        # rather than going through get_language_id() because comment syntax
        # is per-extension (e.g. .bat differs from .cmd only in rare forms,
        # .jsx shares JS syntax, etc.).
        ext = ""
        dot = rel_path.rfind(".")
        if dot >= 0:
            ext = rel_path[dot:].lower()

        # Decide up front whether the AST path is usable. Non-Python files
        # and Python files that do not parse both go through the textual
        # sink detector below: we prefer false negatives on sink
        # classification over silently skipping a broken source file.
        ast_available = False
        if is_python:
            try:
                ast.parse(snap.text)
            except (SyntaxError, ValueError):
                # ValueError: interpreters before 3.12 raise it for NUL
                # bytes in source. Either way the file is unparsable.
                ast_available = False
            else:
                ast_available = True

        # Docstrings are compile-time constants, never reach stdout -- skip
        # them entirely (no finding). Python-only concept; other languages
        # have no docstring semantics. When the parse check failed the
        # helper would return an empty set anyway, so it is not called.
        docstring_lines = _collect_docstring_lines(snap.text) if ast_available else set()

        # Part 3: AST-based sink map. For Python, line -> 'stdout' | 'logger'.
        # Lines not in the map either have no string literal inside a Call,
        # or the enclosing Call is a non-console helper (json.dumps, etc.) --
        # we suppress those findings entirely. The map is only consulted
        # when ast_available is true, so it is only built in that case.
        sink_map = _collect_string_literal_sinks(snap.text) if ast_available else {}

        for line_num, line in enumerate(all_lines, 1):
            # Language-aware comment skip. For extensions without a known
            # comment syntax we scan the whole line and let the sink
            # detector filter.
            if _is_comment_line(line, ext):
                continue

            # Skip docstring lines entirely -- no finding emitted.
            if line_num in docstring_lines:
                continue

            # Respect explicit per-line allowlist.
            if "noqa: encoding" in line or "noqa:encoding" in line:
                continue

            matches = _DANGEROUS_RE.findall(line)
            if not matches:
                continue

            # Part 2: split matches into safe (cp1252-compatible) and unsafe.
            # When ALL chars on the line are safe -> skip; safe chars
            # transcode cleanly so there is no crash risk regardless of sink.
            # This rule is language-agnostic.
            unsafe_chars = [ch for ch in matches if ord(ch) not in _SAFE_UNICODE_CODEPOINTS]
            if not unsafe_chars:
                continue

            # Sink resolution:
            #   - Python + AST-parsable: use AST map (precise, few FPs).
            #   - Otherwise: simple textual substring match against the
            #     cross-language sink tables.
            # A None sink means the line has no recognised output function
            # (or the literal sits in a non-console helper call); such lines
            # are suppressed entirely -- they are the dominant FP source.
            if ast_available:
                sink = sink_map.get(line_num)
            else:
                sink = _classify_textual_sink(line)
            if sink is None:
                continue

            # Severity depends only on the sink, so pick it once per line.
            if sink == "stdout":
                # Direct stdout/stderr -- crashes cp1252 console
                severity = GateSeverity.HIGH
                impact = GateImpact.REVISE
                sink_label = "stdout/stderr sink (print/sys.write)"
                executor_action = (
                    "replace unicode with ASCII in print/stderr — crashes on cp1252 console"
                )
            elif sink == "logger":
                # Logger handles encoding internally -- no crash, but stale tooling may barf
                severity = GateSeverity.MEDIUM
                impact = GateImpact.WARN
                sink_label = "logging sink"
                executor_action = (
                    "consider ASCII for consistency; logger doesn't crash on utf8 but stale tooling may"
                )
            else:
                # Defensive: reached only if a sink classifier returns a
                # label other than "stdout" / "logger".
                severity = GateSeverity.MEDIUM
                impact = GateImpact.WARN
                sink_label = "code (unknown sink)"
                executor_action = (
                    "consider ASCII for consistency; unknown whether this reaches cp1252 console"
                )

            # dict.fromkeys de-duplicates while keeping first-occurrence
            # order; iterating a set here would make the order of findings
            # depend on per-process string hash randomisation.
            for ch in dict.fromkeys(unsafe_chars):
                category = _classify_char(ch)

                findings.append(build_finding(
                    check_id="encoding.windows_unsafe_char",
                    category=GateCategory.RUNTIME_BEHAVIOR,
                    title=f"Windows-unsafe U+{ord(ch):04X} ({category}) in {rel_path}:{line_num}",
                    severity=severity,
                    impact=impact,
                    summary=(
                        f"Character U+{ord(ch):04X} ({category}) in {sink_label}. "
                        f"Windows cp1252 console will crash with UnicodeEncodeError if this reaches stdout/stderr."
                    ),
                    recommendation=(
                        f"Replace with ASCII equivalent in {rel_path} line {line_num}. "
                        f"Common fixes: em-dash->--, arrows->->, checkmark->[OK], cross->[X]"
                    ),
                    evidence=(EvidenceReference(
                        kind="probe",
                        path=rel_path,
                        detail=f"Character U+{ord(ch):04X} ({category}) at line {line_num}",
                        ok=False,
                    ),),
                    repair_kind=RepairKind.FIX_ENCODING.value,
                    executor_action=executor_action,
                    proof_required="Proper encoding",
                    allowlist_allowed=False,
                ))

    return build_check_result(
        check_id="encoding_safety",
        category=GateCategory.RUNTIME_BEHAVIOR,
        findings=findings,
        notes=[f"Scanned {len(ctx.file_snapshots)} files for Windows-unsafe characters"],
    )
|
|
739
|
+
|
|
740
|
+
|
|
741
|
+
def _extract_call_block(lines: list[str], start_line: int, max_lines: int = 40) -> str:
|
|
742
|
+
"""Extract argument block of a call starting at start_line (1-based).
|
|
743
|
+
|
|
744
|
+
Tracks paren depth so we stop at the matching close paren instead of
|
|
745
|
+
grabbing N lines and accidentally spanning into the next call.
|
|
746
|
+
Used as fallback when AST parsing is unavailable (syntax errors, etc.).
|
|
747
|
+
"""
|
|
748
|
+
depth = 0
|
|
749
|
+
block: list[str] = []
|
|
750
|
+
for i in range(start_line - 1, min(start_line - 1 + max_lines, len(lines))):
|
|
751
|
+
line = lines[i]
|
|
752
|
+
block.append(line)
|
|
753
|
+
depth += line.count("(") - line.count(")")
|
|
754
|
+
if depth <= 0 and i > start_line - 1:
|
|
755
|
+
break
|
|
756
|
+
return " ".join(block)
|
|
757
|
+
|
|
758
|
+
|
|
759
|
+
def _extract_call_kwargs(file_content: str, call_lineno: int) -> set[str] | None:
|
|
760
|
+
"""Return the set of keyword argument names for the call at call_lineno.
|
|
761
|
+
|
|
762
|
+
Uses AST so the result is exact regardless of how many lines the call
|
|
763
|
+
spans. Returns None when the file cannot be parsed (syntax error) so
|
|
764
|
+
the caller can fall back to the regex-based approach.
|
|
765
|
+
"""
|
|
766
|
+
try:
|
|
767
|
+
tree = ast.parse(file_content)
|
|
768
|
+
except SyntaxError:
|
|
769
|
+
return None
|
|
770
|
+
|
|
771
|
+
for node in ast.walk(tree):
|
|
772
|
+
if isinstance(node, ast.Call) and node.lineno == call_lineno:
|
|
773
|
+
return {kw.arg for kw in node.keywords if kw.arg is not None}
|
|
774
|
+
return set()
|
|
775
|
+
|
|
776
|
+
|
|
777
|
+
def run_subprocess_encoding_checks(ctx: PostExecGateContext):
    """Flag subprocess calls that pass ``text=True`` without ``encoding=``.

    With ``text=True`` and no explicit ``encoding``, subprocess output is
    handled with the system locale codec (cp1252 on typical Windows
    setups), which breaks on non-ASCII git output such as branch names,
    file paths, and commit messages.

    Every ``_SUBPROCESS_CALL_RE`` match in each touched source file is
    inspected. Keyword names come from the AST when the file parses; the
    literal ``text=True`` is then confirmed with a regex over the call's
    text. Files that do not parse are checked with regexes only.

    Recommended fix: add ``encoding='utf-8', errors='replace'`` to every
    subprocess call that uses ``text=True``.

    Args:
        ctx: Gate context passed to ``iter_touched_snapshots``.

    Returns:
        The value of ``build_check_result`` for check id
        ``subprocess_encoding``.
    """
    text_true_pattern = r'\btext\s*=\s*True\b'
    findings = []

    for snapshot in iter_touched_snapshots(ctx):
        if not (snapshot.exists and snapshot.text):
            continue
        if not is_source_file(snapshot.path):
            continue

        source = snapshot.text
        source_lines = source.splitlines()

        for hit in _SUBPROCESS_CALL_RE.finditer(source):
            call_name = hit.group(1)
            # 1-based line of the match: newlines before it, plus one.
            line_num = source.count("\n", 0, hit.start()) + 1

            kwarg_names = _extract_call_kwargs(source, line_num)
            if kwarg_names is None:
                # Fallback for files with syntax errors: regex over block.
                call_text = _extract_call_block(source_lines, line_num)
                uses_text_mode = re.search(text_true_pattern, call_text) is not None
                sets_encoding = re.search(r'\bencoding\s*=', call_text) is not None
            else:
                # AST path: exact keyword names. The helper reports names
                # only, so the value of ``text`` (True vs. anything else) is
                # re-checked with the regex, and only when the kwarg exists.
                sets_encoding = "encoding" in kwarg_names
                uses_text_mode = "text" in kwarg_names
                if uses_text_mode:
                    call_text = _extract_call_block(source_lines, line_num)
                    uses_text_mode = re.search(text_true_pattern, call_text) is not None

            if sets_encoding or not uses_text_mode:
                continue

            findings.append(build_finding(
                check_id="encoding.subprocess_missing_encoding",
                category=GateCategory.RUNTIME_BEHAVIOR,
                title=f"subprocess.{call_name}(text=True) missing encoding= in {snapshot.path}:{line_num}",
                severity=GateSeverity.HIGH,
                impact=GateImpact.REVISE,
                summary=(
                    f"subprocess.{call_name}() at line {line_num} uses text=True without encoding=. "
                    f"On Windows defaults to cp1252 -- crashes with UnicodeEncodeError on non-ASCII "
                    f"git output (branch names, file paths, commit messages)."
                ),
                recommendation=(
                    f"Add encoding='utf-8', errors='replace' to subprocess.{call_name}() "
                    f"in {snapshot.path} line {line_num}."
                ),
                evidence=[
                    EvidenceReference(kind="file", path=snapshot.path, detail=f"line:{line_num}"),
                ],
                repair_kind=RepairKind.FIX_ENCODING.value,
                executor_action="Fix encoding issues",
                proof_required="Proper encoding",
                allowlist_allowed=False,
            ))

    return build_check_result(
        check_id="subprocess_encoding",
        category=GateCategory.RUNTIME_BEHAVIOR,
        findings=findings,
    )
|