pkgwhy 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pkgwhy/__init__.py +3 -0
- pkgwhy/__main__.py +6 -0
- pkgwhy/agent/__init__.py +2 -0
- pkgwhy/agent/judge.py +93 -0
- pkgwhy/cli.py +676 -0
- pkgwhy/core/__init__.py +2 -0
- pkgwhy/core/constants.py +13 -0
- pkgwhy/core/models.py +608 -0
- pkgwhy/dependencies/__init__.py +2 -0
- pkgwhy/dependencies/graph.py +68 -0
- pkgwhy/dependencies/reason.py +79 -0
- pkgwhy/dynamic/__init__.py +2 -0
- pkgwhy/dynamic/analysis.py +156 -0
- pkgwhy/explanations/__init__.py +2 -0
- pkgwhy/explanations/explain.py +47 -0
- pkgwhy/explanations/local_db.py +52 -0
- pkgwhy/imports/__init__.py +2 -0
- pkgwhy/imports/scanner.py +43 -0
- pkgwhy/inspection/__init__.py +2 -0
- pkgwhy/inspection/files.py +540 -0
- pkgwhy/inspection/python_static.py +323 -0
- pkgwhy/inspection/size.py +58 -0
- pkgwhy/inspection/text_patterns.py +135 -0
- pkgwhy/manifests/__init__.py +2 -0
- pkgwhy/manifests/lockfiles.py +51 -0
- pkgwhy/manifests/pyproject.py +37 -0
- pkgwhy/manifests/requirements.py +27 -0
- pkgwhy/metadata/__init__.py +2 -0
- pkgwhy/metadata/installed.py +83 -0
- pkgwhy/metadata/pypi.py +199 -0
- pkgwhy/policy/__init__.py +1 -0
- pkgwhy/policy/agent_policy.py +114 -0
- pkgwhy/policy/audit_log.py +60 -0
- pkgwhy/policy/tool_execution.py +76 -0
- pkgwhy/provenance/__init__.py +2 -0
- pkgwhy/provenance/installed.py +45 -0
- pkgwhy/registry/__init__.py +2 -0
- pkgwhy/registry/local.py +178 -0
- pkgwhy/registry/manifest.py +78 -0
- pkgwhy/registry/publish.py +142 -0
- pkgwhy/registry/run.py +148 -0
- pkgwhy/registry/tools.py +121 -0
- pkgwhy/reports/__init__.py +2 -0
- pkgwhy/reports/audit.py +81 -0
- pkgwhy/risk/__init__.py +5 -0
- pkgwhy/risk/rules.py +372 -0
- pkgwhy/risk/scoring.py +231 -0
- pkgwhy/typosquat/__init__.py +2 -0
- pkgwhy/typosquat/detector.py +182 -0
- pkgwhy/typosquat/popular_packages.py +34 -0
- pkgwhy/vulnerabilities/__init__.py +2 -0
- pkgwhy/vulnerabilities/matching.py +122 -0
- pkgwhy/vulnerabilities/osv.py +330 -0
- pkgwhy-1.0.0.dist-info/METADATA +688 -0
- pkgwhy-1.0.0.dist-info/RECORD +58 -0
- pkgwhy-1.0.0.dist-info/WHEEL +4 -0
- pkgwhy-1.0.0.dist-info/entry_points.txt +2 -0
- pkgwhy-1.0.0.dist-info/licenses/LICENSE +22 -0
|
@@ -0,0 +1,540 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import tomllib
|
|
5
|
+
from collections.abc import Callable
|
|
6
|
+
from importlib.metadata import Distribution
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from pkgwhy.core.models import FileStaticAnalysis, ReadabilityStatus, RiskRuleEvidence, RuleSeverity, SourceAvailability
|
|
10
|
+
from pkgwhy.inspection.size import JAVASCRIPT_SUFFIXES, NATIVE_SUFFIXES
|
|
11
|
+
from pkgwhy.inspection.text_patterns import analyze_text_patterns, is_text_pattern_candidate
|
|
12
|
+
from pkgwhy.risk.rules import make_rule_evidence
|
|
13
|
+
|
|
14
|
+
# --- File classification -------------------------------------------------
# Suffixes that mark a file as a shell script without inspecting its content.
SHELL_SUFFIXES = {
    ".sh",
    ".bash",
    ".zsh",
    ".fish",
    ".ksh",
}
# File names handed to the setup.py analyzer.
INSTALL_TIME_SCRIPT_NAMES = {"setup.py"}
# File names handed to the build-metadata analyzer.
BUILD_METADATA_NAMES = {"pyproject.toml", "setup.cfg"}

# --- Scan limits and JavaScript density thresholds ------------------------
# Files larger than this many bytes are skipped by the text readers below.
MAX_TEXT_SCAN_BYTES = 500_000
LONG_JS_LINE_LENGTH = 500
MINIFIED_JS_LINE_LENGTH = 1_000
LOW_WHITESPACE_RATIO = 0.08
HIGH_PUNCTUATION_RATIO = 0.32

# --- Warning markers -------------------------------------------------------
# infer_readability() looks for these fragments inside ``warning.lower()``,
# so they must stay lowercase to keep matching.
JS_LIKELY_OBFUSCATED_WARNING = "likely obfuscated javascript"
JS_POSSIBLY_OBFUSCATED_WARNING = "possibly obfuscated javascript"
JS_APPEARS_MINIFIED_WARNING = "appears minified"
JS_MAY_BE_MINIFIED_WARNING = "may be minified"

# --- JavaScript signal patterns (compiled pattern -> human-readable label) -
# Dict order is the order in which evidence is reported.
JS_DYNAMIC_PATTERNS = {
    re.compile(r"\beval\s*\("): "JavaScript eval call",
    re.compile(r"\bFunction\s*\("): "JavaScript Function constructor",
}
JS_ENCODED_PATTERNS = {
    re.compile(r"\batob\s*\("): "JavaScript base64 decode call",
    re.compile(r"\bbtoa\s*\("): "JavaScript base64 encode call",
}
JS_OBFUSCATION_PATTERNS = {
    re.compile(r"_0x[a-fA-F0-9]{3,}"): "hex-like JavaScript identifier",
    re.compile(r"\\x[0-9a-fA-F]{2}"): "hex-escaped JavaScript string content",
    re.compile(r"while\s*\(\s*!!\[\]\s*\)"): "control-flow flattening pattern",
    re.compile(r"debugger\s*;"): "JavaScript anti-debugging statement",
}
# A quoted run of 80+ base64-alphabet characters with optional '=' padding.
JS_LARGE_ENCODED_PATTERN = re.compile(r"['\"][A-Za-z0-9+/]{80,}={0,2}['\"]")
JS_SOURCE_MAP_PATTERN = re.compile(r"sourceMappingURL\s*=", re.IGNORECASE)

# --- setup.py signal patterns (whole-word matches) -------------------------
SETUP_SUBPROCESS_PATTERN = re.compile(r"\b(subprocess|os\.system|os\.popen|Popen|check_call|check_output)\b")
SETUP_NETWORK_PATTERN = re.compile(r"\b(requests|httpx|urllib|socket|urlopen)\b")
SETUP_DYNAMIC_PATTERN = re.compile(r"\b(eval|exec|compile|__import__|importlib\.import_module)\b")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def distribution_file_paths(dist: Distribution | None, limit: int = 200) -> list[Path]:
    """Return up to ``limit`` existing regular files recorded for ``dist``.

    Args:
        dist: Installed distribution to inspect, or ``None``.
        limit: Maximum number of paths to return. Zero or a negative value
            yields an empty list.

    Returns:
        Located paths, in the order the distribution lists them. Entries
        that cannot be located or are not regular files are skipped. The
        result is empty when ``dist`` is ``None`` or has no file listing.
    """
    if dist is None or limit <= 0:
        return []
    # Read the listing once and reuse it for both the None check and the loop.
    recorded_files = dist.files
    if recorded_files is None:
        return []

    paths: list[Path] = []
    for package_file in recorded_files:
        try:
            path = Path(dist.locate_file(package_file))
            if not path.is_file():
                continue
        except (OSError, ValueError):
            # Best effort: an entry that cannot be resolved or stat'ed is skipped.
            continue
        paths.append(path)
        if len(paths) >= limit:
            break
    return paths
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def infer_source_availability(paths: list[Path]) -> SourceAvailability:
    """Classify source availability from the installed file list.

    An empty list means only metadata is available. Any path whose suffix is
    exactly ``.py`` means installed source is present. Otherwise the
    availability is reported as unknown.
    """
    if not paths:
        return SourceAvailability.INSTALLED_METADATA_ONLY
    for candidate in paths:
        if candidate.suffix == ".py":
            return SourceAvailability.INSTALLED_SOURCE_PRESENT
    return SourceAvailability.SOURCE_AVAILABILITY_UNKNOWN
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def infer_readability(paths: list[Path], file_analysis: FileStaticAnalysis | None = None) -> ReadabilityStatus:
    """Derive a readability status from file suffixes and analysis warnings.

    Any ``.py`` path makes the package readable outright. Otherwise the
    warnings in ``file_analysis`` are searched, case-insensitively, for the
    module's obfuscation and minification markers. The most severe match
    wins: likely obfuscated, then possibly obfuscated, then minified.
    Without a match, or without an analysis, there is not enough source.
    """
    for candidate in paths:
        if candidate.suffix == ".py":
            return ReadabilityStatus.READABLE

    if not file_analysis:
        return ReadabilityStatus.NOT_ENOUGH_SOURCE_AVAILABLE

    lowered_warnings = [warning.lower() for warning in file_analysis.warnings]
    # Ordered from most to least severe; the first group with a hit decides.
    ranked_markers = (
        ((JS_LIKELY_OBFUSCATED_WARNING,), ReadabilityStatus.LIKELY_OBFUSCATED),
        ((JS_POSSIBLY_OBFUSCATED_WARNING,), ReadabilityStatus.POSSIBLY_OBFUSCATED),
        ((JS_APPEARS_MINIFIED_WARNING, JS_MAY_BE_MINIFIED_WARNING), ReadabilityStatus.MINIFIED),
    )
    for markers, status in ranked_markers:
        if any(marker in text for text in lowered_warnings for marker in markers):
            return status
    return ReadabilityStatus.NOT_ENOUGH_SOURCE_AVAILABLE
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def detect_file_capabilities(paths: list[Path], entry_points: list[str]) -> list[str]:
    """Return only the detected capabilities from ``analyze_file_signals``."""
    analysis = analyze_file_signals(paths, entry_points)
    return analysis.detected_capabilities
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def analyze_file_signals(paths: list[Path], entry_points: list[str]) -> FileStaticAnalysis:
    """Aggregate static signals across every file in ``paths``.

    Each path is run through the applicable checks: text patterns, native
    or WASM binaries, JavaScript, shell scripts, ``setup.py`` and build
    metadata. Nothing is executed; the checks only read file content.

    Args:
        paths: Files to inspect.
        entry_points: Declared entry points; a non-empty list adds an
            entrypoint capability and evidence line.

    Returns:
        A combined ``FileStaticAnalysis``. Warnings and evidence are capped
        at 100 entries each. URL, domain and credential references are
        de-duplicated and capped at 100. Rule evidence is de-duplicated and
        prioritized by ``_prioritize_rule_evidence``.
    """
    found_capabilities: set[str] = set()
    all_warnings: list[str] = []
    all_evidence: list[str] = []
    all_rules = []
    urls: list[str] = []
    domains: list[str] = []
    credentials: list[str] = []
    js_scanned = 0
    shell_count = 0
    native_count = 0
    wasm_count = 0
    setup_count = 0

    def absorb(result) -> None:
        # The three collections every sub-analysis contributes to.
        all_warnings.extend(result.warnings)
        all_evidence.extend(result.evidence)
        all_rules.extend(result.rule_evidence)

    if entry_points:
        found_capabilities.add("CLI or plugin entrypoints declared in package metadata")
        all_evidence.append("Package metadata declares CLI or plugin entrypoints.")

    for path in paths:
        name = path.name
        suffix = path.suffix.lower()

        if is_text_pattern_candidate(path):
            text_result = analyze_text_patterns(path)
            found_capabilities.update(text_result.detected_capabilities)
            absorb(text_result)
            urls.extend(text_result.url_references)
            domains.extend(text_result.domain_references)
            credentials.extend(text_result.credential_references)

        if suffix in NATIVE_SUFFIXES:
            # NOTE(review): rule evidence records only the base name, so
            # same-named binaries in different directories appear to share
            # one de-dupe key downstream; confirm that is intended.
            if suffix == ".wasm":
                wasm_count += 1
                found_capabilities.add("WASM binary code present")
                all_evidence.append(f"WASM file present: {name}")
                binary_rule = make_rule_evidence(
                    "PKGWHY-BIN-002",
                    message="WebAssembly binary file is present.",
                    evidence=[f"WASM file present: {name}."],
                    file_path=name,
                    symbol=suffix,
                )
            else:
                native_count += 1
                found_capabilities.add("Native compiled code present")
                all_evidence.append(f"Native or executable file present: {name}")
                binary_rule = make_rule_evidence(
                    "PKGWHY-BIN-003" if suffix == ".exe" else "PKGWHY-BIN-001",
                    message=f"Native or executable file present: {name}.",
                    evidence=[f"File extension {suffix} detected for {name}."],
                    file_path=name,
                    symbol=suffix,
                )
            all_rules.append(binary_rule)

        if suffix in JAVASCRIPT_SUFFIXES:
            found_capabilities.add("Browser or JavaScript code present")
            js_result = _analyze_javascript_file(path)
            js_scanned += js_result.javascript_files_scanned
            found_capabilities.update(js_result.detected_capabilities)
            absorb(js_result)

        if _is_shell_script(path):
            shell_count += 1
            found_capabilities.add("Shell script files present")
            all_evidence.append(f"Shell script file present: {name}")

        if name in INSTALL_TIME_SCRIPT_NAMES:
            setup_result = _analyze_setup_py(path)
            setup_count += 1
            found_capabilities.update(setup_result.detected_capabilities)
            absorb(setup_result)
        elif name in BUILD_METADATA_NAMES:
            # Build metadata contributes warnings, evidence and rules only.
            absorb(_analyze_build_metadata(path))

    return FileStaticAnalysis(
        detected_capabilities=sorted(found_capabilities),
        warnings=all_warnings[:100],
        evidence=all_evidence[:100],
        rule_evidence=_prioritize_rule_evidence(all_rules),
        url_references=_unique(urls)[:100],
        domain_references=_unique(domains)[:100],
        credential_references=_unique(credentials)[:100],
        javascript_files_scanned=js_scanned,
        shell_scripts_detected=shell_count,
        native_binaries_detected=native_count,
        wasm_files_detected=wasm_count,
        setup_files_detected=setup_count,
    )
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _analyze_setup_py(path: Path) -> FileStaticAnalysis:
    """Report static signals for a ``setup.py`` file without executing it.

    The file's presence is always recorded (capability, warning, evidence
    and rule ``PKGWHY-BUILD-001``). If the content can be read, it is
    searched for subprocess, network and dynamic-execution references; the
    first matching line of each kind is reported.

    When the file is too large or unreadable, a warning now states that the
    content scan was skipped. Previously this case was silent, so an
    unscanned setup.py looked the same as a clean one. The wording matches
    the pyproject.toml skip warning in ``_analyze_build_metadata``.

    Args:
        path: Location of the setup script.

    Returns:
        A ``FileStaticAnalysis`` holding the signals found for this file.
    """
    name = path.name
    capabilities = {"Install-time setup files present"}
    warnings = [
        "setup.py is executable Python used by some build/install flows. pkgwhy reports static signals only and does not run it."
    ]
    evidence = [f"Install-time setup script present: {name}"]
    rule_evidence = [
        make_rule_evidence(
            "PKGWHY-BUILD-001",
            message="Executable setup.py file is present.",
            evidence=[f"{name} is present."],
            file_path=name,
            symbol="setup.py",
        )
    ]

    source = _read_small_text(path)
    if source is None:
        # Make the coverage gap visible instead of implying a clean scan.
        warnings.append("Skipped large or unreadable setup.py during static scan.")
    else:
        signal_checks = (
            ("PKGWHY-BUILD-002", "Subprocess or shell execution signals", SETUP_SUBPROCESS_PATTERN, "subprocess or shell reference"),
            ("PKGWHY-BUILD-003", "Network access signals", SETUP_NETWORK_PATTERN, "network access reference"),
            ("PKGWHY-BUILD-004", "Dynamic code execution signals", SETUP_DYNAMIC_PATTERN, "dynamic execution reference"),
        )
        for rule_id, capability, pattern, detail in signal_checks:
            line_number = _first_matching_line(source, pattern)
            if line_number is None:
                continue
            capabilities.add(capability)
            warnings.append(f"setup.py contains {detail}: {name}:{line_number}")
            evidence.append(f"setup.py static signal in {name}:{line_number}: {detail}.")
            rule_evidence.append(
                make_rule_evidence(
                    rule_id,
                    message=f"setup.py contains {detail}.",
                    evidence=[f"{name}:{line_number} contains {detail}."],
                    file_path=name,
                    line_number=line_number,
                    symbol=detail,
                )
            )

    return FileStaticAnalysis(
        detected_capabilities=sorted(capabilities),
        warnings=warnings,
        evidence=evidence,
        rule_evidence=rule_evidence,
    )
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def _analyze_build_metadata(path: Path) -> FileStaticAnalysis:
    """Report static facts about a build-metadata file.

    ``setup.cfg`` is only recorded as present; its content is not read.
    Any other file name is treated as ``pyproject.toml``: it is parsed as
    TOML and the declared ``build-system.build-backend`` is reported with
    the first line that contains a ``build-backend =`` assignment.

    Returns:
        A ``FileStaticAnalysis`` with evidence and rule evidence, or with a
        single warning when the file is too large, unreadable, or not valid
        TOML.
    """
    file_name = path.name

    if file_name == "setup.cfg":
        cfg_rule = make_rule_evidence(
            "PKGWHY-BUILD-006",
            message="setup.cfg metadata file is present.",
            evidence=["setup.cfg is present."],
            file_path=file_name,
            symbol="setup.cfg",
        )
        return FileStaticAnalysis(
            evidence=["setup.cfg metadata file present."],
            rule_evidence=[cfg_rule],
        )

    text = _read_small_text(path)
    if text is None:
        return FileStaticAnalysis(warnings=["Skipped large or unreadable pyproject.toml during static scan."])

    try:
        parsed = tomllib.loads(text)
    except tomllib.TOMLDecodeError as exc:
        # Only the exception class name is reported, not the parser message.
        return FileStaticAnalysis(warnings=[f"Could not statically parse pyproject.toml: {exc.__class__.__name__}"])

    build_system = parsed.get("build-system")
    if not isinstance(build_system, dict):
        return FileStaticAnalysis(evidence=["pyproject.toml present without build-system table."])

    backend = build_system.get("build-backend")
    if not (isinstance(backend, str) and backend.strip()):
        return FileStaticAnalysis(evidence=["pyproject.toml build-system table present without build-backend."])

    backend_line = _first_matching_line(text, re.compile(r"build-backend\s*="))
    backend_evidence = [f"pyproject.toml declares build backend: {backend}"]
    backend_rule = make_rule_evidence(
        "PKGWHY-BUILD-005",
        message=f"Build backend declared: {backend}.",
        evidence=backend_evidence,
        file_path=file_name,
        line_number=backend_line,
        symbol=backend,
    )
    return FileStaticAnalysis(evidence=backend_evidence, rule_evidence=[backend_rule])
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def _analyze_javascript_file(path: Path) -> FileStaticAnalysis:
    """Statically scan one JavaScript file for readability and risk signals.

    Files above ``MAX_TEXT_SCAN_BYTES`` or that cannot be read as UTF-8 are
    not scanned; a single warning is returned and the scanned-file count
    stays at its default. Otherwise the checks run in this order:
    minification, character density, dynamic execution, encoded payload
    calls, large encoded strings, source-map references, and obfuscation
    (two matching patterns mean "possibly", three or more mean "likely").

    Returns:
        A ``FileStaticAnalysis`` with ``javascript_files_scanned=1`` when
        the file content was actually read.
    """
    file_name = path.name
    try:
        if path.stat().st_size > MAX_TEXT_SCAN_BYTES:
            return FileStaticAnalysis(
                warnings=[f"Skipped large JavaScript file during static scan: {file_name}"],
            )
        source = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        return FileStaticAnalysis(
            warnings=[f"Could not statically read JavaScript file {file_name}: {exc.__class__.__name__}"],
        )

    capabilities: set[str] = set()
    warnings: list[str] = []
    evidence: list[str] = [f"Statically scanned JavaScript file: {file_name}"]
    rule_evidence = []

    # An empty file still yields one (empty) line so the metrics are defined.
    lines = source.splitlines() or [source]
    longest_line = max(map(len, lines), default=0)
    whitespace_ratio = _character_ratio(source, str.isspace)
    punctuation_ratio = _character_ratio(source, lambda char: not char.isalnum() and not char.isspace())

    # --- Minification: a strong signal first, then a weaker fallback ------
    minified = None
    if file_name.endswith(".min.js") or longest_line >= MINIFIED_JS_LINE_LENGTH:
        minified = (
            JS_APPEARS_MINIFIED_WARNING,
            f"JavaScript minification signal in {file_name}: long line or .min.js filename.",
            "JavaScript file appears minified.",
            f"{file_name} has .min.js filename or a line at least {MINIFIED_JS_LINE_LENGTH} characters long.",
            MINIFIED_JS_LINE_LENGTH,
        )
    elif longest_line >= LONG_JS_LINE_LENGTH and whitespace_ratio < LOW_WHITESPACE_RATIO:
        minified = (
            JS_MAY_BE_MINIFIED_WARNING,
            f"JavaScript readability signal in {file_name}: long line with low whitespace ratio.",
            "JavaScript file may be minified.",
            f"{file_name} has a long line with low whitespace ratio.",
            LONG_JS_LINE_LENGTH,
        )
    if minified is not None:
        marker, evidence_text, message, rule_text, threshold = minified
        warnings.append(f"JavaScript file {marker}: {file_name}")
        evidence.append(evidence_text)
        rule_evidence.append(
            make_rule_evidence(
                "PKGWHY-JS-001",
                message=message,
                evidence=[rule_text],
                file_path=file_name,
                line_number=_first_long_line(lines, threshold),
                symbol="minified-javascript",
            )
        )

    # --- Character density --------------------------------------------------
    if whitespace_ratio < LOW_WHITESPACE_RATIO and punctuation_ratio > HIGH_PUNCTUATION_RATIO:
        warnings.append(f"JavaScript file has low whitespace and high punctuation ratios: {file_name}")
        evidence.append(f"JavaScript density signal in {file_name}: low whitespace and high punctuation.")
        rule_evidence.append(
            make_rule_evidence(
                "PKGWHY-JS-001",
                message="JavaScript file has low whitespace and high punctuation ratios.",
                evidence=[f"{file_name} has low whitespace and high punctuation ratios."],
                file_path=file_name,
                symbol="javascript-density",
            )
        )

    # --- Dynamic execution, then encoded-payload call patterns -------------
    pattern_families = (
        (JS_DYNAMIC_PATTERNS, "JavaScript dynamic code execution signals", "PKGWHY-JS-002", "dynamic execution"),
        (JS_ENCODED_PATTERNS, "Encoded payload handling signals", "PKGWHY-JS-003", "encoded payload"),
    )
    for patterns, capability, rule_id, label in pattern_families:
        for pattern, detail in patterns.items():
            if not pattern.search(source):
                continue
            # A match spanning a line break has no single matching line; the
            # evidence text then falls back to line 1 while the rule keeps None.
            hit_line = _first_matching_line(source, pattern)
            capabilities.add(capability)
            evidence.append(f"JavaScript {label} signal in {file_name}: {detail}.")
            rule_evidence.append(
                make_rule_evidence(
                    rule_id,
                    message=f"JavaScript {label} signal: {detail}.",
                    evidence=[f"{file_name}:{hit_line or 1} references {detail}."],
                    file_path=file_name,
                    line_number=hit_line,
                    symbol=detail,
                )
            )

    # --- Large encoded-looking string literal -------------------------------
    large_encoded_line = _first_matching_line(source, JS_LARGE_ENCODED_PATTERN)
    if large_encoded_line is not None:
        capabilities.add("Encoded payload handling signals")
        evidence.append(f"JavaScript large encoded-string signal in {file_name}:{large_encoded_line}.")
        rule_evidence.append(
            make_rule_evidence(
                "PKGWHY-JS-003",
                message="JavaScript large encoded-string signal detected.",
                evidence=[f"{file_name}:{large_encoded_line} contains a large encoded-looking string; value omitted."],
                file_path=file_name,
                line_number=large_encoded_line,
                symbol="large encoded-looking string",
            )
        )

    # --- Source-map reference (evidence only, no capability) ----------------
    source_map_line = _first_matching_line(source, JS_SOURCE_MAP_PATTERN)
    if source_map_line is not None:
        evidence.append(f"JavaScript source-map reference in {file_name}:{source_map_line}.")
        rule_evidence.append(
            make_rule_evidence(
                "PKGWHY-JS-005",
                message="JavaScript source-map reference detected.",
                evidence=[f"{file_name}:{source_map_line} references sourceMappingURL."],
                file_path=file_name,
                line_number=source_map_line,
                symbol="sourceMappingURL",
            )
        )

    # --- Obfuscation: the number of distinct matching patterns sets the tier
    obfuscation_signals = [
        detail for pattern, detail in JS_OBFUSCATION_PATTERNS.items() if pattern.search(source)
    ]
    if len(obfuscation_signals) >= 2:
        likely = len(obfuscation_signals) >= 3
        joined_signals = ", ".join(sorted(obfuscation_signals))
        marker = JS_LIKELY_OBFUSCATED_WARNING if likely else JS_POSSIBLY_OBFUSCATED_WARNING
        warnings.append(f"JavaScript file has {marker} signals: {file_name}")
        capabilities.add("JavaScript obfuscation signals")
        evidence.append(f"JavaScript obfuscation signals in {file_name}: {joined_signals}.")
        # Only the "likely" tier sets an explicit severity; the "possibly"
        # tier leaves it to make_rule_evidence.
        severity_override = {"severity": RuleSeverity.HIGH} if likely else {}
        rule_evidence.append(
            make_rule_evidence(
                "PKGWHY-JS-004",
                message=(
                    "JavaScript file has likely obfuscated signals."
                    if likely
                    else "JavaScript file has possible obfuscation signals."
                ),
                evidence=[f"{file_name} contains signals: {joined_signals}."],
                file_path=file_name,
                line_number=_first_obfuscation_line(source),
                symbol="javascript-obfuscation",
                **severity_override,
            )
        )

    return FileStaticAnalysis(
        detected_capabilities=sorted(capabilities),
        warnings=warnings,
        evidence=evidence,
        rule_evidence=rule_evidence,
        javascript_files_scanned=1,
    )
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
def _read_small_text(path: Path) -> str | None:
    """Return the UTF-8 text of ``path``, or ``None`` if it should be skipped.

    ``None`` is returned when the file is larger than ``MAX_TEXT_SCAN_BYTES``,
    cannot be stat'ed or read, or is not valid UTF-8.
    """
    try:
        too_large = path.stat().st_size > MAX_TEXT_SCAN_BYTES
        return None if too_large else path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return None
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
def _first_matching_line(source: str, pattern: re.Pattern[str]) -> int | None:
|
|
471
|
+
for index, line in enumerate(source.splitlines(), start=1):
|
|
472
|
+
if pattern.search(line):
|
|
473
|
+
return index
|
|
474
|
+
return None
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
def _first_long_line(lines: list[str], minimum_length: int) -> int | None:
|
|
478
|
+
for index, line in enumerate(lines, start=1):
|
|
479
|
+
if len(line) >= minimum_length:
|
|
480
|
+
return index
|
|
481
|
+
return None
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
def _first_obfuscation_line(source: str) -> int | None:
    """Return the earliest line number matched by any obfuscation pattern.

    Each pattern in ``JS_OBFUSCATION_PATTERNS`` contributes its first
    matching line; the smallest of those is returned, or ``None`` when no
    pattern matches any line.
    """
    first_hits = (_first_matching_line(source, pattern) for pattern in JS_OBFUSCATION_PATTERNS)
    return min((hit for hit in first_hits if hit is not None), default=None)
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
def _is_shell_script(path: Path) -> bool:
    """Return whether ``path`` looks like a shell script.

    A file qualifies when its suffix is in ``SHELL_SUFFIXES`` or when its
    shebang names a shell interpreter.

    The shebang is checked by interpreter name, not by looking for the
    bytes ``sh`` anywhere in the first line. The substring test flagged any
    script whose interpreter path merely contained "sh", for example
    ``#!/home/josh/venv/bin/python``.

    Args:
        path: File to classify. Up to the first 128 bytes are read when the
            suffix is not conclusive.

    Returns:
        ``True`` for a shell script, ``False`` otherwise, including when the
        file cannot be opened.
    """
    if path.suffix.lower() in SHELL_SUFFIXES:
        return True
    try:
        with path.open("rb") as handle:
            first_line = handle.readline(128)
    except OSError:
        return False
    if not first_line.startswith(b"#!"):
        return False

    # Shell names implied by the known suffixes, plus common shells that
    # have no entry in SHELL_SUFFIXES.
    shell_names = {suffix.lstrip(".") for suffix in SHELL_SUFFIXES} | {
        "ash",
        "csh",
        "dash",
        "mksh",
        "pwsh",
        "tcsh",
    }
    # Launchers that run the program named by their first real argument.
    launchers = {"env", "busybox"}

    def base_name(token: str) -> str:
        return token.replace("\\", "/").rsplit("/", 1)[-1].lower()

    tokens = first_line[2:].decode("utf-8", errors="replace").split()
    if not tokens:
        return False
    interpreter = base_name(tokens[0])
    if interpreter in launchers:
        # Skip launcher options (-S, -i) and VAR=value assignments.
        target = next(
            (token for token in tokens[1:] if not token.startswith("-") and "=" not in token),
            "",
        )
        interpreter = base_name(target)
    return interpreter in shell_names
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
def _character_ratio(source: str, predicate: Callable[[str], bool]) -> float:
|
|
506
|
+
if not source:
|
|
507
|
+
return 0.0
|
|
508
|
+
return sum(1 for char in source if predicate(char)) / len(source)
|
|
509
|
+
|
|
510
|
+
|
|
511
|
+
def _unique(values: list[str]) -> list[str]:
|
|
512
|
+
unique_values: list[str] = []
|
|
513
|
+
for value in values:
|
|
514
|
+
if value not in unique_values:
|
|
515
|
+
unique_values.append(value)
|
|
516
|
+
return unique_values
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
def _prioritize_rule_evidence(rules: list[RiskRuleEvidence], limit: int = 100) -> list[RiskRuleEvidence]:
    """De-duplicate rule evidence, order it by severity, and cap the result.

    Two entries are duplicates when rule id, file path, line number, symbol
    and message all match; the first occurrence is kept. The survivors are
    sorted by severity (critical first, unrecognized severities last), then
    by rule id, file path, line number and symbol. Missing file paths and
    symbols sort as ``""`` and missing line numbers as ``0``.

    Args:
        rules: Evidence entries in discovery order.
        limit: Maximum number of entries to return after sorting.
    """
    severity_rank = {
        severity: rank
        for rank, severity in enumerate(
            (
                RuleSeverity.CRITICAL,
                RuleSeverity.HIGH,
                RuleSeverity.MEDIUM,
                RuleSeverity.LOW,
                RuleSeverity.INFO,
            )
        )
    }

    first_seen: dict[tuple[str, str | None, int | None, str | None, str], RiskRuleEvidence] = {}
    for rule in rules:
        identity = (rule.rule_id, rule.file_path, rule.line_number, rule.symbol, rule.message)
        if identity not in first_seen:
            first_seen[identity] = rule

    def sort_key(rule: RiskRuleEvidence) -> tuple[int, str, str, int, str]:
        return (
            severity_rank.get(rule.severity, 5),
            rule.rule_id,
            rule.file_path or "",
            rule.line_number or 0,
            rule.symbol or "",
        )

    ordered = sorted(first_seen.values(), key=sort_key)
    return ordered[:limit]
|