openhack 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openhack/__init__.py +2 -0
- openhack/__main__.py +225 -0
- openhack/agents/__init__.py +30 -0
- openhack/agents/base.py +230 -0
- openhack/agents/browser_verifier.py +679 -0
- openhack/agents/browser_verifier_swarm.py +256 -0
- openhack/agents/checkpoint.py +89 -0
- openhack/agents/context_manager.py +356 -0
- openhack/agents/coordinator.py +1105 -0
- openhack/agents/endpoint_analyst.py +307 -0
- openhack/agents/feature_hunter.py +93 -0
- openhack/agents/hunter.py +481 -0
- openhack/agents/hunter_swarm.py +385 -0
- openhack/agents/llm.py +334 -0
- openhack/agents/recon.py +19 -0
- openhack/agents/sandbox_verifier.py +396 -0
- openhack/agents/sandbox_verifier_swarm.py +250 -0
- openhack/agents/session.py +286 -0
- openhack/agents/validator.py +217 -0
- openhack/agents/validator_swarm.py +106 -0
- openhack/auth.py +175 -0
- openhack/browser/__init__.py +12 -0
- openhack/browser/runner.py +385 -0
- openhack/categories.py +130 -0
- openhack/config.py +201 -0
- openhack/deterministic_recon.py +464 -0
- openhack/entry_points.py +745 -0
- openhack/framework_classifier.py +515 -0
- openhack/framework_detection.py +269 -0
- openhack/headless_scan.py +179 -0
- openhack/prompts/__init__.py +108 -0
- openhack/prompts/browser_verifier.py +171 -0
- openhack/prompts/coordinator.py +31 -0
- openhack/prompts/django/__init__.py +32 -0
- openhack/prompts/django/auth_bypass.py +76 -0
- openhack/prompts/django/csrf.py +62 -0
- openhack/prompts/django/data_exposure.py +67 -0
- openhack/prompts/django/idor.py +74 -0
- openhack/prompts/django/injection.py +67 -0
- openhack/prompts/django/misconfiguration.py +70 -0
- openhack/prompts/django/ssrf.py +64 -0
- openhack/prompts/endpoint_analyst.py +122 -0
- openhack/prompts/express/__init__.py +29 -0
- openhack/prompts/express/auth_bypass.py +71 -0
- openhack/prompts/express/data_exposure.py +77 -0
- openhack/prompts/express/idor.py +69 -0
- openhack/prompts/express/injection.py +75 -0
- openhack/prompts/express/misconfiguration.py +72 -0
- openhack/prompts/express/ssrf.py +63 -0
- openhack/prompts/feature_hunter.py +140 -0
- openhack/prompts/flask/__init__.py +29 -0
- openhack/prompts/flask/auth_bypass.py +86 -0
- openhack/prompts/flask/data_exposure.py +78 -0
- openhack/prompts/flask/idor.py +83 -0
- openhack/prompts/flask/injection.py +77 -0
- openhack/prompts/flask/misconfiguration.py +73 -0
- openhack/prompts/flask/ssrf.py +65 -0
- openhack/prompts/hunter.py +362 -0
- openhack/prompts/hunter_continuation_loop.py +12 -0
- openhack/prompts/hunter_continuation_no_findings.py +19 -0
- openhack/prompts/hunter_continuation_no_progress.py +22 -0
- openhack/prompts/hunter_tool_instructions.py +55 -0
- openhack/prompts/nextjs/__init__.py +42 -0
- openhack/prompts/nextjs/auth_bypass.py +80 -0
- openhack/prompts/nextjs/csrf.py +71 -0
- openhack/prompts/nextjs/data_exposure.py +88 -0
- openhack/prompts/nextjs/idor.py +64 -0
- openhack/prompts/nextjs/injection.py +65 -0
- openhack/prompts/nextjs/middleware_bypass.py +75 -0
- openhack/prompts/nextjs/misconfiguration.py +92 -0
- openhack/prompts/nextjs/server_actions.py +97 -0
- openhack/prompts/nextjs/ssrf.py +66 -0
- openhack/prompts/nextjs/xss.py +69 -0
- openhack/prompts/pr_analysis_system.py +80 -0
- openhack/prompts/pr_analysis_user.py +11 -0
- openhack/prompts/project_context.py +89 -0
- openhack/prompts/recon.py +199 -0
- openhack/prompts/reporter.py +88 -0
- openhack/prompts/researchers.py +434 -0
- openhack/prompts/sandbox_verifier.py +128 -0
- openhack/prompts/supabase/__init__.py +39 -0
- openhack/prompts/supabase/auth_tokens.py +131 -0
- openhack/prompts/supabase/edge_functions.py +150 -0
- openhack/prompts/supabase/graphql.py +102 -0
- openhack/prompts/supabase/postgrest.py +99 -0
- openhack/prompts/supabase/realtime.py +93 -0
- openhack/prompts/supabase/rls.py +110 -0
- openhack/prompts/supabase/rpc_functions.py +127 -0
- openhack/prompts/supabase/storage.py +110 -0
- openhack/prompts/supabase/tenant_isolation.py +118 -0
- openhack/prompts/validator.py +319 -0
- openhack/prompts/validator_continuation_incomplete.py +12 -0
- openhack/prompts/validator_tool_instructions.py +29 -0
- openhack/quality.py +231 -0
- openhack/sandbox/__init__.py +12 -0
- openhack/sandbox/orchestrator.py +517 -0
- openhack/sandbox/runner.py +177 -0
- openhack/scan_session.py +245 -0
- openhack/setup.py +452 -0
- openhack/static_validator.py +612 -0
- openhack/tools/__init__.py +1 -0
- openhack/tools/ast_tools.py +307 -0
- openhack/tools/coverage.py +1078 -0
- openhack/tools/filesystem.py +404 -0
- openhack/tools/nextjs.py +258 -0
- openhack/tools/registry.py +52 -0
- openhack/tui.py +3450 -0
- openhack/updates.py +170 -0
- openhack-0.1.0.dist-info/METADATA +189 -0
- openhack-0.1.0.dist-info/RECORD +113 -0
- openhack-0.1.0.dist-info/WHEEL +4 -0
- openhack-0.1.0.dist-info/entry_points.txt +2 -0
- openhack-0.1.0.dist-info/licenses/LICENSE +661 -0
|
@@ -0,0 +1,612 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Deterministic pre-validator for hunter findings.
|
|
3
|
+
|
|
4
|
+
Runs BEFORE the LLM validator to filter out findings that are provably
|
|
5
|
+
wrong without any LLM reasoning. Two layers:
|
|
6
|
+
|
|
7
|
+
Layer A (basic checks):
|
|
8
|
+
- Does the file exist?
|
|
9
|
+
- Does the reported line actually contain the claimed pattern?
|
|
10
|
+
- Is the pattern in a comment or string literal, not actual code?
|
|
11
|
+
- Is it in a test/fixture file?
|
|
12
|
+
|
|
13
|
+
Layer B (tree-sitter sink verification):
|
|
14
|
+
- Parse the function containing the reported sink
|
|
15
|
+
- Check whether any parameter traces back to user input (req, request, params, body, etc.)
|
|
16
|
+
- If the sink only receives constants/config values, reject the finding
|
|
17
|
+
|
|
18
|
+
Returns: filtered list of findings with rejection reasons attached to dropped ones.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import logging
|
|
24
|
+
import os
|
|
25
|
+
import re
|
|
26
|
+
from typing import Optional
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
# --- User input source identifiers ---
|
|
31
|
+
_USER_INPUT_NAMES = {
|
|
32
|
+
# JS/TS
|
|
33
|
+
"req", "request", "params", "query", "body", "searchParams",
|
|
34
|
+
"ctx", "input", "args", "data", "payload", "formData",
|
|
35
|
+
"req.body", "req.query", "req.params", "request.body",
|
|
36
|
+
"request.form", "request.args", "request.json", "request.data",
|
|
37
|
+
"request.GET", "request.POST", "request.FILES",
|
|
38
|
+
# Python
|
|
39
|
+
"self.request",
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
# --- Sink patterns per category ---
|
|
43
|
+
# Regexes that recognise a dangerous sink, keyed by normalized finding
# category (lower-case, underscores).  A category counts as "present" in a
# piece of text when ANY of its patterns matches.  All patterns are
# case-insensitive and deliberately broad.
_CATEGORY_SINK_PATTERNS: dict[str, list[re.Pattern]] = {
    "sql_injection": [
        re.compile(r"(?:query|execute|raw|text|cursor\.execute|db\.execute|\.raw\(|\.extra\(|RawSQL)", re.I),
        re.compile(r"(?:SELECT|INSERT|UPDATE|DELETE|FROM|WHERE)", re.I),
    ],
    "xss": [
        re.compile(r"(?:dangerouslySetInnerHTML|innerHTML|document\.write|v-html|mark_safe|Markup\(|\|safe)", re.I),
    ],
    "rce": [
        re.compile(r"(?:eval|exec|Function\(|child_process|subprocess|os\.system|os\.popen|spawn|execFile)", re.I),
    ],
    "command_injection": [
        re.compile(r"(?:exec|spawn|subprocess|os\.system|os\.popen|child_process|shell=True)", re.I),
    ],
    "ssrf": [
        re.compile(r"(?:fetch|axios|http\.request|urllib|requests\.get|requests\.post|httpx)", re.I),
    ],
    "ssti": [
        re.compile(r"(?:render_template_string|Template\(|from_string|ejs\.render|nunjucks\.renderString|pug\.render)", re.I),
    ],
    "path_traversal": [
        re.compile(r"(?:sendFile|send_file|FileResponse|open\(|readFile|createReadStream|res\.download)", re.I),
    ],
    "open_redirect": [
        re.compile(r"(?:redirect|302|Location|NextResponse\.redirect|res\.redirect)", re.I),
    ],
    "prototype_pollution": [
        # The dot in "_.merge" must be escaped: unescaped it matched "_" plus
        # ANY character followed by "merge" (e.g. "list_xmerge").
        re.compile(r"(?:__proto__|Object\.assign|lodash\.merge|_\.merge|defaultsDeep|deepmerge)", re.I),
    ],
}
|
|
73
|
+
|
|
74
|
+
# --- Comment patterns ---
|
|
75
|
+
_JS_LINE_COMMENT = re.compile(r"^\s*//")
|
|
76
|
+
_JS_BLOCK_COMMENT_START = re.compile(r"/\*")
|
|
77
|
+
_JS_BLOCK_COMMENT_END = re.compile(r"\*/")
|
|
78
|
+
_PY_LINE_COMMENT = re.compile(r"^\s*#")
|
|
79
|
+
_PY_DOCSTRING = re.compile(r'^\s*(?:"""|\'\'\')')
|
|
80
|
+
|
|
81
|
+
_TEST_DIR_PATTERNS = re.compile(
|
|
82
|
+
r"(?:^|/)(?:test|tests|__tests__|spec|__mocks__|fixtures|mocks|__fixtures__|e2e|cypress|playwright)(?:/|$)",
|
|
83
|
+
re.I,
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
_NON_PRODUCTION_DIR_PATTERNS = re.compile(
|
|
87
|
+
r"(?:^|/)(?:"
|
|
88
|
+
r"test|tests|__tests__|spec|__mocks__|fixtures|__fixtures__|e2e|cypress|playwright|"
|
|
89
|
+
r"cli|CLI|docs|documentation|examples?|samples?|scripts|tools|devtools|"
|
|
90
|
+
r"benchmarks?|integration-tests|\.storybook|stories"
|
|
91
|
+
r")(?:/|$)",
|
|
92
|
+
re.I,
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
# Signals that code is intentional design, not a vulnerability
|
|
96
|
+
_INTENT_COMMENT_PATTERNS = [
|
|
97
|
+
re.compile(r"@since\s+\d+\.\d+", re.I), # Versioned API — deliberate
|
|
98
|
+
re.compile(r"intentionally?\s+(?:public|open|exposed|disabled|skipped)", re.I),
|
|
99
|
+
re.compile(r"by\s+design", re.I),
|
|
100
|
+
re.compile(r"public\s+(?:endpoint|api|route)", re.I),
|
|
101
|
+
re.compile(r"no\s+auth(?:entication)?\s+(?:required|needed)", re.I),
|
|
102
|
+
re.compile(r"allow\s+(?:unauthenticated|anonymous|public)", re.I),
|
|
103
|
+
re.compile(r"fallback\s+(?:for|in)\s+(?:dev|development|test)", re.I),
|
|
104
|
+
re.compile(r"default\s+(?:for|in)\s+(?:dev|development|test)", re.I),
|
|
105
|
+
re.compile(r"only\s+(?:in|for|when)\s+(?:dev|development|test|non-prod)", re.I),
|
|
106
|
+
]
|
|
107
|
+
|
|
108
|
+
# Code patterns that indicate dev-only fallbacks, not production secrets
|
|
109
|
+
_DEV_FALLBACK_PATTERNS = [
|
|
110
|
+
re.compile(r"(?:NODE_ENV|RAILS_ENV|FLASK_ENV|APP_ENV)\s*[!=]=\s*['\"](?:production|prod)['\"]", re.I),
|
|
111
|
+
re.compile(r"process\.env\.\w+\s*\|\|\s*['\"]", re.I), # env || "default"
|
|
112
|
+
re.compile(r"process\.env\.\w+\s*\|\|\s*\w+", re.I), # env || CONSTANT_NAME
|
|
113
|
+
re.compile(r"os\.environ\.get\(\s*['\"][^'\"]+['\"]\s*,\s*['\"]", re.I), # os.environ.get("X", "default")
|
|
114
|
+
re.compile(r"ENV\.fetch\(\s*['\"][^'\"]+['\"]\s*,\s*['\"]", re.I), # Ruby ENV.fetch("X", "default")
|
|
115
|
+
re.compile(r"(?:const|let|var)\s+DEFAULT_\w*\s*=\s*['\"]", re.I), # const DEFAULT_SECRET = "value"
|
|
116
|
+
re.compile(r"\?\?\s*DEFAULT_", re.I), # ?? DEFAULT_CONSTANT (nullish coalescing)
|
|
117
|
+
]
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _is_test_file(file_path: str) -> bool:
    """Return True when *file_path* looks like a test or fixture file.

    A path qualifies when either

    * its basename (case-insensitive) starts or ends with ``test_``,
      ``conftest.`` or ``fixture``, or contains one of the infix markers
      ``_test.``, ``.test.`` or ``.spec.`` (``foo_test.py``, ``foo.test.js``,
      ``foo.spec.ts``); or
    * one of its directory components matches ``_TEST_DIR_PATTERNS``.

    The infix markers used to be tested with ``startswith``/``endswith``
    only, which can never match a name that still carries its extension.
    """
    basename = os.path.basename(file_path).lower()

    edge_markers = ("test_", "conftest.", "fixture")
    infix_markers = ("_test.", ".test.", ".spec.")

    # Cheap string checks first; the directory regex is the fallback.
    if basename.startswith(edge_markers) or basename.endswith(edge_markers):
        return True
    if any(marker in basename for marker in infix_markers):
        return True
    return bool(_TEST_DIR_PATTERNS.search(file_path))
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _is_non_production_path(file_path: str) -> bool:
    """Return True when *file_path* is not production code.

    A path qualifies when either

    * its basename (case-insensitive) starts or ends with ``test_``,
      ``conftest.`` or ``fixture``, or contains one of the infix markers
      ``_test.``, ``.test.`` or ``.spec.``; or
    * one of its directory components matches
      ``_NON_PRODUCTION_DIR_PATTERNS`` (tests, CLI, docs, examples, ...).

    The infix markers used to be tested with ``startswith``/``endswith``
    only, so names such as ``foo.test.js`` or ``foo_test.py`` slipped through.
    """
    basename = os.path.basename(file_path).lower()

    edge_markers = ("test_", "conftest.", "fixture")
    infix_markers = ("_test.", ".test.", ".spec.")

    # Cheap string checks first; the directory regex is the fallback.
    if basename.startswith(edge_markers) or basename.endswith(edge_markers):
        return True
    if any(marker in basename for marker in infix_markers):
        return True
    return bool(_NON_PRODUCTION_DIR_PATTERNS.search(file_path))
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _check_developer_intent(content: str, line_number: Optional[int], category: str) -> Optional[str]:
    """Return a reason string when the code looks deliberately written this way.

    Two signals are consulted, in this order:

    1. Any ``_INTENT_COMMENT_PATTERNS`` hit inside a window of *content*.
       With a usable ``line_number`` (1-based, inside the file) the window
       runs from 9 lines before the reported line to 5 lines after it;
       otherwise it is the first 50 lines of the file.
    2. Only for hardcoded-secret / misconfiguration categories: any
       ``_DEV_FALLBACK_PATTERNS`` hit anywhere in the whole file.

    Returns ``None`` when neither signal fires.
    """
    all_lines = content.split("\n")
    total = len(all_lines)

    if not line_number or not (0 < line_number <= total):
        # No usable line number: fall back to the head of the file.
        lo, hi = 0, min(total, 50)
    else:
        lo, hi = max(0, line_number - 10), min(total, line_number + 5)
    snippet = "\n".join(all_lines[lo:hi])

    for intent in _INTENT_COMMENT_PATTERNS:
        hit = intent.search(snippet)
        if hit:
            return f"Developer intent detected: '{hit.group(0).strip()}' near line {line_number or '?'}"

    secret_like = ("hardcoded_secret", "hardcoded_secrets", "security_misconfiguration", "misconfiguration")
    if category.lower().replace(" ", "_").replace("-", "_") not in secret_like:
        return None

    # The env-var fallback may sit far away from the flagged constant, so the
    # whole file is scanned rather than the window.
    for fallback in _DEV_FALLBACK_PATTERNS:
        found = fallback.search(content)
        if found:
            return f"Dev-only fallback pattern: '{found.group(0).strip()}' — not a production secret"

    return None
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _is_comment_line(line: str, file_path: str) -> bool:
    """Tell whether *line* is a comment according to the file's language.

    ``.py`` files: the stripped line starts with ``#`` or opens with a
    triple-quoted string.  Every other file: the stripped line starts with
    ``//``.
    """
    text = line.strip()
    if not file_path.endswith(".py"):
        return _JS_LINE_COMMENT.match(text) is not None
    if _PY_LINE_COMMENT.match(text) is not None:
        return True
    return _PY_DOCSTRING.match(text) is not None
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _line_has_sink_pattern(line: str, category: str) -> bool:
    """Return True when *line* matches any sink pattern of *category*.

    The category is canonicalised with ``_normalize_finding_category`` so
    this lookup agrees with the validators in this module (the previous
    hand-rolled normalization did not strip surrounding whitespace).
    Categories without registered patterns always yield ``False``.
    """
    patterns = _CATEGORY_SINK_PATTERNS.get(_normalize_finding_category(category), ())
    return any(pattern.search(line) for pattern in patterns)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _normalize_finding_category(cat: str) -> str:
    """Normalize a category label to the canonical lookup form.

    Lower-cases, strips surrounding whitespace, and turns spaces and hyphens
    into underscores (``"SQL Injection"`` -> ``"sql_injection"``).

    A missing category (``None`` or ``""``) normalizes to ``""`` instead of
    raising: callers pass ``finding.get("category", "")``, which still
    yields ``None`` when the key is present with a null value.
    """
    if not cat:
        return ""
    return cat.lower().strip().replace(" ", "_").replace("-", "_")
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
# =========================================================================
|
|
200
|
+
# Layer A: Basic deterministic checks
|
|
201
|
+
# =========================================================================
|
|
202
|
+
|
|
203
|
+
def _strip_line_prefix(text: str) -> str:
    """Drop everything up to and including the first tab, if there is one.

    Used to remove the line-number prefix of the ``read_file`` output format.
    """
    return text.split("\t", 1)[1] if "\t" in text else text


def validate_finding_basic(finding: dict, fs_tools) -> tuple[bool, str]:
    """Run the basic deterministic checks (Layer A) on one finding.

    Checks, in order:

    1. The reported file can be read through ``fs_tools.read_file``.
    2. If a valid 1-based ``line_number`` is given:
       a. the reported line is not a comment;
       b. for categories with sink patterns (and not commonly indirect), some
          line within +-3 lines of the reported one matches a sink pattern.
    3. The file is not in a non-production path.
    4. The surrounding code does not signal intentional design.

    The file is read once and its content reused for check 4.

    Args:
        finding: Finding dict; ``file_path``, ``line_number`` and ``category``
            are consulted.
        fs_tools: Object whose ``read_file(path)`` returns a dict with either
            an ``"error"`` key or a ``"content"`` key.

    Returns:
        ``(is_valid, reason)``; *reason* explains the rejection, or is
        ``"passed"``.
    """
    file_path = finding.get("file_path", "")
    line_number = finding.get("line_number")
    category = _normalize_finding_category(finding.get("category", ""))

    # Categories where input and sink commonly live in different files, so a
    # missing sink near the reported line proves nothing:
    #   SSRF: URL stored in DB by controller, fetched by background service
    #   Auth bypass: middleware misconfigured in one file, exploited via another
    #   Data exposure: data returned by helper, exposed by controller
    indirect_categories = {
        "ssrf", "auth_bypass", "authentication_bypass", "authorization_bypass",
        "data_exposure", "idor", "business_logic_flaw",
    }

    content = None  # stays None when the finding carries no file path

    # Check 1: File must exist
    if file_path:
        result = fs_tools.read_file(file_path)
        if "error" in result:
            return False, f"File does not exist: {file_path}"

        content = result.get("content", "")
        lines = content.split("\n")

        # Check 2: If line number provided, verify it has something relevant
        if line_number and 0 < line_number <= len(lines):
            target_line = _strip_line_prefix(lines[line_number - 1])

            # Check 2a: Is this line a comment?
            if _is_comment_line(target_line, file_path):
                return False, f"Reported line {line_number} is a comment"

            # Check 2b: only reject when we have specific sink patterns AND
            # nothing in the window (line +-3) matches any of them.
            if category in _CATEGORY_SINK_PATTERNS and category not in indirect_categories:
                window_start = max(0, line_number - 4)
                window_end = min(len(lines), line_number + 3)
                window_text = "\n".join(
                    _strip_line_prefix(raw) for raw in lines[window_start:window_end]
                )
                if not any(p.search(window_text) for p in _CATEGORY_SINK_PATTERNS[category]):
                    return False, (
                        f"Line {line_number} (±3 lines) has no {category} sink pattern"
                    )

    # Check 3: Non-production path — reject findings in test/CLI/docs/examples dirs
    if _is_non_production_path(file_path):
        return False, f"File is in a non-production path (test/CLI/docs/examples): {file_path}"

    # Check 4: Developer intent — reject findings where code shows deliberate design
    if content is not None:
        intent_reason = _check_developer_intent(content, line_number, category)
        if intent_reason:
            return False, f"Intentional design: {intent_reason}"

    return True, "passed"
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
# =========================================================================
|
|
274
|
+
# Layer B: Tree-sitter sink verification
|
|
275
|
+
# =========================================================================
|
|
276
|
+
|
|
277
|
+
_TS_LANGUAGES: dict = {} # Lazy-loaded
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def _get_ts_language(file_path: str):
    """Return the tree-sitter ``Language`` for *file_path*, or ``None``.

    The language is chosen from the file extension:

    * ``.js`` / ``.jsx`` / ``.mjs`` / ``.cjs`` -> JavaScript grammar
    * ``.ts`` / ``.mts`` / ``.cts``            -> TypeScript grammar
    * ``.tsx``                                 -> TSX grammar (the plain
      TypeScript grammar does not parse JSX syntax)
    * ``.py``                                  -> Python grammar

    Loaded languages are cached in ``_TS_LANGUAGES``.  A failed load (for
    example a missing grammar package) is cached as ``None`` so the import is
    not retried for every finding.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext in (".js", ".jsx", ".mjs", ".cjs"):
        lang_key = "javascript"
    elif ext == ".tsx":
        lang_key = "tsx"
    elif ext in (".ts", ".mts", ".cts"):
        lang_key = "typescript"
    elif ext == ".py":
        lang_key = "python"
    else:
        return None

    if lang_key not in _TS_LANGUAGES:
        try:
            import tree_sitter as ts

            if lang_key == "javascript":
                import tree_sitter_javascript as ts_js
                grammar = ts_js.language()
            elif lang_key == "tsx":
                import tree_sitter_typescript as ts_ts
                grammar = ts_ts.language_tsx()
            elif lang_key == "typescript":
                import tree_sitter_typescript as ts_ts
                grammar = ts_ts.language_typescript()
            else:
                import tree_sitter_python as ts_py
                grammar = ts_py.language()
            _TS_LANGUAGES[lang_key] = ts.Language(grammar)
        except Exception as e:
            # Best effort: tree-sitter is optional, callers treat None as
            # "unsupported" and let the finding through.
            logger.debug("tree-sitter language load failed for %s: %s", lang_key, e)
            _TS_LANGUAGES[lang_key] = None

    return _TS_LANGUAGES.get(lang_key)
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def _find_enclosing_function(tree, line: int, language_key: str):
    """Return the innermost function node whose row span contains *line*.

    *line* is a 0-indexed row.  For ``"python"`` only ``function_definition``
    nodes count as functions; for any other language key the JS/TS function
    node types are used.  Returns ``None`` when no function spans the line.
    """
    js_like_kinds = {
        "function_declaration", "arrow_function", "function",
        "method_definition", "function_expression",
    }
    wanted = {"function_definition"} if language_key == "python" else js_like_kinds

    innermost = None
    pending = [tree.root_node]
    while pending:
        current = pending.pop()

        is_candidate = (
            current.type in wanted
            and current.start_point.row <= line <= current.end_point.row
        )
        # Among candidates, the one starting latest wins (ties go to the
        # node visited last).
        if is_candidate and (
            innermost is None or current.start_point.row >= innermost.start_point.row
        ):
            innermost = current

        for kid in current.children:
            first_row = kid.start_point.row
            if first_row <= line <= kid.end_point.row + 5:
                pending.append(kid)
            elif first_row > line + 5:
                # Stop at the first child that begins well past the target line.
                break

    return innermost
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
_PY_SPLAT_TYPES = ("list_splat_pattern", "dictionary_splat_pattern")
_PY_WRAPPED_PARAM_TYPES = ("default_parameter", "typed_parameter", "typed_default_parameter")

_JS_WRAPPED_PARAM_TYPES = (
    "required_parameter", "optional_parameter",   # TypeScript parameter wrappers
    "assignment_pattern", "object_assignment_pattern",  # parameters with defaults
)
_JS_CONTAINER_PATTERN_TYPES = ("object_pattern", "array_pattern", "rest_pattern")
_JS_LEAF_BINDING_TYPES = ("identifier", "shorthand_property_identifier_pattern")
_JS_BINDING_TYPES = _JS_LEAF_BINDING_TYPES + _JS_CONTAINER_PATTERN_TYPES + _JS_WRAPPED_PARAM_TYPES


def _collect_python_param(param, names: set) -> None:
    """Add the name bound by one child of a Python ``parameters`` node."""
    kind = param.type
    if kind == "identifier":
        names.add(param.text.decode())
    elif kind in _PY_WRAPPED_PARAM_TYPES or kind in _PY_SPLAT_TYPES:
        # The bound name comes first; a default value or annotation follows.
        # An annotated splat (``**kwargs: Any``) wraps the splat pattern.
        for sub in param.children:
            if sub.type == "identifier" or sub.type in _PY_SPLAT_TYPES:
                _collect_python_param(sub, names)
                break


def _collect_js_binding(node, names: set) -> None:
    """Add every name bound by one JS/TS parameter pattern node."""
    kind = node.type
    if kind in _JS_LEAF_BINDING_TYPES:
        names.add(node.text.decode())
    elif kind in _JS_WRAPPED_PARAM_TYPES:
        # Binding pattern first; type annotation / default value after it.
        for sub in node.children:
            if sub.type in _JS_BINDING_TYPES:
                _collect_js_binding(sub, names)
                break
    elif kind == "pair_pattern":
        # ``{ body: alias }`` records the property key, as before.
        for sub in node.children:
            if sub.type == "property_identifier":
                names.add(sub.text.decode())
    elif kind in _JS_CONTAINER_PATTERN_TYPES:
        for sub in node.children:
            _collect_js_binding(sub, names)


def _extract_parameter_names(func_node, language_key: str) -> set[str]:
    """Extract the parameter names bound by a function node.

    Python (``language_key == "python"``): plain, defaulted, annotated and
    ``*``/``**`` parameters, including annotated splats.

    JS/TS (any other key): plain identifiers, TypeScript
    ``required_parameter``/``optional_parameter`` wrappers, default values,
    rest parameters, object/array destructuring (also inside a TypeScript
    wrapper, e.g. ``({ ctx, input }: Props) =>``), and the single
    unparenthesised parameter of an arrow function (``req => ...``).

    Returns:
        The set of parameter names; empty when nothing is found.
    """
    names: set[str] = set()

    if language_key == "python":
        # Python: def foo(request, pk, **kwargs)
        for child in func_node.children:
            if child.type == "parameters":
                for param in child.children:
                    _collect_python_param(param, names)
        return names

    # JS/TS: function foo(req, res) or (req, res) => or req =>
    for child in func_node.children:
        if child.type == "formal_parameters":
            for param in child.children:
                _collect_js_binding(param, names)
        elif child.type == "=>":
            # Everything after the arrow is the body, not a parameter.
            break
        elif child.type == "identifier" and func_node.type == "arrow_function":
            names.add(child.text.decode())

    return names
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
_INPUT_ACCESSOR_RE = re.compile(
|
|
390
|
+
r"\b(?:req|request|params|query|body|searchParams|self\.request)"
|
|
391
|
+
r"(?:\.\w+)*"
|
|
392
|
+
r"|(?:request\.(?:GET|POST|FILES|body|json|form|args|data|query_params))"
|
|
393
|
+
r"|(?:req\.(?:body|query|params|cookies|headers))"
|
|
394
|
+
r"|await\s+\w+\.json\(\)"
|
|
395
|
+
r"|getattr\s*\(\s*request",
|
|
396
|
+
re.I,
|
|
397
|
+
)
|
|
398
|
+
|
|
399
|
+
_KEYWORDS = {"const", "let", "var", "return", "await", "async", "function",
|
|
400
|
+
"if", "else", "for", "while", "true", "false", "null", "undefined",
|
|
401
|
+
"def", "class", "import", "from", "None", "True", "False", "self",
|
|
402
|
+
"new", "typeof", "instanceof", "try", "catch", "throw", "export"}
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
# ``{ a, b } = rhs`` / ``[a, b] = rhs`` destructuring, possibly spanning
# several lines.  The look-behind rejects subscripts (``arr[i] = ...``,
# ``f()[0] = ...``) and generic type hints (``List[int] = ...``); the
# look-ahead rejects ``==`` comparisons and ``=>`` arrows.
_DESTRUCTURING_ASSIGN_RE = re.compile(
    r"(?<![\w$)\]])[{\[]([^{}\[\]]*)[}\]]\s*=(?![=>])\s*(.*)"
)
_SINK_IDENTIFIER_RE = re.compile(r"\b([a-zA-Z_]\w*)\b")


def _function_has_user_input_at_sink(func_node, sink_line: int, language_key: str) -> bool:
    """Check if user input can plausibly reach the sink line.

    Strategy:
    1. Extract the function parameters and keep those named like user input
       (``_USER_INPUT_NAMES``).
    2. Accept if the sink window (the sink line +-2 lines) mentions such a
       parameter or matches ``_INPUT_ACCESSOR_RE``.
    3. Otherwise trace backwards: accept if an identifier from the sink
       window was assigned from user input earlier in the function, through
       a plain assignment (optionally annotated, ``x: T = ...``) or through
       object/array destructuring (``const { id } = req.body``).

    Compared with the earlier regex-only trace, assignments are matched on
    whole identifiers (``id`` no longer matches inside ``userid = ...``) and
    ``==`` / ``=>`` are not mistaken for assignments.

    Args:
        func_node: Tree-sitter node of the enclosing function.
        sink_line: 0-indexed row of the sink in the file.
        language_key: ``"python"`` or a JS/TS key, forwarded to
            ``_extract_parameter_names``.

    Returns:
        True when user input may reach the sink; False when only non-input
        values were found.
    """
    func_lines = func_node.text.decode().split("\n")

    # Only consider params that look like user input sources
    input_params = _extract_parameter_names(func_node, language_key) & _USER_INPUT_NAMES
    input_param_res = [re.compile(rf"\b{re.escape(name)}\b") for name in input_params]

    def _mentions_user_input(text: str) -> bool:
        if _INPUT_ACCESSOR_RE.search(text):
            return True
        return any(pattern.search(text) for pattern in input_param_res)

    # Sink window: ±2 lines around the reported line
    relative_sink = sink_line - func_node.start_point.row
    check_start = max(0, relative_sink - 2)
    check_end = min(len(func_lines), relative_sink + 3)
    sink_window = " ".join(func_lines[check_start:check_end])

    # Checks 1 + 2: direct reference to an input parameter or accessor
    if _mentions_user_input(sink_window):
        return True

    # Check 3: trace backwards through earlier assignments
    sink_identifiers = set(_SINK_IDENTIFIER_RE.findall(sink_window)) - _KEYWORDS
    if not sink_identifiers:
        return False

    pre_sink_text = "\n".join(func_lines[:max(0, relative_sink)])

    # 3a: destructuring — every identifier inside the pattern counts as bound
    for match in _DESTRUCTURING_ASSIGN_RE.finditer(pre_sink_text):
        bound = set(_SINK_IDENTIFIER_RE.findall(match.group(1)))
        if bound & sink_identifiers and _mentions_user_input(match.group(2)):
            return True

    # 3b: plain (optionally annotated) assignment of a single identifier
    for ident in sink_identifiers:
        assign_pattern = re.compile(
            rf"(?<![\w$]){re.escape(ident)}(?:\s*:[^=\n]*?)?\s*=(?![=>])\s*(.*)"
        )
        for match in assign_pattern.finditer(pre_sink_text):
            if _mentions_user_input(match.group(1)):
                return True

    return False
|
|
462
|
+
|
|
463
|
+
|
|
464
|
+
def validate_finding_treesitter(finding: dict, fs_tools) -> tuple[bool, str]:
    """Tree-sitter based sink verification (Layer B).

    Parses the file, finds the function enclosing the reported line and
    checks whether user input can plausibly reach the sink.  Every situation
    in which the check cannot be performed (no sink patterns for the
    category, no file/line, unsupported language, unreadable file, parse
    failure, no enclosing function) lets the finding through.

    SSRF is skipped here as well: the basic validator treats it as an
    indirect category (URL stored by one component, fetched by another), so
    requiring user input inside the fetching function would reject exactly
    those findings.

    Returns:
        ``(is_valid, reason)``.
    """
    # Categories that have sink patterns but whose input commonly lives in a
    # different function/file than the sink.
    indirect_categories = {"ssrf"}

    file_path = finding.get("file_path", "")
    line_number = finding.get("line_number")
    category = _normalize_finding_category(finding.get("category", ""))

    # Only run for categories where we have sink patterns
    if category not in _CATEGORY_SINK_PATTERNS:
        return True, "no sink patterns for category"

    if category in indirect_categories:
        return True, "indirect category, input may not be local to the sink"

    if not file_path or not line_number:
        return True, "no file/line to verify"

    ts_lang = _get_ts_language(file_path)
    if ts_lang is None:
        return True, "unsupported language for tree-sitter"

    # Read the file
    result = fs_tools.read_file(file_path)
    if "error" in result:
        return True, "could not read file"

    # Strip line-number prefixes (everything up to the first tab of a line)
    raw_content = result.get("content", "")
    clean_lines = [
        line.split("\t", 1)[1] if "\t" in line else line
        for line in raw_content.split("\n")
    ]
    source_bytes = "\n".join(clean_lines).encode("utf-8")

    try:
        import tree_sitter as ts
        parser = ts.Parser(ts_lang)
        tree = parser.parse(source_bytes)
    except Exception as e:
        # Best effort: a parser problem must not drop the finding.
        logger.debug("tree-sitter parse failed for %s: %s", file_path, e)
        return True, "parse failed"

    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".py":
        lang_key = "python"
    elif ext in (".ts", ".tsx", ".mts"):
        lang_key = "typescript"
    else:
        lang_key = "javascript"

    # Find enclosing function (0-indexed line)
    func_node = _find_enclosing_function(tree, line_number - 1, lang_key)
    if func_node is None:
        # No enclosing function — might be module-level code, allow it
        return True, "no enclosing function found"

    # Check if user input identifiers appear near the sink
    if not _function_has_user_input_at_sink(func_node, line_number - 1, lang_key):
        # Look the name up by field; positional children differ between
        # declarations, methods, arrow functions and ``async def``.
        name_node = func_node.child_by_field_name("name")
        func_name = name_node.text.decode() if name_node is not None else "?"
        return False, (
            f"tree-sitter: no user input identifiers found near sink at line {line_number} "
            f"in function {func_name}"
        )

    return True, "user input may reach sink"
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
# =========================================================================
|
|
537
|
+
# Main entry point
|
|
538
|
+
# =========================================================================
|
|
539
|
+
|
|
540
|
+
def run_static_validation(
    potential_findings: list[dict],
    fs_tools,
    enable_treesitter: bool = True,
) -> tuple[list[dict], list[dict], dict]:
    """Filter hunter findings through the deterministic validators.

    Each finding is tagged in place with ``_original_index`` and then run
    through Layer A (``validate_finding_basic``) and, when enabled, Layer B
    (``validate_finding_treesitter``).  The first failing layer rejects the
    finding and records ``_rejection_reason`` on it.

    Args:
        potential_findings: Raw findings from hunter swarm (mutated in place)
        fs_tools: FileSystemTools instance for file access
        enable_treesitter: Whether to run tree-sitter checks (Layer B)

    Returns:
        (passed, rejected, stats) where:
        - passed: findings that survived validation, in input order
        - rejected: findings that were filtered out
        - stats: counters plus one detail entry per rejection
    """
    stats = {
        "input_count": len(potential_findings),
        "basic_rejected": 0,
        "treesitter_rejected": 0,
        "passed": 0,
        "basic_rejections": [],
        "treesitter_rejections": [],
    }

    passed: list[dict] = []
    rejected: list[dict] = []

    def _rejection_entry(position: int, item: dict, why: str) -> dict:
        return {
            "index": position,
            "file": item.get("file_path", ""),
            "category": item.get("category", ""),
            "reason": why,
        }

    for position, item in enumerate(potential_findings):
        item["_original_index"] = position

        # Layer A: Basic checks
        ok, why = validate_finding_basic(item, fs_tools)
        if not ok:
            item["_rejection_reason"] = f"basic: {why}"
            rejected.append(item)
            stats["basic_rejected"] += 1
            stats["basic_rejections"].append(_rejection_entry(position, item, why))
            continue

        # Layer B: Tree-sitter verification
        if enable_treesitter:
            ok, why = validate_finding_treesitter(item, fs_tools)
            if not ok:
                item["_rejection_reason"] = f"treesitter: {why}"
                rejected.append(item)
                stats["treesitter_rejected"] += 1
                stats["treesitter_rejections"].append(_rejection_entry(position, item, why))
                continue

        passed.append(item)
        stats["passed"] += 1

    total_rejected = stats["basic_rejected"] + stats["treesitter_rejected"]
    logger.info(
        f"Static validation: {stats['input_count']} findings → "
        f"{stats['passed']} passed, {total_rejected} rejected "
        f"(basic: {stats['basic_rejected']}, tree-sitter: {stats['treesitter_rejected']})"
    )

    return passed, rejected, stats
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""OpenHack tools."""
|