openhack-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. openhack/__init__.py +2 -0
  2. openhack/__main__.py +225 -0
  3. openhack/agents/__init__.py +30 -0
  4. openhack/agents/base.py +230 -0
  5. openhack/agents/browser_verifier.py +679 -0
  6. openhack/agents/browser_verifier_swarm.py +256 -0
  7. openhack/agents/checkpoint.py +89 -0
  8. openhack/agents/context_manager.py +356 -0
  9. openhack/agents/coordinator.py +1105 -0
  10. openhack/agents/endpoint_analyst.py +307 -0
  11. openhack/agents/feature_hunter.py +93 -0
  12. openhack/agents/hunter.py +481 -0
  13. openhack/agents/hunter_swarm.py +385 -0
  14. openhack/agents/llm.py +334 -0
  15. openhack/agents/recon.py +19 -0
  16. openhack/agents/sandbox_verifier.py +396 -0
  17. openhack/agents/sandbox_verifier_swarm.py +250 -0
  18. openhack/agents/session.py +286 -0
  19. openhack/agents/validator.py +217 -0
  20. openhack/agents/validator_swarm.py +106 -0
  21. openhack/auth.py +175 -0
  22. openhack/browser/__init__.py +12 -0
  23. openhack/browser/runner.py +385 -0
  24. openhack/categories.py +130 -0
  25. openhack/config.py +201 -0
  26. openhack/deterministic_recon.py +464 -0
  27. openhack/entry_points.py +745 -0
  28. openhack/framework_classifier.py +515 -0
  29. openhack/framework_detection.py +269 -0
  30. openhack/headless_scan.py +179 -0
  31. openhack/prompts/__init__.py +108 -0
  32. openhack/prompts/browser_verifier.py +171 -0
  33. openhack/prompts/coordinator.py +31 -0
  34. openhack/prompts/django/__init__.py +32 -0
  35. openhack/prompts/django/auth_bypass.py +76 -0
  36. openhack/prompts/django/csrf.py +62 -0
  37. openhack/prompts/django/data_exposure.py +67 -0
  38. openhack/prompts/django/idor.py +74 -0
  39. openhack/prompts/django/injection.py +67 -0
  40. openhack/prompts/django/misconfiguration.py +70 -0
  41. openhack/prompts/django/ssrf.py +64 -0
  42. openhack/prompts/endpoint_analyst.py +122 -0
  43. openhack/prompts/express/__init__.py +29 -0
  44. openhack/prompts/express/auth_bypass.py +71 -0
  45. openhack/prompts/express/data_exposure.py +77 -0
  46. openhack/prompts/express/idor.py +69 -0
  47. openhack/prompts/express/injection.py +75 -0
  48. openhack/prompts/express/misconfiguration.py +72 -0
  49. openhack/prompts/express/ssrf.py +63 -0
  50. openhack/prompts/feature_hunter.py +140 -0
  51. openhack/prompts/flask/__init__.py +29 -0
  52. openhack/prompts/flask/auth_bypass.py +86 -0
  53. openhack/prompts/flask/data_exposure.py +78 -0
  54. openhack/prompts/flask/idor.py +83 -0
  55. openhack/prompts/flask/injection.py +77 -0
  56. openhack/prompts/flask/misconfiguration.py +73 -0
  57. openhack/prompts/flask/ssrf.py +65 -0
  58. openhack/prompts/hunter.py +362 -0
  59. openhack/prompts/hunter_continuation_loop.py +12 -0
  60. openhack/prompts/hunter_continuation_no_findings.py +19 -0
  61. openhack/prompts/hunter_continuation_no_progress.py +22 -0
  62. openhack/prompts/hunter_tool_instructions.py +55 -0
  63. openhack/prompts/nextjs/__init__.py +42 -0
  64. openhack/prompts/nextjs/auth_bypass.py +80 -0
  65. openhack/prompts/nextjs/csrf.py +71 -0
  66. openhack/prompts/nextjs/data_exposure.py +88 -0
  67. openhack/prompts/nextjs/idor.py +64 -0
  68. openhack/prompts/nextjs/injection.py +65 -0
  69. openhack/prompts/nextjs/middleware_bypass.py +75 -0
  70. openhack/prompts/nextjs/misconfiguration.py +92 -0
  71. openhack/prompts/nextjs/server_actions.py +97 -0
  72. openhack/prompts/nextjs/ssrf.py +66 -0
  73. openhack/prompts/nextjs/xss.py +69 -0
  74. openhack/prompts/pr_analysis_system.py +80 -0
  75. openhack/prompts/pr_analysis_user.py +11 -0
  76. openhack/prompts/project_context.py +89 -0
  77. openhack/prompts/recon.py +199 -0
  78. openhack/prompts/reporter.py +88 -0
  79. openhack/prompts/researchers.py +434 -0
  80. openhack/prompts/sandbox_verifier.py +128 -0
  81. openhack/prompts/supabase/__init__.py +39 -0
  82. openhack/prompts/supabase/auth_tokens.py +131 -0
  83. openhack/prompts/supabase/edge_functions.py +150 -0
  84. openhack/prompts/supabase/graphql.py +102 -0
  85. openhack/prompts/supabase/postgrest.py +99 -0
  86. openhack/prompts/supabase/realtime.py +93 -0
  87. openhack/prompts/supabase/rls.py +110 -0
  88. openhack/prompts/supabase/rpc_functions.py +127 -0
  89. openhack/prompts/supabase/storage.py +110 -0
  90. openhack/prompts/supabase/tenant_isolation.py +118 -0
  91. openhack/prompts/validator.py +319 -0
  92. openhack/prompts/validator_continuation_incomplete.py +12 -0
  93. openhack/prompts/validator_tool_instructions.py +29 -0
  94. openhack/quality.py +231 -0
  95. openhack/sandbox/__init__.py +12 -0
  96. openhack/sandbox/orchestrator.py +517 -0
  97. openhack/sandbox/runner.py +177 -0
  98. openhack/scan_session.py +245 -0
  99. openhack/setup.py +452 -0
  100. openhack/static_validator.py +612 -0
  101. openhack/tools/__init__.py +1 -0
  102. openhack/tools/ast_tools.py +307 -0
  103. openhack/tools/coverage.py +1078 -0
  104. openhack/tools/filesystem.py +404 -0
  105. openhack/tools/nextjs.py +258 -0
  106. openhack/tools/registry.py +52 -0
  107. openhack/tui.py +3450 -0
  108. openhack/updates.py +170 -0
  109. openhack-0.1.0.dist-info/METADATA +189 -0
  110. openhack-0.1.0.dist-info/RECORD +113 -0
  111. openhack-0.1.0.dist-info/WHEEL +4 -0
  112. openhack-0.1.0.dist-info/entry_points.txt +2 -0
  113. openhack-0.1.0.dist-info/licenses/LICENSE +661 -0
@@ -0,0 +1,612 @@
1
+ """
2
+ Deterministic pre-validator for hunter findings.
3
+
4
+ Runs BEFORE the LLM validator to filter out findings that are provably
5
+ wrong without any LLM reasoning. Two layers:
6
+
7
+ Layer A (basic checks):
8
+ - Does the file exist?
9
+ - Does the reported line actually contain the claimed pattern?
10
+ - Is the pattern in a comment or string literal, not actual code?
11
+ - Is it in a test/fixture file?
12
+
13
+ Layer B (tree-sitter sink verification):
14
+ - Parse the function containing the reported sink
15
+ - Check whether any parameter traces back to user input (req, request, params, body, etc.)
16
+ - If the sink only receives constants/config values, reject the finding
17
+
18
+ Returns: filtered list of findings with rejection reasons attached to dropped ones.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import logging
24
+ import os
25
+ import re
26
+ from typing import Optional
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+ # --- User input source identifiers ---
31
# Spellings treated as user-controlled input sources. Only set membership
# matters; the grouping below is purely for readers.
_USER_INPUT_NAMES = {
    # Bare names commonly given to handler parameters
    "req",
    "request",
    "params",
    "query",
    "body",
    "searchParams",
    "ctx",
    "input",
    "args",
    "data",
    "payload",
    "formData",
    # Express-style request accessors
    "req.body",
    "req.query",
    "req.params",
    # Framework request accessors (lower-case attributes)
    "request.body",
    "request.form",
    "request.args",
    "request.json",
    "request.data",
    # Framework request accessors (upper-case attributes)
    "request.GET",
    "request.POST",
    "request.FILES",
    # Request reached through a bound instance
    "self.request",
}
41
+
42
+ # --- Sink patterns per category ---
43
# Maps a normalized vulnerability category to the regexes that identify a
# plausible sink for it. All patterns are case-insensitive and are searched
# (not anchored) against source text.
_CATEGORY_SINK_PATTERNS: dict[str, list[re.Pattern]] = {
    "sql_injection": [
        re.compile(r"(?:query|execute|raw|text|cursor\.execute|db\.execute|\.raw\(|\.extra\(|RawSQL)", re.I),
        re.compile(r"(?:SELECT|INSERT|UPDATE|DELETE|FROM|WHERE)", re.I),
    ],
    "xss": [
        re.compile(r"(?:dangerouslySetInnerHTML|innerHTML|document\.write|v-html|mark_safe|Markup\(|\|safe)", re.I),
    ],
    "rce": [
        re.compile(r"(?:eval|exec|Function\(|child_process|subprocess|os\.system|os\.popen|spawn|execFile)", re.I),
    ],
    "command_injection": [
        re.compile(r"(?:exec|spawn|subprocess|os\.system|os\.popen|child_process|shell=True)", re.I),
    ],
    "ssrf": [
        re.compile(r"(?:fetch|axios|http\.request|urllib|requests\.get|requests\.post|httpx)", re.I),
    ],
    "ssti": [
        re.compile(r"(?:render_template_string|Template\(|from_string|ejs\.render|nunjucks\.renderString|pug\.render)", re.I),
    ],
    "path_traversal": [
        re.compile(r"(?:sendFile|send_file|FileResponse|open\(|readFile|createReadStream|res\.download)", re.I),
    ],
    "open_redirect": [
        re.compile(r"(?:redirect|302|Location|NextResponse\.redirect|res\.redirect)", re.I),
    ],
    "prototype_pollution": [
        # The dot in "_.merge" is escaped: unescaped it matched any character,
        # so unrelated names such as "run_xmerge" counted as a sink.
        re.compile(r"(?:__proto__|Object\.assign|lodash\.merge|_\.merge|defaultsDeep|deepmerge)", re.I),
    ],
}
73
+
74
+ # --- Comment patterns ---
75
# Python: a line whose first non-blank character starts a "#" comment, or
# whose first non-blank token is a triple quote.
_PY_LINE_COMMENT = re.compile(r"^\s*#")
_PY_DOCSTRING = re.compile(r'^\s*(?:"""|\'\'\')')

# JS/TS: a "//" line comment, plus the opening and closing delimiters of a
# block comment (these two are unanchored and match anywhere in the text).
_JS_LINE_COMMENT = re.compile(r"^\s*//")
_JS_BLOCK_COMMENT_START = re.compile(r"/\*")
_JS_BLOCK_COMMENT_END = re.compile(r"\*/")
80
+
81
# Directory names that mark a test/fixture tree. Both patterns below are
# built from this single alternation so the non-production pattern cannot
# drift out of sync with the test pattern (it previously lacked "mocks").
_TEST_DIR_NAMES = (
    r"test|tests|__tests__|spec|__mocks__|fixtures|mocks|__fixtures__|"
    r"e2e|cypress|playwright"
)

# Matches a path containing a test/fixture directory as a whole path
# component ("/"-separated), case-insensitively.
_TEST_DIR_PATTERNS = re.compile(rf"(?:^|/)(?:{_TEST_DIR_NAMES})(?:/|$)", re.I)

# Superset of the test directories plus CLI, docs, examples, tooling,
# benchmark and storybook directories. "CLI" is no longer listed separately:
# re.I already makes "cli" match it.
_NON_PRODUCTION_DIR_PATTERNS = re.compile(
    r"(?:^|/)(?:"
    + _TEST_DIR_NAMES
    + r"|cli|docs|documentation|examples?|samples?|scripts|tools|devtools"
    r"|benchmarks?|integration-tests|\.storybook|stories"
    r")(?:/|$)",
    re.I,
)
94
+
95
+ # Signals that code is intentional design, not a vulnerability
96
# Comment/code phrases taken as evidence that the flagged behaviour is a
# deliberate design choice. All are case-insensitive and unanchored.
_INTENT_COMMENT_PATTERNS = [
    re.compile(r"@since\s+\d+\.\d+", re.I),  # Versioned API — deliberate
    # "intentional" or "intentionally" as a whole word. The original
    # "intentionally?" only made the final "y" optional and, lacking a word
    # boundary, also fired on "unintentionally exposed".
    re.compile(r"\bintentional(?:ly)?\s+(?:public|open|exposed|disabled|skipped)", re.I),
    re.compile(r"by\s+design", re.I),
    re.compile(r"public\s+(?:endpoint|api|route)", re.I),
    re.compile(r"no\s+auth(?:entication)?\s+(?:required|needed)", re.I),
    re.compile(r"allow\s+(?:unauthenticated|anonymous|public)", re.I),
    re.compile(r"fallback\s+(?:for|in)\s+(?:dev|development|test)", re.I),
    re.compile(r"default\s+(?:for|in)\s+(?:dev|development|test)", re.I),
    re.compile(r"only\s+(?:in|for|when)\s+(?:dev|development|test|non-prod)", re.I),
]

# Code patterns that indicate dev-only fallbacks, not production secrets
_DEV_FALLBACK_PATTERNS = [
    re.compile(r"(?:NODE_ENV|RAILS_ENV|FLASK_ENV|APP_ENV)\s*[!=]=\s*['\"](?:production|prod)['\"]", re.I),
    re.compile(r"process\.env\.\w+\s*\|\|\s*['\"]", re.I),  # env || "default"
    re.compile(r"process\.env\.\w+\s*\|\|\s*\w+", re.I),  # env || CONSTANT_NAME
    re.compile(r"os\.environ\.get\(\s*['\"][^'\"]+['\"]\s*,\s*['\"]", re.I),  # os.environ.get("X", "default")
    re.compile(r"ENV\.fetch\(\s*['\"][^'\"]+['\"]\s*,\s*['\"]", re.I),  # Ruby ENV.fetch("X", "default")
    re.compile(r"(?:const|let|var)\s+DEFAULT_\w*\s*=\s*['\"]", re.I),  # const DEFAULT_SECRET = "value"
    re.compile(r"\?\?\s*DEFAULT_", re.I),  # ?? DEFAULT_CONSTANT (nullish coalescing)
]
118
+
119
+
120
def _is_test_file(file_path: str) -> bool:
    """Check if a file is a test/fixture file.

    A file qualifies when its lower-cased basename carries a test marker or
    when any directory component of the path matches ``_TEST_DIR_PATTERNS``.

    Args:
        file_path: Path of the file as reported in the finding.

    Returns:
        True if the path looks like test or fixture code.
    """
    basename = os.path.basename(file_path).lower()

    # Original behaviour: any marker at the very start or very end of the name.
    edge_markers = ("test_", "_test.", ".test.", ".spec.", "conftest.", "fixture")
    if basename.startswith(edge_markers) or basename.endswith(edge_markers):
        return True

    # These markers sit in front of the extension ("foo.test.ts",
    # "foo_test.py", "foo.spec.js"), so a prefix/suffix test can never see
    # them; they need a containment check.
    if any(marker in basename for marker in ("_test.", ".test.", ".spec.")):
        return True

    return bool(_TEST_DIR_PATTERNS.search(file_path))
128
+
129
+
130
def _is_non_production_path(file_path: str) -> bool:
    """Check if a file is in a non-production location (tests, CLI, docs, examples, etc.).

    A file qualifies when its lower-cased basename carries a test marker or
    when any directory component of the path matches
    ``_NON_PRODUCTION_DIR_PATTERNS``.

    Args:
        file_path: Path of the file as reported in the finding.

    Returns:
        True if the path looks like non-production code.
    """
    basename = os.path.basename(file_path).lower()

    # Original behaviour: any marker at the very start or very end of the name.
    edge_markers = ("test_", "_test.", ".test.", ".spec.", "conftest.", "fixture")
    if basename.startswith(edge_markers) or basename.endswith(edge_markers):
        return True

    # These markers sit in front of the extension ("foo.test.ts",
    # "foo_test.py", "foo.spec.js"), so a prefix/suffix test can never see
    # them; they need a containment check.
    if any(marker in basename for marker in ("_test.", ".test.", ".spec.")):
        return True

    return bool(_NON_PRODUCTION_DIR_PATTERNS.search(file_path))
138
+
139
+
140
def _check_developer_intent(content: str, line_number: Optional[int], category: str) -> Optional[str]:
    """Return a reason string if nearby code shows the flagged pattern is deliberate.

    Two signals are checked, in order:

    1. Intent phrases (``_INTENT_COMMENT_PATTERNS``) inside a window of text
       around the reported line.
    2. For hardcoded-secret / misconfiguration categories only, dev-fallback
       code patterns (``_DEV_FALLBACK_PATTERNS``) anywhere in the file.

    Args:
        content: Full text of the file.
        line_number: 1-based line of the finding, or a falsy value if unknown.
        category: Finding category; normalized here before comparison.

    Returns:
        A human-readable reason when intent is detected, otherwise None.
    """
    lines = content.split("\n")

    if line_number and 0 < line_number <= len(lines):
        # Slice bounds are 0-based: this keeps the 9 lines before the
        # reported line, the line itself, and the 5 lines after it.
        window = lines[max(0, line_number - 10):min(len(lines), line_number + 5)]
    else:
        # Unknown or out-of-range line: fall back to the first 50 lines only.
        window = lines[:50]

    window_text = "\n".join(window)

    for pattern in _INTENT_COMMENT_PATTERNS:
        match = pattern.search(window_text)
        if match:
            return f"Developer intent detected: '{match.group(0).strip()}' near line {line_number or '?'}"

    # Use the shared normalizer so a category with stray whitespace is
    # treated the same way here as everywhere else in this module.
    secret_like_categories = (
        "hardcoded_secret",
        "hardcoded_secrets",
        "security_misconfiguration",
        "misconfiguration",
    )
    if _normalize_finding_category(category) in secret_like_categories:
        # Deliberately scans the whole file rather than the window: the env
        # lookup may be far from the constant that was flagged.
        for pattern in _DEV_FALLBACK_PATTERNS:
            match = pattern.search(content)
            if match:
                return f"Dev-only fallback pattern: '{match.group(0).strip()}' — not a production secret"

    return None
177
+
178
+
179
def _is_comment_line(line: str, file_path: str) -> bool:
    """Check if a line consists solely of a comment.

    For ``.py`` files a line counts as a comment when it starts with ``#`` or
    with a triple quote. For every other file the JS/TS rules apply: a ``//``
    line comment, or a line that opens a ``/* ... */`` block comment and has
    no code after the closing delimiter.

    Args:
        line: The source line (leading/trailing whitespace is ignored).
        file_path: Path of the file; only its ``.py`` suffix is inspected.

    Returns:
        True if the whole line is comment text.
    """
    stripped = line.strip()

    if file_path.endswith(".py"):
        return stripped.startswith(("#", '"""', "'''"))

    if stripped.startswith("//"):
        return True

    if stripped.startswith("/*"):
        # Search after the opener so "/*/" is not mistaken for a closed block.
        close = stripped.find("*/", 2)
        # Still open at end of line, or closed with nothing after it.
        # "/* note */ eval(x)" has live code after the comment, so it is
        # not treated as a comment line.
        return close == -1 or close + 2 == len(stripped)

    return False
185
+
186
+
187
def _line_has_sink_pattern(line: str, category: str) -> bool:
    """Check if a line contains a sink pattern for the given vuln category.

    Args:
        line: Source text to search.
        category: Finding category in any spelling accepted by
            ``_normalize_finding_category``.

    Returns:
        True if any regex registered for the category in
        ``_CATEGORY_SINK_PATTERNS`` matches; False when nothing matches or
        the category has no registered patterns.
    """
    # Use the shared normalizer rather than a private copy of its logic, so
    # a category with surrounding whitespace resolves to the same key here
    # as in the validators.
    patterns = _CATEGORY_SINK_PATTERNS.get(_normalize_finding_category(category), [])
    return any(pattern.search(line) for pattern in patterns)
192
+
193
+
194
def _normalize_finding_category(cat: Optional[str]) -> str:
    """Normalize a category label to the canonical lookup form.

    Lower-cases, trims surrounding whitespace, then turns spaces and hyphens
    into underscores (``" SQL-Injection "`` -> ``"sql_injection"``).

    Args:
        cat: Raw category label. ``None`` or an empty value yields ``""``
            instead of raising, since callers pass ``finding.get("category")``
            straight through; other non-string values are stringified.

    Returns:
        The normalized category key.
    """
    if not cat:
        return ""
    return str(cat).lower().strip().replace(" ", "_").replace("-", "_")
197
+
198
+
199
+ # =========================================================================
200
+ # Layer A: Basic deterministic checks
201
+ # =========================================================================
202
+
203
def validate_finding_basic(finding: dict, fs_tools) -> tuple[bool, str]:
    """Run basic deterministic checks on a finding.

    Checks run in this order and the first failure wins:

    1. The file can be read through ``fs_tools.read_file``.
    2. The reported line is not a comment, and (for categories with sink
       patterns that are not "indirect") a sink pattern appears within
       three lines of it.
    3. The file is not in a non-production path.
    4. The surrounding code does not signal deliberate design.

    Args:
        finding: Finding dict; reads ``file_path``, ``line_number`` and
            ``category``.
        fs_tools: Object exposing ``read_file(path)`` that returns a dict
            with either an ``"error"`` key or a ``"content"`` key.

    Returns:
        ``(is_valid, reason)`` where ``reason`` explains a rejection, or is
        ``"passed"`` on success.
    """
    file_path = finding.get("file_path", "")
    category = _normalize_finding_category(finding.get("category", ""))

    # Findings may carry the line as a string or float; coerce once so the
    # range comparisons and list indexing below cannot raise TypeError.
    try:
        line_number = int(finding.get("line_number"))
    except (TypeError, ValueError, OverflowError):
        line_number = None

    def strip_prefix(text: str) -> str:
        # Drops everything up to the first tab; the original code describes
        # this as a line-number prefix added by read_file.
        return text.split("\t", 1)[1] if "\t" in text else text

    # Kept for Check 4 so the file is read only once.
    content = None

    # Check 1: File must exist
    if file_path:
        result = fs_tools.read_file(file_path)
        if "error" in result:
            return False, f"File does not exist: {file_path}"

        content = result.get("content", "")
        lines = content.split("\n")

        # Check 2: If line number provided, verify it has something relevant
        if line_number and 0 < line_number <= len(lines):
            target_line = strip_prefix(lines[line_number - 1])

            # Check 2a: Is this line a comment?
            if _is_comment_line(target_line, file_path):
                return False, f"Reported line {line_number} is a comment"

            # Check 2b: skipped for categories where the vulnerability is
            # commonly indirect (input stored in one file, sink in another).
            indirect_categories = {
                "ssrf", "auth_bypass", "authentication_bypass", "authorization_bypass",
                "data_exposure", "idor", "business_logic_flaw",
            }
            if category in _CATEGORY_SINK_PATTERNS and category not in indirect_categories:
                # Generous window: the reported line plus three on each side.
                window_lines = lines[max(0, line_number - 4):min(len(lines), line_number + 3)]
                window_text = "\n".join(strip_prefix(text) for text in window_lines)
                if not any(p.search(window_text) for p in _CATEGORY_SINK_PATTERNS[category]):
                    return False, (
                        f"Line {line_number} (±3 lines) has no {category} sink pattern"
                    )

    # Check 3: Non-production path — reject findings in test/CLI/docs/examples dirs
    if _is_non_production_path(file_path):
        return False, f"File is in a non-production path (test/CLI/docs/examples): {file_path}"

    # Check 4: Developer intent — reuses the content read for Check 1
    if content is not None:
        intent_reason = _check_developer_intent(content, line_number, category)
        if intent_reason:
            return False, f"Intentional design: {intent_reason}"

    return True, "passed"
271
+
272
+
273
+ # =========================================================================
274
+ # Layer B: Tree-sitter sink verification
275
+ # =========================================================================
276
+
277
_TS_LANGUAGES: dict = {}  # Lazy-loaded cache: grammar key -> Language, or None if loading failed

# File extension -> grammar key. ".tsx" has its own key because JSX syntax
# needs the TSX grammar; the plain TypeScript grammar does not parse it.
_TS_GRAMMAR_BY_EXTENSION = {
    ".js": "javascript",
    ".jsx": "javascript",
    ".mjs": "javascript",
    ".cjs": "javascript",
    ".ts": "typescript",
    ".mts": "typescript",
    ".cts": "typescript",
    ".tsx": "tsx",
    ".py": "python",
}


def _get_ts_language(file_path: str):
    """Get the appropriate tree-sitter language for a file.

    The grammar is chosen from the file extension and loaded on first use.
    Both successful loads and failures are cached in ``_TS_LANGUAGES``, so a
    missing grammar package is only attempted once per key.

    Args:
        file_path: Path whose extension selects the grammar.

    Returns:
        A ``tree_sitter.Language``, or None when the extension is
        unsupported or the grammar could not be loaded.
    """
    ext = os.path.splitext(file_path)[1].lower()
    lang_key = _TS_GRAMMAR_BY_EXTENSION.get(ext)
    if lang_key is None:
        return None

    if lang_key not in _TS_LANGUAGES:
        try:
            import tree_sitter as ts
            if lang_key == "javascript":
                import tree_sitter_javascript as ts_js
                raw_language = ts_js.language()
            elif lang_key == "python":
                import tree_sitter_python as ts_py
                raw_language = ts_py.language()
            else:
                import tree_sitter_typescript as ts_ts
                if lang_key == "tsx":
                    raw_language = ts_ts.language_tsx()
                else:
                    raw_language = ts_ts.language_typescript()
            _TS_LANGUAGES[lang_key] = ts.Language(raw_language)
        except Exception as e:
            # Best effort: a missing or incompatible grammar disables the
            # tree-sitter layer for this language instead of failing the scan.
            logger.debug(f"tree-sitter language load failed for {lang_key}: {e}")
            _TS_LANGUAGES[lang_key] = None

    return _TS_LANGUAGES.get(lang_key)
314
+
315
+
316
def _find_enclosing_function(tree, line: int, language_key: str):
    """Return the function node that encloses ``line`` (0-indexed), or None.

    Walks the tree depth-first from ``tree.root_node``. Among function nodes
    whose row span contains ``line``, the one with the greatest start row is
    kept; on a tie the node visited later wins.

    Args:
        tree: Parsed tree exposing ``root_node``.
        line: 0-indexed row to locate.
        language_key: ``"python"`` selects Python function nodes; any other
            value selects the JS/TS function node types.
    """
    if language_key == "python":
        wanted_types = {"function_definition"}
    else:
        wanted_types = {
            "function_declaration", "arrow_function", "function",
            "method_definition", "function_expression",
        }

    innermost = None
    pending = [tree.root_node]
    while pending:
        current = pending.pop()

        contains_line = current.start_point.row <= line <= current.end_point.row
        if current.type in wanted_types and contains_line:
            # A later start row means a tighter enclosure.
            if innermost is None or current.start_point.row >= innermost.start_point.row:
                innermost = current

        for kid in current.children:
            if kid.start_point.row > line + 5:
                # This child starts well past the target row, so stop
                # scanning the remaining children.
                break
            # Descend into children that contain the row or end at most
            # five rows before it.
            if kid.start_point.row <= line <= kid.end_point.row + 5:
                pending.append(kid)

    return innermost
342
+
343
+
344
# JS/TS node types that wrap a binding pattern and are therefore descended
# into when collecting parameter names. Node-type names follow the
# tree-sitter JavaScript/TypeScript grammars (assumed; confirm against the
# grammar versions the project pins).
_JS_PATTERN_CONTAINERS = frozenset({
    "required_parameter", "optional_parameter",
    "assignment_pattern", "object_assignment_pattern",
    "rest_pattern", "object_pattern", "array_pattern", "pair_pattern",
})

# Children from these node types onward are a type annotation or a default
# value expression, not part of the binding itself.
_JS_PATTERN_STOP_TYPES = frozenset({"=", "type_annotation"})

_PY_SPLAT_TYPES = ("list_splat_pattern", "dictionary_splat_pattern")


def _collect_js_bindings(node, names: set) -> None:
    """Add every name bound by a JS/TS parameter pattern to ``names``."""
    kind = node.type
    if kind in ("identifier", "shorthand_property_identifier_pattern", "property_identifier"):
        # property_identifier is the key of a pair pattern ("{ body: b }").
        # It is kept alongside the bound name because the original
        # implementation reported the key.
        names.add(node.text.decode())
        return
    if kind not in _JS_PATTERN_CONTAINERS:
        return
    for sub in node.children:
        if sub.type in _JS_PATTERN_STOP_TYPES:
            break
        _collect_js_bindings(sub, names)


def _collect_python_param(param, names: set) -> None:
    """Add the name declared by one Python parameter node to ``names``."""
    if param.type == "identifier":
        names.add(param.text.decode())
    elif param.type in ("default_parameter", "typed_parameter", "typed_default_parameter"):
        for sub in param.children:
            if sub.type == "identifier":
                names.add(sub.text.decode())
                break
            if sub.type in _PY_SPLAT_TYPES:
                # Typed splat such as "*args: int" nests the splat pattern.
                _collect_python_param(sub, names)
                break
    elif param.type in _PY_SPLAT_TYPES:
        for sub in param.children:
            if sub.type == "identifier":
                names.add(sub.text.decode())


def _extract_parameter_names(func_node, language_key: str) -> set[str]:
    """Extract parameter names from a function node.

    Python: plain, defaulted, typed and splat parameters.
    JS/TS: plain identifiers, TypeScript required/optional parameters,
    default values, rest parameters, object/array destructuring (including
    renamed bindings), and the parenthesis-free single parameter of an
    arrow function (``req => ...``).

    Args:
        func_node: Function node exposing ``type``, ``children`` and, on
            leaf nodes, ``text`` as bytes.
        language_key: ``"python"`` for Python; any other value is treated
            as JS/TS.

    Returns:
        The set of parameter names found (possibly empty).
    """
    params: set[str] = set()

    if language_key == "python":
        # Python: def foo(request, pk, **kwargs)
        for child in func_node.children:
            if child.type == "parameters":
                for param in child.children:
                    _collect_python_param(param, params)
        return params

    # JS/TS: function foo(req, res) or (req, res) => or req =>
    for child in func_node.children:
        if child.type == "=>":
            # Everything after the arrow is the body; an identifier there is
            # an expression, not a parameter.
            break
        if child.type == "formal_parameters":
            for param in child.children:
                _collect_js_bindings(param, params)
        elif child.type == "identifier" and func_node.type == "arrow_function":
            # Single arrow parameter written without parentheses.
            params.add(child.text.decode())

    return params
387
+
388
+
389
# Alternatives of the user-input accessor regex, joined with "|" below.
# NOTE(review): the first alternative matches bare words such as "query" and
# "body" after any word boundary, so method calls like "db.query(" also
# match — confirm that breadth is intended.
_INPUT_ACCESSOR_ALTERNATIVES = (
    # A known request-like name, optionally followed by attribute access
    r"\b(?:req|request|params|query|body|searchParams|self\.request)(?:\.\w+)*",
    # request.<attr> accessors
    r"(?:request\.(?:GET|POST|FILES|body|json|form|args|data|query_params))",
    # req.<attr> accessors
    r"(?:req\.(?:body|query|params|cookies|headers))",
    # "await <name>.json()"
    r"await\s+\w+\.json\(\)",
    # "getattr(request" — dynamic attribute access on the request
    r"getattr\s*\(\s*request",
)

_INPUT_ACCESSOR_RE = re.compile("|".join(_INPUT_ACCESSOR_ALTERNATIVES), re.I)

# Language keywords and literals subtracted from the identifiers found near
# a sink before those identifiers are traced.
_KEYWORDS = {
    # JS/TS declarations and control flow
    "const", "let", "var", "return", "await", "async", "function",
    "if", "else", "for", "while",
    "new", "typeof", "instanceof", "try", "catch", "throw", "export",
    # JS/TS literals
    "true", "false", "null", "undefined",
    # Python keywords, literals and the conventional instance name
    "def", "class", "import", "from", "None", "True", "False", "self",
}
403
+
404
+
405
# A bare "=" used for assignment, including compound forms such as "+=",
# ":=" and "??=", but not the comparisons "==", "!=", "<=", ">=" (nor
# "===" / "!==") and not the arrow "=>".
_ASSIGNMENT_OP_RE = re.compile(r"(?<![=!<>])=(?![=>])")

_IDENTIFIER_RE = re.compile(r"\b[a-zA-Z_]\w*\b")


def _function_has_user_input_at_sink(func_node, sink_line: int, language_key: str) -> bool:
    """Check if user input can plausibly reach the sink line.

    Strategy:
      1. Extract function parameters and keep those with known user-input
         names (req, request, body, etc.).
      2. Accept if such a parameter, or a user-input accessor pattern,
         appears within two lines of the sink.
      3. Otherwise walk the lines before the sink in order, marking every
         name assigned from user input or from an already marked name
         (plain, typed, compound and destructuring assignments), and accept
         if a marked name appears near the sink.

    This is a textual heuristic, not real data-flow analysis.

    Args:
        func_node: Enclosing function node exposing ``text`` (bytes),
            ``start_point.row`` and ``children``.
        sink_line: 0-indexed row of the sink within the file.
        language_key: Forwarded to ``_extract_parameter_names``.

    Returns:
        True if user input may reach the sink, False if only non-input
        values appear to.
    """
    func_lines = func_node.text.decode().split("\n")
    relative_sink = sink_line - func_node.start_point.row

    # Only consider params that look like user input sources
    input_params = _extract_parameter_names(func_node, language_key) & _USER_INPUT_NAMES

    # Sink window: ±2 lines around the reported line
    check_start = max(0, relative_sink - 2)
    check_end = min(len(func_lines), relative_sink + 3)
    sink_window = " ".join(func_lines[check_start:check_end])
    window_words = set(_IDENTIFIER_RE.findall(sink_window))

    # Check 1: Does the sink window directly reference a known input parameter?
    if input_params & window_words:
        return True

    # Check 2: Does the sink window contain a user-input accessor pattern?
    if _INPUT_ACCESSOR_RE.search(sink_window):
        return True

    # Check 3: Trace backwards — is any name in the sink window derived from
    # user input earlier in the function?
    sink_identifiers = window_words - _KEYWORDS
    if not sink_identifiers:
        return False

    tainted = set(input_params)
    for line in func_lines[:max(0, relative_sink)]:
        for op in _ASSIGNMENT_OP_RE.finditer(line):
            rhs = line[op.end():]
            rhs_is_input = bool(_INPUT_ACCESSOR_RE.search(rhs)) or not tainted.isdisjoint(
                _IDENTIFIER_RE.findall(rhs)
            )
            if not rhs_is_input:
                continue
            # Assignment targets are the identifiers after the last "(" or
            # ";" on the left-hand side. This covers "const { id } = ...",
            # "a, b = ..." and "const x: T = ..." while leaving out the
            # callee in "foo(bar=...)".
            target = re.split(r"[(;]", line[:op.start()])[-1]
            tainted.update(set(_IDENTIFIER_RE.findall(target)) - _KEYWORDS)

    return not sink_identifiers.isdisjoint(tainted)
462
+
463
+
464
def validate_finding_treesitter(finding: dict, fs_tools) -> tuple[bool, str]:
    """Tree-sitter based sink verification.

    Parses the file, finds the function enclosing the reported line, and
    checks whether user input can plausibly reach the sink. Every situation
    in which the check cannot be performed (unknown category, missing
    file/line, unsupported language, read or parse failure, no enclosing
    function) is treated as a pass, so this layer only rejects on positive
    evidence.

    Args:
        finding: Finding dict; reads ``file_path``, ``line_number`` and
            ``category``.
        fs_tools: Object exposing ``read_file(path)`` that returns a dict
            with either an ``"error"`` key or a ``"content"`` key.

    Returns:
        ``(is_valid, reason)``.
    """
    file_path = finding.get("file_path", "")
    category = _normalize_finding_category(finding.get("category", ""))

    # Findings may carry the line as a string or float; coerce once so the
    # arithmetic below cannot raise TypeError.
    try:
        line_number = int(finding.get("line_number"))
    except (TypeError, ValueError, OverflowError):
        line_number = None

    # Only run for categories where we have sink patterns
    if category not in _CATEGORY_SINK_PATTERNS:
        return True, "no sink patterns for category"

    if not file_path or not line_number or line_number < 1:
        return True, "no file/line to verify"

    ts_lang = _get_ts_language(file_path)
    if ts_lang is None:
        return True, "unsupported language for tree-sitter"

    # Read the file
    result = fs_tools.read_file(file_path)
    if "error" in result:
        return True, "could not read file"

    # Drop everything up to the first tab on each line; the original code
    # describes this as stripping line-number prefixes.
    raw_content = result.get("content", "")
    clean_lines = [
        text.split("\t", 1)[1] if "\t" in text else text
        for text in raw_content.split("\n")
    ]
    source_bytes = "\n".join(clean_lines).encode("utf-8")

    try:
        import tree_sitter as ts
        parser = ts.Parser(ts_lang)
        tree = parser.parse(source_bytes)
    except Exception as e:
        # Best effort: a parser failure must not drop the finding.
        logger.debug(f"tree-sitter parse failed for {file_path}: {e}")
        return True, "parse failed"

    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".py":
        lang_key = "python"
    elif ext in (".ts", ".tsx", ".mts"):
        lang_key = "typescript"
    else:
        lang_key = "javascript"

    # Find enclosing function (0-indexed line)
    func_node = _find_enclosing_function(tree, line_number - 1, lang_key)
    if func_node is None:
        # No enclosing function — might be module-level code, allow it
        return True, "no enclosing function found"

    # Check if user input identifiers appear near the sink
    has_input = _function_has_user_input_at_sink(func_node, line_number - 1, lang_key)
    if not has_input:
        # Read the function's "name" field instead of guessing its position
        # among the children: children[1] is a keyword for "async function"
        # and the parameter list for methods, and arrow functions have no
        # name at all.
        name_node = func_node.child_by_field_name("name")
        func_name = name_node.text.decode() if name_node is not None else "?"
        return False, (
            f"tree-sitter: no user input identifiers found near sink at line {line_number} "
            f"in function {func_name}"
        )

    return True, "user input may reach sink"
534
+
535
+
536
+ # =========================================================================
537
+ # Main entry point
538
+ # =========================================================================
539
+
540
def run_static_validation(
    potential_findings: list[dict],
    fs_tools,
    enable_treesitter: bool = True,
) -> tuple[list[dict], list[dict], dict]:
    """Run deterministic pre-validation on all hunter findings.

    Each finding is tagged in place with ``_original_index``; rejected
    findings additionally receive ``_rejection_reason``.

    Args:
        potential_findings: Raw findings from hunter swarm
        fs_tools: FileSystemTools instance for file access
        enable_treesitter: Whether to run tree-sitter checks (Layer B)

    Returns:
        (passed, rejected, stats) where:
        - passed: findings that survived validation (indices preserved)
        - rejected: findings that were filtered out
        - stats: validation statistics
    """
    stats = {
        "input_count": len(potential_findings),
        "basic_rejected": 0,
        "treesitter_rejected": 0,
        "passed": 0,
        "basic_rejections": [],
        "treesitter_rejections": [],
    }
    passed: list[dict] = []
    rejected: list[dict] = []

    def record_rejection(index, item, tagged_reason, reason, counter_key, details_key):
        # Shared bookkeeping for both layers: tag the finding, file it under
        # "rejected", and update the per-layer counter and detail list.
        item["_rejection_reason"] = tagged_reason
        rejected.append(item)
        stats[counter_key] += 1
        stats[details_key].append({
            "index": index,
            "file": item.get("file_path", ""),
            "category": item.get("category", ""),
            "reason": reason,
        })

    for position, finding in enumerate(potential_findings):
        finding["_original_index"] = position

        # Layer A: Basic checks
        ok, why = validate_finding_basic(finding, fs_tools)
        if not ok:
            record_rejection(
                position, finding, f"basic: {why}", why,
                "basic_rejected", "basic_rejections",
            )
            continue

        # Layer B: Tree-sitter verification
        if enable_treesitter:
            ok, why = validate_finding_treesitter(finding, fs_tools)
            if not ok:
                record_rejection(
                    position, finding, f"treesitter: {why}", why,
                    "treesitter_rejected", "treesitter_rejections",
                )
                continue

        passed.append(finding)
        stats["passed"] += 1

    total_rejected = stats["basic_rejected"] + stats["treesitter_rejected"]
    logger.info(
        f"Static validation: {stats['input_count']} findings → "
        f"{stats['passed']} passed, {total_rejected} rejected "
        f"(basic: {stats['basic_rejected']}, tree-sitter: {stats['treesitter_rejected']})"
    )

    return passed, rejected, stats
@@ -0,0 +1 @@
1
+ """OpenHack tools."""