openhack 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. openhack/__init__.py +2 -0
  2. openhack/__main__.py +225 -0
  3. openhack/agents/__init__.py +30 -0
  4. openhack/agents/base.py +230 -0
  5. openhack/agents/browser_verifier.py +679 -0
  6. openhack/agents/browser_verifier_swarm.py +256 -0
  7. openhack/agents/checkpoint.py +89 -0
  8. openhack/agents/context_manager.py +356 -0
  9. openhack/agents/coordinator.py +1105 -0
  10. openhack/agents/endpoint_analyst.py +307 -0
  11. openhack/agents/feature_hunter.py +93 -0
  12. openhack/agents/hunter.py +481 -0
  13. openhack/agents/hunter_swarm.py +385 -0
  14. openhack/agents/llm.py +334 -0
  15. openhack/agents/recon.py +19 -0
  16. openhack/agents/sandbox_verifier.py +396 -0
  17. openhack/agents/sandbox_verifier_swarm.py +250 -0
  18. openhack/agents/session.py +286 -0
  19. openhack/agents/validator.py +217 -0
  20. openhack/agents/validator_swarm.py +106 -0
  21. openhack/auth.py +175 -0
  22. openhack/browser/__init__.py +12 -0
  23. openhack/browser/runner.py +385 -0
  24. openhack/categories.py +130 -0
  25. openhack/config.py +201 -0
  26. openhack/deterministic_recon.py +464 -0
  27. openhack/entry_points.py +745 -0
  28. openhack/framework_classifier.py +515 -0
  29. openhack/framework_detection.py +269 -0
  30. openhack/headless_scan.py +179 -0
  31. openhack/prompts/__init__.py +108 -0
  32. openhack/prompts/browser_verifier.py +171 -0
  33. openhack/prompts/coordinator.py +31 -0
  34. openhack/prompts/django/__init__.py +32 -0
  35. openhack/prompts/django/auth_bypass.py +76 -0
  36. openhack/prompts/django/csrf.py +62 -0
  37. openhack/prompts/django/data_exposure.py +67 -0
  38. openhack/prompts/django/idor.py +74 -0
  39. openhack/prompts/django/injection.py +67 -0
  40. openhack/prompts/django/misconfiguration.py +70 -0
  41. openhack/prompts/django/ssrf.py +64 -0
  42. openhack/prompts/endpoint_analyst.py +122 -0
  43. openhack/prompts/express/__init__.py +29 -0
  44. openhack/prompts/express/auth_bypass.py +71 -0
  45. openhack/prompts/express/data_exposure.py +77 -0
  46. openhack/prompts/express/idor.py +69 -0
  47. openhack/prompts/express/injection.py +75 -0
  48. openhack/prompts/express/misconfiguration.py +72 -0
  49. openhack/prompts/express/ssrf.py +63 -0
  50. openhack/prompts/feature_hunter.py +140 -0
  51. openhack/prompts/flask/__init__.py +29 -0
  52. openhack/prompts/flask/auth_bypass.py +86 -0
  53. openhack/prompts/flask/data_exposure.py +78 -0
  54. openhack/prompts/flask/idor.py +83 -0
  55. openhack/prompts/flask/injection.py +77 -0
  56. openhack/prompts/flask/misconfiguration.py +73 -0
  57. openhack/prompts/flask/ssrf.py +65 -0
  58. openhack/prompts/hunter.py +362 -0
  59. openhack/prompts/hunter_continuation_loop.py +12 -0
  60. openhack/prompts/hunter_continuation_no_findings.py +19 -0
  61. openhack/prompts/hunter_continuation_no_progress.py +22 -0
  62. openhack/prompts/hunter_tool_instructions.py +55 -0
  63. openhack/prompts/nextjs/__init__.py +42 -0
  64. openhack/prompts/nextjs/auth_bypass.py +80 -0
  65. openhack/prompts/nextjs/csrf.py +71 -0
  66. openhack/prompts/nextjs/data_exposure.py +88 -0
  67. openhack/prompts/nextjs/idor.py +64 -0
  68. openhack/prompts/nextjs/injection.py +65 -0
  69. openhack/prompts/nextjs/middleware_bypass.py +75 -0
  70. openhack/prompts/nextjs/misconfiguration.py +92 -0
  71. openhack/prompts/nextjs/server_actions.py +97 -0
  72. openhack/prompts/nextjs/ssrf.py +66 -0
  73. openhack/prompts/nextjs/xss.py +69 -0
  74. openhack/prompts/pr_analysis_system.py +80 -0
  75. openhack/prompts/pr_analysis_user.py +11 -0
  76. openhack/prompts/project_context.py +89 -0
  77. openhack/prompts/recon.py +199 -0
  78. openhack/prompts/reporter.py +88 -0
  79. openhack/prompts/researchers.py +434 -0
  80. openhack/prompts/sandbox_verifier.py +128 -0
  81. openhack/prompts/supabase/__init__.py +39 -0
  82. openhack/prompts/supabase/auth_tokens.py +131 -0
  83. openhack/prompts/supabase/edge_functions.py +150 -0
  84. openhack/prompts/supabase/graphql.py +102 -0
  85. openhack/prompts/supabase/postgrest.py +99 -0
  86. openhack/prompts/supabase/realtime.py +93 -0
  87. openhack/prompts/supabase/rls.py +110 -0
  88. openhack/prompts/supabase/rpc_functions.py +127 -0
  89. openhack/prompts/supabase/storage.py +110 -0
  90. openhack/prompts/supabase/tenant_isolation.py +118 -0
  91. openhack/prompts/validator.py +319 -0
  92. openhack/prompts/validator_continuation_incomplete.py +12 -0
  93. openhack/prompts/validator_tool_instructions.py +29 -0
  94. openhack/quality.py +231 -0
  95. openhack/sandbox/__init__.py +12 -0
  96. openhack/sandbox/orchestrator.py +517 -0
  97. openhack/sandbox/runner.py +177 -0
  98. openhack/scan_session.py +245 -0
  99. openhack/setup.py +452 -0
  100. openhack/static_validator.py +612 -0
  101. openhack/tools/__init__.py +1 -0
  102. openhack/tools/ast_tools.py +307 -0
  103. openhack/tools/coverage.py +1078 -0
  104. openhack/tools/filesystem.py +404 -0
  105. openhack/tools/nextjs.py +258 -0
  106. openhack/tools/registry.py +52 -0
  107. openhack/tui.py +3450 -0
  108. openhack/updates.py +170 -0
  109. openhack-0.1.0.dist-info/METADATA +189 -0
  110. openhack-0.1.0.dist-info/RECORD +113 -0
  111. openhack-0.1.0.dist-info/WHEEL +4 -0
  112. openhack-0.1.0.dist-info/entry_points.txt +2 -0
  113. openhack-0.1.0.dist-info/licenses/LICENSE +661 -0
@@ -0,0 +1,1078 @@
1
+ """
2
+ Static attack surface discovery and coverage computation.
3
+
4
+ All discovery is deterministic (no LLM) -- pure glob + regex.
5
+ Used to measure what percentage of the application the scanner actually analyzed.
6
+ Also provides enrichment (import resolution, danger pattern pre-scan) for
7
+ coverage-guided second-pass hunting.
8
+
9
+ Supports multiple project types:
10
+ - Next.js (API routes, server actions, tRPC, middleware)
11
+ - Supabase (migration SQL: RLS policies, RPC functions, storage policies)
12
+ - Express / Fastify / Hono (route handlers)
13
+ - Small codebases (fewer than 5 files found by structured discovery -- every source file is enumerated as attack surface)
14
+ """
15
+
16
+ import os
17
+ import re
18
+ from typing import Optional
19
+
20
+ from .filesystem import FileSystemTools
21
+
22
+
23
def discover_attack_surface(
    fs_tools: FileSystemTools,
    nextjs_tools=None,
) -> dict:
    """Build the static attack-surface map for a project.

    Each discovery strategy is run in a fixed order and its entries are
    stored under a dedicated key of the returned dict. The set of distinct
    files seen across all strategies drives ``total_endpoints``.

    Args:
        fs_tools: Filesystem helper handed to every discovery strategy.
        nextjs_tools: Optional Next.js helper; when ``None`` the Next.js
            specific discovery (API routes, server actions, middleware) is
            skipped.

    Returns:
        The surface dict, one list of entries per category plus
        ``total_endpoints`` (number of distinct discovered files).
    """
    surface: dict = {
        "api_routes": [],
        "server_actions": [],
        "trpc_procedures": [],
        "callback_handlers": [],
        "middleware": [],
        "supabase_migrations": [],
        "route_handlers": [],
        "source_files": [],
        "total_endpoints": 0,
    }
    discovered_files: set[str] = set()

    def _record(key: str, entries: list) -> None:
        # Store one category and remember every file it contributed.
        surface[key] = entries
        for entry in entries:
            discovered_files.add(entry["file"])

    # Next.js: only when the caller supplied the dedicated tooling.
    if nextjs_tools is not None:
        _record("api_routes", nextjs_tools.get_route_map().get("api_routes", []))
        _record(
            "server_actions",
            nextjs_tools.get_server_actions().get("server_actions", []),
        )
        mw = nextjs_tools.get_middleware_config()
        if "error" not in mw:
            _record("middleware", [{"file": mw["file"]}])

    # Framework-specific strategies that only need the filesystem helper.
    _record("trpc_procedures", _discover_trpc_procedures(fs_tools))
    _record("callback_handlers", _discover_callback_handlers(fs_tools))
    _record("supabase_migrations", _discover_supabase_surface(fs_tools))
    _record("route_handlers", _discover_route_handlers(fs_tools))
    _record("django_views", _discover_django_surface(fs_tools))
    _record("flask_routes", _discover_flask_surface(fs_tools))
    _record("rails_controllers", _discover_rails_surface(fs_tools))

    # Framework-agnostic pass: files containing high-signal sinks that the
    # route-oriented strategies above did not already pick up.
    _record(
        "danger_files",
        _discover_danger_pattern_files(fs_tools, discovered_files),
    )

    # Follow relative imports (up to two levels) from everything found so far
    # to reach helpers and clients that carry no route markers themselves.
    _record(
        "imported_dependencies",
        _follow_import_chains(fs_tools, discovered_files, max_depth=2),
    )

    # Fallback: structured discovery found very little, so list source files.
    if len(discovered_files) < 5:
        _record(
            "source_files",
            _discover_all_source_files(fs_tools, discovered_files),
        )

    surface["total_endpoints"] = len(discovered_files)
    return surface
133
+
134
+
135
def _discover_trpc_procedures(fs_tools: FileSystemTools) -> list[dict]:
    """Locate tRPC procedure definitions.

    TypeScript files are gathered from a handful of conventional tRPC
    directories plus any ``routers`` directory, then scanned line by line
    for ``name: <builder>`` / ``name = <builder>`` where the builder is one
    of the known procedure builders.

    Returns:
        One dict per match with ``file``, ``procedure`` and ``auth_level``
        (the builder name that matched).
    """
    conventional_roots = (
        "packages/trpc", "src/server/trpc", "server/trpc", "src/trpc", "app/trpc",
    )
    queries = [("**/*.ts", root) for root in conventional_roots]
    queries.append(("**/routers/**/*.ts", "."))

    trpc_files: set[str] = {
        path
        for pattern, root in queries
        for path in fs_tools.glob(pattern, root).get("matches", [])
        if "node_modules" not in path
    }

    builder_re = re.compile(
        r"\b(\w+)\s*[:=]\s*(?:router\.)?"
        r"(publicProcedure|authedProcedure|protectedProcedure|"
        r"authedAdminProcedure|importHandler|organizationProcedure)"
    )
    # Identifiers that are language keywords rather than procedure names.
    not_a_name = {"const", "export", "let", "var", "return", "type", "interface"}

    procedures: list[dict] = []
    for path in sorted(trpc_files):
        listing = fs_tools.read_file(path)
        if "error" in listing:
            continue
        for entry in listing.get("content", "").split("\n"):
            # Text before the first tab is dropped -- presumably a line-number
            # prefix added by read_file; confirm against FileSystemTools.
            text = entry.split("\t", 1)[1] if "\t" in entry else entry
            hit = builder_re.search(text)
            if hit is None or hit.group(1) in not_a_name:
                continue
            procedures.append({
                "file": path,
                "procedure": hit.group(1),
                "auth_level": hit.group(2),
            })

    return procedures
181
+
182
+
183
def _discover_callback_handlers(fs_tools: FileSystemTools) -> list[dict]:
    """Collect files named like OAuth / integration callback handlers.

    Returns:
        One dict per distinct file with ``file`` and ``route``. The route is
        the path from ``/api/`` onwards when the file path contains
        ``/api/``, otherwise the file path itself.
    """
    name_globs = (
        "**/callback.ts",
        "**/callback.js",
        "**/callback/route.ts",
        "**/callback/route.js",
    )

    handlers: list[dict] = []
    recorded: set[str] = set()
    for name_glob in name_globs:
        for path in fs_tools.glob(name_glob, ".").get("matches", []):
            if "node_modules" in path or path in recorded:
                continue
            recorded.add(path)
            _, marker, tail = path.partition("/api/")
            route = "/api/" + tail if marker else path
            handlers.append({"file": path, "route": route})

    return handlers
205
+
206
+
207
def _discover_supabase_surface(fs_tools: FileSystemTools) -> list[dict]:
    """Collect Supabase migration SQL files, config files and edge functions.

    Returns:
        One dict per distinct file with ``file``, ``type`` (``migration``,
        ``config`` or ``edge_function``) and ``label``.
    """
    entries: list[dict] = []
    recorded: set[str] = set()

    def _add(path: str, kind: str, label: str) -> None:
        # Keep the first entry for a path; later duplicates are ignored.
        if path in recorded:
            return
        recorded.add(path)
        entries.append({"file": path, "type": kind, "label": label})

    # Migrations: both nested and top-level globs are queried, then deduplicated.
    for sql_glob in ("supabase/migrations/**/*.sql", "supabase/migrations/*.sql"):
        for path in fs_tools.glob(sql_glob, ".").get("matches", []):
            _add(path, "migration", os.path.basename(path))

    # Config: a file counts as present when reading it reports no error.
    for config_path in ("supabase/config.toml", "supabase/config.ts"):
        if "error" not in fs_tools.read_file(config_path):
            _add(config_path, "config", config_path)

    # Edge functions: every index.ts below supabase/functions.
    edge = fs_tools.glob("supabase/functions/**/index.ts", ".")
    for path in edge.get("matches", []):
        _add(path, "edge_function", path)

    return entries
243
+
244
+
245
def _discover_route_handlers(fs_tools: FileSystemTools) -> list[dict]:
    """Find JS/TS files that register HTTP routes on an app/router object.

    A file qualifies when any of its lines matches a call such as
    ``app.get(``, ``router.use(`` or ``server.register(`` (case-insensitive).

    Returns:
        One ``{"file", "label"}`` dict per qualifying file, in sorted path
        order.
    """
    route_call = re.compile(
        r"(?:app|router|server|hono)\s*\.\s*"
        r"(?:get|post|put|patch|delete|all|use|register|route)\s*\(",
        re.IGNORECASE,
    )
    roots = (
        "src", "server", "api", "routes", "lib",
        "src/gateway", "src/server", "src/api", "src/routes",
    )
    extension_globs = ("**/*.ts", "**/*.js", "**/*.mts", "**/*.mjs")

    candidates: set[str] = set()
    for root in roots:
        for ext_glob in extension_globs:
            for path in fs_tools.glob(ext_glob, root).get("matches", []):
                # Substring checks: any path containing "node_modules" or
                # "test" (case-insensitive) is left out.
                if "node_modules" in path or "test" in path.lower():
                    continue
                candidates.add(path)

    handlers: list[dict] = []
    for path in sorted(candidates):
        listing = fs_tools.read_file(path)
        if "error" in listing:
            continue
        for entry in listing.get("content", "").split("\n"):
            # Drop anything before the first tab (presumably a line-number
            # prefix from read_file -- confirm against FileSystemTools).
            text = entry.split("\t", 1)[1] if "\t" in entry else entry
            if route_call.search(text):
                handlers.append({"file": path, "label": path})
                break  # one hit is enough to classify the file

    return handlers
285
+
286
+
287
def _discover_django_surface(fs_tools: FileSystemTools) -> list[dict]:
    """Collect Django-style view, URL, serializer, viewset, api, form and admin files.

    Selection is purely by file name / location; file contents are not read.

    Returns:
        One ``{"file", "label"}`` dict per distinct file. The label is
        prefixed according to whether the path mentions ``views``, ``urls``
        or ``serializer`` (checked in that order); otherwise it is the path.
    """
    name_globs = (
        "**/views.py", "**/views/**/*.py",
        "**/urls.py",
        "**/serializers.py", "**/serializers/**/*.py",
        "**/viewsets.py", "**/viewsets/**/*.py",
        "**/api/*.py", "**/api/**/*.py",
        "**/forms.py",
        "**/admin.py",
    )
    excluded_dirs = {"node_modules", "venv", ".venv", "__pycache__", "site-packages"}

    def _label_for(path: str) -> str:
        if "views" in path:
            return f"Django view: {path}"
        if "urls" in path:
            return f"Django URL config: {path}"
        if "serializer" in path:
            return f"DRF serializer: {path}"
        return path

    entries: list[dict] = []
    recorded: set[str] = set()
    for name_glob in name_globs:
        for path in fs_tools.glob(name_glob, ".").get("matches", []):
            if excluded_dirs & set(path.split("/")):
                continue
            # Substring filters for package markers, tests and migrations.
            if "__init__" in path or "test" in path.lower() or "migrations" in path:
                continue
            if path in recorded:
                continue
            recorded.add(path)
            entries.append({"file": path, "label": _label_for(path)})

    return entries
320
+
321
+
322
def _discover_flask_surface(fs_tools: FileSystemTools) -> list[dict]:
    """Find Python files that carry Flask-style route decorators.

    A file qualifies when any line contains ``@<name>.route(``,
    ``@<name>.before_request`` or ``@<name>.errorhandler``.

    Returns:
        One ``{"file", "label"}`` dict per qualifying file, in sorted path
        order.
    """
    decorator_re = re.compile(
        r"@\w+\.route\s*\(|@\w+\.before_request|@\w+\.errorhandler",
    )
    roots = (".", "src", "app", "api", "routes", "blueprints", "views")
    excluded_dirs = {
        "node_modules", "venv", ".venv", "__pycache__", "site-packages", "migrations",
    }

    python_files: set[str] = set()
    for root in roots:
        for path in fs_tools.glob("**/*.py", root).get("matches", []):
            if excluded_dirs & set(path.split("/")):
                continue
            python_files.add(path)

    entries: list[dict] = []
    for path in sorted(python_files):
        listing = fs_tools.read_file(path)
        if "error" in listing:
            continue
        for entry in listing.get("content", "").split("\n"):
            # Drop anything before the first tab (presumably a line-number
            # prefix from read_file -- confirm against FileSystemTools).
            text = entry.split("\t", 1)[1] if "\t" in entry else entry
            if decorator_re.search(text):
                entries.append({"file": path, "label": f"Flask route: {path}"})
                break  # first decorator is enough to classify the file

    return entries
356
+
357
+
358
# Directory names that exclude a path from discovery when they appear as one
# of its "/"-separated components (dependencies, build output, tests, ...).
_SKIP_DIRS = {"node_modules", "venv", ".venv", "__pycache__", "site-packages",
              "migrations", ".git", "dist", "build", ".next", "coverage",
              "test", "tests", "__tests__", "spec", "fixtures", "mocks"}

# High-signal patterns worth grepping the entire repo for.
# These are sinks that almost always indicate a real vulnerability when
# combined with user input — worth reading regardless of file location.
# Each entry is (regex, category, description).
_HIGH_SIGNAL_GREP_PATTERNS: list[tuple[str, str, str]] = [
    # SSTI
    (r"render_template_string\s*\(", "SSTI", "Flask render_template_string"),
    (r"Environment\s*\(.*\)\.from_string", "SSTI", "Jinja2 from_string"),
    (r"ejs\.render\s*\(", "SSTI", "EJS render"),
    (r"nunjucks\.renderString\s*\(", "SSTI", "Nunjucks renderString"),
    # XSS -- mark_safe disables escaping of its argument; it is an
    # unescaped-output sink, not a template-injection sink.
    (r"mark_safe\s*\(", "XSS", "Django mark_safe"),
    # Command injection
    (r"subprocess\.\w+\(.*shell\s*=\s*True", "RCE", "subprocess shell=True"),
    (r"os\.system\s*\(", "RCE", "os.system"),
    (r"os\.popen\s*\(", "RCE", "os.popen"),
    # SQL injection (ORM escape hatches)
    (r"\.raw\s*\(\s*f['\"]", "SQLi", "ORM .raw() with f-string"),
    (r"\.extra\s*\(\s*(?:select|where|tables)", "SQLi", "Django .extra()"),
    (r"RawSQL\s*\(", "SQLi", "Django RawSQL"),
    (r"text\s*\(\s*f['\"]", "SQLi", "SQLAlchemy text() with f-string"),
    # Prototype pollution
    (r"__proto__", "Prototype Pollution", "__proto__ reference"),
    (r"(?:lodash|_)\.(?:merge|defaultsDeep)\s*\(", "Prototype Pollution", "lodash deep merge"),
    # Path traversal (Flask / Django / Express file responses)
    (r"send_file\s*\(", "Path Traversal", "Flask send_file"),
    (r"FileResponse\s*\(", "Path Traversal", "Django FileResponse"),
    (r"sendFile\s*\(", "Path Traversal", "Express sendFile"),
]
389
+
390
+
391
def _discover_rails_surface(fs_tools: FileSystemTools) -> list[dict]:
    """Collect Ruby files from the conventional Rails app directories.

    Controllers, services, middleware and GraphQL files are selected by
    location only; their contents are not read.

    Returns:
        One dict per distinct file with ``file``, ``trigger``
        (``rails_<kind>``) and ``danger_signals`` (a single descriptive
        entry for controllers, middleware and services; empty for GraphQL).
    """
    locations = (
        ("app/controllers/**/*.rb", "controller"),
        ("app/services/**/*.rb", "service"),
        ("app/middleware/**/*.rb", "middleware"),
        ("app/graphql/**/*.rb", "graphql"),
    )
    # Path fragments (substring match) that disqualify a file.
    excluded_fragments = ("test/", "spec/", "concerns/", "node_modules/", "vendor/gems/")
    descriptions = {
        "controller": "Rails controller — handles HTTP requests",
        "middleware": "Middleware — request/response processing",
        "service": "Service layer — business logic",
    }

    entries: list[dict] = []
    recorded: set[str] = set()
    for location_glob, kind in locations:
        for filepath in fs_tools.glob(location_glob, ".").get("matches", []):
            if filepath in recorded:
                continue
            if any(fragment in filepath for fragment in excluded_fragments):
                continue
            recorded.add(filepath)

            if kind in descriptions:
                danger_signals = [{"description": descriptions[kind]}]
            else:
                danger_signals = []

            entries.append({
                "file": filepath,
                "trigger": f"rails_{kind}",
                "danger_signals": danger_signals,
            })

    return entries
429
+
430
+
431
def _discover_danger_pattern_files(
    fs_tools: FileSystemTools,
    already_discovered: set[str],
) -> list[dict]:
    """Find files containing high-signal sinks that are not yet on the surface.

    One grep with all ``_HIGH_SIGNAL_GREP_PATTERNS`` OR-ed together selects
    candidate files; each candidate is then read once and labelled with the
    first pattern (in list order) that matches its content.

    Args:
        fs_tools: Filesystem helper providing ``grep`` and ``read_file``.
        already_discovered: Files to leave out of the result.

    Returns:
        One dict per file with ``file``, ``label``, ``category``
        (always ``"danger_pattern"``) and ``trigger`` (pattern description).
    """
    alternation = "|".join(f"({pattern})" for pattern, _, _ in _HIGH_SIGNAL_GREP_PATTERNS)
    grep_result = fs_tools.grep(alternation, ".")

    candidates: set[str] = set()
    for hit in grep_result.get("matches", []):
        # A grep match may be a plain path or a dict carrying a "file" key.
        path = hit if isinstance(hit, str) else hit.get("file", "")
        if not path or path in already_discovered:
            continue
        if _SKIP_DIRS & set(path.split("/")):
            continue
        candidates.add(path)

    if not candidates:
        return []

    matchers = [
        (re.compile(pattern, re.IGNORECASE), category, description)
        for pattern, category, description in _HIGH_SIGNAL_GREP_PATTERNS
    ]

    results: list[dict] = []
    for path in sorted(candidates):
        listing = fs_tools.read_file(path)
        if "error" in listing:
            continue
        text = listing.get("content", "")
        first_hit = next(
            ((category, description)
             for regex, category, description in matchers
             if regex.search(text)),
            None,
        )
        if first_hit is None:
            continue
        category, description = first_hit
        results.append({
            "file": path,
            "label": f"Danger pattern ({category}): {path}",
            "category": "danger_pattern",
            "trigger": description,
        })

    return results
484
+
485
+
486
def _follow_import_chains(
    fs_tools: FileSystemTools,
    seed_files: set[str],
    max_depth: int = 2,
) -> list[dict]:
    """Follow relative imports from the seed files to find transitive dependencies.

    Starting from ``seed_files``, relative JS/TS imports (via ``_IMPORT_RE``)
    and relative Python ``from .x import y`` imports are resolved breadth
    first, up to ``max_depth`` levels. At most 100 files (in sorted order)
    are expanded per level.

    Args:
        fs_tools: Filesystem helper used to read files and resolve imports.
        seed_files: Files already on the attack surface; never returned.
        max_depth: Number of import levels to follow.

    Returns:
        One dict per newly reached file with ``file``, ``label``,
        ``category`` (``"imported_dependency"``), ``depth``,
        ``has_danger_signals`` and up to five ``danger_signals``. Entries are
        ordered by depth, then by path, so the output is stable across runs.
    """
    visited: set[str] = set(seed_files)
    frontier: set[str] = set(seed_files)
    found: list[dict] = []

    max_frontier = 100
    followed_suffixes = (".ts", ".tsx", ".js", ".jsx", ".py")
    # Compiled once instead of on every scanned line.
    py_relative_import = re.compile(r"from\s+(\.+\w[\w.]*)\s+import")

    for depth in range(max_depth):
        next_frontier: set[str] = set()

        def _admit(resolved: Optional[str]) -> None:
            # Queue a resolved path unless it is unknown, already seen, or
            # lives under one of the skipped directories.
            if not resolved or resolved in visited:
                return
            if set(resolved.split("/")).intersection(_SKIP_DIRS):
                return
            visited.add(resolved)
            next_frontier.add(resolved)

        for file_path in sorted(frontier)[:max_frontier]:
            if not file_path.endswith(followed_suffixes):
                continue

            content_result = fs_tools.read_file(file_path)
            if "error" in content_result:
                continue

            is_python = file_path.endswith(".py")
            for line in content_result.get("content", "").split("\n"):
                # Drop anything before the first tab (presumably a line-number
                # prefix from read_file -- confirm against FileSystemTools).
                raw_line = line.split("\t", 1)[1] if "\t" in line else line

                # JS/TS imports
                for m in _IMPORT_RE.finditer(raw_line):
                    source = m.group(1) or m.group(2)
                    if source and source.startswith("."):
                        _admit(_resolve_import(source, file_path, fs_tools))

                # Python imports (from .foo import bar / from ..utils import x)
                if is_python:
                    py_match = py_relative_import.match(raw_line.strip())
                    if py_match:
                        _admit(_resolve_python_import(py_match.group(1), file_path, fs_tools))

        # Run the danger scan on newly found files so they can be prioritised.
        # Sorted: iterating the raw set would make the order of `found`
        # depend on string hashing and therefore vary between runs.
        for new_file in sorted(next_frontier):
            content_result = fs_tools.read_file(new_file)
            signals = []
            if "error" not in content_result:
                signals = _quick_danger_scan(content_result.get("content", ""))

            label = f"Import dep (depth={depth + 1}): {new_file}"
            if signals:
                signal_cats = ", ".join(sorted(set(s["category"] for s in signals)))
                label = f"Import dep [{signal_cats}] (depth={depth + 1}): {new_file}"

            found.append({
                "file": new_file,
                "label": label,
                "category": "imported_dependency",
                "depth": depth + 1,
                "has_danger_signals": bool(signals),
                "danger_signals": signals[:5],
            })

        frontier = next_frontier

    return found
572
+
573
+
574
def _resolve_python_import(
    rel_module: str,
    from_file: str,
    fs_tools: FileSystemTools,
) -> Optional[str]:
    """Resolve a Python relative import to a file path.

    Args:
        rel_module: The dotted relative module, e.g. ``.utils``,
            ``..models`` or a bare ``.`` / ``..``.
        from_file: Path of the file containing the import.
        fs_tools: Filesystem helper; a candidate exists when ``read_file``
            reports no ``"error"`` for it.

    Returns:
        The first existing candidate: ``<module>.py`` or
        ``<module>/__init__.py`` for a named module, or the package's
        ``__init__.py`` when ``rel_module`` consists of dots only.
        ``None`` when nothing matches.
    """
    # Number of leading dots: 1 = same package, 2 = parent package, etc.
    dots = len(rel_module) - len(rel_module.lstrip("."))
    module_path = rel_module[dots:].replace(".", "/")

    base_dir = os.path.dirname(from_file)
    for _ in range(dots - 1):
        base_dir = os.path.dirname(base_dir)

    if module_path:
        candidate_base = os.path.normpath(os.path.join(base_dir, module_path))
        candidates = [candidate_base + ".py", candidate_base + "/__init__.py"]
    else:
        # Dots only: the target is the package itself. Appending ".py" to the
        # package directory would point at an unrelated sibling file.
        candidates = [os.path.normpath(os.path.join(base_dir, "__init__.py"))]

    for candidate in candidates:
        if "error" not in fs_tools.read_file(candidate):
            return candidate

    return None
606
+
607
+
608
def _discover_all_source_files(
    fs_tools: FileSystemTools,
    already_discovered: set[str] | None = None,
) -> list[dict]:
    """List source files as attack surface (small-codebase fallback).

    Files whose content yields danger signals are always returned; files
    without signals (including unreadable ones) are limited to the first 50.

    Args:
        fs_tools: Filesystem helper providing ``glob`` and ``read_file``.
        already_discovered: Files to leave out; ``None`` means none.

    Returns:
        ``{"file", "label"}`` dicts: flagged files first (label prefixed
        with their signal categories), then the capped unflagged files.
    """
    excluded = set() if already_discovered is None else already_discovered
    extension_globs = (
        "**/*.ts", "**/*.js", "**/*.tsx", "**/*.jsx", "**/*.py",
        "**/*.sql", "**/*.toml",
    )

    ordered_paths: list[str] = []
    taken: set[str] = set()
    for ext_glob in extension_globs:
        for path in fs_tools.glob(ext_glob, ".").get("matches", []):
            if _SKIP_DIRS & set(path.split("/")):
                continue
            if path in taken or path in excluded:
                continue
            taken.add(path)
            ordered_paths.append(path)

    flagged: list[dict] = []
    unflagged: list[dict] = []
    for path in ordered_paths:
        listing = fs_tools.read_file(path)
        # Unreadable files are kept, but treated as having no signals.
        signals = [] if "error" in listing else _quick_danger_scan(listing.get("content", ""))
        if signals:
            categories = ", ".join(sorted({signal["category"] for signal in signals}))
            flagged.append({"file": path, "label": f"[{categories}] {path}"})
        else:
            unflagged.append({"file": path, "label": path})

    return flagged + unflagged[:50]
652
+
653
+
654
def _all_endpoint_files(surface: dict) -> list[dict]:
    """Flatten every category of the surface into one list of endpoints.

    Args:
        surface: Attack-surface dict; categories that are absent are
            treated as empty.

    Returns:
        ``{"file", "category", "label"}`` dicts in category order. Rails
        entries (stored under ``rails_controllers``) are included so they
        take part in coverage like every other discovered file; their
        category is the entry's ``trigger`` (e.g. ``rails_service``),
        falling back to ``rails_controller``.
    """
    # (surface key, category, entry key holding the preferred label).
    # A label key of None means the file path is always used as the label.
    sections = (
        ("api_routes", "api_route", "route"),
        ("server_actions", "server_action", "function"),
        ("trpc_procedures", "trpc", "procedure"),
        ("callback_handlers", "callback", "route"),
        ("middleware", "middleware", None),
        ("supabase_migrations", "supabase", "label"),
        ("route_handlers", "route_handler", "label"),
        ("django_views", "django_view", "label"),
        ("flask_routes", "flask_route", "label"),
        ("rails_controllers", "rails_controller", "label"),
        ("danger_files", "danger_pattern", "label"),
        ("imported_dependencies", "imported_dependency", "label"),
        ("source_files", "source_file", "label"),
    )

    endpoints: list[dict] = []
    for surface_key, category, label_key in sections:
        for ep in surface.get(surface_key, []):
            if label_key is None:
                label = ep["file"]
            else:
                label = ep.get(label_key, ep["file"])
            entry_category = category
            if surface_key == "rails_controllers":
                # Rails entries record their kind in "trigger".
                entry_category = ep.get("trigger", category)
            endpoints.append({"file": ep["file"], "category": entry_category, "label": label})
    return endpoints
682
+
683
+
684
def compute_coverage(attack_surface: dict, files_analyzed: list[str]) -> dict:
    """Measure how much of the static attack surface was actually analyzed.

    Endpoints are deduplicated by lower-cased file path (the first entry
    for a file wins) and a file counts as covered when its lower-cased path
    appears in ``files_analyzed``.

    Args:
        attack_surface: Surface dict as produced by discovery.
        files_analyzed: Paths of the files that were read during analysis.

    Returns:
        A report with totals, ``coverage_pct`` (one decimal; ``100.0`` when
        the surface is empty), a per-category covered/missed breakdown, and
        the covered and missed endpoint lists.
    """
    analyzed = {path.lower() for path in files_analyzed}

    # Several endpoints can live in one file; count each file once.
    by_file: dict[str, dict] = {}
    for endpoint in _all_endpoint_files(attack_surface):
        by_file.setdefault(endpoint["file"].lower(), endpoint)

    covered: list[dict] = [ep for key, ep in by_file.items() if key in analyzed]
    missed: list[dict] = [ep for key, ep in by_file.items() if key not in analyzed]
    total = len(covered) + len(missed)

    # Per-category tally, covered endpoints first so category order matches
    # the order in which they are first encountered.
    categories: dict = {}
    for bucket, bucket_endpoints in (("covered", covered), ("missed", missed)):
        for ep in bucket_endpoints:
            tally = categories.setdefault(ep["category"], {"covered": 0, "missed": 0})
            tally[bucket] += 1

    if total > 0:
        coverage_pct = round(len(covered) / total * 100, 1)
    else:
        coverage_pct = 100.0

    return {
        "total_endpoints": total,
        "covered_count": len(covered),
        "missed_count": len(missed),
        "coverage_pct": coverage_pct,
        "categories": categories,
        "covered": covered,
        "missed": missed,
        "files_analyzed_count": len(files_analyzed),
    }
731
+
732
+
733
+ # ---------------------------------------------------------------------------
734
+ # Enrichment: build context clusters for coverage-guided second pass
735
+ # ---------------------------------------------------------------------------
736
+
737
# Regex "smoke signal" table used by _quick_danger_scan for the coverage
# second pass. Each entry is a 3-tuple:
#   (regex source, vulnerability category, human-readable description)
# _quick_danger_scan compiles every regex with re.IGNORECASE and tests it
# against one line at a time, so no pattern can match across a line break.
# Patterns from ASTTools.find_dangerous_patterns -- kept in sync
_DANGER_PATTERNS = [
    # --- XSS ---
    (r"dangerouslySetInnerHTML\s*=\s*\{\s*\{\s*__html\s*:", "XSS", "dangerouslySetInnerHTML usage"),
    (r"innerHTML\s*=", "XSS", "innerHTML assignment"),
    (r"document\.write\s*\(", "XSS", "document.write usage"),
    (r"v-html\s*=", "XSS", "Vue v-html directive"),
    (r"\|\s*safe\b", "XSS", "Django |safe filter (unescaped output)"),
    (r"mark_safe\s*\(", "XSS", "Django mark_safe (unescaped output)"),
    (r"Markup\s*\(", "XSS", "Flask/Jinja2 Markup() (unescaped output)"),
    # --- SSTI / Template Injection ---
    (r"render_template_string\s*\(", "SSTI", "Flask render_template_string (template injection sink)"),
    (r"Template\s*\(\s*(?:req|request|data|user|input|params|body|f['\"])", "SSTI", "Template constructed from user input"),
    (r"Environment\s*\(.*\)\.from_string\s*\(", "SSTI", "Jinja2 Environment.from_string (template injection)"),
    (r"\.render\s*\(\s*(?:req|request|data|user|input|params|body)", "SSTI", "Template render with user-controlled data"),
    (r"ejs\.render\s*\(", "SSTI", "EJS render (potential SSTI)"),
    (r"nunjucks\.renderString\s*\(", "SSTI", "Nunjucks renderString (template injection)"),
    (r"pug\.render\s*\(", "SSTI", "Pug render from string"),
    # --- RCE / Command Injection ---
    (r"eval\s*\(", "RCE", "eval() usage"),
    (r"new\s+Function\s*\(", "RCE", "Function constructor"),
    (r"exec\s*\(\s*[`'\"].*\$\{", "RCE", "Command injection risk"),
    (r"child_process.*exec", "RCE", "child_process exec usage"),
    (r"subprocess\.\w+\(.*shell\s*=\s*True", "RCE", "Python subprocess with shell=True"),
    (r"os\.system\s*\(", "RCE", "os.system() call"),
    (r"os\.popen\s*\(", "RCE", "os.popen() call"),
    (r"commands\.getoutput\s*\(", "RCE", "commands.getoutput() call"),
    # --- SQL Injection ---
    (r"\$\{.*\}\s*(?:SELECT|INSERT|UPDATE|DELETE|FROM|WHERE)", "SQLi", "String interpolation in SQL"),
    (r"\.raw\s*\(\s*(?:f['\"]|['\"].*%|['\"].*\+|['\"].*format)", "SQLi", "Django/SQLAlchemy .raw() with string formatting"),
    (r"\.extra\s*\(", "SQLi", "Django .extra() (raw SQL injection risk)"),
    (r"RawSQL\s*\(", "SQLi", "Django RawSQL expression"),
    (r"text\s*\(\s*f['\"]", "SQLi", "SQLAlchemy text() with f-string"),
    (r"execute\s*\(\s*f['\"]", "SQLi", "Raw SQL execute with f-string"),
    (r"Sequelize\.literal\s*\(", "SQLi", "Sequelize.literal (raw SQL)"),
    (r"knex\.raw\s*\(", "SQLi", "Knex.raw (raw SQL)"),
    # --- Path Traversal ---
    (r"sendFile\s*\(\s*(?:req|request|params|query|path)", "Path Traversal", "Express sendFile with user input"),
    (r"res\.download\s*\(\s*(?:req|request|params|query|path)", "Path Traversal", "Express res.download with user input"),
    (r"open\s*\(\s*(?:req|request|params|os\.path\.join.*request)", "Path Traversal", "File open with user-controlled path"),
    (r"send_file\s*\(\s*(?:req|request|path|os\.path\.join)", "Path Traversal", "Flask send_file with user input"),
    (r"FileResponse\s*\(\s*(?:req|request|path|os\.path\.join)", "Path Traversal", "Django FileResponse with user input"),
    # Matches a literal "../" or "..\" anywhere on the line.
    (r"\.\.\/|\.\.\\", "Path Traversal", "Directory traversal sequence in code"),
    # --- Prototype Pollution ---
    (r"__proto__", "Prototype Pollution", "__proto__ reference"),
    (r"Object\.assign\s*\(\s*\{\s*\}\s*,\s*(?:req|request|body|params|input)", "Prototype Pollution", "Object.assign from user input"),
    (r"(?:lodash|_)\.merge\s*\(", "Prototype Pollution", "lodash.merge (deep merge, pollution risk)"),
    (r"(?:lodash|_)\.defaultsDeep\s*\(", "Prototype Pollution", "lodash.defaultsDeep (pollution risk)"),
    (r"deepmerge|deep-extend|merge-deep", "Prototype Pollution", "Deep merge library import"),
    # --- Open Redirect ---
    (r"redirect\s*\(\s*(?:req|request|params|query|searchParams|state|returnTo|url|res)", "Open Redirect", "User-controlled redirect"),
    (r"(?:NextResponse\.redirect|res\.redirect)\s*\(", "Open Redirect", "Redirect call"),
    (r"(?:returnTo|redirectTo|onErrorReturnTo|callbackUrl|redirect_url)", "Open Redirect", "Redirect parameter name"),
    # --- SSRF ---
    (r"(?:fetch|axios|http\.request|urllib\.request|requests\.get|requests\.post)\s*\(\s*(?:req|request|params|query|url|data)", "SSRF", "User-controlled URL in request"),
    # --- Hardcoded Secrets ---
    (r"(?:password|secret|key|token)\s*=\s*['\"][^'\"]+['\"]", "Hardcoded Secret", "Hardcoded credential"),
    # --- IDOR ---
    (r"findUnique\s*\(\s*\{\s*where\s*:\s*\{\s*id", "IDOR", "Lookup by ID without ownership check"),
    # --- Race Conditions ---
    (r"\.save\s*\(\s*\).*\.save\s*\(\s*\)", "Race Condition", "Multiple .save() calls (non-atomic update)"),
    # Bare keyword match: fires on any line that merely mentions one of these
    # words, so treat hits as a prompt to check atomicity, not as a finding.
    (r"balance|inventory|quantity|stock|credits", "Race Condition", "Financial/inventory field (check atomicity)"),
]
800
+
801
# Extracts the module specifier from JavaScript/TypeScript-style imports.
#   group 1 -- specifier of `import <name | {names} | * as name> from '<spec>'`
#   group 2 -- specifier of a CommonJS-style `require('<spec>')` call
# Only one of the two groups is populated for a given match, which is why
# enrich_missed_endpoints reads it as `m.group(1) or m.group(2)`.
# Forms not covered by the pattern (e.g. a bare `import './x'` with no
# `from` clause) produce no match.
_IMPORT_RE = re.compile(
    r"""(?:import\s+(?:\w+|\{[^}]+\}|\*\s+as\s+\w+)\s+from\s+['\"]([^'\"]+)['\"]"""
    r"""|require\s*\(\s*['\"]([^'\"]+)['\"]\s*\))""",
)
805
+
806
+
807
+ def _resolve_import(source: str, from_file: str, fs_tools: FileSystemTools) -> Optional[str]:
808
+ """Resolve a relative import to an actual file path."""
809
+ if not source.startswith("."):
810
+ return None
811
+
812
+ from_dir = os.path.dirname(from_file)
813
+ candidate_base = os.path.normpath(os.path.join(from_dir, source))
814
+
815
+ extensions = ["", ".ts", ".tsx", ".js", ".jsx", "/index.ts", "/index.js"]
816
+ for ext in extensions:
817
+ candidate = candidate_base + ext
818
+ result = fs_tools.read_file(candidate)
819
+ if "error" not in result:
820
+ return candidate
821
+ return None
822
+
823
+
824
def _quick_danger_scan(content: str) -> list[dict]:
    """Regex-scan file text against ``_DANGER_PATTERNS``, one line at a time.

    Every pattern is compiled case-insensitively and searched against each
    line. Results are grouped by pattern (in table order) and, within a
    pattern, ordered by line.

    Returns a list of dicts with ``line`` (1-based index into ``content``),
    ``category``, ``description`` and ``content`` (the stripped line, capped
    at 150 characters).
    """
    # Drop everything up to and including the first tab on each line.
    # NOTE(review): presumably removes a "<line number>\t" prefix added by the
    # file reader -- confirm against FileSystemTools.read_file.
    code_lines = [
        text.partition("\t")[2] if "\t" in text else text
        for text in content.split("\n")
    ]

    hits = []
    for regex_source, category, description in _DANGER_PATTERNS:
        matcher = re.compile(regex_source, re.IGNORECASE)
        hits.extend(
            {
                "line": line_no,
                "category": category,
                "description": description,
                "content": code.strip()[:150],
            }
            for line_no, code in enumerate(code_lines, 1)
            if matcher.search(code)
        )
    return hits
840
+
841
+
842
def enrich_missed_endpoints(
    missed: list[dict],
    fs_tools: FileSystemTools,
) -> list[dict]:
    """Attach quick-scan context to each attack-surface file that was missed.

    For every endpoint in ``missed`` the file is read through ``fs_tools``
    and, when the read result carries no ``"error"`` entry:

    * ``danger_signals`` is filled from ``_quick_danger_scan``;
    * ``imports`` collects every relative import (matched by ``_IMPORT_RE``)
      that ``_resolve_import`` can map to a file, in order of appearance.

    Files that cannot be read are still returned, with both fields empty.

    Returns one dict per input endpoint: a copy of the endpoint plus the
    ``danger_signals`` and ``imports`` fields.
    """
    clusters: list[dict] = []

    for endpoint in missed:
        path = endpoint["file"]
        cluster: dict = {**endpoint, "danger_signals": [], "imports": []}

        read_result = fs_tools.read_file(path)
        if "error" not in read_result:
            text = read_result.get("content", "")

            # Smoke signals from the regex table.
            cluster["danger_signals"] = _quick_danger_scan(text)

            # Local dependency context.
            for numbered_line in text.split("\n"):
                # Same prefix handling as the danger scan: keep only what
                # follows the first tab, if there is one.
                _, tab, remainder = numbered_line.partition("\t")
                code = remainder if tab else numbered_line

                for match in _IMPORT_RE.finditer(code):
                    specifier = match.group(1) or match.group(2)
                    if not specifier or not specifier.startswith("."):
                        continue
                    target = _resolve_import(specifier, path, fs_tools)
                    if target:
                        cluster["imports"].append(target)

        clusters.append(cluster)

    return clusters
888
+
889
+
890
def build_researcher_zones(
    attack_surface: dict,
    num_zones: int = 7,
    max_files_per_zone: int = 50,
) -> list[dict]:
    """Divide attack surface files into zones for parallel researcher assignment.

    Files are collected from every list-valued section of ``attack_surface``,
    grouped by their first two meaningful directory components, and the
    groups are handed out greedily (highest-scoring group first) to whichever
    zone currently holds the fewest files.

    Args:
        attack_surface: mapping of section name to a list of endpoint dicts.
            Non-list sections, non-dict entries and entries without a
            ``file`` value are ignored.
        num_zones: maximum number of zones to produce. Values below 1 yield
            an empty result.
        max_files_per_zone: directory groups larger than this are split into
            chunks of this size before assignment. Values below 1 are
            treated as 1. This caps the size of a *group*, not of the final
            zone: a zone may receive several groups.

    Returns:
        An empty list when ``num_zones`` is below 1 or when there are fewer
        than ``max(num_zones * 2, 10)`` unique files (too small to benefit
        from zoning). Otherwise a list of zone dicts, each containing:

        - name: up to three group labels joined with " + "
        - files: list of {file, categories, danger_signals} dicts
        - file_count: number of files in the zone
        - file_paths: set of the file paths in the zone
        - danger_summary: {category: count}
        - scope_text: formatted text for the researcher prompt
    """
    # With no zones to fill there is nothing to assign; without this guard
    # the greedy step would call min() on an empty zone list and raise.
    if num_zones < 1:
        return []

    file_meta: dict[str, dict] = {}

    for section_key, entries in attack_surface.items():
        if not isinstance(entries, list):
            continue
        for ep in entries:
            # Every list-valued section is walked, so tolerate sections
            # whose items are not endpoint dicts instead of crashing on .get.
            if not isinstance(ep, dict):
                continue
            fp = ep.get("file", "")
            if not fp:
                continue
            if fp not in file_meta:
                file_meta[fp] = {
                    "file": fp,
                    "categories": [],
                    "danger_signals": [],
                }
            meta = file_meta[fp]
            meta["categories"].append(section_key)
            if ep.get("trigger"):
                meta["danger_signals"].append(ep["trigger"])
            for sig in ep.get("danger_signals") or []:
                desc = sig.get("description", "") if isinstance(sig, dict) else str(sig)
                if desc:
                    meta["danger_signals"].append(desc)

    min_for_zoning = max(num_zones * 2, 10)
    if len(file_meta) < min_for_zoning:
        return []

    # Group by directory prefix (2 levels deep, skipping noise prefixes)
    noise_prefixes = {".", "src", ""}
    dir_groups: dict[str, list[dict]] = {}
    for fp, meta in file_meta.items():
        path_parts = fp.split("/")
        meaningful = [p for p in path_parts[:-1] if p not in noise_prefixes]
        if len(meaningful) >= 2:
            key = "/".join(meaningful[:2])
        elif meaningful:
            key = meaningful[0]
        else:
            key = "_root"
        dir_groups.setdefault(key, []).append(meta)

    def _group_score(group: list[dict]) -> float:
        # Danger signals weigh twice as much as plain file count.
        danger = sum(len(m["danger_signals"]) for m in group)
        return danger * 2 + len(group)

    # Sort largest / most dangerous groups first for greedy assignment
    sorted_dirs = sorted(dir_groups.items(), key=lambda x: -_group_score(x[1]))

    # Split oversized groups. A chunk size below 1 would make range() raise
    # (step 0) or silently drop the group (negative step), so clamp it.
    chunk_size = max(max_files_per_zone, 1)
    groups_to_assign: list[tuple[str, list[dict]]] = []
    for dir_name, files in sorted_dirs:
        if len(files) > chunk_size:
            for i in range(0, len(files), chunk_size):
                chunk = files[i:i + chunk_size]
                suffix = f" (part {i // chunk_size + 1})" if i > 0 else ""
                groups_to_assign.append((f"{dir_name}{suffix}", chunk))
        else:
            groups_to_assign.append((dir_name, files))

    # Greedy bin-packing: assign each group to the zone with fewest files
    actual_zones = min(num_zones, len(groups_to_assign))
    bins: list[dict] = [{"names": [], "files": []} for _ in range(actual_zones)]

    for dir_name, files in groups_to_assign:
        smallest = min(bins, key=lambda z: len(z["files"]))
        smallest["files"].extend(files)
        smallest["names"].append(dir_name)

    # Convert to final format, dropping any zone that received nothing
    zones: list[dict] = [
        {"name": " + ".join(b["names"][:3]), "files": b["files"]}
        for b in bins
        if b["files"]
    ]

    # Annotate each zone with metadata and scope text
    for zone in zones:
        zone["file_count"] = len(zone["files"])
        zone["file_paths"] = {m["file"] for m in zone["files"]}

        # Category = signal text up to the first "(" and then the first ":".
        danger_summary: dict[str, int] = {}
        for meta in zone["files"]:
            for sig in meta.get("danger_signals", []):
                cat = str(sig).split("(")[0].strip().split(":")[0].strip() if sig else "Unknown"
                danger_summary[cat] = danger_summary.get(cat, 0) + 1
        zone["danger_summary"] = danger_summary

        lines = []
        for meta in zone["files"]:
            cat_str = ", ".join(sorted(set(meta["categories"])))
            line = f" - `{meta['file']}` ({cat_str})"
            if meta["danger_signals"]:
                # Only the first signal is shown to keep the prompt short.
                line += f" — DANGER: {meta['danger_signals'][0]}"
            lines.append(line)

        danger_text = ""
        if danger_summary:
            top_counts = [f"{cat}: {cnt}" for cat, cnt in
                          sorted(danger_summary.items(), key=lambda x: -x[1])[:5]]
            danger_text = f"\n\nDanger signals in this zone: {', '.join(top_counts)}"

        zone["scope_text"] = (
            f"ZONE SCOPE — You are assigned to these {zone['file_count']} files. "
            f"Read and analyze EACH file. Do not skip any.\n"
            f"You may follow imports outside your zone for context, but focus hunting here.\n\n"
            f"Files:\n" + "\n".join(lines) + danger_text
        )

    return zones
1022
+
1023
+
1024
def build_second_pass_tasks(
    enriched_endpoints: list[dict],
    max_files_per_task: int = 8,
) -> list[str]:
    """Turn enriched missed endpoints into second-pass hunter task prompts.

    Endpoints carrying danger signals are placed first, then everything is
    ordered by category and cut into batches of at most
    ``max_files_per_task`` files. Each batch becomes one task string listing
    its files (with up to three signal lines and five imports per file)
    followed by fixed step-by-step instructions.

    Returns the task strings in batch order; an empty input gives ``[]``.
    """

    def _rank(endpoint: dict) -> tuple:
        # Files with signals (0) sort ahead of files without (1).
        has_no_signals = 0 if endpoint.get("danger_signals") else 1
        return (has_no_signals, endpoint.get("category", "z"))

    def _describe(endpoint: dict) -> str:
        pieces = [f" - `{endpoint['file']}` ({endpoint.get('category', 'unknown')})"]
        signals = endpoint.get("danger_signals", [])
        if signals:
            kinds = ", ".join(sorted({sig["category"] for sig in signals}))
            pieces.append(f" -- DANGER SIGNALS: {kinds}")
            pieces.extend(
                f"\n Line {sig['line']}: {sig['description']}: `{sig['content']}`"
                for sig in signals[:3]
            )
        linked = endpoint.get("imports", [])
        if linked:
            pieces.append(f"\n Key imports: {', '.join(linked[:5])}")
        return "".join(pieces)

    ordered = sorted(enriched_endpoints, key=_rank)

    tasks: list[str] = []
    for start in range(0, len(ordered), max_files_per_task):
        batch = ordered[start : start + max_files_per_task]
        file_list = "\n".join(_describe(endpoint) for endpoint in batch)
        tasks.append(
            "COVERAGE SECOND PASS: The following attack surface files were NOT analyzed "
            "in the first hunting pass. Read EACH file listed below and hunt for "
            "vulnerabilities. Do NOT skip any file.\n\n"
            f"Files to analyze ({len(batch)}):\n{file_list}\n\n"
            "For each file:\n"
            "1. Use read_file to read the FULL file content\n"
            "2. If danger signals are noted, investigate those specific lines\n"
            "3. Follow key imports if they contain validation/sanitization logic\n"
            "4. Report any confirmed vulnerabilities using report_finding\n"
            "5. Call finish_hunt when done analyzing ALL files in this batch"
        )

    return tasks