bbot 2.4.2__py3-none-any.whl → 2.4.2.6590rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of bbot has been flagged as possibly problematic by the registry.
Files changed (64):
  1. bbot/__init__.py +1 -1
  2. bbot/core/event/base.py +64 -4
  3. bbot/core/helpers/diff.py +10 -7
  4. bbot/core/helpers/helper.py +5 -1
  5. bbot/core/helpers/misc.py +48 -11
  6. bbot/core/helpers/regex.py +4 -0
  7. bbot/core/helpers/regexes.py +45 -8
  8. bbot/core/helpers/url.py +21 -5
  9. bbot/core/helpers/web/client.py +25 -5
  10. bbot/core/helpers/web/engine.py +9 -1
  11. bbot/core/helpers/web/envelopes.py +352 -0
  12. bbot/core/helpers/web/web.py +10 -2
  13. bbot/core/helpers/yara_helper.py +50 -0
  14. bbot/core/modules.py +23 -7
  15. bbot/defaults.yml +26 -1
  16. bbot/modules/base.py +4 -2
  17. bbot/modules/{deadly/dastardly.py → dastardly.py} +1 -1
  18. bbot/modules/{deadly/ffuf.py → ffuf.py} +1 -1
  19. bbot/modules/ffuf_shortnames.py +1 -1
  20. bbot/modules/httpx.py +14 -0
  21. bbot/modules/hunt.py +24 -6
  22. bbot/modules/internal/aggregate.py +1 -0
  23. bbot/modules/internal/excavate.py +356 -197
  24. bbot/modules/lightfuzz/lightfuzz.py +203 -0
  25. bbot/modules/lightfuzz/submodules/__init__.py +0 -0
  26. bbot/modules/lightfuzz/submodules/base.py +312 -0
  27. bbot/modules/lightfuzz/submodules/cmdi.py +106 -0
  28. bbot/modules/lightfuzz/submodules/crypto.py +474 -0
  29. bbot/modules/lightfuzz/submodules/nosqli.py +183 -0
  30. bbot/modules/lightfuzz/submodules/path.py +154 -0
  31. bbot/modules/lightfuzz/submodules/serial.py +179 -0
  32. bbot/modules/lightfuzz/submodules/sqli.py +187 -0
  33. bbot/modules/lightfuzz/submodules/ssti.py +39 -0
  34. bbot/modules/lightfuzz/submodules/xss.py +191 -0
  35. bbot/modules/{deadly/nuclei.py → nuclei.py} +1 -1
  36. bbot/modules/paramminer_headers.py +2 -0
  37. bbot/modules/reflected_parameters.py +80 -0
  38. bbot/modules/{deadly/vhost.py → vhost.py} +2 -2
  39. bbot/presets/web/lightfuzz-heavy.yml +16 -0
  40. bbot/presets/web/lightfuzz-light.yml +20 -0
  41. bbot/presets/web/lightfuzz-medium.yml +14 -0
  42. bbot/presets/web/lightfuzz-superheavy.yml +13 -0
  43. bbot/presets/web/lightfuzz-xss.yml +21 -0
  44. bbot/presets/web/paramminer.yml +8 -5
  45. bbot/scanner/preset/args.py +26 -0
  46. bbot/scanner/scanner.py +6 -0
  47. bbot/test/test_step_1/test__module__tests.py +1 -1
  48. bbot/test/test_step_1/test_helpers.py +7 -0
  49. bbot/test/test_step_1/test_presets.py +2 -2
  50. bbot/test/test_step_1/test_web.py +20 -0
  51. bbot/test/test_step_1/test_web_envelopes.py +343 -0
  52. bbot/test/test_step_2/module_tests/test_module_excavate.py +404 -29
  53. bbot/test/test_step_2/module_tests/test_module_httpx.py +29 -0
  54. bbot/test/test_step_2/module_tests/test_module_hunt.py +18 -1
  55. bbot/test/test_step_2/module_tests/test_module_lightfuzz.py +1947 -0
  56. bbot/test/test_step_2/module_tests/test_module_paramminer_getparams.py +4 -1
  57. bbot/test/test_step_2/module_tests/test_module_paramminer_headers.py +46 -2
  58. bbot/test/test_step_2/module_tests/test_module_reflected_parameters.py +226 -0
  59. bbot/wordlists/paramminer_parameters.txt +0 -8
  60. {bbot-2.4.2.dist-info → bbot-2.4.2.6590rc0.dist-info}/METADATA +2 -1
  61. {bbot-2.4.2.dist-info → bbot-2.4.2.6590rc0.dist-info}/RECORD +64 -42
  62. {bbot-2.4.2.dist-info → bbot-2.4.2.6590rc0.dist-info}/LICENSE +0 -0
  63. {bbot-2.4.2.dist-info → bbot-2.4.2.6590rc0.dist-info}/WHEEL +0 -0
  64. {bbot-2.4.2.dist-info → bbot-2.4.2.6590rc0.dist-info}/entry_points.txt +0 -0
bbot/modules/internal/excavate.py

@@ -63,6 +63,17 @@ def _exclude_key(original_dict, key_to_exclude):
 
 
 def extract_params_url(parsed_url):
+    """
+    Yields query parameters from a parsed URL.
+
+    Args:
+        parsed_url (ParseResult): The URL to extract parameters from.
+
+    Yields:
+        tuple: Contains the hardcoded HTTP method ('GET'), parsed URL, parameter name,
+        original value, source (hardcoded to 'direct_url'), and additional parameters
+        (all parameters excluding the current one).
+    """
     params = parse_qs(parsed_url.query)
     flat_params = {k: v[0] for k, v in params.items()}
 
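To make the new docstring concrete, here is a standalone sketch of what a generator with this contract yields (a simplified approximation for illustration, not the packaged code):

from urllib.parse import urlparse, parse_qs

def _exclude_key(original_dict, key_to_exclude):
    return {k: v for k, v in original_dict.items() if k != key_to_exclude}

def extract_params_url(parsed_url):
    params = parse_qs(parsed_url.query)
    flat_params = {k: v[0] for k, v in params.items()}
    for name, value in flat_params.items():
        # method and source are hardcoded, per the docstring above
        yield "GET", parsed_url, name, value, "direct_url", _exclude_key(flat_params, name)

parsed = urlparse("https://example.com/search?q=bbot&page=2")
for method, _, name, value, source, extra in extract_params_url(parsed):
    print(method, name, value, source, extra)
# GET q bbot direct_url {'page': '2'}
# GET page 2 direct_url {'q': 'bbot'}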
@@ -303,44 +314,38 @@ class excavate(BaseInternalModule, BaseInterceptModule):
     }
 
     options = {
-        "retain_querystring": False,
         "yara_max_match_data": 2000,
        "custom_yara_rules": "",
+        "speculate_params": False,
     }
     options_desc = {
-        "retain_querystring": "Keep the querystring intact on emitted WEB_PARAMETERS",
         "yara_max_match_data": "Sets the maximum amount of text that can extracted from a YARA regex",
         "custom_yara_rules": "Include custom Yara rules",
+        "speculate_params": "Enable speculative parameter extraction from JSON and XML content",
     }
     scope_distance_modifier = None
     accept_dupes = False
 
     _module_threads = 8
 
-    parameter_blacklist = {
-        p.lower()
-        for p in [
-            "__VIEWSTATE",
-            "__EVENTARGUMENT",
-            "__EVENTVALIDATION",
-            "__EVENTTARGET",
-            "__EVENTARGUMENT",
-            "__VIEWSTATEGENERATOR",
-            "__SCROLLPOSITIONY",
-            "__SCROLLPOSITIONX",
-            "ASP.NET_SessionId",
-            "JSESSIONID",
-            "PHPSESSID",
-        ]
-    }
-
     yara_rule_name_regex = re.compile(r"rule\s(\w+)\s{")
     yara_rule_regex = re.compile(r"(?s)((?:rule\s+\w+\s*{[^{}]*(?:{[^{}]*}[^{}]*)*[^{}]*(?:/\S*?}[^/]*?/)*)*})")
 
     def in_bl(self, value):
-        return value.lower() in self.parameter_blacklist
+        # Check if the value is in the blacklist or starts with a blacklisted prefix.
+        lower_value = value.lower()
+
+        if lower_value in self.parameter_blacklist:
+            return True
+
+        for bl_param_prefix in self.parameter_blacklist_prefixes:
+            if lower_value.startswith(bl_param_prefix.lower()):
+                return True
+
+        return False
 
     def url_unparse(self, param_type, parsed_url):
+        # Reconstructs a URL, optionally omitting the query string based on remove_querystring configuration value.
         if param_type == "GETPARAM":
             querystring = ""
         else:
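Note that the hardcoded blacklist set is replaced by scan-level configuration (populated in the setup() change later in this diff), and in_bl() gains prefix matching. A minimal standalone sketch of the new check, with sample values standing in for the configured sets:

parameter_blacklist = {p.lower() for p in ["JSESSIONID", "PHPSESSID", "ASP.NET_SessionId"]}
parameter_blacklist_prefixes = {"__VIEWSTATE", "__EVENT", "__SCROLLPOSITION"}

def in_bl(value):
    lower_value = value.lower()
    if lower_value in parameter_blacklist:
        return True
    # one prefix entry now covers __VIEWSTATE, __VIEWSTATEGENERATOR, etc.
    return any(lower_value.startswith(prefix.lower()) for prefix in parameter_blacklist_prefixes)

assert in_bl("PHPSESSID")
assert in_bl("__VIEWSTATEGENERATOR")
assert not in_bl("user_id")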
@@ -352,7 +357,7 @@ class excavate(BaseInternalModule, BaseInterceptModule):
                 parsed_url.netloc,
                 parsed_url.path,
                 "",
-                querystring if self.retain_querystring else "",
+                "" if self.remove_querystring else querystring,
                 "",
             )
         )
@@ -364,7 +369,7 @@ class excavate(BaseInternalModule, BaseInterceptModule):
         class ParameterExtractorRule:
             name = ""
 
-            def extract(self):
+            async def extract(self):
                 pass
 
             def __init__(self, excavate, result):
@@ -377,29 +382,32 @@ class excavate(BaseInternalModule, BaseInterceptModule):
            extraction_regex = re.compile(r"\$.get\([\'\"](.+)[\'\"].+(\{.+\})\)")
            output_type = "GETPARAM"
 
-            def convert_to_dict(self, extracted_str):
-                extracted_str = extracted_str.replace("'", '"')
-                extracted_str = re.sub(r"(\w+):", r'"\1":', extracted_str)
-                try:
-                    return json.loads(extracted_str)
-                except json.JSONDecodeError as e:
-                    self.excavate.debug(f"Failed to decode JSON: {e}")
-                    return None
-
-            def extract(self):
-                extracted_results = self.extraction_regex.findall(str(self.result))
+            async def extract(self):
+                extracted_results = await self.excavate.helpers.re.findall(self.extraction_regex, str(self.result))
                 if extracted_results:
                     for action, extracted_parameters in extracted_results:
-                        extracted_parameters_dict = self.convert_to_dict(extracted_parameters)
+                        extracted_parameters_dict = await self.convert_to_dict(extracted_parameters)
                         for parameter_name, original_value in extracted_parameters_dict.items():
                             yield (
                                 self.output_type,
                                 parameter_name,
-                                original_value,
+                                original_value.strip(),
                                 action,
                                 _exclude_key(extracted_parameters_dict, parameter_name),
                             )
 
+            async def convert_to_dict(self, extracted_str):
+                extracted_str = extracted_str.replace("'", '"')
+                extracted_str = await self.excavate.helpers.re.sub(
+                    re.compile(r"(\w+):"), r'"\1":', extracted_str
+                )  # Quote keys
+
+                try:
+                    return json.loads(extracted_str)
+                except json.JSONDecodeError as e:
+                    self.excavate.debug(f"Failed to decode JSON: {e}")
+                    return None
+
         class PostJquery(GetJquery):
             name = "POST jquery"
             discovery_regex = r"/\$.post\([^\)].+\)/ nocase"
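convert_to_dict() turns a JavaScript object literal into parseable JSON: single quotes become double quotes, then bare keys get quoted. A synchronous sketch of the same transformation (the packaged version routes the regex through the async helpers):

import json
import re

def convert_to_dict(extracted_str):
    extracted_str = extracted_str.replace("'", '"')             # 'x' -> "x"
    extracted_str = re.sub(r"(\w+):", r'"\1":', extracted_str)  # id: -> "id":
    try:
        return json.loads(extracted_str)
    except json.JSONDecodeError:
        return None

print(convert_to_dict("{id: '5', sort: 'asc'}"))  # {'id': '5', 'sort': 'asc'}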
@@ -408,56 +416,136 @@ class excavate(BaseInternalModule, BaseInterceptModule):
 
         class HtmlTags(ParameterExtractorRule):
             name = "HTML Tags"
-            discovery_regex = r'/<[^>]+(href|src)=["\'][^"\']*["\'][^>]*>/ nocase'
+            discovery_regex = r'/<[^>]+(href|src|action)=["\']?[^"\'>\s]*["\']?[^>]*>/ nocase'
             extraction_regex = bbot_regexes.tag_attribute_regex
             output_type = "GETPARAM"
 
-            def extract(self):
-                urls = self.extraction_regex.findall(str(self.result))
+            async def extract(self):
+                urls = await self.excavate.helpers.re.findall(self.extraction_regex, str(self.result))
                 for url in urls:
                     parsed_url = urlparse(url)
-                    query_strings = parse_qs(parsed_url.query)
-                    query_strings_dict = {
-                        k: v[0] if isinstance(v, list) and len(v) == 1 else v for k, v in query_strings.items()
-                    }
+                    query_strings = parse_qs(html.unescape(parsed_url.query))
+                    query_strings_dict = {k: v[0] if isinstance(v, list) else v for k, v in query_strings.items()}
                     for parameter_name, original_value in query_strings_dict.items():
                         yield (
                             self.output_type,
                             parameter_name,
-                            original_value,
+                            original_value.strip(),
                             url,
                             _exclude_key(query_strings_dict, parameter_name),
                         )
 
+        class AjaxJquery(ParameterExtractorRule):
+            name = "JQuery Extractor"
+            discovery_regex = r"/\$\.ajax\(\{[^\<$\$]*\}\)/s nocase"
+            extraction_regex = None
+            output_type = "BODYJSON"
+            ajax_content_regexes = {
+                "url": re.compile(r"url\s*:\s*['\"](.*?)['\"]"),
+                "type": re.compile(r"type\s*:\s*['\"](.*?)['\"]"),
+                "content_type": re.compile(r"contentType\s*:\s*['\"](.*?)['\"]"),
+                "data": re.compile(r"data:.*(\{[^}]*\})"),
+            }
+
+            async def extract(self):
+                # Iterate through each regex in ajax_content_regexes
+                extracted_values = {}
+                for key, pattern in self.ajax_content_regexes.items():
+                    match = await self.excavate.helpers.re.search(pattern, self.result)
+                    if match:
+                        # Store the matched value in the dictionary
+                        extracted_values[key] = match.group(1)
+
+                # Check to see if the format is defined as JSON
+                if (
+                    "content_type" in extracted_values.keys()
+                    and extracted_values["content_type"] == "application/json"
+                ):
+                    form_parameters = {}
+
+                    # If we can't figure out the parameter names, there is no point in continuing
+                    if "data" in extracted_values.keys():
+                        form_url = extracted_values.get("url", None)
+
+                        try:
+                            s = extracted_values["data"]
+                            s = await self.excavate.helpers.re.sub(re.compile(r"(\w+)\s*:"), r'"\1":', s)  # Quote keys
+                            s = await self.excavate.helpers.re.sub(
+                                re.compile(r":\s*(\w+)"), r': "\1"', s
+                            )  # Quote values if they are unquoted
+                            data = json.loads(s)
+                        except (ValueError, SyntaxError):
+                            data = None
+
+                        if data:
+                            for p in data.keys():
+                                form_parameters[p] = None
+
+                        for parameter_name in form_parameters:
+                            yield (
+                                "BODYJSON",
+                                parameter_name,
+                                None,
+                                form_url,
+                                _exclude_key(form_parameters, parameter_name),
+                            )
+
         class GetForm(ParameterExtractorRule):
             name = "GET Form"
             discovery_regex = r'/<form[^>]*\bmethod=["\']?get["\']?[^>]*>.*<\/form>/s nocase'
-            form_content_regexes = [
-                bbot_regexes.input_tag_regex,
-                bbot_regexes.select_tag_regex,
-                bbot_regexes.textarea_tag_regex,
-            ]
+            form_content_regexes = {
+                "input_tag_regex": bbot_regexes.input_tag_regex,
+                "input_tag_regex2": bbot_regexes.input_tag_regex2,
+                "select_tag_regex": bbot_regexes.select_tag_regex,
+                "textarea_tag_regex": bbot_regexes.textarea_tag_regex,
+                "textarea_tag_regex2": bbot_regexes.textarea_tag_regex2,
+                "textarea_tag_novalue_regex": bbot_regexes.textarea_tag_novalue_regex,
+                "button_tag_regex": bbot_regexes.button_tag_regex,
+                "button_tag_regex2": bbot_regexes.button_tag_regex2,
+                "_input_tag_novalue_regex": bbot_regexes.input_tag_novalue_regex,
+            }
             extraction_regex = bbot_regexes.get_form_regex
             output_type = "GETPARAM"
 
-            def extract(self):
-                forms = self.extraction_regex.findall(str(self.result))
+            async def extract(self):
+                forms = await self.excavate.helpers.re.findall(self.extraction_regex, str(self.result))
                 for form_action, form_content in forms:
-                    form_parameters = {}
-                    for form_content_regex in self.form_content_regexes:
-                        input_tags = form_content_regex.findall(form_content)
+                    if not form_action or form_action == "#":
+                        form_action = None
 
-                        for parameter_name, original_value in input_tags:
-                            form_parameters[parameter_name] = original_value
+                    elif form_action.startswith("./"):
+                        form_action = form_action.lstrip(".")
 
-                    for parameter_name, original_value in form_parameters.items():
-                        yield (
-                            self.output_type,
-                            parameter_name,
-                            original_value,
-                            form_action,
-                            _exclude_key(form_parameters, parameter_name),
-                        )
+                    form_parameters = {}
+                    for form_content_regex_name, form_content_regex in self.form_content_regexes.items():
+                        input_tags = await self.excavate.helpers.re.findall(form_content_regex, form_content)
+                        if input_tags:
+                            # Normalize each input_tag to be a tuple of two elements
+                            input_tags = [(tag if isinstance(tag, tuple) else (tag, None)) for tag in input_tags]
+
+                            if form_content_regex_name in [
+                                "input_tag_regex2",
+                                "button_tag_regex2",
+                                "textarea_tag_regex2",
+                            ]:
+                                # Swap elements if needed
+                                input_tags = [(b, a) for a, b in input_tags]
+                            for parameter_name, original_value in input_tags:
+                                form_parameters.setdefault(
+                                    parameter_name, original_value.strip() if original_value else None
+                                )
+
+                    for parameter_name, original_value in form_parameters.items():
+                        yield (
+                            self.output_type,
+                            parameter_name,
+                            original_value,
+                            form_action,
+                            _exclude_key(form_parameters, parameter_name),
+                        )
+
+        class GetForm2(GetForm):
+            extraction_regex = bbot_regexes.get_form_regex2
 
         class PostForm(GetForm):
             name = "POST Form"
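Because form_content_regexes is now a named dict, extract() can tell which capture-group order each pattern produced; the "2" variants capture (value, name) and get swapped before the setdefault() dedupe. A self-contained sketch using hypothetical patterns in place of the bbot_regexes entries:

import re

# Hypothetical stand-ins for bbot_regexes.input_tag_regex and input_tag_regex2:
# the first captures (name, value), the "2" variant captures (value, name).
input_tag_regex = re.compile(r"<input[^>]+name=[\"']([^\"']+)[\"'][^>]+value=[\"']([^\"']*)[\"']")
input_tag_regex2 = re.compile(r"<input[^>]+value=[\"']([^\"']*)[\"'][^>]+name=[\"']([^\"']+)[\"']")

form_content = '<input name="user" value="admin"><input value="1" name="debug">'

form_parameters = {}
for regex_name, regex in [("input_tag_regex", input_tag_regex), ("input_tag_regex2", input_tag_regex2)]:
    input_tags = regex.findall(form_content)
    # Normalize single-capture matches into (name, None) pairs
    input_tags = [(tag if isinstance(tag, tuple) else (tag, None)) for tag in input_tags]
    if regex_name.endswith("2"):
        # This variant captured (value, name), so swap
        input_tags = [(b, a) for a, b in input_tags]
    for parameter_name, original_value in input_tags:
        form_parameters.setdefault(parameter_name, original_value.strip() if original_value else None)

print(form_parameters)  # {'user': 'admin', 'debug': '1'}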
@@ -465,6 +553,21 @@ class excavate(BaseInternalModule, BaseInterceptModule):
             extraction_regex = bbot_regexes.post_form_regex
             output_type = "POSTPARAM"
 
+        class PostForm2(PostForm):
+            extraction_regex = bbot_regexes.post_form_regex2
+
+        class PostForm_NoAction(PostForm):
+            name = "POST Form (no action)"
+            extraction_regex = bbot_regexes.post_form_regex_noaction
+
+        # underscore ensure generic forms runs last, so it doesn't cause dedupe to stop full form detection
+        class _GenericForm(GetForm):
+            name = "Generic Form"
+            discovery_regex = r"/<form[^>]*>.*<\/form>/s nocase"
+
+            extraction_regex = bbot_regexes.generic_form_regex
+            output_type = "GETPARAM"
+
         def __init__(self, excavate):
             super().__init__(excavate)
             self.parameterExtractorCallbackDict = {}
@@ -476,7 +579,7 @@ class excavate(BaseInternalModule, BaseInterceptModule):
                 regexes_component_list.append(f"${r.__name__} = {r.discovery_regex}")
             regexes_component = " ".join(regexes_component_list)
             self.yara_rules["parameter_extraction"] = (
-                rf'rule parameter_extraction {{meta: description = "contains POST form" strings: {regexes_component} condition: any of them}}'
+                rf'rule parameter_extraction {{meta: description = "contains Parameter" strings: {regexes_component} condition: any of them}}'
             )
 
         async def process(self, yara_results, event, yara_rule_settings, discovery_context):
@@ -487,51 +590,64 @@ class excavate(BaseInternalModule, BaseInterceptModule):
                 parameterExtractorSubModule = self.parameterExtractorCallbackDict[identifier](
                     self.excavate, result
                 )
-                extracted_params = parameterExtractorSubModule.extract()
-                if extracted_params:
-                    for (
-                        parameter_type,
-                        parameter_name,
-                        original_value,
-                        endpoint,
-                        additional_params,
-                    ) in extracted_params:
-                        self.excavate.debug(
-                            f"Found Parameter [{parameter_name}] in [{parameterExtractorSubModule.name}] ParameterExtractor Submodule"
-                        )
-                        endpoint = event.data["url"] if not endpoint else endpoint
-                        url = (
-                            endpoint
-                            if endpoint.startswith(("http://", "https://"))
-                            else f"{event.parsed_url.scheme}://{event.parsed_url.netloc}{endpoint}"
-                        )
 
-                        if self.excavate.helpers.validate_parameter(parameter_name, parameter_type):
-                            if self.excavate.in_bl(parameter_name) is False:
-                                parsed_url = urlparse(url)
-                                if not parsed_url.hostname:
-                                    self.excavate.warning(
-                                        f"Error Parsing reconstructed URL [{url}] during parameter extraction, missing hostname"
-                                    )
-                                    continue
-                                description = f"HTTP Extracted Parameter [{parameter_name}] ({parameterExtractorSubModule.name} Submodule)"
-                                data = {
-                                    "host": parsed_url.hostname,
-                                    "type": parameter_type,
-                                    "name": parameter_name,
-                                    "original_value": original_value,
-                                    "url": self.excavate.url_unparse(parameter_type, parsed_url),
-                                    "additional_params": additional_params,
-                                    "assigned_cookies": self.excavate.assigned_cookies,
-                                    "description": description,
-                                }
-                                await self.report(
-                                    data, event, yara_rule_settings, discovery_context, event_type="WEB_PARAMETER"
+                # Use async for to iterate over the async generator
+                async for (
+                    parameter_type,
+                    parameter_name,
+                    original_value,
+                    endpoint,
+                    additional_params,
+                ) in parameterExtractorSubModule.extract():
+                    self.excavate.debug(
+                        f"Found Parameter [{parameter_name}] in [{parameterExtractorSubModule.name}] ParameterExtractor Submodule"
+                    )
+
+                    # account for the case where the action is html encoded
+                    if endpoint and (
+                        endpoint.startswith("https&#x3a;&#x2f;&#x2f;")
+                        or endpoint.startswith("http&#x3a;&#x2f;&#x2f;")
+                    ):
+                        endpoint = html.unescape(endpoint)
+
+                    # If we have a full URL, leave it as-is
+                    if endpoint and endpoint.startswith(("http://", "https://")):
+                        url = endpoint
+
+                    # The endpoint is usually a form action - we should use it if we have it. If not, default to URL.
+                    else:
+                        # Use the original URL as the base and resolve the endpoint correctly in case of relative paths
+                        base_url = f"{event.parsed_url.scheme}://{event.parsed_url.netloc}{event.parsed_url.path}"
+                        if not self.excavate.remove_querystring and len(event.parsed_url.query) > 0:
+                            base_url += f"?{event.parsed_url.query}"
+                        url = urljoin(base_url, endpoint)
+
+                    if self.excavate.helpers.validate_parameter(parameter_name, parameter_type):
+                        if self.excavate.in_bl(parameter_name) is False:
+                            parsed_url = urlparse(url)
+                            if not parsed_url.hostname:
+                                self.excavate.warning(
+                                    f"Error Parsing reconstructed URL [{url}] during parameter extraction, missing hostname"
                                 )
-                            else:
-                                self.excavate.debug(f"blocked parameter [{parameter_name}] due to BL match")
+                                continue
+                            description = f"HTTP Extracted Parameter [{parameter_name}] ({parameterExtractorSubModule.name} Submodule)"
+                            data = {
+                                "host": parsed_url.hostname,
+                                "type": parameter_type,
+                                "name": parameter_name,
+                                "original_value": original_value,
+                                "url": self.excavate.url_unparse(parameter_type, parsed_url),
+                                "additional_params": additional_params,
+                                "assigned_cookies": self.excavate.assigned_cookies,
+                                "description": description,
+                            }
+                            await self.report(
+                                data, event, yara_rule_settings, discovery_context, event_type="WEB_PARAMETER"
+                            )
                         else:
-                            self.excavate.debug(f"blocked parameter [{parameter_name}] due to validation failure")
+                            self.excavate.debug(f"blocked parameter [{parameter_name}] due to BL match")
+                    else:
+                        self.excavate.debug(f"blocked parameter [{parameter_name}] due to validation failure")
 
     class CSPExtractor(ExcavateRule):
         description = "Extracts domains from CSP headers."
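Endpoint resolution above now goes through urllib's urljoin() against the page URL rather than naive scheme/netloc concatenation, with HTML-encoded absolute actions unescaped first. The standard-library behavior this relies on:

import html
from urllib.parse import urljoin

base_url = "https://example.com/app/login.php"  # the page that contained the form

# urljoin() resolves the action styles the extractor can now encounter
assert urljoin(base_url, "submit.php") == "https://example.com/app/submit.php"
assert urljoin(base_url, "/api/submit") == "https://example.com/api/submit"
assert urljoin(base_url, "../reset") == "https://example.com/reset"
assert urljoin(base_url, "https://other.example/cb") == "https://other.example/cb"

# HTML-encoded absolute actions are unescaped before the startswith() check
assert html.unescape("https&#x3a;&#x2f;&#x2f;example.com&#x2f;submit") == "https://example.com/submit"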
@@ -609,12 +725,13 @@ class excavate(BaseInternalModule, BaseInterceptModule):
     class SerializationExtractor(ExcavateRule):
         description = "Identifies serialized objects from various platforms."
         regexes = {
-            "Java": re.compile(r"[^a-zA-Z0-9\/+]rO0[a-zA-Z0-9+\/]+={0,2}"),
-            "DOTNET": re.compile(r"[^a-zA-Z0-9\/+]AAEAAAD\/\/[a-zA-Z0-9\/+]+={0,2}"),
-            "PHP_Array": re.compile(r"[^a-zA-Z0-9\/+]YTo[xyz0123456][a-zA-Z0-9+\/]+={0,2}"),
-            "PHP_String": re.compile(r"[^a-zA-Z0-9\/+]czo[xyz0123456][a-zA-Z0-9+\/]+={0,2}"),
-            "PHP_Object": re.compile(r"[^a-zA-Z0-9\/+]Tzo[xyz0123456][a-zA-Z0-9+\/]+={0,2}"),
-            "Possible_Compressed": re.compile(r"[^a-zA-Z0-9\/+]H4sIAAAAAAAA[a-zA-Z0-9+\/]+={0,2}"),
+            "Java": re.compile(r"[^a-zA-Z0-9\/+][\"']?rO0[a-zA-Z0-9+\/]+={0,2}"),
+            "Ruby": re.compile(r"[^a-zA-Z0-9\/+][\"']?BAh[a-zA-Z0-9+\/]+={0,2}"),
+            "DOTNET": re.compile(r"[^a-zA-Z0-9\/+][\"']?AAEAAAD\/\/[a-zA-Z0-9\/+]+={0,2}"),
+            "PHP_Array": re.compile(r"[^a-zA-Z0-9\/+][\"']?YTo[xyz0123456][a-zA-Z0-9+\/]+={0,2}"),
+            "PHP_String": re.compile(r"[^a-zA-Z0-9\/+][\"']?czo[xyz0123456][a-zA-Z0-9+\/]+={0,2}"),
+            "PHP_Object": re.compile(r"[^a-zA-Z0-9\/+][\"']?Tzo[xyz0123456][a-zA-Z0-9+\/]+={0,2}"),
+            "Possible_Compressed": re.compile(r"[^a-zA-Z0-9\/+][\"']?H4sIAAAA[a-zA-Z0-9+\/]+={0,2}"),
         }
         yara_rules = {}
 
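The added ["']? appears to matter because the assembled YARA rules (see the __init__ change just below) prefix each pattern with \b: a quote sitting directly before the base64 marker can prevent the word boundary from landing, hiding payloads embedded in quoted strings. A quick demonstration with the Java pattern (sample payload string is illustrative, not from the package):

import re

# Old vs. new Java pattern, each with the \b prefix the YARA rule adds
old = re.compile(r"\b[^a-zA-Z0-9\/+]rO0[a-zA-Z0-9+\/]+={0,2}")
new = re.compile(r"\b[^a-zA-Z0-9\/+][\"']?rO0[a-zA-Z0-9+\/]+={0,2}")

body = 'var payload="rO0ABXNyABFqYXZhLnV0aWwuSGFzaE1hcA==";'
print(bool(old.search(body)))  # False -- the quote breaks the match
print(bool(new.search(body)))  # True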
@@ -622,7 +739,7 @@ class excavate(BaseInternalModule, BaseInterceptModule):
            super().__init__(excavate)
            regexes_component_list = []
            for regex_name, regex in self.regexes.items():
-                regexes_component_list.append(rf"${regex_name} = /\b{regex.pattern}/ nocase")
+                regexes_component_list.append(rf"${regex_name} = /\b{regex.pattern}/")
            regexes_component = " ".join(regexes_component_list)
            self.yara_rules["serialization_detection"] = (
                f'rule serialization_detection {{meta: description = "contains a possible serialized object" strings: {regexes_component} condition: any of them}}'
@@ -715,7 +832,7 @@ class excavate(BaseInternalModule, BaseInterceptModule):
                tags = "spider-danger"
                description = "contains tag with src or href attribute"
            strings:
-                $url_attr = /<[^>]+(href|src)=["\'][^"\']*["\'][^>]*>/
+                $url_attr = /<[^>]+(href|src|action)=["\']?[^"\']*["\']?[^>]*>/
            condition:
                $url_attr
            }
@@ -762,7 +879,6 @@ class excavate(BaseInternalModule, BaseInterceptModule):
                if final_url:
                    if self.excavate.scan.in_scope(final_url):
                        urls_found += 1
-
                        await self.report(
                            final_url,
                            event,
@@ -828,6 +944,36 @@ class excavate(BaseInternalModule, BaseInterceptModule):
        for r in await self.helpers.re.findall(self.yara_rule_regex, rules_content):
            yield r
 
+    async def emit_web_parameter(
+        self, host, param_type, name, original_value, url, description, additional_params, event, context
+    ):
+        data = {
+            "host": host,
+            "type": param_type,
+            "name": name,
+            "original_value": original_value,
+            "url": url,
+            "description": description,
+            "additional_params": additional_params,
+        }
+        await self.emit_event(data, "WEB_PARAMETER", event, context=context)
+
+    async def emit_custom_parameters(self, event, config_key, param_type, description_suffix):
+        # Emits WEB_PARAMETER events for custom headers and cookies from the configuration.
+        custom_params = self.scan.web_config.get(config_key, {})
+        for param_name, param_value in custom_params.items():
+            await self.emit_web_parameter(
+                host=event.parsed_url.hostname,
+                param_type=param_type,
+                name=param_name,
+                original_value=param_value,
+                url=self.url_unparse(param_type, event.parsed_url),
+                description=f"HTTP Extracted Parameter [{param_name}] ({description_suffix})",
+                additional_params=_exclude_key(custom_params, param_name),
+                event=event,
+                context=f"Excavate saw a custom {param_type.lower()} set [{param_name}], and emitted a WEB_PARAMETER for it",
+            )
+
    async def setup(self):
        self.yara_rules_dict = {}
        self.yara_preprocess_dict = {}
@@ -839,10 +985,8 @@ class excavate(BaseInternalModule, BaseInterceptModule):
        ]
 
        self.parameter_extraction = bool(modules_WEB_PARAMETER)
-
-        self.retain_querystring = False
-        if self.config.get("retain_querystring", False) is True:
-            self.retain_querystring = True
+        self.speculate_params = bool(self.config.get("speculate_params", False))
+        self.remove_querystring = self.scan.config.get("url_querystring_remove", True)
 
        for module in self.scan.modules.values():
            if not str(module).startswith("_"):
@@ -863,6 +1007,9 @@ class excavate(BaseInternalModule, BaseInterceptModule):
            for rule_name, rule_content in excavateRule.yara_rules.items():
                self.add_yara_rule(rule_name, rule_content, excavateRule)
 
+        self.parameter_blacklist = set(p.lower() for p in self.scan.config.get("parameter_blacklist", []))
+        self.parameter_blacklist_prefixes = set(self.scan.config.get("parameter_blacklist_prefixes", []))
+
        self.custom_yara_rules = str(self.config.get("custom_yara_rules", ""))
        if self.custom_yara_rules:
            custom_rules_count = 0
@@ -919,10 +1066,9 @@ class excavate(BaseInternalModule, BaseInterceptModule):
    async def search(self, data, event, content_type, discovery_context="HTTP response"):
        if not data:
            return None
-
        decoded_data = await self.helpers.re.recursive_decode(data)
 
-        if self.parameter_extraction:
+        if self.parameter_extraction and self.speculate_params:
            content_type_lower = content_type.lower() if content_type else ""
            extraction_map = {
                "json": self.helpers.extract_params_json,
@@ -934,62 +1080,74 @@ class excavate(BaseInternalModule, BaseInterceptModule):
                results = extract_func(data)
                if results:
                    for parameter_name, original_value in results:
-                        description = (
-                            f"HTTP Extracted Parameter (speculative from {source_type} content) [{parameter_name}]"
+                        await self.emit_web_parameter(
+                            host=str(event.host),
+                            param_type="SPECULATIVE",
+                            name=parameter_name,
+                            original_value=original_value,
+                            url=str(event.data["url"]),
+                            description=f"HTTP Extracted Parameter (speculative from {source_type} content) [{parameter_name}]",
+                            additional_params={},
+                            event=event,
+                            context=f"excavate's Parameter extractor found a speculative WEB_PARAMETER: {parameter_name} by parsing {source_type} data from {str(event.host)}",
                        )
-                        data = {
-                            "host": str(event.host),
-                            "type": "SPECULATIVE",
-                            "name": parameter_name,
-                            "original_value": original_value,
-                            "url": str(event.data["url"]),
-                            "additional_params": {},
-                            "assigned_cookies": self.assigned_cookies,
-                            "description": description,
-                        }
-                        context = f"excavate's Parameter extractor found a speculative WEB_PARAMETER: {parameter_name} by parsing {source_type} data from {str(event.host)}"
-                        await self.emit_event(data, "WEB_PARAMETER", event, context=context)
            return
 
-        for result in self.yara_rules.match(data=f"{data}\n{decoded_data}"):
-            rule_name = result.rule
-            if rule_name in self.yara_preprocess_dict:
-                await self.yara_preprocess_dict[rule_name](result, event, discovery_context)
-            else:
-                self.hugewarning(f"YARA Rule {rule_name} not found in pre-compiled rules")
+        # Initialize the list of data items to process
+        data_items = []
+
+        # Check if data and decoded_data are identical
+        if data == decoded_data:
+            data_items.append(("data", data))  # Add only one since both are the same
+        else:
+            data_items.append(("data", data))
+            data_items.append(("decoded_data", decoded_data))
+
+        for label, data_instance in data_items:
+            # Your existing processing code
+            for result in self.yara_rules.match(data=f"{data_instance}"):
+                rule_name = result.rule
+
+                # Skip specific operations for 'parameter_extraction' rule on decoded_data
+                if label == "decoded_data" and rule_name == "parameter_extraction":
+                    continue
+
+                # Check if rule processing function exists
+                if rule_name in self.yara_preprocess_dict:
+                    await self.yara_preprocess_dict[rule_name](result, event, discovery_context)
+                else:
+                    self.hugewarning(f"YARA Rule {rule_name} not found in pre-compiled rules")
 
    async def handle_event(self, event, **kwargs):
        if event.type == "HTTP_RESPONSE":
-            # Harvest GET parameters from URL, if it came directly from the target, and parameter extraction is enabled
-            if (
-                self.parameter_extraction is True
-                and self.url_querystring_remove is False
-                and str(event.parent.parent.module) == "TARGET"
-            ):
-                self.debug(f"Processing target URL [{urlunparse(event.parsed_url)}] for GET parameters")
-                for (
-                    method,
-                    parsed_url,
-                    parameter_name,
-                    original_value,
-                    regex_name,
-                    additional_params,
-                ) in extract_params_url(event.parsed_url):
-                    if self.in_bl(parameter_name) is False:
-                        description = f"HTTP Extracted Parameter [{parameter_name}] (Target URL)"
-                        data = {
-                            "host": parsed_url.hostname,
-                            "type": "GETPARAM",
-                            "name": parameter_name,
-                            "original_value": original_value,
-                            "url": self.url_unparse("GETPARAM", parsed_url),
-                            "description": description,
-                            "additional_params": additional_params,
-                        }
-                        context = f"Excavate parsed a URL directly from the scan target for parameters and found [GETPARAM] Parameter Name: [{parameter_name}] and emitted a WEB_PARAMETER for it"
-                        await self.emit_event(data, "WEB_PARAMETER", event, context=context)
-
-            data = event.data
+            if self.parameter_extraction is True:
+                # if parameter extraction is enabled, and we have custom cookies or headers, emit them as WEB_PARAMETER events
+                await self.emit_custom_parameters(event, "http_cookies", "COOKIE", "Custom Cookie")
+                await self.emit_custom_parameters(event, "http_headers", "HEADER", "Custom Header")
+
+                # if parameter extraction is enabled, and querystring removal is disabled, and the event is directly from the TARGET, create a WEB
+                if self.url_querystring_remove is False and str(event.parent.parent.module) == "TARGET":
+                    self.debug(f"Processing target URL [{urlunparse(event.parsed_url)}] for GET parameters")
+                    for (
+                        method,
+                        parsed_url,
+                        parameter_name,
+                        original_value,
+                        regex_name,
+                        additional_params,
+                    ) in extract_params_url(event.parsed_url):
+                        if self.in_bl(parameter_name) is False:
+                            await self.emit_web_parameter(
+                                host=parsed_url.hostname,
+                                param_type="GETPARAM",
+                                name=parameter_name,
+                                original_value=original_value,
+                                url=self.url_unparse("GETPARAM", parsed_url),
+                                description=f"HTTP Extracted Parameter [{parameter_name}] (Target URL)",
+                                additional_params=additional_params,
+                                event=event,
+                                context=f"Excavate parsed a URL directly from the scan target for parameters and found [GETPARAM] Parameter Name: [{parameter_name}] and emitted a WEB_PARAMETER for it",
+                            )
 
        # process response data
        body = event.data.get("body", "")
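The rewritten matching loop avoids redundant work: the decoded copy is scanned only when recursive decoding actually changed the data, and the parameter_extraction rule is skipped on that copy so the same parameter is not reported twice. A toy model of the control flow (rule names other than parameter_extraction are hypothetical):

def plan_scans(data, decoded_data):
    # Scan the raw data always; scan the decoded copy only if decoding changed it,
    # and skip parameter_extraction on that copy to avoid duplicate WEB_PARAMETERs.
    data_items = [("data", data)]
    if decoded_data != data:
        data_items.append(("decoded_data", decoded_data))
    for label, data_instance in data_items:
        for rule_name in ("parameter_extraction", "other_rule"):  # hypothetical rule set
            if label == "decoded_data" and rule_name == "parameter_extraction":
                continue
            yield label, rule_name

print(list(plan_scans("abc", "abc")))
# [('data', 'parameter_extraction'), ('data', 'other_rule')]
print(list(plan_scans("a%62c", "abc")))
# adds ('decoded_data', 'other_rule') but not a second parameter_extraction pass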
@@ -1003,29 +1161,31 @@ class excavate(BaseInternalModule, BaseInterceptModule):
 
            for header, header_values in headers.items():
                for header_value in header_values:
+                    # Process 'set-cookie' headers to extract and emit cookies as WEB_PARAMETER events.
                    if header.lower() == "set-cookie" and self.parameter_extraction:
                        if "=" not in header_value:
                            self.debug(f"Cookie found without '=': {header_value}")
                            continue
                        else:
-                            cookie_name = header_value.split("=")[0]
-                            cookie_value = header_value.split("=")[1].split(";")[0]
+                            cookie_name, _, remainder = header_value.partition("=")
+                            cookie_value = remainder.split(";")[0]
 
-                            if self.in_bl(cookie_value) is False:
+                            if self.in_bl(cookie_name) is False:
                                self.assigned_cookies[cookie_name] = cookie_value
-                                description = f"Set-Cookie Assigned Cookie [{cookie_name}]"
-                                data = {
-                                    "host": str(event.host),
-                                    "type": "COOKIE",
-                                    "name": cookie_name,
-                                    "original_value": cookie_value,
-                                    "url": self.url_unparse("COOKIE", event.parsed_url),
-                                    "description": description,
-                                }
-                                context = f"Excavate noticed a set-cookie header for cookie [{cookie_name}] and emitted a WEB_PARAMETER for it"
-                                await self.emit_event(data, "WEB_PARAMETER", event, context=context)
+                                await self.emit_web_parameter(
+                                    host=str(event.host),
+                                    param_type="COOKIE",
+                                    name=cookie_name,
+                                    original_value=cookie_value,
+                                    url=self.url_unparse("COOKIE", event.parsed_url),
+                                    description=f"Set-Cookie Assigned Cookie [{cookie_name}]",
+                                    additional_params={},
+                                    event=event,
+                                    context=f"Excavate noticed a set-cookie header for cookie [{cookie_name}] and emitted a WEB_PARAMETER for it",
+                                )
                            else:
                                self.debug(f"blocked cookie parameter [{cookie_name}] due to BL match")
+                    # Handle 'location' headers to process and emit redirect URLs as URL_UNVERIFIED events.
                    if header.lower() == "location":
                        redirect_location = getattr(event, "redirect_location", "")
                        if redirect_location:
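Two fixes ride along with the emit_web_parameter() refactor here: the blacklist check now runs against the cookie name rather than its value, and partition() replaces split() so cookie values that themselves contain '=' are no longer truncated:

header_value = "token=abc=def; Path=/; HttpOnly"

# old: split("=") truncates values that themselves contain '='
cookie_value_old = header_value.split("=")[1].split(";")[0]      # 'abc'

# new: partition splits on the first '=' only
cookie_name, _, remainder = header_value.partition("=")
cookie_value = remainder.split(";")[0]                           # 'abc=def'

print(cookie_name, cookie_value_old, cookie_value)  # token abc abc=def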
@@ -1056,18 +1216,17 @@ class excavate(BaseInternalModule, BaseInterceptModule):
                                additional_params,
                            ) in extract_params_location(header_value, event.parsed_url):
                                if self.in_bl(parameter_name) is False:
-                                    description = f"HTTP Extracted Parameter [{parameter_name}] (Location Header)"
-                                    data = {
-                                        "host": parsed_url.hostname,
-                                        "type": "GETPARAM",
-                                        "name": parameter_name,
-                                        "original_value": original_value,
-                                        "url": self.url_unparse("GETPARAM", parsed_url),
-                                        "description": description,
-                                        "additional_params": additional_params,
-                                    }
-                                    context = f"Excavate parsed a location header for parameters and found [GETPARAM] Parameter Name: [{parameter_name}] and emitted a WEB_PARAMETER for it"
-                                    await self.emit_event(data, "WEB_PARAMETER", event, context=context)
+                                    await self.emit_web_parameter(
+                                        host=parsed_url.hostname,
+                                        param_type="GETPARAM",
+                                        name=parameter_name,
+                                        original_value=original_value,
+                                        url=self.url_unparse("GETPARAM", parsed_url),
+                                        description=f"HTTP Extracted Parameter [{parameter_name}] (Location Header)",
+                                        additional_params=additional_params,
+                                        event=event,
+                                        context=f"Excavate parsed a location header for parameters and found [GETPARAM] Parameter Name: [{parameter_name}] and emitted a WEB_PARAMETER for it",
+                                    )
                        else:
                            self.warning("location header found but missing redirect_location in HTTP_RESPONSE")
                        if header.lower() == "content-type":