bbot 2.4.2.6109rc0__py3-none-any.whl → 2.4.2.6596rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bbot might be problematic. Click here for more details.

Files changed (67)
  1. bbot/__init__.py +1 -1
  2. bbot/core/event/base.py +64 -4
  3. bbot/core/helpers/diff.py +10 -7
  4. bbot/core/helpers/helper.py +5 -1
  5. bbot/core/helpers/misc.py +48 -11
  6. bbot/core/helpers/regex.py +4 -0
  7. bbot/core/helpers/regexes.py +45 -8
  8. bbot/core/helpers/url.py +21 -5
  9. bbot/core/helpers/web/client.py +25 -5
  10. bbot/core/helpers/web/engine.py +9 -1
  11. bbot/core/helpers/web/envelopes.py +352 -0
  12. bbot/core/helpers/web/web.py +10 -2
  13. bbot/core/helpers/yara_helper.py +50 -0
  14. bbot/core/modules.py +23 -7
  15. bbot/defaults.yml +26 -1
  16. bbot/modules/base.py +4 -2
  17. bbot/modules/{deadly/dastardly.py → dastardly.py} +1 -1
  18. bbot/modules/{deadly/ffuf.py → ffuf.py} +1 -1
  19. bbot/modules/ffuf_shortnames.py +1 -1
  20. bbot/modules/httpx.py +14 -0
  21. bbot/modules/hunt.py +24 -6
  22. bbot/modules/internal/aggregate.py +1 -0
  23. bbot/modules/internal/excavate.py +356 -197
  24. bbot/modules/lightfuzz/lightfuzz.py +203 -0
  25. bbot/modules/lightfuzz/submodules/__init__.py +0 -0
  26. bbot/modules/lightfuzz/submodules/base.py +312 -0
  27. bbot/modules/lightfuzz/submodules/cmdi.py +106 -0
  28. bbot/modules/lightfuzz/submodules/crypto.py +474 -0
  29. bbot/modules/lightfuzz/submodules/nosqli.py +183 -0
  30. bbot/modules/lightfuzz/submodules/path.py +154 -0
  31. bbot/modules/lightfuzz/submodules/serial.py +179 -0
  32. bbot/modules/lightfuzz/submodules/sqli.py +187 -0
  33. bbot/modules/lightfuzz/submodules/ssti.py +39 -0
  34. bbot/modules/lightfuzz/submodules/xss.py +191 -0
  35. bbot/modules/{deadly/nuclei.py → nuclei.py} +1 -1
  36. bbot/modules/paramminer_headers.py +2 -0
  37. bbot/modules/reflected_parameters.py +80 -0
  38. bbot/modules/{deadly/vhost.py → vhost.py} +2 -2
  39. bbot/presets/web/lightfuzz-heavy.yml +16 -0
  40. bbot/presets/web/lightfuzz-light.yml +20 -0
  41. bbot/presets/web/lightfuzz-medium.yml +14 -0
  42. bbot/presets/web/lightfuzz-superheavy.yml +13 -0
  43. bbot/presets/web/lightfuzz-xss.yml +22 -0
  44. bbot/presets/web/paramminer.yml +8 -5
  45. bbot/scanner/preset/args.py +26 -0
  46. bbot/scanner/preset/path.py +12 -10
  47. bbot/scanner/preset/preset.py +42 -37
  48. bbot/scanner/scanner.py +6 -0
  49. bbot/scripts/docs.py +5 -5
  50. bbot/test/test_step_1/test__module__tests.py +1 -1
  51. bbot/test/test_step_1/test_helpers.py +7 -0
  52. bbot/test/test_step_1/test_presets.py +2 -2
  53. bbot/test/test_step_1/test_web.py +20 -0
  54. bbot/test/test_step_1/test_web_envelopes.py +343 -0
  55. bbot/test/test_step_2/module_tests/test_module_excavate.py +404 -29
  56. bbot/test/test_step_2/module_tests/test_module_httpx.py +29 -0
  57. bbot/test/test_step_2/module_tests/test_module_hunt.py +18 -1
  58. bbot/test/test_step_2/module_tests/test_module_lightfuzz.py +1947 -0
  59. bbot/test/test_step_2/module_tests/test_module_paramminer_getparams.py +4 -1
  60. bbot/test/test_step_2/module_tests/test_module_paramminer_headers.py +46 -2
  61. bbot/test/test_step_2/module_tests/test_module_reflected_parameters.py +226 -0
  62. bbot/wordlists/paramminer_parameters.txt +0 -8
  63. {bbot-2.4.2.6109rc0.dist-info → bbot-2.4.2.6596rc0.dist-info}/METADATA +2 -1
  64. {bbot-2.4.2.6109rc0.dist-info → bbot-2.4.2.6596rc0.dist-info}/RECORD +67 -45
  65. {bbot-2.4.2.6109rc0.dist-info → bbot-2.4.2.6596rc0.dist-info}/LICENSE +0 -0
  66. {bbot-2.4.2.6109rc0.dist-info → bbot-2.4.2.6596rc0.dist-info}/WHEEL +0 -0
  67. {bbot-2.4.2.6109rc0.dist-info → bbot-2.4.2.6596rc0.dist-info}/entry_points.txt +0 -0
bbot/__init__.py CHANGED
@@ -1,5 +1,5 @@
1
1
  # version placeholder (replaced by poetry-dynamic-versioning)
2
- __version__ = "v2.4.2.6109rc"
2
+ __version__ = "v2.4.2.6596rc"
3
3
 
4
4
  from .scanner import Scanner, Preset
5
5
 
bbot/core/event/base.py CHANGED
@@ -9,9 +9,9 @@ import datetime
9
9
  import ipaddress
10
10
  import traceback
11
11
 
12
- from copy import copy
13
12
  from pathlib import Path
14
13
  from typing import Optional
14
+ from copy import copy, deepcopy
15
15
  from contextlib import suppress
16
16
  from radixtarget import RadixTarget
17
17
  from pydantic import BaseModel, field_validator
@@ -40,6 +40,7 @@ from bbot.core.helpers import (
40
40
  validators,
41
41
  get_file_extension,
42
42
  )
43
+ from bbot.core.helpers.web.envelopes import BaseEnvelope
43
44
 
44
45
 
45
46
  log = logging.getLogger("bbot.core.event")
@@ -633,6 +634,10 @@ class BaseEvent:
633
634
  elif not self._dummy:
634
635
  log.warning(f"Tried to set invalid parent on {self}: (got: {repr(parent)} ({type(parent)}))")
635
636
 
637
+ @property
638
+ def children(self):
639
+ return []
640
+
636
641
  @property
637
642
  def parent_id(self):
638
643
  parent_id = getattr(self.get_parent(), "id", None)
@@ -687,6 +692,13 @@ class BaseEvent:
687
692
  e = parent
688
693
  return parents
689
694
 
695
+ def clone(self):
696
+ # Create a shallow copy of the event first
697
+ cloned_event = copy(self)
698
+ # Re-assign a new UUID
699
+ cloned_event._uuid = uuid.uuid4()
700
+ return cloned_event
701
+
690
702
  def _host(self):
691
703
  return ""
692
704
 
@@ -868,7 +880,13 @@ class BaseEvent:
868
880
  j["discovery_path"] = self.discovery_path
869
881
  j["parent_chain"] = self.parent_chain
870
882
 
883
+ # parameter envelopes
884
+ parameter_envelopes = getattr(self, "envelopes", None)
885
+ if parameter_envelopes is not None:
886
+ j["envelopes"] = parameter_envelopes.to_dict()
887
+
871
888
  # normalize non-primitive python objects
889
+
872
890
  for k, v in list(j.items()):
873
891
  if k == "data":
874
892
  continue
@@ -1368,12 +1386,56 @@ class URL_HINT(URL_UNVERIFIED):
1368
1386
 
1369
1387
 
1370
1388
  class WEB_PARAMETER(DictHostEvent):
1389
+ @property
1390
+ def children(self):
1391
+ # if we have any subparams, raise a new WEB_PARAMETER for each one
1392
+ children = []
1393
+ envelopes = getattr(self, "envelopes", None)
1394
+ if envelopes is not None:
1395
+ subparams = sorted(list(self.envelopes.get_subparams()))
1396
+
1397
+ if envelopes.selected_subparam is None:
1398
+ current_subparam = subparams[0]
1399
+ envelopes.selected_subparam = current_subparam[0]
1400
+ if len(subparams) > 1:
1401
+ for subparam, _ in subparams[1:]:
1402
+ clone = self.clone()
1403
+ clone.envelopes = deepcopy(envelopes)
1404
+ clone.envelopes.selected_subparam = subparam
1405
+ clone.parent = self
1406
+ children.append(clone)
1407
+ return children
1408
+
1409
+ def sanitize_data(self, data):
1410
+ original_value = data.get("original_value", None)
1411
+ if original_value is not None:
1412
+ try:
1413
+ envelopes = BaseEnvelope.detect(original_value)
1414
+ setattr(self, "envelopes", envelopes)
1415
+ except ValueError as e:
1416
+ log.verbose(f"Error detecting envelopes for {self}: {e}")
1417
+ return data
1418
+
1371
1419
  def _data_id(self):
1372
1420
  # dedupe by url:name:param_type
1373
1421
  url = self.data.get("url", "")
1374
1422
  name = self.data.get("name", "")
1375
1423
  param_type = self.data.get("type", "")
1376
- return f"{url}:{name}:{param_type}"
1424
+ envelopes = getattr(self, "envelopes", "")
1425
+ subparam = getattr(envelopes, "selected_subparam", "")
1426
+
1427
+ return f"{url}:{name}:{param_type}:{subparam}"
1428
+
1429
+ def _outgoing_dedup_hash(self, event):
1430
+ return hash(
1431
+ (
1432
+ str(event.host),
1433
+ event.data["url"],
1434
+ event.data.get("name", ""),
1435
+ event.data.get("type", ""),
1436
+ event.data.get("envelopes", ""),
1437
+ )
1438
+ )
1377
1439
 
1378
1440
  def _url(self):
1379
1441
  return self.data["url"]
@@ -1810,7 +1872,6 @@ def make_event(
1810
1872
  data = net.network_address
1811
1873
 
1812
1874
  event_class = globals().get(event_type, DefaultEvent)
1813
-
1814
1875
  return event_class(
1815
1876
  data,
1816
1877
  event_type=event_type,
@@ -1868,7 +1929,6 @@ def event_from_json(j, siem_friendly=False):
1868
1929
 
1869
1930
  resolved_hosts = j.get("resolved_hosts", [])
1870
1931
  event._resolved_hosts = set(resolved_hosts)
1871
-
1872
1932
  event.timestamp = datetime.datetime.fromisoformat(j["timestamp"])
1873
1933
  event.scope_distance = j["scope_distance"]
1874
1934
  parent_id = j.get("parent", None)
bbot/core/helpers/diff.py CHANGED
@@ -15,22 +15,24 @@ class HttpCompare:
15
15
  parent_helper,
16
16
  method="GET",
17
17
  data=None,
18
+ json=None,
18
19
  allow_redirects=False,
19
20
  include_cache_buster=True,
20
21
  headers=None,
21
22
  cookies=None,
22
- timeout=15,
23
+ timeout=10,
23
24
  ):
24
25
  self.parent_helper = parent_helper
25
26
  self.baseline_url = baseline_url
26
27
  self.include_cache_buster = include_cache_buster
27
28
  self.method = method
28
29
  self.data = data
30
+ self.json = json
29
31
  self.allow_redirects = allow_redirects
30
32
  self._baselined = False
31
33
  self.headers = headers
32
34
  self.cookies = cookies
33
- self.timeout = 15
35
+ self.timeout = 10
34
36
 
35
37
  @staticmethod
36
38
  def merge_dictionaries(headers1, headers2):
@@ -53,12 +55,13 @@ class HttpCompare:
53
55
  follow_redirects=self.allow_redirects,
54
56
  method=self.method,
55
57
  data=self.data,
58
+ json=self.json,
56
59
  headers=self.headers,
57
60
  cookies=self.cookies,
58
61
  retries=2,
59
62
  timeout=self.timeout,
60
63
  )
61
- await self.parent_helper.sleep(1)
64
+ await self.parent_helper.sleep(0.5)
62
65
  # put random parameters in URL, headers, and cookies
63
66
  get_params = {self.parent_helper.rand_string(6): self.parent_helper.rand_string(6)}
64
67
 
@@ -76,12 +79,12 @@ class HttpCompare:
76
79
  follow_redirects=self.allow_redirects,
77
80
  method=self.method,
78
81
  data=self.data,
82
+ json=self.json,
79
83
  retries=2,
80
84
  timeout=self.timeout,
81
85
  )
82
86
 
83
87
  self.baseline = baseline_1
84
-
85
88
  if baseline_1 is None or baseline_2 is None:
86
89
  log.debug("HTTP error while establishing baseline, aborting")
87
90
  raise HttpCompareError(
@@ -90,6 +93,7 @@ class HttpCompare:
90
93
  if baseline_1.status_code != baseline_2.status_code:
91
94
  log.debug("Status code not stable during baseline, aborting")
92
95
  raise HttpCompareError("Can't get baseline from source URL")
96
+
93
97
  try:
94
98
  baseline_1_json = xmltodict.parse(baseline_1.text)
95
99
  baseline_2_json = xmltodict.parse(baseline_2.text)
@@ -105,11 +109,9 @@ class HttpCompare:
105
109
 
106
110
  for k in ddiff.keys():
107
111
  for x in list(ddiff[k]):
108
- log.debug(f"Added {k} filter for path: {x.path()}")
109
112
  self.ddiff_filters.append(x.path())
110
113
 
111
114
  self.baseline_json = baseline_1_json
112
-
113
115
  self.baseline_ignore_headers = [
114
116
  h.lower()
115
117
  for h in [
@@ -167,7 +169,6 @@ class HttpCompare:
167
169
  if len(ddiff.keys()) == 0:
168
170
  return True
169
171
  else:
170
- log.debug(ddiff)
171
172
  return False
172
173
 
173
174
  async def compare(
@@ -178,6 +179,7 @@ class HttpCompare:
178
179
  check_reflection=False,
179
180
  method="GET",
180
181
  data=None,
182
+ json=None,
181
183
  allow_redirects=False,
182
184
  timeout=None,
183
185
  ):
@@ -208,6 +210,7 @@ class HttpCompare:
208
210
  follow_redirects=allow_redirects,
209
211
  method=method,
210
212
  data=data,
213
+ json=json,
211
214
  timeout=timeout,
212
215
  )
213
216
 
bbot/core/helpers/helper.py CHANGED
@@ -12,6 +12,7 @@ from .diff import HttpCompare
12
12
  from .regex import RegexHelper
13
13
  from .wordcloud import WordCloud
14
14
  from .interactsh import Interactsh
15
+ from .yara_helper import YaraHelper
15
16
  from .depsinstaller import DepsInstaller
16
17
  from .async_helpers import get_event_loop
17
18
 
@@ -85,6 +86,7 @@ class ConfigAwareHelper:
85
86
  self._cloud = None
86
87
 
87
88
  self.re = RegexHelper(self)
89
+ self.yara = YaraHelper(self)
88
90
  self._dns = None
89
91
  self._web = None
90
92
  self.config_aware_validators = self.validators.Validators(self)
@@ -129,7 +131,8 @@ class ConfigAwareHelper:
129
131
  cookies=None,
130
132
  method="GET",
131
133
  data=None,
132
- timeout=15,
134
+ json=None,
135
+ timeout=10,
133
136
  ):
134
137
  return HttpCompare(
135
138
  url,
@@ -141,6 +144,7 @@ class ConfigAwareHelper:
141
144
  timeout=timeout,
142
145
  method=method,
143
146
  data=data,
147
+ json=json,
144
148
  )
145
149
 
146
150
  def temp_filename(self, extension=None):
bbot/core/helpers/misc.py CHANGED
@@ -2,6 +2,7 @@ import os
2
2
  import sys
3
3
  import copy
4
4
  import json
5
+ import math
5
6
  import random
6
7
  import string
7
8
  import asyncio
@@ -9,6 +10,7 @@ import logging
9
10
  import ipaddress
10
11
  import regex as re
11
12
  import subprocess as sp
13
+
12
14
  from pathlib import Path
13
15
  from contextlib import suppress
14
16
  from unidecode import unidecode # noqa F401
@@ -797,17 +799,14 @@ def recursive_decode(data, max_depth=5):
797
799
  return data
798
800
 
799
801
 
800
- rand_pool = string.ascii_lowercase
801
- rand_pool_digits = rand_pool + string.digits
802
-
803
-
804
- def rand_string(length=10, digits=True):
802
+ def rand_string(length=10, digits=True, numeric_only=False):
805
803
  """
806
804
  Generates a random string of specified length.
807
805
 
808
806
  Args:
809
807
  length (int, optional): The length of the random string. Defaults to 10.
810
808
  digits (bool, optional): Whether to include digits in the string. Defaults to True.
809
+ numeric_only (bool, optional): Whether to generate a numeric-only string. Defaults to False.
811
810
 
812
811
  Returns:
813
812
  str: A random string of the specified length.
@@ -819,11 +818,17 @@ def rand_string(length=10, digits=True):
819
818
  'ap4rsdtg5iw7ey7y3oa5'
820
819
  >>> rand_string(30, digits=False)
821
820
  'xdmyxtglqfzqktngkesyulwbfrihva'
821
+ >>> rand_string(15, numeric_only=True)
822
+ '934857349857395'
822
823
  """
823
- pool = rand_pool
824
- if digits:
825
- pool = rand_pool_digits
826
- return "".join([random.choice(pool) for _ in range(int(length))])
824
+ if numeric_only:
825
+ pool = string.digits
826
+ elif digits:
827
+ pool = string.ascii_lowercase + string.digits
828
+ else:
829
+ pool = string.ascii_lowercase
830
+
831
+ return "".join(random.choice(pool) for _ in range(length))
827
832
 
828
833
 
829
834
  def truncate_string(s, n):
@@ -885,7 +890,7 @@ def extract_params_xml(xml_data, compare_mode="getparam"):
885
890
  xml_data (str): XML-formatted string containing elements.
886
891
 
887
892
  Returns:
888
- set: A set of tuples containing the tags and their corresponding text values present in the XML object.
893
+ set: A set of tuples containing the tags and their corresponding sanitized text values present in the XML object.
889
894
 
890
895
  Raises:
891
896
  Returns an empty set if ParseError occurs.
@@ -907,7 +912,10 @@ def extract_params_xml(xml_data, compare_mode="getparam"):
907
912
  while stack:
908
913
  current_element = stack.pop()
909
914
  if validate_parameter(current_element.tag, compare_mode):
910
- tag_value_pairs.add((current_element.tag, current_element.text))
915
+ # Sanitize the text value
916
+ text_value = current_element.text.strip() if current_element.text else None
917
+ sanitized_value = quote(text_value, safe="") if text_value else None
918
+ tag_value_pairs.add((current_element.tag, sanitized_value))
911
919
  for child in current_element:
912
920
  stack.append(child)
913
921
  return tag_value_pairs
@@ -921,6 +929,7 @@ valid_chars_dict = {
921
929
  "getparam": {chr(c) for c in range(33, 127) if chr(c) not in ":/?#[]@!$&'()*+,;="},
922
930
  "postparam": {chr(c) for c in range(33, 127) if chr(c) not in ":/?#[]@!$&'()*+,;="},
923
931
  "cookie": {chr(c) for c in range(33, 127) if chr(c) not in '()<>@,;:"/[]?={} \t'},
932
+ "bodyjson": set(chr(c) for c in range(33, 127) if chr(c) not in ":/?#[]@!$&'()*+,;="),
924
933
  }
925
934
 
926
935
 
@@ -1874,6 +1883,7 @@ def make_table(rows, header, **kwargs):
1874
1883
  | row2 | row2 |
1875
1884
  +-----------+-----------+
1876
1885
  """
1886
+
1877
1887
  from tabulate import tabulate
1878
1888
 
1879
1889
  # fix IndexError: list index out of range
@@ -2772,6 +2782,21 @@ def clean_dict(d, *key_names, fuzzy=False, exclude_keys=None, _prev_key=None):
2772
2782
  return d
2773
2783
 
2774
2784
 
2785
+ def calculate_entropy(data):
2786
+ """Calculate the Shannon entropy of a byte sequence"""
2787
+ if not data:
2788
+ return 0
2789
+ frequency = {}
2790
+ for byte in data:
2791
+ if byte in frequency:
2792
+ frequency[byte] += 1
2793
+ else:
2794
+ frequency[byte] = 1
2795
+ data_len = len(data)
2796
+ entropy = -sum((count / data_len) * math.log2(count / data_len) for count in frequency.values())
2797
+ return entropy
2798
+
2799
+
2775
2800
  top_ports_cache = None
2776
2801
 
2777
2802
 
@@ -2825,3 +2850,15 @@ def get_python_constraints():
2825
2850
 
2826
2851
  dist = distribution("bbot")
2827
2852
  return [clean_requirement(r) for r in dist.requires]
2853
+
2854
+
2855
+ def is_printable(s):
2856
+ """
2857
+ Check if a string is printable
2858
+ """
2859
+ if not isinstance(s, str):
2860
+ raise ValueError(f"Expected a string, got {type(s)}")
2861
+
2862
+ # Exclude control characters that break display/printing
2863
+ s = set(s)
2864
+ return all(ord(c) >= 32 or c in "\t\n\r" for c in s)
bbot/core/helpers/regex.py CHANGED
@@ -31,6 +31,10 @@ class RegexHelper:
31
31
  self.ensure_compiled_regex(compiled_regex)
32
32
  return await self.parent_helper.run_in_executor(compiled_regex.search, *args, **kwargs)
33
33
 
34
+ async def match(self, compiled_regex, *args, **kwargs):
35
+ self.ensure_compiled_regex(compiled_regex)
36
+ return await self.parent_helper.run_in_executor(compiled_regex.match, *args, **kwargs)
37
+
34
38
  async def sub(self, compiled_regex, *args, **kwargs):
35
39
  self.ensure_compiled_regex(compiled_regex)
36
40
  return await self.parent_helper.run_in_executor(compiled_regex.sub, *args, **kwargs)
bbot/core/helpers/regexes.py CHANGED
@@ -114,27 +114,64 @@ scan_name_regex = re.compile(r"[a-z]{3,20}_[a-z]{3,20}")
114
114
 
115
115
  # For use with excavate parameters extractor
116
116
  input_tag_regex = re.compile(
117
- r"<input[^>]+?name=[\"\']?([\.$\w]+)[\"\']?(?:[^>]*?value=[\"\']([=+\/\w]*)[\"\'])?[^>]*>"
117
+ r"<input[^>]*?\sname=[\"\']?([\-\._=+\/\w]+)[\"\']?[^>]*?\svalue=[\"\']?([:%\-\._=+\/\w\s]*)[\"\']?[^>]*?>"
118
118
  )
119
- jquery_get_regex = re.compile(r"url:\s?[\"\'].+?\?(\w+)=")
120
- jquery_post_regex = re.compile(r"\$.post\([\'\"].+[\'\"].+\{(.+)\}")
119
+ input_tag_regex2 = re.compile(
120
+ r"<input[^>]*?\svalue=[\"\']?([:\-%\._=+\/\w\s]*)[\"\']?[^>]*?\sname=[\"\']?([\-\._=+\/\w]+)[\"\']?[^>]*?>"
121
+ )
122
+ input_tag_novalue_regex = re.compile(r"<input(?![^>]*\b\svalue=)[^>]*?\sname=[\"\']?([\-\._=+\/\w]*)[\"\']?[^>]*?>")
123
+ # jquery_get_regex = re.compile(r"url:\s?[\"\'].+?\?(\w+)=")
124
+ # jquery_get_regex = re.compile(r"\$.get\([\'\"].+[\'\"].+\{(.+)\}")
125
+ # jquery_post_regex = re.compile(r"\$.post\([\'\"].+[\'\"].+\{(.+)\}")
121
126
  a_tag_regex = re.compile(r"<a[^>]*href=[\"\']([^\"\'?>]*)\?([^&\"\'=]+)=([^&\"\'=]+)")
122
127
  img_tag_regex = re.compile(r"<img[^>]*src=[\"\']([^\"\'?>]*)\?([^&\"\'=]+)=([^&\"\'=]+)")
123
128
  get_form_regex = re.compile(
124
- r"<form[^>]+(?:action=[\"']?([^\s\'\"]+)[\"\']?)?[^>]*method=[\"']?[gG][eE][tT][\"']?[^>]*>([\s\S]*?)<\/form>",
129
+ r"<form[^>]*\bmethod=[\"']?[gG][eE][tT][\"']?[^>]*\baction=[\"']?([^\s\"'<>]+)[\"']?[^>]*>([\s\S]*?)<\/form>",
130
+ re.DOTALL,
131
+ )
132
+ get_form_regex2 = re.compile(
133
+ r"<form[^>]*\baction=[\"']?([^\s\"'<>]+)[\"']?[^>]*\bmethod=[\"']?[gG][eE][tT][\"']?[^>]*>([\s\S]*?)<\/form>",
125
134
  re.DOTALL,
126
135
  )
127
136
  post_form_regex = re.compile(
128
- r"<form[^>]+(?:action=[\"']?([^\s\'\"]+)[\"\']?)?[^>]*method=[\"']?[pP][oO][sS][tT][\"']?[^>]*>([\s\S]*?)<\/form>",
137
+ r"<form[^>]*\bmethod=[\"']?[pP][oO][sS][tT][\"']?[^>]*\baction=[\"']?([^\s\"'<>]+)[\"']?[^>]*>([\s\S]*?)<\/form>",
138
+ re.DOTALL,
139
+ )
140
+ post_form_regex2 = re.compile(
141
+ r"<form[^>]*\baction=[\"']?([^\s\"'<>]+)[\"']?[^>]*\bmethod=[\"']?[pP][oO][sS][tT][\"']?[^>]*>([\s\S]*?)<\/form>",
142
+ re.DOTALL,
143
+ )
144
+ post_form_regex_noaction = re.compile(
145
+ r"<form[^>]*(?:\baction=[\"']?([^\s\"'<>]+)[\"']?)?[^>]*\bmethod=[\"']?[pP][oO][sS][tT][\"']?[^>]*>([\s\S]*?)<\/form>",
129
146
  re.DOTALL,
130
147
  )
148
+ generic_form_regex = re.compile(
149
+ r"<form(?![^>]*\bmethod=)[^>]+(?:\baction=[\"']?([^\s\"'<>]+)[\"']?)[^>]*>([\s\S]*?)<\/form>",
150
+ re.IGNORECASE | re.DOTALL,
151
+ )
152
+
131
153
  select_tag_regex = re.compile(
132
- r"<select[^>]+?name=[\"\']?(\w+)[\"\']?[^>]*>(?:\s*<option[^>]*?value=[\"\'](\w*)[\"\']?[^>]*>)?"
154
+ r"<select[^>]+?name=[\"\']?([_\-\.\w]+)[\"\']?[^>]*>(?:\s*<option[^>]*?value=[\"\']?([_\.\-\w]*)[\"\']?[^>]*>)?",
155
+ re.IGNORECASE | re.DOTALL,
133
156
  )
157
+
134
158
  textarea_tag_regex = re.compile(
135
- r'<textarea[^>]*\bname=["\']?(\w+)["\']?[^>]*>(.*?)</textarea>', re.IGNORECASE | re.DOTALL
159
+ r"<textarea[^>]*?\sname=[\"\']?([\-\._=+\/\w]+)[\"\']?[^>]*?\svalue=[\"\']?([:%\-\._=+\/\w]*)[\"\']?[^>]*?>"
160
+ )
161
+ textarea_tag_regex2 = re.compile(
162
+ r"<textarea[^>]*?\svalue=[\"\']?([:\-%\._=+\/\w]*)[\"\']?[^>]*?\sname=[\"\']?([\-\._=+\/\w]+)[\"\']?[^>]*?>"
163
+ )
164
+ textarea_tag_novalue_regex = re.compile(
165
+ r'<textarea[^>]*\bname=["\']?([_\-\.\w]+)["\']?[^>]*>(.*?)</textarea>', re.IGNORECASE | re.DOTALL
166
+ )
167
+
168
+ button_tag_regex = re.compile(
169
+ r"<button[^>]*?name=[\"\']?([\-\._=+\/\w]+)[\"\']?[^>]*?value=[\"\']?([%\-\._=+\/\w]*)[\"\']?[^>]*?>"
170
+ )
171
+ button_tag_regex2 = re.compile(
172
+ r"<button[^>]*?value=[\"\']?([\-%\._=+\/\w]*)[\"\']?[^>]*?name=[\"\']?([\-\._=+\/\w]+)[\"\']?[^>]*?>"
136
173
  )
137
- tag_attribute_regex = re.compile(r"<[^>]*(?:href|action|src)\s*=\s*[\"\']?(?!mailto:)([^\s\'\"\>]+)[\"\']?[^>]*>")
174
+ tag_attribute_regex = re.compile(r"<[^>]*(?:href|action|src)\s*=\s*[\"\']?(?!mailto:)([^\'\"\>]+)[\"\']?[^>]*>")
138
175
 
139
176
  valid_netloc = r"[^\s!@#$%^&()=/?\\'\";~`<>]+"
140
177
 
bbot/core/helpers/url.py CHANGED
@@ -32,7 +32,10 @@ def parse_url(url):
32
32
  return urlparse(url)
33
33
 
34
34
 
35
- def add_get_params(url, params):
35
+ def add_get_params(url, params, encode=True):
36
+ def _no_encode_quote(s, safe="/", encoding=None, errors=None):
37
+ return s
38
+
36
39
  """
37
40
  Add or update query parameters to the given URL.
38
41
 
@@ -53,10 +56,23 @@ def add_get_params(url, params):
53
56
  >>> add_get_params('https://www.evilcorp.com?foo=1', {'foo': 2})
54
57
  ParseResult(scheme='https', netloc='www.evilcorp.com', path='', params='', query='foo=2', fragment='')
55
58
  """
56
- parsed = parse_url(url)
57
- old_params = dict(parse_qs(parsed.query))
58
- old_params.update(params)
59
- return parsed._replace(query=urlencode(old_params, doseq=True))
59
+ parsed = urlparse(url)
60
+ query_params = parsed.query.split("&")
61
+
62
+ existing_params = {}
63
+ for param in query_params:
64
+ if "=" in param:
65
+ k, v = param.split("=", 1)
66
+ existing_params[k] = v
67
+
68
+ existing_params.update(params)
69
+
70
+ if encode:
71
+ new_query = urlencode(existing_params, doseq=True)
72
+ else:
73
+ new_query = urlencode(existing_params, doseq=True, quote_via=_no_encode_quote)
74
+
75
+ return parsed._replace(query=new_query)
60
76
 
61
77
 
62
78
  def get_get_params(url):
bbot/core/helpers/web/client.py CHANGED
@@ -52,7 +52,7 @@ class BBOTAsyncClient(httpx.AsyncClient):
52
52
  if http_debug:
53
53
  log.trace(f"Creating AsyncClient: {args}, {kwargs}")
54
54
 
55
- self._persist_cookies = kwargs.pop("persist_cookies", True)
55
+ self._persist_cookies = kwargs.pop("persist_cookies", False)
56
56
 
57
57
  # timeout
58
58
  http_timeout = self._web_config.get("http_timeout", 20)
@@ -63,11 +63,18 @@ class BBOTAsyncClient(httpx.AsyncClient):
63
63
  headers = kwargs.get("headers", None)
64
64
  if headers is None:
65
65
  headers = {}
66
+
67
+ # cookies
68
+ cookies = kwargs.get("cookies", None)
69
+ if cookies is None:
70
+ cookies = {}
71
+
66
72
  # user agent
67
73
  user_agent = self._web_config.get("user_agent", "BBOT")
68
74
  if "User-Agent" not in headers:
69
75
  headers["User-Agent"] = user_agent
70
76
  kwargs["headers"] = headers
77
+ kwargs["cookies"] = cookies
71
78
  # proxy
72
79
  proxies = self._web_config.get("http_proxy", None)
73
80
  kwargs["proxy"] = proxies
@@ -78,10 +85,23 @@ class BBOTAsyncClient(httpx.AsyncClient):
78
85
  self._cookies = DummyCookies()
79
86
 
80
87
  def build_request(self, *args, **kwargs):
81
- request = super().build_request(*args, **kwargs)
82
- # add custom headers if the URL is in-scope
83
- # TODO: re-enable this
84
- if self._target.in_scope(str(request.url)):
88
+ if args:
89
+ url = args[0]
90
+ kwargs["url"] = url
91
+ url = kwargs["url"]
92
+
93
+ target_in_scope = self._target.in_scope(str(url))
94
+
95
+ if target_in_scope:
96
+ if not kwargs.get("cookies", None):
97
+ kwargs["cookies"] = {}
98
+ for ck, cv in self._web_config.get("http_cookies", {}).items():
99
+ if ck not in kwargs["cookies"]:
100
+ kwargs["cookies"][ck] = cv
101
+
102
+ request = super().build_request(**kwargs)
103
+
104
+ if target_in_scope:
85
105
  for hk, hv in self._web_config.get("http_headers", {}).items():
86
106
  hv = str(hv)
87
107
  # don't clobber headers
bbot/core/helpers/web/engine.py CHANGED
@@ -8,7 +8,7 @@ from socksio.exceptions import SOCKSError
8
8
  from contextlib import asynccontextmanager
9
9
 
10
10
  from bbot.core.engine import EngineServer
11
- from bbot.core.helpers.misc import bytes_to_human, human_to_bytes, get_exception_chain
11
+ from bbot.core.helpers.misc import bytes_to_human, human_to_bytes, get_exception_chain, truncate_string
12
12
 
13
13
  log = logging.getLogger("bbot.core.helpers.web.engine")
14
14
 
@@ -203,6 +203,14 @@ class HTTPEngine(EngineServer):
203
203
  else:
204
204
  log.trace(f"Error with request to URL: {url}: {e}")
205
205
  log.trace(traceback.format_exc())
206
+ except httpx.InvalidURL as e:
207
+ if raise_error:
208
+ raise
209
+ else:
210
+ log.warning(
211
+ f"Invalid URL (possibly due to dangerous redirect) on request to : {url}: {truncate_string(e, 200)}"
212
+ )
213
+ log.trace(traceback.format_exc())
206
214
  except ssl.SSLError as e:
207
215
  msg = f"SSL error with request to URL: {url}: {e}"
208
216
  if raise_error: