assemblyline-v4-service 4.4.0.24-py3-none-any.whl → 4.4.0.26-py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Potentially problematic release: this version of assemblyline-v4-service might be problematic.
- assemblyline_v4_service/VERSION +1 -1
- assemblyline_v4_service/common/api.py +3 -2
- assemblyline_v4_service/common/base.py +3 -4
- assemblyline_v4_service/common/helper.py +1 -2
- assemblyline_v4_service/common/{extractor/ocr.py → ocr.py} +0 -1
- assemblyline_v4_service/common/ontology_helper.py +7 -8
- assemblyline_v4_service/common/request.py +4 -5
- assemblyline_v4_service/common/result.py +3 -3
- assemblyline_v4_service/common/task.py +3 -3
- assemblyline_v4_service/common/utils.py +2 -2
- assemblyline_v4_service/updater/helper.py +4 -0
- {assemblyline_v4_service-4.4.0.24.dist-info → assemblyline_v4_service-4.4.0.26.dist-info}/METADATA +1 -1
- assemblyline_v4_service-4.4.0.26.dist-info/RECORD +28 -0
- assemblyline_v4_service/common/balbuzard/__init__.py +0 -0
- assemblyline_v4_service/common/balbuzard/balbuzard.py +0 -656
- assemblyline_v4_service/common/balbuzard/bbcrack.py +0 -830
- assemblyline_v4_service/common/balbuzard/patterns.py +0 -650
- assemblyline_v4_service/common/dynamic_service_helper.py +0 -3631
- assemblyline_v4_service/common/extractor/__init__.py +0 -1
- assemblyline_v4_service/common/extractor/base64.py +0 -86
- assemblyline_v4_service/common/extractor/pe_file.py +0 -51
- assemblyline_v4_service/common/icap.py +0 -149
- assemblyline_v4_service/common/keytool_parse.py +0 -66
- assemblyline_v4_service/common/pestudio/__init__.py +0 -0
- assemblyline_v4_service/common/pestudio/xml/__init__.py +0 -0
- assemblyline_v4_service/common/pestudio/xml/features.xml +0 -5607
- assemblyline_v4_service/common/pestudio/xml/functions.xml +0 -5824
- assemblyline_v4_service/common/pestudio/xml/languages.xml +0 -375
- assemblyline_v4_service/common/pestudio/xml/resources.xml +0 -511
- assemblyline_v4_service/common/pestudio/xml/signatures.xml +0 -29105
- assemblyline_v4_service/common/pestudio/xml/strings.xml +0 -2379
- assemblyline_v4_service/common/safelist_helper.py +0 -73
- assemblyline_v4_service/common/section_reducer.py +0 -43
- assemblyline_v4_service/common/tag_helper.py +0 -117
- assemblyline_v4_service/common/tag_reducer.py +0 -242
- assemblyline_v4_service/testing/__init__.py +0 -0
- assemblyline_v4_service/testing/helper.py +0 -463
- assemblyline_v4_service/testing/regenerate_results.py +0 -37
- assemblyline_v4_service-4.4.0.24.dist-info/RECORD +0 -53
- {assemblyline_v4_service-4.4.0.24.dist-info → assemblyline_v4_service-4.4.0.26.dist-info}/LICENCE.md +0 -0
- {assemblyline_v4_service-4.4.0.24.dist-info → assemblyline_v4_service-4.4.0.26.dist-info}/WHEEL +0 -0
- {assemblyline_v4_service-4.4.0.24.dist-info → assemblyline_v4_service-4.4.0.26.dist-info}/top_level.txt +0 -0
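
The entry assemblyline_v4_service/common/{extractor/ocr.py → ocr.py} indicates that the OCR helper module moved up one level, out of the now-removed extractor package. As a rough sketch of the import-path change this implies for service code (only the module path is taken from this diff; nothing about the module's contents is shown here):

# Before 4.4.0.26 the module lived in the extractor package:
# from assemblyline_v4_service.common.extractor import ocr

# From 4.4.0.26 on it sits directly under common/:
from assemblyline_v4_service.common import ocr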

assemblyline_v4_service/common/safelist_helper.py (removed)
@@ -1,73 +0,0 @@
-from re import compile, IGNORECASE, match, search
-from typing import Dict, List
-from urllib.parse import urlparse
-
-from assemblyline.odm.base import DOMAIN_REGEX, IP_REGEX
-
-URL_REGEX = compile(
-    r"(?:(?:(?:[A-Za-z]*:)?//)?(?:\S+(?::\S*)?@)?(?:(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}"
-    r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[A-Za-z0-9\u00a1-\uffff][A-Za-z0-9\u00a1-\uffff_-]{0,62})"
-    r"?[A-Za-z0-9\u00a1-\uffff]\.)+(?:xn--)?(?:[A-Za-z0-9\u00a1-\uffff]{2,}\.?))(?::\d{2,5})?)(?:[/?#][^\s,\\\\]*)?")
-
-
-def is_tag_safelisted(
-        value: str, tags: List[str],
-        safelist: Dict[str, Dict[str, List[str]]],
-        substring: bool = False) -> bool:
-    """
-    This method determines if a given value has any safelisted components.
-    :param value: The value to be checked if it has been safelisted
-    :param tags: The tags which will be used for grabbing specific values from the safelist
-    :param safelist: The safelist containing matches and regexs. The
-    product of a service using self.get_api_interface().get_safelist().
-    :param substring: A flag that indicates if we should check if the value is contained within the match
-    :return: A boolean indicating if the value has been safelisted
-    """
-    if not value or not tags or not safelist:
-        return False
-
-    if not any(key in safelist for key in ["match", "regex"]):
-        return False
-
-    safelist_matches = safelist.get("match", {})
-    safelist_regexes = safelist.get("regex", {})
-
-    for tag in tags:
-        if tag in safelist_matches:
-            for safelist_match in safelist_matches[tag]:
-                if value.lower() == safelist_match.lower():
-                    return True
-                elif substring and safelist_match.lower() in value.lower():
-                    return True
-
-        if tag in safelist_regexes:
-            for safelist_regex in safelist_regexes[tag]:
-                if match(safelist_regex, value, IGNORECASE):
-                    return True
-
-    return False
-
-
-def contains_safelisted_value(val: str, safelist: Dict[str, Dict[str, List[str]]]) -> bool:
-    """
-    This method checks if a given value is part of a safelist
-    :param val: The given value
-    :param safelist: A dictionary containing matches and regexes for use in safelisting values
-    :return: A boolean representing if the given value is part of a safelist
-    """
-    if not val or not isinstance(val, str):
-        return False
-    ip = search(IP_REGEX, val)
-    url = search(URL_REGEX, val)
-    domain = search(DOMAIN_REGEX, val)
-    if ip is not None:
-        ip = ip.group()
-        return is_tag_safelisted(ip, ["network.dynamic.ip"], safelist)
-    elif domain is not None:
-        domain = domain.group()
-        return is_tag_safelisted(domain, ["network.dynamic.domain"], safelist)
-    elif url is not None:
-        url_pieces = urlparse(url.group())
-        domain = url_pieces.netloc
-        return is_tag_safelisted(domain, ["network.dynamic.domain"], safelist)
-    return False
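
For reference, a minimal sketch of how the removed is_tag_safelisted helper was typically called. The safelist dictionary below is a hand-built stand-in for the structure a service would normally get from self.get_api_interface().get_safelist(); the domain, IP, and tag values are illustrative only:

from assemblyline_v4_service.common.safelist_helper import is_tag_safelisted  # removed in 4.4.0.26

# Hand-built example safelist; real services fetch this via get_api_interface().get_safelist()
safelist = {
    "match": {"network.dynamic.domain": ["update.microsoft.com"]},
    "regex": {"network.dynamic.ip": [r"(^1\.1\.1\.1$)|(^8\.8\.8\.8$)"]},
}

assert is_tag_safelisted("update.microsoft.com", ["network.dynamic.domain"], safelist)
assert is_tag_safelisted("8.8.8.8", ["network.dynamic.ip"], safelist)
assert not is_tag_safelisted("example.com", ["network.dynamic.domain"], safelist)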

assemblyline_v4_service/common/section_reducer.py (removed)
@@ -1,43 +0,0 @@
-from assemblyline_v4_service.common.result import Result, ResultSection
-from assemblyline_v4_service.common.tag_reducer import REDUCE_MAP
-
-
-def reduce(al_result: Result) -> Result:
-    """
-    This function goes through a result section recursively and try reduce the amount of
-    produced tags based on a reducer set for each specific tags
-
-    :param al_result: An Assemblyline result object
-    :return: Reduced Assemblyline result object
-    """
-    for section in al_result.sections:
-        _section_traverser(section)
-    return al_result
-
-
-def _section_traverser(section: ResultSection = None) -> ResultSection:
-    """
-    This function goes through each section and sends the tags to a function
-    that will reduce specific tags
-
-    :param section: An Assemblyline result section
-    :return: Reduced Assemblyline result section
-    """
-    for subsection in section.subsections:
-        _section_traverser(subsection)
-    if section.tags:
-        section.set_tags(_reduce_specific_tags(section.tags))
-    return section
-
-
-def _reduce_specific_tags(tags=None) -> dict:
-    """
-    This function is very much a work in progress. Currently the only tags that we
-    feel the need to reduce are unique uris and uri paths
-    :param tags: Dictionary of tag types and their values
-    :return: Dictionary of tag types and their reduced values
-    """
-    if tags is None:
-        tags = {}
-
-    return {tag_type: REDUCE_MAP.get(tag_type, lambda x: x)(tag_values) for tag_type, tag_values in tags.items()}
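
For context, a minimal sketch of how the removed reduce() entry point was applied to a service result; the section title and URI values are illustrative, and the actual reduction is delegated to REDUCE_MAP from tag_reducer.py (shown in the last hunk below):

from assemblyline_v4_service.common.result import Result, ResultSection
from assemblyline_v4_service.common.section_reducer import reduce  # removed in 4.4.0.26

result = Result()
section = ResultSection("Network indicators")  # illustrative title
# Two URIs differing only in a numeric parameter; reduce() walks every section
# recursively and hands uri/uri_path tags to the reducers in REDUCE_MAP.
section.add_tag("network.dynamic.uri", "http://example.com/api/v1/ping?id=1")
section.add_tag("network.dynamic.uri", "http://example.com/api/v1/ping?id=2")
result.add_section(section)

result = reduce(result)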

assemblyline_v4_service/common/tag_helper.py (removed)
@@ -1,117 +0,0 @@
-from re import match, search
-from typing import Any, Dict, List, Optional, Union
-
-from assemblyline.common.net import is_valid_domain, is_valid_ip
-from assemblyline.common.str_utils import safe_str
-from assemblyline.odm.base import DOMAIN_ONLY_REGEX, DOMAIN_REGEX, FULL_URI, IP_REGEX, URI_PATH
-from assemblyline_v4_service.common.result import ResultSection
-from assemblyline_v4_service.common.safelist_helper import is_tag_safelisted
-
-
-def add_tag(
-        result_section: ResultSection,
-        tag: str, value: Union[Any, List[Any]],
-        safelist: Dict[str, Dict[str, List[str]]] = None
-) -> bool:
-    """
-    This method adds the value(s) as a tag to the ResultSection. Can take a list of values or a single value.
-    :param result_section: The ResultSection that the tag will be added to
-    :param tag: The tag type that the value will be tagged under
-    :param value: The value, a single item or a list, that will be tagged under the tag type
-    :param safelist: The safelist containing matches and regexs. The product of a
-    service using self.get_api_interface().get_safelist().
-    :return: Tag was successfully added
-    """
-    if safelist is None:
-        safelist = {}
-
-    tags_were_added = False
-    if not value:
-        return tags_were_added
-
-    if type(value) == list:
-        for item in value:
-            # If one tag is added, then return True
-            tags_were_added = _validate_tag(result_section, tag, item, safelist) or tags_were_added
-    else:
-        tags_were_added = _validate_tag(result_section, tag, value, safelist)
-    return tags_were_added
-
-
-def _get_regex_for_tag(tag: str) -> str:
-    """
-    This method returns a regular expression used for validating a certain tag type
-    :param tag: The type of tag
-    :return: The relevant regular expression
-    """
-    reg_to_match: Optional[str] = None
-    if "domain" in tag:
-        reg_to_match = DOMAIN_ONLY_REGEX
-    elif "uri_path" in tag:
-        reg_to_match = URI_PATH
-    elif "uri" in tag:
-        reg_to_match = FULL_URI
-    elif "ip" in tag:
-        reg_to_match = IP_REGEX
-    return reg_to_match
-
-
-def _validate_tag(
-        result_section: ResultSection,
-        tag: str,
-        value: Any,
-        safelist: Dict[str, Dict[str, List[str]]] = None
-) -> bool:
-    """
-    This method validates the value relative to the tag type before adding the value as a tag to the ResultSection.
-    :param result_section: The ResultSection that the tag will be added to
-    :param tag: The tag type that the value will be tagged under
-    :param value: The item that will be tagged under the tag type
-    :param safelist: The safelist containing matches and regexs. The product of a
-    service using self.get_api_interface().get_safelist().
-    :return: Tag was successfully added
-    """
-    if safelist is None:
-        safelist = {}
-
-    if tag.startswith("network.static."):
-        network_tag_type = "static"
-    else:
-        network_tag_type = "dynamic"
-
-    regex = _get_regex_for_tag(tag)
-    if regex and not match(regex, value):
-        return False
-
-    if "ip" in tag and not is_valid_ip(value):
-        return False
-
-    if "domain" in tag and not is_valid_domain(value):
-        return False
-
-    if is_tag_safelisted(value, [tag], safelist):
-        return False
-
-    # if "uri" is in the tag, let's try to extract its domain/ip and tag it.
-    if "uri_path" not in tag and "uri" in tag:
-        # First try to get the domain
-        valid_domain = False
-        domain = search(DOMAIN_REGEX, value)
-        if domain:
-            domain = domain.group()
-            valid_domain = _validate_tag(result_section, f"network.{network_tag_type}.domain", domain, safelist)
-        # Then try to get the IP
-        valid_ip = False
-        ip = search(IP_REGEX, value)
-        if ip:
-            ip = ip.group()
-            valid_ip = _validate_tag(result_section, f"network.{network_tag_type}.ip", ip, safelist)
-
-        if value not in [domain, ip] and (valid_domain or valid_ip):
-            result_section.add_tag(tag, safe_str(value))
-        else:
-            return False
-    else:
-        result_section.add_tag(tag, safe_str(value))
-
-    return True
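
A minimal sketch of how the removed add_tag helper was used to validate and tag values on a section; the URI, IP, and section title are illustrative, and an empty safelist is passed for brevity:

from assemblyline_v4_service.common.result import ResultSection
from assemblyline_v4_service.common.tag_helper import add_tag  # removed in 4.4.0.26

section = ResultSection("Extracted network IOCs")  # illustrative title

# The URI is checked against FULL_URI and the safelist; its domain is also
# extracted and tagged as network.static.domain before the uri tag is added.
add_tag(section, "network.static.uri", "https://example.com/payload.bin", safelist={})

# A list can be passed as well; True is returned if at least one value was tagged.
added = add_tag(section, "network.static.ip", ["192.0.2.1", "not-an-ip"], safelist={})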

assemblyline_v4_service/common/tag_reducer.py (removed)
@@ -1,242 +0,0 @@
-import regex as re
-import os.path
-
-from copy import deepcopy
-from typing import List
-from urllib.parse import urlparse, parse_qs, urlunparse, urlencode, unquote
-
-NUMBER_REGEX = re.compile("[0-9]*")
-ALPHA_REGEX = re.compile("[a-zA-Z]*")
-ALPHANUM_REGEX = re.compile("[a-zA-Z0-9]*")
-BASE64_REGEX = re.compile("(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?")
-DO_NOT_REDUCE = ["netloc", "hostname"]
-
-
-def reduce_uri_tags(uris=None) -> List[str]:
-    """
-    The purpose of this helper function is to reduce the amount of unique uris to be tagged.
-    ex. If a sample makes a hundred network calls to four unqiue domains, with only one parameter
-    changing in the HTTP request each time, this should be synthesized to four uris to
-    be tagged, but with a placeholder for the parameter(s) that changes in each callout.
-    """
-    if uris is None:
-        uris = []
-
-    parsed_uris = []
-    reduced_uris = set()
-    for uri in uris:
-        parsed_uri = urlparse(uri)
-        # Match items we care about into a nice dictionary
-        uri_dict = {
-            "scheme": parsed_uri.scheme,  # scheme param
-            "netloc": parsed_uri.netloc,  # ""
-            "path": parsed_uri.path,  # ""
-            "params": parsed_uri.params,  # ""
-            "query": parsed_uri.query,  # ""
-            "fragment": parsed_uri.fragment,  # ""
-            "username": parsed_uri.username,  # None
-            "password": parsed_uri.password,  # None
-            "hostname": parsed_uri.hostname,  # None
-            "port": parsed_uri.port  # None
-        }
-
-        # We need to parse a couple of the returned params from urlparse more in-depth
-        if uri_dict["query"] != "":
-            # note that values of keys in dict will be in lists of length 1, which we don't want
-            uri_dict["query"] = parse_qs(uri_dict["query"])
-        if uri_dict["path"] != "":
-            # converting tuple to list
-            uri_dict["path"] = list(os.path.split(uri_dict["path"]))
-            # removing lone slashes
-            uri_dict["path"] = [not_slash for not_slash in uri_dict["path"] if not_slash != "/"]
-
-        parsed_uris.append(uri_dict)
-
-    # iterate through, comparing two parsed uris. if the percentage of similarity
-    # is greater than x, then they are sufficiently similar and can have parts
-    # replaced.
-
-    # time for the smarts
-    comparison_uris = deepcopy(parsed_uris)
-    for parsed_uri in parsed_uris:
-        # this flag will be used to check if this uri matches any other uri ever
-        totally_unique = True
-        for comparison_uri in comparison_uris:
-            if parsed_uri == comparison_uri:
-                continue
-            equal_keys = 0
-            total_list_len = 0
-            total_dict_len = 0
-            difference = {}
-            # now go through each key, and check for equality
-            for key in parsed_uri.keys():
-                val = parsed_uri[key]
-                comp_val = comparison_uri[key]
-
-                # if equal, add to count of similar keys
-                if type(val) == list:
-                    val_len = len(val)
-                    if val == comp_val:
-                        equal_keys += val_len
-                    else:
-                        difference[key] = dict()
-                        comp_len = len(comp_val)
-                        max_list_len = max(val_len, comp_len)
-                        for item in range(max_list_len):
-                            if item >= comp_len or item >= val_len:
-                                # bail!
-                                break
-                            if val[item] == comp_val[item]:
-                                equal_keys += 1
-                            else:
-                                difference[key][item] = []
-                                difference[key][item].append(val[item])
-                                difference[key][item].append(comp_val[item])
-                    total_list_len += val_len
-
-                elif type(val) == dict:
-                    val_len = len(val)
-                    if val == comp_val:
-                        equal_keys += val_len
-                    else:
-                        difference[key] = dict()
-                        if comp_val != "":
-                            comp_keys = list(comp_val.keys())
-                            val_keys = list(val.keys())
-                            all_keys = set(comp_keys + val_keys)
-                            val_len = len(all_keys)
-
-                            for item in all_keys:
-                                if val.get(item) and comp_val.get(item) and val[item] == comp_val[item]:
-                                    equal_keys += 1
-                                else:
-                                    difference[key][item] = []
-                                    if val.get(item):
-                                        difference[key][item].append(val[item])
-                                    if comp_val.get(item):
-                                        difference[key][item].append(comp_val[item])
-                    total_dict_len += val_len
-                else:  # Not dict or a list
-                    if val == comp_val:
-                        equal_keys += 1
-                    else:
-                        difference[key] = []
-                        difference[key].append(val)
-                        difference[key].append(comp_val)
-            # now find percentage similar
-            if total_dict_len > 1 and total_list_len > 1:
-                percentage_equal = equal_keys / (len(parsed_uri.keys()) - 2 + total_list_len + total_dict_len)
-            elif total_dict_len > 1 or total_list_len > 1:
-                percentage_equal = equal_keys / (len(parsed_uri.keys()) - 1 + total_list_len + total_dict_len)
-            else:
-                percentage_equal = equal_keys / (len(parsed_uri.keys()) + total_list_len + total_dict_len)
-
-            # if percentage equal is > some value (say 90), then we can say that
-            # urls are similar enough to reduce
-            if percentage_equal >= 0.80:
-                # So that we don't overwrite details
-                comparison_uri_copy = deepcopy(comparison_uri)
-                # somehow recognize where parameters are that match and replace them.
-                for item in difference.keys():
-                    # We don't want to replace the following:
-                    if item in DO_NOT_REDUCE:
-                        continue
-
-                    val = difference[item]
-                    if item == "query":
-                        for key in val.keys():
-                            placeholders = []
-                            # since each of these items is a list of lists
-                            for val_item in val[key]:
-                                # use regex to determine the parameter type
-                                value = val_item[0]
-                                placeholder = _get_placeholder(value)
-                                placeholders.append(placeholder)
-                            if len(set(placeholders)) == 1:
-                                # the same placeholder type is consistent with all values
-                                # update the url_dict value
-                                comparison_uri_copy[item][key] = list(set(placeholders))
-                            else:
-                                # the placeholder types vary
-                                comparison_uri_copy[item][key] = ",".join(placeholders)
-                    elif item == "path":
-                        placeholders = {}
-                        for key in val.keys():
-                            placeholders[key] = []
-                            for list_item in val[key]:
-                                # if / exists, pop the rest out
-                                if list_item != "/" and list_item[0] == "/":
-                                    # use regex to determine the parameter type
-                                    placeholder = _get_placeholder(list_item[1:])
-                                    placeholders[key].append("/"+placeholder)
-                                else:
-                                    placeholder = _get_placeholder(list_item)
-                                    placeholders[key].append(placeholder)
-                        for key in placeholders.keys():
-                            if len(set(placeholders[key])) == 1:
-                                # the same placeholder type is consistent with all values
-                                # update the comparison_uri_copy value
-                                comparison_uri_copy[item][key] = list(set(placeholders[key]))[0]
-                            else:
-                                # the placeholder types vary
-                                comparison_uri_copy[item][key] = ",".join(set(placeholders[key]))
-                    else:
-                        comparison_uri_copy[item] = _get_placeholder(val)
-
-                # now it's time to rejoin the parts of the url
-                reduced_uris.add(_turn_back_into_uri(comparison_uri_copy))
-                totally_unique = False
-
-        # Congratulations, you are one in a million
-        if totally_unique:
-            reduced_uris.add(_turn_back_into_uri(parsed_uri))
-    reduced_uris_list = list(reduced_uris)
-    # recursive_list = reduce_uri_tags(reduced_uris_list)
-    # if len(recursive_list) < len(reduced_uris_list):
-    # return reduced_uris_list
-    # elif
-    # if reduce_uri_tags(reduced_uris_list))
-    return reduced_uris_list
-
-
-def _turn_back_into_uri(uri_parts: dict) -> str:
-    # turn the path back into a string
-    uri_parts["path"] = '/'.join(uri_parts["path"])
-    # turn the query back into a query string
-    # first, remove the list wrappers
-    if uri_parts["query"] != "":
-        for item in uri_parts["query"].keys():
-            uri_parts["query"][item] = uri_parts["query"][item][0]
-        uri_parts["query"] = unquote(urlencode(uri_parts["query"]))
-
-    uri_tuple = (uri_parts["scheme"], uri_parts["netloc"],
-                 uri_parts["path"], uri_parts["params"],
-                 uri_parts["query"], uri_parts["fragment"])
-    real_url = urlunparse(uri_tuple)
-    return real_url
-
-
-def _get_placeholder(val: str) -> str:
-    if not val:
-        return "${UNKNOWN_TYPE}"
-
-    if NUMBER_REGEX.fullmatch(val):
-        placeholder = "${NUMBER}"
-    elif ALPHA_REGEX.fullmatch(val):
-        placeholder = "${ALPHA}"
-    # Note that BASE64 Regex must happen before ALPHANUM regex or else ALPHANUM will hit on BASE64
-    elif BASE64_REGEX.fullmatch(val):
-        placeholder = "${BASE64}"
-    elif ALPHANUM_REGEX.fullmatch(val):
-        placeholder = "${ALPHA_NUM}"
-    else:
-        placeholder = "${UNKNOWN_TYPE}"
-    return placeholder
-
-
-REDUCE_MAP = {
-    "network.dynamic.uri": reduce_uri_tags,
-    "network.static.uri": reduce_uri_tags,
-    "network.dynamic.uri_path": reduce_uri_tags,
-    "network.static.uri_path": reduce_uri_tags
-}
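
Finally, a minimal sketch of the removed reduce_uri_tags behaviour; the URIs are illustrative and the exact output depends on the 0.80 similarity threshold above:

from assemblyline_v4_service.common.tag_reducer import reduce_uri_tags  # removed in 4.4.0.26

uris = [
    "http://bad.example.com/api/v1/download?session=1001",
    "http://bad.example.com/api/v1/download?session=1002",
    "http://bad.example.com/api/v1/download?session=1003",
]
# URIs that differ only in a numeric query parameter collapse into a single
# entry with the changing value replaced by a placeholder, roughly:
# ['http://bad.example.com/api/v1/download?session=${NUMBER}']
print(reduce_uri_tags(uris))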

The remaining dist-info files (LICENCE.md, WHEEL, top_level.txt) were renamed with the new version number but have no content changes.