txt2stix 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- txt2stix/__init__.py +33 -0
- txt2stix/ai_extractor/__init__.py +15 -0
- txt2stix/ai_extractor/anthropic.py +12 -0
- txt2stix/ai_extractor/base.py +87 -0
- txt2stix/ai_extractor/deepseek.py +19 -0
- txt2stix/ai_extractor/gemini.py +18 -0
- txt2stix/ai_extractor/openai.py +15 -0
- txt2stix/ai_extractor/openrouter.py +20 -0
- txt2stix/ai_extractor/prompts.py +164 -0
- txt2stix/ai_extractor/utils.py +85 -0
- txt2stix/attack_flow.py +101 -0
- txt2stix/bundler.py +428 -0
- txt2stix/common.py +23 -0
- txt2stix/extractions.py +59 -0
- txt2stix/includes/__init__.py +0 -0
- txt2stix/includes/extractions/ai/config.yaml +1023 -0
- txt2stix/includes/extractions/lookup/config.yaml +393 -0
- txt2stix/includes/extractions/pattern/config.yaml +609 -0
- txt2stix/includes/helpers/mimetype_filename_extension_list.csv +936 -0
- txt2stix/includes/helpers/stix_relationship_types.txt +41 -0
- txt2stix/includes/helpers/tlds.txt +1446 -0
- txt2stix/includes/helpers/windows_registry_key_prefix.txt +12 -0
- txt2stix/includes/lookups/_README.md +11 -0
- txt2stix/includes/lookups/_generate_lookups.py +247 -0
- txt2stix/includes/lookups/attack_pattern.txt +1 -0
- txt2stix/includes/lookups/campaign.txt +1 -0
- txt2stix/includes/lookups/country_iso3166_alpha2.txt +249 -0
- txt2stix/includes/lookups/course_of_action.txt +1 -0
- txt2stix/includes/lookups/disarm_id_v1_5.txt +345 -0
- txt2stix/includes/lookups/disarm_name_v1_5.txt +347 -0
- txt2stix/includes/lookups/extensions.txt +78 -0
- txt2stix/includes/lookups/identity.txt +1 -0
- txt2stix/includes/lookups/infrastructure.txt +1 -0
- txt2stix/includes/lookups/intrusion_set.txt +1 -0
- txt2stix/includes/lookups/malware.txt +2 -0
- txt2stix/includes/lookups/mitre_atlas_id_v4_5_2.txt +116 -0
- txt2stix/includes/lookups/mitre_atlas_name_v4_5_2.txt +117 -0
- txt2stix/includes/lookups/mitre_attack_enterprise_aliases_v16_0.txt +1502 -0
- txt2stix/includes/lookups/mitre_attack_enterprise_id_v16_0.txt +1656 -0
- txt2stix/includes/lookups/mitre_attack_enterprise_name_v16_0.txt +1765 -0
- txt2stix/includes/lookups/mitre_attack_ics_aliases_v16_0.txt +141 -0
- txt2stix/includes/lookups/mitre_attack_ics_id_v16_0.txt +254 -0
- txt2stix/includes/lookups/mitre_attack_ics_name_v16_0.txt +293 -0
- txt2stix/includes/lookups/mitre_attack_mobile_aliases_v16_0.txt +159 -0
- txt2stix/includes/lookups/mitre_attack_mobile_id_v16_0.txt +277 -0
- txt2stix/includes/lookups/mitre_attack_mobile_name_v16_0.txt +296 -0
- txt2stix/includes/lookups/mitre_capec_id_v3_9.txt +559 -0
- txt2stix/includes/lookups/mitre_capec_name_v3_9.txt +560 -0
- txt2stix/includes/lookups/mitre_cwe_id_v4_15.txt +939 -0
- txt2stix/includes/lookups/mitre_cwe_name_v4_15.txt +939 -0
- txt2stix/includes/lookups/threat_actor.txt +1 -0
- txt2stix/includes/lookups/tld.txt +1422 -0
- txt2stix/includes/lookups/tool.txt +1 -0
- txt2stix/includes/tests/test_cases.yaml +695 -0
- txt2stix/indicator.py +860 -0
- txt2stix/lookups.py +68 -0
- txt2stix/pattern/__init__.py +13 -0
- txt2stix/pattern/extractors/__init__.py +0 -0
- txt2stix/pattern/extractors/base_extractor.py +167 -0
- txt2stix/pattern/extractors/card/README.md +34 -0
- txt2stix/pattern/extractors/card/__init__.py +15 -0
- txt2stix/pattern/extractors/card/amex_card_extractor.py +52 -0
- txt2stix/pattern/extractors/card/diners_card_extractor.py +47 -0
- txt2stix/pattern/extractors/card/discover_card_extractor.py +48 -0
- txt2stix/pattern/extractors/card/jcb_card_extractor.py +43 -0
- txt2stix/pattern/extractors/card/master_card_extractor.py +63 -0
- txt2stix/pattern/extractors/card/union_card_extractor.py +38 -0
- txt2stix/pattern/extractors/card/visa_card_extractor.py +46 -0
- txt2stix/pattern/extractors/crypto/__init__.py +3 -0
- txt2stix/pattern/extractors/crypto/btc_extractor.py +38 -0
- txt2stix/pattern/extractors/directory/__init__.py +10 -0
- txt2stix/pattern/extractors/directory/unix_directory_extractor.py +40 -0
- txt2stix/pattern/extractors/directory/unix_file_path_extractor.py +42 -0
- txt2stix/pattern/extractors/directory/windows_directory_path_extractor.py +47 -0
- txt2stix/pattern/extractors/directory/windows_file_path_extractor.py +42 -0
- txt2stix/pattern/extractors/domain/__init__.py +8 -0
- txt2stix/pattern/extractors/domain/domain_extractor.py +39 -0
- txt2stix/pattern/extractors/domain/hostname_extractor.py +36 -0
- txt2stix/pattern/extractors/domain/sub_domain_extractor.py +49 -0
- txt2stix/pattern/extractors/hashes/__init__.py +16 -0
- txt2stix/pattern/extractors/hashes/md5_extractor.py +16 -0
- txt2stix/pattern/extractors/hashes/sha1_extractor.py +14 -0
- txt2stix/pattern/extractors/hashes/sha224_extractor.py +18 -0
- txt2stix/pattern/extractors/hashes/sha2_256_exactor.py +14 -0
- txt2stix/pattern/extractors/hashes/sha2_512_exactor.py +13 -0
- txt2stix/pattern/extractors/hashes/sha3_256_exactor.py +15 -0
- txt2stix/pattern/extractors/hashes/sha3_512_exactor.py +16 -0
- txt2stix/pattern/extractors/helper.py +64 -0
- txt2stix/pattern/extractors/ip/__init__.py +14 -0
- txt2stix/pattern/extractors/ip/ipv4_cidr_extractor.py +49 -0
- txt2stix/pattern/extractors/ip/ipv4_extractor.py +18 -0
- txt2stix/pattern/extractors/ip/ipv4_port_extractor.py +42 -0
- txt2stix/pattern/extractors/ip/ipv6_cidr_extractor.py +18 -0
- txt2stix/pattern/extractors/ip/ipv6_extractor.py +16 -0
- txt2stix/pattern/extractors/ip/ipv6_port_extractor.py +46 -0
- txt2stix/pattern/extractors/others/__init__.py +22 -0
- txt2stix/pattern/extractors/others/asn_extractor.py +14 -0
- txt2stix/pattern/extractors/others/cpe_extractor.py +29 -0
- txt2stix/pattern/extractors/others/cve_extractor.py +14 -0
- txt2stix/pattern/extractors/others/email_extractor.py +21 -0
- txt2stix/pattern/extractors/others/filename_extractor.py +17 -0
- txt2stix/pattern/extractors/others/iban_extractor.py +15 -0
- txt2stix/pattern/extractors/others/mac_address_extractor.py +13 -0
- txt2stix/pattern/extractors/others/phonenumber_extractor.py +41 -0
- txt2stix/pattern/extractors/others/user_agent_extractor.py +20 -0
- txt2stix/pattern/extractors/others/windows_registry_key_extractor.py +18 -0
- txt2stix/pattern/extractors/url/__init__.py +7 -0
- txt2stix/pattern/extractors/url/url_extractor.py +22 -0
- txt2stix/pattern/extractors/url/url_file_extractor.py +21 -0
- txt2stix/pattern/extractors/url/url_path_extractor.py +74 -0
- txt2stix/retriever.py +126 -0
- txt2stix/stix.py +1 -0
- txt2stix/txt2stix.py +336 -0
- txt2stix/utils.py +86 -0
- txt2stix-0.0.4.dist-info/METADATA +190 -0
- txt2stix-0.0.4.dist-info/RECORD +119 -0
- txt2stix-0.0.4.dist-info/WHEEL +4 -0
- txt2stix-0.0.4.dist-info/entry_points.txt +2 -0
- txt2stix-0.0.4.dist-info/licenses/LICENSE +202 -0
@@ -0,0 +1,38 @@
|
|
1
|
+
from validators import base58
|
2
|
+
from ..base_extractor import BaseExtractor
|
3
|
+
from base58 import b58decode
|
4
|
+
|
5
|
+
|
6
|
+
class CryptoBTCWalletExtractor(BaseExtractor):
    """
    Extractor for Bitcoin (BTC) wallet addresses.

    Attributes:
        name (str): Extractor identifier, "pattern_cryptocurrency_btc_wallet".
        extraction_regex (str): Anchored pattern: a "bc1", "1" or "3" prefix
            followed by 25-34 characters of the Base58 alphabet.
    """

    name = "pattern_cryptocurrency_btc_wallet"
    extraction_regex = "^(bc1|[13])[a-km-zA-HJ-NP-Z1-9]{25,34}$"

    @classmethod
    def filter_function(cls, value):
        """
        Report whether ``value`` can be Base58-decoded.

        Args:
            value (str): The candidate wallet address.

        Returns:
            bool: True if ``b58decode`` accepts the value, False if it raises.
        """
        try:
            b58decode(value)
        except Exception:
            # ``Exception`` instead of a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are never swallowed here.
            return False
        return True
|
22
|
+
|
23
|
+
class CryptoBTCWalletTransactionExtractor(CryptoBTCWalletExtractor):
    """
    BTC wallet extractor registered for the "wallet with transactions" extraction.

    Everything except ``name`` is inherited from CryptoBTCWalletExtractor.
    """

    name = "pattern_cryptocurrency_btc_wallet_transaction"
|
29
|
+
|
30
|
+
class CryptoBTCTransactionExtractor(BaseExtractor):
    """
    Extractor for Bitcoin (BTC) transaction hashes.

    Attributes:
        name (str): Extractor identifier, "pattern_cryptocurrency_btc_transaction".
        extraction_regex (str): Anchored pattern matching exactly 64
            hexadecimal characters (either case).
    """

    name = "pattern_cryptocurrency_btc_transaction"
    extraction_regex = "^[a-fA-F0-9]{64}$"
|
38
|
+
|
@@ -0,0 +1,10 @@
|
|
1
|
+
from .unix_directory_extractor import UnixDirectoryExtractor
|
2
|
+
from .unix_file_path_extractor import UnixFilePathExtractor
|
3
|
+
from .windows_directory_path_extractor import WindowDirectoryExtractor
|
4
|
+
from .windows_file_path_extractor import WindowsFilePathExtractor
|
5
|
+
|
6
|
+
# All directory / file-path extractor classes exported by this package.
DIRECTORY_EXTRACTORS = [
    UnixDirectoryExtractor,
    UnixFilePathExtractor,
    WindowDirectoryExtractor,
    WindowsFilePathExtractor,
]
|
@@ -0,0 +1,40 @@
|
|
1
|
+
from ..base_extractor import BaseExtractor
|
2
|
+
from pathvalidate import is_valid_filepath
|
3
|
+
|
4
|
+
|
5
|
+
class UnixDirectoryExtractor(BaseExtractor):
    """
    Extractor for Unix-style directory paths.

    Attributes:
        name (str): Extractor identifier, "pattern_directory_unix".
        extraction_function (function): Validates a candidate string by
            delegating to ``is_valid_directory``.
    """

    name = "pattern_directory_unix"
    extraction_function = lambda x: UnixDirectoryExtractor.is_valid_directory(x)

    @staticmethod
    def is_valid_directory(directory_path):
        """
        Check whether ``directory_path`` looks like a valid Unix directory path.

        Args:
            directory_path (str): The path to be checked.

        Returns:
            bool: True if the path contains "/", starts with ".", "/" or "~",
            has no dot in its last component, and passes
            ``is_valid_filepath`` for the Linux platform; False otherwise.
        """
        if "/" not in directory_path or not directory_path.startswith(('.', '/', '~')):
            return False

        # A dot in the last path component is taken to mean "file name with
        # extension", which this directory-only extractor must not accept.
        last_component = directory_path.rsplit('/', 1)[-1]
        if "." in last_component:
            return False

        # pathvalidate decides whether the path is legal on Linux.
        return is_valid_filepath(directory_path, platform="Linux")
|
@@ -0,0 +1,42 @@
|
|
1
|
+
from pathvalidate import is_valid_filepath
|
2
|
+
|
3
|
+
from ..base_extractor import BaseExtractor
|
4
|
+
from ..helper import validate_file_extension
|
5
|
+
|
6
|
+
|
7
|
+
class UnixFilePathExtractor(BaseExtractor):
    """
    Extractor for Unix-style file paths (a path ending in a recognised file name).

    Attributes:
        name (str): Extractor identifier, "pattern_directory_unix_file".
        ignore_list (list): URL-scheme fragments associated with this extractor
            (presumably used by BaseExtractor to skip URLs; confirm there).
        extraction_function (function): Validates a candidate string by
            delegating to ``is_valid_directory``.
    """

    name = "pattern_directory_unix_file"
    ignore_list = ["http://", "https://", "http[:]//", "http[://"]
    extraction_function = lambda x: UnixFilePathExtractor.is_valid_directory(x)

    @staticmethod
    def is_valid_directory(directory_path):
        """
        Check whether ``directory_path`` looks like a valid Unix file path.

        Args:
            directory_path (str): The path to be checked.

        Returns:
            bool: True if the path contains "/", starts with ".", "/" or "~",
            contains no backslash, is a legal Linux path according to
            ``is_valid_filepath`` and is accepted by
            ``validate_file_extension``; False otherwise (including when
            ``validate_file_extension`` raises).
        """
        looks_unix = (
            "/" in directory_path
            and directory_path[0] in ['.', '/', '~']
            and "\\" not in directory_path
        )
        if not looks_unix:
            return False

        # pathvalidate decides whether the path is legal on Linux.
        if not is_valid_filepath(directory_path, platform="Linux"):
            return False

        try:
            # Only paths whose file name is recognised count as file paths.
            return bool(validate_file_extension(directory_path))
        except Exception:
            return False
|
@@ -0,0 +1,47 @@
|
|
1
|
+
from pathvalidate import is_valid_filepath
|
2
|
+
from ..base_extractor import BaseExtractor
|
3
|
+
from ..helper import validate_file_extension
|
4
|
+
|
5
|
+
|
6
|
+
class WindowDirectoryExtractor(BaseExtractor):
    """
    Extractor for Windows-style directory paths.

    Attributes:
        name (str): Extractor identifier, "pattern_directory_windows".
        extraction_function (function): Validates a candidate string by
            delegating to ``is_valid_directory``.
    """

    name = "pattern_directory_windows"
    extraction_function = lambda x: WindowDirectoryExtractor.is_valid_directory(x)

    @staticmethod
    def is_valid_directory(directory_path):
        """
        Check whether ``directory_path`` looks like a valid Windows directory path.

        Args:
            directory_path (str): The path to be checked.

        Returns:
            bool: True if the path (after removing surrounding double quotes)
            starts with a C-O drive prefix, "..\\" or a backslash, or with a
            "%VAR%" segment, is legal on Windows according to
            ``is_valid_filepath`` and is NOT accepted by
            ``validate_file_extension``; False otherwise.
        """
        # A lone separator carries no path information.
        if directory_path in ('\\', '\\\\'):
            return False

        directory_path = directory_path.strip('"')

        letters = "CDEFGHIJKLMNO"
        known_prefixes = tuple(
            [f"{letter}:\\" for letter in letters]
            + [f"{letter}:" for letter in letters]
            + ['..\\', '\\', '\\\\']
        )

        # Text before the first backslash; "%NAME%" here marks an env-variable root.
        head = directory_path.partition('\\')[0]
        recognised = directory_path.startswith(known_prefixes) or (
            head.startswith('%') and head.endswith('%')
        )
        if not recognised:
            return False

        check = is_valid_filepath(directory_path, platform="Windows")
        names_a_file = validate_file_extension(directory_path)
        if check and not names_a_file:
            return check
        return False
|
@@ -0,0 +1,42 @@
|
|
1
|
+
from pathvalidate import is_valid_filepath
|
2
|
+
|
3
|
+
from txt2stix import utils
|
4
|
+
from ..base_extractor import BaseExtractor
|
5
|
+
from ..helper import validate_file_extension
|
6
|
+
|
7
|
+
|
8
|
+
class WindowsFilePathExtractor(BaseExtractor):
    """
    Extractor for Windows-style file paths (a path ending in a recognised file name).

    Attributes:
        name (str): Extractor identifier, "pattern_directory_windows_with_file".
        extraction_function (function): Validates a candidate string by
            delegating to ``is_valid_directory``.
    """

    name = "pattern_directory_windows_with_file"
    extraction_function = lambda x: WindowsFilePathExtractor.is_valid_directory(x)

    @staticmethod
    def is_valid_directory(directory_path):
        """
        Check whether ``directory_path`` looks like a valid Windows file path.

        Args:
            directory_path (str): The path to be checked.

        Returns:
            bool: True if the path (after removing surrounding double quotes)
            starts with a C-O drive prefix, "..\\" or a backslash, is legal on
            Windows according to ``is_valid_filepath`` and is accepted by
            ``utils.validate_file_mimetype``; False otherwise.
        """
        directory_path = directory_path.strip('"')

        letters = "CDEFGHIJKLMNO"
        known_prefixes = tuple(
            [f"{letter}:\\" for letter in letters]
            + [f"{letter}:" for letter in letters]
            + ['..\\', '\\', '\\\\']
        )
        if not directory_path.startswith(known_prefixes):
            return False

        check = is_valid_filepath(directory_path, platform="Windows")
        return bool(utils.validate_file_mimetype(directory_path) and check)
|
42
|
+
|
@@ -0,0 +1,39 @@
|
|
1
|
+
from tld import get_tld
|
2
|
+
|
3
|
+
from ..base_extractor import BaseExtractor
|
4
|
+
|
5
|
+
|
6
|
+
class DomainNameExtractor(BaseExtractor):
    """
    Extractor for bare domain names (no subdomain part).

    Attributes:
        name (str): Extractor identifier, "pattern_domain_name_only".
        extraction_regex (str): Regular expression matching dotted host-like
            strings ending in an alphanumeric or punycode ("xn--") label.
    """
    name = "pattern_domain_name_only"
    extraction_regex = r'(([\da-zA-Z])([_\w-]{,62})\.){,127}(([\da-zA-Z])[_\w-]{,61})?([\da-zA-Z]\.((xn\-\-[a-zA-Z\d]+)|([a-zA-Z\d]{2,})))'

    @staticmethod
    def filter_function(domain):
        """
        Accept ``domain`` only if it is a single label followed by a known TLD.

        Args:
            domain (str): The domain to be checked.

        Returns:
            bool: True if the domain has at most 2 dots, ``get_tld`` recognises
            its TLD, and nothing but one dot-free label precedes that TLD;
            False otherwise.
        """
        if domain.count('.') > 2:
            return False

        tld = get_tld(domain, fix_protocol=True, fail_silently=True)
        if not tld:
            return False

        # Remove the ".<tld>" SUFFIX only. ``str.strip`` would instead strip
        # every leading/trailing character found in ".<tld>", which turns
        # e.g. "com.example.com" into "example" and wrongly accepts it.
        suffix = f".{tld}"
        if domain.lower().endswith(suffix.lower()):
            domain_name = domain[:-len(suffix)]
        else:
            # TLD text does not literally terminate the candidate; leave the
            # value untouched so its dots keep it from being accepted.
            domain_name = domain
        return domain_name.count(".") == 0
|
35
|
+
|
36
|
+
|
37
|
+
# class HostNameExtractor(DomainNameExtractor):
|
38
|
+
# filter_function = None
|
39
|
+
# name = "pattern_host_name"
|
@@ -0,0 +1,36 @@
|
|
1
|
+
from tld import get_tld
|
2
|
+
|
3
|
+
from txt2stix.utils import validate_file_mimetype
|
4
|
+
from ..helper import TLDs
|
5
|
+
|
6
|
+
from ..base_extractor import BaseExtractor
|
7
|
+
|
8
|
+
|
9
|
+
class HostnameBaseExtractor(BaseExtractor):
    """
    Extractor for host names: dotted names whose ending is NOT a known TLD.

    Attributes:
        name (str): Extractor identifier, "pattern_host_name".
        extraction_regex (str): Regular expression matching dotted host-like
            strings ending in an alphanumeric or punycode ("xn--") label.
    """

    name = "pattern_host_name"
    extraction_regex = r'(([\da-zA-Z])([_\w-]{,62})\.){,127}(([\da-zA-Z])[_\w-]{,61})?([\da-zA-Z]\.((xn\-\-[a-zA-Z\d]+)|([a-zA-Z\d]{2,})))'

    @staticmethod
    def filter_function(domain):
        """
        Accept ``domain`` only if it is a short name without a recognised TLD.

        Args:
            domain (str): The candidate host name.

        Returns:
            bool: True if the value has at most one dot, ``get_tld`` finds no
            TLD in it, and ``validate_file_mimetype`` does not accept it as a
            file name; False otherwise.
        """
        if domain.count('.') > 1:
            return False
        if get_tld(domain, fix_protocol=True, fail_silently=True):
            # A recognised TLD makes this a domain, not a plain host name.
            return False
        return not validate_file_mimetype(domain)
|
@@ -0,0 +1,49 @@
|
|
1
|
+
from tld import get_tld
|
2
|
+
|
3
|
+
from txt2stix.utils import validate_file_mimetype
|
4
|
+
from ..helper import TLDs
|
5
|
+
|
6
|
+
from ..base_extractor import BaseExtractor
|
7
|
+
|
8
|
+
|
9
|
+
class SubDomainExtractor(BaseExtractor):
    """
    Extractor for domain names that include at least one subdomain label.

    Attributes:
        name (str): Extractor identifier, "pattern_domain_name_subdomain".
        extraction_regex (str): Regular expression matching dotted host-like
            strings ending in an alphanumeric or punycode ("xn--") label.
    """

    name = "pattern_domain_name_subdomain"
    extraction_regex = (
        r'(([\da-zA-Z])([_\w-]{,62})\.){,127}(([\da-zA-Z])[_\w-]{,61})?([\da-zA-Z]\.((xn\-\-['
        r'a-zA-Z\d]+)|([a-zA-Z\d]{2,})))'
    )

    @staticmethod
    def filter_function(domain):
        """
        Accept ``domain`` only if more than one label precedes a known TLD.

        Args:
            domain (str): The domain to be checked.

        Returns:
            bool: True if the domain has at least 2 dots, ``get_tld``
            recognises its TLD, and the part before that TLD still contains a
            dot; False otherwise.
        """
        if domain.count('.') < 2:
            return False

        tld = get_tld(domain, fix_protocol=True, fail_silently=True)
        if not tld:
            return False

        # Remove the ".<tld>" SUFFIX only. ``str.strip`` would instead strip
        # every leading/trailing character found in ".<tld>" from both ends,
        # which can eat whole labels (e.g. "com.co.com" collapses to "").
        suffix = f".{tld}"
        if domain.lower().endswith(suffix.lower()):
            domain_name = domain[:-len(suffix)]
        else:
            # TLD text does not literally terminate the candidate; judge the
            # untouched value instead of guessing where the TLD starts.
            domain_name = domain
        return domain_name.count(".") > 0
|
42
|
+
|
43
|
+
class HostNameSubdomainExtractor(SubDomainExtractor):
    """
    Extractor for multi-label host names whose ending is NOT a known TLD.

    Reuses the parent's ``extraction_regex`` and replaces its filter.
    """
    name = "pattern_host_name_subdomain"

    # The class previously also assigned a ``filter_function`` lambda right
    # before this method; the ``def`` rebound the same name, so the lambda
    # was dead code and has been dropped.
    @staticmethod
    def filter_function(domain):
        """
        Accept ``domain`` only if it has several labels and no recognised TLD.

        Args:
            domain (str): The candidate host name.

        Returns:
            bool: True if the value has at least 2 dots, ``get_tld`` finds no
            TLD in it, and ``validate_file_mimetype`` does not accept it as a
            file name; False otherwise.
        """
        tld = get_tld(domain, fail_silently=True, fix_protocol=True)
        return domain.count('.') >= 2 and not tld and not validate_file_mimetype(domain)
|
@@ -0,0 +1,16 @@
|
|
1
|
+
from .md5_extractor import FileHashMD5Extractor
|
2
|
+
from .sha1_extractor import FileHashSHA1Extractor
|
3
|
+
from .sha2_256_exactor import FileHashSHA2_256Extractor
|
4
|
+
from .sha2_512_exactor import FileHashSHA2_512Extractor
|
5
|
+
from .sha3_256_exactor import FileHashSHA3_256Extractor
|
6
|
+
from .sha3_512_exactor import FileHashSHA3_512Extractor
|
7
|
+
from .sha224_extractor import FileHashSHA224Extractor
|
8
|
+
|
9
|
+
# All file-hash extractor classes exported by this package (MD5 included,
# despite the "SHA" in the name).
SHA_EXTRACTORS = [
    FileHashMD5Extractor,
    FileHashSHA1Extractor,
    FileHashSHA224Extractor,
    FileHashSHA2_256Extractor,
    FileHashSHA2_512Extractor,
    FileHashSHA3_256Extractor,
    FileHashSHA3_512Extractor,
]
|
@@ -0,0 +1,16 @@
|
|
1
|
+
import validators
|
2
|
+
from ..base_extractor import BaseExtractor
|
3
|
+
|
4
|
+
|
5
|
+
class FileHashMD5Extractor(BaseExtractor):
    """
    Extractor for MD5 file hashes.

    Attributes:
        name (str): Extractor identifier, "pattern_file_hash_md5".
        extraction_function (function): Validates a candidate string by
            delegating to ``validators.md5``.
    """

    name = "pattern_file_hash_md5"
    extraction_function = lambda x: validators.md5(x)
|
16
|
+
|
@@ -0,0 +1,14 @@
|
|
1
|
+
import validators
|
2
|
+
from ..base_extractor import BaseExtractor
|
3
|
+
|
4
|
+
class FileHashSHA1Extractor(BaseExtractor):
    """
    Extractor for SHA-1 file hashes.

    Attributes:
        name (str): Extractor identifier, "pattern_file_hash_sha_1".
        extraction_function (function): Validates a candidate string by
            delegating to ``validators.sha1``.
    """

    name = "pattern_file_hash_sha_1"
    extraction_function = lambda x: validators.sha1(x)
|
@@ -0,0 +1,18 @@
|
|
1
|
+
import validators
|
2
|
+
from ..base_extractor import BaseExtractor
|
3
|
+
|
4
|
+
|
5
|
+
class FileHashSHA224Extractor(BaseExtractor):
    """
    Extractor for SHA-224 file hashes.

    Attributes:
        name (str): Extractor identifier, "pattern_file_hash_sha_224".
        extraction_regex (str): Anchored pattern matching exactly 56
            hexadecimal characters (either case).
        extraction_function (function): Validates a candidate string by
            delegating to ``validators.sha224``.
    """

    name = "pattern_file_hash_sha_224"
    extraction_regex = r'^[0-9a-fA-F]{56}$'
    extraction_function = lambda x: validators.sha224(x)
|
17
|
+
|
18
|
+
# not currently used as not supported by the STIX spec.
|
@@ -0,0 +1,14 @@
|
|
1
|
+
from ..base_extractor import BaseExtractor
|
2
|
+
|
3
|
+
|
4
|
+
class FileHashSHA2_256Extractor(BaseExtractor):
    """
    Extractor for SHA-256 (SHA-2 family) file hashes.

    Attributes:
        name (str): Extractor identifier, "pattern_file_hash_sha_256".
        extraction_regex (str): Anchored pattern matching exactly 64
            hexadecimal characters (either case).
    """

    name = "pattern_file_hash_sha_256"
    extraction_regex = r'^[0-9a-fA-F]{64}$'
|
@@ -0,0 +1,13 @@
|
|
1
|
+
from ..base_extractor import BaseExtractor
|
2
|
+
|
3
|
+
class FileHashSHA2_512Extractor(BaseExtractor):
    """
    Extractor for SHA-512 (SHA-2 family) file hashes.

    Attributes:
        name (str): Extractor identifier, "pattern_file_hash_sha_512".
        extraction_regex (str): Anchored pattern matching exactly 128
            hexadecimal characters (either case).
    """

    name = "pattern_file_hash_sha_512"
    extraction_regex = r'^[0-9a-fA-F]{128}$'
|
@@ -0,0 +1,15 @@
|
|
1
|
+
from ..base_extractor import BaseExtractor
|
2
|
+
|
3
|
+
|
4
|
+
class FileHashSHA3_256Extractor(BaseExtractor):
    """
    Extractor for SHA-3 (256-bit) file hashes.

    Attributes:
        name (str): Extractor identifier, "pattern_file_hash_sha3_256".
        extraction_regex (str): Anchored pattern matching exactly 64
            hexadecimal characters (either case).
    """

    name = "pattern_file_hash_sha3_256"
    extraction_regex = r'^[0-9a-fA-F]{64}$'
|
15
|
+
|
@@ -0,0 +1,16 @@
|
|
1
|
+
from ..base_extractor import BaseExtractor
|
2
|
+
|
3
|
+
|
4
|
+
class FileHashSHA3_512Extractor(BaseExtractor):
    """
    Extractor for SHA-3 (512-bit) file hashes.

    Attributes:
        name (str): Extractor identifier, "pattern_file_hash_sha3_512".
        extraction_regex (str): Anchored pattern matching exactly 128
            hexadecimal characters (either case).
    """

    name = "pattern_file_hash_sha3_512"
    extraction_regex = r'^[0-9a-fA-F]{128}$'
|
15
|
+
|
16
|
+
|
@@ -0,0 +1,64 @@
|
|
1
|
+
"""
|
2
|
+
Bunch of helper methods
|
3
|
+
"""
|
4
|
+
import csv
|
5
|
+
import logging
|
6
|
+
import sys
|
7
|
+
|
8
|
+
from .base_extractor import ALL_EXTRACTORS
|
9
|
+
|
10
|
+
from ...extractions import Extractor
|
11
|
+
from ...utils import FILE_EXTENSIONS, read_included_file, TLDs
|
12
|
+
|
13
|
+
def check_false_positive_domain(domain):
    """
    Tell whether a domain-looking string is a genuine domain rather than a file name.

    Args:
        domain (str): The domain name to be checked.

    Returns:
        bool: False if the text after the last dot is listed in
        FILE_EXTENSIONS (the value is then a false positive), True otherwise.
    """
    # Everything after the final "." (the whole string when there is no dot).
    ending = domain.rpartition(".")[2]
    return ending not in FILE_EXTENSIONS
|
28
|
+
|
29
|
+
from txt2stix.utils import validate_file_mimetype as validate_file_extension, validate_tld
|
30
|
+
|
31
|
+
def load_extractor(extractor):
    """
    Attach the matching pattern-extractor class to ``extractor`` if it has none yet.

    The class is looked up in ALL_EXTRACTORS by ``extractor.slug``, stored on
    ``extractor.pattern_extractor``, and given the extractor's ``version``
    and ``stix_mapping`` as class attributes.

    Args:
        extractor: Extraction config object exposing ``slug``, ``version``,
            ``stix_mapping`` and ``pattern_extractor``.

    Raises:
        TypeError: If no class is registered for ``extractor.slug``.
    """
    if extractor.pattern_extractor:
        # Already resolved on an earlier call; nothing to do.
        return

    extractor.pattern_extractor = ALL_EXTRACTORS.get(extractor.slug)
    pattern_cls = extractor.pattern_extractor
    if not pattern_cls:
        raise TypeError(f"could not find associated python class for pattern extractor `{extractor.slug}`")

    pattern_cls.version = extractor.version
    pattern_cls.stix_mapping = extractor.stix_mapping
|
39
|
+
|
40
|
+
|
41
|
+
def extract_all(extractors :list[Extractor], input_text, ignore_extraction_boundary=False):
    """
    Run every pattern extractor over ``input_text`` and merge the results.

    Args:
        extractors (list[Extractor]): Extraction configs; each is resolved to
            its pattern-extractor class via ``load_extractor``.
        input_text (str): The text to scan.
        ignore_extraction_boundary (bool): When True, keep matches even if
            they start before the end of the previously accepted match.

    Returns:
        list: One dict per distinct (type, value) pair, whose "start_index"
        is the list of positions at which that pair was accepted.
    """
    logging.info("using pattern extractors")
    pattern_extracts = []
    for extractor in extractors:
        load_extractor(extractor)
        pattern_extracts += extractor.pattern_extractor().extract_extraction_from_text(input_text)

    # Earliest match first; at the same position the shorter value comes first.
    pattern_extracts.sort(key=lambda ex: (ex['start_index'], len(ex['value'])))

    merged = {}
    boundary = 0
    for found in pattern_extracts:
        position = found['start_index']
        key = (found['type'], found['value'])
        if not ignore_extraction_boundary and position < boundary:
            # Starts inside the previously accepted match: skip it.
            continue
        entry = merged.setdefault(key, {**found, "start_index": [position]})
        if position not in entry['start_index']:
            entry['start_index'].append(position)
        # NOTE(review): indentation was lost in the rendering this was
        # recovered from; the boundary is assumed to advance only for
        # accepted matches - confirm against the packaged file.
        boundary = position + len(found['value'])
    return list(merged.values())
|
61
|
+
|
62
|
+
|
63
|
+
# FILE_EXTENSION = read_included_file('lookups/extensions.txt')
|
64
|
+
# TLD = read_included_file('lookups/tld.txt')
|
@@ -0,0 +1,14 @@
|
|
1
|
+
from .ipv4_extractor import IPv4Extractor
|
2
|
+
from .ipv4_port_extractor import IPv4WithPortExtractor
|
3
|
+
from .ipv4_cidr_extractor import IPv4WithCIDRExtractor
|
4
|
+
from .ipv6_extractor import IPv6Extractor
|
5
|
+
from .ipv6_port_extractor import IPv6WithPortExtractor
|
6
|
+
from .ipv6_cidr_extractor import IPv6WithCIDRExtractor
|
7
|
+
|
8
|
+
# All IPv4 / IPv6 extractor classes exported by this package.
IP_EXTRACTORS = [
    IPv4Extractor,
    IPv4WithPortExtractor,
    IPv4WithCIDRExtractor,
    IPv6Extractor,
    IPv6WithPortExtractor,
    IPv6WithCIDRExtractor,
]
|
@@ -0,0 +1,49 @@
|
|
1
|
+
from ..base_extractor import BaseExtractor
|
2
|
+
from ipaddress import IPv4Address
|
3
|
+
|
4
|
+
|
5
|
+
class IPv4WithCIDRExtractor(BaseExtractor):
    """
    Extractor for IPv4 addresses written in CIDR notation (e.g. "10.0.0.0/8").

    Attributes:
        name (str): Extractor identifier, "pattern_ipv4_address_cidr".
        extraction_function (function): Validates a candidate string by
            delegating to ``validate_ipv4_with_port``.
    """

    name = "pattern_ipv4_address_cidr"
    extraction_function = lambda x: IPv4WithCIDRExtractor.validate_ipv4_with_port(x)

    @staticmethod
    def validate_ipv4_with_port(x):
        """
        Validate that ``x`` is an IPv4 address followed by a CIDR prefix length.

        Despite its name (kept for existing callers), this method checks
        CIDR notation, not a port.

        Args:
            x (str): The string to be checked. Surrounding double quotes and
                a leading "http://" or "https://" are ignored.

        Returns:
            tuple | bool: ``(ip_address, cidr)`` as strings when the address
            parses as IPv4 and the prefix length is an integer in 0-32;
            False otherwise.
        """
        x = x.strip('"')

        # Drop a leading scheme as a PREFIX. ``str.strip("https://")`` removes
        # any of those characters from both ends instead, and the old second
        # branch re-tested "https://", so "http://" was never removed.
        for scheme in ("https://", "http://"):
            if x.startswith(scheme):
                x = x[len(scheme):]
                break

        # partition never raises on extra slashes; anything after the first
        # "/" that is not a plain integer is rejected by int() below.
        ip_address, separator, cidr = x.partition("/")
        if not separator:
            return False

        try:
            # Raises AddressValueError (a ValueError) for a malformed address.
            IPv4Address(ip_address)
            if 0 <= int(cidr) <= 32:
                return ip_address, cidr
        except ValueError:
            pass

        return False
|
@@ -0,0 +1,18 @@
|
|
1
|
+
import re
|
2
|
+
from ..base_extractor import BaseExtractor
|
3
|
+
from ipaddress import IPv4Interface
|
4
|
+
import validators
|
5
|
+
|
6
|
+
|
7
|
+
class IPv4Extractor(BaseExtractor):
    """
    Extractor for plain IPv4 addresses (no CIDR suffix).

    Attributes:
        name (str): Extractor identifier, "pattern_ipv4_address_only".
        extraction_function (function): Validates a candidate string by
            delegating to ``validators.ipv4`` with ``strict=True`` and
            ``cidr=False``.
    """

    name = "pattern_ipv4_address_only"
    extraction_function = lambda ipaddress: validators.ipv4(
        ipaddress,
        strict=True,
        cidr=False,
    )
|