txt2stix 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. txt2stix/__init__.py +33 -0
  2. txt2stix/ai_extractor/__init__.py +15 -0
  3. txt2stix/ai_extractor/anthropic.py +12 -0
  4. txt2stix/ai_extractor/base.py +87 -0
  5. txt2stix/ai_extractor/deepseek.py +19 -0
  6. txt2stix/ai_extractor/gemini.py +18 -0
  7. txt2stix/ai_extractor/openai.py +15 -0
  8. txt2stix/ai_extractor/openrouter.py +20 -0
  9. txt2stix/ai_extractor/prompts.py +164 -0
  10. txt2stix/ai_extractor/utils.py +85 -0
  11. txt2stix/attack_flow.py +101 -0
  12. txt2stix/bundler.py +428 -0
  13. txt2stix/common.py +23 -0
  14. txt2stix/extractions.py +59 -0
  15. txt2stix/includes/__init__.py +0 -0
  16. txt2stix/includes/extractions/ai/config.yaml +1023 -0
  17. txt2stix/includes/extractions/lookup/config.yaml +393 -0
  18. txt2stix/includes/extractions/pattern/config.yaml +609 -0
  19. txt2stix/includes/helpers/mimetype_filename_extension_list.csv +936 -0
  20. txt2stix/includes/helpers/stix_relationship_types.txt +41 -0
  21. txt2stix/includes/helpers/tlds.txt +1446 -0
  22. txt2stix/includes/helpers/windows_registry_key_prefix.txt +12 -0
  23. txt2stix/includes/lookups/_README.md +11 -0
  24. txt2stix/includes/lookups/_generate_lookups.py +247 -0
  25. txt2stix/includes/lookups/attack_pattern.txt +1 -0
  26. txt2stix/includes/lookups/campaign.txt +1 -0
  27. txt2stix/includes/lookups/country_iso3166_alpha2.txt +249 -0
  28. txt2stix/includes/lookups/course_of_action.txt +1 -0
  29. txt2stix/includes/lookups/disarm_id_v1_5.txt +345 -0
  30. txt2stix/includes/lookups/disarm_name_v1_5.txt +347 -0
  31. txt2stix/includes/lookups/extensions.txt +78 -0
  32. txt2stix/includes/lookups/identity.txt +1 -0
  33. txt2stix/includes/lookups/infrastructure.txt +1 -0
  34. txt2stix/includes/lookups/intrusion_set.txt +1 -0
  35. txt2stix/includes/lookups/malware.txt +2 -0
  36. txt2stix/includes/lookups/mitre_atlas_id_v4_5_2.txt +116 -0
  37. txt2stix/includes/lookups/mitre_atlas_name_v4_5_2.txt +117 -0
  38. txt2stix/includes/lookups/mitre_attack_enterprise_aliases_v16_0.txt +1502 -0
  39. txt2stix/includes/lookups/mitre_attack_enterprise_id_v16_0.txt +1656 -0
  40. txt2stix/includes/lookups/mitre_attack_enterprise_name_v16_0.txt +1765 -0
  41. txt2stix/includes/lookups/mitre_attack_ics_aliases_v16_0.txt +141 -0
  42. txt2stix/includes/lookups/mitre_attack_ics_id_v16_0.txt +254 -0
  43. txt2stix/includes/lookups/mitre_attack_ics_name_v16_0.txt +293 -0
  44. txt2stix/includes/lookups/mitre_attack_mobile_aliases_v16_0.txt +159 -0
  45. txt2stix/includes/lookups/mitre_attack_mobile_id_v16_0.txt +277 -0
  46. txt2stix/includes/lookups/mitre_attack_mobile_name_v16_0.txt +296 -0
  47. txt2stix/includes/lookups/mitre_capec_id_v3_9.txt +559 -0
  48. txt2stix/includes/lookups/mitre_capec_name_v3_9.txt +560 -0
  49. txt2stix/includes/lookups/mitre_cwe_id_v4_15.txt +939 -0
  50. txt2stix/includes/lookups/mitre_cwe_name_v4_15.txt +939 -0
  51. txt2stix/includes/lookups/threat_actor.txt +1 -0
  52. txt2stix/includes/lookups/tld.txt +1422 -0
  53. txt2stix/includes/lookups/tool.txt +1 -0
  54. txt2stix/includes/tests/test_cases.yaml +695 -0
  55. txt2stix/indicator.py +860 -0
  56. txt2stix/lookups.py +68 -0
  57. txt2stix/pattern/__init__.py +13 -0
  58. txt2stix/pattern/extractors/__init__.py +0 -0
  59. txt2stix/pattern/extractors/base_extractor.py +167 -0
  60. txt2stix/pattern/extractors/card/README.md +34 -0
  61. txt2stix/pattern/extractors/card/__init__.py +15 -0
  62. txt2stix/pattern/extractors/card/amex_card_extractor.py +52 -0
  63. txt2stix/pattern/extractors/card/diners_card_extractor.py +47 -0
  64. txt2stix/pattern/extractors/card/discover_card_extractor.py +48 -0
  65. txt2stix/pattern/extractors/card/jcb_card_extractor.py +43 -0
  66. txt2stix/pattern/extractors/card/master_card_extractor.py +63 -0
  67. txt2stix/pattern/extractors/card/union_card_extractor.py +38 -0
  68. txt2stix/pattern/extractors/card/visa_card_extractor.py +46 -0
  69. txt2stix/pattern/extractors/crypto/__init__.py +3 -0
  70. txt2stix/pattern/extractors/crypto/btc_extractor.py +38 -0
  71. txt2stix/pattern/extractors/directory/__init__.py +10 -0
  72. txt2stix/pattern/extractors/directory/unix_directory_extractor.py +40 -0
  73. txt2stix/pattern/extractors/directory/unix_file_path_extractor.py +42 -0
  74. txt2stix/pattern/extractors/directory/windows_directory_path_extractor.py +47 -0
  75. txt2stix/pattern/extractors/directory/windows_file_path_extractor.py +42 -0
  76. txt2stix/pattern/extractors/domain/__init__.py +8 -0
  77. txt2stix/pattern/extractors/domain/domain_extractor.py +39 -0
  78. txt2stix/pattern/extractors/domain/hostname_extractor.py +36 -0
  79. txt2stix/pattern/extractors/domain/sub_domain_extractor.py +49 -0
  80. txt2stix/pattern/extractors/hashes/__init__.py +16 -0
  81. txt2stix/pattern/extractors/hashes/md5_extractor.py +16 -0
  82. txt2stix/pattern/extractors/hashes/sha1_extractor.py +14 -0
  83. txt2stix/pattern/extractors/hashes/sha224_extractor.py +18 -0
  84. txt2stix/pattern/extractors/hashes/sha2_256_exactor.py +14 -0
  85. txt2stix/pattern/extractors/hashes/sha2_512_exactor.py +13 -0
  86. txt2stix/pattern/extractors/hashes/sha3_256_exactor.py +15 -0
  87. txt2stix/pattern/extractors/hashes/sha3_512_exactor.py +16 -0
  88. txt2stix/pattern/extractors/helper.py +64 -0
  89. txt2stix/pattern/extractors/ip/__init__.py +14 -0
  90. txt2stix/pattern/extractors/ip/ipv4_cidr_extractor.py +49 -0
  91. txt2stix/pattern/extractors/ip/ipv4_extractor.py +18 -0
  92. txt2stix/pattern/extractors/ip/ipv4_port_extractor.py +42 -0
  93. txt2stix/pattern/extractors/ip/ipv6_cidr_extractor.py +18 -0
  94. txt2stix/pattern/extractors/ip/ipv6_extractor.py +16 -0
  95. txt2stix/pattern/extractors/ip/ipv6_port_extractor.py +46 -0
  96. txt2stix/pattern/extractors/others/__init__.py +22 -0
  97. txt2stix/pattern/extractors/others/asn_extractor.py +14 -0
  98. txt2stix/pattern/extractors/others/cpe_extractor.py +29 -0
  99. txt2stix/pattern/extractors/others/cve_extractor.py +14 -0
  100. txt2stix/pattern/extractors/others/email_extractor.py +21 -0
  101. txt2stix/pattern/extractors/others/filename_extractor.py +17 -0
  102. txt2stix/pattern/extractors/others/iban_extractor.py +15 -0
  103. txt2stix/pattern/extractors/others/mac_address_extractor.py +13 -0
  104. txt2stix/pattern/extractors/others/phonenumber_extractor.py +41 -0
  105. txt2stix/pattern/extractors/others/user_agent_extractor.py +20 -0
  106. txt2stix/pattern/extractors/others/windows_registry_key_extractor.py +18 -0
  107. txt2stix/pattern/extractors/url/__init__.py +7 -0
  108. txt2stix/pattern/extractors/url/url_extractor.py +22 -0
  109. txt2stix/pattern/extractors/url/url_file_extractor.py +21 -0
  110. txt2stix/pattern/extractors/url/url_path_extractor.py +74 -0
  111. txt2stix/retriever.py +126 -0
  112. txt2stix/stix.py +1 -0
  113. txt2stix/txt2stix.py +336 -0
  114. txt2stix/utils.py +86 -0
  115. txt2stix-0.0.4.dist-info/METADATA +190 -0
  116. txt2stix-0.0.4.dist-info/RECORD +119 -0
  117. txt2stix-0.0.4.dist-info/WHEEL +4 -0
  118. txt2stix-0.0.4.dist-info/entry_points.txt +2 -0
  119. txt2stix-0.0.4.dist-info/licenses/LICENSE +202 -0
@@ -0,0 +1,38 @@
1
+ from validators import base58
2
+ from ..base_extractor import BaseExtractor
3
+ from base58 import b58decode
4
+
5
+
6
class CryptoBTCWalletExtractor(BaseExtractor):
    """
    Extractor for Bitcoin (BTC) wallet addresses.

    Attributes:
        name (str): The name of the extractor, set to "pattern_cryptocurrency_btc_wallet".
        extraction_regex (str): Anchored pattern: a "bc1", "1" or "3" prefix followed by
            25-34 characters from the Base58 alphabet (no "0", "O", "I" or "l").
    """

    name = "pattern_cryptocurrency_btc_wallet"
    extraction_regex = "^(bc1|[13])[a-km-zA-HJ-NP-Z1-9]{25,34}$"

    @classmethod
    def filter_function(cls, value):
        """
        Keep only candidates that can be Base58-decoded.

        Args:
            value (str): The candidate wallet address.

        Returns:
            bool: True if ``b58decode`` accepts the value, False otherwise.
        """
        try:
            b58decode(value)
        except Exception:
            # Any decoding failure means "not a wallet address". Catching
            # Exception (instead of a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            return False
        return True
22
+
23
class CryptoBTCWalletTransactionExtractor(CryptoBTCWalletExtractor):
    """
    Variant of CryptoBTCWalletExtractor registered under its own name.

    The regex and the Base58 filter are inherited unchanged; only ``name``
    differs.

    Attributes:
        name (str): The name of the extractor, set to
            "pattern_cryptocurrency_btc_wallet_transaction".
    """

    name = 'pattern_cryptocurrency_btc_wallet_transaction'
29
+
30
class CryptoBTCTransactionExtractor(BaseExtractor):
    """
    Extractor for Bitcoin (BTC) transaction hashes.

    Attributes:
        name (str): The name of the extractor, set to "pattern_cryptocurrency_btc_transaction".
        extraction_regex (str): Anchored pattern for exactly 64 hexadecimal characters.
    """

    name = 'pattern_cryptocurrency_btc_transaction'
    extraction_regex = '^[a-fA-F0-9]{64}$'
38
+
@@ -0,0 +1,10 @@
1
+ from .unix_directory_extractor import UnixDirectoryExtractor
2
+ from .unix_file_path_extractor import UnixFilePathExtractor
3
+ from .windows_directory_path_extractor import WindowDirectoryExtractor
4
+ from .windows_file_path_extractor import WindowsFilePathExtractor
5
+
6
# Extractor classes for Unix and Windows directory / file paths.
DIRECTORY_EXTRACTORS = [
    UnixDirectoryExtractor,
    UnixFilePathExtractor,
    WindowDirectoryExtractor,
    WindowsFilePathExtractor,
]
@@ -0,0 +1,40 @@
1
+ from ..base_extractor import BaseExtractor
2
+ from pathvalidate import is_valid_filepath
3
+
4
+
5
class UnixDirectoryExtractor(BaseExtractor):
    """
    A class for extracting valid Unix-style directory paths from text using a custom extraction function.

    Attributes:
        name (str): The name of the extractor, set to "pattern_directory_unix".
        extraction_function (function): Predicate applied to a candidate string; delegates to
            ``is_valid_directory``.
    """

    name = "pattern_directory_unix"
    extraction_function = lambda x: UnixDirectoryExtractor.is_valid_directory(x)

    @staticmethod
    def is_valid_directory(directory_path):
        """
        Check whether the provided path looks like a valid Unix directory path.

        Args:
            directory_path (str): The path to be checked.

        Returns:
            bool: True if the path contains "/", starts with ".", "/" or "~", has no dot in
            its last component, and passes ``is_valid_filepath`` for the Linux platform;
            False otherwise.
        """
        if "/" not in directory_path or directory_path[0] not in ('.', '/', '~'):
            return False

        # A dot in the last path component is treated as a file extension, so
        # the candidate is a file path rather than a directory.
        last_component = directory_path.rsplit('/', 1)[-1]
        if "." in last_component:
            return False

        # Using pathvalidate library to check if the path is valid for Linux platform.
        return is_valid_filepath(directory_path, platform="Linux")
@@ -0,0 +1,42 @@
1
+ from pathvalidate import is_valid_filepath
2
+
3
+ from ..base_extractor import BaseExtractor
4
+ from ..helper import validate_file_extension
5
+
6
+
7
class UnixFilePathExtractor(BaseExtractor):
    """
    Extractor for Unix-style file paths.

    Attributes:
        name (str): The name of the extractor, set to "pattern_directory_unix_file".
        ignore_list (list): URL scheme prefixes (plain and defanged) listed for exclusion.
        extraction_function (function): Predicate applied to a candidate string; delegates to
            ``is_valid_directory``.
    """

    name = "pattern_directory_unix_file"
    ignore_list = ["http://", "https://", "http[:]//", "http[://"]
    extraction_function = lambda x: UnixFilePathExtractor.is_valid_directory(x)

    @staticmethod
    def is_valid_directory(directory_path):
        """
        Check whether the provided path looks like a valid Unix file path.

        Args:
            directory_path (str): The path to be checked.

        Returns:
            bool: True if the path contains "/", starts with ".", "/" or "~", contains no
            backslash, is a valid Linux path, and passes ``validate_file_extension``;
            False otherwise (including when the extension check raises).
        """
        has_unix_shape = (
            "/" in directory_path
            and directory_path[0] in ('.', '/', '~')
            and "\\" not in directory_path
        )
        if not has_unix_shape:
            return False

        # pathvalidate decides whether the string is a legal Linux path at all.
        if not is_valid_filepath(directory_path, platform="Linux"):
            return False

        try:
            # Only paths accepted by the extension check count as files.
            if validate_file_extension(directory_path):
                return True
        except Exception:
            return False
        return False
@@ -0,0 +1,47 @@
1
+ from pathvalidate import is_valid_filepath
2
+ from ..base_extractor import BaseExtractor
3
+ from ..helper import validate_file_extension
4
+
5
+
6
class WindowDirectoryExtractor(BaseExtractor):
    """
    Extractor for Windows-style directory paths.

    Attributes:
        name (str): The name of the extractor, set to "pattern_directory_windows".
        extraction_function (function): Predicate applied to a candidate string; delegates to
            ``is_valid_directory``.
    """

    name = "pattern_directory_windows"
    extraction_function = lambda x: WindowDirectoryExtractor.is_valid_directory(x)

    @staticmethod
    def is_valid_directory(directory_path):
        """
        Check whether the provided path looks like a valid Windows directory path.

        Args:
            directory_path (str): The path to be checked.

        Returns:
            bool: True if the path has a recognised prefix (drive letter C-O, "..\\", "\\",
            or a leading %VAR% component), is a valid Windows path, and does not pass
            ``validate_file_extension``; False otherwise.
        """
        if directory_path in ('\\', '\\\\'):
            return False

        candidate = directory_path.strip('"')

        # "X:" also matches "X:\\..." and "\\" also matches "\\\\...", so the
        # shorter forms are sufficient for the prefix test.
        known_prefixes = tuple(letter + ":" for letter in "CDEFGHIJKLMNO") + ('..\\', '\\')

        # First path component, e.g. "%APPDATA%" in "%APPDATA%\\foo".
        head = candidate.partition('\\')[0]
        is_env_rooted = head.startswith('%') and head.endswith('%')

        if not (candidate.startswith(known_prefixes) or is_env_rooted):
            return False

        check = is_valid_filepath(candidate, platform="Windows")
        if not validate_file_extension(candidate) and check:
            return check
        return False
@@ -0,0 +1,42 @@
1
+ from pathvalidate import is_valid_filepath
2
+
3
+ from txt2stix import utils
4
+ from ..base_extractor import BaseExtractor
5
+ from ..helper import validate_file_extension
6
+
7
+
8
class WindowsFilePathExtractor(BaseExtractor):
    """
    Extractor for Windows-style file paths.

    Attributes:
        name (str): The name of the extractor, set to "pattern_directory_windows_with_file".
        extraction_function (function): Predicate applied to a candidate string; delegates to
            ``is_valid_directory``.
    """

    name = "pattern_directory_windows_with_file"
    extraction_function = lambda x: WindowsFilePathExtractor.is_valid_directory(x)

    @staticmethod
    def is_valid_directory(directory_path):
        """
        Check whether the provided path looks like a valid Windows file path.

        Args:
            directory_path (str): The path to be checked.

        Returns:
            bool: True if the path has a recognised prefix (drive letter C-O, "..\\" or
            "\\"), is a valid Windows path, and passes ``utils.validate_file_mimetype``;
            False otherwise.
        """
        candidate = directory_path.strip('"')

        # "X:" also matches "X:\\..." and "\\" also matches "\\\\...", so the
        # shorter forms are sufficient for the prefix test.
        known_prefixes = tuple(letter + ":" for letter in "CDEFGHIJKLMNO") + ('..\\', '\\')
        if not candidate.startswith(known_prefixes):
            return False

        check = is_valid_filepath(candidate, platform="Windows")
        if utils.validate_file_mimetype(candidate) and check:
            return True
        return False
42
+
@@ -0,0 +1,8 @@
1
+ from .domain_extractor import DomainNameExtractor
2
+ from .sub_domain_extractor import SubDomainExtractor
3
+ from .hostname_extractor import HostnameBaseExtractor
4
+
5
# Extractor classes for domain names, subdomains and hostnames.
DOMAIN_EXTRACTORS = [
    DomainNameExtractor,
    SubDomainExtractor,
    HostnameBaseExtractor,
]
@@ -0,0 +1,39 @@
1
+ from tld import get_tld
2
+
3
+ from ..base_extractor import BaseExtractor
4
+
5
+
6
class DomainNameExtractor(BaseExtractor):
    """
    A class for extracting bare domain names (no subdomain) from text.

    Attributes:
        name (str): The name of the extractor, set to "pattern_domain_name_only".
        extraction_regex (str): Regular expression matching dot-separated labels that end in
            an alphanumeric or punycode ("xn--") top-level label.
    """
    name = "pattern_domain_name_only"
    extraction_regex = r'(([\da-zA-Z])([_\w-]{,62})\.){,127}(([\da-zA-Z])[_\w-]{,61})?([\da-zA-Z]\.((xn\-\-[a-zA-Z\d]+)|([a-zA-Z\d]{2,})))'

    @staticmethod
    def filter_function(domain):
        """
        Accept only domains that consist of a single label plus a known TLD.

        Args:
            domain (str): The domain to be checked.

        Returns:
            bool: True if the domain has at most 2 dots, ``get_tld`` recognises its TLD, and
            nothing but a single label remains once the ".<tld>" suffix is removed;
            False otherwise.
        """
        if domain.count('.') > 2:
            return False

        tld = get_tld(domain, fix_protocol=True, fail_silently=True)
        if not tld:
            return False

        # Remove the TLD as a *suffix*. str.strip() would treat ".<tld>" as a
        # set of characters and also eat matching characters from the front
        # (e.g. "com.example.com" -> "example"), misclassifying subdomains.
        suffix = f".{tld}"
        if domain.lower().endswith(suffix.lower()):
            domain_name = domain[:-len(suffix)]
        else:
            domain_name = domain

        return domain_name.count(".") == 0
35
+
36
+
37
+ # class HostNameExtractor(DomainNameExtractor):
38
+ # filter_function = None
39
+ # name = "pattern_host_name"
@@ -0,0 +1,36 @@
1
+ from tld import get_tld
2
+
3
+ from txt2stix.utils import validate_file_mimetype
4
+ from ..helper import TLDs
5
+
6
+ from ..base_extractor import BaseExtractor
7
+
8
+
9
class HostnameBaseExtractor(BaseExtractor):
    """
    Extractor for hostnames, i.e. dotted names whose ending is not a recognised TLD.

    Attributes:
        name (str): The name of the extractor, set to "pattern_host_name".
        extraction_regex (str): Regular expression matching dot-separated labels that end in
            an alphanumeric or punycode ("xn--") top-level label.
    """

    name = "pattern_host_name"
    extraction_regex = r'(([\da-zA-Z])([_\w-]{,62})\.){,127}(([\da-zA-Z])[_\w-]{,61})?([\da-zA-Z]\.((xn\-\-[a-zA-Z\d]+)|([a-zA-Z\d]{2,})))'

    @staticmethod
    def filter_function(domain):
        """
        Accept candidates with at most one dot that are neither a real domain nor a file name.

        Args:
            domain (str): The candidate hostname to be checked.

        Returns:
            bool: True if the candidate has at most 1 dot, ``get_tld`` finds no TLD for it,
            and ``validate_file_mimetype`` rejects it; False otherwise.
        """
        if domain.count('.') > 1:
            return False
        if get_tld(domain, fix_protocol=True, fail_silently=True):
            # A recognised TLD means this is a domain, not a plain hostname.
            return False
        return not validate_file_mimetype(domain)
@@ -0,0 +1,49 @@
1
+ from tld import get_tld
2
+
3
+ from txt2stix.utils import validate_file_mimetype
4
+ from ..helper import TLDs
5
+
6
+ from ..base_extractor import BaseExtractor
7
+
8
+
9
class SubDomainExtractor(BaseExtractor):
    """
    A class for extracting subdomains (a domain with at least one extra leading label) from text.

    This class inherits from BaseExtractor.

    Attributes:
        name (str): The name of the extractor, set to "pattern_domain_name_subdomain".
        extraction_regex (str): Regular expression matching dot-separated labels that end in
            an alphanumeric or punycode ("xn--") top-level label.
    """

    name = "pattern_domain_name_subdomain"
    extraction_regex = (
        r'(([\da-zA-Z])([_\w-]{,62})\.){,127}(([\da-zA-Z])[_\w-]{,61})?([\da-zA-Z]\.((xn\-\-['
        r'a-zA-Z\d]+)|([a-zA-Z\d]{2,})))'
    )

    @staticmethod
    def filter_function(domain):
        """
        Accept only domains that keep more than one label once the TLD is removed.

        Args:
            domain (str): The domain to be checked.

        Returns:
            bool: True if the domain has at least 2 dots, ``get_tld`` recognises its TLD, and
            at least one dot remains once the ".<tld>" suffix is removed; False otherwise.
        """
        if domain.count('.') < 2:
            return False

        tld = get_tld(domain, fix_protocol=True, fail_silently=True)
        if not tld:
            return False

        # Remove the TLD as a *suffix*. str.strip() would treat ".<tld>" as a
        # set of characters and also eat matching characters from the front
        # (e.g. "moc.example.com" -> "example"), rejecting real subdomains.
        suffix = f".{tld}"
        if domain.lower().endswith(suffix.lower()):
            domain_name = domain[:-len(suffix)]
        else:
            domain_name = domain

        return domain_name.count(".") > 0
42
+
43
class HostNameSubdomainExtractor(SubDomainExtractor):
    """
    Extractor for multi-label hostnames whose ending is not a recognised TLD.

    Attributes:
        name (str): The name of the extractor, set to "pattern_host_name_subdomain".
    """

    name = "pattern_host_name_subdomain"

    # The lambda previously assigned to ``filter_function`` here was dead code:
    # the ``def`` below rebound the same name immediately afterwards.
    @staticmethod
    def filter_function(domain):
        """
        Accept candidates with at least two dots that are neither a real domain nor a file name.

        Declared as a staticmethod, like the filter on the parent class, so the
        call works whether it is looked up on the class or on an instance.

        Args:
            domain (str): The candidate hostname to be checked.

        Returns:
            bool: True if the candidate has at least 2 dots, ``get_tld`` finds no TLD for it,
            and ``validate_file_mimetype`` rejects it; False otherwise.
        """
        tld = get_tld(domain, fail_silently=True, fix_protocol=True)
        return domain.count('.') >= 2 and not tld and not validate_file_mimetype(domain)
@@ -0,0 +1,16 @@
1
+ from .md5_extractor import FileHashMD5Extractor
2
+ from .sha1_extractor import FileHashSHA1Extractor
3
+ from .sha2_256_exactor import FileHashSHA2_256Extractor
4
+ from .sha2_512_exactor import FileHashSHA2_512Extractor
5
+ from .sha3_256_exactor import FileHashSHA3_256Extractor
6
+ from .sha3_512_exactor import FileHashSHA3_512Extractor
7
+ from .sha224_extractor import FileHashSHA224Extractor
8
+
9
# File-hash extractor classes (MD5 is included alongside the SHA family).
SHA_EXTRACTORS = [
    FileHashMD5Extractor,
    FileHashSHA1Extractor,
    FileHashSHA224Extractor,
    FileHashSHA2_256Extractor,
    FileHashSHA2_512Extractor,
    FileHashSHA3_256Extractor,
    FileHashSHA3_512Extractor,
]
@@ -0,0 +1,16 @@
1
+ import validators
2
+ from ..base_extractor import BaseExtractor
3
+
4
+
5
class FileHashMD5Extractor(BaseExtractor):
    """
    Extractor for MD5 file hash values.

    Attributes:
        name (str): The name of the extractor, set to "pattern_file_hash_md5".
        extraction_function (function): Predicate applied to a candidate string; delegates to
            ``validators.md5``.
    """

    name = 'pattern_file_hash_md5'
    extraction_function = lambda x: validators.md5(x)
16
+
@@ -0,0 +1,14 @@
1
+ import validators
2
+ from ..base_extractor import BaseExtractor
3
+
4
class FileHashSHA1Extractor(BaseExtractor):
    """
    Extractor for SHA-1 file hash values.

    Attributes:
        name (str): The name of the extractor, set to "pattern_file_hash_sha_1".
        extraction_function (function): Predicate applied to a candidate string; delegates to
            ``validators.sha1``.
    """

    name = 'pattern_file_hash_sha_1'
    extraction_function = lambda x: validators.sha1(x)
@@ -0,0 +1,18 @@
1
+ import validators
2
+ from ..base_extractor import BaseExtractor
3
+
4
+
5
class FileHashSHA224Extractor(BaseExtractor):
    """
    Extractor for SHA-224 file hash values.

    Attributes:
        name (str): The name of the extractor, set to "pattern_file_hash_sha_224".
        extraction_regex (str): Anchored pattern for exactly 56 hexadecimal characters.
        extraction_function (function): Predicate applied to a candidate string; delegates to
            ``validators.sha224``.
    """

    name = 'pattern_file_hash_sha_224'
    extraction_regex = r'^[0-9a-fA-F]{56}$'
    extraction_function = lambda x: validators.sha224(x)
17
+
18
+ # not currently used as not supported by the STIX spec.
@@ -0,0 +1,14 @@
1
+ from ..base_extractor import BaseExtractor
2
+
3
+
4
class FileHashSHA2_256Extractor(BaseExtractor):
    """
    Extractor for SHA-256 file hash values.

    Attributes:
        name (str): The name of the extractor, set to "pattern_file_hash_sha_256".
        extraction_regex (str): Anchored pattern for exactly 64 hexadecimal characters.
    """

    name = 'pattern_file_hash_sha_256'
    extraction_regex = r'^[0-9a-fA-F]{64}$'
@@ -0,0 +1,13 @@
1
+ from ..base_extractor import BaseExtractor
2
+
3
class FileHashSHA2_512Extractor(BaseExtractor):
    """
    Extractor for SHA-512 file hash values.

    Attributes:
        name (str): The name of the extractor, set to "pattern_file_hash_sha_512".
        extraction_regex (str): Anchored pattern for exactly 128 hexadecimal characters.
    """

    name = 'pattern_file_hash_sha_512'
    extraction_regex = r'^[0-9a-fA-F]{128}$'
@@ -0,0 +1,15 @@
1
+ from ..base_extractor import BaseExtractor
2
+
3
+
4
class FileHashSHA3_256Extractor(BaseExtractor):
    """
    Extractor for SHA-3 (256-bit) file hash values.

    Attributes:
        name (str): The name of the extractor, set to "pattern_file_hash_sha3_256".
        extraction_regex (str): Anchored pattern for exactly 64 hexadecimal characters.
    """

    name = 'pattern_file_hash_sha3_256'
    extraction_regex = r'^[0-9a-fA-F]{64}$'
15
+
@@ -0,0 +1,16 @@
1
+ from ..base_extractor import BaseExtractor
2
+
3
+
4
class FileHashSHA3_512Extractor(BaseExtractor):
    """
    Extractor for SHA-3 (512-bit) file hash values.

    Attributes:
        name (str): The name of the extractor, set to "pattern_file_hash_sha3_512".
        extraction_regex (str): Anchored pattern for exactly 128 hexadecimal characters.
    """

    name = 'pattern_file_hash_sha3_512'
    extraction_regex = r'^[0-9a-fA-F]{128}$'
15
+
16
+
@@ -0,0 +1,64 @@
1
+ """
2
+ Bunch of helper methods
3
+ """
4
+ import csv
5
+ import logging
6
+ import sys
7
+
8
+ from .base_extractor import ALL_EXTRACTORS
9
+
10
+ from ...extractions import Extractor
11
+ from ...utils import FILE_EXTENSIONS, read_included_file, TLDs
12
+
13
def check_false_positive_domain(domain):
    """
    Tell whether a domain-like string is NOT a file name in disguise.

    The text after the last dot is compared against FILE_EXTENSIONS.

    Args:
        domain (str): The domain name to be checked.

    Returns:
        bool: False if the last dot-separated part is a known file extension, True otherwise.
    """
    # rpartition yields the whole string as the tail when there is no dot,
    # which is the same value split(".")[-1] gives.
    tail = domain.rpartition(".")[2]
    return tail not in FILE_EXTENSIONS
28
+
29
+ from txt2stix.utils import validate_file_mimetype as validate_file_extension, validate_tld
30
+
31
def load_extractor(extractor):
    """
    Attach the matching pattern-extractor class to ``extractor`` if it has none yet.

    The class is looked up in ALL_EXTRACTORS by ``extractor.slug`` and receives the
    extractor's ``version`` and ``stix_mapping`` as class attributes.

    Args:
        extractor: Object exposing ``pattern_extractor``, ``slug``, ``version`` and
            ``stix_mapping`` attributes.

    Raises:
        TypeError: If no class is registered for ``extractor.slug``.
    """
    if not extractor.pattern_extractor:
        extractor.pattern_extractor = ALL_EXTRACTORS.get(extractor.slug)
        extractor_cls = extractor.pattern_extractor
        if not extractor_cls:
            raise TypeError(f"could not find associated python class for pattern extractor `{extractor.slug}`")
        extractor_cls.version = extractor.version
        extractor_cls.stix_mapping = extractor.stix_mapping
39
+
40
+
41
def extract_all(extractors :list[Extractor], input_text, ignore_extraction_boundary=False):
    """
    Run every pattern extractor over ``input_text`` and merge the results.

    Args:
        extractors (list[Extractor]): Extractors to run; each is passed through
            ``load_extractor`` first.
        input_text (str): The text to extract from.
        ignore_extraction_boundary (bool): When False, an extract is kept only if it starts at
            or after the end of the previously kept extract. When True, all extracts are kept.

    Returns:
        list: One dict per distinct (type, value) pair, copied from the raw extract with
        ``start_index`` replaced by the list of start positions at which it was kept.
    """
    logging.info("using pattern extractors")

    raw_extracts = []
    for extractor in extractors:
        load_extractor(extractor)
        raw_extracts += extractor.pattern_extractor().extract_extraction_from_text(input_text)

    # Earliest start first; among equal starts, shortest value first.
    ordered = sorted(raw_extracts, key=lambda ex: (ex['start_index'], len(ex['value'])))

    merged = {}
    end = 0
    for raw in ordered:
        position = raw['start_index']
        if not ignore_extraction_boundary and position < end:
            # Starts inside the previously kept extract.
            continue

        key = (raw['type'], raw['value'])
        if key not in merged:
            merged[key] = {**raw, "start_index": [position]}
        elif position not in merged[key]['start_index']:
            merged[key]['start_index'].append(position)
        end = position + len(raw['value'])

    return list(merged.values())
61
+
62
+
63
+ # FILE_EXTENSION = read_included_file('lookups/extensions.txt')
64
+ # TLD = read_included_file('lookups/tld.txt')
@@ -0,0 +1,14 @@
1
+ from .ipv4_extractor import IPv4Extractor
2
+ from .ipv4_port_extractor import IPv4WithPortExtractor
3
+ from .ipv4_cidr_extractor import IPv4WithCIDRExtractor
4
+ from .ipv6_extractor import IPv6Extractor
5
+ from .ipv6_port_extractor import IPv6WithPortExtractor
6
+ from .ipv6_cidr_extractor import IPv6WithCIDRExtractor
7
+
8
# Extractor classes for IPv4 / IPv6 addresses, with optional port or CIDR suffix.
IP_EXTRACTORS = [
    IPv4Extractor,
    IPv4WithPortExtractor,
    IPv4WithCIDRExtractor,
    IPv6Extractor,
    IPv6WithPortExtractor,
    IPv6WithCIDRExtractor,
]
@@ -0,0 +1,49 @@
1
+ from ..base_extractor import BaseExtractor
2
+ from ipaddress import IPv4Address
3
+
4
+
5
class IPv4WithCIDRExtractor(BaseExtractor):
    """
    A class for extracting valid IPv4 addresses with CIDR notation from text using a custom extraction function.

    Attributes:
        name (str): The name of the extractor, set to "pattern_ipv4_address_cidr".
        extraction_function (function): Predicate applied to a candidate string; delegates to
            ``validate_ipv4_with_port``.
    """

    name = "pattern_ipv4_address_cidr"
    extraction_function = lambda x: IPv4WithCIDRExtractor.validate_ipv4_with_port(x)

    @staticmethod
    def validate_ipv4_with_port(x):
        """
        Validate that the provided string is an IPv4 address with a CIDR prefix length.

        Surrounding double quotes and a leading "https://" or "http://" scheme are removed
        before validation.

        Args:
            x (str): The string to be checked.

        Returns:
            tuple | bool: ``(ip_address, cidr)`` as strings if the address parses as IPv4 and
            the prefix length is an integer in 0..32, False otherwise.
        """
        x = x.strip('"')

        # Remove the scheme as a *prefix*. str.strip() would treat the scheme
        # as a set of characters, and the old second check tested for
        # "https://" again, so "http://" was never removed.
        for scheme in ("https://", "http://"):
            if x.startswith(scheme):
                # Trailing slashes were tolerated by the old strip() call for
                # URL-form input; keep accepting them.
                x = x[len(scheme):].rstrip("/")
                break

        # partition() never raises, unlike unpacking x.split("/"), which
        # failed with ValueError on inputs containing two or more slashes.
        ip_address, separator, cidr = x.partition("/")
        if not separator:
            return False

        try:
            # Validate the IPv4 address part.
            IPv4Address(ip_address)

            # Validate the CIDR part; extra slashes make int() raise ValueError.
            if 0 <= int(cidr) <= 32:
                return ip_address, cidr
        except ValueError:
            pass

        return False
@@ -0,0 +1,18 @@
1
+ import re
2
+ from ..base_extractor import BaseExtractor
3
+ from ipaddress import IPv4Interface
4
+ import validators
5
+
6
+
7
class IPv4Extractor(BaseExtractor):
    """
    Extractor for plain IPv4 addresses (no CIDR suffix).

    Attributes:
        name (str): The name of the extractor, set to "pattern_ipv4_address_only".
        extraction_function (function): Predicate applied to a candidate string; delegates to
            ``validators.ipv4`` with ``strict=True`` and ``cidr=False``.
    """

    name = 'pattern_ipv4_address_only'
    extraction_function = lambda ipaddress: validators.ipv4(ipaddress, strict=True, cidr=False)