txt2stix 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. txt2stix/__init__.py +33 -0
  2. txt2stix/ai_extractor/__init__.py +15 -0
  3. txt2stix/ai_extractor/anthropic.py +12 -0
  4. txt2stix/ai_extractor/base.py +87 -0
  5. txt2stix/ai_extractor/deepseek.py +19 -0
  6. txt2stix/ai_extractor/gemini.py +18 -0
  7. txt2stix/ai_extractor/openai.py +15 -0
  8. txt2stix/ai_extractor/openrouter.py +20 -0
  9. txt2stix/ai_extractor/prompts.py +164 -0
  10. txt2stix/ai_extractor/utils.py +85 -0
  11. txt2stix/attack_flow.py +101 -0
  12. txt2stix/bundler.py +428 -0
  13. txt2stix/common.py +23 -0
  14. txt2stix/extractions.py +59 -0
  15. txt2stix/includes/__init__.py +0 -0
  16. txt2stix/includes/extractions/ai/config.yaml +1023 -0
  17. txt2stix/includes/extractions/lookup/config.yaml +393 -0
  18. txt2stix/includes/extractions/pattern/config.yaml +609 -0
  19. txt2stix/includes/helpers/mimetype_filename_extension_list.csv +936 -0
  20. txt2stix/includes/helpers/stix_relationship_types.txt +41 -0
  21. txt2stix/includes/helpers/tlds.txt +1446 -0
  22. txt2stix/includes/helpers/windows_registry_key_prefix.txt +12 -0
  23. txt2stix/includes/lookups/_README.md +11 -0
  24. txt2stix/includes/lookups/_generate_lookups.py +247 -0
  25. txt2stix/includes/lookups/attack_pattern.txt +1 -0
  26. txt2stix/includes/lookups/campaign.txt +1 -0
  27. txt2stix/includes/lookups/country_iso3166_alpha2.txt +249 -0
  28. txt2stix/includes/lookups/course_of_action.txt +1 -0
  29. txt2stix/includes/lookups/disarm_id_v1_5.txt +345 -0
  30. txt2stix/includes/lookups/disarm_name_v1_5.txt +347 -0
  31. txt2stix/includes/lookups/extensions.txt +78 -0
  32. txt2stix/includes/lookups/identity.txt +1 -0
  33. txt2stix/includes/lookups/infrastructure.txt +1 -0
  34. txt2stix/includes/lookups/intrusion_set.txt +1 -0
  35. txt2stix/includes/lookups/malware.txt +2 -0
  36. txt2stix/includes/lookups/mitre_atlas_id_v4_5_2.txt +116 -0
  37. txt2stix/includes/lookups/mitre_atlas_name_v4_5_2.txt +117 -0
  38. txt2stix/includes/lookups/mitre_attack_enterprise_aliases_v16_0.txt +1502 -0
  39. txt2stix/includes/lookups/mitre_attack_enterprise_id_v16_0.txt +1656 -0
  40. txt2stix/includes/lookups/mitre_attack_enterprise_name_v16_0.txt +1765 -0
  41. txt2stix/includes/lookups/mitre_attack_ics_aliases_v16_0.txt +141 -0
  42. txt2stix/includes/lookups/mitre_attack_ics_id_v16_0.txt +254 -0
  43. txt2stix/includes/lookups/mitre_attack_ics_name_v16_0.txt +293 -0
  44. txt2stix/includes/lookups/mitre_attack_mobile_aliases_v16_0.txt +159 -0
  45. txt2stix/includes/lookups/mitre_attack_mobile_id_v16_0.txt +277 -0
  46. txt2stix/includes/lookups/mitre_attack_mobile_name_v16_0.txt +296 -0
  47. txt2stix/includes/lookups/mitre_capec_id_v3_9.txt +559 -0
  48. txt2stix/includes/lookups/mitre_capec_name_v3_9.txt +560 -0
  49. txt2stix/includes/lookups/mitre_cwe_id_v4_15.txt +939 -0
  50. txt2stix/includes/lookups/mitre_cwe_name_v4_15.txt +939 -0
  51. txt2stix/includes/lookups/threat_actor.txt +1 -0
  52. txt2stix/includes/lookups/tld.txt +1422 -0
  53. txt2stix/includes/lookups/tool.txt +1 -0
  54. txt2stix/includes/tests/test_cases.yaml +695 -0
  55. txt2stix/indicator.py +860 -0
  56. txt2stix/lookups.py +68 -0
  57. txt2stix/pattern/__init__.py +13 -0
  58. txt2stix/pattern/extractors/__init__.py +0 -0
  59. txt2stix/pattern/extractors/base_extractor.py +167 -0
  60. txt2stix/pattern/extractors/card/README.md +34 -0
  61. txt2stix/pattern/extractors/card/__init__.py +15 -0
  62. txt2stix/pattern/extractors/card/amex_card_extractor.py +52 -0
  63. txt2stix/pattern/extractors/card/diners_card_extractor.py +47 -0
  64. txt2stix/pattern/extractors/card/discover_card_extractor.py +48 -0
  65. txt2stix/pattern/extractors/card/jcb_card_extractor.py +43 -0
  66. txt2stix/pattern/extractors/card/master_card_extractor.py +63 -0
  67. txt2stix/pattern/extractors/card/union_card_extractor.py +38 -0
  68. txt2stix/pattern/extractors/card/visa_card_extractor.py +46 -0
  69. txt2stix/pattern/extractors/crypto/__init__.py +3 -0
  70. txt2stix/pattern/extractors/crypto/btc_extractor.py +38 -0
  71. txt2stix/pattern/extractors/directory/__init__.py +10 -0
  72. txt2stix/pattern/extractors/directory/unix_directory_extractor.py +40 -0
  73. txt2stix/pattern/extractors/directory/unix_file_path_extractor.py +42 -0
  74. txt2stix/pattern/extractors/directory/windows_directory_path_extractor.py +47 -0
  75. txt2stix/pattern/extractors/directory/windows_file_path_extractor.py +42 -0
  76. txt2stix/pattern/extractors/domain/__init__.py +8 -0
  77. txt2stix/pattern/extractors/domain/domain_extractor.py +39 -0
  78. txt2stix/pattern/extractors/domain/hostname_extractor.py +36 -0
  79. txt2stix/pattern/extractors/domain/sub_domain_extractor.py +49 -0
  80. txt2stix/pattern/extractors/hashes/__init__.py +16 -0
  81. txt2stix/pattern/extractors/hashes/md5_extractor.py +16 -0
  82. txt2stix/pattern/extractors/hashes/sha1_extractor.py +14 -0
  83. txt2stix/pattern/extractors/hashes/sha224_extractor.py +18 -0
  84. txt2stix/pattern/extractors/hashes/sha2_256_exactor.py +14 -0
  85. txt2stix/pattern/extractors/hashes/sha2_512_exactor.py +13 -0
  86. txt2stix/pattern/extractors/hashes/sha3_256_exactor.py +15 -0
  87. txt2stix/pattern/extractors/hashes/sha3_512_exactor.py +16 -0
  88. txt2stix/pattern/extractors/helper.py +64 -0
  89. txt2stix/pattern/extractors/ip/__init__.py +14 -0
  90. txt2stix/pattern/extractors/ip/ipv4_cidr_extractor.py +49 -0
  91. txt2stix/pattern/extractors/ip/ipv4_extractor.py +18 -0
  92. txt2stix/pattern/extractors/ip/ipv4_port_extractor.py +42 -0
  93. txt2stix/pattern/extractors/ip/ipv6_cidr_extractor.py +18 -0
  94. txt2stix/pattern/extractors/ip/ipv6_extractor.py +16 -0
  95. txt2stix/pattern/extractors/ip/ipv6_port_extractor.py +46 -0
  96. txt2stix/pattern/extractors/others/__init__.py +22 -0
  97. txt2stix/pattern/extractors/others/asn_extractor.py +14 -0
  98. txt2stix/pattern/extractors/others/cpe_extractor.py +29 -0
  99. txt2stix/pattern/extractors/others/cve_extractor.py +14 -0
  100. txt2stix/pattern/extractors/others/email_extractor.py +21 -0
  101. txt2stix/pattern/extractors/others/filename_extractor.py +17 -0
  102. txt2stix/pattern/extractors/others/iban_extractor.py +15 -0
  103. txt2stix/pattern/extractors/others/mac_address_extractor.py +13 -0
  104. txt2stix/pattern/extractors/others/phonenumber_extractor.py +41 -0
  105. txt2stix/pattern/extractors/others/user_agent_extractor.py +20 -0
  106. txt2stix/pattern/extractors/others/windows_registry_key_extractor.py +18 -0
  107. txt2stix/pattern/extractors/url/__init__.py +7 -0
  108. txt2stix/pattern/extractors/url/url_extractor.py +22 -0
  109. txt2stix/pattern/extractors/url/url_file_extractor.py +21 -0
  110. txt2stix/pattern/extractors/url/url_path_extractor.py +74 -0
  111. txt2stix/retriever.py +126 -0
  112. txt2stix/stix.py +1 -0
  113. txt2stix/txt2stix.py +336 -0
  114. txt2stix/utils.py +86 -0
  115. txt2stix-0.0.4.dist-info/METADATA +190 -0
  116. txt2stix-0.0.4.dist-info/RECORD +119 -0
  117. txt2stix-0.0.4.dist-info/WHEEL +4 -0
  118. txt2stix-0.0.4.dist-info/entry_points.txt +2 -0
  119. txt2stix-0.0.4.dist-info/licenses/LICENSE +202 -0
txt2stix/lookups.py ADDED
@@ -0,0 +1,68 @@
1
+ import uuid
2
+ import yaml
3
+ from .common import FatalException, NamedDict
4
+ from .extractions import Extractor
5
+
6
+ import yaml, re
7
+ import csv
8
+ from pathlib import Path
9
+
10
+
11
def load_lookup(extractor):
    """Return the extractor's lookup terms, reading them from its file on first use.

    The loaded set is cached on ``extractor.terms`` so the file is read at most once.
    """
    if not extractor.terms:
        lines = Path(extractor.file).read_text().splitlines()
        extractor.terms = set(lines)
    return extractor.terms
16
+
17
def find_all(extractor, input_str, start_id=0):
    """Locate every occurrence of each of the extractor's terms in ``input_str``.

    Returns a dict keyed ``extraction_<id>`` (ids counted from ``start_id``), one
    entry per term that appears at least once.
    """
    results = {}
    for term in extractor.terms:
        positions = list(find_get_indexes_re(term, input_str))
        if not positions:
            continue
        entry = {
            "value": term,
            "start_index": positions,
            "stix_mapping": extractor.stix_mapping,
            "type": extractor.slug,
        }
        results[f"extraction_{start_id + len(results)}"] = entry
    return results
27
+
28
+
29
def merge_lookups(extractors: list[Extractor]) -> list[tuple[str, str, str]]:
    """Flatten all extractors' terms into ``(term, stix_mapping, slug)`` tuples.

    Each extractor's lookup file is loaded first; the merged list is ordered
    longest term first so longer terms win during extraction.
    """
    merged: list[tuple[str, str, str]] = []
    for extractor in extractors:
        load_lookup(extractor)
        for term in extractor.terms:
            merged.append((term, extractor.stix_mapping, extractor.slug))
    merged.sort(key=lambda entry: len(entry[0]), reverse=True)
    return merged
35
+
36
def extract_all(extractors, input_str):
    """Run every lookup extractor over ``input_str``.

    Terms are tried longest first (see ``merge_lookups``); a start index already
    claimed by a longer term is not reported again for a shorter one.
    """
    merged: list[tuple[str, str, str]] = merge_lookups(extractors)
    claimed_indexes = set()
    extractions = []
    for term, stix_mapping, slug in merged:
        found = set(find_get_indexes_re(term, input_str))
        fresh = list(found - claimed_indexes)
        claimed_indexes.update(fresh)
        if fresh:
            extractions.append({
                "value": term,
                "start_index": fresh,
                "stix_mapping": stix_mapping,
                "type": slug,
            })
    return extractions
47
+
48
def find_get_indexes(term, input_str):
    """Yield every start index of ``term`` in ``input_str`` (overlaps included)."""
    position = input_str.find(term)
    while position != -1:
        yield position
        position = input_str.find(term, position + 1)
55
+
56
def find_get_indexes_re(term, input_str):
    """Yield start indexes (relative to ``input_str``) of boundary-delimited,
    case-insensitive occurrences of ``term``.

    A hit must either follow whitespace and precede a simple terminator
    (whitespace, ``.``, ``,`` or ``!`` + whitespace) or be wrapped in a matching
    quote/bracket pair.
    """
    padded = " " + input_str + " "
    escaped = re.escape(term)
    alternatives = []
    # term preceded by whitespace and followed by a simple terminator
    for trailer in (r"\s", r"\.", r",", r"!\s"):
        alternatives.append(r"\s" + "(" + escaped + ")" + trailer)
    # term wrapped in a matching pair of quote/bracket characters
    for pair in ('""', "[]", "()", "``", "''",):
        alternatives.append(re.escape(pair[0]) + "(" + escaped + ")" + re.escape(pair[1]))
    pattern = re.compile("|".join(alternatives), flags=re.IGNORECASE)
    for match in pattern.finditer(padded):
        # Every alternative starts one boundary character before the term, and the
        # single space of left padding cancels it out, so match.start() is already
        # the term's index in the unpadded input.
        yield match.start()
@@ -0,0 +1,13 @@
1
+ from .extractors.base_extractor import ALL_EXTRACTORS
2
+
3
+ from .extractors.card import CARD_EXTRACTORS
4
+ from .extractors.crypto import CRYPTO_EXTRACTORS
5
+ from .extractors.directory import DIRECTORY_EXTRACTORS
6
+ from .extractors.domain import DOMAIN_EXTRACTORS
7
+ from .extractors.ip import IP_EXTRACTORS
8
+ from .extractors.hashes import SHA_EXTRACTORS
9
+ from .extractors.url import URL_EXTRACTORS
10
+ from .extractors.others import OTHER_EXTRACTORS
11
+
12
+
13
+ from .extractors.helper import extract_all, load_extractor
File without changes
@@ -0,0 +1,167 @@
1
+ """
2
+ `Extractor` class represents the properties of a given observable.
3
+ """
4
+ import re
5
+ import logging
6
+ from typing import Iterable
7
+
8
logger = logging.getLogger(__name__)

# Registry of every concrete extractor class, keyed by its `name` attribute.
# Populated automatically by BaseExtractor.__init_subclass__.
ALL_EXTRACTORS = {}

class BaseExtractor:
    """Base class for pattern extractors.

    A subclass supplies either ``extraction_regex`` or ``extraction_function``
    (plus ``name``/``version``/``stix_mapping`` metadata) and is automatically
    registered in ``ALL_EXTRACTORS`` on definition.
    """
    name = None
    extraction_regex = None
    extraction_function = None
    common_strip_elements = "\"'.,:"
    filter_function = None # further filter the extracted values
    meta_extractor = None
    version = None
    stix_mapping = None
    invalid_characters = ['.', ',', '!', '`', '(', ')', '{', '}', '"', '````', ' ', '[', ']']
    SPLITS_FINDER = re.compile(r'(?<=[\'"<\(\{\[\s])(?P<item>.*?)(?=[\s\]\}\)>"\'])') #split on boundary characters instead of ' ' only

    @classmethod
    def register_new_extractor(cls, name, extractor):
        """Record an extractor class in the module-level registry."""
        ALL_EXTRACTORS[name] = extractor

    def __init_subclass__(cls, **kwargs):
        # Every subclass self-registers under its `name` attribute.
        super().__init_subclass__(**kwargs)
        cls.register_new_extractor(cls.name, cls)

    @classmethod
    def extract_extraction_from_text(cls, text: str):
        """
        Extract this extractor's observables from ``text``.

        Returns a list of dicts with keys ``value``, ``type``, ``version``,
        ``stix_mapping`` and ``start_index`` — one entry per occurrence.
        Raises ValueError when neither ``extraction_regex`` nor
        ``extraction_function`` is defined.
        """
        extracted_observables = []
        start_index = 0
        if cls.extraction_regex is not None:
            if cls.extraction_regex.startswith("^") or cls.extraction_regex.endswith("$"):
                # Anchored regex: match each boundary-delimited token on its own.
                for matchsplit in cls.SPLITS_FINDER.finditer(text):
                    word = matchsplit.group('item')
                    start_index = matchsplit.start('item')
                    match = re.match(cls.extraction_regex, word)
                    if match:
                        extracted_observables.append((match.group(0), match.start() + start_index))
                    else:
                        # Retry with common punctuation stripped from both ends.
                        stripped_word = word.strip(cls.common_strip_elements)
                        match = re.match(cls.extraction_regex, stripped_word)
                        if match:
                            extracted_observables.append((match.group(0), start_index + word.index(stripped_word)))
            else:
                # Find regex in the entire text (including whitespace)
                for match in re.finditer(cls.extraction_regex, text):
                    match_value = match.group().strip('\n')
                    extracted_observables.append((match_value, match.start()))
        elif cls.extraction_function is not None:
            # Find tokens that the extraction function accepts without raising.
            for index, word in tokenize(text):
                try:
                    if cls.extraction_function(word):
                        extracted_observables.append((word, index))
                except Exception:
                    # Extraction functions signal "no match" by raising.
                    pass
        else:
            raise ValueError("Both extraction_regex and extraction_function can't be None.")

        response = []
        for string, pos in extracted_observables:
            # Optional post-filter (e.g. Luhn validation) drops false positives.
            if cls.filter_function and not cls.filter_function(string):
                continue
            response.append({
                "value": string,
                "type": cls.name,
                "version": cls.version,
                "stix_mapping": cls.stix_mapping,
                "start_index": pos,
            })
        return response

    @classmethod
    def split_all(cls, text):
        """Yield boundary-delimited tokens of ``text`` with invalid edge characters trimmed."""
        for word in cls.SPLITS_FINDER.findall(text):
            yield cls.trim_invalid_characters(word, cls.invalid_characters)

    @classmethod
    def trim_invalid_characters(cls, keyword: str, characters: Iterable):
        """Strip any of ``characters`` from both ends of ``keyword``."""
        return keyword.strip(''.join(characters))
122
+
123
def tokenize(text):
    """Tokenize ``text`` into ``(start_index, token)`` pairs.

    Two passes are made: whitespace-separated words (with surrounding
    punctuation stripped), then substrings enclosed by boundary pairs.
    Tokens from the two passes may overlap/duplicate — callers are expected
    to tolerate that.
    """
    # Define pairs of boundary characters as a list of tuples
    boundary_pairs = [
        ('(', ')'),
        ('[', ']'),
        ('{', '}'),
        ('"', '"'),
        ("'", "'"),
        (' ', ' '),  # Spaces as general separators
        ('\n', ' '),
        ('\n', '\n'),
        (' ', '\n'),
    ]

    # Define additional closing boundary characters
    # NOTE(review): additional_closers is never referenced below — confirm
    # whether it was meant to extend boundary_pairs.
    additional_closers = {"!", ";", ".", ","}

    tokens = []
    words = text.split()

    # Capture individual words with their starting index
    index = 0
    for word in words:
        # find() from the running index keeps repeated words at their own offsets
        start_index = text.find(word, index)
        cleaned_word = word.strip("()[]{}'\".,;!:")
        if cleaned_word:
            tokens.append((start_index, cleaned_word))
        index = start_index + len(word)

    # Capture nested structures with their starting index
    # (O(n^2) scan: for every opener found, take everything up to the first closer)
    for i in range(len(text)):
        for opener, closer in boundary_pairs:
            if text[i] == opener:
                token = text[i]
                for j in range(i + 1, len(text)):
                    token += text[j]
                    if text[j] == closer:
                        # strip the enclosing pair before recording the token
                        if token.startswith(opener):
                            token = token[1:]
                        if token.endswith(closer):
                            token = token[:-1]
                        tokens.append((i, token))
                        break

    return tokens
@@ -0,0 +1,34 @@
1
+ Data retrieved from https://docs.trellix.com/bundle/data-loss-prevention-11.10.x-classification-definitions-reference-guide/page/GUID-8A0A2E8B-D740-476E-B10C-885919573022.html, by running the following code on the page
2
+
3
+ ```js
4
+ function maptoregexp(elements){
5
+ elements = Array.from(elements)
6
+ return JSON.stringify(elements.map(el=>el.innerText), null, 2)
7
+ }
8
+ var [validator_section, positive_match_section, negative_match_section] = document.querySelectorAll("section.section")
9
+ var positive_matchers = maptoregexp(positive_match_section.querySelectorAll("code")), negative_matchers = maptoregexp(negative_match_section.querySelectorAll("code"))
10
+ var validator = validator_section.querySelector("td") && validator_section.querySelector("td").innerText,
11
+ description = document.querySelector(".shortdesc").innerText // Break long words at exactly 78 characters
12
+ .replace(/([^\s]{78})/g, '$1\n\t')
13
+ // Break long lines honoring whitespace
14
+ .replace(/([^\n]{1,78})(\s|$)/g, '$1\n\t')
15
+ .trim()
16
+
17
+ python_code = `
18
+ ### The following part is automatically generated from ${location.href}
19
+ description = """
20
+ ${description}
21
+
22
+ validator = ${validator}
23
+ """
24
+ extraction_regex_list = ${positive_matchers}
25
+ filter_regex_list = ${negative_matchers}
26
+ extraction_regex = "|".join(extraction_regex_list)
27
+
28
+ #end of generated code
29
+
30
+ `
31
+
32
+ console.log(python_code)
33
+ copy(python_code)
34
+ ```
@@ -0,0 +1,15 @@
1
from .amex_card_extractor import AmexCardExtractor
from .diners_card_extractor import DinersCardExtractor
from .discover_card_extractor import BankCardDiscoverExtractor
from .jcb_card_extractor import JCBCardExtractor
from .master_card_extractor import MastercardCardExtractor
from .union_card_extractor import UnionPayCardExtractor
from .visa_card_extractor import VisaCardBaseExtractor

# All bank-card extractors exposed by this subpackage.
CARD_EXTRACTORS = [AmexCardExtractor,
                   DinersCardExtractor,
                   BankCardDiscoverExtractor,
                   JCBCardExtractor,
                   MastercardCardExtractor,
                   UnionPayCardExtractor,
                   VisaCardBaseExtractor]
@@ -0,0 +1,52 @@
1
+ from ..base_extractor import BaseExtractor
2
+ from validators import card_number
3
+
4
class AmexCardExtractor(BaseExtractor):
    """
    A class for extracting American Express (Amex) credit card numbers from text using regular expressions.

    Attributes:
        name (str): The name of the extractor, set to "pattern_bank_card_amex".
        extraction_regex (str): The regular expression pattern used for extracting Amex credit card numbers.
    """

    name = "pattern_bank_card_amex"

    # The following part is automatically generated from https://docs.trellix.com/bundle/data-loss-prevention-11.10.x-classification-definitions-reference-guide/page/GUID-97839BB4-3077-4BB0-9974-CF8EEB0E2426.html
    description = """
    American Express Credit Card Number (CCN) is a 15-digit number starting with
    34 or 37 and might have dashes (hyphens) or spaces as separators. For example,
    NNNN-NNNNNN-NNNNN or NNNN NNNNNN NNNNN. Matches exclude common test card
    numbers.

    validator = Luhn 10 (remainder 0)
    """
    extraction_regex_list = [
        "\\b3[47]\\d{2} \\d{6} \\d{5}\\b",
        "\\b3[47]\\d{2}-\\d{6}-\\d{5}\\b",
        "\\b3[47]\\d{13}\\b"
    ]
    # NOTE(review): filter_regex_list (known test-card numbers) is not consumed by
    # BaseExtractor in this package — confirm where these exclusions are applied.
    filter_regex_list = [
        "3400([- ]?)000000([- ]?)00009",
        "3411([- ]?)111111([- ]?)11111",
        "3434([- ]?)343434([- ]?)34343",
        "3456([- ]?)789012([- ]?)34564",
        "3456([- ]?)400000([- ]?)55123",
        "3468([- ]?)276304([- ]?)35344",
        "3700([- ]?)000000([ -]?)00002",
        "3700([- ]?)002000([ -]?)00000",
        "3704([- ]?)072699([ -]?)09809",
        "3705([- ]?)560193([ -]?)09221",
        "3714([- ]?)496353([ -]?)98431",
        "3742([- ]?)000000([ -]?)00004",
        "3756([- ]?)400000([ -]?)55123",
        "3764([- ]?)622809([ -]?)21451",
        "3777([- ]?)527498([ -]?)96404",
        "3782([- ]?)822463([ -]?)10005",
        "3787([- ]?)344936([ -]?)71000"
    ]
    extraction_regex = "|".join(extraction_regex_list)

    # end of generated code

    # Drop candidates that fail validators.card_number (Luhn check).
    filter_function = card_number
@@ -0,0 +1,47 @@
1
+ from validators import card_number
2
+ from ..base_extractor import BaseExtractor
3
+
4
+
5
class DinersCardExtractor(BaseExtractor):
    """
    A class for extracting Diners Club credit card numbers from text using regular expressions.

    Attributes:
        name (str): The name of the extractor, set to "pattern_bank_card_diners".
        extraction_regex (str): The regular expression pattern used for extracting Diners Club credit card numbers.
    """

    name = "pattern_bank_card_diners"

    # The following part is automatically generated from https://docs.trellix.com/bundle/data-loss-prevention-11.10.x-classification-definitions-reference-guide/page/GUID-2B5CF316-ED36-4CAB-92D7-AC46714E9882.html
    description = """
    Credit Card Number (Diner's Club) is a 14-digit number beginning with 300–305,
    36, 38, or 39 and might have dashes (hyphens) or spaces as separators. For
    example, NNNN-NNNNNN-NNNN or NNNN NNNNNN NNNN. Matches exclude common test
    card numbers.

    validator = Luhn 10 (remainder 0)
    """
    extraction_regex_list = [
        "\\b30[0-5]\\d-\\d{6}-\\d{4}\\b",
        "\\b30[0-5]\\d \\d{6} \\d{4}\\b",
        "\\b30[0-5]\\d{11}\\b",
        "\\b3[689]\\d{2}-\\d{6}-\\d{4}\\b",
        "\\b3[689]\\d{2} \\d{6} \\d{4}\\b",
        "\\b3[689]\\d{12}\\b"
    ]
    # NOTE(review): filter_regex_list (known test-card numbers) is not consumed by
    # BaseExtractor in this package — confirm where these exclusions are applied.
    filter_regex_list = [
        "3020([ -]?)416932([ -]?)2643",
        "3021([ -]?)804719([ -]?)6557",
        "3022([ -]?)151156([ -]?)3252",
        "3046([ -]?)400000([ -]?)5512",
        "3600([ -]?)000000([ -]?)0008",
        "3614([ -]?)890064([ -]?)7913",
        "3670([ -]?)010200([ -]?)0000",
        "3852([ -]?)000002([ -]?)3237",
        "3912([ -]?)345678([ -]?)9019"
    ]
    extraction_regex = "|".join(extraction_regex_list)

    # end of generated code
    # Drop candidates that fail validators.card_number (Luhn check).
    filter_function = card_number
@@ -0,0 +1,48 @@
1
+ from validators import card_number
2
+ from ..base_extractor import BaseExtractor
3
+
4
+
5
class BankCardDiscoverExtractor(BaseExtractor):
    """
    A class for extracting Discover credit card numbers from text using regular expressions.

    Attributes:
        name (str): The name of the extractor, set to "pattern_bank_card_discover".
        extraction_regex (str): The regular expression pattern used for extracting Discover credit card numbers.
    """

    name = "pattern_bank_card_discover"

    # The following part is automatically generated from https://docs.trellix.com/bundle/data-loss-prevention-11.10.x-classification-definitions-reference-guide/page/GUID-EF96B5CE-6C8E-49B2-BD19-82B0CE0E5091.html
    description = """
    Credit Card Number (Discover) is a 16-digit number beginning with 6011,
    644–649 or 65 and might have dashes (hyphens) or spaces as separators. For
    example, NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNN NNNN. This excludes common test
    card numbers.

    validator = Luhn 10 (remainder 0)
    """
    extraction_regex_list = [
        "\\b6011-\\d{4}-\\d{4}-\\d{4}\\b",
        "\\b6011 \\d{4} \\d{4} \\d{4}\\b",
        "\\b6011\\d{12}\\b",
        "\\b64[4-9]\\d-\\d{4}-\\d{4}-\\d{4}\\b",
        "\\b64[4-9]\\d \\d{4} \\d{4} \\d{4}\\b",
        "\\b64[4-9]\\d{13}\\b",
        "\\b65\\d{2}-\\d{4}-\\d{4}-\\d{4}\\b",
        "\\b65\\d{2} \\d{4} \\d{4} \\d{4}\\b",
        "\\b65\\d{14}\\b"
    ]
    # NOTE(review): filter_regex_list (known test-card numbers) is not consumed by
    # BaseExtractor in this package — confirm where these exclusions are applied.
    filter_regex_list = [
        "6011([ -]?)0009([ -]?)9013([ -]?)9424",
        "6011([ -]?)1111([ -]?)1111([ -]?)1117",
        "6011([ -]?)1532([ -]?)1637([ -]?)1980",
        "6011([ -]?)6011([ -]?)6011([ -]?)6611",
        "6011([ -]?)6874([ -]?)8256([ -]?)4166",
        "6011([ -]?)8148([ -]?)3690([ -]?)5651",
        "6556([ -]?)4000([ -]?)0055([ -]?)1234"
    ]
    extraction_regex = "|".join(extraction_regex_list)

    # end of generated code
    # Drop candidates that fail validators.card_number (Luhn check).
    filter_function = card_number
@@ -0,0 +1,43 @@
1
+ from validators import card_number
2
+ from ..base_extractor import BaseExtractor
3
+
4
+
5
class JCBCardExtractor(BaseExtractor):
    """
    A class for extracting JCB credit card numbers from text using regular expressions.

    Attributes:
        name (str): The name of the extractor, set to "pattern_bank_card_jcb".
        extraction_regex (str): The regular expression pattern used for extracting JCB credit card numbers.
    """

    name = "pattern_bank_card_jcb"

    # The following part is automatically generated from https://docs.trellix.com/bundle/data-loss-prevention-11.10.x-classification-definitions-reference-guide/page/GUID-D56393B2-2C27-4CAF-A7C3-AE83298BD96B.html
    description = """
    JCB CCN is a 16-digit number beginning with 3528 or 3589 and might have dashes
    (hyphens) or spaces as separators. For example, NNNN-NNNN-NNNN-NNNN or NNNN
    NNNN NNNNNNNN. This excludes common test card numbers.

    validator = Luhn 10 (remainder 0)
    """
    extraction_regex_list = [
        "\\b352[89]-\\d{4}-\\d{4}-\\d{4}\\b",
        "\\b352[89] \\d{4} \\d{4} \\d{4}\\b",
        "\\b352[89]\\d{12}\\b",
        "\\b35[3-8]\\d-\\d{4}-\\d{4}-\\d{4}\\b",
        "\\b35[3-8]\\d \\d{4} \\d{4} \\d{4}\\b",
        "\\b35[3-8]\\d{13}\\b"
    ]
    # NOTE(review): filter_regex_list (known test-card numbers) is not consumed by
    # BaseExtractor in this package — confirm where these exclusions are applied.
    filter_regex_list = [
        "3528([ -]?)0007([ -]?)0000([ -]?)0000",
        "3528([ -]?)7237([ -]?)4002([ -]?)2896",
        "3530([ -]?)1113([ -]?)3330([ -]?)0000",
        "3556([ -]?)4000([ -]?)0055([ -]?)1234",
        "3566([ -]?)0020([ -]?)2036([ -]?)0505",
        "3569([ -]?)9900([ -]?)0000([ -]?)0009"
    ]
    extraction_regex = "|".join(extraction_regex_list)

    # end of generated code
    # Drop candidates that fail validators.card_number (Luhn check).
    filter_function = card_number
@@ -0,0 +1,63 @@
1
+ from validators import card_number
2
+ from ..base_extractor import BaseExtractor
3
+
4
+
5
class MastercardCardExtractor(BaseExtractor):
    """
    A class for extracting Mastercard credit card numbers from text using regular expressions.

    Attributes:
        name (str): The name of the extractor, set to "pattern_bank_card_mastercard".
        extraction_regex (str): The regular expression pattern used for extracting Mastercard credit card numbers.
    """

    name = "pattern_bank_card_mastercard"

    # The following part is automatically generated from https://docs.trellix.com/bundle/data-loss-prevention-11.10.x-classification-definitions-reference-guide/page/GUID-8A0A2E8B-D740-476E-B10C-885919573022.html
    description = """
    Credit Card Number (Mastercard) is a 16-digit number beginning with 51–55 or
    2221– 2720 and might have dashes (hyphens) or spaces as separators. For
    example, NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNN NNNN. This excludes common test
    card numbers.

    validator = Luhn 10 (remainder 0)
    """
    # NOTE(review): "2[2-7]\\d{2}" over-matches 2200–2799; the filter list below
    # excludes the out-of-range prefixes (220x, 221x, 2220, 2721+) — verify the
    # filters are actually applied somewhere before relying on that.
    extraction_regex_list = [
        "\\b5[1-5]\\d{2}-\\d{4}-\\d{4}-\\d{4}\\b",
        "\\b5[1-5]\\d{2} \\d{4} \\d{4} \\d{4}\\b",
        "\\b5[1-5]\\d{14}\\b",
        "\\b2[2-7]\\d{2}-\\d{4}-\\d{4}-\\d{4}\\b",
        "\\b2[2-7]\\d{2} \\d{4} \\d{4} \\d{4}\\b",
        "\\b2[2-7]\\d{14}\\b"
    ]
    filter_regex_list = [
        "5100([- ]?)0800([ -]?)0000([ -]?)0000",
        "5105([- ]?)1051([ -]?)0510([ -]?)5100",
        "5111([- ]?)1111([ -]?)1111([ -]?)1118",
        "5123([- ]?)4567([ -]?)8901([ -]?)2346",
        "5123([- ]?)6197([ -]?)4539([ -]?)5853",
        "5138([- ]?)4951([ -]?)2555([ -]?)0554",
        "5274([- ]?)5763([ -]?)9425([ -]?)9961",
        "5301([- ]?)7455([ -]?)2913([ -]?)8831",
        "5311([- ]?)5312([ -]?)8600([ -]?)0465",
        "5364([- ]?)5870([ -]?)1178([ -]?)5834",
        "5404([- ]?)0000([ -]?)0000([ -]?)0001",
        "5431([- ]?)1111([ -]?)1111([ -]?)1111",
        "5454([- ]?)5454([ -]?)5454([ -]?)5454",
        "5459([- ]?)8862([ -]?)6563([ -]?)1843",
        "5460([- ]?)5060([ -]?)4803([ -]?)9935",
        "5500([- ]?)9391([ -]?)7800([ -]?)4613",
        "5555([- ]?)5555([ -]?)5555([ -]?)4444",
        "5556([- ]?)4000([ -]?)0055([ -]?)1234",
        "5565([- ]?)5520([ -]?)6448([ -]?)1449",
        "5597([- ]?)5076([ -]?)4491([ -]?)0558",
        "220\\d([- ]?)\\d{4}([ -]?)\\d{4}([ -]?)\\d{4}",
        "221\\d([- ]?)\\d{4}([ -]?)\\d{4}([ -]?)\\d{4}",
        "2220([- ]?)\\d{4}([ -]?)\\d{4}([ -]?)\\d{4}",
        "272[1-9]([- ]?)\\d{4}([ -]?)\\d{4}([ -]?)\\d{4}",
        "27[3-9]\\d([- ]?)\\d{4}([ -]?)\\d{4}([ -]?)\\d{4}"
    ]
    extraction_regex = "|".join(extraction_regex_list)

    # end of generated code
    # Drop candidates that fail validators.card_number (Luhn check).
    filter_function = card_number
@@ -0,0 +1,38 @@
1
+ from ..base_extractor import BaseExtractor
2
+
3
+
4
class UnionPayCardExtractor(BaseExtractor):
    """
    A class for extracting UnionPay credit card numbers from text using regular expressions.

    This class inherits from BaseExtractor, which defines the basic structure and functionality for all extractors.

    Attributes:
        name (str): The name of the extractor, set to "pattern_bank_card_union_pay".
        extraction_regex (str): The regular expression pattern used for extracting UnionPay credit card numbers.
    """

    name = "pattern_bank_card_union_pay"

    # The following part is automatically generated from https://docs.trellix.com/bundle/data-loss-prevention-11.10.x-classification-definitions-reference-guide/page/GUID-B8D29ECE-E70A-401E-B18D-B773F4FF71ED.html
    description = """
    The China UnionPay credit card numbers begin with 62 or 60 and is a 16-19
    digit long number.

    validator = China Union Pay Card validator
    """
    extraction_regex_list = [
        "\\b622\\d{13,16}\\b",
        "\\b603601\\d{10}\\b",
        "\\b603265\\d{10}\\b",
        "\\b621977\\d{10}\\b",
        "\\b603708\\d{10}\\b",
        "\\b602969\\d{10}\\b",
        "\\b601428\\d{10}\\b",
        "\\b603367\\d{10}\\b",
        "\\b603694\\d{10}\\b"
    ]
    filter_regex_list = []
    extraction_regex = "|".join(extraction_regex_list)

    # end of generated code
    # NOTE(review): unlike the other card extractors, no filter_function (Luhn
    # check) is set here — confirm this is intentional for UnionPay numbers.
1
+ from validators import card_number
2
+ from ..base_extractor import BaseExtractor
3
+
4
+
5
class VisaCardBaseExtractor(BaseExtractor):
    """
    A class for extracting VISA credit card numbers from text using regular expressions.

    Attributes:
        name (str): The name of the extractor, set to "pattern_bank_card_visa".
        extraction_regex (str): The regular expression pattern used for extracting VISA credit card numbers.
    """

    name = "pattern_bank_card_visa"

    # The following part is automatically generated from https://docs.trellix.com/bundle/data-loss-prevention-11.10.x-classification-definitions-reference-guide/page/GUID-66A52D16-CA41-4509-826D-8D29B1F968C2.html
    description = """
    Credit Card Number (Visa) is 16-digit number and might have dashes (hyphen) or
    spaces as separators. For example, NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNN NNNN.
    This excludes common test card numbers.

    validator = Luhn 10 (remainder 0)
    """
    extraction_regex_list = [
        "\\b4\\d{3}-\\d{4}-\\d{4}-\\d{4}\\b",
        "\\b4\\d{3} \\d{4} \\d{4} \\d{4}\\b",
        "\\b4\\d{15}\\b"
    ]
    # NOTE(review): filter_regex_list (known test-card numbers) is not consumed by
    # BaseExtractor in this package — confirm where these exclusions are applied.
    filter_regex_list = [
        "4005([ -]?)5500([ -]?)0000([ -]?)0001",
        "4012([ -]?)8888([ -]?)8888([ -]?)1881",
        "4111([ -]?)1111([ -]?)1111([ -]?)1111",
        "4444([ -]?)3333([ -]?)2222([ -]?)1111",
        "4539([ -]?)1050([ -]?)1153([ -]?)9664",
        "4555([ -]?)4000([ -]?)0055([ -]?)5123",
        "4564([ -]?)4564([ -]?)4564([ -]?)4564",
        "4544([ -]?)1821([ -]?)7453([ -]?)7267",
        "4716([ -]?)9147([ -]?)0653([ -]?)4228",
        "4916([ -]?)5417([ -]?)1375([ -]?)7159",
        "4916([ -]?)6156([ -]?)3934([ -]?)6972",
        "4917([ -]?)6100([ -]?)0000([ -]?)0000"
    ]
    extraction_regex = "|".join(extraction_regex_list)

    # end of generated code
    # Drop candidates that fail validators.card_number (Luhn check).
    filter_function = card_number
@@ -0,0 +1,3 @@
1
from .btc_extractor import CryptoBTCWalletExtractor

# All cryptocurrency extractors exposed by this subpackage.
CRYPTO_EXTRACTORS = [CryptoBTCWalletExtractor]