txt2stix 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- txt2stix/__init__.py +33 -0
- txt2stix/ai_extractor/__init__.py +15 -0
- txt2stix/ai_extractor/anthropic.py +12 -0
- txt2stix/ai_extractor/base.py +87 -0
- txt2stix/ai_extractor/deepseek.py +19 -0
- txt2stix/ai_extractor/gemini.py +18 -0
- txt2stix/ai_extractor/openai.py +15 -0
- txt2stix/ai_extractor/openrouter.py +20 -0
- txt2stix/ai_extractor/prompts.py +164 -0
- txt2stix/ai_extractor/utils.py +85 -0
- txt2stix/attack_flow.py +101 -0
- txt2stix/bundler.py +428 -0
- txt2stix/common.py +23 -0
- txt2stix/extractions.py +59 -0
- txt2stix/includes/__init__.py +0 -0
- txt2stix/includes/extractions/ai/config.yaml +1023 -0
- txt2stix/includes/extractions/lookup/config.yaml +393 -0
- txt2stix/includes/extractions/pattern/config.yaml +609 -0
- txt2stix/includes/helpers/mimetype_filename_extension_list.csv +936 -0
- txt2stix/includes/helpers/stix_relationship_types.txt +41 -0
- txt2stix/includes/helpers/tlds.txt +1446 -0
- txt2stix/includes/helpers/windows_registry_key_prefix.txt +12 -0
- txt2stix/includes/lookups/_README.md +11 -0
- txt2stix/includes/lookups/_generate_lookups.py +247 -0
- txt2stix/includes/lookups/attack_pattern.txt +1 -0
- txt2stix/includes/lookups/campaign.txt +1 -0
- txt2stix/includes/lookups/country_iso3166_alpha2.txt +249 -0
- txt2stix/includes/lookups/course_of_action.txt +1 -0
- txt2stix/includes/lookups/disarm_id_v1_5.txt +345 -0
- txt2stix/includes/lookups/disarm_name_v1_5.txt +347 -0
- txt2stix/includes/lookups/extensions.txt +78 -0
- txt2stix/includes/lookups/identity.txt +1 -0
- txt2stix/includes/lookups/infrastructure.txt +1 -0
- txt2stix/includes/lookups/intrusion_set.txt +1 -0
- txt2stix/includes/lookups/malware.txt +2 -0
- txt2stix/includes/lookups/mitre_atlas_id_v4_5_2.txt +116 -0
- txt2stix/includes/lookups/mitre_atlas_name_v4_5_2.txt +117 -0
- txt2stix/includes/lookups/mitre_attack_enterprise_aliases_v16_0.txt +1502 -0
- txt2stix/includes/lookups/mitre_attack_enterprise_id_v16_0.txt +1656 -0
- txt2stix/includes/lookups/mitre_attack_enterprise_name_v16_0.txt +1765 -0
- txt2stix/includes/lookups/mitre_attack_ics_aliases_v16_0.txt +141 -0
- txt2stix/includes/lookups/mitre_attack_ics_id_v16_0.txt +254 -0
- txt2stix/includes/lookups/mitre_attack_ics_name_v16_0.txt +293 -0
- txt2stix/includes/lookups/mitre_attack_mobile_aliases_v16_0.txt +159 -0
- txt2stix/includes/lookups/mitre_attack_mobile_id_v16_0.txt +277 -0
- txt2stix/includes/lookups/mitre_attack_mobile_name_v16_0.txt +296 -0
- txt2stix/includes/lookups/mitre_capec_id_v3_9.txt +559 -0
- txt2stix/includes/lookups/mitre_capec_name_v3_9.txt +560 -0
- txt2stix/includes/lookups/mitre_cwe_id_v4_15.txt +939 -0
- txt2stix/includes/lookups/mitre_cwe_name_v4_15.txt +939 -0
- txt2stix/includes/lookups/threat_actor.txt +1 -0
- txt2stix/includes/lookups/tld.txt +1422 -0
- txt2stix/includes/lookups/tool.txt +1 -0
- txt2stix/includes/tests/test_cases.yaml +695 -0
- txt2stix/indicator.py +860 -0
- txt2stix/lookups.py +68 -0
- txt2stix/pattern/__init__.py +13 -0
- txt2stix/pattern/extractors/__init__.py +0 -0
- txt2stix/pattern/extractors/base_extractor.py +167 -0
- txt2stix/pattern/extractors/card/README.md +34 -0
- txt2stix/pattern/extractors/card/__init__.py +15 -0
- txt2stix/pattern/extractors/card/amex_card_extractor.py +52 -0
- txt2stix/pattern/extractors/card/diners_card_extractor.py +47 -0
- txt2stix/pattern/extractors/card/discover_card_extractor.py +48 -0
- txt2stix/pattern/extractors/card/jcb_card_extractor.py +43 -0
- txt2stix/pattern/extractors/card/master_card_extractor.py +63 -0
- txt2stix/pattern/extractors/card/union_card_extractor.py +38 -0
- txt2stix/pattern/extractors/card/visa_card_extractor.py +46 -0
- txt2stix/pattern/extractors/crypto/__init__.py +3 -0
- txt2stix/pattern/extractors/crypto/btc_extractor.py +38 -0
- txt2stix/pattern/extractors/directory/__init__.py +10 -0
- txt2stix/pattern/extractors/directory/unix_directory_extractor.py +40 -0
- txt2stix/pattern/extractors/directory/unix_file_path_extractor.py +42 -0
- txt2stix/pattern/extractors/directory/windows_directory_path_extractor.py +47 -0
- txt2stix/pattern/extractors/directory/windows_file_path_extractor.py +42 -0
- txt2stix/pattern/extractors/domain/__init__.py +8 -0
- txt2stix/pattern/extractors/domain/domain_extractor.py +39 -0
- txt2stix/pattern/extractors/domain/hostname_extractor.py +36 -0
- txt2stix/pattern/extractors/domain/sub_domain_extractor.py +49 -0
- txt2stix/pattern/extractors/hashes/__init__.py +16 -0
- txt2stix/pattern/extractors/hashes/md5_extractor.py +16 -0
- txt2stix/pattern/extractors/hashes/sha1_extractor.py +14 -0
- txt2stix/pattern/extractors/hashes/sha224_extractor.py +18 -0
- txt2stix/pattern/extractors/hashes/sha2_256_exactor.py +14 -0
- txt2stix/pattern/extractors/hashes/sha2_512_exactor.py +13 -0
- txt2stix/pattern/extractors/hashes/sha3_256_exactor.py +15 -0
- txt2stix/pattern/extractors/hashes/sha3_512_exactor.py +16 -0
- txt2stix/pattern/extractors/helper.py +64 -0
- txt2stix/pattern/extractors/ip/__init__.py +14 -0
- txt2stix/pattern/extractors/ip/ipv4_cidr_extractor.py +49 -0
- txt2stix/pattern/extractors/ip/ipv4_extractor.py +18 -0
- txt2stix/pattern/extractors/ip/ipv4_port_extractor.py +42 -0
- txt2stix/pattern/extractors/ip/ipv6_cidr_extractor.py +18 -0
- txt2stix/pattern/extractors/ip/ipv6_extractor.py +16 -0
- txt2stix/pattern/extractors/ip/ipv6_port_extractor.py +46 -0
- txt2stix/pattern/extractors/others/__init__.py +22 -0
- txt2stix/pattern/extractors/others/asn_extractor.py +14 -0
- txt2stix/pattern/extractors/others/cpe_extractor.py +29 -0
- txt2stix/pattern/extractors/others/cve_extractor.py +14 -0
- txt2stix/pattern/extractors/others/email_extractor.py +21 -0
- txt2stix/pattern/extractors/others/filename_extractor.py +17 -0
- txt2stix/pattern/extractors/others/iban_extractor.py +15 -0
- txt2stix/pattern/extractors/others/mac_address_extractor.py +13 -0
- txt2stix/pattern/extractors/others/phonenumber_extractor.py +41 -0
- txt2stix/pattern/extractors/others/user_agent_extractor.py +20 -0
- txt2stix/pattern/extractors/others/windows_registry_key_extractor.py +18 -0
- txt2stix/pattern/extractors/url/__init__.py +7 -0
- txt2stix/pattern/extractors/url/url_extractor.py +22 -0
- txt2stix/pattern/extractors/url/url_file_extractor.py +21 -0
- txt2stix/pattern/extractors/url/url_path_extractor.py +74 -0
- txt2stix/retriever.py +126 -0
- txt2stix/stix.py +1 -0
- txt2stix/txt2stix.py +336 -0
- txt2stix/utils.py +86 -0
- txt2stix-0.0.4.dist-info/METADATA +190 -0
- txt2stix-0.0.4.dist-info/RECORD +119 -0
- txt2stix-0.0.4.dist-info/WHEEL +4 -0
- txt2stix-0.0.4.dist-info/entry_points.txt +2 -0
- txt2stix-0.0.4.dist-info/licenses/LICENSE +202 -0
txt2stix/lookups.py
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
import uuid
|
2
|
+
import yaml
|
3
|
+
from .common import FatalException, NamedDict
|
4
|
+
from .extractions import Extractor
|
5
|
+
|
6
|
+
import yaml, re
|
7
|
+
import csv
|
8
|
+
from pathlib import Path
|
9
|
+
|
10
|
+
|
11
|
+
def load_lookup(extractor):
    """Return the extractor's lookup terms, loading them from disk on first use.

    The terms are cached on ``extractor.terms`` so the backing file is only
    read once per extractor.
    """
    if not extractor.terms:
        raw_text = Path(extractor.file).read_text()
        extractor.terms = set(raw_text.splitlines())
    return extractor.terms
|
16
|
+
|
17
|
+
def find_all(extractor, input_str, start_id=0):
    """Locate every occurrence of the extractor's terms in ``input_str``.

    Returns a dict keyed ``extraction_<n>`` (numbering continues from
    ``start_id``); each value records the matched term, all of its start
    offsets, and the extractor's STIX mapping and slug.
    """
    results = {}
    for term in extractor.terms:
        positions = list(find_get_indexes_re(term, input_str))
        if not positions:
            continue
        key = f"extraction_{start_id + len(results)}"
        results[key] = {
            "value": term,
            "start_index": positions,
            "stix_mapping": extractor.stix_mapping,
            "type": extractor.slug,
        }
    return results
|
27
|
+
|
28
|
+
|
29
|
+
def merge_lookups(extractors: list[Extractor]) -> list[tuple[str, str, str]]:
    """Combine the terms of several lookup extractors into one flat list.

    Each entry is ``(term, stix_mapping, slug)``.  Entries are sorted by
    descending term length so that longer terms take priority downstream.
    """
    merged: list[tuple[str, str, str]] = []
    for extractor in extractors:
        load_lookup(extractor)
        for term in extractor.terms:
            merged.append((term, extractor.stix_mapping, extractor.slug))
    merged.sort(key=lambda entry: len(entry[0]), reverse=True)
    return merged
|
35
|
+
|
36
|
+
def extract_all(extractors, input_str):
    """Run every lookup extractor over ``input_str``.

    Terms are tried longest-first (see ``merge_lookups``); a start offset
    already claimed by an earlier term is never reported again for a later,
    shorter term.
    """
    claimed: set[int] = set()
    extractions = []
    for term, stix_mapping, slug in merge_lookups(extractors):
        found = set(find_get_indexes_re(term, input_str))
        fresh = list(found - claimed)
        claimed.update(fresh)
        if fresh:
            extractions.append({
                "value": term,
                "start_index": fresh,
                "stix_mapping": stix_mapping,
                "type": slug,
            })
    return extractions
|
47
|
+
|
48
|
+
def find_get_indexes(term, input_str):
    """Yield the start offset of every (possibly overlapping) occurrence of ``term``."""
    position = input_str.find(term)
    while position != -1:
        yield position
        position = input_str.find(term, position + 1)
|
55
|
+
|
56
|
+
def find_get_indexes_re(term, input_str):
    """Yield the offset of every delimited occurrence of ``term`` in ``input_str``.

    A hit counts only when the term is preceded by whitespace and followed by
    whitespace or simple punctuation, or is wrapped in quotes/brackets.
    Padding the text with one space on each side makes each yielded match
    offset equal to the term's index in the original string.  Matching is
    case-insensitive.
    """
    padded = f" {input_str} "
    escaped = re.escape(term)
    alternatives = [r"\s" + "(" + escaped + ")" + trailing
                    for trailing in (r"\s", r"\.", r",", r"!\s")]
    for pair in ('""', "[]", "()", "``", "''"):
        opener, closer = pair[0], pair[1]
        alternatives.append(re.escape(opener) + "(" + escaped + ")" + re.escape(closer))
    pattern = re.compile("|".join(alternatives), flags=re.IGNORECASE)
    for hit in pattern.finditer(padded):
        yield hit.start()
|
@@ -0,0 +1,13 @@
|
|
1
|
+
from .extractors.base_extractor import ALL_EXTRACTORS
|
2
|
+
|
3
|
+
from .extractors.card import CARD_EXTRACTORS
|
4
|
+
from .extractors.crypto import CRYPTO_EXTRACTORS
|
5
|
+
from .extractors.directory import DIRECTORY_EXTRACTORS
|
6
|
+
from .extractors.domain import DOMAIN_EXTRACTORS
|
7
|
+
from .extractors.ip import IP_EXTRACTORS
|
8
|
+
from .extractors.hashes import SHA_EXTRACTORS
|
9
|
+
from .extractors.url import URL_EXTRACTORS
|
10
|
+
from .extractors.others import OTHER_EXTRACTORS
|
11
|
+
|
12
|
+
|
13
|
+
from .extractors.helper import extract_all, load_extractor
|
File without changes
|
@@ -0,0 +1,167 @@
|
|
1
|
+
"""
|
2
|
+
`Extractor` class represents the properties of a given observable.
|
3
|
+
"""
|
4
|
+
import re
|
5
|
+
import logging
|
6
|
+
from typing import Iterable
|
7
|
+
|
8
|
+
logger = logging.getLogger(__name__)
|
9
|
+
|
10
|
+
ALL_EXTRACTORS = {}
|
11
|
+
|
12
|
+
class BaseExtractor:
    """
    Base class for all pattern extractors.

    A subclass describes one observable type either via ``extraction_regex``
    (a regular expression) or ``extraction_function`` (a predicate applied to
    each token).  Subclasses are auto-registered in the module-level
    ``ALL_EXTRACTORS`` registry under their ``name``.
    """

    name = None                 # registry key / reported extraction type
    extraction_regex = None     # regex used to locate candidate values
    extraction_function = None  # predicate used instead of a regex
    common_strip_elements = "\"'.,:"
    filter_function = None  # further filter the extracted values
    meta_extractor = None
    version = None
    stix_mapping = None
    invalid_characters = ['.', ',', '!', '`', '(', ')', '{', '}', '"', '````', ' ', '[', ']']
    SPLITS_FINDER = re.compile(r'(?<=[\'"<\(\{\[\s])(?P<item>.*?)(?=[\s\]\}\)>"\'])') #split on boundary characters instead of ' ' only

    @classmethod
    def register_new_extractor(cls, name, extractor):
        """Register ``extractor`` under ``name`` in the global registry."""
        ALL_EXTRACTORS[name] = extractor

    def __init_subclass__(cls, **kwargs):
        # Every subclass registers itself automatically at definition time.
        super().__init_subclass__(**kwargs)
        cls.register_new_extractor(cls.name, cls)

    @classmethod
    def extract_extraction_from_text(cls, text: str):
        """
        Extract this extractor's observables from ``text``.

        Returns a list of dicts with keys ``value``, ``type``, ``version``,
        ``stix_mapping`` and ``start_index`` (one dict per occurrence, in
        discovery order).

        Raises:
            ValueError: if neither ``extraction_regex`` nor
                ``extraction_function`` is set on the class.
        """
        extracted_observables = []

        if cls.extraction_regex is not None:
            if cls.extraction_regex.startswith("^") or cls.extraction_regex.endswith("$"):
                # Anchored regex: match token-by-token using the boundary splitter.
                for matchsplit in cls.SPLITS_FINDER.finditer(text):
                    word = matchsplit.group('item')
                    start_index = matchsplit.start('item')
                    match = re.match(cls.extraction_regex, word)
                    if match:
                        extracted_observables.append((match.group(0), match.start() + start_index))
                    else:
                        # Retry with common punctuation stripped from both ends.
                        stripped_word = word.strip(cls.common_strip_elements)
                        match = re.match(cls.extraction_regex, stripped_word)
                        if match:
                            extracted_observables.append((match.group(0), start_index + word.index(stripped_word)))
            else:
                # Find regex in the entire text (including whitespace)
                for match in re.finditer(cls.extraction_regex, text):
                    extracted_observables.append((match.group().strip('\n'), match.start()))

        # If extraction_function is not None, keep the tokens for which it
        # returns a truthy value without raising.
        elif cls.extraction_function is not None:
            for index, word in tokenize(text):
                try:
                    if cls.extraction_function(word):
                        extracted_observables.append((word, index))
                except Exception:
                    # Candidate failed validation; skip it.
                    pass

        else:
            raise ValueError("Both extraction_regex and extraction_function can't be None.")

        response = []
        for string, pos in extracted_observables:
            # filter_function vetoes individual candidates (e.g. Luhn check).
            if cls.filter_function and not cls.filter_function(string):
                continue
            response.append({
                "value": string,
                "type": cls.name,
                "version": cls.version,
                "stix_mapping": cls.stix_mapping,
                "start_index": pos,
            })
        return response

    @classmethod
    def split_all(cls, text):
        """Yield boundary-delimited tokens of ``text`` with invalid characters trimmed."""
        for word in cls.SPLITS_FINDER.findall(text):
            yield cls.trim_invalid_characters(word, cls.invalid_characters)

    @classmethod
    def trim_invalid_characters(cls, keyword: str, characters: Iterable):
        """Strip any of ``characters`` from both ends of ``keyword``."""
        return keyword.strip(''.join(characters))
|
122
|
+
|
123
|
+
def tokenize(text):
    """
    Split ``text`` into ``(start_index, token)`` pairs.

    Two passes over the text:

    1. every whitespace-separated word, stripped of surrounding punctuation;
    2. every span enclosed by a boundary pair (brackets, quotes, spaces),
       with the enclosing characters removed.

    The same span can therefore appear more than once in the result; callers
    are expected to tolerate duplicates.
    """
    # Pairs of boundary characters that delimit nested spans.
    boundary_pairs = [
        ('(', ')'),
        ('[', ']'),
        ('{', '}'),
        ('"', '"'),
        ("'", "'"),
        (' ', ' '),  # Spaces as general separators
        ('\n', ' '),
        ('\n', '\n'),
        (' ', '\n'),
    ]

    tokens = []
    words = text.split()

    # Pass 1: individual words with their starting index.
    index = 0
    for word in words:
        start_index = text.find(word, index)
        cleaned_word = word.strip("()[]{}'\".,;!:")
        if cleaned_word:
            tokens.append((start_index, cleaned_word))
        index = start_index + len(word)

    # Pass 2: spans enclosed by each boundary pair, with their starting index.
    for i in range(len(text)):
        for opener, closer in boundary_pairs:
            if text[i] == opener:
                token = text[i]
                for j in range(i + 1, len(text)):
                    token += text[j]
                    if text[j] == closer:
                        if token.startswith(opener):
                            token = token[1:]
                        if token.endswith(closer):
                            token = token[:-1]
                        tokens.append((i, token))
                        break

    return tokens
|
@@ -0,0 +1,34 @@
|
|
1
|
+
Data retrieved from https://docs.trellix.com/bundle/data-loss-prevention-11.10.x-classification-definitions-reference-guide/page/GUID-8A0A2E8B-D740-476E-B10C-885919573022.html, by running the following code on page
|
2
|
+
|
3
|
+
```js
|
4
|
+
function maptoregexp(elements){
|
5
|
+
elements = Array.from(elements)
|
6
|
+
return JSON.stringify(elements.map(el=>el.innerText), null, 2)
|
7
|
+
}
|
8
|
+
var [validator_section, positive_match_section, negative_match_section] = document.querySelectorAll("section.section")
|
9
|
+
var positive_matchers = maptoregexp(positive_match_section.querySelectorAll("code")), negative_matchers = maptoregexp(negative_match_section.querySelectorAll("code"))
|
10
|
+
var validator = validator_section.querySelector("td") && validator_section.querySelector("td").innerText,
|
11
|
+
description = document.querySelector(".shortdesc").innerText // Break long words at exactly 78 characters
|
12
|
+
.replace(/([^\s]{78})/g, '$1\n\t')
|
13
|
+
// Break long lines honoring whitespace
|
14
|
+
.replace(/([^\n]{1,78})(\s|$)/g, '$1\n\t')
|
15
|
+
.trim()
|
16
|
+
|
17
|
+
python_code = `
|
18
|
+
### The following part is automatically generated from ${location.href}
|
19
|
+
description = """
|
20
|
+
${description}
|
21
|
+
|
22
|
+
validator = ${validator}
|
23
|
+
"""
|
24
|
+
extraction_regex_list = ${positive_matchers}
|
25
|
+
filter_regex_list = ${negative_matchers}
|
26
|
+
extraction_regex = "|".join(extraction_regex_list)
|
27
|
+
|
28
|
+
#end of generated code
|
29
|
+
|
30
|
+
`
|
31
|
+
|
32
|
+
console.log(python_code)
|
33
|
+
copy(python_code)
|
34
|
+
```
|
@@ -0,0 +1,15 @@
|
|
1
|
+
from .amex_card_extractor import AmexCardExtractor
|
2
|
+
from .diners_card_extractor import DinersCardExtractor
|
3
|
+
from .discover_card_extractor import BankCardDiscoverExtractor
|
4
|
+
from .jcb_card_extractor import JCBCardExtractor
|
5
|
+
from .master_card_extractor import MastercardCardExtractor
|
6
|
+
from .union_card_extractor import UnionPayCardExtractor
|
7
|
+
from .visa_card_extractor import VisaCardBaseExtractor
|
8
|
+
|
9
|
+
CARD_EXTRACTORS = [AmexCardExtractor,
|
10
|
+
DinersCardExtractor,
|
11
|
+
BankCardDiscoverExtractor,
|
12
|
+
JCBCardExtractor,
|
13
|
+
MastercardCardExtractor,
|
14
|
+
UnionPayCardExtractor,
|
15
|
+
VisaCardBaseExtractor]
|
@@ -0,0 +1,52 @@
|
|
1
|
+
from ..base_extractor import BaseExtractor
|
2
|
+
from validators import card_number
|
3
|
+
|
4
|
+
class AmexCardExtractor(BaseExtractor):
    """
    Extracts American Express (Amex) payment card numbers using regular expressions.

    Attributes:
        name (str): Registry key for this extractor ("pattern_bank_card_amex").
        extraction_regex (str): Alternation of the generated Amex number patterns.
        filter_function: Luhn validator applied to every candidate match.
    """

    name = "pattern_bank_card_amex"

    # The following part is automatically generated from https://docs.trellix.com/bundle/data-loss-prevention-11.10.x-classification-definitions-reference-guide/page/GUID-97839BB4-3077-4BB0-9974-CF8EEB0E2426.html
    description = """
American Express Credit Card Number (CCN) is a 15-digit number starting with
34 or 37 and might have dashes (hyphens) or spaces as separators. For example,
NNNN-NNNNNN-NNNNN or NNNN NNNNNN NNNNN. Matches exclude common test card
numbers.

validator = Luhn 10 (remainder 0)
"""
    # Separator variants: spaces, dashes, or no separator.
    extraction_regex_list = [
        "\\b3[47]\\d{2} \\d{6} \\d{5}\\b",
        "\\b3[47]\\d{2}-\\d{6}-\\d{5}\\b",
        "\\b3[47]\\d{13}\\b"
    ]
    # NOTE(review): these generated test-card patterns are not referenced by the
    # extraction code visible here; filtering relies on the Luhn check in
    # filter_function — confirm whether test numbers should also be excluded.
    filter_regex_list = [
        "3400([- ]?)000000([- ]?)00009",
        "3411([- ]?)111111([- ]?)11111",
        "3434([- ]?)343434([- ]?)34343",
        "3456([- ]?)789012([- ]?)34564",
        "3456([- ]?)400000([- ]?)55123",
        "3468([- ]?)276304([- ]?)35344",
        "3700([- ]?)000000([ -]?)00002",
        "3700([- ]?)002000([ -]?)00000",
        "3704([- ]?)072699([ -]?)09809",
        "3705([- ]?)560193([ -]?)09221",
        "3714([- ]?)496353([ -]?)98431",
        "3742([- ]?)000000([ -]?)00004",
        "3756([- ]?)400000([ -]?)55123",
        "3764([- ]?)622809([ -]?)21451",
        "3777([- ]?)527498([ -]?)96404",
        "3782([- ]?)822463([ -]?)10005",
        "3787([- ]?)344936([ -]?)71000"
    ]
    extraction_regex = "|".join(extraction_regex_list)

    # end of generated code

    filter_function = card_number
|
@@ -0,0 +1,47 @@
|
|
1
|
+
from validators import card_number
|
2
|
+
from ..base_extractor import BaseExtractor
|
3
|
+
|
4
|
+
|
5
|
+
class DinersCardExtractor(BaseExtractor):
    """
    Extracts Diners Club payment card numbers using regular expressions.

    Attributes:
        name (str): Registry key for this extractor ("pattern_bank_card_diners").
        extraction_regex (str): Alternation of the generated Diners Club number patterns.
        filter_function: Luhn validator applied to every candidate match.
    """

    name = "pattern_bank_card_diners"

    # The following part is automatically generated from https://docs.trellix.com/bundle/data-loss-prevention-11.10.x-classification-definitions-reference-guide/page/GUID-2B5CF316-ED36-4CAB-92D7-AC46714E9882.html
    description = """
Credit Card Number (Diner's Club) is a 14-digit number beginning with 300–305,
36, 38, or 39 and might have dashes (hyphens) or spaces as separators. For
example, NNNN-NNNNNN-NNNN or NNNN NNNNNN NNNN. Matches exclude common test
card numbers.

validator = Luhn 10 (remainder 0)
"""
    # Separator variants: dashes, spaces, or no separator, for both prefix ranges.
    extraction_regex_list = [
        "\\b30[0-5]\\d-\\d{6}-\\d{4}\\b",
        "\\b30[0-5]\\d \\d{6} \\d{4}\\b",
        "\\b30[0-5]\\d{11}\\b",
        "\\b3[689]\\d{2}-\\d{6}-\\d{4}\\b",
        "\\b3[689]\\d{2} \\d{6} \\d{4}\\b",
        "\\b3[689]\\d{12}\\b"
    ]
    # NOTE(review): these generated test-card patterns are not referenced by the
    # extraction code visible here; filtering relies on the Luhn check in
    # filter_function — confirm whether test numbers should also be excluded.
    filter_regex_list = [
        "3020([ -]?)416932([ -]?)2643",
        "3021([ -]?)804719([ -]?)6557",
        "3022([ -]?)151156([ -]?)3252",
        "3046([ -]?)400000([ -]?)5512",
        "3600([ -]?)000000([ -]?)0008",
        "3614([ -]?)890064([ -]?)7913",
        "3670([ -]?)010200([ -]?)0000",
        "3852([ -]?)000002([ -]?)3237",
        "3912([ -]?)345678([ -]?)9019"
    ]
    extraction_regex = "|".join(extraction_regex_list)

    # end of generated code
    filter_function = card_number
|
@@ -0,0 +1,48 @@
|
|
1
|
+
from validators import card_number
|
2
|
+
from ..base_extractor import BaseExtractor
|
3
|
+
|
4
|
+
|
5
|
+
class BankCardDiscoverExtractor(BaseExtractor):
    """
    Extracts Discover payment card numbers using regular expressions.

    Attributes:
        name (str): Registry key for this extractor ("pattern_bank_card_discover").
        extraction_regex (str): Alternation of the generated Discover number patterns.
        filter_function: Luhn validator applied to every candidate match.
    """

    name = "pattern_bank_card_discover"

    # The following part is automatically generated from https://docs.trellix.com/bundle/data-loss-prevention-11.10.x-classification-definitions-reference-guide/page/GUID-EF96B5CE-6C8E-49B2-BD19-82B0CE0E5091.html
    description = """
Credit Card Number (Discover) is a 16-digit number beginning with 6011,
644–649 or 65 and might have dashes (hyphens) or spaces as separators. For
example, NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNN NNNN. This excludes common test
card numbers.

validator = Luhn 10 (remainder 0)
"""
    # Separator variants: dashes, spaces, or no separator, for each prefix range.
    extraction_regex_list = [
        "\\b6011-\\d{4}-\\d{4}-\\d{4}\\b",
        "\\b6011 \\d{4} \\d{4} \\d{4}\\b",
        "\\b6011\\d{12}\\b",
        "\\b64[4-9]\\d-\\d{4}-\\d{4}-\\d{4}\\b",
        "\\b64[4-9]\\d \\d{4} \\d{4} \\d{4}\\b",
        "\\b64[4-9]\\d{13}\\b",
        "\\b65\\d{2}-\\d{4}-\\d{4}-\\d{4}\\b",
        "\\b65\\d{2} \\d{4} \\d{4} \\d{4}\\b",
        "\\b65\\d{14}\\b"
    ]
    # NOTE(review): these generated test-card patterns are not referenced by the
    # extraction code visible here; filtering relies on the Luhn check in
    # filter_function — confirm whether test numbers should also be excluded.
    filter_regex_list = [
        "6011([ -]?)0009([ -]?)9013([ -]?)9424",
        "6011([ -]?)1111([ -]?)1111([ -]?)1117",
        "6011([ -]?)1532([ -]?)1637([ -]?)1980",
        "6011([ -]?)6011([ -]?)6011([ -]?)6611",
        "6011([ -]?)6874([ -]?)8256([ -]?)4166",
        "6011([ -]?)8148([ -]?)3690([ -]?)5651",
        "6556([ -]?)4000([ -]?)0055([ -]?)1234"
    ]
    extraction_regex = "|".join(extraction_regex_list)

    # end of generated code
    filter_function = card_number
|
@@ -0,0 +1,43 @@
|
|
1
|
+
from validators import card_number
|
2
|
+
from ..base_extractor import BaseExtractor
|
3
|
+
|
4
|
+
|
5
|
+
class JCBCardExtractor(BaseExtractor):
    """
    Extracts JCB payment card numbers using regular expressions.

    Attributes:
        name (str): Registry key for this extractor ("pattern_bank_card_jcb").
        extraction_regex (str): Alternation of the generated JCB number patterns.
        filter_function: Luhn validator applied to every candidate match.
    """

    name = "pattern_bank_card_jcb"

    # The following part is automatically generated from https://docs.trellix.com/bundle/data-loss-prevention-11.10.x-classification-definitions-reference-guide/page/GUID-D56393B2-2C27-4CAF-A7C3-AE83298BD96B.html
    description = """
JCB CCN is a 16-digit number beginning with 3528 or 3589 and might have dashes
(hyphens) or spaces as separators. For example, NNNN-NNNN-NNNN-NNNN or NNNN
NNNN NNNNNNNN. This excludes common test card numbers.

validator = Luhn 10 (remainder 0)
"""
    # Separator variants: dashes, spaces, or no separator, for each prefix range.
    extraction_regex_list = [
        "\\b352[89]-\\d{4}-\\d{4}-\\d{4}\\b",
        "\\b352[89] \\d{4} \\d{4} \\d{4}\\b",
        "\\b352[89]\\d{12}\\b",
        "\\b35[3-8]\\d-\\d{4}-\\d{4}-\\d{4}\\b",
        "\\b35[3-8]\\d \\d{4} \\d{4} \\d{4}\\b",
        "\\b35[3-8]\\d{13}\\b"
    ]
    # NOTE(review): these generated test-card patterns are not referenced by the
    # extraction code visible here; filtering relies on the Luhn check in
    # filter_function — confirm whether test numbers should also be excluded.
    filter_regex_list = [
        "3528([ -]?)0007([ -]?)0000([ -]?)0000",
        "3528([ -]?)7237([ -]?)4002([ -]?)2896",
        "3530([ -]?)1113([ -]?)3330([ -]?)0000",
        "3556([ -]?)4000([ -]?)0055([ -]?)1234",
        "3566([ -]?)0020([ -]?)2036([ -]?)0505",
        "3569([ -]?)9900([ -]?)0000([ -]?)0009"
    ]
    extraction_regex = "|".join(extraction_regex_list)

    # end of generated code
    filter_function = card_number
|
@@ -0,0 +1,63 @@
|
|
1
|
+
from validators import card_number
|
2
|
+
from ..base_extractor import BaseExtractor
|
3
|
+
|
4
|
+
|
5
|
+
class MastercardCardExtractor(BaseExtractor):
    """
    Extracts Mastercard payment card numbers using regular expressions.

    Attributes:
        name (str): Registry key for this extractor ("pattern_bank_card_mastercard").
        extraction_regex (str): Alternation of the generated Mastercard number patterns.
        filter_function: Luhn validator applied to every candidate match.
    """

    name = "pattern_bank_card_mastercard"

    # The following part is automatically generated from https://docs.trellix.com/bundle/data-loss-prevention-11.10.x-classification-definitions-reference-guide/page/GUID-8A0A2E8B-D740-476E-B10C-885919573022.html
    description = """
Credit Card Number (Mastercard) is a 16-digit number beginning with 51–55 or
2221– 2720 and might have dashes (hyphens) or spaces as separators. For
example, NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNN NNNN. This excludes common test
card numbers.

validator = Luhn 10 (remainder 0)
"""
    # Separator variants: dashes, spaces, or no separator, for both prefix ranges.
    # NOTE(review): the 2[2-7]... patterns are broader than the documented
    # 2221–2720 range; the excess appears to be carved out by filter_regex_list
    # below, which is not referenced by the visible extraction code — confirm.
    extraction_regex_list = [
        "\\b5[1-5]\\d{2}-\\d{4}-\\d{4}-\\d{4}\\b",
        "\\b5[1-5]\\d{2} \\d{4} \\d{4} \\d{4}\\b",
        "\\b5[1-5]\\d{14}\\b",
        "\\b2[2-7]\\d{2}-\\d{4}-\\d{4}-\\d{4}\\b",
        "\\b2[2-7]\\d{2} \\d{4} \\d{4} \\d{4}\\b",
        "\\b2[2-7]\\d{14}\\b"
    ]
    filter_regex_list = [
        "5100([- ]?)0800([ -]?)0000([ -]?)0000",
        "5105([- ]?)1051([ -]?)0510([ -]?)5100",
        "5111([- ]?)1111([ -]?)1111([ -]?)1118",
        "5123([- ]?)4567([ -]?)8901([ -]?)2346",
        "5123([- ]?)6197([ -]?)4539([ -]?)5853",
        "5138([- ]?)4951([ -]?)2555([ -]?)0554",
        "5274([- ]?)5763([ -]?)9425([ -]?)9961",
        "5301([- ]?)7455([ -]?)2913([ -]?)8831",
        "5311([- ]?)5312([ -]?)8600([ -]?)0465",
        "5364([- ]?)5870([ -]?)1178([ -]?)5834",
        "5404([- ]?)0000([ -]?)0000([ -]?)0001",
        "5431([- ]?)1111([ -]?)1111([ -]?)1111",
        "5454([- ]?)5454([ -]?)5454([ -]?)5454",
        "5459([- ]?)8862([ -]?)6563([ -]?)1843",
        "5460([- ]?)5060([ -]?)4803([ -]?)9935",
        "5500([- ]?)9391([ -]?)7800([ -]?)4613",
        "5555([- ]?)5555([ -]?)5555([ -]?)4444",
        "5556([- ]?)4000([ -]?)0055([ -]?)1234",
        "5565([- ]?)5520([ -]?)6448([ -]?)1449",
        "5597([- ]?)5076([ -]?)4491([ -]?)0558",
        "220\\d([- ]?)\\d{4}([ -]?)\\d{4}([ -]?)\\d{4}",
        "221\\d([- ]?)\\d{4}([ -]?)\\d{4}([ -]?)\\d{4}",
        "2220([- ]?)\\d{4}([ -]?)\\d{4}([ -]?)\\d{4}",
        "272[1-9]([- ]?)\\d{4}([ -]?)\\d{4}([ -]?)\\d{4}",
        "27[3-9]\\d([- ]?)\\d{4}([ -]?)\\d{4}([ -]?)\\d{4}"
    ]
    extraction_regex = "|".join(extraction_regex_list)

    # end of generated code
    filter_function = card_number
|
@@ -0,0 +1,38 @@
|
|
1
|
+
from ..base_extractor import BaseExtractor
|
2
|
+
|
3
|
+
|
4
|
+
class UnionPayCardExtractor(BaseExtractor):
    """
    Extracts China UnionPay payment card numbers using regular expressions.

    This class inherits from BaseExtractor, which defines the basic structure and functionality for all extractors.

    Attributes:
        name (str): Registry key for this extractor ("pattern_bank_card_union_pay").
        extraction_regex (str): Alternation of the generated UnionPay number patterns.
    """

    name = "pattern_bank_card_union_pay"

    # The following part is automatically generated from https://docs.trellix.com/bundle/data-loss-prevention-11.10.x-classification-definitions-reference-guide/page/GUID-B8D29ECE-E70A-401E-B18D-B773F4FF71ED.html
    description = """
The China UnionPay credit card numbers begin with 62 or 60 and is a 16-19
digit long number.

validator = China Union Pay Card validator
"""
    # NOTE(review): unlike the other card extractors, no filter_function (Luhn
    # check) is set here, so matches are emitted unvalidated; the description
    # mentions a China UnionPay validator — confirm the omission is intentional.
    extraction_regex_list = [
        "\\b622\\d{13,16}\\b",
        "\\b603601\\d{10}\\b",
        "\\b603265\\d{10}\\b",
        "\\b621977\\d{10}\\b",
        "\\b603708\\d{10}\\b",
        "\\b602969\\d{10}\\b",
        "\\b601428\\d{10}\\b",
        "\\b603367\\d{10}\\b",
        "\\b603694\\d{10}\\b"
    ]
    filter_regex_list = []
    extraction_regex = "|".join(extraction_regex_list)

    # end of generated code
|
@@ -0,0 +1,46 @@
|
|
1
|
+
from validators import card_number
|
2
|
+
from ..base_extractor import BaseExtractor
|
3
|
+
|
4
|
+
|
5
|
+
class VisaCardBaseExtractor(BaseExtractor):
    """
    Extracts VISA payment card numbers using regular expressions.

    Attributes:
        name (str): Registry key for this extractor ("pattern_bank_card_visa").
        extraction_regex (str): Alternation of the generated VISA number patterns.
        filter_function: Luhn validator applied to every candidate match.
    """

    name = "pattern_bank_card_visa"

    # The following part is automatically generated from https://docs.trellix.com/bundle/data-loss-prevention-11.10.x-classification-definitions-reference-guide/page/GUID-66A52D16-CA41-4509-826D-8D29B1F968C2.html
    description = """
Credit Card Number (Visa) is 16-digit number and might have dashes (hyphen) or
spaces as separators. For example, NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNN NNNN.
This excludes common test card numbers.

validator = Luhn 10 (remainder 0)
"""
    # Separator variants: dashes, spaces, or no separator.
    extraction_regex_list = [
        "\\b4\\d{3}-\\d{4}-\\d{4}-\\d{4}\\b",
        "\\b4\\d{3} \\d{4} \\d{4} \\d{4}\\b",
        "\\b4\\d{15}\\b"
    ]
    # NOTE(review): these generated test-card patterns are not referenced by the
    # extraction code visible here; filtering relies on the Luhn check in
    # filter_function — confirm whether test numbers should also be excluded.
    filter_regex_list = [
        "4005([ -]?)5500([ -]?)0000([ -]?)0001",
        "4012([ -]?)8888([ -]?)8888([ -]?)1881",
        "4111([ -]?)1111([ -]?)1111([ -]?)1111",
        "4444([ -]?)3333([ -]?)2222([ -]?)1111",
        "4539([ -]?)1050([ -]?)1153([ -]?)9664",
        "4555([ -]?)4000([ -]?)0055([ -]?)5123",
        "4564([ -]?)4564([ -]?)4564([ -]?)4564",
        "4544([ -]?)1821([ -]?)7453([ -]?)7267",
        "4716([ -]?)9147([ -]?)0653([ -]?)4228",
        "4916([ -]?)5417([ -]?)1375([ -]?)7159",
        "4916([ -]?)6156([ -]?)3934([ -]?)6972",
        "4917([ -]?)6100([ -]?)0000([ -]?)0000"
    ]
    extraction_regex = "|".join(extraction_regex_list)

    # end of generated code
    filter_function = card_number
|