credsweeper-1.11.2-py3-none-any.whl → credsweeper-1.11.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of credsweeper might be problematic. Click here for more details.
- credsweeper/__init__.py +1 -1
- credsweeper/__main__.py +7 -5
- credsweeper/app.py +28 -47
- credsweeper/common/constants.py +2 -5
- credsweeper/common/keyword_pattern.py +15 -9
- credsweeper/common/morpheme_checklist.txt +4 -2
- credsweeper/credentials/candidate_key.py +1 -1
- credsweeper/credentials/credential_manager.py +4 -3
- credsweeper/credentials/line_data.py +16 -15
- credsweeper/deep_scanner/abstract_scanner.py +10 -1
- credsweeper/deep_scanner/deb_scanner.py +48 -0
- credsweeper/deep_scanner/deep_scanner.py +65 -43
- credsweeper/deep_scanner/docx_scanner.py +1 -1
- credsweeper/deep_scanner/encoder_scanner.py +2 -2
- credsweeper/deep_scanner/gzip_scanner.py +1 -1
- credsweeper/deep_scanner/html_scanner.py +3 -3
- credsweeper/deep_scanner/jks_scanner.py +2 -4
- credsweeper/deep_scanner/lang_scanner.py +2 -2
- credsweeper/deep_scanner/lzma_scanner.py +40 -0
- credsweeper/deep_scanner/pkcs12_scanner.py +3 -5
- credsweeper/deep_scanner/xml_scanner.py +2 -2
- credsweeper/file_handler/byte_content_provider.py +2 -2
- credsweeper/file_handler/content_provider.py +1 -1
- credsweeper/file_handler/data_content_provider.py +23 -14
- credsweeper/file_handler/diff_content_provider.py +2 -2
- credsweeper/file_handler/file_path_extractor.py +1 -1
- credsweeper/file_handler/files_provider.py +2 -4
- credsweeper/file_handler/patches_provider.py +1 -1
- credsweeper/file_handler/string_content_provider.py +2 -2
- credsweeper/file_handler/struct_content_provider.py +1 -1
- credsweeper/file_handler/text_content_provider.py +2 -2
- credsweeper/filters/value_array_dictionary_check.py +3 -1
- credsweeper/filters/value_azure_token_check.py +1 -2
- credsweeper/filters/value_base64_encoded_pem_check.py +1 -1
- credsweeper/filters/value_base64_part_check.py +30 -21
- credsweeper/filters/value_discord_bot_check.py +1 -2
- credsweeper/filters/value_entropy_base32_check.py +11 -31
- credsweeper/filters/value_entropy_base36_check.py +11 -34
- credsweeper/filters/value_entropy_base64_check.py +15 -48
- credsweeper/filters/value_entropy_base_check.py +37 -0
- credsweeper/filters/value_file_path_check.py +1 -1
- credsweeper/filters/value_hex_number_check.py +3 -3
- credsweeper/filters/value_json_web_token_check.py +4 -5
- credsweeper/filters/value_pattern_check.py +64 -16
- credsweeper/filters/value_string_type_check.py +11 -3
- credsweeper/filters/value_token_base32_check.py +0 -4
- credsweeper/filters/value_token_base36_check.py +0 -4
- credsweeper/filters/value_token_base64_check.py +0 -4
- credsweeper/filters/value_token_check.py +1 -1
- credsweeper/ml_model/features/file_extension.py +2 -2
- credsweeper/ml_model/features/morpheme_dense.py +0 -4
- credsweeper/ml_model/features/rule_name.py +1 -1
- credsweeper/ml_model/features/word_in_path.py +0 -9
- credsweeper/ml_model/features/word_in_postamble.py +0 -11
- credsweeper/ml_model/features/word_in_preamble.py +0 -11
- credsweeper/ml_model/features/word_in_transition.py +0 -11
- credsweeper/ml_model/features/word_in_value.py +0 -11
- credsweeper/ml_model/features/word_in_variable.py +0 -11
- credsweeper/ml_model/ml_validator.py +45 -22
- credsweeper/rules/config.yaml +238 -208
- credsweeper/rules/rule.py +3 -3
- credsweeper/scanner/scan_type/scan_type.py +2 -3
- credsweeper/scanner/scanner.py +7 -1
- credsweeper/secret/config.json +16 -5
- credsweeper/utils/hop_stat.py +3 -3
- credsweeper/utils/pem_key_detector.py +8 -7
- credsweeper/utils/util.py +76 -146
- {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/METADATA +1 -1
- {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/RECORD +72 -70
- credsweeper/utils/entropy_validator.py +0 -72
- {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/WHEEL +0 -0
- {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/entry_points.txt +0 -0
- {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -15,10 +15,10 @@ class FileExtension(WordIn):
|
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
def __init__(self, extensions: List[str]) -> None:
|
|
18
|
-
super().__init__(extensions)
|
|
18
|
+
super().__init__(words=extensions)
|
|
19
19
|
|
|
20
20
|
def __call__(self, candidates: List[Candidate]) -> np.ndarray:
|
|
21
|
-
extension_set = set(
|
|
21
|
+
extension_set = set(candidate.line_data_list[0].file_type.lower() for candidate in candidates)
|
|
22
22
|
return self.word_in_set(extension_set)
|
|
23
23
|
|
|
24
24
|
def extract(self, candidate: Candidate) -> Any:
|
|
@@ -6,10 +6,6 @@ from credsweeper.ml_model.features.feature import Feature
|
|
|
6
6
|
class MorphemeDense(Feature):
|
|
7
7
|
"""Feature calculates morphemes density for a value"""
|
|
8
8
|
|
|
9
|
-
def __init__(self) -> None:
|
|
10
|
-
"""Class initializer"""
|
|
11
|
-
super().__init__()
|
|
12
|
-
|
|
13
9
|
def extract(self, candidate: Candidate) -> float:
|
|
14
10
|
if value := candidate.line_data_list[0].value.lower():
|
|
15
11
|
morphemes_counter = 0
|
|
@@ -15,7 +15,7 @@ class RuleName(WordIn):
|
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
def __init__(self, rule_names: List[str]) -> None:
|
|
18
|
-
super().__init__(rule_names)
|
|
18
|
+
super().__init__(words=rule_names)
|
|
19
19
|
|
|
20
20
|
def __call__(self, candidates: List[Candidate]) -> np.ndarray:
|
|
21
21
|
candidate_rule_set = set(x.rule_name for x in candidates)
|
|
@@ -10,15 +10,6 @@ from credsweeper.ml_model.features.word_in import WordIn
|
|
|
10
10
|
class WordInPath(WordIn):
|
|
11
11
|
"""Categorical feature that corresponds to words in path (POSIX, lowercase)"""
|
|
12
12
|
|
|
13
|
-
def __init__(self, words: List[str]) -> None:
|
|
14
|
-
"""WordInPath constructor
|
|
15
|
-
|
|
16
|
-
Args:
|
|
17
|
-
words: list of predefined words - MUST BE IN LOWER CASE & POSIX
|
|
18
|
-
|
|
19
|
-
"""
|
|
20
|
-
super().__init__(words)
|
|
21
|
-
|
|
22
13
|
def __call__(self, candidates: List[Candidate]) -> np.ndarray:
|
|
23
14
|
# actually there must be one path because the candidates are grouped before
|
|
24
15
|
if file_path := candidates[0].line_data_list[0].path:
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
from typing import List
|
|
2
|
-
|
|
3
1
|
import numpy as np
|
|
4
2
|
|
|
5
3
|
from credsweeper.common.constants import ML_HUNK
|
|
@@ -10,15 +8,6 @@ from credsweeper.ml_model.features.word_in import WordIn
|
|
|
10
8
|
class WordInPostamble(WordIn):
|
|
11
9
|
"""Feature is true if line contains at least one word from predefined list."""
|
|
12
10
|
|
|
13
|
-
def __init__(self, words: List[str]) -> None:
|
|
14
|
-
"""Feature returns array of matching words
|
|
15
|
-
|
|
16
|
-
Args:
|
|
17
|
-
words: list of predefined words - MUST BE IN LOWER CASE
|
|
18
|
-
|
|
19
|
-
"""
|
|
20
|
-
super().__init__(words)
|
|
21
|
-
|
|
22
11
|
def extract(self, candidate: Candidate) -> np.ndarray:
|
|
23
12
|
"""Returns true if any words in a part of line after value"""
|
|
24
13
|
postamble_end = len(candidate.line_data_list[0].line) \
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
from typing import List
|
|
2
|
-
|
|
3
1
|
import numpy as np
|
|
4
2
|
|
|
5
3
|
from credsweeper.common.constants import ML_HUNK
|
|
@@ -10,15 +8,6 @@ from credsweeper.ml_model.features.word_in import WordIn
|
|
|
10
8
|
class WordInPreamble(WordIn):
|
|
11
9
|
"""Feature is true if line contains at least one word from predefined list."""
|
|
12
10
|
|
|
13
|
-
def __init__(self, words: List[str]) -> None:
|
|
14
|
-
"""Feature returns array of matching words
|
|
15
|
-
|
|
16
|
-
Args:
|
|
17
|
-
words: list of predefined words - MUST BE IN LOWER CASE
|
|
18
|
-
|
|
19
|
-
"""
|
|
20
|
-
super().__init__(words)
|
|
21
|
-
|
|
22
11
|
def extract(self, candidate: Candidate) -> np.ndarray:
|
|
23
12
|
"""Returns true if any words in line before variable or value"""
|
|
24
13
|
if 0 <= candidate.line_data_list[0].variable_start:
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
from typing import List
|
|
2
|
-
|
|
3
1
|
import numpy as np
|
|
4
2
|
|
|
5
3
|
from credsweeper.credentials import Candidate
|
|
@@ -9,15 +7,6 @@ from credsweeper.ml_model.features.word_in import WordIn
|
|
|
9
7
|
class WordInTransition(WordIn):
|
|
10
8
|
"""Feature is true if line contains at least one word from predefined list."""
|
|
11
9
|
|
|
12
|
-
def __init__(self, words: List[str]) -> None:
|
|
13
|
-
"""Feature returns array of matching words
|
|
14
|
-
|
|
15
|
-
Args:
|
|
16
|
-
words: list of predefined words - MUST BE IN LOWER CASE
|
|
17
|
-
|
|
18
|
-
"""
|
|
19
|
-
super().__init__(words)
|
|
20
|
-
|
|
21
10
|
def extract(self, candidate: Candidate) -> np.ndarray:
|
|
22
11
|
"""Returns true if any words between variable and value"""
|
|
23
12
|
if 0 <= candidate.line_data_list[0].variable_end < candidate.line_data_list[0].value_start:
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
from typing import List
|
|
2
|
-
|
|
3
1
|
import numpy as np
|
|
4
2
|
|
|
5
3
|
from credsweeper.credentials import Candidate
|
|
@@ -9,15 +7,6 @@ from credsweeper.ml_model.features.word_in import WordIn
|
|
|
9
7
|
class WordInValue(WordIn):
|
|
10
8
|
"""Feature returns true if candidate value contains at least one word from predefined list."""
|
|
11
9
|
|
|
12
|
-
def __init__(self, words: List[str]) -> None:
|
|
13
|
-
"""Feature is true if candidate value contains at least one predefined word.
|
|
14
|
-
|
|
15
|
-
Args:
|
|
16
|
-
words: list of predefined words - MUST BE IN LOWER CASE and SORTED (preferred)
|
|
17
|
-
|
|
18
|
-
"""
|
|
19
|
-
super().__init__(words)
|
|
20
|
-
|
|
21
10
|
def extract(self, candidate: Candidate) -> np.ndarray:
|
|
22
11
|
"""Returns array of matching words for first line"""
|
|
23
12
|
if value := candidate.line_data_list[0].value:
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
from typing import List
|
|
2
|
-
|
|
3
1
|
import numpy as np
|
|
4
2
|
|
|
5
3
|
from credsweeper.credentials import Candidate
|
|
@@ -9,15 +7,6 @@ from credsweeper.ml_model.features.word_in import WordIn
|
|
|
9
7
|
class WordInVariable(WordIn):
|
|
10
8
|
"""Feature returns array of words matching in variable"""
|
|
11
9
|
|
|
12
|
-
def __init__(self, words: List[str]) -> None:
|
|
13
|
-
"""Feature is true if candidate value contains at least one predefined word.
|
|
14
|
-
|
|
15
|
-
Args:
|
|
16
|
-
words: list of predefined words - MUST BE IN LOWER CASE
|
|
17
|
-
|
|
18
|
-
"""
|
|
19
|
-
super().__init__(words)
|
|
20
|
-
|
|
21
10
|
def extract(self, candidate: Candidate) -> np.ndarray:
|
|
22
11
|
"""Returns array of matching words for first line"""
|
|
23
12
|
if variable := candidate.line_data_list[0].variable:
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
import hashlib
|
|
2
|
+
import json
|
|
2
3
|
import logging
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
from typing import List, Tuple, Union, Optional, Dict
|
|
5
6
|
|
|
6
7
|
import numpy as np
|
|
7
|
-
|
|
8
|
+
from onnxruntime import InferenceSession
|
|
8
9
|
|
|
9
10
|
import credsweeper.ml_model.features as features
|
|
10
11
|
from credsweeper.common.constants import ThresholdPreset, ML_HUNK
|
|
@@ -22,6 +23,8 @@ class MlValidator:
|
|
|
22
23
|
# applied for unknown characters
|
|
23
24
|
FAKE_CHAR = '\x01'
|
|
24
25
|
|
|
26
|
+
_dir_path = Path(__file__).parent
|
|
27
|
+
|
|
25
28
|
def __init__(
|
|
26
29
|
self, #
|
|
27
30
|
threshold: Union[float, ThresholdPreset], #
|
|
@@ -36,35 +39,36 @@ class MlValidator:
|
|
|
36
39
|
ml_model: path to ml model
|
|
37
40
|
ml_providers: coma separated list of providers https://onnxruntime.ai/docs/execution-providers/
|
|
38
41
|
"""
|
|
39
|
-
|
|
42
|
+
self.__session: Optional[InferenceSession] = None
|
|
40
43
|
|
|
41
44
|
if ml_config:
|
|
42
45
|
ml_config_path = Path(ml_config)
|
|
43
46
|
else:
|
|
44
|
-
ml_config_path =
|
|
47
|
+
ml_config_path = MlValidator._dir_path / "ml_config.json"
|
|
45
48
|
with open(ml_config_path, "rb") as f:
|
|
46
|
-
|
|
49
|
+
__ml_config_data = f.read()
|
|
50
|
+
|
|
51
|
+
model_config = json.loads(__ml_config_data)
|
|
47
52
|
|
|
48
53
|
if ml_model:
|
|
49
54
|
ml_model_path = Path(ml_model)
|
|
50
55
|
else:
|
|
51
|
-
ml_model_path =
|
|
56
|
+
ml_model_path = MlValidator._dir_path / "ml_model.onnx"
|
|
52
57
|
with open(ml_model_path, "rb") as f:
|
|
53
|
-
|
|
58
|
+
self.__ml_model_data = f.read()
|
|
54
59
|
|
|
55
60
|
if ml_providers:
|
|
56
|
-
providers = ml_providers.split(',')
|
|
61
|
+
self.providers = ml_providers.split(',')
|
|
57
62
|
else:
|
|
58
|
-
providers = ["CPUExecutionProvider"]
|
|
59
|
-
self.model_session = ort.InferenceSession(ml_model_path, providers=providers)
|
|
63
|
+
self.providers = ["CPUExecutionProvider"]
|
|
60
64
|
|
|
61
|
-
model_config = Util.json_load(ml_config_path)
|
|
62
65
|
if isinstance(threshold, float):
|
|
63
66
|
self.threshold = threshold
|
|
64
67
|
elif isinstance(threshold, ThresholdPreset) and "thresholds" in model_config:
|
|
65
68
|
self.threshold = model_config["thresholds"][threshold.value]
|
|
66
69
|
else:
|
|
67
70
|
self.threshold = 0.5
|
|
71
|
+
logger.warning(f"Use fallback threshold value: {self.threshold}")
|
|
68
72
|
|
|
69
73
|
char_set = set(model_config["char_set"])
|
|
70
74
|
if len(char_set) != len(model_config["char_set"]):
|
|
@@ -80,25 +84,44 @@ class MlValidator:
|
|
|
80
84
|
|
|
81
85
|
self.common_feature_list = []
|
|
82
86
|
self.unique_feature_list = []
|
|
83
|
-
logger.
|
|
84
|
-
|
|
85
|
-
|
|
87
|
+
if logger.isEnabledFor(logging.INFO):
|
|
88
|
+
config_dbg = str(model_config) if logger.isEnabledFor(logging.DEBUG) else ''
|
|
89
|
+
config_md5 = hashlib.md5(__ml_config_data).hexdigest()
|
|
90
|
+
model_md5 = hashlib.md5(self.__ml_model_data).hexdigest()
|
|
91
|
+
logger.info("Init ML validator with providers: '%s' ; model:'%s' md5:%s ; config:'%s' md5:%s ; %s",
|
|
92
|
+
self.providers, ml_config_path, config_md5, ml_model_path, model_md5, config_dbg)
|
|
86
93
|
for feature_definition in model_config["features"]:
|
|
87
94
|
feature_class = feature_definition["type"]
|
|
88
95
|
kwargs = feature_definition.get("kwargs", {})
|
|
89
96
|
feature_constructor = getattr(features, feature_class, None)
|
|
90
97
|
if feature_constructor is None:
|
|
91
|
-
raise ValueError(f
|
|
98
|
+
raise ValueError(f"Error while parsing model details. Cannot create feature '{feature_class}'"
|
|
99
|
+
f" from {feature_definition}")
|
|
92
100
|
try:
|
|
93
101
|
feature = feature_constructor(**kwargs)
|
|
94
102
|
except TypeError:
|
|
95
|
-
|
|
96
|
-
|
|
103
|
+
logger.error(f"Error while parsing model details. Cannot create feature '{feature_class}'"
|
|
104
|
+
f" from {feature_definition}")
|
|
105
|
+
raise
|
|
97
106
|
if feature_definition["type"] in ["RuleName"]:
|
|
98
107
|
self.unique_feature_list.append(feature)
|
|
99
108
|
else:
|
|
100
109
|
self.common_feature_list.append(feature)
|
|
101
110
|
|
|
111
|
+
def __reduce__(self):
|
|
112
|
+
# TypeError: cannot pickle 'onnxruntime.capi.onnxruntime_pybind11_state.InferenceSession' object
|
|
113
|
+
self.__session = None
|
|
114
|
+
return super().__reduce__()
|
|
115
|
+
|
|
116
|
+
@property
|
|
117
|
+
def session(self) -> InferenceSession:
|
|
118
|
+
"""session getter to prevent pickle error"""
|
|
119
|
+
if not self.__session:
|
|
120
|
+
self.__session = InferenceSession(self.__ml_model_data, providers=self.providers)
|
|
121
|
+
if not self.__session:
|
|
122
|
+
raise RuntimeError("InferenceSession was not initialized!")
|
|
123
|
+
return self.__session
|
|
124
|
+
|
|
102
125
|
def encode(self, text: str, limit: int) -> np.ndarray:
|
|
103
126
|
"""Encodes prepared text to array"""
|
|
104
127
|
result_array: np.ndarray = np.zeros(shape=(limit, self.num_classes), dtype=np.float32)
|
|
@@ -135,7 +158,7 @@ class MlValidator:
|
|
|
135
158
|
"value_input": value_input.astype(np.float32),
|
|
136
159
|
"feature_input": feature_input.astype(np.float32),
|
|
137
160
|
}
|
|
138
|
-
result = self.
|
|
161
|
+
result = self.session.run(output_names=None, input_feed=input_feed)
|
|
139
162
|
if result and isinstance(result[0], np.ndarray):
|
|
140
163
|
return result[0]
|
|
141
164
|
raise RuntimeError(f"Unexpected type {type(result[0])}")
|
|
@@ -177,8 +200,8 @@ class MlValidator:
|
|
|
177
200
|
default_candidate = candidates[0]
|
|
178
201
|
line_input = self.encode_line(default_candidate.line_data_list[0].line,
|
|
179
202
|
default_candidate.line_data_list[0].value_start)[np.newaxis]
|
|
180
|
-
variable =
|
|
181
|
-
value =
|
|
203
|
+
variable = ''
|
|
204
|
+
value = ''
|
|
182
205
|
for candidate in candidates:
|
|
183
206
|
if not variable and candidate.line_data_list[0].variable:
|
|
184
207
|
variable = candidate.line_data_list[0].variable
|
|
@@ -229,7 +252,7 @@ class MlValidator:
|
|
|
229
252
|
features_list = []
|
|
230
253
|
probability: np.ndarray = np.zeros(len(group_list), dtype=np.float32)
|
|
231
254
|
head = tail = 0
|
|
232
|
-
for
|
|
255
|
+
for _group_key, candidates in group_list:
|
|
233
256
|
line_input, variable_input, value_input, feature_array = self.get_group_features(candidates)
|
|
234
257
|
line_input_list.append(line_input)
|
|
235
258
|
variable_input_list.append(variable_input)
|
|
@@ -250,8 +273,8 @@ class MlValidator:
|
|
|
250
273
|
features_list)
|
|
251
274
|
is_cred = probability > self.threshold
|
|
252
275
|
if logger.isEnabledFor(logging.DEBUG):
|
|
253
|
-
for i in
|
|
254
|
-
logger.debug("ML decision: %s with prediction: %s for value: %s",
|
|
276
|
+
for i, decision in enumerate(is_cred):
|
|
277
|
+
logger.debug("ML decision: %s with prediction: %s for value: %s", decision, probability[i],
|
|
255
278
|
group_list[i][0])
|
|
256
279
|
# apply cast to float to avoid json export issue
|
|
257
280
|
return is_cred, probability.astype(float)
|