credsweeper 1.12.2__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of credsweeper might be problematic. Click here for more details.
- credsweeper/__init__.py +1 -1
- credsweeper/__main__.py +15 -8
- credsweeper/app.py +7 -2
- credsweeper/common/keyword_pattern.py +6 -3
- credsweeper/common/morpheme_checklist.txt +11 -1
- credsweeper/config/config.py +1 -0
- credsweeper/credentials/line_data.py +16 -0
- credsweeper/deep_scanner/deep_scanner.py +12 -6
- credsweeper/deep_scanner/rtf_scanner.py +41 -0
- credsweeper/deep_scanner/strings_scanner.py +52 -0
- credsweeper/file_handler/byte_content_provider.py +10 -1
- credsweeper/file_handler/file_path_extractor.py +2 -0
- credsweeper/file_handler/text_content_provider.py +7 -1
- credsweeper/filters/__init__.py +1 -1
- credsweeper/filters/group/token_pattern.py +2 -2
- credsweeper/filters/group/weird_base36_token.py +2 -2
- credsweeper/filters/group/weird_base64_token.py +2 -2
- credsweeper/filters/value_file_path_check.py +5 -3
- credsweeper/filters/value_github_check.py +3 -2
- credsweeper/filters/value_morphemes_check.py +43 -0
- credsweeper/filters/value_string_type_check.py +1 -0
- credsweeper/ml_model/features/feature.py +1 -18
- credsweeper/ml_model/features/file_extension.py +1 -1
- credsweeper/ml_model/features/has_html_tag.py +10 -8
- credsweeper/ml_model/features/is_secret_numeric.py +4 -3
- credsweeper/ml_model/features/rule_name.py +1 -1
- credsweeper/ml_model/features/word_in.py +9 -32
- credsweeper/ml_model/features/word_in_path.py +2 -3
- credsweeper/ml_model/features/word_in_postamble.py +1 -4
- credsweeper/ml_model/features/word_in_preamble.py +1 -4
- credsweeper/ml_model/features/word_in_transition.py +1 -4
- credsweeper/ml_model/features/word_in_value.py +2 -3
- credsweeper/ml_model/features/word_in_variable.py +2 -3
- credsweeper/ml_model/ml_config.json +15 -8
- credsweeper/ml_model/ml_model.onnx +0 -0
- credsweeper/ml_model/ml_validator.py +1 -1
- credsweeper/rules/config.yaml +129 -128
- credsweeper/scanner/scanner.py +12 -7
- credsweeper/secret/config.json +18 -5
- credsweeper/utils/util.py +19 -16
- {credsweeper-1.12.2.dist-info → credsweeper-1.13.0.dist-info}/METADATA +7 -7
- {credsweeper-1.12.2.dist-info → credsweeper-1.13.0.dist-info}/RECORD +45 -43
- credsweeper/filters/value_couple_keyword_check.py +0 -28
- {credsweeper-1.12.2.dist-info → credsweeper-1.13.0.dist-info}/WHEEL +0 -0
- {credsweeper-1.12.2.dist-info → credsweeper-1.13.0.dist-info}/entry_points.txt +0 -0
- {credsweeper-1.12.2.dist-info → credsweeper-1.13.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -10,7 +10,7 @@ class Feature(ABC):
|
|
|
10
10
|
"""Base class for features."""
|
|
11
11
|
|
|
12
12
|
def __init__(self):
|
|
13
|
-
|
|
13
|
+
pass
|
|
14
14
|
|
|
15
15
|
def __call__(self, candidates: List[Candidate]) -> np.ndarray:
|
|
16
16
|
"""Call base class for features.
|
|
@@ -25,20 +25,3 @@ class Feature(ABC):
|
|
|
25
25
|
def extract(self, candidate: Candidate) -> Any:
|
|
26
26
|
"""Abstract method of base class"""
|
|
27
27
|
raise NotImplementedError
|
|
28
|
-
|
|
29
|
-
@property
|
|
30
|
-
def words(self) -> List[str]:
|
|
31
|
-
"""getter"""
|
|
32
|
-
return self.__words
|
|
33
|
-
|
|
34
|
-
@words.setter
|
|
35
|
-
def words(self, words: List[str]) -> None:
|
|
36
|
-
"""setter"""
|
|
37
|
-
self.__words = words
|
|
38
|
-
|
|
39
|
-
def any_word_in_(self, a_string: str) -> bool:
|
|
40
|
-
"""Returns true if any words in a string"""
|
|
41
|
-
for i in self.words:
|
|
42
|
-
if i in a_string:
|
|
43
|
-
return True
|
|
44
|
-
return False
|
|
@@ -19,7 +19,7 @@ class FileExtension(WordIn):
|
|
|
19
19
|
|
|
20
20
|
def __call__(self, candidates: List[Candidate]) -> np.ndarray:
|
|
21
21
|
extension_set = set(candidate.line_data_list[0].file_type.lower() for candidate in candidates)
|
|
22
|
-
return self.
|
|
22
|
+
return self.word_in_(extension_set)
|
|
23
23
|
|
|
24
24
|
def extract(self, candidate: Candidate) -> Any:
|
|
25
25
|
raise NotImplementedError
|
|
@@ -1,17 +1,18 @@
|
|
|
1
1
|
from credsweeper.common.constants import CHUNK_SIZE
|
|
2
2
|
from credsweeper.credentials.candidate import Candidate
|
|
3
|
-
from credsweeper.ml_model.features.feature import Feature
|
|
3
|
+
from credsweeper.ml_model.features.word_in import WordIn
|
|
4
4
|
from credsweeper.utils.util import Util
|
|
5
5
|
|
|
6
6
|
|
|
7
|
-
class HasHtmlTag(Feature):
|
|
7
|
+
class HasHtmlTag(WordIn):
|
|
8
8
|
"""Feature is true if line has HTML tags (HTML file)."""
|
|
9
9
|
|
|
10
|
+
HTML_WORDS = [
|
|
11
|
+
'< img', '<img', '< script', '<script', '< p', '<p', '< link', '<link', '< meta', '<meta', '< a', '<a'
|
|
12
|
+
]
|
|
13
|
+
|
|
10
14
|
def __init__(self) -> None:
|
|
11
|
-
super().__init__()
|
|
12
|
-
self.words = [
|
|
13
|
-
'< img', '<img', '< script', '<script', '< p', '<p', '< link', '<link', '< meta', '<meta', '< a', '<a'
|
|
14
|
-
]
|
|
15
|
+
super().__init__(HasHtmlTag.HTML_WORDS)
|
|
15
16
|
|
|
16
17
|
def extract(self, candidate: Candidate) -> bool:
|
|
17
18
|
subtext = Util.subtext(candidate.line_data_list[0].line, candidate.line_data_list[0].value_start, CHUNK_SIZE)
|
|
@@ -19,8 +20,9 @@ class HasHtmlTag(Feature):
|
|
|
19
20
|
if '<' not in candidate_line_data_list_0_line_lower:
|
|
20
21
|
# early check
|
|
21
22
|
return False
|
|
22
|
-
|
|
23
|
-
|
|
23
|
+
for i in self.words:
|
|
24
|
+
if i in candidate_line_data_list_0_line_lower:
|
|
25
|
+
return True
|
|
24
26
|
if "/>" in candidate_line_data_list_0_line_lower or "</" in candidate_line_data_list_0_line_lower:
|
|
25
27
|
# possible closed tag
|
|
26
28
|
return True
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import contextlib
|
|
2
|
+
|
|
1
3
|
from credsweeper.credentials.candidate import Candidate
|
|
2
4
|
from credsweeper.ml_model.features.feature import Feature
|
|
3
5
|
|
|
@@ -6,8 +8,7 @@ class IsSecretNumeric(Feature):
|
|
|
6
8
|
"""Feature is true if candidate value is a numerical value."""
|
|
7
9
|
|
|
8
10
|
def extract(self, candidate: Candidate) -> bool:
|
|
9
|
-
|
|
11
|
+
with contextlib.suppress(ValueError):
|
|
10
12
|
float(candidate.line_data_list[0].value)
|
|
11
13
|
return True
|
|
12
|
-
|
|
13
|
-
return False
|
|
14
|
+
return False
|
|
@@ -19,7 +19,7 @@ class RuleName(WordIn):
|
|
|
19
19
|
|
|
20
20
|
def __call__(self, candidates: List[Candidate]) -> np.ndarray:
|
|
21
21
|
candidate_rule_set = set(x.rule_name for x in candidates)
|
|
22
|
-
return self.
|
|
22
|
+
return self.word_in_(candidate_rule_set)
|
|
23
23
|
|
|
24
24
|
def extract(self, candidate: Candidate) -> Any:
|
|
25
25
|
raise NotImplementedError
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from abc import abstractmethod
|
|
2
|
-
from typing import List, Any,
|
|
2
|
+
from typing import List, Any, Set, Union
|
|
3
3
|
|
|
4
4
|
import numpy as np
|
|
5
5
|
|
|
@@ -18,42 +18,19 @@ class WordIn(Feature):
|
|
|
18
18
|
if len(self.enumerated_words) != self.dimension:
|
|
19
19
|
raise RuntimeError(f"Check duplicates:{words}")
|
|
20
20
|
|
|
21
|
-
@property
|
|
22
|
-
def enumerated_words(self) -> List[Tuple[int, str]]:
|
|
23
|
-
"""getter for speedup"""
|
|
24
|
-
return self.__enumerated_words
|
|
25
|
-
|
|
26
|
-
@enumerated_words.setter
|
|
27
|
-
def enumerated_words(self, enumerated_words: List[Tuple[int, str]]) -> None:
|
|
28
|
-
"""setter for speedup"""
|
|
29
|
-
self.__enumerated_words = enumerated_words
|
|
30
|
-
|
|
31
|
-
@property
|
|
32
|
-
def dimension(self) -> int:
|
|
33
|
-
"""getter"""
|
|
34
|
-
return self.__dimension
|
|
35
|
-
|
|
36
|
-
@dimension.setter
|
|
37
|
-
def dimension(self, dimension: int) -> None:
|
|
38
|
-
"""setter"""
|
|
39
|
-
self.__dimension = dimension
|
|
40
|
-
|
|
41
21
|
@abstractmethod
|
|
42
22
|
def extract(self, candidate: Candidate) -> Any:
|
|
43
23
|
raise NotImplementedError
|
|
44
24
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
if word in a_string:
|
|
50
|
-
result[i] = 1
|
|
51
|
-
return np.array([result])
|
|
25
|
+
@property
|
|
26
|
+
def zero(self) -> np.ndarray:
|
|
27
|
+
"""Returns zero filled array for case of empty input"""
|
|
28
|
+
return np.zeros(shape=[self.dimension], dtype=np.int8)
|
|
52
29
|
|
|
53
|
-
def
|
|
54
|
-
"""Returns array with words
|
|
55
|
-
result: np.ndarray =
|
|
30
|
+
def word_in_(self, iterable_data: Union[str, List[str], Set[str]]) -> np.ndarray:
|
|
31
|
+
"""Returns array with words included in a string"""
|
|
32
|
+
result: np.ndarray = self.zero
|
|
56
33
|
for i, word in self.enumerated_words:
|
|
57
|
-
if word in
|
|
34
|
+
if word in iterable_data:
|
|
58
35
|
result[i] = 1
|
|
59
36
|
return np.array([result])
|
|
@@ -19,9 +19,8 @@ class WordInPath(WordIn):
|
|
|
19
19
|
posix_lower_path = path.as_posix().lower() if path.is_absolute() else f"./{path.as_posix().lower()}"
|
|
20
20
|
# prevent extra confusion from the same word in extension
|
|
21
21
|
path_without_extension, _ = os.path.splitext(posix_lower_path)
|
|
22
|
-
return self.
|
|
23
|
-
|
|
24
|
-
return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
|
|
22
|
+
return self.word_in_(path_without_extension)
|
|
23
|
+
return np.array([self.zero])
|
|
25
24
|
|
|
26
25
|
def extract(self, candidate: Candidate) -> Any:
|
|
27
26
|
raise NotImplementedError
|
|
@@ -15,7 +15,4 @@ class WordInPostamble(WordIn):
|
|
|
15
15
|
else candidate.line_data_list[0].value_end + ML_HUNK
|
|
16
16
|
postamble = candidate.line_data_list[0].line[candidate.line_data_list[0].value_end:postamble_end].strip()
|
|
17
17
|
|
|
18
|
-
if postamble:
|
|
19
|
-
return self.word_in_str(postamble.lower())
|
|
20
|
-
else:
|
|
21
|
-
return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
|
|
18
|
+
return self.word_in_(postamble.lower()) if postamble else np.array([self.zero])
|
|
@@ -20,7 +20,4 @@ class WordInPreamble(WordIn):
|
|
|
20
20
|
else candidate.line_data_list[0].value_start - ML_HUNK
|
|
21
21
|
preamble = candidate.line_data_list[0].line[preamble_start:candidate.line_data_list[0].value_start].strip()
|
|
22
22
|
|
|
23
|
-
if preamble:
|
|
24
|
-
return self.word_in_str(preamble.lower())
|
|
25
|
-
else:
|
|
26
|
-
return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
|
|
23
|
+
return self.word_in_(preamble.lower()) if preamble else np.array([self.zero])
|
|
@@ -15,7 +15,4 @@ class WordInTransition(WordIn):
|
|
|
15
15
|
else:
|
|
16
16
|
transition = ''
|
|
17
17
|
|
|
18
|
-
if transition:
|
|
19
|
-
return self.word_in_str(transition.lower())
|
|
20
|
-
else:
|
|
21
|
-
return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
|
|
18
|
+
return self.word_in_(transition.lower()) if transition else np.array([self.zero])
|
|
@@ -10,6 +10,5 @@ class WordInValue(WordIn):
|
|
|
10
10
|
def extract(self, candidate: Candidate) -> np.ndarray:
|
|
11
11
|
"""Returns array of matching words for first line"""
|
|
12
12
|
if value := candidate.line_data_list[0].value:
|
|
13
|
-
return self.
|
|
14
|
-
|
|
15
|
-
return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
|
|
13
|
+
return self.word_in_(value.lower())
|
|
14
|
+
return np.array([self.zero])
|
|
@@ -10,6 +10,5 @@ class WordInVariable(WordIn):
|
|
|
10
10
|
def extract(self, candidate: Candidate) -> np.ndarray:
|
|
11
11
|
"""Returns array of matching words for first line"""
|
|
12
12
|
if variable := candidate.line_data_list[0].variable:
|
|
13
|
-
return self.
|
|
14
|
-
|
|
15
|
-
return np.zeros(shape=[self.dimension], dtype=np.int8)
|
|
13
|
+
return self.word_in_(variable.lower())
|
|
14
|
+
return np.array([self.zero])
|
|
@@ -10,6 +10,7 @@
|
|
|
10
10
|
"features": [
|
|
11
11
|
{
|
|
12
12
|
"type": "RuleSeverity",
|
|
13
|
+
"comment": "INFO=0.0, LOW=0.25, MEDIUM=0.5, HIGH=0.75, CRITICAL=1.0",
|
|
13
14
|
"kwargs": {}
|
|
14
15
|
},
|
|
15
16
|
{
|
|
@@ -62,7 +63,7 @@
|
|
|
62
63
|
"type": "SearchInAttribute",
|
|
63
64
|
"comment": "Repeated symbol",
|
|
64
65
|
"kwargs": {
|
|
65
|
-
"pattern": "
|
|
66
|
+
"pattern": "(?:(\\S)(\\S))((\\1.)|(.\\2)){7,}",
|
|
66
67
|
"attribute": "value"
|
|
67
68
|
}
|
|
68
69
|
},
|
|
@@ -70,7 +71,7 @@
|
|
|
70
71
|
"type": "SearchInAttribute",
|
|
71
72
|
"comment": "SHA marker",
|
|
72
73
|
"kwargs": {
|
|
73
|
-
"pattern": "
|
|
74
|
+
"pattern": "(?i:sha)[_-]?(224|256|384|512)",
|
|
74
75
|
"attribute": "value"
|
|
75
76
|
}
|
|
76
77
|
},
|
|
@@ -126,7 +127,7 @@
|
|
|
126
127
|
"type": "SearchInAttribute",
|
|
127
128
|
"comment": "VariableNotAllowedNameCheck",
|
|
128
129
|
"kwargs": {
|
|
129
|
-
"pattern": "(?i:(filters?|pub(lic)?)_?key)",
|
|
130
|
+
"pattern": "(?i:(sha[_-]?(224|256|384|512)|projects?|filters?|pub(lic)?)_?key)",
|
|
130
131
|
"attribute": "variable"
|
|
131
132
|
}
|
|
132
133
|
},
|
|
@@ -134,7 +135,7 @@
|
|
|
134
135
|
"type": "SearchInAttribute",
|
|
135
136
|
"comment": "VariableNotAllowedNameCheck",
|
|
136
137
|
"kwargs": {
|
|
137
|
-
"pattern": "(?i:(id|size|name|type|manager|algorithm|view|error)$)",
|
|
138
|
+
"pattern": "(?i:(id|sum|size|name|type|manager|algorithm|pattern|view|error|date(time)?|time(stamp)?|tag|version|hash|rate)$)",
|
|
138
139
|
"attribute": "variable"
|
|
139
140
|
}
|
|
140
141
|
},
|
|
@@ -245,8 +246,10 @@
|
|
|
245
246
|
"crypt",
|
|
246
247
|
"crypted",
|
|
247
248
|
"decrypt",
|
|
249
|
+
"edited",
|
|
248
250
|
"encrypt",
|
|
249
251
|
"example",
|
|
252
|
+
"expire",
|
|
250
253
|
"fake",
|
|
251
254
|
"file",
|
|
252
255
|
"foo",
|
|
@@ -260,7 +263,8 @@
|
|
|
260
263
|
"pass",
|
|
261
264
|
"public",
|
|
262
265
|
"pwd",
|
|
263
|
-
"
|
|
266
|
+
"redacted",
|
|
267
|
+
"rsa",
|
|
264
268
|
"salt",
|
|
265
269
|
"secret",
|
|
266
270
|
"sha",
|
|
@@ -339,6 +343,7 @@
|
|
|
339
343
|
"get",
|
|
340
344
|
"e.g.",
|
|
341
345
|
"equal",
|
|
346
|
+
"env",
|
|
342
347
|
"example",
|
|
343
348
|
"expect",
|
|
344
349
|
"line",
|
|
@@ -484,6 +489,7 @@
|
|
|
484
489
|
".bat",
|
|
485
490
|
".bats",
|
|
486
491
|
".bazel",
|
|
492
|
+
".bin",
|
|
487
493
|
".build",
|
|
488
494
|
".bundle",
|
|
489
495
|
".bzl",
|
|
@@ -504,7 +510,6 @@
|
|
|
504
510
|
".csp",
|
|
505
511
|
".csv",
|
|
506
512
|
".dist",
|
|
507
|
-
".doc",
|
|
508
513
|
".dockerfile",
|
|
509
514
|
".edited",
|
|
510
515
|
".eex",
|
|
@@ -527,6 +532,8 @@
|
|
|
527
532
|
".gtpl",
|
|
528
533
|
".h",
|
|
529
534
|
".haml",
|
|
535
|
+
".har",
|
|
536
|
+
".hpp",
|
|
530
537
|
".hs",
|
|
531
538
|
".html",
|
|
532
539
|
".idl",
|
|
@@ -657,8 +664,8 @@
|
|
|
657
664
|
"CMD Password",
|
|
658
665
|
"CMD Secret",
|
|
659
666
|
"CMD Token",
|
|
667
|
+
"CURL User Password",
|
|
660
668
|
"Credential",
|
|
661
|
-
"Github Old Token",
|
|
662
669
|
"Key",
|
|
663
670
|
"Nonce",
|
|
664
671
|
"Password",
|
|
@@ -671,4 +678,4 @@
|
|
|
671
678
|
}
|
|
672
679
|
}
|
|
673
680
|
]
|
|
674
|
-
}
|
|
681
|
+
}
|
|
Binary file
|
|
@@ -272,7 +272,7 @@ class MlValidator:
|
|
|
272
272
|
if head != tail:
|
|
273
273
|
probability[head:tail] = self._batch_call_model(line_input_list, variable_input_list, value_input_list,
|
|
274
274
|
features_list)
|
|
275
|
-
is_cred =
|
|
275
|
+
is_cred = self.threshold <= probability
|
|
276
276
|
if logger.isEnabledFor(logging.DEBUG):
|
|
277
277
|
for i, decision in enumerate(is_cred):
|
|
278
278
|
logger.debug("ML decision: %s with prediction: %s for value: %s", decision, probability[i],
|