credsweeper 1.12.1__py3-none-any.whl → 1.13.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of credsweeper might be problematic. Click here for more details.
- credsweeper/__init__.py +1 -1
- credsweeper/__main__.py +23 -13
- credsweeper/app.py +7 -2
- credsweeper/common/keyword_pattern.py +6 -3
- credsweeper/common/morpheme_checklist.txt +26 -6
- credsweeper/config/config.py +1 -0
- credsweeper/credentials/line_data.py +21 -6
- credsweeper/deep_scanner/abstract_scanner.py +1 -0
- credsweeper/deep_scanner/csv_scanner.py +71 -0
- credsweeper/deep_scanner/deep_scanner.py +19 -9
- credsweeper/deep_scanner/jks_scanner.py +11 -2
- credsweeper/deep_scanner/pkcs_scanner.py +4 -0
- credsweeper/deep_scanner/rtf_scanner.py +41 -0
- credsweeper/deep_scanner/strings_scanner.py +52 -0
- credsweeper/file_handler/byte_content_provider.py +10 -1
- credsweeper/file_handler/file_path_extractor.py +2 -0
- credsweeper/file_handler/text_content_provider.py +7 -1
- credsweeper/filters/__init__.py +1 -1
- credsweeper/filters/group/token_pattern.py +2 -2
- credsweeper/filters/group/weird_base36_token.py +3 -3
- credsweeper/filters/group/weird_base64_token.py +2 -2
- credsweeper/filters/value_camel_case_check.py +2 -2
- credsweeper/filters/value_file_path_check.py +5 -3
- credsweeper/filters/value_github_check.py +3 -2
- credsweeper/filters/value_morphemes_check.py +43 -0
- credsweeper/filters/value_string_type_check.py +1 -0
- credsweeper/ml_model/features/feature.py +1 -18
- credsweeper/ml_model/features/file_extension.py +1 -1
- credsweeper/ml_model/features/has_html_tag.py +10 -8
- credsweeper/ml_model/features/is_secret_numeric.py +4 -3
- credsweeper/ml_model/features/rule_name.py +1 -1
- credsweeper/ml_model/features/word_in.py +9 -32
- credsweeper/ml_model/features/word_in_path.py +2 -3
- credsweeper/ml_model/features/word_in_postamble.py +1 -4
- credsweeper/ml_model/features/word_in_preamble.py +1 -4
- credsweeper/ml_model/features/word_in_transition.py +1 -4
- credsweeper/ml_model/features/word_in_value.py +2 -3
- credsweeper/ml_model/features/word_in_variable.py +2 -3
- credsweeper/ml_model/ml_config.json +15 -8
- credsweeper/ml_model/ml_model.onnx +0 -0
- credsweeper/ml_model/ml_validator.py +1 -1
- credsweeper/rules/config.yaml +174 -207
- credsweeper/scanner/scanner.py +12 -7
- credsweeper/secret/config.json +18 -5
- credsweeper/utils/util.py +21 -18
- {credsweeper-1.12.1.dist-info → credsweeper-1.13.3.dist-info}/METADATA +7 -7
- {credsweeper-1.12.1.dist-info → credsweeper-1.13.3.dist-info}/RECORD +50 -47
- credsweeper/filters/value_couple_keyword_check.py +0 -28
- {credsweeper-1.12.1.dist-info → credsweeper-1.13.3.dist-info}/WHEEL +0 -0
- {credsweeper-1.12.1.dist-info → credsweeper-1.13.3.dist-info}/entry_points.txt +0 -0
- {credsweeper-1.12.1.dist-info → credsweeper-1.13.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from abc import ABC
|
|
3
|
+
from typing import List, Optional
|
|
4
|
+
|
|
5
|
+
from striprtf import striprtf
|
|
6
|
+
|
|
7
|
+
from credsweeper.credentials.candidate import Candidate
|
|
8
|
+
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
|
|
9
|
+
from credsweeper.file_handler.data_content_provider import DataContentProvider
|
|
10
|
+
from credsweeper.file_handler.string_content_provider import StringContentProvider
|
|
11
|
+
from credsweeper.utils.util import Util
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class RtfScanner(AbstractScanner, ABC):
    """Implements RTF (Rich Text Format) document scanning"""

    @staticmethod
    def get_lines(text: str) -> List[str]:
        """Extracts text lines from RTF format

        Args:
            text: RTF markup as a string

        Return:
            the result of Util.split_text applied to the plain text produced by striprtf

        """
        rtf_text = striprtf.rtf_to_text(text)
        return Util.split_text(rtf_text)

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Scans data as RTF

        Args:
            data_provider: provider whose text is treated as RTF markup
            depth: not used by this scanner; kept for the common data_scan signature
            recursive_limit_size: not used by this scanner; kept for the common data_scan signature

        Return:
            candidates returned by the scanner for the extracted lines or None if any error occurred

        """
        try:
            string_data_provider = StringContentProvider(lines=RtfScanner.get_lines(data_provider.text),
                                                         file_path=data_provider.file_path,
                                                         file_type=data_provider.file_type,
                                                         info=f"{data_provider.info}|RTF")
            # NOTE(review): self.scanner is not defined in this class - presumably supplied by AbstractScanner
            return self.scanner.scan(string_data_provider)
        except Exception as rtf_exc:
            # best effort: a broken RTF must not interrupt the scan, so the error is only reported
            logger.error("%s:%s", data_provider.file_path, rtf_exc)
        return None
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from abc import ABC
|
|
3
|
+
from typing import List, Optional, Tuple
|
|
4
|
+
|
|
5
|
+
from credsweeper.common.constants import MIN_DATA_LEN
|
|
6
|
+
from credsweeper.credentials.candidate import Candidate
|
|
7
|
+
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
|
|
8
|
+
from credsweeper.file_handler.data_content_provider import DataContentProvider
|
|
9
|
+
from credsweeper.file_handler.string_content_provider import StringContentProvider
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class StringsScanner(AbstractScanner, ABC):
    """Implements known binary file scanning with ASCII strings representations"""

    @staticmethod
    def get_strings(data: bytes) -> List[Tuple[str, int]]:
        """Processes binary to find ASCII strings. Uses offset instead of line number.

        A string is a maximal run of TAB, SPACE and visible ASCII symbols (0x20..0x7E).
        Runs shorter than MIN_DATA_LEN are skipped.

        Args:
            data: raw bytes to be processed

        Return:
            list of (string, offset) pairs, where offset is the index in data of the first byte of the string

        """
        strings: List[Tuple[str, int]] = []
        # None means "not inside a run" - 0 cannot be the marker because it is a valid offset
        start: Optional[int] = None
        for n, x in enumerate(data):
            if 0x09 == x or 0x20 <= x <= 0x7E:
                # TAB, SPACE and visible ASCII symbols
                if start is None:
                    start = n
                continue
            if start is not None:
                if MIN_DATA_LEN <= n - start:
                    strings.append((bytes(data[start:n]).decode("ascii"), start))
                # any other byte ends the run, so a short run must be dropped and not glued to the next one
                start = None
        if start is not None and MIN_DATA_LEN <= len(data) - start:
            # the data ends inside a run
            strings.append((bytes(data[start:]).decode("ascii"), start))
        return strings

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Scans ASCII strings found in binary data; offsets of the strings are passed as line numbers

        Args:
            data_provider: provider of the binary data
            depth: not used by this scanner; kept for the common data_scan signature
            recursive_limit_size: not used by this scanner; kept for the common data_scan signature

        Return:
            candidates returned by the scanner for the found strings or empty list when no strings were found

        """
        strings = StringsScanner.get_strings(data_provider.data)
        if not strings:
            return []
        string_data_provider = StringContentProvider(lines=[text for text, _ in strings],
                                                     line_numbers=[offset for _, offset in strings],
                                                     file_path=data_provider.file_path,
                                                     file_type=data_provider.file_type,
                                                     info=f"{data_provider.info}|STRINGS")
        # NOTE(review): self.scanner is not defined in this class - presumably supplied by AbstractScanner
        return self.scanner.scan(string_data_provider)
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
from functools import cached_property
|
|
2
3
|
from typing import List, Optional, Generator
|
|
3
4
|
|
|
@@ -5,6 +6,8 @@ from credsweeper.file_handler.analysis_target import AnalysisTarget
|
|
|
5
6
|
from credsweeper.file_handler.content_provider import ContentProvider
|
|
6
7
|
from credsweeper.utils.util import Util
|
|
7
8
|
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
8
11
|
|
|
9
12
|
class ByteContentProvider(ContentProvider):
|
|
10
13
|
"""Allow to scan byte sequence instead of extra reading a file"""
|
|
@@ -42,7 +45,13 @@ class ByteContentProvider(ContentProvider):
|
|
|
42
45
|
def lines(self) -> List[str]:
|
|
43
46
|
"""lines RO getter for ByteContentProvider"""
|
|
44
47
|
if self.__lines is None:
|
|
45
|
-
|
|
48
|
+
text = Util.decode_text(self.__data)
|
|
49
|
+
if text is None:
|
|
50
|
+
logger.warning("Binary data detected %s %s %s", self.file_path, self.info,
|
|
51
|
+
repr(self.__data[:32]) if isinstance(self.__data, bytes) else "NONE")
|
|
52
|
+
self.__lines = []
|
|
53
|
+
else:
|
|
54
|
+
self.__lines = Util.split_text(text)
|
|
46
55
|
return self.__lines if self.__lines is not None else []
|
|
47
56
|
|
|
48
57
|
def yield_analysis_target(self, min_len: int) -> Generator[AnalysisTarget, None, None]:
|
|
@@ -127,6 +127,8 @@ class FilePathExtractor:
|
|
|
127
127
|
Return:
|
|
128
128
|
True when the file full path should be excluded according config
|
|
129
129
|
"""
|
|
130
|
+
if config.pedantic:
|
|
131
|
+
return False
|
|
130
132
|
path = path.replace('\\', '/')
|
|
131
133
|
lower_path = path.lower()
|
|
132
134
|
if config.not_allowed_path_pattern.match(lower_path):
|
|
@@ -54,7 +54,13 @@ class TextContentProvider(ContentProvider):
|
|
|
54
54
|
def lines(self) -> Optional[List[str]]:
|
|
55
55
|
"""lines getter for TextContentProvider"""
|
|
56
56
|
if self.__lines is None:
|
|
57
|
-
|
|
57
|
+
text = Util.decode_text(self.data)
|
|
58
|
+
if text is None:
|
|
59
|
+
logger.warning("Binary file detected %s %s %s", self.file_path, self.info,
|
|
60
|
+
repr(self.__data[:32]) if isinstance(self.__data, bytes) else "NONE")
|
|
61
|
+
self.__lines = []
|
|
62
|
+
else:
|
|
63
|
+
self.__lines = Util.split_text(text)
|
|
58
64
|
return self.__lines if self.__lines is not None else []
|
|
59
65
|
|
|
60
66
|
def yield_analysis_target(self, min_len: int) -> Generator[AnalysisTarget, None, None]:
|
credsweeper/filters/__init__.py
CHANGED
|
@@ -13,7 +13,6 @@ from credsweeper.filters.value_base64_part_check import ValueBase64PartCheck
|
|
|
13
13
|
from credsweeper.filters.value_basic_auth_check import ValueBasicAuthCheck
|
|
14
14
|
from credsweeper.filters.value_blocklist_check import ValueBlocklistCheck
|
|
15
15
|
from credsweeper.filters.value_camel_case_check import ValueCamelCaseCheck
|
|
16
|
-
from credsweeper.filters.value_couple_keyword_check import ValueCoupleKeywordCheck
|
|
17
16
|
from credsweeper.filters.value_dictionary_keyword_check import ValueDictionaryKeywordCheck
|
|
18
17
|
from credsweeper.filters.value_discord_bot_check import ValueDiscordBotCheck
|
|
19
18
|
from credsweeper.filters.value_entropy_base32_check import ValueEntropyBase32Check
|
|
@@ -30,6 +29,7 @@ from credsweeper.filters.value_json_web_token_check import ValueJsonWebTokenChec
|
|
|
30
29
|
from credsweeper.filters.value_last_word_check import ValueLastWordCheck
|
|
31
30
|
from credsweeper.filters.value_length_check import ValueLengthCheck
|
|
32
31
|
from credsweeper.filters.value_method_check import ValueMethodCheck
|
|
32
|
+
from credsweeper.filters.value_morphemes_check import ValueMorphemesCheck
|
|
33
33
|
from credsweeper.filters.value_not_allowed_pattern_check import ValueNotAllowedPatternCheck
|
|
34
34
|
from credsweeper.filters.value_not_part_encoded_check import ValueNotPartEncodedCheck
|
|
35
35
|
from credsweeper.filters.value_number_check import ValueNumberCheck
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from credsweeper.common.constants import GroupType
|
|
2
2
|
from credsweeper.config.config import Config
|
|
3
|
-
from credsweeper.filters import
|
|
3
|
+
from credsweeper.filters import ValueMorphemesCheck, ValueCamelCaseCheck, ValueNumberCheck, ValuePatternCheck
|
|
4
4
|
from credsweeper.filters.group.group import Group
|
|
5
5
|
|
|
6
6
|
|
|
@@ -10,7 +10,7 @@ class TokenPattern(Group):
|
|
|
10
10
|
def __init__(self, config: Config) -> None:
|
|
11
11
|
super().__init__(config, GroupType.DEFAULT)
|
|
12
12
|
self.filters = [
|
|
13
|
-
|
|
13
|
+
ValueMorphemesCheck(),
|
|
14
14
|
ValueNumberCheck(),
|
|
15
15
|
ValueCamelCaseCheck(),
|
|
16
16
|
ValuePatternCheck(),
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from credsweeper.common.constants import GroupType
|
|
2
2
|
from credsweeper.config.config import Config
|
|
3
|
-
from credsweeper.filters import
|
|
3
|
+
from credsweeper.filters import ValueMorphemesCheck, ValuePatternCheck, ValueNumberCheck, ValueEntropyBase36Check, \
|
|
4
4
|
ValueTokenBase36Check
|
|
5
5
|
from credsweeper.filters.group.group import Group
|
|
6
6
|
|
|
@@ -11,9 +11,9 @@ class WeirdBase36Token(Group):
|
|
|
11
11
|
def __init__(self, config: Config) -> None:
|
|
12
12
|
super().__init__(config, GroupType.DEFAULT)
|
|
13
13
|
self.filters = [
|
|
14
|
-
|
|
14
|
+
ValueMorphemesCheck(threshold=1),
|
|
15
15
|
ValuePatternCheck(),
|
|
16
16
|
ValueNumberCheck(),
|
|
17
17
|
ValueTokenBase36Check(),
|
|
18
|
-
ValueEntropyBase36Check()
|
|
18
|
+
ValueEntropyBase36Check(),
|
|
19
19
|
]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from credsweeper.common.constants import GroupType
|
|
2
2
|
from credsweeper.config.config import Config
|
|
3
|
-
from credsweeper.filters import
|
|
3
|
+
from credsweeper.filters import ValueMorphemesCheck, ValueNotPartEncodedCheck, \
|
|
4
4
|
ValueBase64DataCheck, ValueEntropyBase64Check, ValuePatternCheck, ValueNumberCheck, ValueTokenBase64Check, \
|
|
5
5
|
ValueBase64PartCheck
|
|
6
6
|
from credsweeper.filters.group.group import Group
|
|
@@ -12,7 +12,7 @@ class WeirdBase64Token(Group):
|
|
|
12
12
|
def __init__(self, config: Config) -> None:
|
|
13
13
|
super().__init__(config, GroupType.DEFAULT)
|
|
14
14
|
self.filters = [
|
|
15
|
-
|
|
15
|
+
ValueMorphemesCheck(threshold=1),
|
|
16
16
|
ValueNumberCheck(),
|
|
17
17
|
ValueBase64DataCheck(),
|
|
18
18
|
ValueTokenBase64Check(),
|
|
@@ -12,7 +12,7 @@ from credsweeper.utils.util import Util
|
|
|
12
12
|
class ValueCamelCaseCheck(Filter):
|
|
13
13
|
"""Check that candidate is not written in camel case."""
|
|
14
14
|
|
|
15
|
-
CAMEL_CASE = ["
|
|
15
|
+
CAMEL_CASE = ["[a-z]+([A-Z][a-z]+)+", "[A-Z][a-z]+([A-Z][a-z]+)+"]
|
|
16
16
|
CAMEL_CASE_PATTERN = re.compile(Util.get_regex_combine_or(CAMEL_CASE))
|
|
17
17
|
|
|
18
18
|
def __init__(self, config: Optional[Config] = None) -> None:
|
|
@@ -31,7 +31,7 @@ class ValueCamelCaseCheck(Filter):
|
|
|
31
31
|
"""
|
|
32
32
|
if line_data.is_well_quoted_value:
|
|
33
33
|
return False
|
|
34
|
-
if self.CAMEL_CASE_PATTERN.
|
|
34
|
+
if self.CAMEL_CASE_PATTERN.fullmatch(line_data.value):
|
|
35
35
|
return static_keyword_checklist.check_morphemes(line_data.value.lower(), 1)
|
|
36
36
|
|
|
37
37
|
return False
|
|
@@ -35,6 +35,8 @@ class ValueFilePathCheck(Filter):
|
|
|
35
35
|
|
|
36
36
|
"""
|
|
37
37
|
value = line_data.value
|
|
38
|
+
bit_length = len(value).bit_length()
|
|
39
|
+
morpheme_threshold = 1 if 6 > bit_length else bit_length - 4
|
|
38
40
|
contains_unix_separator = '/' in value
|
|
39
41
|
if contains_unix_separator:
|
|
40
42
|
if ("://" in value #
|
|
@@ -45,14 +47,14 @@ class ValueFilePathCheck(Filter):
|
|
|
45
47
|
or value.startswith("//") and ':' == line_data.separator):
|
|
46
48
|
# common case for url definition or aliases
|
|
47
49
|
# or _keyword_://example.com where : is the separator
|
|
48
|
-
return static_keyword_checklist.check_morphemes(value.lower(),
|
|
50
|
+
return static_keyword_checklist.check_morphemes(value.lower(), morpheme_threshold)
|
|
49
51
|
# base64 encoded data might look like linux path
|
|
50
52
|
min_entropy = ValueEntropyBase64Check.get_min_data_entropy(len(value))
|
|
51
53
|
# get minimal entropy to compare with shannon entropy of found value
|
|
52
54
|
# min_entropy == 0 means that the value cannot be checked with the entropy due high variance
|
|
53
55
|
for i in value:
|
|
54
56
|
if i not in self.base64stdpad_possible_set:
|
|
55
|
-
# value contains wrong BASE64STDPAD_CHARS symbols like -_
|
|
57
|
+
# value contains wrong BASE64STDPAD_CHARS symbols like -_.
|
|
56
58
|
break
|
|
57
59
|
else:
|
|
58
60
|
# all symbols are from base64 alphabet
|
|
@@ -74,5 +76,5 @@ class ValueFilePathCheck(Filter):
|
|
|
74
76
|
break
|
|
75
77
|
else:
|
|
76
78
|
if contains_unix_separator ^ contains_windows_separator:
|
|
77
|
-
return static_keyword_checklist.check_morphemes(value.lower(),
|
|
79
|
+
return static_keyword_checklist.check_morphemes(value.lower(), morpheme_threshold)
|
|
78
80
|
return False
|
|
@@ -12,7 +12,7 @@ from credsweeper.filters.filter import Filter
|
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
class ValueGitHubCheck(Filter):
|
|
15
|
-
"""GitHub Classic Token validation"""
|
|
15
|
+
"""NPM or GitHub Classic Token validation"""
|
|
16
16
|
|
|
17
17
|
def __init__(self, config: Optional[Config] = None) -> None:
|
|
18
18
|
pass
|
|
@@ -29,8 +29,9 @@ class ValueGitHubCheck(Filter):
|
|
|
29
29
|
|
|
30
30
|
"""
|
|
31
31
|
# https://github.blog/2021-04-05-behind-githubs-new-authentication-token-formats/
|
|
32
|
+
# https://github.blog/security/announcing-npms-new-access-token-format/
|
|
32
33
|
with contextlib.suppress(Exception):
|
|
33
|
-
if line_data.value.startswith("gh") and '_' == line_data.value[3]:
|
|
34
|
+
if (line_data.value.startswith("gh") and '_' == line_data.value[3]) or line_data.value.startswith("npm_"):
|
|
34
35
|
token = line_data.value[4:-6]
|
|
35
36
|
data = token.encode(ASCII, errors="strict")
|
|
36
37
|
crc32sum = binascii.crc32(data)
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from credsweeper.common import static_keyword_checklist
|
|
4
|
+
from credsweeper.common.constants import MAX_LINE_LENGTH
|
|
5
|
+
from credsweeper.config.config import Config
|
|
6
|
+
from credsweeper.credentials.line_data import LineData
|
|
7
|
+
from credsweeper.file_handler.analysis_target import AnalysisTarget
|
|
8
|
+
from credsweeper.filters.filter import Filter
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ValueMorphemesCheck(Filter):
    """Check value for a threshold of morphemes count"""

    # size of the thresholds table: one item per possible bit length of a value length
    THRESHOLDS_X3 = int(MAX_LINE_LENGTH).bit_length()
    # one morpheme is very likely to be random generated even for 3 symbols
    MAX_MORPHEMES_LIMIT = max(1, THRESHOLDS_X3 - 4)

    def __init__(self, config: Optional[Config] = None, threshold: Optional[int] = None) -> None:
        """Prepares the table of thresholds indexed by bit length of a value length.

        Args:
            config: not used by this filter
            threshold: minimum morphemes number in a value;
                None selects dynamic thresholds which depend on the value length

        Raises:
            ValueError: threshold is neither None nor a non-negative integer

        """
        table_size = ValueMorphemesCheck.THRESHOLDS_X3
        if threshold is None:
            # dynamic thresholds: never less than 1 and growing with the bit length
            self.thresholds = [max(bits - 4, 1) for bits in range(table_size)]
            return
        if not isinstance(threshold, int) or 0 > threshold:
            raise ValueError(f"Wrong type of pattern length {type(threshold)} = {repr(threshold)}")
        # the same constant threshold for a value of any length
        self.thresholds = [threshold for _ in range(table_size)]

    def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
        """Run filter checks on received credential candidate data 'line_data'.

        Args:
            line_data: credential candidate data
            target: multiline target from which line data was obtained

        Return:
            True, if need to filter candidate and False if left

        """
        value = line_data.value
        table = self.thresholds
        position = len(value).bit_length()
        if position < len(table):
            threshold = table[position]
        else:
            # a very huge value is out of the table - the last (max) threshold is used
            threshold = table[-1]
        return static_keyword_checklist.check_morphemes(value.lower(), threshold)
|
|
@@ -51,6 +51,7 @@ class ValueStringTypeCheck(Filter):
|
|
|
51
51
|
and not line_data.is_comment() \
|
|
52
52
|
and not line_data.is_well_quoted_value \
|
|
53
53
|
and not line_data.is_quoted \
|
|
54
|
+
and not '0' <= line_data.value[0] <= '9' \
|
|
54
55
|
and line_data.separator and '=' in line_data.separator:
|
|
55
56
|
# heterogeneous code e.g. YAML in Python uses colon sign instead equals
|
|
56
57
|
return True
|
|
@@ -10,7 +10,7 @@ class Feature(ABC):
|
|
|
10
10
|
"""Base class for features."""
|
|
11
11
|
|
|
12
12
|
def __init__(self):
|
|
13
|
-
|
|
13
|
+
pass
|
|
14
14
|
|
|
15
15
|
def __call__(self, candidates: List[Candidate]) -> np.ndarray:
|
|
16
16
|
"""Call base class for features.
|
|
@@ -25,20 +25,3 @@ class Feature(ABC):
|
|
|
25
25
|
def extract(self, candidate: Candidate) -> Any:
|
|
26
26
|
"""Abstract method of base class"""
|
|
27
27
|
raise NotImplementedError
|
|
28
|
-
|
|
29
|
-
@property
|
|
30
|
-
def words(self) -> List[str]:
|
|
31
|
-
"""getter"""
|
|
32
|
-
return self.__words
|
|
33
|
-
|
|
34
|
-
@words.setter
|
|
35
|
-
def words(self, words: List[str]) -> None:
|
|
36
|
-
"""setter"""
|
|
37
|
-
self.__words = words
|
|
38
|
-
|
|
39
|
-
def any_word_in_(self, a_string: str) -> bool:
|
|
40
|
-
"""Returns true if any words in a string"""
|
|
41
|
-
for i in self.words:
|
|
42
|
-
if i in a_string:
|
|
43
|
-
return True
|
|
44
|
-
return False
|
|
@@ -19,7 +19,7 @@ class FileExtension(WordIn):
|
|
|
19
19
|
|
|
20
20
|
def __call__(self, candidates: List[Candidate]) -> np.ndarray:
|
|
21
21
|
extension_set = set(candidate.line_data_list[0].file_type.lower() for candidate in candidates)
|
|
22
|
-
return self.
|
|
22
|
+
return self.word_in_(extension_set)
|
|
23
23
|
|
|
24
24
|
def extract(self, candidate: Candidate) -> Any:
|
|
25
25
|
raise NotImplementedError
|
|
@@ -1,17 +1,18 @@
|
|
|
1
1
|
from credsweeper.common.constants import CHUNK_SIZE
|
|
2
2
|
from credsweeper.credentials.candidate import Candidate
|
|
3
|
-
from credsweeper.ml_model.features.
|
|
3
|
+
from credsweeper.ml_model.features.word_in import WordIn
|
|
4
4
|
from credsweeper.utils.util import Util
|
|
5
5
|
|
|
6
6
|
|
|
7
|
-
class HasHtmlTag(
|
|
7
|
+
class HasHtmlTag(WordIn):
|
|
8
8
|
"""Feature is true if line has HTML tags (HTML file)."""
|
|
9
9
|
|
|
10
|
+
HTML_WORDS = [
|
|
11
|
+
'< img', '<img', '< script', '<script', '< p', '<p', '< link', '<link', '< meta', '<meta', '< a', '<a'
|
|
12
|
+
]
|
|
13
|
+
|
|
10
14
|
def __init__(self) -> None:
|
|
11
|
-
super().__init__()
|
|
12
|
-
self.words = [
|
|
13
|
-
'< img', '<img', '< script', '<script', '< p', '<p', '< link', '<link', '< meta', '<meta', '< a', '<a'
|
|
14
|
-
]
|
|
15
|
+
super().__init__(HasHtmlTag.HTML_WORDS)
|
|
15
16
|
|
|
16
17
|
def extract(self, candidate: Candidate) -> bool:
|
|
17
18
|
subtext = Util.subtext(candidate.line_data_list[0].line, candidate.line_data_list[0].value_start, CHUNK_SIZE)
|
|
@@ -19,8 +20,9 @@ class HasHtmlTag(Feature):
|
|
|
19
20
|
if '<' not in candidate_line_data_list_0_line_lower:
|
|
20
21
|
# early check
|
|
21
22
|
return False
|
|
22
|
-
|
|
23
|
-
|
|
23
|
+
for i in self.words:
|
|
24
|
+
if i in candidate_line_data_list_0_line_lower:
|
|
25
|
+
return True
|
|
24
26
|
if "/>" in candidate_line_data_list_0_line_lower or "</" in candidate_line_data_list_0_line_lower:
|
|
25
27
|
# possible closed tag
|
|
26
28
|
return True
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import contextlib
|
|
2
|
+
|
|
1
3
|
from credsweeper.credentials.candidate import Candidate
|
|
2
4
|
from credsweeper.ml_model.features.feature import Feature
|
|
3
5
|
|
|
@@ -6,8 +8,7 @@ class IsSecretNumeric(Feature):
|
|
|
6
8
|
"""Feature is true if candidate value is a numerical value."""
|
|
7
9
|
|
|
8
10
|
def extract(self, candidate: Candidate) -> bool:
|
|
9
|
-
|
|
11
|
+
with contextlib.suppress(ValueError):
|
|
10
12
|
float(candidate.line_data_list[0].value)
|
|
11
13
|
return True
|
|
12
|
-
|
|
13
|
-
return False
|
|
14
|
+
return False
|
|
@@ -19,7 +19,7 @@ class RuleName(WordIn):
|
|
|
19
19
|
|
|
20
20
|
def __call__(self, candidates: List[Candidate]) -> np.ndarray:
|
|
21
21
|
candidate_rule_set = set(x.rule_name for x in candidates)
|
|
22
|
-
return self.
|
|
22
|
+
return self.word_in_(candidate_rule_set)
|
|
23
23
|
|
|
24
24
|
def extract(self, candidate: Candidate) -> Any:
|
|
25
25
|
raise NotImplementedError
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from abc import abstractmethod
|
|
2
|
-
from typing import List, Any,
|
|
2
|
+
from typing import List, Any, Set, Union
|
|
3
3
|
|
|
4
4
|
import numpy as np
|
|
5
5
|
|
|
@@ -18,42 +18,19 @@ class WordIn(Feature):
|
|
|
18
18
|
if len(self.enumerated_words) != self.dimension:
|
|
19
19
|
raise RuntimeError(f"Check duplicates:{words}")
|
|
20
20
|
|
|
21
|
-
@property
|
|
22
|
-
def enumerated_words(self) -> List[Tuple[int, str]]:
|
|
23
|
-
"""getter for speedup"""
|
|
24
|
-
return self.__enumerated_words
|
|
25
|
-
|
|
26
|
-
@enumerated_words.setter
|
|
27
|
-
def enumerated_words(self, enumerated_words: List[Tuple[int, str]]) -> None:
|
|
28
|
-
"""setter for speedup"""
|
|
29
|
-
self.__enumerated_words = enumerated_words
|
|
30
|
-
|
|
31
|
-
@property
|
|
32
|
-
def dimension(self) -> int:
|
|
33
|
-
"""getter"""
|
|
34
|
-
return self.__dimension
|
|
35
|
-
|
|
36
|
-
@dimension.setter
|
|
37
|
-
def dimension(self, dimension: int) -> None:
|
|
38
|
-
"""setter"""
|
|
39
|
-
self.__dimension = dimension
|
|
40
|
-
|
|
41
21
|
@abstractmethod
|
|
42
22
|
def extract(self, candidate: Candidate) -> Any:
|
|
43
23
|
raise NotImplementedError
|
|
44
24
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
if word in a_string:
|
|
50
|
-
result[i] = 1
|
|
51
|
-
return np.array([result])
|
|
25
|
+
@property
|
|
26
|
+
def zero(self) -> np.ndarray:
|
|
27
|
+
"""Returns zero filled array for case of empty input"""
|
|
28
|
+
return np.zeros(shape=[self.dimension], dtype=np.int8)
|
|
52
29
|
|
|
53
|
-
def
|
|
54
|
-
"""Returns array with words
|
|
55
|
-
result: np.ndarray =
|
|
30
|
+
def word_in_(self, iterable_data: Union[str, List[str], Set[str]]) -> np.ndarray:
|
|
31
|
+
"""Returns array with words included in a string"""
|
|
32
|
+
result: np.ndarray = self.zero
|
|
56
33
|
for i, word in self.enumerated_words:
|
|
57
|
-
if word in
|
|
34
|
+
if word in iterable_data:
|
|
58
35
|
result[i] = 1
|
|
59
36
|
return np.array([result])
|
|
@@ -19,9 +19,8 @@ class WordInPath(WordIn):
|
|
|
19
19
|
posix_lower_path = path.as_posix().lower() if path.is_absolute() else f"./{path.as_posix().lower()}"
|
|
20
20
|
# prevent extra confusion from the same word in extension
|
|
21
21
|
path_without_extension, _ = os.path.splitext(posix_lower_path)
|
|
22
|
-
return self.
|
|
23
|
-
|
|
24
|
-
return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
|
|
22
|
+
return self.word_in_(path_without_extension)
|
|
23
|
+
return np.array([self.zero])
|
|
25
24
|
|
|
26
25
|
def extract(self, candidate: Candidate) -> Any:
|
|
27
26
|
raise NotImplementedError
|
|
@@ -15,7 +15,4 @@ class WordInPostamble(WordIn):
|
|
|
15
15
|
else candidate.line_data_list[0].value_end + ML_HUNK
|
|
16
16
|
postamble = candidate.line_data_list[0].line[candidate.line_data_list[0].value_end:postamble_end].strip()
|
|
17
17
|
|
|
18
|
-
if postamble
|
|
19
|
-
return self.word_in_str(postamble.lower())
|
|
20
|
-
else:
|
|
21
|
-
return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
|
|
18
|
+
return self.word_in_(postamble.lower()) if postamble else np.array([self.zero])
|
|
@@ -20,7 +20,4 @@ class WordInPreamble(WordIn):
|
|
|
20
20
|
else candidate.line_data_list[0].value_start - ML_HUNK
|
|
21
21
|
preamble = candidate.line_data_list[0].line[preamble_start:candidate.line_data_list[0].value_start].strip()
|
|
22
22
|
|
|
23
|
-
if preamble
|
|
24
|
-
return self.word_in_str(preamble.lower())
|
|
25
|
-
else:
|
|
26
|
-
return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
|
|
23
|
+
return self.word_in_(preamble.lower()) if preamble else np.array([self.zero])
|
|
@@ -15,7 +15,4 @@ class WordInTransition(WordIn):
|
|
|
15
15
|
else:
|
|
16
16
|
transition = ''
|
|
17
17
|
|
|
18
|
-
if transition
|
|
19
|
-
return self.word_in_str(transition.lower())
|
|
20
|
-
else:
|
|
21
|
-
return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
|
|
18
|
+
return self.word_in_(transition.lower()) if transition else np.array([self.zero])
|
|
@@ -10,6 +10,5 @@ class WordInValue(WordIn):
|
|
|
10
10
|
def extract(self, candidate: Candidate) -> np.ndarray:
|
|
11
11
|
"""Returns array of matching words for first line"""
|
|
12
12
|
if value := candidate.line_data_list[0].value:
|
|
13
|
-
return self.
|
|
14
|
-
|
|
15
|
-
return np.array([np.zeros(shape=[self.dimension], dtype=np.int8)])
|
|
13
|
+
return self.word_in_(value.lower())
|
|
14
|
+
return np.array([self.zero])
|
|
@@ -10,6 +10,5 @@ class WordInVariable(WordIn):
|
|
|
10
10
|
def extract(self, candidate: Candidate) -> np.ndarray:
|
|
11
11
|
"""Returns array of matching words for first line"""
|
|
12
12
|
if variable := candidate.line_data_list[0].variable:
|
|
13
|
-
return self.
|
|
14
|
-
|
|
15
|
-
return np.zeros(shape=[self.dimension], dtype=np.int8)
|
|
13
|
+
return self.word_in_(variable.lower())
|
|
14
|
+
return np.array([self.zero])
|