credsweeper 1.11.3__py3-none-any.whl → 1.11.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47)
  1. credsweeper/__init__.py +1 -1
  2. credsweeper/__main__.py +1 -1
  3. credsweeper/app.py +21 -44
  4. credsweeper/common/constants.py +2 -5
  5. credsweeper/credentials/candidate_key.py +1 -1
  6. credsweeper/credentials/credential_manager.py +4 -3
  7. credsweeper/credentials/line_data.py +2 -5
  8. credsweeper/deep_scanner/abstract_scanner.py +269 -14
  9. credsweeper/deep_scanner/deb_scanner.py +55 -0
  10. credsweeper/deep_scanner/deep_scanner.py +39 -241
  11. credsweeper/deep_scanner/gzip_scanner.py +1 -1
  12. credsweeper/deep_scanner/jclass_scanner.py +74 -0
  13. credsweeper/deep_scanner/patch_scanner.py +48 -0
  14. credsweeper/deep_scanner/pkcs_scanner.py +41 -0
  15. credsweeper/deep_scanner/rpm_scanner.py +49 -0
  16. credsweeper/deep_scanner/sqlite3_scanner.py +79 -0
  17. credsweeper/file_handler/byte_content_provider.py +2 -2
  18. credsweeper/file_handler/content_provider.py +1 -1
  19. credsweeper/file_handler/data_content_provider.py +3 -4
  20. credsweeper/file_handler/diff_content_provider.py +2 -2
  21. credsweeper/file_handler/file_path_extractor.py +1 -1
  22. credsweeper/file_handler/files_provider.py +2 -4
  23. credsweeper/file_handler/patches_provider.py +5 -2
  24. credsweeper/file_handler/string_content_provider.py +2 -2
  25. credsweeper/file_handler/struct_content_provider.py +1 -1
  26. credsweeper/file_handler/text_content_provider.py +2 -2
  27. credsweeper/filters/__init__.py +1 -0
  28. credsweeper/filters/value_base64_encoded_pem_check.py +1 -1
  29. credsweeper/filters/value_base64_key_check.py +9 -14
  30. credsweeper/filters/value_entropy_base64_check.py +2 -6
  31. credsweeper/filters/value_json_web_key_check.py +37 -0
  32. credsweeper/filters/value_pattern_check.py +64 -16
  33. credsweeper/ml_model/features/file_extension.py +1 -1
  34. credsweeper/ml_model/ml_validator.py +43 -21
  35. credsweeper/rules/config.yaml +51 -9
  36. credsweeper/rules/rule.py +3 -3
  37. credsweeper/scanner/scan_type/multi_pattern.py +1 -2
  38. credsweeper/secret/config.json +6 -6
  39. credsweeper/utils/hop_stat.py +3 -3
  40. credsweeper/utils/pem_key_detector.py +6 -4
  41. credsweeper/utils/util.py +154 -79
  42. {credsweeper-1.11.3.dist-info → credsweeper-1.11.5.dist-info}/METADATA +3 -6
  43. {credsweeper-1.11.3.dist-info → credsweeper-1.11.5.dist-info}/RECORD +46 -40
  44. credsweeper/deep_scanner/pkcs12_scanner.py +0 -45
  45. {credsweeper-1.11.3.dist-info → credsweeper-1.11.5.dist-info}/WHEEL +0 -0
  46. {credsweeper-1.11.3.dist-info → credsweeper-1.11.5.dist-info}/entry_points.txt +0 -0
  47. {credsweeper-1.11.3.dist-info → credsweeper-1.11.5.dist-info}/licenses/LICENSE +0 -0
credsweeper/ml_model/ml_validator.py CHANGED
@@ -1,10 +1,11 @@
 import hashlib
+import json
 import logging
 from pathlib import Path
 from typing import List, Tuple, Union, Optional, Dict
 
 import numpy as np
-import onnxruntime as ort
+from onnxruntime import InferenceSession
 
 import credsweeper.ml_model.features as features
 from credsweeper.common.constants import ThresholdPreset, ML_HUNK
@@ -22,6 +23,8 @@ class MlValidator:
     # applied for unknown characters
     FAKE_CHAR = '\x01'
 
+    _dir_path = Path(__file__).parent
+
     def __init__(
             self, #
             threshold: Union[float, ThresholdPreset], #
@@ -36,35 +39,36 @@ class MlValidator:
             ml_model: path to ml model
             ml_providers: coma separated list of providers https://onnxruntime.ai/docs/execution-providers/
         """
-        dir_path = Path(__file__).parent
+        self.__session: Optional[InferenceSession] = None
 
         if ml_config:
             ml_config_path = Path(ml_config)
         else:
-            ml_config_path = dir_path / "ml_config.json"
+            ml_config_path = MlValidator._dir_path / "ml_config.json"
         with open(ml_config_path, "rb") as f:
-            md5_config = hashlib.md5(f.read()).hexdigest()
+            __ml_config_data = f.read()
+
+        model_config = json.loads(__ml_config_data)
 
         if ml_model:
             ml_model_path = Path(ml_model)
         else:
-            ml_model_path = dir_path / "ml_model.onnx"
+            ml_model_path = MlValidator._dir_path / "ml_model.onnx"
         with open(ml_model_path, "rb") as f:
-            md5_model = hashlib.md5(f.read()).hexdigest()
+            self.__ml_model_data = f.read()
 
         if ml_providers:
-            providers = ml_providers.split(',')
+            self.providers = ml_providers.split(',')
         else:
-            providers = ["CPUExecutionProvider"]
-        self.model_session = ort.InferenceSession(ml_model_path, providers=providers)
+            self.providers = ["CPUExecutionProvider"]
 
-        model_config = Util.json_load(ml_config_path)
         if isinstance(threshold, float):
             self.threshold = threshold
         elif isinstance(threshold, ThresholdPreset) and "thresholds" in model_config:
             self.threshold = model_config["thresholds"][threshold.value]
         else:
             self.threshold = 0.5
+            logger.warning(f"Use fallback threshold value: {self.threshold}")
 
         char_set = set(model_config["char_set"])
         if len(char_set) != len(model_config["char_set"]):
@@ -80,26 +84,44 @@ class MlValidator:
 
         self.common_feature_list = []
         self.unique_feature_list = []
-        logger.info("Init ML validator with %s provider; config:'%s' md5:%s model:'%s' md5:%s", providers,
-                    ml_config_path, md5_config, ml_model_path, md5_model)
-        logger.debug("ML validator details: %s", model_config)
+        if logger.isEnabledFor(logging.INFO):
+            config_dbg = str(model_config) if logger.isEnabledFor(logging.DEBUG) else ''
+            config_md5 = hashlib.md5(__ml_config_data).hexdigest()
+            model_md5 = hashlib.md5(self.__ml_model_data).hexdigest()
+            logger.info("Init ML validator with providers: '%s' ; model:'%s' md5:%s ; config:'%s' md5:%s ; %s",
+                        self.providers, ml_config_path, config_md5, ml_model_path, model_md5, config_dbg)
         for feature_definition in model_config["features"]:
             feature_class = feature_definition["type"]
             kwargs = feature_definition.get("kwargs", {})
             feature_constructor = getattr(features, feature_class, None)
             if feature_constructor is None:
-                raise ValueError(f'Error while parsing model details. Cannot create feature "{feature_class}"')
+                raise ValueError(f"Error while parsing model details. Cannot create feature '{feature_class}'"
+                                 f" from {feature_definition}")
             try:
                 feature = feature_constructor(**kwargs)
             except TypeError:
-                logger.error(f'Error while parsing model details. Cannot create feature "{feature_class}"'
-                             f' with kwargs "{kwargs}"')
+                logger.error(f"Error while parsing model details. Cannot create feature '{feature_class}'"
+                             f" from {feature_definition}")
                 raise
             if feature_definition["type"] in ["RuleName"]:
                 self.unique_feature_list.append(feature)
             else:
                 self.common_feature_list.append(feature)
 
+    def __reduce__(self):
+        # TypeError: cannot pickle 'onnxruntime.capi.onnxruntime_pybind11_state.InferenceSession' object
+        self.__session = None
+        return super().__reduce__()
+
+    @property
+    def session(self) -> InferenceSession:
+        """session getter to prevent pickle error"""
+        if not self.__session:
+            self.__session = InferenceSession(self.__ml_model_data, providers=self.providers)
+        if not self.__session:
+            raise RuntimeError("InferenceSession was not initialized!")
+        return self.__session
+
     def encode(self, text: str, limit: int) -> np.ndarray:
         """Encodes prepared text to array"""
         result_array: np.ndarray = np.zeros(shape=(limit, self.num_classes), dtype=np.float32)
@@ -136,7 +158,7 @@
             "value_input": value_input.astype(np.float32),
             "feature_input": feature_input.astype(np.float32),
         }
-        result = self.model_session.run(output_names=None, input_feed=input_feed)
+        result = self.session.run(output_names=None, input_feed=input_feed)
         if result and isinstance(result[0], np.ndarray):
             return result[0]
         raise RuntimeError(f"Unexpected type {type(result[0])}")
@@ -178,8 +200,8 @@
         default_candidate = candidates[0]
         line_input = self.encode_line(default_candidate.line_data_list[0].line,
                                       default_candidate.line_data_list[0].value_start)[np.newaxis]
-        variable = ""
-        value = ""
+        variable = ''
+        value = ''
         for candidate in candidates:
             if not variable and candidate.line_data_list[0].variable:
                 variable = candidate.line_data_list[0].variable
@@ -251,8 +273,8 @@
                                                       features_list)
         is_cred = probability > self.threshold
         if logger.isEnabledFor(logging.DEBUG):
-            for i in range(len(is_cred)):
-                logger.debug("ML decision: %s with prediction: %s for value: %s", is_cred[i], probability[i],
+            for i, decision in enumerate(is_cred):
+                logger.debug("ML decision: %s with prediction: %s for value: %s", decision, probability[i],
                              group_list[i][0])
         # apply cast to float to avoid json export issue
         return is_cred, probability.astype(float)
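
The most significant change in ml_validator.py is that the ONNX InferenceSession is no longer created eagerly in __init__: the raw model bytes are stored, the session is built lazily through the new session property, and __reduce__ drops it before pickling, which avoids the "cannot pickle ... InferenceSession" error when the validator object is handed to worker processes. The snippet below is a minimal illustration of that pattern, not CredSweeper code; LazyModel is a hypothetical class and threading.Lock merely stands in for the unpicklable session object.

import pickle
import threading
from typing import Optional


class LazyModel:
    """Hypothetical sketch of the lazy-session pattern used by MlValidator above."""

    def __init__(self, model_data: bytes):
        self._model_data = model_data  # plain bytes, always picklable
        self._session: Optional[threading.Lock] = None  # stand-in for an unpicklable runtime object

    def __reduce__(self):
        # drop the live session before pickling so only picklable state remains
        self._session = None
        return super().__reduce__()

    @property
    def session(self) -> threading.Lock:
        # created on first use, so each worker process builds its own instance
        if self._session is None:
            self._session = threading.Lock()  # real code builds InferenceSession(model_bytes, providers=...)
        return self._session


model = LazyModel(b"model-bytes")
_ = model.session                            # the unpicklable object now exists
clone = pickle.loads(pickle.dumps(model))    # still works: __reduce__ reset the session
print(clone.session is not model.session)    # True: the clone lazily rebuilds its own session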
credsweeper/rules/config.yaml CHANGED
@@ -3,7 +3,7 @@
   confidence: weak
   type: pattern
   values:
-  - (?P<variable>(\w*(?i:비밀번호|비번|패스워드|키|암호화?|토큰|(?<!by)pass(?!ed|ing|es|age)|\bpwd?\b|token|secret|key|cred)\w*)\s*(설정은|[=:!]{1,3}))?\s*([._0-9A-Za-z\[\]]*get(env)?\s*\(\s*(?(variable)[^,]+)|[\"'\\]*(\\*(['\"]|&(quot|apos);)){0,4}(\w*(?i:(?<!by)pass(?!ed|ing|es|age|\s+[a-z]{3,80})|\bpwd?\b|token|secret|key|cred)\w*)(\\*(['\"]|&(quot|apos);)){0,4})\s*,\s*(default\s*=\s*)?([brufl@]{1,2}(?=\\*['\"&]))?(?P<lq>(\\*(['\"]|&(quot|apos);)){1,4})(?P<value>(.(?!(?P=lq))){4,80}.?)
+  - (?P<variable>(\w*(?i:비밀번호|비번|패스워드|키|암호화?|토큰|(?<!by)pass(?!ed|ing|ion|es|age)|\bpwd?\b|token|secret|key|cred)\w*)\s*(설정은|[=:!]{1,3}))?\s*([._0-9A-Za-z\[\]]*get(env)?\s*\(\s*(?(variable)[^,]+)|[\"'\\]*(\\*(['\"]|&(quot|apos);)){0,4}(\w*(?i:(?<!by)pass(?!ed|ing|ion|es|age|\s+[a-z]{3,80})|\bpwd?\b|token|secret|key|cred)\w*)(\\*(['\"]|&(quot|apos);)){0,4})\s*,\s*(default\s*=\s*)?([brufl@]{1,2}(?=\\*['\"&]))?(?P<lq>(\\*(['\"]|&(quot|apos);)){1,4})(?P<value>(.(?!(?P=lq))){4,80}.?)
   filter_type:
   - ValueAllowlistCheck
   - LineGitBinaryCheck
@@ -34,7 +34,7 @@
   confidence: weak
   type: pattern
   values:
-  - (?P<wrap>[`'\"(])?\s*(?P<variable>(\w*(?i:(?<!by)passw?o?r?d?s?(?!ed|ing|es|age)|pwd?\b|\bp/w\b|token|secret|key|credential)\w*|비밀번호|비번|패스워드|키|암호화?|토큰))[`'\"]*(\s+(?i:is|are|was|were)(\s*[:-])?\s+|\s*(설정은|[=:!]{1,3})\s*)(?P<quote>[`'\"]{1,6})?(?P<value>(?(quote)(?(wrap)[^`'\")]{4,80}|[^`'\"]{4,80})|(?(wrap)[^`'\")]{4,80}|\S{4,80})))
+  - (?P<wrap>[`'\"(])?\s*(?P<variable>(\w*(?i:(?<!by)passw?o?r?d?s?(?!ed|ing|ion|es|age)|pwd?\b|\bp/w\b|token|secret|key|credential)\w*|비밀번호|비번|패스워드|키|암호화?|토큰))[`'\"]*(\s+(?i:is|are|was|were)(\s*[:-])?\s+|\s*(설정은|[=:!]{1,3})\s*)(?P<quote>[`'\"]{1,6})?(?P<value>(?(quote)(?(wrap)[^`'\")]{4,80}|[^`'\"]{4,80})|(?(wrap)[^`'\")]{4,80}|\S{4,80})))
   filter_type:
   - ValueAllowlistCheck
   - LineGitBinaryCheck
@@ -375,16 +375,16 @@
   - code
   - doc
 
-- name: Heroku API Key
+- name: Heroku Credentials
   severity: high
-  confidence: moderate
+  confidence: strong
   type: pattern
   values:
-  - (?i)(?P<value>heroku(.{0,20})?[0-9a-f]{8}(-[0-9a-f]{4})+-[0-9a-f]{12})(?![0-9A-Za-z_-])
+  - (?P<value>HRKU-([0-9A-Za-z_-]{60}|[0-9A-Fa-f]{8}(-[0-9A-Fa-f]{4}){3}-[0-9A-Fa-f]{12}))
   filter_type: GeneralPattern
   required_substrings:
-  - heroku
-  min_line_len: 24
+  - HRKU-
+  min_line_len: 41
   target:
   - code
   - doc
@@ -413,7 +413,49 @@
   - ValueJsonWebTokenCheck
   required_substrings:
   - eyJ
-  min_line_len: 18
+  min_line_len: 64
+  target:
+  - code
+  - doc
+
+- name: JSON Web Key
+  severity: medium
+  confidence: strong
+  type: pattern
+  values:
+  - (?P<value>\b(e(yJ|yAi|woi|wog|w0K)|W(yJ|3si|wp7|wog|w0K|3sK))[0-9A-Za-z_+/-]{60,8000})
+  filter_type:
+  - ValueJsonWebKeyCheck
+  required_substrings:
+  - eyJ
+  - eyAi
+  - ewoi
+  - ewog
+  - ew0K
+  - WyJ
+  - W3si
+  - Wwp7
+  - Wwog
+  - Ww0K
+  - W3sK
+  min_line_len: 64
+  target:
+  - code
+  - doc
+
+- name: JWK
+  severity: medium
+  confidence: moderate
+  type: multi
+  values:
+  - (?P<value>['"]?\b(?P<variable>kty)[^0-9A-Za-z_-]{1,8}(RSA|EC|oct)\b['"]?)
+  - (?P<variable>\b[dk])[^0-9A-Za-z_-]{1,8}(?P<value>[0-9A-Za-z_-]{22,8000})(?![=0-9A-Za-z_-])
+  filter_type:
+  - ValuePatternCheck
+  - ValueCoupleKeywordCheck(3)
+  required_substrings:
+  - kty
+  min_line_len: 8
   target:
   - code
   - doc
@@ -1481,7 +1523,7 @@
   confidence: moderate
   type: keyword
   values:
-  - (?<!by)pass(?!ed|ing|es|age|\s+[a-z]{3,80})|pw(d|\b)
+  - (?<!by)pass(?!ed|ing|ion|es|age|\s+[a-z]{3,80})|pw(d|\b)
   filter_type: PasswordKeyword
   use_ml: true
   min_line_len: 10
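
Two rule changes stand out in config.yaml: the Heroku rule now targets the current HRKU- token format with strong confidence, and two new rules (JSON Web Key and JWK) catch base64-encoded JSON key material. The required_substrings of the JSON Web Key rule are simply the Base64 encodings of common JSON openings, which lets the scanner pre-filter lines cheaply before the heavy regex runs. A quick illustration of how those prefixes arise (plain Python, independent of CredSweeper):

import base64

# Base64 of typical JSON/JWK openings explains required_substrings such as eyJ, eyAi, WyJ and W3si
for text in ('{"kty":"RSA"}', '{ "kty": "EC" }', '["secret"]', '[{"k":"..."}]'):
    encoded = base64.b64encode(text.encode()).decode()
    print(f"{text!r:24} -> {encoded[:8]}...")
# '{"kty":"RSA"}'    -> eyJrdHki...
# '{ "kty": "EC" }'  -> eyAia3R5...
# '["secret"]'       -> WyJzZWNy...
# '[{"k":"..."}]'    -> W3siayI6...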
credsweeper/rules/rule.py CHANGED
@@ -179,7 +179,6 @@ class Rule:
             for value in _values:
                 _pattern = KeywordPattern.get_keyword_pattern(value)
                 _patterns.append(_pattern)
-            return _patterns
         elif RuleType.MULTI == self.rule_type and 2 == len(_values) \
                 or self.rule_type in (RuleType.PATTERN, RuleType.PEM_KEY) and 0 < len(_values):
             for value in _values:
@@ -188,8 +187,9 @@
                 logger.warning(f"Rule {self.rule_name} has extra patterns. Only single pattern supported.")
             elif RuleType.MULTI == self.rule_type and 2 < len(_values):
                 logger.warning(f"Rule {self.rule_name} has extra patterns. Only two patterns supported.")
-            return _patterns
-        raise ValueError(f"Malformed rule config file. Rule '{self.rule_name}' type '{self.rule_type}' is invalid.")
+        else:
+            raise ValueError(f"Malformed rule config file. Rule '{self.rule_name}' type '{self.rule_type}' is invalid.")
+        return _patterns
 
     @cached_property
     def patterns(self) -> List[re.Pattern]:
credsweeper/scanner/scan_type/multi_pattern.py CHANGED
@@ -37,8 +37,7 @@ class MultiPattern(ScanType):
             "Rules provided to MultiPattern.run should have pattern_type equal to MULTI_PATTERN"
 
         candidates = cls._get_candidates(config, rule, target)
-        if not candidates:
-            return candidates
+
         for candidate in candidates:
             line_pos_margin = 1
             while line_pos_margin <= cls.MAX_SEARCH_MARGIN:
credsweeper/secret/config.json CHANGED
@@ -5,9 +5,13 @@
         ".aar",
         ".apk",
         ".bz2",
+        ".class",
         ".gz",
+        ".jar",
         ".lzma",
+        ".rpm",
         ".tar",
+        ".war",
         ".xz",
         ".zip"
     ],
@@ -28,7 +32,6 @@
         ".avi",
         ".bin",
         ".bmp",
-        ".class",
         ".css",
         ".dmg",
         ".ear",
@@ -40,7 +43,6 @@
         ".ico",
         ".img",
         ".info",
-        ".jar",
         ".jpeg",
         ".jpg",
         ".map",
@@ -62,10 +64,8 @@
         ".rar",
         ".rc",
         ".rc2",
-        ".rar",
         ".realm",
         ".res",
-        ".rpm",
         ".s7z",
         ".scss",
         ".so",
@@ -76,7 +76,6 @@
         ".ttf",
         ".vcxproj",
         ".vdproj",
-        ".war",
         ".wav",
         ".webm",
         ".webp",
@@ -161,7 +160,8 @@
     "bruteforce_list": [
         "",
         "changeit",
-        "changeme"
+        "changeme",
+        "tizen"
     ],
     "check_for_literals": true,
     "min_pattern_value_length": 12,
credsweeper/utils/hop_stat.py CHANGED
@@ -1,5 +1,5 @@
 import statistics
-from typing import Tuple
+from typing import Tuple, Dict
 
 
 class HopStat:
@@ -62,7 +62,7 @@ class HopStat:
     })
 
     def __init__(self):
-        self.__hop_dict = dict()
+        self.__hop_dict: Dict[Tuple[str, str], int] = {}
         base = ''.join(x for x in HopStat.KEYBOARD)
         for a in (x for x in base if '\0' != x):
             for b in (x for x in base if '\0' != x):
@@ -81,7 +81,7 @@ class HopStat:
     def __get_xyz(c: str) -> Tuple[int, int, int]:
         """Returns axial coordinates of a char on keyboad qwerty"""
         x = y = z = 0
-        for i in range(len(HopStat.KEYBOARD)):
+        for i, _ in enumerate(HopStat.KEYBOARD):
             x = HopStat.KEYBOARD[i].find(c)
             if 0 <= x:
                 z = i
credsweeper/utils/pem_key_detector.py CHANGED
@@ -4,7 +4,7 @@ import re
 import string
 from typing import List
 
-from credsweeper.common.constants import PEM_BEGIN_PATTERN, PEM_END_PATTERN, ENTROPY_LIMIT_BASE64
+from credsweeper.common.constants import PEM_BEGIN_PATTERN, PEM_END_PATTERN, Chars
 from credsweeper.config import Config
 from credsweeper.credentials import LineData
 from credsweeper.file_handler.analysis_target import AnalysisTarget
@@ -12,10 +12,12 @@ from credsweeper.utils import Util
 
 logger = logging.getLogger(__name__)
 
+ENTROPY_LIMIT_BASE64 = 4.5
+
 
 class PemKeyDetector:
     """Class to detect PEM PRIVATE keys only"""
-    base64set = set(string.ascii_uppercase) | set(string.ascii_lowercase) | set(string.digits) | {'+', '/', '='}
+    base64set = set(Chars.BASE64STDPAD_CHARS.value)
 
     ignore_starts = [PEM_BEGIN_PATTERN, "Proc-Type", "Version", "DEK-Info"]
     wrap_characters = "\\'\";,[]#*!"
@@ -64,7 +66,7 @@ class PemKeyDetector:
             if PEM_BEGIN_PATTERN in subline:
                 begin_pattern_not_passed = False
                 continue
-            elif PEM_END_PATTERN in subline:
+            if PEM_END_PATTERN in subline:
                 if "PGP" in target.line_strip:
                     # Check if entropy is high enough for base64 set with padding sign
                     entropy = Util.get_shannon_entropy(key_data)
@@ -124,7 +126,7 @@ class PemKeyDetector:
         line = line.strip(string.whitespace)
         if line.startswith("//"):
             # simplify first condition for speed-up of doxygen style processing
-            if line.startswith("// ") or line.startswith("/// "):
+            if line.startswith(("// ", "/// ")):
                 # Assume that the commented line is to be separated from base64 code, it may be a part of PEM, otherwise
                 line = line[3:]
         if line.startswith("/*"):
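
pem_key_detector.py now defines ENTROPY_LIMIT_BASE64 = 4.5 locally and keeps the Shannon-entropy gate for PGP-looking blocks. As a reminder of what that threshold means: characters drawn uniformly from the 64-symbol base64 alphabet approach log2(64) = 6 bits per character, so genuine key material comfortably exceeds 4.5 while repetitive or prose-like fragments usually fall short. The helper below is a generic Shannon-entropy sketch using the standard formula; it is not CredSweeper's Util.get_shannon_entropy, whose exact implementation may differ.

import base64
import math
import secrets
from collections import Counter


def shannon_entropy(data: str) -> float:
    """Shannon entropy in bits per character: -sum(p * log2(p))."""
    if not data:
        return 0.0
    total = len(data)
    return -sum((n / total) * math.log2(n / total) for n in Counter(data).values())


print(shannon_entropy("AAAAAAAABBBBBBBB"))  # 1.0 -- a repetitive string scores low
random_b64 = base64.b64encode(secrets.token_bytes(512)).decode()
print(shannon_entropy(random_b64))  # close to log2(64) = 6 for long random base64, well above 4.5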