credsweeper 1.11.2__py3-none-any.whl → 1.11.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of credsweeper might be problematic.

Files changed (73)
  1. credsweeper/__init__.py +1 -1
  2. credsweeper/__main__.py +7 -5
  3. credsweeper/app.py +28 -47
  4. credsweeper/common/constants.py +2 -5
  5. credsweeper/common/keyword_pattern.py +15 -9
  6. credsweeper/common/morpheme_checklist.txt +4 -2
  7. credsweeper/credentials/candidate_key.py +1 -1
  8. credsweeper/credentials/credential_manager.py +4 -3
  9. credsweeper/credentials/line_data.py +16 -15
  10. credsweeper/deep_scanner/abstract_scanner.py +10 -1
  11. credsweeper/deep_scanner/deb_scanner.py +48 -0
  12. credsweeper/deep_scanner/deep_scanner.py +65 -43
  13. credsweeper/deep_scanner/docx_scanner.py +1 -1
  14. credsweeper/deep_scanner/encoder_scanner.py +2 -2
  15. credsweeper/deep_scanner/gzip_scanner.py +1 -1
  16. credsweeper/deep_scanner/html_scanner.py +3 -3
  17. credsweeper/deep_scanner/jks_scanner.py +2 -4
  18. credsweeper/deep_scanner/lang_scanner.py +2 -2
  19. credsweeper/deep_scanner/lzma_scanner.py +40 -0
  20. credsweeper/deep_scanner/pkcs12_scanner.py +3 -5
  21. credsweeper/deep_scanner/xml_scanner.py +2 -2
  22. credsweeper/file_handler/byte_content_provider.py +2 -2
  23. credsweeper/file_handler/content_provider.py +1 -1
  24. credsweeper/file_handler/data_content_provider.py +23 -14
  25. credsweeper/file_handler/diff_content_provider.py +2 -2
  26. credsweeper/file_handler/file_path_extractor.py +1 -1
  27. credsweeper/file_handler/files_provider.py +2 -4
  28. credsweeper/file_handler/patches_provider.py +1 -1
  29. credsweeper/file_handler/string_content_provider.py +2 -2
  30. credsweeper/file_handler/struct_content_provider.py +1 -1
  31. credsweeper/file_handler/text_content_provider.py +2 -2
  32. credsweeper/filters/value_array_dictionary_check.py +3 -1
  33. credsweeper/filters/value_azure_token_check.py +1 -2
  34. credsweeper/filters/value_base64_encoded_pem_check.py +1 -1
  35. credsweeper/filters/value_base64_part_check.py +30 -21
  36. credsweeper/filters/value_discord_bot_check.py +1 -2
  37. credsweeper/filters/value_entropy_base32_check.py +11 -31
  38. credsweeper/filters/value_entropy_base36_check.py +11 -34
  39. credsweeper/filters/value_entropy_base64_check.py +15 -48
  40. credsweeper/filters/value_entropy_base_check.py +37 -0
  41. credsweeper/filters/value_file_path_check.py +1 -1
  42. credsweeper/filters/value_hex_number_check.py +3 -3
  43. credsweeper/filters/value_json_web_token_check.py +4 -5
  44. credsweeper/filters/value_pattern_check.py +64 -16
  45. credsweeper/filters/value_string_type_check.py +11 -3
  46. credsweeper/filters/value_token_base32_check.py +0 -4
  47. credsweeper/filters/value_token_base36_check.py +0 -4
  48. credsweeper/filters/value_token_base64_check.py +0 -4
  49. credsweeper/filters/value_token_check.py +1 -1
  50. credsweeper/ml_model/features/file_extension.py +2 -2
  51. credsweeper/ml_model/features/morpheme_dense.py +0 -4
  52. credsweeper/ml_model/features/rule_name.py +1 -1
  53. credsweeper/ml_model/features/word_in_path.py +0 -9
  54. credsweeper/ml_model/features/word_in_postamble.py +0 -11
  55. credsweeper/ml_model/features/word_in_preamble.py +0 -11
  56. credsweeper/ml_model/features/word_in_transition.py +0 -11
  57. credsweeper/ml_model/features/word_in_value.py +0 -11
  58. credsweeper/ml_model/features/word_in_variable.py +0 -11
  59. credsweeper/ml_model/ml_validator.py +45 -22
  60. credsweeper/rules/config.yaml +238 -208
  61. credsweeper/rules/rule.py +3 -3
  62. credsweeper/scanner/scan_type/scan_type.py +2 -3
  63. credsweeper/scanner/scanner.py +7 -1
  64. credsweeper/secret/config.json +16 -5
  65. credsweeper/utils/hop_stat.py +3 -3
  66. credsweeper/utils/pem_key_detector.py +8 -7
  67. credsweeper/utils/util.py +76 -146
  68. {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/METADATA +1 -1
  69. {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/RECORD +72 -70
  70. credsweeper/utils/entropy_validator.py +0 -72
  71. {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/WHEEL +0 -0
  72. {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/entry_points.txt +0 -0
  73. {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/licenses/LICENSE +0 -0

credsweeper/filters/value_token_check.py
@@ -17,7 +17,7 @@ class ValueTokenCheck(Filter):
 
     """
 
-    SPLIT_PATTERN = r" |;|\)|\(|{|}|<|>|\[|\]|`"
+    SPLIT_PATTERN = r"(?<!,) (?!,)|;|\)|\(|{|}|<|>|\[|\]|`"
 
     def __init__(self, config: Config = None) -> None:
         pass
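
The new pattern splits candidate lines on the same delimiters as before, but a space is no longer treated as a separator when it sits next to a comma, so comma-separated enumerations stay intact. A small illustration (not taken from the package's tests; OLD and NEW are just local names for the two patterns):

import re

OLD = r" |;|\)|\(|{|}|<|>|\[|\]|`"
NEW = r"(?<!,) (?!,)|;|\)|\(|{|}|<|>|\[|\]|`"

line = "token = value1, value2"
print(re.split(OLD, line))  # ['token', '=', 'value1,', 'value2'] - enumeration torn apart
print(re.split(NEW, line))  # ['token', '=', 'value1, value2'] - kept together
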

credsweeper/ml_model/features/file_extension.py
@@ -15,10 +15,10 @@ class FileExtension(WordIn):
     """
 
     def __init__(self, extensions: List[str]) -> None:
-        super().__init__(extensions)
+        super().__init__(words=extensions)
 
     def __call__(self, candidates: List[Candidate]) -> np.ndarray:
-        extension_set = set([candidate.line_data_list[0].file_type.lower() for candidate in candidates])
+        extension_set = set(candidate.line_data_list[0].file_type.lower() for candidate in candidates)
         return self.word_in_set(extension_set)
 
     def extract(self, candidate: Candidate) -> Any:

credsweeper/ml_model/features/morpheme_dense.py
@@ -6,10 +6,6 @@ from credsweeper.ml_model.features.feature import Feature
 class MorphemeDense(Feature):
     """Feature calculates morphemes density for a value"""
 
-    def __init__(self) -> None:
-        """Class initializer"""
-        super().__init__()
-
     def extract(self, candidate: Candidate) -> float:
         if value := candidate.line_data_list[0].value.lower():
             morphemes_counter = 0

credsweeper/ml_model/features/rule_name.py
@@ -15,7 +15,7 @@ class RuleName(WordIn):
     """
 
     def __init__(self, rule_names: List[str]) -> None:
-        super().__init__(rule_names)
+        super().__init__(words=rule_names)
 
     def __call__(self, candidates: List[Candidate]) -> np.ndarray:
         candidate_rule_set = set(x.rule_name for x in candidates)

credsweeper/ml_model/features/word_in_path.py
@@ -10,15 +10,6 @@ from credsweeper.ml_model.features.word_in import WordIn
 class WordInPath(WordIn):
     """Categorical feature that corresponds to words in path (POSIX, lowercase)"""
 
-    def __init__(self, words: List[str]) -> None:
-        """WordInPath constructor
-
-        Args:
-            words: list of predefined words - MUST BE IN LOWER CASE & POSIX
-
-        """
-        super().__init__(words)
-
     def __call__(self, candidates: List[Candidate]) -> np.ndarray:
         # actually there must be one path because the candidates are grouped before
         if file_path := candidates[0].line_data_list[0].path:
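
This and the following WordIn* hunks all remove constructors that only forwarded their argument to the base class; a subclass without its own __init__ simply inherits the base initializer, so behavior is unchanged. A minimal sketch with illustrative names (not CredSweeper code):

class WordIn:                      # simplified stand-in for the real base feature
    def __init__(self, words):
        self.words = sorted(set(words))

class WordInSomething(WordIn):     # hypothetical subclass: no __init__ override needed
    pass

assert WordInSomething(["b", "a"]).words == ["a", "b"]
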

credsweeper/ml_model/features/word_in_postamble.py
@@ -1,5 +1,3 @@
-from typing import List
-
 import numpy as np
 
 from credsweeper.common.constants import ML_HUNK
@@ -10,15 +8,6 @@ from credsweeper.ml_model.features.word_in import WordIn
 class WordInPostamble(WordIn):
     """Feature is true if line contains at least one word from predefined list."""
 
-    def __init__(self, words: List[str]) -> None:
-        """Feature returns array of matching words
-
-        Args:
-            words: list of predefined words - MUST BE IN LOWER CASE
-
-        """
-        super().__init__(words)
-
     def extract(self, candidate: Candidate) -> np.ndarray:
         """Returns true if any words in a part of line after value"""
         postamble_end = len(candidate.line_data_list[0].line) \

credsweeper/ml_model/features/word_in_preamble.py
@@ -1,5 +1,3 @@
-from typing import List
-
 import numpy as np
 
 from credsweeper.common.constants import ML_HUNK
@@ -10,15 +8,6 @@ from credsweeper.ml_model.features.word_in import WordIn
 class WordInPreamble(WordIn):
     """Feature is true if line contains at least one word from predefined list."""
 
-    def __init__(self, words: List[str]) -> None:
-        """Feature returns array of matching words
-
-        Args:
-            words: list of predefined words - MUST BE IN LOWER CASE
-
-        """
-        super().__init__(words)
-
     def extract(self, candidate: Candidate) -> np.ndarray:
         """Returns true if any words in line before variable or value"""
         if 0 <= candidate.line_data_list[0].variable_start:

credsweeper/ml_model/features/word_in_transition.py
@@ -1,5 +1,3 @@
-from typing import List
-
 import numpy as np
 
 from credsweeper.credentials import Candidate
@@ -9,15 +7,6 @@ from credsweeper.ml_model.features.word_in import WordIn
 class WordInTransition(WordIn):
     """Feature is true if line contains at least one word from predefined list."""
 
-    def __init__(self, words: List[str]) -> None:
-        """Feature returns array of matching words
-
-        Args:
-            words: list of predefined words - MUST BE IN LOWER CASE
-
-        """
-        super().__init__(words)
-
     def extract(self, candidate: Candidate) -> np.ndarray:
         """Returns true if any words between variable and value"""
         if 0 <= candidate.line_data_list[0].variable_end < candidate.line_data_list[0].value_start:

credsweeper/ml_model/features/word_in_value.py
@@ -1,5 +1,3 @@
-from typing import List
-
 import numpy as np
 
 from credsweeper.credentials import Candidate
@@ -9,15 +7,6 @@ from credsweeper.ml_model.features.word_in import WordIn
 class WordInValue(WordIn):
     """Feature returns true if candidate value contains at least one word from predefined list."""
 
-    def __init__(self, words: List[str]) -> None:
-        """Feature is true if candidate value contains at least one predefined word.
-
-        Args:
-            words: list of predefined words - MUST BE IN LOWER CASE and SORTED (preferred)
-
-        """
-        super().__init__(words)
-
     def extract(self, candidate: Candidate) -> np.ndarray:
         """Returns array of matching words for first line"""
         if value := candidate.line_data_list[0].value:

credsweeper/ml_model/features/word_in_variable.py
@@ -1,5 +1,3 @@
-from typing import List
-
 import numpy as np
 
 from credsweeper.credentials import Candidate
@@ -9,15 +7,6 @@ from credsweeper.ml_model.features.word_in import WordIn
 class WordInVariable(WordIn):
     """Feature returns array of words matching in variable"""
 
-    def __init__(self, words: List[str]) -> None:
-        """Feature is true if candidate value contains at least one predefined word.
-
-        Args:
-            words: list of predefined words - MUST BE IN LOWER CASE
-
-        """
-        super().__init__(words)
-
     def extract(self, candidate: Candidate) -> np.ndarray:
         """Returns array of matching words for first line"""
         if variable := candidate.line_data_list[0].variable:

credsweeper/ml_model/ml_validator.py
@@ -1,10 +1,11 @@
 import hashlib
+import json
 import logging
 from pathlib import Path
 from typing import List, Tuple, Union, Optional, Dict
 
 import numpy as np
-import onnxruntime as ort
+from onnxruntime import InferenceSession
 
 import credsweeper.ml_model.features as features
 from credsweeper.common.constants import ThresholdPreset, ML_HUNK
@@ -22,6 +23,8 @@ class MlValidator:
     # applied for unknown characters
     FAKE_CHAR = '\x01'
 
+    _dir_path = Path(__file__).parent
+
     def __init__(
             self, #
             threshold: Union[float, ThresholdPreset], #
@@ -36,35 +39,36 @@ class MlValidator:
             ml_model: path to ml model
             ml_providers: coma separated list of providers https://onnxruntime.ai/docs/execution-providers/
         """
-        dir_path = Path(__file__).parent
+        self.__session: Optional[InferenceSession] = None
 
         if ml_config:
             ml_config_path = Path(ml_config)
         else:
-            ml_config_path = dir_path / "ml_config.json"
+            ml_config_path = MlValidator._dir_path / "ml_config.json"
         with open(ml_config_path, "rb") as f:
-            md5_config = hashlib.md5(f.read()).hexdigest()
+            __ml_config_data = f.read()
+
+        model_config = json.loads(__ml_config_data)
 
         if ml_model:
             ml_model_path = Path(ml_model)
         else:
-            ml_model_path = dir_path / "ml_model.onnx"
+            ml_model_path = MlValidator._dir_path / "ml_model.onnx"
         with open(ml_model_path, "rb") as f:
-            md5_model = hashlib.md5(f.read()).hexdigest()
+            self.__ml_model_data = f.read()
 
         if ml_providers:
-            providers = ml_providers.split(',')
+            self.providers = ml_providers.split(',')
         else:
-            providers = ["CPUExecutionProvider"]
-        self.model_session = ort.InferenceSession(ml_model_path, providers=providers)
+            self.providers = ["CPUExecutionProvider"]
 
-        model_config = Util.json_load(ml_config_path)
         if isinstance(threshold, float):
             self.threshold = threshold
         elif isinstance(threshold, ThresholdPreset) and "thresholds" in model_config:
             self.threshold = model_config["thresholds"][threshold.value]
         else:
             self.threshold = 0.5
+            logger.warning(f"Use fallback threshold value: {self.threshold}")
 
         char_set = set(model_config["char_set"])
         if len(char_set) != len(model_config["char_set"]):
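
The constructor now reads each file once, keeps the raw bytes, and derives both the parsed config (via json.loads, replacing the separate Util.json_load call) and the later-logged md5 from that same buffer. A rough, self-contained sketch of the idea; the literal below merely stands in for the real ml_config.json contents:

import hashlib
import json

raw = b'{"thresholds": {"medium": 0.5}, "char_set": "ab"}'  # stand-in for the cached file bytes
model_config = json.loads(raw)                # parsed once from the buffer
config_md5 = hashlib.md5(raw).hexdigest()     # hash computed from the same buffer
print(model_config["thresholds"]["medium"], config_md5)
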
@@ -80,25 +84,44 @@ class MlValidator:
 
         self.common_feature_list = []
         self.unique_feature_list = []
-        logger.info("Init ML validator with %s provider; config:'%s' md5:%s model:'%s' md5:%s", providers,
-                    ml_config_path, md5_config, ml_model_path, md5_model)
-        logger.debug("ML validator details: %s", model_config)
+        if logger.isEnabledFor(logging.INFO):
+            config_dbg = str(model_config) if logger.isEnabledFor(logging.DEBUG) else ''
+            config_md5 = hashlib.md5(__ml_config_data).hexdigest()
+            model_md5 = hashlib.md5(self.__ml_model_data).hexdigest()
+            logger.info("Init ML validator with providers: '%s' ; model:'%s' md5:%s ; config:'%s' md5:%s ; %s",
+                        self.providers, ml_config_path, config_md5, ml_model_path, model_md5, config_dbg)
         for feature_definition in model_config["features"]:
             feature_class = feature_definition["type"]
             kwargs = feature_definition.get("kwargs", {})
             feature_constructor = getattr(features, feature_class, None)
             if feature_constructor is None:
-                raise ValueError(f'Error while parsing model details. Cannot create feature "{feature_class}"')
+                raise ValueError(f"Error while parsing model details. Cannot create feature '{feature_class}'"
+                                 f" from {feature_definition}")
             try:
                 feature = feature_constructor(**kwargs)
             except TypeError:
-                raise TypeError(f'Error while parsing model details. Cannot create feature "{feature_class}"'
-                                f' with kwargs "{kwargs}"')
+                logger.error(f"Error while parsing model details. Cannot create feature '{feature_class}'"
+                             f" from {feature_definition}")
+                raise
             if feature_definition["type"] in ["RuleName"]:
                 self.unique_feature_list.append(feature)
             else:
                 self.common_feature_list.append(feature)
 
+    def __reduce__(self):
+        # TypeError: cannot pickle 'onnxruntime.capi.onnxruntime_pybind11_state.InferenceSession' object
+        self.__session = None
+        return super().__reduce__()
+
+    @property
+    def session(self) -> InferenceSession:
+        """session getter to prevent pickle error"""
+        if not self.__session:
+            self.__session = InferenceSession(self.__ml_model_data, providers=self.providers)
+        if not self.__session:
+            raise RuntimeError("InferenceSession was not initialized!")
+        return self.__session
+
     def encode(self, text: str, limit: int) -> np.ndarray:
         """Encodes prepared text to array"""
         result_array: np.ndarray = np.zeros(shape=(limit, self.num_classes), dtype=np.float32)
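
The InferenceSession is no longer created in the constructor: the model bytes stay on the instance, __reduce__ drops the unpicklable session before pickling, and the session property recreates it on first use (for example in a worker process after unpickling). A minimal sketch of that pattern, using threading.Lock as a stand-in for the unpicklable session object; the class and attribute names here are illustrative only:

import pickle
import threading

class LazyHolder:
    def __init__(self, data: bytes) -> None:
        self.data = data            # picklable payload (like the cached model bytes)
        self.__resource = None      # heavy object is not created here

    def __reduce__(self):
        # drop the unpicklable resource before default pickling captures the state
        self.__resource = None
        return super().__reduce__()

    @property
    def resource(self):
        # lazily (re)create the resource, e.g. after unpickling in a worker process
        if self.__resource is None:
            self.__resource = threading.Lock()  # a Lock cannot be pickled either
        return self.__resource

holder = LazyHolder(b"model bytes")
_ = holder.resource                             # resource exists from here on
clone = pickle.loads(pickle.dumps(holder))      # works because __reduce__ cleared it
assert clone.data == b"model bytes" and clone.resource is not None
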
@@ -135,7 +158,7 @@ class MlValidator:
             "value_input": value_input.astype(np.float32),
             "feature_input": feature_input.astype(np.float32),
         }
-        result = self.model_session.run(output_names=None, input_feed=input_feed)
+        result = self.session.run(output_names=None, input_feed=input_feed)
         if result and isinstance(result[0], np.ndarray):
             return result[0]
         raise RuntimeError(f"Unexpected type {type(result[0])}")
@@ -177,8 +200,8 @@ class MlValidator:
         default_candidate = candidates[0]
         line_input = self.encode_line(default_candidate.line_data_list[0].line,
                                       default_candidate.line_data_list[0].value_start)[np.newaxis]
-        variable = ""
-        value = ""
+        variable = ''
+        value = ''
         for candidate in candidates:
             if not variable and candidate.line_data_list[0].variable:
                 variable = candidate.line_data_list[0].variable
@@ -229,7 +252,7 @@ class MlValidator:
         features_list = []
         probability: np.ndarray = np.zeros(len(group_list), dtype=np.float32)
         head = tail = 0
-        for group_key, candidates in group_list:
+        for _group_key, candidates in group_list:
             line_input, variable_input, value_input, feature_array = self.get_group_features(candidates)
             line_input_list.append(line_input)
             variable_input_list.append(variable_input)
@@ -250,8 +273,8 @@ class MlValidator:
                                           features_list)
         is_cred = probability > self.threshold
         if logger.isEnabledFor(logging.DEBUG):
-            for i in range(len(is_cred)):
-                logger.debug("ML decision: %s with prediction: %s for value: %s", is_cred[i], probability[i],
+            for i, decision in enumerate(is_cred):
+                logger.debug("ML decision: %s with prediction: %s for value: %s", decision, probability[i],
                              group_list[i][0])
         # apply cast to float to avoid json export issue
         return is_cred, probability.astype(float)
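
For reference, the thresholding and the final cast behave as below; this snippet is an illustration, not part of the package. np.float32 values are not JSON serializable on their own, while the np.float64 values produced by astype(float) subclass Python float and export cleanly:

import json
import numpy as np

probability = np.array([0.12, 0.87, 0.55], dtype=np.float32)
is_cred = probability > 0.5                         # boolean ndarray: [False, True, True]
for i, decision in enumerate(is_cred):              # same loop shape as the new debug logging
    print(i, bool(decision), float(probability[i]))
print(json.dumps(list(probability.astype(float))))  # works only after the float cast
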