credsweeper 1.11.2__py3-none-any.whl → 1.11.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of credsweeper might be problematic. Click here for more details.

Files changed (73)
  1. credsweeper/__init__.py +1 -1
  2. credsweeper/__main__.py +7 -5
  3. credsweeper/app.py +28 -47
  4. credsweeper/common/constants.py +2 -5
  5. credsweeper/common/keyword_pattern.py +15 -9
  6. credsweeper/common/morpheme_checklist.txt +4 -2
  7. credsweeper/credentials/candidate_key.py +1 -1
  8. credsweeper/credentials/credential_manager.py +4 -3
  9. credsweeper/credentials/line_data.py +16 -15
  10. credsweeper/deep_scanner/abstract_scanner.py +10 -1
  11. credsweeper/deep_scanner/deb_scanner.py +48 -0
  12. credsweeper/deep_scanner/deep_scanner.py +65 -43
  13. credsweeper/deep_scanner/docx_scanner.py +1 -1
  14. credsweeper/deep_scanner/encoder_scanner.py +2 -2
  15. credsweeper/deep_scanner/gzip_scanner.py +1 -1
  16. credsweeper/deep_scanner/html_scanner.py +3 -3
  17. credsweeper/deep_scanner/jks_scanner.py +2 -4
  18. credsweeper/deep_scanner/lang_scanner.py +2 -2
  19. credsweeper/deep_scanner/lzma_scanner.py +40 -0
  20. credsweeper/deep_scanner/pkcs12_scanner.py +3 -5
  21. credsweeper/deep_scanner/xml_scanner.py +2 -2
  22. credsweeper/file_handler/byte_content_provider.py +2 -2
  23. credsweeper/file_handler/content_provider.py +1 -1
  24. credsweeper/file_handler/data_content_provider.py +23 -14
  25. credsweeper/file_handler/diff_content_provider.py +2 -2
  26. credsweeper/file_handler/file_path_extractor.py +1 -1
  27. credsweeper/file_handler/files_provider.py +2 -4
  28. credsweeper/file_handler/patches_provider.py +1 -1
  29. credsweeper/file_handler/string_content_provider.py +2 -2
  30. credsweeper/file_handler/struct_content_provider.py +1 -1
  31. credsweeper/file_handler/text_content_provider.py +2 -2
  32. credsweeper/filters/value_array_dictionary_check.py +3 -1
  33. credsweeper/filters/value_azure_token_check.py +1 -2
  34. credsweeper/filters/value_base64_encoded_pem_check.py +1 -1
  35. credsweeper/filters/value_base64_part_check.py +30 -21
  36. credsweeper/filters/value_discord_bot_check.py +1 -2
  37. credsweeper/filters/value_entropy_base32_check.py +11 -31
  38. credsweeper/filters/value_entropy_base36_check.py +11 -34
  39. credsweeper/filters/value_entropy_base64_check.py +15 -48
  40. credsweeper/filters/value_entropy_base_check.py +37 -0
  41. credsweeper/filters/value_file_path_check.py +1 -1
  42. credsweeper/filters/value_hex_number_check.py +3 -3
  43. credsweeper/filters/value_json_web_token_check.py +4 -5
  44. credsweeper/filters/value_pattern_check.py +64 -16
  45. credsweeper/filters/value_string_type_check.py +11 -3
  46. credsweeper/filters/value_token_base32_check.py +0 -4
  47. credsweeper/filters/value_token_base36_check.py +0 -4
  48. credsweeper/filters/value_token_base64_check.py +0 -4
  49. credsweeper/filters/value_token_check.py +1 -1
  50. credsweeper/ml_model/features/file_extension.py +2 -2
  51. credsweeper/ml_model/features/morpheme_dense.py +0 -4
  52. credsweeper/ml_model/features/rule_name.py +1 -1
  53. credsweeper/ml_model/features/word_in_path.py +0 -9
  54. credsweeper/ml_model/features/word_in_postamble.py +0 -11
  55. credsweeper/ml_model/features/word_in_preamble.py +0 -11
  56. credsweeper/ml_model/features/word_in_transition.py +0 -11
  57. credsweeper/ml_model/features/word_in_value.py +0 -11
  58. credsweeper/ml_model/features/word_in_variable.py +0 -11
  59. credsweeper/ml_model/ml_validator.py +45 -22
  60. credsweeper/rules/config.yaml +238 -208
  61. credsweeper/rules/rule.py +3 -3
  62. credsweeper/scanner/scan_type/scan_type.py +2 -3
  63. credsweeper/scanner/scanner.py +7 -1
  64. credsweeper/secret/config.json +16 -5
  65. credsweeper/utils/hop_stat.py +3 -3
  66. credsweeper/utils/pem_key_detector.py +8 -7
  67. credsweeper/utils/util.py +76 -146
  68. {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/METADATA +1 -1
  69. {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/RECORD +72 -70
  70. credsweeper/utils/entropy_validator.py +0 -72
  71. {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/WHEEL +0 -0
  72. {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/entry_points.txt +0 -0
  73. {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/licenses/LICENSE +0 -0
credsweeper/rules/rule.py CHANGED
@@ -179,7 +179,6 @@ class Rule:
179
179
  for value in _values:
180
180
  _pattern = KeywordPattern.get_keyword_pattern(value)
181
181
  _patterns.append(_pattern)
182
- return _patterns
183
182
  elif RuleType.MULTI == self.rule_type and 2 == len(_values) \
184
183
  or self.rule_type in (RuleType.PATTERN, RuleType.PEM_KEY) and 0 < len(_values):
185
184
  for value in _values:
@@ -188,8 +187,9 @@ class Rule:
188
187
  logger.warning(f"Rule {self.rule_name} has extra patterns. Only single pattern supported.")
189
188
  elif RuleType.MULTI == self.rule_type and 2 < len(_values):
190
189
  logger.warning(f"Rule {self.rule_name} has extra patterns. Only two patterns supported.")
191
- return _patterns
192
- raise ValueError(f"Malformed rule config file. Rule '{self.rule_name}' type '{self.rule_type}' is invalid.")
190
+ else:
191
+ raise ValueError(f"Malformed rule config file. Rule '{self.rule_name}' type '{self.rule_type}' is invalid.")
192
+ return _patterns
193
193
 
194
194
  @cached_property
195
195
  def patterns(self) -> List[re.Pattern]:
credsweeper/scanner/scan_type/scan_type.py CHANGED
@@ -38,13 +38,12 @@ class ScanType(ABC):
38
38
  raise NotImplementedError()
39
39
 
40
40
  @classmethod
41
- def filtering(cls, config: Config, target: AnalysisTarget, line_data: LineData, filters: List[Filter]) -> bool:
41
+ def filtering(cls, target: AnalysisTarget, line_data: LineData, filters: List[Filter]) -> bool:
42
42
  """Check if line data should be removed based on filters.
43
43
 
44
44
  If `use_filters` option is false, always return False
45
45
 
46
46
  Args:
47
- config: dict of credsweeper configuration
48
47
  target: AnalysisTarget from which `line_data` was obtained
49
48
  line_data: Line data to check with `filters`
50
49
  filters: Filters to use
@@ -112,7 +111,7 @@ class ScanType(ABC):
112
111
  bypass_start = line_data.value_end
113
112
  bypass_end = offset_end
114
113
 
115
- if config.use_filters and cls.filtering(config, target, line_data, filters):
114
+ if config.use_filters and cls.filtering(target, line_data, filters):
116
115
  if line_data.variable and 0 <= line_data.variable_start < line_data.variable_end:
117
116
  # may be next matched item will be not filtered - let search it after variable
118
117
  bypass_start = line_data.variable_end
credsweeper/scanner/scanner.py CHANGED
@@ -146,7 +146,13 @@ class Scanner:
146
146
  # "cache" - YAPF and pycharm formatters ...
147
147
  matched_keyword = \
148
148
  target_line_stripped_len >= self.min_keyword_len and ( #
149
- '=' in target_line_stripped or ':' in target_line_stripped) #
149
+ '=' in target_line_stripped
150
+ or ':' in target_line_stripped
151
+ or "set" in target_line_stripped
152
+ or "#define" in target_line_stripped
153
+ or "%define" in target_line_stripped
154
+ or "%global" in target_line_stripped
155
+ ) #
150
156
  matched_pem_key = \
151
157
  target_line_stripped_len >= self.min_pem_key_len \
152
158
  and PEM_BEGIN_PATTERN in target_line_stripped and "PRIVATE" in target_line_stripped
credsweeper/secret/config.json CHANGED
@@ -2,10 +2,13 @@
2
2
  "exclude": {
3
3
  "pattern": [],
4
4
  "containers": [
5
+ ".aar",
5
6
  ".apk",
6
7
  ".bz2",
7
8
  ".gz",
9
+ ".lzma",
8
10
  ".tar",
11
+ ".xz",
9
12
  ".zip"
10
13
  ],
11
14
  "documents": [
@@ -20,17 +23,20 @@
20
23
  ],
21
24
  "extension": [
22
25
  ".7z",
26
+ ".a",
23
27
  ".aac",
24
- ".aar",
25
28
  ".avi",
29
+ ".bin",
26
30
  ".bmp",
27
31
  ".class",
28
32
  ".css",
29
33
  ".dmg",
30
34
  ".ear",
31
35
  ".eot",
36
+ ".elf",
32
37
  ".exe",
33
38
  ".gif",
39
+ ".gmo",
34
40
  ".ico",
35
41
  ".img",
36
42
  ".info",
@@ -45,6 +51,7 @@
45
51
  ".mp4",
46
52
  ".npy",
47
53
  ".npz",
54
+ ".obj",
48
55
  ".ogg",
49
56
  ".pak",
50
57
  ".png",
@@ -52,10 +59,13 @@
52
59
  ".pyc",
53
60
  ".pyd",
54
61
  ".pyo",
62
+ ".rar",
55
63
  ".rc",
56
64
  ".rc2",
57
65
  ".rar",
58
66
  ".realm",
67
+ ".res",
68
+ ".rpm",
59
69
  ".s7z",
60
70
  ".scss",
61
71
  ".so",
@@ -70,6 +80,7 @@
70
80
  ".wav",
71
81
  ".webm",
72
82
  ".webp",
83
+ ".wma",
73
84
  ".woff",
74
85
  ".yuv"
75
86
  ],
@@ -160,13 +171,13 @@
160
171
  "line_num",
161
172
  "path",
162
173
  "info",
163
- "value",
164
- "value_start",
165
- "value_end",
166
174
  "variable",
167
175
  "variable_start",
168
176
  "variable_end",
169
- "entropy_validation"
177
+ "value",
178
+ "value_start",
179
+ "value_end",
180
+ "entropy"
170
181
  ],
171
182
  "candidate_output": [
172
183
  "rule",
credsweeper/utils/hop_stat.py CHANGED
@@ -1,5 +1,5 @@
1
1
  import statistics
2
- from typing import Tuple
2
+ from typing import Tuple, Dict
3
3
 
4
4
 
5
5
  class HopStat:
@@ -62,7 +62,7 @@ class HopStat:
62
62
  })
63
63
 
64
64
  def __init__(self):
65
- self.__hop_dict = dict()
65
+ self.__hop_dict: Dict[Tuple[str, str], int] = {}
66
66
  base = ''.join(x for x in HopStat.KEYBOARD)
67
67
  for a in (x for x in base if '\0' != x):
68
68
  for b in (x for x in base if '\0' != x):
@@ -81,7 +81,7 @@ class HopStat:
81
81
  def __get_xyz(c: str) -> Tuple[int, int, int]:
82
82
  """Returns axial coordinates of a char on keyboad qwerty"""
83
83
  x = y = z = 0
84
- for i in range(len(HopStat.KEYBOARD)):
84
+ for i, _ in enumerate(HopStat.KEYBOARD):
85
85
  x = HopStat.KEYBOARD[i].find(c)
86
86
  if 0 <= x:
87
87
  z = i
credsweeper/utils/pem_key_detector.py CHANGED
@@ -4,15 +4,16 @@ import re
4
4
  import string
5
5
  from typing import List
6
6
 
7
- from credsweeper.common.constants import PEM_BEGIN_PATTERN, PEM_END_PATTERN, Chars
7
+ from credsweeper.common.constants import PEM_BEGIN_PATTERN, PEM_END_PATTERN
8
8
  from credsweeper.config import Config
9
9
  from credsweeper.credentials import LineData
10
10
  from credsweeper.file_handler.analysis_target import AnalysisTarget
11
11
  from credsweeper.utils import Util
12
- from credsweeper.utils.entropy_validator import EntropyValidator
13
12
 
14
13
  logger = logging.getLogger(__name__)
15
14
 
15
+ ENTROPY_LIMIT_BASE64 = 4.5
16
+
16
17
 
17
18
  class PemKeyDetector:
18
19
  """Class to detect PEM PRIVATE keys only"""
@@ -65,13 +66,13 @@ class PemKeyDetector:
65
66
  if PEM_BEGIN_PATTERN in subline:
66
67
  begin_pattern_not_passed = False
67
68
  continue
68
- elif PEM_END_PATTERN in subline:
69
+ if PEM_END_PATTERN in subline:
69
70
  if "PGP" in target.line_strip:
70
71
  # Check if entropy is high enough for base64 set with padding sign
71
- entropy_validator = EntropyValidator(key_data, Chars.BASE64STDPAD_CHARS)
72
- if entropy_validator.valid:
72
+ entropy = Util.get_shannon_entropy(key_data)
73
+ if ENTROPY_LIMIT_BASE64 <= entropy:
73
74
  return line_data
74
- logger.debug("Filtered with entropy %f '%s'", entropy_validator.entropy, key_data)
75
+ logger.debug("Filtered with entropy %f '%s'", entropy, key_data)
75
76
  if "OPENSSH" in target.line_strip:
76
77
  # Check whether the key is encrypted
77
78
  with contextlib.suppress(Exception):
@@ -125,7 +126,7 @@ class PemKeyDetector:
125
126
  line = line.strip(string.whitespace)
126
127
  if line.startswith("//"):
127
128
  # simplify first condition for speed-up of doxygen style processing
128
- if line.startswith("// ") or line.startswith("/// "):
129
+ if line.startswith(("// ", "/// ")):
129
130
  # Assume that the commented line is to be separated from base64 code, it may be a part of PEM, otherwise
130
131
  line = line[3:]
131
132
  if line.startswith("/*"):
credsweeper/utils/util.py CHANGED
@@ -12,13 +12,14 @@ from dataclasses import dataclass
12
12
  from pathlib import Path
13
13
  from typing import Any, Dict, List, Tuple, Optional, Union
14
14
 
15
+ import numpy as np
15
16
  import whatthepatch
16
17
  import yaml
17
18
  from lxml import etree
18
19
  from typing_extensions import TypedDict
19
20
 
20
21
  from credsweeper.common.constants import DiffRowType, AVAILABLE_ENCODINGS, \
21
- DEFAULT_ENCODING, LATIN_1, CHUNK_SIZE, MAX_LINE_LENGTH, CHUNK_STEP_SIZE
22
+ DEFAULT_ENCODING, LATIN_1, CHUNK_SIZE, MAX_LINE_LENGTH, CHUNK_STEP_SIZE, ASCII
22
23
 
23
24
  logger = logging.getLogger(__name__)
24
25
 
@@ -65,21 +66,17 @@ class Util:
65
66
  return result
66
67
 
67
68
  @staticmethod
68
- def get_shannon_entropy(data: str, iterator: str) -> float:
69
+ def get_shannon_entropy(data: Union[str, bytes]) -> float:
69
70
  """Borrowed from http://blog.dkbza.org/2007/05/scanning-data-for-entropy-anomalies.html."""
70
71
  if not data:
71
- return 0
72
-
73
- entropy = 0.
74
- data_len = float(len(data))
75
- for x in iterator:
76
- p_x = data.count(x) / data_len
77
- if p_x > 0:
78
- entropy += -p_x * math.log(p_x, 2)
79
-
72
+ return 0.
73
+ size = len(data)
74
+ _uniq, counts = np.unique(list(data), return_counts=True)
75
+ probabilities = counts / size
76
+ entropy = float(-np.sum(probabilities * np.log2(probabilities)))
80
77
  return entropy
81
78
 
82
- """Precalculated data for speedup"""
79
+ # Precalculated data for speedup
83
80
  MIN_DATA_ENTROPY: Dict[int, float] = {
84
81
  16: 1.66973671780348,
85
82
  20: 2.07723544540831,
@@ -153,41 +150,39 @@ class Util:
153
150
  return entropy < min_entropy
154
151
 
155
152
  @staticmethod
156
- def is_known(data: bytes) -> bool:
157
- """
158
- Returns true if any recognized binary format found
159
- """
160
- if Util.is_zip(data) \
161
- or Util.is_gzip(data) \
162
- or Util.is_tar(data) \
163
- or Util.is_bzip2(data) \
164
- or Util.is_com(data) \
165
- or Util.is_pdf(data) \
166
- or Util.is_elf(data):
167
- return True
153
+ def is_known(data: Union[bytes, bytearray]) -> bool:
154
+ """Returns True if any known binary format is found to prevent extra scan a file without an extension."""
155
+ if isinstance(data, (bytes, bytearray)):
156
+ if 127 <= len(data) and data.startswith(b"\x7f\x45\x4c\x46"):
157
+ # https://en.wikipedia.org/wiki/Executable_and_Linkable_Format
158
+ # minimal ELF is 127 bytes https://github.com/tchajed/minimal-elf
159
+ return True
168
160
  return False
169
161
 
170
162
  @staticmethod
171
- def is_binary(data: bytes) -> bool:
163
+ def is_binary(data: Union[bytes, bytearray]) -> bool:
172
164
  """
173
- Returns True when two zeroes sequence is found which never exists in text format (UTF-8, UTF-16)
174
- UTF-32 is not supported
165
+ Returns True when two zeroes sequence is found in begin of data.
166
+ The sequence never exists in text format (UTF-8, UTF-16). UTF-32 is not supported.
175
167
  """
176
168
  if 0 <= data.find(b"\0\0", 0, MAX_LINE_LENGTH):
177
169
  return True
178
- non_ascii_cnt = 0
179
- for n in range(min([len(data), MAX_LINE_LENGTH])):
180
- i = data[n]
181
- if 0x20 > i and i not in (0x09, 0x0A, 0x0D) or 0x7E < i < 0xA0:
182
- # less than space and not tab, line feed, line end
183
- non_ascii_cnt += 1
170
+ else:
171
+ return False
172
+
173
+ NOT_LATIN1_PRINTABLE_SET = set(range(0, 256)) \
174
+ .difference(set(x for x in string.printable.encode(ASCII))) \
175
+ .difference(set(x for x in range(0xA0, 0x100)))
176
+
177
+ @staticmethod
178
+ def is_latin1(data: Union[bytes, bytearray]) -> bool:
179
+ """Returns True when data looks like LATIN-1 for first MAX_LINE_LENGTH bytes."""
180
+ result = False
184
181
  if data:
182
+ non_latin1_cnt = sum(1 for x in data[:MAX_LINE_LENGTH] if x in Util.NOT_LATIN1_PRINTABLE_SET)
185
183
  # experiment for 255217 binary files shown avg = 0.268264 ± 0.168767, so let choose minimal
186
- chunk_len = float(MAX_LINE_LENGTH if MAX_LINE_LENGTH < len(data) else len(data))
187
- result = 0.1 < non_ascii_cnt / chunk_len
188
- else:
189
- # empty data case
190
- result = False
184
+ chunk_len = min(MAX_LINE_LENGTH, len(data))
185
+ result = 0.1 > non_latin1_cnt / chunk_len
191
186
  return result
192
187
 
193
188
  @staticmethod
@@ -231,10 +226,10 @@ class Util:
231
226
  encodings = AVAILABLE_ENCODINGS
232
227
  for encoding in encodings:
233
228
  try:
234
- if binary_suggest and LATIN_1 == encoding and (Util.is_known(content) or Util.is_binary(content)):
229
+ if binary_suggest and LATIN_1 == encoding and (Util.is_binary(content) or not Util.is_latin1(content)):
235
230
  # LATIN_1 may convert data (bytes in range 0x80:0xFF are transformed)
236
231
  # so skip this encoding when checking binaries
237
- logger.warning("Binary file detected")
232
+ logger.warning("Binary file detected %s", repr(content[:8]))
238
233
  break
239
234
  text = content.decode(encoding, errors="strict")
240
235
  if content != text.encode(encoding, errors="strict"):
@@ -374,7 +369,7 @@ class Util:
374
369
  line = change["line"]
375
370
  if isinstance(line, str):
376
371
  rows_data.extend(Util.preprocess_diff_rows(change.get("new"), change.get("old"), line))
377
- elif isinstance(line, bytes):
372
+ elif isinstance(line, (bytes, bytearray)):
378
373
  logger.warning("The feature is available with the deep scan option")
379
374
  else:
380
375
  logger.error(f"Unknown type of line {type(line)}")
@@ -382,9 +377,9 @@ class Util:
382
377
  return rows_data
383
378
 
384
379
  @staticmethod
385
- def is_zip(data: bytes) -> bool:
380
+ def is_zip(data: Union[bytes, bytearray]) -> bool:
386
381
  """According https://en.wikipedia.org/wiki/List_of_file_signatures"""
387
- if isinstance(data, bytes) and 3 < len(data):
382
+ if isinstance(data, (bytes, bytearray)) and 3 < len(data):
388
383
  # PK
389
384
  if data.startswith(b"PK"):
390
385
  if 0x03 == data[2] and 0x04 == data[3]:
@@ -398,18 +393,18 @@ class Util:
398
393
  return False
399
394
 
400
395
  @staticmethod
401
- def is_com(data: bytes) -> bool:
396
+ def is_com(data: Union[bytes, bytearray]) -> bool:
402
397
  """According https://en.wikipedia.org/wiki/List_of_file_signatures"""
403
- if isinstance(data, bytes) and 8 < len(data):
398
+ if isinstance(data, (bytes, bytearray)) and 8 < len(data):
404
399
  if data.startswith(b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"):
405
400
  # Compound File Binary Format: doc, xls, ppt, msi, msg
406
401
  return True
407
402
  return False
408
403
 
409
404
  @staticmethod
410
- def is_tar(data: bytes) -> bool:
405
+ def is_tar(data: Union[bytes, bytearray]) -> bool:
411
406
  """According https://en.wikipedia.org/wiki/List_of_file_signatures"""
412
- if isinstance(data, bytes) and 512 <= len(data):
407
+ if isinstance(data, (bytes, bytearray)) and 512 <= len(data):
413
408
  if 0x75 == data[257] and 0x73 == data[258] and 0x74 == data[259] \
414
409
  and 0x61 == data[260] and 0x72 == data[261] and (
415
410
  0x00 == data[262] and 0x30 == data[263] and 0x30 == data[264]
@@ -425,9 +420,16 @@ class Util:
425
420
  return False
426
421
 
427
422
  @staticmethod
428
- def is_bzip2(data: bytes) -> bool:
423
+ def is_deb(data: Union[bytes, bytearray]) -> bool:
424
+ """According https://en.wikipedia.org/wiki/Deb_(file_format)"""
425
+ if isinstance(data, (bytes, bytearray)) and 512 <= len(data) and data.startswith(b"!<arch>\n"):
426
+ return True
427
+ return False
428
+
429
+ @staticmethod
430
+ def is_bzip2(data: Union[bytes, bytearray]) -> bool:
429
431
  """According https://en.wikipedia.org/wiki/Bzip2"""
430
- if isinstance(data, bytes) and 10 <= len(data):
432
+ if isinstance(data, (bytes, bytearray)) and 10 <= len(data):
431
433
  if data.startswith(b"\x42\x5A\x68") \
432
434
  and 0x31 <= data[3] <= 0x39 \
433
435
  and 0x31 == data[4] and 0x41 == data[5] and 0x59 == data[6] \
@@ -436,42 +438,49 @@ class Util:
436
438
  return False
437
439
 
438
440
  @staticmethod
439
- def is_gzip(data: bytes) -> bool:
441
+ def is_gzip(data: Union[bytes, bytearray]) -> bool:
440
442
  """According https://www.rfc-editor.org/rfc/rfc1952"""
441
- if isinstance(data, bytes) and 3 <= len(data):
443
+ if isinstance(data, (bytes, bytearray)) and 3 <= len(data):
442
444
  if data.startswith(b"\x1F\x8B\x08"):
443
445
  return True
444
446
  return False
445
447
 
446
448
  @staticmethod
447
- def is_pdf(data: bytes) -> bool:
449
+ def is_pdf(data: Union[bytes, bytearray]) -> bool:
448
450
  """According https://en.wikipedia.org/wiki/List_of_file_signatures - pdf"""
449
- if isinstance(data, bytes) and 5 <= len(data):
451
+ if isinstance(data, (bytes, bytearray)) and 5 <= len(data):
450
452
  if data.startswith(b"\x25\x50\x44\x46\x2D"):
451
453
  return True
452
454
  return False
453
455
 
454
456
  @staticmethod
455
- def is_jks(data: bytes) -> bool:
457
+ def is_jks(data: Union[bytes, bytearray]) -> bool:
456
458
  """According https://en.wikipedia.org/wiki/List_of_file_signatures - jks"""
457
- if isinstance(data, bytes) and 4 <= len(data):
459
+ if isinstance(data, (bytes, bytearray)) and 4 <= len(data):
458
460
  if data.startswith(b"\xFE\xED\xFE\xED"):
459
461
  return True
460
462
  return False
461
463
 
462
464
  @staticmethod
463
- def is_asn1(data: bytes) -> bool:
465
+ def is_lzma(data: Union[bytes, bytearray]) -> bool:
466
+ """According https://en.wikipedia.org/wiki/List_of_file_signatures - lzma also xz"""
467
+ if isinstance(data, (bytes, bytearray)) and 6 <= len(data):
468
+ if data.startswith((b"\xFD\x37\x7A\x58\x5A\x00", b"\x5D\x00\x00")):
469
+ return True
470
+ return False
471
+
472
+ @staticmethod
473
+ def is_asn1(data: Union[bytes, bytearray]) -> bool:
464
474
  """Only sequence type 0x30 and size correctness is checked"""
465
- data_length = len(data)
466
- if isinstance(data, bytes) and 4 <= data_length:
475
+ if isinstance(data, (bytes, bytearray)) and 4 <= len(data):
467
476
  # sequence
468
477
  if 0x30 == data[0]:
469
478
  # https://www.oss.com/asn1/resources/asn1-made-simple/asn1-quick-reference/basic-encoding-rules.html#Lengths
470
479
  length = data[1]
471
- byte_len = (0x7F & length)
480
+ byte_len = 0x7F & length
472
481
  if 0x80 == length and data.endswith(b"\x00\x00"):
473
482
  return True
474
- elif 0x80 < length and 1 < byte_len < data_length: # additional check
483
+ elif 0x80 < length and 1 < byte_len < len(data): # additional check
475
484
  len_bytes = data[2:2 + byte_len]
476
485
  try:
477
486
  long_size = struct.unpack(">h", len_bytes)
@@ -482,26 +491,17 @@ class Util:
482
491
  length = data[2]
483
492
  else:
484
493
  byte_len = 0
485
- return data_length == length + 2 + byte_len
486
- return False
487
-
488
- @staticmethod
489
- def is_elf(data: Union[bytes, bytearray]) -> bool:
490
- """According to https://en.wikipedia.org/wiki/Executable_and_Linkable_Format use only 5 bytes"""
491
- if isinstance(data, (bytes, bytearray)) and 127 <= len(data):
492
- # minimal is 127 bytes https://github.com/tchajed/minimal-elf
493
- if data.startswith(b"\x7f\x45\x4c\x46") and (0x01 == data[5] or 0x02 == data[5]):
494
- return True
494
+ return len(data) == length + 2 + byte_len
495
495
  return False
496
496
 
497
497
  @staticmethod
498
498
  def is_html(data: Union[bytes, bytearray]) -> bool:
499
499
  """Used to detect html format. Suppose, invocation of is_xml() was True before."""
500
500
  if isinstance(data, (bytes, bytearray)):
501
- for opening_tag, closing_tag in [(b"<html>", b"</html>"), (b"<table", b"</table>"), (b"<p>", b"</p>"),
502
- (b"<span>", b"</span>"), (b"<div>", b"</div>"), (b"<li>", b"</li>"),
503
- (b"<ol>", b"</ol>"), (b"<ul>", b"</ul>"), (b"<th>", b"</th>"),
504
- (b"<tr>", b"</tr>"), (b"<td>", b"</td>")]:
501
+ for opening_tag, closing_tag in [(b"<html", b"</html>"), (b"<body", b"</body>"), (b"<table", b"</table>"),
502
+ (b"<p>", b"</p>"), (b"<span>", b"</span>"), (b"<div>", b"</div>"),
503
+ (b"<li>", b"</li>"), (b"<ol>", b"</ol>"), (b"<ul>", b"</ul>"),
504
+ (b"<th>", b"</th>"), (b"<tr>", b"</tr>"), (b"<td>", b"</td>")]:
505
505
  opening_pos = data.find(opening_tag, 0, MAX_LINE_LENGTH)
506
506
  if 0 <= opening_pos < data.find(closing_tag, opening_pos):
507
507
  # opening and closing tags were found - suppose it is an HTML
@@ -658,81 +658,11 @@ class Util:
658
658
  except Exception as exc:
659
659
  logging.error(f"Failed to write: {file_path} {exc}")
660
660
 
661
- @staticmethod
662
- def __extract_value(node: Any, value: Any) -> List[Any]:
663
- result = []
664
- for i in getattr(node, "targets"):
665
- if hasattr(i, "id"):
666
- result.append({getattr(i, "id"): value})
667
- else:
668
- logger.error(f"{str(i)} has no 'id'")
669
- return result
670
-
671
- @staticmethod
672
- def __extract_assign(node: Any) -> List[Any]:
673
- result = []
674
- if hasattr(node, "value") and hasattr(node, "targets"):
675
- value = getattr(node, "value")
676
- if hasattr(value, "value"):
677
- # python 3.8 - 3.10
678
- result.extend(Util.__extract_value(node, getattr(value, "value")))
679
- else:
680
- logger.error(f"value.{value} has no 'value' {dir(value)}")
681
- else:
682
- logger.error(f"{str(node)} has no 'value' {dir(node)}")
683
- return result
684
-
685
- @staticmethod
686
- def ast_to_dict(node: Any) -> List[Any]:
687
- """Recursive parsing AST tree of python source to list with strings"""
688
- result: List[Any] = []
689
- if hasattr(node, "value") and isinstance(node.value, str):
690
- result.append(node.value)
691
-
692
- if isinstance(node, ast.Module) \
693
- or isinstance(node, ast.FunctionDef):
694
- if hasattr(node, "body"):
695
- for i in node.body:
696
- x = Util.ast_to_dict(i)
697
- if x:
698
- result.extend(x)
699
- elif isinstance(node, ast.Import):
700
- logger.debug("Import:%s", str(node))
701
- elif isinstance(node, ast.Assign):
702
- result.extend(Util.__extract_assign(node))
703
- elif isinstance(node, ast.Expr) \
704
- or isinstance(node, ast.AnnAssign) \
705
- or isinstance(node, ast.AugAssign) \
706
- or isinstance(node, ast.Call) \
707
- or isinstance(node, ast.JoinedStr) \
708
- or isinstance(node, ast.Return) \
709
- or isinstance(node, ast.ImportFrom) \
710
- or isinstance(node, ast.Assert) \
711
- or isinstance(node, ast.Pass) \
712
- or isinstance(node, ast.Raise) \
713
- or isinstance(node, ast.Str) \
714
- or isinstance(node, ast.Name) \
715
- or isinstance(node, ast.FormattedValue) \
716
- or isinstance(node, ast.Global):
717
- if hasattr(node, "value"):
718
- result.extend(Util.ast_to_dict(getattr(node, "value")))
719
- if hasattr(node, "args"):
720
- for i in getattr(node, "args"):
721
- result.extend(Util.ast_to_dict(i))
722
- if hasattr(node, "values"):
723
- for i in getattr(node, "values"):
724
- result.extend(Util.ast_to_dict(i))
725
- else:
726
- logger.debug(f"skip:{str(node)}")
727
- else:
728
- logger.debug(f"unknown:{str(node)}")
729
- return result
730
-
731
661
  @staticmethod
732
662
  def parse_python(source: str) -> List[Any]:
733
- """Parse python source to list of strings and assignments"""
663
+ """Parse python source and back to remove strings merge and line wrap"""
734
664
  src = ast.parse(source)
735
- result = Util.ast_to_dict(src)
665
+ result = ast.unparse(src).splitlines()
736
666
  return result
737
667
 
738
668
  @staticmethod
{credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: credsweeper
3
- Version: 1.11.2
3
+ Version: 1.11.4
4
4
  Summary: Credential Sweeper
5
5
  Project-URL: Homepage, https://github.com/Samsung/CredSweeper
6
6
  Project-URL: Bug Tracker, https://github.com/Samsung/CredSweeper/issues