credsweeper 1.11.2__py3-none-any.whl → 1.11.4__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of credsweeper might be problematic. Click here for more details.

Files changed (73) hide show
  1. credsweeper/__init__.py +1 -1
  2. credsweeper/__main__.py +7 -5
  3. credsweeper/app.py +28 -47
  4. credsweeper/common/constants.py +2 -5
  5. credsweeper/common/keyword_pattern.py +15 -9
  6. credsweeper/common/morpheme_checklist.txt +4 -2
  7. credsweeper/credentials/candidate_key.py +1 -1
  8. credsweeper/credentials/credential_manager.py +4 -3
  9. credsweeper/credentials/line_data.py +16 -15
  10. credsweeper/deep_scanner/abstract_scanner.py +10 -1
  11. credsweeper/deep_scanner/deb_scanner.py +48 -0
  12. credsweeper/deep_scanner/deep_scanner.py +65 -43
  13. credsweeper/deep_scanner/docx_scanner.py +1 -1
  14. credsweeper/deep_scanner/encoder_scanner.py +2 -2
  15. credsweeper/deep_scanner/gzip_scanner.py +1 -1
  16. credsweeper/deep_scanner/html_scanner.py +3 -3
  17. credsweeper/deep_scanner/jks_scanner.py +2 -4
  18. credsweeper/deep_scanner/lang_scanner.py +2 -2
  19. credsweeper/deep_scanner/lzma_scanner.py +40 -0
  20. credsweeper/deep_scanner/pkcs12_scanner.py +3 -5
  21. credsweeper/deep_scanner/xml_scanner.py +2 -2
  22. credsweeper/file_handler/byte_content_provider.py +2 -2
  23. credsweeper/file_handler/content_provider.py +1 -1
  24. credsweeper/file_handler/data_content_provider.py +23 -14
  25. credsweeper/file_handler/diff_content_provider.py +2 -2
  26. credsweeper/file_handler/file_path_extractor.py +1 -1
  27. credsweeper/file_handler/files_provider.py +2 -4
  28. credsweeper/file_handler/patches_provider.py +1 -1
  29. credsweeper/file_handler/string_content_provider.py +2 -2
  30. credsweeper/file_handler/struct_content_provider.py +1 -1
  31. credsweeper/file_handler/text_content_provider.py +2 -2
  32. credsweeper/filters/value_array_dictionary_check.py +3 -1
  33. credsweeper/filters/value_azure_token_check.py +1 -2
  34. credsweeper/filters/value_base64_encoded_pem_check.py +1 -1
  35. credsweeper/filters/value_base64_part_check.py +30 -21
  36. credsweeper/filters/value_discord_bot_check.py +1 -2
  37. credsweeper/filters/value_entropy_base32_check.py +11 -31
  38. credsweeper/filters/value_entropy_base36_check.py +11 -34
  39. credsweeper/filters/value_entropy_base64_check.py +15 -48
  40. credsweeper/filters/value_entropy_base_check.py +37 -0
  41. credsweeper/filters/value_file_path_check.py +1 -1
  42. credsweeper/filters/value_hex_number_check.py +3 -3
  43. credsweeper/filters/value_json_web_token_check.py +4 -5
  44. credsweeper/filters/value_pattern_check.py +64 -16
  45. credsweeper/filters/value_string_type_check.py +11 -3
  46. credsweeper/filters/value_token_base32_check.py +0 -4
  47. credsweeper/filters/value_token_base36_check.py +0 -4
  48. credsweeper/filters/value_token_base64_check.py +0 -4
  49. credsweeper/filters/value_token_check.py +1 -1
  50. credsweeper/ml_model/features/file_extension.py +2 -2
  51. credsweeper/ml_model/features/morpheme_dense.py +0 -4
  52. credsweeper/ml_model/features/rule_name.py +1 -1
  53. credsweeper/ml_model/features/word_in_path.py +0 -9
  54. credsweeper/ml_model/features/word_in_postamble.py +0 -11
  55. credsweeper/ml_model/features/word_in_preamble.py +0 -11
  56. credsweeper/ml_model/features/word_in_transition.py +0 -11
  57. credsweeper/ml_model/features/word_in_value.py +0 -11
  58. credsweeper/ml_model/features/word_in_variable.py +0 -11
  59. credsweeper/ml_model/ml_validator.py +45 -22
  60. credsweeper/rules/config.yaml +238 -208
  61. credsweeper/rules/rule.py +3 -3
  62. credsweeper/scanner/scan_type/scan_type.py +2 -3
  63. credsweeper/scanner/scanner.py +7 -1
  64. credsweeper/secret/config.json +16 -5
  65. credsweeper/utils/hop_stat.py +3 -3
  66. credsweeper/utils/pem_key_detector.py +8 -7
  67. credsweeper/utils/util.py +76 -146
  68. {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/METADATA +1 -1
  69. {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/RECORD +72 -70
  70. credsweeper/utils/entropy_validator.py +0 -72
  71. {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/WHEEL +0 -0
  72. {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/entry_points.txt +0 -0
  73. {credsweeper-1.11.2.dist-info → credsweeper-1.11.4.dist-info}/licenses/LICENSE +0 -0
@@ -1,8 +1,9 @@
1
+ import contextlib
1
2
  import datetime
2
3
  import logging
3
4
  from typing import List, Optional, Any, Tuple, Union
4
5
 
5
- from credsweeper.common.constants import RECURSIVE_SCAN_LIMITATION
6
+ from credsweeper.common.constants import RECURSIVE_SCAN_LIMITATION, MIN_DATA_LEN, MIN_VALUE_LENGTH
6
7
  from credsweeper.config import Config
7
8
  from credsweeper.credentials import Candidate
8
9
  from credsweeper.credentials.augment_candidates import augment_candidates
@@ -16,6 +17,7 @@ from credsweeper.scanner import Scanner
16
17
  from credsweeper.utils import Util
17
18
  from .byte_scanner import ByteScanner
18
19
  from .bzip2_scanner import Bzip2Scanner
20
+ from .deb_scanner import DebScanner
19
21
  from .docx_scanner import DocxScanner
20
22
  from .eml_scanner import EmlScanner
21
23
  from .encoder_scanner import EncoderScanner
@@ -23,6 +25,7 @@ from .gzip_scanner import GzipScanner
23
25
  from .html_scanner import HtmlScanner
24
26
  from .jks_scanner import JksScanner
25
27
  from .lang_scanner import LangScanner
28
+ from .lzma_scanner import LzmaScanner
26
29
  from .mxfile_scanner import MxfileScanner
27
30
  from .pdf_scanner import PdfScanner
28
31
  from .pkcs12_scanner import Pkcs12Scanner
@@ -48,10 +51,12 @@ class DeepScanner(
48
51
  HtmlScanner, #
49
52
  JksScanner, #
50
53
  LangScanner, #
54
+ LzmaScanner, #
51
55
  PdfScanner, #
52
56
  Pkcs12Scanner, #
53
57
  PptxScanner, #
54
58
  TarScanner, #
59
+ DebScanner, #
55
60
  XmlScanner, #
56
61
  XlsxScanner, #
57
62
  ZipScanner
@@ -106,9 +111,15 @@ class DeepScanner(
106
111
  elif Util.is_bzip2(data):
107
112
  if 0 < depth:
108
113
  deep_scanners.append(Bzip2Scanner)
114
+ elif Util.is_lzma(data):
115
+ if 0 < depth:
116
+ deep_scanners.append(LzmaScanner)
109
117
  elif Util.is_tar(data):
110
118
  if 0 < depth:
111
119
  deep_scanners.append(TarScanner)
120
+ elif Util.is_deb(data):
121
+ if 0 < depth:
122
+ deep_scanners.append(DebScanner)
112
123
  elif Util.is_gzip(data):
113
124
  if 0 < depth:
114
125
  deep_scanners.append(GzipScanner)
@@ -140,13 +151,16 @@ class DeepScanner(
140
151
  else:
141
152
  fallback_scanners.append(EmlScanner)
142
153
  fallback_scanners.append(ByteScanner)
154
+ elif Util.is_known(data):
155
+ # the format is known but cannot be scanned
156
+ pass
143
157
  elif not Util.is_binary(data):
144
158
  if 0 < depth:
145
159
  deep_scanners.append(EncoderScanner)
146
160
  deep_scanners.append(LangScanner)
147
161
  deep_scanners.append(ByteScanner)
148
162
  else:
149
- logger.warning("Cannot apply a deep scanner for type %s", file_type)
163
+ logger.warning("Cannot apply a deep scanner for type %s prefix %s", file_type, str(data[:MIN_DATA_LEN]))
150
164
  return deep_scanners, fallback_scanners
151
165
 
152
166
  # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
@@ -175,7 +189,7 @@ class DeepScanner(
175
189
  # this scan is successful, so fallback is not necessary
176
190
  fallback = False
177
191
  if fallback:
178
- for scan_class in deep_scanners:
192
+ for scan_class in fallback_scanners:
179
193
  fallback_candidates = scan_class.data_scan(self, data_provider, depth, recursive_limit_size)
180
194
  if fallback_candidates is None:
181
195
  continue
@@ -201,10 +215,10 @@ class DeepScanner(
201
215
  int) else RECURSIVE_SCAN_LIMITATION
202
216
  candidates: List[Candidate] = []
203
217
  data: Optional[bytes] = None
204
- if isinstance(content_provider, TextContentProvider) or isinstance(content_provider, ByteContentProvider):
218
+ if isinstance(content_provider, (TextContentProvider, ByteContentProvider)):
205
219
  # Feature to scan files which might be containers
206
220
  data = content_provider.data
207
- info = "FILE"
221
+ info = f"FILE:{content_provider.file_path}"
208
222
  elif isinstance(content_provider, DiffContentProvider) and content_provider.diff:
209
223
  candidates = self.scanner.scan(content_provider)
210
224
  # Feature to scan binary diffs
@@ -212,7 +226,7 @@ class DeepScanner(
212
226
  # the check for legal fix mypy issue
213
227
  if isinstance(diff, bytes):
214
228
  data = diff
215
- info = "DIFF"
229
+ info = f"DIFF:{content_provider.file_path}"
216
230
  else:
217
231
  logger.warning(f"Content provider {type(content_provider)} does not support deep scan")
218
232
  info = "NA"
@@ -239,15 +253,18 @@ class DeepScanner(
239
253
  recursive_limit_size: maximal bytes of opened files to prevent recursive zip-bomb attack
240
254
  """
241
255
  candidates: List[Candidate] = []
242
- logger.debug("Start data_scan: size=%d, depth=%d, limit=%d, path=%s, info=%s", len(data_provider.data), depth,
243
- recursive_limit_size, data_provider.file_path, data_provider.info)
244
-
245
256
  if 0 > depth:
246
257
  # break recursion if maximal depth is reached
247
- logger.debug("bottom reached %s recursive_limit_size:%d", data_provider.file_path, recursive_limit_size)
258
+ logger.debug("Bottom reached %s recursive_limit_size:%d", data_provider.file_path, recursive_limit_size)
248
259
  return candidates
249
-
250
260
  depth -= 1
261
+ if MIN_DATA_LEN > len(data_provider.data):
262
+ # break recursion for minimal data size
263
+ logger.debug("Too small data: size=%d, depth=%d, limit=%d, path=%s, info=%s", len(data_provider.data),
264
+ depth, recursive_limit_size, data_provider.file_path, data_provider.info)
265
+ return candidates
266
+ logger.debug("Start data_scan: size=%d, depth=%d, limit=%d, path=%s, info=%s", len(data_provider.data), depth,
267
+ recursive_limit_size, data_provider.file_path, data_provider.info)
251
268
 
252
269
  if FilePathExtractor.is_find_by_ext_file(self.config, data_provider.file_type):
253
270
  # Skip scanning file and makes fake candidate due the extension is suspicious
@@ -287,7 +304,7 @@ class DeepScanner(
287
304
  items: List[Tuple[Union[int, str], Any]] = []
288
305
  struct_key: Optional[str] = None
289
306
  struct_value: Optional[str] = None
290
- line_for_keyword_rules = ""
307
+ lines_for_keyword_rules = []
291
308
  if isinstance(struct_provider.struct, dict):
292
309
  for key, value in struct_provider.struct.items():
293
310
  if isinstance(value, (list, tuple)) and 1 == len(value):
@@ -298,13 +315,13 @@ class DeepScanner(
298
315
  # for transformation {"key": "api_key", "value": "XXXXXXX"} -> {"api_key": "XXXXXXX"}
299
316
  struct_key = struct_provider.struct.get("key")
300
317
  struct_value = struct_provider.struct.get("value")
301
- elif isinstance(struct_provider.struct, list) or isinstance(struct_provider.struct, tuple):
318
+ elif isinstance(struct_provider.struct, (list, tuple)):
302
319
  items = list(enumerate(struct_provider.struct))
303
320
  else:
304
321
  logger.error("Not supported type:%s val:%s", str(type(struct_provider.struct)), str(struct_provider.struct))
305
322
 
306
323
  for key, value in items:
307
- if isinstance(value, dict) or isinstance(value, (list, tuple)) and 1 < len(value):
324
+ if isinstance(value, dict) or isinstance(value, (list, tuple)) and 1 <= len(value):
308
325
  val_struct_provider = StructContentProvider(struct=value,
309
326
  file_path=struct_provider.file_path,
310
327
  file_type=struct_provider.file_type,
@@ -313,52 +330,57 @@ class DeepScanner(
313
330
  candidates.extend(new_candidates)
314
331
 
315
332
  elif isinstance(value, bytes):
316
- bytes_struct_provider = DataContentProvider(data=value,
317
- file_path=struct_provider.file_path,
318
- file_type=struct_provider.file_type,
319
- info=f"{struct_provider.info}|BYTES:{key}")
320
- new_limit = recursive_limit_size - len(value)
321
- new_candidates = self.recursive_scan(bytes_struct_provider, depth, new_limit)
322
- candidates.extend(new_candidates)
333
+ if MIN_DATA_LEN <= len(value):
334
+ bytes_struct_provider = DataContentProvider(data=value,
335
+ file_path=struct_provider.file_path,
336
+ file_type=struct_provider.file_type,
337
+ info=f"{struct_provider.info}|BYTES:{key}")
338
+ new_limit = recursive_limit_size - len(value)
339
+ new_candidates = self.recursive_scan(bytes_struct_provider, depth, new_limit)
340
+ candidates.extend(new_candidates)
341
+ if MIN_VALUE_LENGTH <= len(value) and isinstance(key, str) \
342
+ and self.scanner.keywords_required_substrings_check(key.lower()):
343
+ str_val = str(value)
344
+ lines_for_keyword_rules.append(f"{key} = '{str_val}'" if '"' in str_val else f'{key} = "{str_val}"')
323
345
 
324
346
  elif isinstance(value, str):
325
- data = value.encode(encoding=DEFAULT_ENCODING, errors='replace')
326
- str_struct_provider = DataContentProvider(data=data,
327
- file_path=struct_provider.file_path,
328
- file_type=struct_provider.file_type,
329
- info=f"{struct_provider.info}|STRING:{key}")
330
- new_limit = recursive_limit_size - len(str_struct_provider.data)
331
- new_candidates = self.recursive_scan(str_struct_provider, depth, new_limit)
332
- candidates.extend(new_candidates)
333
-
347
+ if MIN_DATA_LEN <= len(value):
348
+ # recursive scan only for data which may be decoded at least
349
+ with contextlib.suppress(UnicodeError):
350
+ data = value.encode(encoding=DEFAULT_ENCODING, errors='strict')
351
+ str_struct_provider = DataContentProvider(data=data,
352
+ file_path=struct_provider.file_path,
353
+ file_type=struct_provider.file_type,
354
+ info=f"{struct_provider.info}|STRING:{key}")
355
+ new_limit = recursive_limit_size - len(str_struct_provider.data)
356
+ new_candidates = self.recursive_scan(str_struct_provider, depth, new_limit)
357
+ candidates.extend(new_candidates)
334
358
  # use key = "value" scan for common cases like in TOML
335
- if isinstance(key, str) and self.scanner.keywords_required_substrings_check(key):
336
- line_for_keyword_rules += f"{key} = \"{value}\"; "
359
+ if MIN_VALUE_LENGTH <= len(value) and isinstance(key, str) \
360
+ and self.scanner.keywords_required_substrings_check(key.lower()):
361
+ lines_for_keyword_rules.append(f"{key} = '{value}'" if '"' in value else f'{key} = "{value}"')
337
362
 
338
363
  elif isinstance(value, (int, float, datetime.date, datetime.datetime)):
339
- # use the fields only in case of matched keywords
340
- if isinstance(key, str) and self.scanner.keywords_required_substrings_check(key):
341
- line_for_keyword_rules += f"{key} = \"{value}\"; "
342
-
364
+ # skip useless types
365
+ pass
343
366
  else:
344
367
  logger.warning("Not supported type:%s value(%s)", str(type(value)), str(value))
345
368
 
346
- if line_for_keyword_rules:
347
- str_provider = StringContentProvider([line_for_keyword_rules],
369
+ if lines_for_keyword_rules:
370
+ str_provider = StringContentProvider(lines_for_keyword_rules,
348
371
  file_path=struct_provider.file_path,
349
- file_type=".toml",
350
- info=f"{struct_provider.info}|KEYWORD:`{line_for_keyword_rules}`")
372
+ file_type=".py",
373
+ info=f"{struct_provider.info}|KEYWORD:`{lines_for_keyword_rules}`")
351
374
  new_candidates = self.scanner.scan(str_provider)
352
375
  augment_candidates(candidates, new_candidates)
353
376
 
354
377
  # last check when dictionary is {"key": "api_key", "value": "XXXXXXX"} -> {"api_key": "XXXXXXX"}
355
378
  if isinstance(struct_key, str) and isinstance(struct_value, str):
356
- line_for_keyword_rules = f"{struct_key} = \"{struct_value}\""
357
379
  key_value_provider = StringContentProvider(
358
- [line_for_keyword_rules],
380
+ [f"{struct_key} = '{struct_value}'" if '"' in struct_value else f'{struct_key} = "{struct_value}"'],
359
381
  file_path=struct_provider.file_path,
360
382
  file_type=".toml",
361
- info=f"{struct_provider.info}|KEY_VALUE:`{line_for_keyword_rules}`")
383
+ info=f"{struct_provider.info}|KEY_VALUE:`{lines_for_keyword_rules}`")
362
384
  new_candidates = self.scanner.scan(key_value_provider)
363
385
  augment_candidates(candidates, new_candidates)
364
386
  return candidates
@@ -42,7 +42,7 @@ class DocxScanner(AbstractScanner, ABC):
42
42
  yield from DocxScanner._iter_block_items(block.footer)
43
43
  return
44
44
  elif isinstance(block, _Cell):
45
- parent_elm = block._tc
45
+ parent_elm = block._tc # pylint: disable=W0212
46
46
  else:
47
47
  raise ValueError(f"unrecognised:{type(block)}")
48
48
 
@@ -18,11 +18,11 @@ class EncoderScanner(AbstractScanner, ABC):
18
18
  depth: int, #
19
19
  recursive_limit_size: int) -> Optional[List[Candidate]]:
20
20
  """Tries to decode data from base64 encode to bytes and scan as bytes again"""
21
- if data_provider.represent_as_encoded():
21
+ if result := data_provider.represent_as_encoded():
22
22
  decoded_data_provider = DataContentProvider(data=data_provider.decoded,
23
23
  file_path=data_provider.file_path,
24
24
  file_type=data_provider.file_type,
25
25
  info=f"{data_provider.info}|BASE64")
26
26
  new_limit = recursive_limit_size - len(decoded_data_provider.data)
27
27
  return self.recursive_scan(decoded_data_provider, depth, new_limit)
28
- return None
28
+ return None if result is None else []
@@ -31,7 +31,7 @@ class GzipScanner(AbstractScanner, ABC):
31
31
  gzip_content_provider = DataContentProvider(data=f.read(),
32
32
  file_path=new_path,
33
33
  file_type=Util.get_extension(new_path),
34
- info=f"{data_provider.info}|GZIP:{file_path}")
34
+ info=f"{data_provider.info}|GZIP:{new_path}")
35
35
  new_limit = recursive_limit_size - len(gzip_content_provider.data)
36
36
  gzip_candidates = self.recursive_scan(gzip_content_provider, depth, new_limit)
37
37
  return gzip_candidates
@@ -19,12 +19,12 @@ class HtmlScanner(AbstractScanner, ABC):
19
19
  depth: int, #
20
20
  recursive_limit_size: int) -> Optional[List[Candidate]]:
21
21
  """Tries to represent data as html text and scan as text lines"""
22
- if data_provider.represent_as_html(depth, recursive_limit_size,
23
- self.scanner.keywords_required_substrings_check):
22
+ if result := data_provider.represent_as_html(depth, recursive_limit_size,
23
+ self.scanner.keywords_required_substrings_check):
24
24
  string_data_provider = StringContentProvider(lines=data_provider.lines,
25
25
  line_numbers=data_provider.line_numbers,
26
26
  file_path=data_provider.file_path,
27
27
  file_type=data_provider.file_type,
28
28
  info=f"{data_provider.info}|HTML")
29
29
  return self.scanner.scan(string_data_provider)
30
- return None
30
+ return None if result is None else []
@@ -20,7 +20,6 @@ class JksScanner(AbstractScanner, ABC):
20
20
  depth: int, #
21
21
  recursive_limit_size: int) -> Optional[List[Candidate]]:
22
22
  """Tries to scan JKS to open with standard password"""
23
- candidates = []
24
23
  for pw_probe in self.config.bruteforce_list:
25
24
  try:
26
25
  keystore = jks.KeyStore.loads(data_provider.data, pw_probe, try_decrypt_keys=True)
@@ -38,8 +37,7 @@ class JksScanner(AbstractScanner, ABC):
38
37
  candidate.line_data_list[0].value = pw_probe or "<EMPTY PASSWORD>"
39
38
  candidate.line_data_list[0].value_start = 1
40
39
  candidate.line_data_list[0].value_end = 1 + len(candidate.line_data_list[0].value)
41
- candidates.append(candidate)
42
- break
40
+ return [candidate]
43
41
  except Exception as jks_exc:
44
42
  logger.debug(f"{data_provider.file_path}:{pw_probe}:{jks_exc}")
45
- return candidates
43
+ return None
@@ -19,10 +19,10 @@ class LangScanner(AbstractScanner, ABC):
19
19
  depth: int, #
20
20
  recursive_limit_size: int) -> Optional[List[Candidate]]:
21
21
  """Tries to represent data as markup language and scan as structure"""
22
- if data_provider.represent_as_structure():
22
+ if result := data_provider.represent_as_structure():
23
23
  struct_data_provider = StructContentProvider(struct=data_provider.structure,
24
24
  file_path=data_provider.file_path,
25
25
  file_type=data_provider.file_type,
26
26
  info=f"{data_provider.info}|STRUCT")
27
27
  return self.structure_scan(struct_data_provider, depth, recursive_limit_size)
28
- return None
28
+ return None if result is None else []
@@ -0,0 +1,40 @@
1
+ import logging
2
+ import lzma
3
+ from abc import ABC
4
+ from pathlib import Path
5
+ from typing import List, Optional
6
+
7
+ from credsweeper.credentials import Candidate
8
+ from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
9
+ from credsweeper.file_handler.data_content_provider import DataContentProvider
10
+ from credsweeper.utils import Util
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class LzmaScanner(AbstractScanner, ABC):
16
+ """Implements lzma scanning"""
17
+
18
+ def data_scan(
19
+ self, #
20
+ data_provider: DataContentProvider, #
21
+ depth: int, #
22
+ recursive_limit_size: int) -> Optional[List[Candidate]]:
23
+ """Extracts data from lzma archive and launches data_scan"""
24
+ try:
25
+ file_path = Path(data_provider.file_path)
26
+ new_path = file_path.as_posix()
27
+ if ".xz" == file_path.suffix:
28
+ new_path = new_path[:-3]
29
+ elif ".lzma" == file_path.suffix:
30
+ new_path = new_path[:-5]
31
+ lzma_content_provider = DataContentProvider(data=lzma.decompress(data_provider.data),
32
+ file_path=new_path,
33
+ file_type=Util.get_extension(new_path),
34
+ info=f"{data_provider.info}|LZMA:{file_path}")
35
+ new_limit = recursive_limit_size - len(lzma_content_provider.data)
36
+ lzma_candidates = self.recursive_scan(lzma_content_provider, depth, new_limit)
37
+ return lzma_candidates
38
+ except Exception as lzma_exc:
39
+ logger.error(f"{data_provider.file_path}:{lzma_exc}")
40
+ return None
@@ -20,10 +20,9 @@ class Pkcs12Scanner(AbstractScanner, ABC):
20
20
  depth: int, #
21
21
  recursive_limit_size: int) -> Optional[List[Candidate]]:
22
22
  """Tries to scan PKCS12 to open with standard password"""
23
- candidates = []
24
23
  for pw_probe in self.config.bruteforce_list:
25
24
  try:
26
- (private_key, certificate, additional_certificates) \
25
+ (private_key, _certificate, _additional_certificates) \
27
26
  = cryptography.hazmat.primitives.serialization.pkcs12.load_key_and_certificates(data_provider.data,
28
27
  pw_probe.encode())
29
28
  # the password probe has passed, it will be the value
@@ -40,8 +39,7 @@ class Pkcs12Scanner(AbstractScanner, ABC):
40
39
  candidate.line_data_list[0].value = value
41
40
  candidate.line_data_list[0].value_start = 1
42
41
  candidate.line_data_list[0].value_end = 1 + len(candidate.line_data_list[0].value)
43
- candidates.append(candidate)
44
- break
42
+ return [candidate]
45
43
  except Exception as pkcs_exc:
46
44
  logger.debug(f"{data_provider.file_path}:{pw_probe}:{pkcs_exc}")
47
- return candidates
45
+ return None
@@ -19,11 +19,11 @@ class XmlScanner(AbstractScanner, ABC):
19
19
  depth: int, #
20
20
  recursive_limit_size: int) -> Optional[List[Candidate]]:
21
21
  """Tries to represent data as xml text and scan as text lines"""
22
- if data_provider.represent_as_xml():
22
+ if result := data_provider.represent_as_xml():
23
23
  string_data_provider = StringContentProvider(lines=data_provider.lines,
24
24
  line_numbers=data_provider.line_numbers,
25
25
  file_path=data_provider.file_path,
26
26
  file_type=data_provider.file_type,
27
27
  info=f"{data_provider.info}|XML")
28
28
  return self.scanner.scan(string_data_provider)
29
- return None
29
+ return None if result is None else []
@@ -32,10 +32,10 @@ class ByteContentProvider(ContentProvider):
32
32
  def free(self) -> None:
33
33
  """free data after scan to reduce memory usage"""
34
34
  self.__data = None
35
- if hasattr(self, "data"):
35
+ if "data" in self.__dict__:
36
36
  delattr(self, "data")
37
37
  self.__lines = None
38
- if hasattr(self, "lines"):
38
+ if "lines" in self.__dict__:
39
39
  delattr(self, "lines")
40
40
 
41
41
  @cached_property
@@ -93,7 +93,7 @@ class ContentProvider(ABC):
93
93
  if min_len > len(line.strip()):
94
94
  # Ignore target if stripped part is too short for all types
95
95
  continue
96
- elif MAX_LINE_LENGTH < len(line):
96
+ if MAX_LINE_LENGTH < len(line):
97
97
  for chunk_start, chunk_end in Util.get_chunks(len(line)):
98
98
  target = AnalysisTarget(
99
99
  line_pos=line_pos, #
@@ -54,10 +54,10 @@ class DataContentProvider(ContentProvider):
54
54
  def free(self) -> None:
55
55
  """free data after scan to reduce memory usage"""
56
56
  self.__data = None
57
- if hasattr(self, "data"):
57
+ if "data" in self.__dict__:
58
58
  delattr(self, "data")
59
59
  self.__text = None
60
- if hasattr(self, "text"):
60
+ if "text" in self.__dict__:
61
61
  delattr(self, "text")
62
62
  self.structure = None
63
63
  self.decoded = None
@@ -76,9 +76,14 @@ class DataContentProvider(ContentProvider):
76
76
  return self.structure is not None and (isinstance(self.structure, dict) and 0 < len(self.structure.keys())
77
77
  or isinstance(self.structure, list) and 0 < len(self.structure))
78
78
 
79
- def represent_as_structure(self) -> bool:
79
+ def represent_as_structure(self) -> Optional[bool]:
80
80
  """Tries to convert data with many parsers. Stores result to internal structure
81
- Return True if some structure found
81
+
82
+ Return:
83
+ True if some structure found
84
+ False if no data found
85
+ None if the format is not acceptable
86
+
82
87
  """
83
88
  if MIN_DATA_LEN > len(self.text):
84
89
  return False
@@ -134,13 +139,15 @@ class DataContentProvider(ContentProvider):
134
139
  if self.__is_structure():
135
140
  return True
136
141
  # # # None of above
137
- return False
142
+ return None
138
143
 
139
- def represent_as_xml(self) -> bool:
144
+ def represent_as_xml(self) -> Optional[bool]:
140
145
  """Tries to read data as xml
141
146
 
142
147
  Return:
143
148
  True if reading was successful
149
+ False if no data found
150
+ None if the format is not acceptable
144
151
 
145
152
  """
146
153
  if MIN_XML_LEN > len(self.text):
@@ -150,14 +157,12 @@ class DataContentProvider(ContentProvider):
150
157
  xml_text = self.text.splitlines()
151
158
  self.lines, self.line_numbers = Util.get_xml_from_lines(xml_text)
152
159
  logger.debug("CONVERTED from xml")
160
+ return bool(self.lines and self.line_numbers)
153
161
  else:
154
162
  logger.debug("Weak data to parse as XML")
155
- return False
156
163
  except Exception as exc:
157
164
  logger.debug("Cannot parse as XML:%s %s", exc, self.data)
158
- else:
159
- return bool(self.lines and self.line_numbers)
160
- return False
165
+ return None
161
166
 
162
167
  def _check_multiline_cell(self, cell: Tag) -> Optional[Tuple[int, str]]:
163
168
  """multiline cell will be analysed as text or return single line from cell
@@ -336,11 +341,13 @@ class DataContentProvider(ContentProvider):
336
341
  self, #
337
342
  depth: int, #
338
343
  recursive_limit_size: int, #
339
- keywords_required_substrings_check: Callable[[str], bool]) -> bool:
344
+ keywords_required_substrings_check: Callable[[str], bool]) -> Optional[bool]:
340
345
  """Tries to read data as html
341
346
 
342
347
  Return:
343
348
  True if reading was successful
349
+ False if no data found
350
+ None if the format is not acceptable
344
351
 
345
352
  """
346
353
  try:
@@ -361,13 +368,15 @@ class DataContentProvider(ContentProvider):
361
368
  logger.debug("Cannot parse as HTML:%s %s", exc, self.data)
362
369
  else:
363
370
  return bool(self.lines and self.line_numbers)
364
- return False
371
+ return None
365
372
 
366
- def represent_as_encoded(self) -> bool:
373
+ def represent_as_encoded(self) -> Optional[bool]:
367
374
  """Decodes data from base64. Stores result in decoded
368
375
 
369
376
  Return:
370
377
  True if the data correctly parsed and verified
378
+ False if no data found
379
+ None if the format is not acceptable
371
380
 
372
381
  """
373
382
  if len(self.data) < MIN_ENCODED_DATA_LEN \
@@ -383,7 +392,7 @@ class DataContentProvider(ContentProvider):
383
392
  logger.debug("Cannot decoded as base64:%s %s", exc, self.data)
384
393
  else:
385
394
  return self.decoded is not None and 0 < len(self.decoded)
386
- return False
395
+ return None
387
396
 
388
397
  def yield_analysis_target(self, min_len: int) -> Generator[AnalysisTarget, None, None]:
389
398
  """Return nothing. The class provides only data storage.
@@ -48,8 +48,8 @@ class DiffContentProvider(ContentProvider):
48
48
 
49
49
  def free(self) -> None:
50
50
  """free data after scan to reduce memory usage"""
51
- self.__diff = None
52
- if hasattr(self, "diff"):
51
+ self.__diff = []
52
+ if "diff" in self.__dict__:
53
53
  delattr(self, "diff")
54
54
 
55
55
  @staticmethod
@@ -162,7 +162,7 @@ class FilePathExtractor:
162
162
  True when the file is oversize or less than MIN_DATA_LEN, or unsupported
163
163
  """
164
164
  path = reference[1] if isinstance(reference, tuple) else reference
165
- if isinstance(path, str) or isinstance(path, Path):
165
+ if isinstance(path, (str, Path)):
166
166
  file_size = os.path.getsize(path)
167
167
  elif isinstance(path, io.BytesIO):
168
168
  current_pos = path.tell()
@@ -42,7 +42,7 @@ class FilesProvider(AbstractProvider):
42
42
  """
43
43
  text_content_provider_list: List[Union[DiffContentProvider, TextContentProvider]] = []
44
44
  for path in self.paths:
45
- if isinstance(path, str) or isinstance(path, Path):
45
+ if isinstance(path, (str, Path)):
46
46
  new_files = FilePathExtractor.get_file_paths(config, path)
47
47
  if self.skip_ignored:
48
48
  new_files = FilePathExtractor.apply_gitignore(new_files)
@@ -50,9 +50,7 @@ class FilesProvider(AbstractProvider):
50
50
  text_content_provider_list.append(TextContentProvider(_file))
51
51
  elif isinstance(path, io.BytesIO):
52
52
  text_content_provider_list.append(TextContentProvider((":memory:", path)))
53
- elif isinstance(path, tuple) \
54
- and (isinstance(path[0], str) or isinstance(path[0], Path)) \
55
- and isinstance(path[1], io.BytesIO):
53
+ elif isinstance(path, tuple) and (isinstance(path[0], (str, Path))) and isinstance(path[1], io.BytesIO):
56
54
  # suppose, all the files must be scanned
57
55
  text_content_provider_list.append(TextContentProvider(path))
58
56
  else:
@@ -37,7 +37,7 @@ class PatchesProvider(AbstractProvider):
37
37
  for file_path in self.paths:
38
38
  if FilePathExtractor.check_file_size(config, file_path):
39
39
  continue
40
- if isinstance(file_path, str) or isinstance(file_path, Path):
40
+ if isinstance(file_path, (str, Path)):
41
41
  raw_patches.append(Util.read_file(file_path))
42
42
  elif isinstance(file_path, io.BytesIO):
43
43
  the_patch = Util.decode_bytes(file_path.read())
@@ -38,10 +38,10 @@ class StringContentProvider(ContentProvider):
38
38
  def free(self) -> None:
39
39
  """free data after scan to reduce memory usage"""
40
40
  self.__lines = []
41
- if hasattr(self, "lines"):
41
+ if "lines" in self.__dict__:
42
42
  delattr(self, "lines")
43
43
  self.__line_numbers = []
44
- if hasattr(self, "line_numbers"):
44
+ if "line_numbers" in self.__dict__:
45
45
  delattr(self, "line_numbers")
46
46
 
47
47
  @cached_property
@@ -38,7 +38,7 @@ class StructContentProvider(ContentProvider):
38
38
  def free(self) -> None:
39
39
  """free data after scan to reduce memory usage"""
40
40
  self.__struct = None
41
- if hasattr(self, "struct"):
41
+ if "struct" in self.__dict__:
42
42
  delattr(self, "struct")
43
43
 
44
44
  def yield_analysis_target(self, min_len: int) -> Generator[AnalysisTarget, None, None]:
@@ -42,10 +42,10 @@ class TextContentProvider(ContentProvider):
42
42
  def free(self) -> None:
43
43
  """free data after scan to reduce memory usage"""
44
44
  self.__data = None
45
- if hasattr(self, "data"):
45
+ if "data" in self.__dict__:
46
46
  delattr(self, "data")
47
47
  self.__lines = None
48
- if hasattr(self, "lines"):
48
+ if "lines" in self.__dict__:
49
49
  delattr(self, "lines")
50
50
  if isinstance(self.__io, io.BytesIO) and self.__io and not self.__io.closed:
51
51
  self.__io.close()