credsweeper 1.11.5__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of credsweeper might be problematic. Click here for more details.

Files changed (145)
  1. credsweeper/__init__.py +21 -15
  2. credsweeper/__main__.py +158 -42
  3. credsweeper/app.py +18 -13
  4. credsweeper/common/keyword_pattern.py +19 -18
  5. credsweeper/common/morpheme_checklist.txt +28 -6
  6. credsweeper/config/__init__.py +0 -1
  7. credsweeper/config/config.py +4 -3
  8. credsweeper/credentials/__init__.py +0 -5
  9. credsweeper/credentials/augment_candidates.py +1 -1
  10. credsweeper/credentials/candidate.py +1 -1
  11. credsweeper/credentials/credential_manager.py +1 -1
  12. credsweeper/credentials/line_data.py +43 -8
  13. credsweeper/deep_scanner/__init__.py +0 -1
  14. credsweeper/deep_scanner/abstract_scanner.py +4 -3
  15. credsweeper/deep_scanner/byte_scanner.py +1 -1
  16. credsweeper/deep_scanner/bzip2_scanner.py +2 -2
  17. credsweeper/deep_scanner/csv_scanner.py +71 -0
  18. credsweeper/deep_scanner/deb_scanner.py +1 -1
  19. credsweeper/deep_scanner/deep_scanner.py +22 -12
  20. credsweeper/deep_scanner/docx_scanner.py +1 -1
  21. credsweeper/deep_scanner/eml_scanner.py +1 -1
  22. credsweeper/deep_scanner/encoder_scanner.py +1 -1
  23. credsweeper/deep_scanner/gzip_scanner.py +2 -2
  24. credsweeper/deep_scanner/html_scanner.py +1 -1
  25. credsweeper/deep_scanner/jclass_scanner.py +1 -1
  26. credsweeper/deep_scanner/jks_scanner.py +12 -3
  27. credsweeper/deep_scanner/lang_scanner.py +1 -1
  28. credsweeper/deep_scanner/lzma_scanner.py +2 -2
  29. credsweeper/deep_scanner/mxfile_scanner.py +1 -1
  30. credsweeper/deep_scanner/pdf_scanner.py +1 -1
  31. credsweeper/deep_scanner/pkcs_scanner.py +6 -2
  32. credsweeper/deep_scanner/pptx_scanner.py +1 -1
  33. credsweeper/deep_scanner/rpm_scanner.py +1 -1
  34. credsweeper/deep_scanner/rtf_scanner.py +41 -0
  35. credsweeper/deep_scanner/strings_scanner.py +52 -0
  36. credsweeper/deep_scanner/tar_scanner.py +2 -2
  37. credsweeper/deep_scanner/tmx_scanner.py +2 -2
  38. credsweeper/deep_scanner/xlsx_scanner.py +2 -2
  39. credsweeper/deep_scanner/xml_scanner.py +1 -1
  40. credsweeper/deep_scanner/zip_scanner.py +2 -2
  41. credsweeper/file_handler/__init__.py +0 -15
  42. credsweeper/file_handler/abstract_provider.py +3 -4
  43. credsweeper/file_handler/byte_content_provider.py +11 -2
  44. credsweeper/file_handler/content_provider.py +1 -1
  45. credsweeper/file_handler/data_content_provider.py +1 -1
  46. credsweeper/file_handler/diff_content_provider.py +133 -3
  47. credsweeper/file_handler/file_path_extractor.py +4 -2
  48. credsweeper/file_handler/files_provider.py +4 -4
  49. credsweeper/file_handler/patches_provider.py +7 -8
  50. credsweeper/file_handler/text_content_provider.py +8 -2
  51. credsweeper/filters/__init__.py +3 -4
  52. credsweeper/filters/filter.py +5 -3
  53. credsweeper/filters/group/__init__.py +0 -2
  54. credsweeper/filters/group/general_keyword.py +2 -2
  55. credsweeper/filters/group/general_pattern.py +2 -2
  56. credsweeper/filters/group/group.py +38 -36
  57. credsweeper/filters/group/password_keyword.py +9 -8
  58. credsweeper/filters/group/token_pattern.py +5 -5
  59. credsweeper/filters/group/url_credentials_group.py +8 -8
  60. credsweeper/filters/group/weird_base36_token.py +6 -6
  61. credsweeper/filters/group/weird_base64_token.py +5 -5
  62. credsweeper/filters/line_git_binary_check.py +5 -4
  63. credsweeper/filters/line_specific_key_check.py +6 -5
  64. credsweeper/filters/line_uue_part_check.py +5 -4
  65. credsweeper/filters/value_allowlist_check.py +6 -5
  66. credsweeper/filters/value_array_dictionary_check.py +8 -6
  67. credsweeper/filters/value_atlassian_token_check.py +6 -5
  68. credsweeper/filters/value_azure_token_check.py +6 -5
  69. credsweeper/filters/value_base32_data_check.py +8 -5
  70. credsweeper/filters/value_base64_data_check.py +6 -5
  71. credsweeper/filters/value_base64_encoded_pem_check.py +6 -5
  72. credsweeper/filters/value_base64_key_check.py +6 -5
  73. credsweeper/filters/value_base64_part_check.py +6 -5
  74. credsweeper/filters/value_basic_auth_check.py +37 -0
  75. credsweeper/filters/value_blocklist_check.py +6 -4
  76. credsweeper/filters/value_camel_case_check.py +8 -7
  77. credsweeper/filters/value_dictionary_keyword_check.py +6 -4
  78. credsweeper/filters/value_discord_bot_check.py +6 -5
  79. credsweeper/filters/value_entropy_base_check.py +6 -5
  80. credsweeper/filters/value_file_path_check.py +13 -8
  81. credsweeper/filters/value_github_check.py +8 -6
  82. credsweeper/filters/value_grafana_check.py +6 -5
  83. credsweeper/filters/value_grafana_service_check.py +5 -4
  84. credsweeper/filters/value_hex_number_check.py +5 -4
  85. credsweeper/filters/value_jfrog_token_check.py +6 -5
  86. credsweeper/filters/value_json_web_key_check.py +6 -5
  87. credsweeper/filters/value_json_web_token_check.py +6 -5
  88. credsweeper/filters/value_last_word_check.py +6 -4
  89. credsweeper/filters/{value_dictionary_value_length_check.py → value_length_check.py} +12 -6
  90. credsweeper/filters/value_method_check.py +5 -4
  91. credsweeper/filters/value_morphemes_check.py +43 -0
  92. credsweeper/filters/value_not_allowed_pattern_check.py +6 -5
  93. credsweeper/filters/value_not_part_encoded_check.py +4 -4
  94. credsweeper/filters/value_number_check.py +5 -4
  95. credsweeper/filters/value_pattern_check.py +61 -41
  96. credsweeper/filters/value_similarity_check.py +6 -4
  97. credsweeper/filters/value_split_keyword_check.py +5 -4
  98. credsweeper/filters/value_string_type_check.py +10 -7
  99. credsweeper/filters/value_token_base_check.py +5 -4
  100. credsweeper/filters/value_token_check.py +6 -5
  101. credsweeper/logger/__init__.py +0 -1
  102. credsweeper/logger/logger.py +1 -1
  103. credsweeper/ml_model/__init__.py +0 -1
  104. credsweeper/ml_model/features/__init__.py +1 -0
  105. credsweeper/ml_model/features/entropy_evaluation.py +1 -1
  106. credsweeper/ml_model/features/feature.py +2 -19
  107. credsweeper/ml_model/features/file_extension.py +2 -2
  108. credsweeper/ml_model/features/has_html_tag.py +12 -10
  109. credsweeper/ml_model/features/is_secret_numeric.py +5 -4
  110. credsweeper/ml_model/features/length_of_attribute.py +1 -1
  111. credsweeper/ml_model/features/morpheme_dense.py +15 -8
  112. credsweeper/ml_model/features/rule_name.py +2 -2
  113. credsweeper/ml_model/features/rule_severity.py +21 -0
  114. credsweeper/ml_model/features/search_in_attribute.py +1 -1
  115. credsweeper/ml_model/features/word_in.py +10 -33
  116. credsweeper/ml_model/features/word_in_path.py +6 -4
  117. credsweeper/ml_model/features/word_in_postamble.py +2 -5
  118. credsweeper/ml_model/features/word_in_preamble.py +2 -5
  119. credsweeper/ml_model/features/word_in_transition.py +2 -5
  120. credsweeper/ml_model/features/word_in_value.py +3 -4
  121. credsweeper/ml_model/features/word_in_variable.py +3 -4
  122. credsweeper/ml_model/ml_config.json +140 -27
  123. credsweeper/ml_model/ml_model.onnx +0 -0
  124. credsweeper/ml_model/ml_validator.py +4 -3
  125. credsweeper/rules/__init__.py +0 -1
  126. credsweeper/rules/config.yaml +329 -239
  127. credsweeper/rules/rule.py +4 -3
  128. credsweeper/scanner/__init__.py +0 -1
  129. credsweeper/scanner/scan_type/__init__.py +0 -5
  130. credsweeper/scanner/scan_type/multi_pattern.py +4 -4
  131. credsweeper/scanner/scan_type/pem_key_pattern.py +4 -4
  132. credsweeper/scanner/scan_type/scan_type.py +4 -4
  133. credsweeper/scanner/scan_type/single_pattern.py +4 -4
  134. credsweeper/scanner/scanner.py +24 -15
  135. credsweeper/secret/config.json +19 -6
  136. credsweeper/utils/__init__.py +0 -1
  137. credsweeper/utils/pem_key_detector.py +3 -3
  138. credsweeper/utils/util.py +24 -150
  139. {credsweeper-1.11.5.dist-info → credsweeper-1.13.3.dist-info}/METADATA +7 -7
  140. credsweeper-1.13.3.dist-info/RECORD +164 -0
  141. credsweeper/filters/value_couple_keyword_check.py +0 -26
  142. credsweeper-1.11.5.dist-info/RECORD +0 -159
  143. {credsweeper-1.11.5.dist-info → credsweeper-1.13.3.dist-info}/WHEEL +0 -0
  144. {credsweeper-1.11.5.dist-info → credsweeper-1.13.3.dist-info}/entry_points.txt +0 -0
  145. {credsweeper-1.11.5.dist-info → credsweeper-1.13.3.dist-info}/licenses/LICENSE +0 -0
credsweeper/__init__.py CHANGED
@@ -1,21 +1,27 @@
1
1
  from credsweeper.app import CredSweeper
2
- from credsweeper.common.constants import ThresholdPreset
3
- from credsweeper.file_handler import ContentProvider, ByteContentProvider, DiffContentProvider, StringContentProvider, \
4
- DataContentProvider, \
5
- TextContentProvider
2
+ from credsweeper.common.constants import ThresholdPreset, Severity, Confidence
3
+ from credsweeper.file_handler.byte_content_provider import ByteContentProvider
4
+ from credsweeper.file_handler.content_provider import ContentProvider
5
+ from credsweeper.file_handler.data_content_provider import DataContentProvider
6
+ from credsweeper.file_handler.diff_content_provider import DiffContentProvider
7
+ from credsweeper.file_handler.string_content_provider import StringContentProvider
8
+ from credsweeper.file_handler.text_content_provider import TextContentProvider
9
+
6
10
  from credsweeper.ml_model.ml_validator import MlValidator
7
11
 
8
12
  __all__ = [
9
- 'ByteContentProvider', #
10
- 'ContentProvider', #
11
- 'CredSweeper', #
12
- 'DataContentProvider', #
13
- 'DiffContentProvider', #
14
- 'MlValidator', #
15
- 'StringContentProvider', #
16
- 'TextContentProvider', #
17
- 'ThresholdPreset', #
18
- '__version__'
13
+ "ByteContentProvider", #
14
+ "Confidence", #
15
+ "ContentProvider", #
16
+ "CredSweeper", #
17
+ "DataContentProvider", #
18
+ "DiffContentProvider", #
19
+ "MlValidator", #
20
+ "Severity", #
21
+ "StringContentProvider", #
22
+ "TextContentProvider", #
23
+ "ThresholdPreset", #
24
+ "__version__"
19
25
  ]
20
26
 
21
- __version__ = "1.11.5"
27
+ __version__ = "1.13.3"
credsweeper/__main__.py CHANGED
@@ -1,20 +1,24 @@
1
1
  import binascii
2
+ import contextlib
2
3
  import logging
3
4
  import os
4
5
  import sys
5
6
  import time
6
7
  from argparse import ArgumentParser, ArgumentTypeError, Namespace, BooleanOptionalAction
7
8
  from pathlib import Path
8
- from typing import Any, Union, Dict
9
+ from typing import Any, Union, Dict, Tuple, Sequence
10
+
11
+ from git import Repo, Commit
9
12
 
10
13
  from credsweeper import __version__
11
14
  from credsweeper.app import APP_PATH, CredSweeper
12
15
  from credsweeper.common.constants import ThresholdPreset, Severity, RuleType, DiffRowType, ML_HUNK
13
16
  from credsweeper.file_handler.abstract_provider import AbstractProvider
17
+ from credsweeper.file_handler.byte_content_provider import ByteContentProvider
14
18
  from credsweeper.file_handler.files_provider import FilesProvider
15
19
  from credsweeper.file_handler.patches_provider import PatchesProvider
16
20
  from credsweeper.logger.logger import Logger
17
- from credsweeper.utils import Util
21
+ from credsweeper.utils.util import Util
18
22
 
19
23
  EXIT_SUCCESS = 0
20
24
  EXIT_FAILURE = 1
@@ -31,24 +35,24 @@ def positive_int(value: Any) -> int:
31
35
  return int_value
32
36
 
33
37
 
34
- def threshold_or_float(arg: str) -> Union[float, ThresholdPreset]:
38
+ def threshold_or_float_or_zero(arg: str) -> Union[int, float, ThresholdPreset]:
35
39
  """Return ThresholdPreset or a float from the input string
36
40
 
37
41
  Args:
38
42
  arg: string that either a float or one of allowed values in ThresholdPreset
39
43
 
40
44
  Returns:
41
- float if arg convertible to float, ThresholdPreset if one of the allowed values
45
+ int = 0 to disable ML validator, float if arg convertible to float, ThresholdPreset if one of the allowed values
42
46
 
43
47
  Raises:
44
48
  ArgumentTypeError: if arg cannot be interpreted as float or ThresholdPreset
45
49
 
46
50
  """
47
51
  allowed_presents = [e.value for e in ThresholdPreset]
48
- try:
52
+ if '0' == arg:
53
+ return 0
54
+ with contextlib.suppress(ValueError):
49
55
  return float(arg) # try convert to float
50
- except ValueError:
51
- pass
52
56
  if arg in allowed_presents:
53
57
  return ThresholdPreset[arg]
54
58
  raise ArgumentTypeError(f"value must be a float or one of {allowed_presents}")
@@ -118,6 +122,11 @@ def get_arguments() -> Namespace:
118
122
  const="log.yaml",
119
123
  dest="export_log_config",
120
124
  metavar="PATH")
125
+ group.add_argument("--git", help="git repo to scan", dest="git", metavar="PATH")
126
+ parser.add_argument("--ref",
127
+ help="scan git repo from the ref, otherwise - all branches were scanned (slow)",
128
+ dest="ref",
129
+ type=str)
121
130
  parser.add_argument("--rules",
122
131
  help="path of rule config file (default: credsweeper/rules/config.yaml). "
123
132
  f"severity:{[i.value for i in Severity]} "
@@ -150,6 +159,10 @@ def get_arguments() -> Namespace:
150
159
  help="find files by predefined extension",
151
160
  dest="find_by_ext",
152
161
  action="store_true")
162
+ parser.add_argument("--pedantic",
163
+ help="process files without extension",
164
+ action=BooleanOptionalAction,
165
+ default=False)
153
166
  parser.add_argument("--depth",
154
167
  help="additional recursive search in data (experimental)",
155
168
  type=positive_int,
@@ -164,11 +177,11 @@ def get_arguments() -> Namespace:
164
177
  "The lower the threshold - the more credentials will be reported. "
165
178
  f"Allowed values: float between 0 and 1, or any of {[e.value for e in ThresholdPreset]} "
166
179
  "(default: medium)",
167
- type=threshold_or_float,
180
+ type=threshold_or_float_or_zero,
168
181
  default=ThresholdPreset.medium,
169
182
  dest="ml_threshold",
170
183
  required=False,
171
- metavar="FLOAT_OR_STR")
184
+ metavar="THRESHOLD_OR_FLOAT_OR_ZERO")
172
185
  parser.add_argument("--ml_batch_size",
173
186
  "-b",
174
187
  help="batch size for model inference (default: 16)",
@@ -246,8 +259,8 @@ def get_arguments() -> Namespace:
246
259
  default=False)
247
260
  parser.add_argument("--log",
248
261
  "-l",
249
- help=f"provide logging level of {list(Logger.LEVELS.keys())}"
250
- f"(default: 'warning', case insensitive)",
262
+ help=(f"provide logging level of {list(Logger.LEVELS.keys())}"
263
+ f" (default: 'warning', case insensitive)"),
251
264
  default="warning",
252
265
  dest="log",
253
266
  metavar="LOG_LEVEL",
@@ -268,6 +281,40 @@ def get_arguments() -> Namespace:
268
281
  return parser.parse_args()
269
282
 
270
283
 
284
+ def get_credsweeper(args: Namespace) -> CredSweeper:
285
+ """Common function to create the instance"""
286
+ if args.denylist_path is not None:
287
+ denylist = [line for line in Util.read_file(args.denylist_path) if line]
288
+ else:
289
+ denylist = []
290
+ return CredSweeper(rule_path=args.rule_path,
291
+ config_path=args.config_path,
292
+ json_filename=args.json_filename,
293
+ xlsx_filename=args.xlsx_filename,
294
+ stdout=args.stdout,
295
+ color=args.color,
296
+ hashed=args.hashed,
297
+ subtext=args.subtext,
298
+ sort_output=args.sort_output,
299
+ use_filters=args.no_filters,
300
+ pool_count=args.jobs,
301
+ ml_batch_size=args.ml_batch_size,
302
+ ml_threshold=args.ml_threshold,
303
+ ml_config=args.ml_config,
304
+ ml_model=args.ml_model,
305
+ ml_providers=args.ml_providers,
306
+ find_by_ext=args.find_by_ext,
307
+ pedantic=args.pedantic,
308
+ depth=args.depth,
309
+ doc=args.doc,
310
+ severity=args.severity,
311
+ size_limit=args.size_limit,
312
+ exclude_lines=denylist,
313
+ exclude_values=denylist,
314
+ thrifty=args.thrifty,
315
+ log_level=args.log)
316
+
317
+
271
318
  def scan(args: Namespace, content_provider: AbstractProvider) -> int:
272
319
  """Scan content_provider data, print results or save them to json_filename is not None
273
320
 
@@ -283,42 +330,105 @@ def scan(args: Namespace, content_provider: AbstractProvider) -> int:
283
330
 
284
331
  """
285
332
  try:
286
- if args.denylist_path is not None:
287
- denylist = [line for line in Util.read_file(args.denylist_path) if line]
288
- else:
289
- denylist = []
290
-
291
- credsweeper = CredSweeper(rule_path=args.rule_path,
292
- config_path=args.config_path,
293
- json_filename=args.json_filename,
294
- xlsx_filename=args.xlsx_filename,
295
- stdout=args.stdout,
296
- color=args.color,
297
- hashed=args.hashed,
298
- subtext=args.subtext,
299
- sort_output=args.sort_output,
300
- use_filters=args.no_filters,
301
- pool_count=args.jobs,
302
- ml_batch_size=args.ml_batch_size,
303
- ml_threshold=args.ml_threshold,
304
- ml_config=args.ml_config,
305
- ml_model=args.ml_model,
306
- ml_providers=args.ml_providers,
307
- find_by_ext=args.find_by_ext,
308
- depth=args.depth,
309
- doc=args.doc,
310
- severity=args.severity,
311
- size_limit=args.size_limit,
312
- exclude_lines=denylist,
313
- exclude_values=denylist,
314
- thrifty=args.thrifty,
315
- log_level=args.log)
333
+ credsweeper = get_credsweeper(args)
316
334
  return credsweeper.run(content_provider=content_provider)
317
335
  except Exception as exc:
318
336
  logger.critical(exc, exc_info=True)
337
+ logger.exception(exc)
319
338
  return -1
320
339
 
321
340
 
341
+ def get_commit_providers(commit: Commit, repo: Repo) -> Sequence[ByteContentProvider]:
342
+ """Process a commit and for providers"""
343
+ result = {}
344
+ # use the hardcoded sha1 until sha256 objects are not supported by GitPython
345
+ ancestors = commit.parents or [repo.tree("4b825dc642cb6eb9a060e54bf8d69288fbee4904")]
346
+ for parent in ancestors:
347
+ for diff in parent.diff(commit):
348
+ # only result files
349
+ blob_b = diff.b_blob
350
+ if blob_b and blob_b.path not in result:
351
+ try:
352
+ result[blob_b.path] = ByteContentProvider(content=blob_b.data_stream.read(),
353
+ file_path=str(blob_b.path),
354
+ info=DiffRowType.ADDED.value)
355
+ except Exception as exc:
356
+ logger.warning(f"A submodule was not properly initialized or commit was removed: {exc}")
357
+ return list(result.values())
358
+
359
+
360
+ def drill(args: Namespace) -> Tuple[int, int]:
361
+ """Scan repository for branches and commits
362
+ Returns:
363
+ total credentials found
364
+ total scanned commits
365
+ """
366
+ total_credentials = 0
367
+ total_commits = 0
368
+ try:
369
+ # repo init first
370
+ repo = Repo(args.git)
371
+ if args.ref:
372
+ commits_sha1 = set(x.commit.hexsha for x in repo.refs if x.name == args.ref)
373
+ if not commits_sha1:
374
+ commits_sha1 = {args.ref} # single commit sha1 reference
375
+ else:
376
+ commits_sha1 = set(x.commit.hexsha for x in repo.refs
377
+ if x.name.startswith('origin/') or x.name.startswith('refs/heads/'))
378
+ logger.info(f"Git repository {args.git} with commits: {commits_sha1}")
379
+ # then - credsweeper
380
+ credsweeper = get_credsweeper(args)
381
+ # use flat iterations to avoid recursive limits
382
+ to_scan = set(commits_sha1)
383
+ # local speedup for already scanned commits - avoid file system interactive
384
+ scanned = set()
385
+ # to avoid double-check
386
+ skipped = set()
387
+ while to_scan:
388
+ commit_sha1 = to_scan.pop()
389
+ if commit_sha1 in scanned:
390
+ # the commit was scanned in this launch
391
+ continue
392
+ commit = repo.commit(commit_sha1)
393
+ if commit.parents:
394
+ # add parents only when they were not skipped or scanned previously
395
+ to_scan.update(x.hexsha for x in commit.parents if x.hexsha not in skipped and x.hexsha not in scanned)
396
+ # check whether the commit has been checked and the report is present
397
+ skip_already_scanned = False
398
+ if args.json_filename:
399
+ json_path = Path(args.json_filename)
400
+ json_path = json_path.with_suffix(f".{commit_sha1}{json_path.suffix}")
401
+ if json_path.exists():
402
+ skip_already_scanned = True
403
+ else:
404
+ credsweeper.json_filename = json_path
405
+ if args.xlsx_filename:
406
+ xlsx_path = Path(args.xlsx_filename)
407
+ xlsx_path = xlsx_path.with_suffix(f".{commit_sha1}{xlsx_path.suffix}")
408
+ if xlsx_path.exists():
409
+ skip_already_scanned = True
410
+ else:
411
+ credsweeper.xlsx_filename = xlsx_path
412
+ if skip_already_scanned:
413
+ skipped.add(commit_sha1)
414
+ logger.info("Skip already scanned commit: %s %s", commit_sha1, commit.committed_datetime.isoformat())
415
+ continue
416
+ logger.info("Scan commit: %s %s", commit_sha1, commit.committed_datetime.isoformat())
417
+ # prepare all files to scan in the commit with bytes->IO transformation to avoid a multiprocess issue
418
+ if providers := get_commit_providers(commit, repo):
419
+ credsweeper.credential_manager.candidates.clear()
420
+ credsweeper.scan(providers)
421
+ credsweeper.post_processing()
422
+ credsweeper.export_results()
423
+ total_credentials += credsweeper.credential_manager.len_credentials()
424
+ total_commits += 1
425
+ scanned.add(commit_sha1)
426
+ except Exception as exc:
427
+ logger.critical(exc, exc_info=True)
428
+ return -1, total_commits
429
+ return total_credentials, total_commits
430
+
431
+
322
432
  def main() -> int:
323
433
  """Main function"""
324
434
  result = EXIT_FAILURE
@@ -328,7 +438,7 @@ def main() -> int:
328
438
  if args.banner:
329
439
  print(f"CredSweeper {__version__} crc32:{check_integrity():08x}")
330
440
  Logger.init_logging(args.log, args.log_config_path)
331
- logger.info(f"Init CredSweeper object with arguments: {args}")
441
+ logger.info(f"Init CredSweeper object with arguments: {args} CWD: {os.getcwd()}")
332
442
  summary: Dict[str, int] = {}
333
443
  if args.path:
334
444
  logger.info(f"Run analyzer on path: {args.path}")
@@ -353,6 +463,12 @@ def main() -> int:
353
463
  result = EXIT_SUCCESS
354
464
  # collect number of all found credential to produce error code when necessary
355
465
  credentials_number = add_credentials_number + del_credentials_number
466
+ elif args.git:
467
+ logger.info(f"Run analyzer on GIT: {args.git}")
468
+ credentials_number, commits_number = drill(args)
469
+ summary[f"Detected Credentials in {args.git} for {commits_number} commits "] = credentials_number
470
+ if 0 <= credentials_number:
471
+ result = EXIT_SUCCESS
356
472
  elif args.export_config:
357
473
  logging.info(f"Exporting default config to file: {args.export_config}")
358
474
  config_dict = Util.json_load(APP_PATH / "secret" / "config.json")
credsweeper/app.py CHANGED
@@ -11,18 +11,18 @@ from colorama import Style
11
11
  # Directory of credsweeper sources MUST be placed before imports to avoid circular import error
12
12
  APP_PATH = Path(__file__).resolve().parent
13
13
 
14
+ from credsweeper.scanner.scanner import Scanner
14
15
  from credsweeper.common.constants import Severity, ThresholdPreset, DiffRowType, DEFAULT_ENCODING
15
- from credsweeper.config import Config
16
- from credsweeper.credentials import Candidate, CredentialManager, CandidateKey
16
+ from credsweeper.config.config import Config
17
+ from credsweeper.credentials.candidate import Candidate
18
+ from credsweeper.credentials.candidate_key import CandidateKey
19
+ from credsweeper.credentials.credential_manager import CredentialManager
17
20
  from credsweeper.deep_scanner.deep_scanner import DeepScanner
18
21
  from credsweeper.file_handler.content_provider import ContentProvider
19
- from credsweeper.file_handler.diff_content_provider import DiffContentProvider
20
22
  from credsweeper.file_handler.file_path_extractor import FilePathExtractor
21
23
  from credsweeper.file_handler.abstract_provider import AbstractProvider
22
- from credsweeper.file_handler.text_content_provider import TextContentProvider
23
- from credsweeper.scanner import Scanner
24
24
  from credsweeper.ml_model.ml_validator import MlValidator
25
- from credsweeper.utils import Util
25
+ from credsweeper.utils.util import Util
26
26
 
27
27
  logger = logging.getLogger(__name__)
28
28
 
@@ -52,11 +52,12 @@ class CredSweeper:
52
52
  use_filters: bool = True,
53
53
  pool_count: int = 1,
54
54
  ml_batch_size: Optional[int] = None,
55
- ml_threshold: Union[float, ThresholdPreset] = ThresholdPreset.medium,
55
+ ml_threshold: Union[int, float, ThresholdPreset] = ThresholdPreset.medium,
56
56
  ml_config: Union[None, str, Path] = None,
57
57
  ml_model: Union[None, str, Path] = None,
58
58
  ml_providers: Optional[str] = None,
59
59
  find_by_ext: bool = False,
60
+ pedantic: bool = False,
60
61
  depth: int = 0,
61
62
  doc: bool = False,
62
63
  severity: Union[Severity, str] = Severity.INFO,
@@ -86,6 +87,7 @@ class CredSweeper:
86
87
  ml_model: str or Path to set custom ml model
87
88
  ml_providers: str - comma separated list with providers
88
89
  find_by_ext: boolean - files will be reported by extension
90
+ pedantic: boolean - scan all files
89
91
  depth: int - how deep container files will be scanned
90
92
  doc: boolean - document-specific scanning
91
93
  severity: Severity - minimum severity level of rule
@@ -103,6 +105,7 @@ class CredSweeper:
103
105
  config_dict = self._get_config_dict(config_path=config_path,
104
106
  use_filters=use_filters,
105
107
  find_by_ext=find_by_ext,
108
+ pedantic=pedantic,
106
109
  depth=depth,
107
110
  doc=doc,
108
111
  severity=_severity,
@@ -145,6 +148,7 @@ class CredSweeper:
145
148
  config_path: Optional[str], #
146
149
  use_filters: bool, #
147
150
  find_by_ext: bool, #
151
+ pedantic: bool, #
148
152
  depth: int, #
149
153
  doc: bool, #
150
154
  severity: Severity, #
@@ -155,6 +159,7 @@ class CredSweeper:
155
159
  config_dict["use_filters"] = use_filters
156
160
  config_dict["find_by_ext"] = find_by_ext
157
161
  config_dict["size_limit"] = size_limit
162
+ config_dict["pedantic"] = pedantic
158
163
  config_dict["depth"] = depth
159
164
  config_dict["doc"] = doc
160
165
  config_dict["severity"] = severity.value
@@ -169,7 +174,7 @@ class CredSweeper:
169
174
  # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
170
175
 
171
176
  def _use_ml_validation(self) -> bool:
172
- if isinstance(self.ml_threshold, (float, int)) and 0 >= self.ml_threshold:
177
+ if isinstance(self.ml_threshold, int) and 0 == self.ml_threshold:
173
178
  logger.info("ML validation is disabled")
174
179
  return False
175
180
  if not self.credential_manager.candidates:
@@ -215,7 +220,7 @@ class CredSweeper:
215
220
  content_provider: path objects to scan
216
221
 
217
222
  """
218
- _empty_list: Sequence[Union[DiffContentProvider, TextContentProvider]] = []
223
+ _empty_list: Sequence[ContentProvider] = []
219
224
  file_extractors = content_provider.get_scannable_files(self.config) if content_provider else _empty_list
220
225
  if not file_extractors:
221
226
  logger.info(f"No scannable targets for {len(content_provider.paths)} paths")
@@ -229,7 +234,7 @@ class CredSweeper:
229
234
 
230
235
  # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
231
236
 
232
- def scan(self, content_providers: Sequence[Union[DiffContentProvider, TextContentProvider]]) -> None:
237
+ def scan(self, content_providers: Sequence[ContentProvider]) -> None:
233
238
  """Run scanning of files from an argument "content_providers".
234
239
 
235
240
  Args:
@@ -243,7 +248,7 @@ class CredSweeper:
243
248
 
244
249
  # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
245
250
 
246
- def __single_job_scan(self, content_providers: Sequence[Union[DiffContentProvider, TextContentProvider]]) -> None:
251
+ def __single_job_scan(self, content_providers: Sequence[ContentProvider]) -> None:
247
252
  """Performs scan in main thread"""
248
253
  logger.info(f"Scan for {len(content_providers)} providers")
249
254
  all_cred = self.files_scan(content_providers)
@@ -251,7 +256,7 @@ class CredSweeper:
251
256
 
252
257
  # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
253
258
 
254
- def __multi_jobs_scan(self, content_providers: Sequence[Union[DiffContentProvider, TextContentProvider]]) -> None:
259
+ def __multi_jobs_scan(self, content_providers: Sequence[ContentProvider]) -> None:
255
260
  """Performs scan with multiple jobs"""
256
261
  # use this separation to satisfy YAPF formatter
257
262
  yapfix = "%(asctime)s | %(levelname)s | %(processName)s:%(threadName)s | %(filename)s:%(lineno)s | %(message)s"
@@ -265,7 +270,7 @@ class CredSweeper:
265
270
  logger.info(f"Scan in {pool_count} processes for {len(content_providers)} providers")
266
271
  with multiprocessing.get_context("spawn").Pool(processes=pool_count,
267
272
  initializer=CredSweeper.pool_initializer,
268
- initargs=(log_kwargs, )) as pool:
273
+ initargs=(log_kwargs,)) as pool: # yapf: disable
269
274
  try:
270
275
  for scan_results in pool.imap_unordered(self.files_scan,
271
276
  (content_providers[x::pool_count] for x in range(pool_count))):
credsweeper/common/keyword_pattern.py CHANGED
@@ -3,47 +3,48 @@ import re
3
3
 
4
4
  class KeywordPattern:
5
5
  """Pattern set of keyword types"""
6
- directive = r"(?P<directive>(?:(?:[#%]define|%global)(?:\s|\\t)|\bset))?"
7
- key_left = r"(?:\\[nrt]|%[0-9a-f]{2}|\s)*" \
8
- r"(?P<variable>(([`'\"]{1,8}[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?;,%]*)" \
9
- r"(?P<keyword>"
10
- # there will be inserted a keyword
11
- key_right = r")" \
12
- r"[^%:='\"`<>({?!&;\n]*" \
6
+ directive = r"(?P<directive>(?:" \
7
+ r"(?:[#%]define|define(?=(\s|\\{1,8}[tnr])*\()|%global)" \
8
+ r"(?:\s?\(|\s|\\{1,8}[tnr]){1,8}|\bset(?=\b|\w*(\s|\\{1,8}[tnr])*\()" \
9
+ r"))?"
10
+ key_left = r"(?:\\[nrt]|(\\\\*u00|%)[0-9a-f]{2}|\s)*" \
11
+ r"(?P<variable>(([\"'`]{1,8}[^:=\"'`}<>\\/&?]*|[^:=\"'`}<>\s()\\/&?;,%]*)"
12
+ # keyword will be inserted here
13
+ key_right = r"[^%:=\"'`<>({?!&;\n]{0,80}" \
13
14
  r")" \
14
- r"(&(quot|apos);|%[0-9a-f]{2}|[`'\"])*" \
15
+ r"(&(quot|apos|#3[49]);|(\\\\*u00|%)[0-9a-f]{2}|[\"'`])*" \
15
16
  r")" # <variable>
16
17
  separator = r"(?(directive)|(\s|\\{1,8}[tnr])*\]?(\s|\\{1,8}[tnr])*)" \
17
18
  r"(?P<separator>:(\s[a-z]{3,9}[?]?\s)?=|:(?!:)|=(>|&gt;|(\\\\*u00|%)26gt;)|!==|!=|===|==|=~|=" \
18
- r"|(?(directive)(\\t|\s|\((?!\))){1,80}|%3d))" \
19
+ r"|(?(directive)(,|\\t|\s|\((?!\))){1,80}|%3d))" \
19
20
  r"(\s|\\{1,8}[tnr])*"
20
21
  # might be curly, square or parenthesis with words before
21
22
  wrap = r"(?P<wrap>(" \
22
- r"(new(\s|\\{1,8}[tnr]|byte|char|string|\[\]){1,8})?" \
23
+ r"((\s|\\{1,8}[tnr]|new|byte|char|string|\[\]){1,8})?" \
23
24
  r"(?P<get>([_a-z][0-9a-z_.\[\]]*\.)get|(os\.)?getenv)?" \
24
25
  r"([0-9a-z_.]|::|-(>|&gt;))*" \
25
26
  r"\s*" \
26
27
  r"(\[(?!\])|\((?!\))|\{(?!\}))" \
27
28
  r"(\s|\\{1,8}[tnr])*" \
28
- r"(?(get)('[^']+'|\"[^\"]+\")\s*,\s*|)" \
29
+ r"(?(get)('[^']{1,31}'|\"[^\"]{1,31}\")\s*,\s*|)" \
29
30
  r"([0-9a-z_]{1,32}\s*[:=]\s*)?" \
30
31
  r"){1,8})?"
31
- string_prefix = r"(((b|r|br|rb|u|f|rf|fr|l|@)(?=(\\*[`'\"])))?"
32
- left_quote = r"(?P<value_leftquote>((?P<esq>\\{1,8})?([`'\"]|&(quot|apos);)){1,4}))?"
32
+ string_prefix = r"(((b|r|br|rb|u|f|rf|fr|l|@)(?=(\\*[\"'`])))?"
33
+ left_quote = r"(?P<value_leftquote>((?P<esq>\\{1,8})?([\"'`]|&(quot|apos|#3[49]);)){1,4}))?"
33
34
  # Authentication scheme ( oauth | basic | bearer | apikey ) precedes to credential
34
35
  auth_keywords = r"(\s?(oauth|bot|basic|bearer|apikey|accesskey|ssws|ntlm)\s)?"
35
36
  value = r"(?P<value>" \
36
37
  r"(?(value_leftquote)" \
37
38
  r"(" \
38
39
  r"(?!(?P=value_leftquote))" \
39
- r"(?(esq)((?!(?P=esq)([`'\"]|&(quot|apos);)).)|((?!(?P=value_leftquote)).)))" \
40
+ r"(?(esq)((?!(?P=esq)([\"'`]|&(quot|apos|#3[49]);)).)|((?!(?P=value_leftquote)).)))" \
40
41
  r"|" \
41
- r"(?!&(quot|apos);)" \
42
- r"(\\{1,8}([ tnr]|[^\s`'\"])" \
42
+ r"(?!&(quot|apos|#3[49]);)" \
43
+ r"(\\{1,8}([ tnr]|[^\s\"'`])" \
43
44
  r"|" \
44
45
  r"(?P<url_esc>%[0-9a-f]{2})" \
45
46
  r"|" \
46
- r"(?(url_esc)[^\s`'\",;\\&]|[^\s`'\",;\\])" \
47
+ r"(?(url_esc)[^\s\"'`,;\\&]|[^\s\"'`,;\\])" \
47
48
  r")" \
48
49
  r"){4,8000}" \
49
50
  r"|" \
@@ -67,7 +68,7 @@ class KeywordPattern:
67
68
  expression = ''.join([ #
68
69
  cls.directive, #
69
70
  cls.key_left, #
70
- keyword, #
71
+ fr"(?P<keyword>{keyword})", # named group required
71
72
  cls.key_right, #
72
73
  cls.separator, #
73
74
  cls.wrap, #