credsweeper-1.11.5-py3-none-any.whl → credsweeper-1.13.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of credsweeper might be problematic. Click here for more details.
- credsweeper/__init__.py +21 -15
- credsweeper/__main__.py +158 -42
- credsweeper/app.py +18 -13
- credsweeper/common/keyword_pattern.py +19 -18
- credsweeper/common/morpheme_checklist.txt +28 -6
- credsweeper/config/__init__.py +0 -1
- credsweeper/config/config.py +4 -3
- credsweeper/credentials/__init__.py +0 -5
- credsweeper/credentials/augment_candidates.py +1 -1
- credsweeper/credentials/candidate.py +1 -1
- credsweeper/credentials/credential_manager.py +1 -1
- credsweeper/credentials/line_data.py +43 -8
- credsweeper/deep_scanner/__init__.py +0 -1
- credsweeper/deep_scanner/abstract_scanner.py +4 -3
- credsweeper/deep_scanner/byte_scanner.py +1 -1
- credsweeper/deep_scanner/bzip2_scanner.py +2 -2
- credsweeper/deep_scanner/csv_scanner.py +71 -0
- credsweeper/deep_scanner/deb_scanner.py +1 -1
- credsweeper/deep_scanner/deep_scanner.py +22 -12
- credsweeper/deep_scanner/docx_scanner.py +1 -1
- credsweeper/deep_scanner/eml_scanner.py +1 -1
- credsweeper/deep_scanner/encoder_scanner.py +1 -1
- credsweeper/deep_scanner/gzip_scanner.py +2 -2
- credsweeper/deep_scanner/html_scanner.py +1 -1
- credsweeper/deep_scanner/jclass_scanner.py +1 -1
- credsweeper/deep_scanner/jks_scanner.py +12 -3
- credsweeper/deep_scanner/lang_scanner.py +1 -1
- credsweeper/deep_scanner/lzma_scanner.py +2 -2
- credsweeper/deep_scanner/mxfile_scanner.py +1 -1
- credsweeper/deep_scanner/pdf_scanner.py +1 -1
- credsweeper/deep_scanner/pkcs_scanner.py +6 -2
- credsweeper/deep_scanner/pptx_scanner.py +1 -1
- credsweeper/deep_scanner/rpm_scanner.py +1 -1
- credsweeper/deep_scanner/rtf_scanner.py +41 -0
- credsweeper/deep_scanner/strings_scanner.py +52 -0
- credsweeper/deep_scanner/tar_scanner.py +2 -2
- credsweeper/deep_scanner/tmx_scanner.py +2 -2
- credsweeper/deep_scanner/xlsx_scanner.py +2 -2
- credsweeper/deep_scanner/xml_scanner.py +1 -1
- credsweeper/deep_scanner/zip_scanner.py +2 -2
- credsweeper/file_handler/__init__.py +0 -15
- credsweeper/file_handler/abstract_provider.py +3 -4
- credsweeper/file_handler/byte_content_provider.py +11 -2
- credsweeper/file_handler/content_provider.py +1 -1
- credsweeper/file_handler/data_content_provider.py +1 -1
- credsweeper/file_handler/diff_content_provider.py +133 -3
- credsweeper/file_handler/file_path_extractor.py +4 -2
- credsweeper/file_handler/files_provider.py +4 -4
- credsweeper/file_handler/patches_provider.py +7 -8
- credsweeper/file_handler/text_content_provider.py +8 -2
- credsweeper/filters/__init__.py +3 -4
- credsweeper/filters/filter.py +5 -3
- credsweeper/filters/group/__init__.py +0 -2
- credsweeper/filters/group/general_keyword.py +2 -2
- credsweeper/filters/group/general_pattern.py +2 -2
- credsweeper/filters/group/group.py +38 -36
- credsweeper/filters/group/password_keyword.py +9 -8
- credsweeper/filters/group/token_pattern.py +5 -5
- credsweeper/filters/group/url_credentials_group.py +8 -8
- credsweeper/filters/group/weird_base36_token.py +6 -6
- credsweeper/filters/group/weird_base64_token.py +5 -5
- credsweeper/filters/line_git_binary_check.py +5 -4
- credsweeper/filters/line_specific_key_check.py +6 -5
- credsweeper/filters/line_uue_part_check.py +5 -4
- credsweeper/filters/value_allowlist_check.py +6 -5
- credsweeper/filters/value_array_dictionary_check.py +8 -6
- credsweeper/filters/value_atlassian_token_check.py +6 -5
- credsweeper/filters/value_azure_token_check.py +6 -5
- credsweeper/filters/value_base32_data_check.py +8 -5
- credsweeper/filters/value_base64_data_check.py +6 -5
- credsweeper/filters/value_base64_encoded_pem_check.py +6 -5
- credsweeper/filters/value_base64_key_check.py +6 -5
- credsweeper/filters/value_base64_part_check.py +6 -5
- credsweeper/filters/value_basic_auth_check.py +37 -0
- credsweeper/filters/value_blocklist_check.py +6 -4
- credsweeper/filters/value_camel_case_check.py +8 -7
- credsweeper/filters/value_dictionary_keyword_check.py +6 -4
- credsweeper/filters/value_discord_bot_check.py +6 -5
- credsweeper/filters/value_entropy_base_check.py +6 -5
- credsweeper/filters/value_file_path_check.py +13 -8
- credsweeper/filters/value_github_check.py +8 -6
- credsweeper/filters/value_grafana_check.py +6 -5
- credsweeper/filters/value_grafana_service_check.py +5 -4
- credsweeper/filters/value_hex_number_check.py +5 -4
- credsweeper/filters/value_jfrog_token_check.py +6 -5
- credsweeper/filters/value_json_web_key_check.py +6 -5
- credsweeper/filters/value_json_web_token_check.py +6 -5
- credsweeper/filters/value_last_word_check.py +6 -4
- credsweeper/filters/{value_dictionary_value_length_check.py → value_length_check.py} +12 -6
- credsweeper/filters/value_method_check.py +5 -4
- credsweeper/filters/value_morphemes_check.py +43 -0
- credsweeper/filters/value_not_allowed_pattern_check.py +6 -5
- credsweeper/filters/value_not_part_encoded_check.py +4 -4
- credsweeper/filters/value_number_check.py +5 -4
- credsweeper/filters/value_pattern_check.py +61 -41
- credsweeper/filters/value_similarity_check.py +6 -4
- credsweeper/filters/value_split_keyword_check.py +5 -4
- credsweeper/filters/value_string_type_check.py +10 -7
- credsweeper/filters/value_token_base_check.py +5 -4
- credsweeper/filters/value_token_check.py +6 -5
- credsweeper/logger/__init__.py +0 -1
- credsweeper/logger/logger.py +1 -1
- credsweeper/ml_model/__init__.py +0 -1
- credsweeper/ml_model/features/__init__.py +1 -0
- credsweeper/ml_model/features/entropy_evaluation.py +1 -1
- credsweeper/ml_model/features/feature.py +2 -19
- credsweeper/ml_model/features/file_extension.py +2 -2
- credsweeper/ml_model/features/has_html_tag.py +12 -10
- credsweeper/ml_model/features/is_secret_numeric.py +5 -4
- credsweeper/ml_model/features/length_of_attribute.py +1 -1
- credsweeper/ml_model/features/morpheme_dense.py +15 -8
- credsweeper/ml_model/features/rule_name.py +2 -2
- credsweeper/ml_model/features/rule_severity.py +21 -0
- credsweeper/ml_model/features/search_in_attribute.py +1 -1
- credsweeper/ml_model/features/word_in.py +10 -33
- credsweeper/ml_model/features/word_in_path.py +6 -4
- credsweeper/ml_model/features/word_in_postamble.py +2 -5
- credsweeper/ml_model/features/word_in_preamble.py +2 -5
- credsweeper/ml_model/features/word_in_transition.py +2 -5
- credsweeper/ml_model/features/word_in_value.py +3 -4
- credsweeper/ml_model/features/word_in_variable.py +3 -4
- credsweeper/ml_model/ml_config.json +140 -27
- credsweeper/ml_model/ml_model.onnx +0 -0
- credsweeper/ml_model/ml_validator.py +4 -3
- credsweeper/rules/__init__.py +0 -1
- credsweeper/rules/config.yaml +329 -239
- credsweeper/rules/rule.py +4 -3
- credsweeper/scanner/__init__.py +0 -1
- credsweeper/scanner/scan_type/__init__.py +0 -5
- credsweeper/scanner/scan_type/multi_pattern.py +4 -4
- credsweeper/scanner/scan_type/pem_key_pattern.py +4 -4
- credsweeper/scanner/scan_type/scan_type.py +4 -4
- credsweeper/scanner/scan_type/single_pattern.py +4 -4
- credsweeper/scanner/scanner.py +24 -15
- credsweeper/secret/config.json +19 -6
- credsweeper/utils/__init__.py +0 -1
- credsweeper/utils/pem_key_detector.py +3 -3
- credsweeper/utils/util.py +24 -150
- {credsweeper-1.11.5.dist-info → credsweeper-1.13.3.dist-info}/METADATA +7 -7
- credsweeper-1.13.3.dist-info/RECORD +164 -0
- credsweeper/filters/value_couple_keyword_check.py +0 -26
- credsweeper-1.11.5.dist-info/RECORD +0 -159
- {credsweeper-1.11.5.dist-info → credsweeper-1.13.3.dist-info}/WHEEL +0 -0
- {credsweeper-1.11.5.dist-info → credsweeper-1.13.3.dist-info}/entry_points.txt +0 -0
- {credsweeper-1.11.5.dist-info → credsweeper-1.13.3.dist-info}/licenses/LICENSE +0 -0
credsweeper/__init__.py
CHANGED
|
@@ -1,21 +1,27 @@
|
|
|
1
1
|
from credsweeper.app import CredSweeper
|
|
2
|
-
from credsweeper.common.constants import ThresholdPreset
|
|
3
|
-
from credsweeper.file_handler import
|
|
4
|
-
|
|
5
|
-
|
|
2
|
+
from credsweeper.common.constants import ThresholdPreset, Severity, Confidence
|
|
3
|
+
from credsweeper.file_handler.byte_content_provider import ByteContentProvider
|
|
4
|
+
from credsweeper.file_handler.content_provider import ContentProvider
|
|
5
|
+
from credsweeper.file_handler.data_content_provider import DataContentProvider
|
|
6
|
+
from credsweeper.file_handler.diff_content_provider import DiffContentProvider
|
|
7
|
+
from credsweeper.file_handler.string_content_provider import StringContentProvider
|
|
8
|
+
from credsweeper.file_handler.text_content_provider import TextContentProvider
|
|
9
|
+
|
|
6
10
|
from credsweeper.ml_model.ml_validator import MlValidator
|
|
7
11
|
|
|
8
12
|
__all__ = [
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
13
|
+
"ByteContentProvider", #
|
|
14
|
+
"Confidence", #
|
|
15
|
+
"ContentProvider", #
|
|
16
|
+
"CredSweeper", #
|
|
17
|
+
"DataContentProvider", #
|
|
18
|
+
"DiffContentProvider", #
|
|
19
|
+
"MlValidator", #
|
|
20
|
+
"Severity", #
|
|
21
|
+
"StringContentProvider", #
|
|
22
|
+
"TextContentProvider", #
|
|
23
|
+
"ThresholdPreset", #
|
|
24
|
+
"__version__"
|
|
19
25
|
]
|
|
20
26
|
|
|
21
|
-
__version__ = "1.
|
|
27
|
+
__version__ = "1.13.3"
|
credsweeper/__main__.py
CHANGED
|
@@ -1,20 +1,24 @@
|
|
|
1
1
|
import binascii
|
|
2
|
+
import contextlib
|
|
2
3
|
import logging
|
|
3
4
|
import os
|
|
4
5
|
import sys
|
|
5
6
|
import time
|
|
6
7
|
from argparse import ArgumentParser, ArgumentTypeError, Namespace, BooleanOptionalAction
|
|
7
8
|
from pathlib import Path
|
|
8
|
-
from typing import Any, Union, Dict
|
|
9
|
+
from typing import Any, Union, Dict, Tuple, Sequence
|
|
10
|
+
|
|
11
|
+
from git import Repo, Commit
|
|
9
12
|
|
|
10
13
|
from credsweeper import __version__
|
|
11
14
|
from credsweeper.app import APP_PATH, CredSweeper
|
|
12
15
|
from credsweeper.common.constants import ThresholdPreset, Severity, RuleType, DiffRowType, ML_HUNK
|
|
13
16
|
from credsweeper.file_handler.abstract_provider import AbstractProvider
|
|
17
|
+
from credsweeper.file_handler.byte_content_provider import ByteContentProvider
|
|
14
18
|
from credsweeper.file_handler.files_provider import FilesProvider
|
|
15
19
|
from credsweeper.file_handler.patches_provider import PatchesProvider
|
|
16
20
|
from credsweeper.logger.logger import Logger
|
|
17
|
-
from credsweeper.utils import Util
|
|
21
|
+
from credsweeper.utils.util import Util
|
|
18
22
|
|
|
19
23
|
EXIT_SUCCESS = 0
|
|
20
24
|
EXIT_FAILURE = 1
|
|
@@ -31,24 +35,24 @@ def positive_int(value: Any) -> int:
|
|
|
31
35
|
return int_value
|
|
32
36
|
|
|
33
37
|
|
|
34
|
-
def
|
|
38
|
+
def threshold_or_float_or_zero(arg: str) -> Union[int, float, ThresholdPreset]:
|
|
35
39
|
"""Return ThresholdPreset or a float from the input string
|
|
36
40
|
|
|
37
41
|
Args:
|
|
38
42
|
arg: string that either a float or one of allowed values in ThresholdPreset
|
|
39
43
|
|
|
40
44
|
Returns:
|
|
41
|
-
float if arg convertible to float, ThresholdPreset if one of the allowed values
|
|
45
|
+
int = 0 to disable ML validator, float if arg convertible to float, ThresholdPreset if one of the allowed values
|
|
42
46
|
|
|
43
47
|
Raises:
|
|
44
48
|
ArgumentTypeError: if arg cannot be interpreted as float or ThresholdPreset
|
|
45
49
|
|
|
46
50
|
"""
|
|
47
51
|
allowed_presents = [e.value for e in ThresholdPreset]
|
|
48
|
-
|
|
52
|
+
if '0' == arg:
|
|
53
|
+
return 0
|
|
54
|
+
with contextlib.suppress(ValueError):
|
|
49
55
|
return float(arg) # try convert to float
|
|
50
|
-
except ValueError:
|
|
51
|
-
pass
|
|
52
56
|
if arg in allowed_presents:
|
|
53
57
|
return ThresholdPreset[arg]
|
|
54
58
|
raise ArgumentTypeError(f"value must be a float or one of {allowed_presents}")
|
|
@@ -118,6 +122,11 @@ def get_arguments() -> Namespace:
|
|
|
118
122
|
const="log.yaml",
|
|
119
123
|
dest="export_log_config",
|
|
120
124
|
metavar="PATH")
|
|
125
|
+
group.add_argument("--git", help="git repo to scan", dest="git", metavar="PATH")
|
|
126
|
+
parser.add_argument("--ref",
|
|
127
|
+
help="scan git repo from the ref, otherwise - all branches were scanned (slow)",
|
|
128
|
+
dest="ref",
|
|
129
|
+
type=str)
|
|
121
130
|
parser.add_argument("--rules",
|
|
122
131
|
help="path of rule config file (default: credsweeper/rules/config.yaml). "
|
|
123
132
|
f"severity:{[i.value for i in Severity]} "
|
|
@@ -150,6 +159,10 @@ def get_arguments() -> Namespace:
|
|
|
150
159
|
help="find files by predefined extension",
|
|
151
160
|
dest="find_by_ext",
|
|
152
161
|
action="store_true")
|
|
162
|
+
parser.add_argument("--pedantic",
|
|
163
|
+
help="process files without extension",
|
|
164
|
+
action=BooleanOptionalAction,
|
|
165
|
+
default=False)
|
|
153
166
|
parser.add_argument("--depth",
|
|
154
167
|
help="additional recursive search in data (experimental)",
|
|
155
168
|
type=positive_int,
|
|
@@ -164,11 +177,11 @@ def get_arguments() -> Namespace:
|
|
|
164
177
|
"The lower the threshold - the more credentials will be reported. "
|
|
165
178
|
f"Allowed values: float between 0 and 1, or any of {[e.value for e in ThresholdPreset]} "
|
|
166
179
|
"(default: medium)",
|
|
167
|
-
type=
|
|
180
|
+
type=threshold_or_float_or_zero,
|
|
168
181
|
default=ThresholdPreset.medium,
|
|
169
182
|
dest="ml_threshold",
|
|
170
183
|
required=False,
|
|
171
|
-
metavar="
|
|
184
|
+
metavar="THRESHOLD_OR_FLOAT_OR_ZERO")
|
|
172
185
|
parser.add_argument("--ml_batch_size",
|
|
173
186
|
"-b",
|
|
174
187
|
help="batch size for model inference (default: 16)",
|
|
@@ -246,8 +259,8 @@ def get_arguments() -> Namespace:
|
|
|
246
259
|
default=False)
|
|
247
260
|
parser.add_argument("--log",
|
|
248
261
|
"-l",
|
|
249
|
-
help=f"provide logging level of {list(Logger.LEVELS.keys())}"
|
|
250
|
-
|
|
262
|
+
help=(f"provide logging level of {list(Logger.LEVELS.keys())}"
|
|
263
|
+
f" (default: 'warning', case insensitive)"),
|
|
251
264
|
default="warning",
|
|
252
265
|
dest="log",
|
|
253
266
|
metavar="LOG_LEVEL",
|
|
@@ -268,6 +281,40 @@ def get_arguments() -> Namespace:
|
|
|
268
281
|
return parser.parse_args()
|
|
269
282
|
|
|
270
283
|
|
|
284
|
+
def get_credsweeper(args: Namespace) -> CredSweeper:
|
|
285
|
+
"""Common function to create the instance"""
|
|
286
|
+
if args.denylist_path is not None:
|
|
287
|
+
denylist = [line for line in Util.read_file(args.denylist_path) if line]
|
|
288
|
+
else:
|
|
289
|
+
denylist = []
|
|
290
|
+
return CredSweeper(rule_path=args.rule_path,
|
|
291
|
+
config_path=args.config_path,
|
|
292
|
+
json_filename=args.json_filename,
|
|
293
|
+
xlsx_filename=args.xlsx_filename,
|
|
294
|
+
stdout=args.stdout,
|
|
295
|
+
color=args.color,
|
|
296
|
+
hashed=args.hashed,
|
|
297
|
+
subtext=args.subtext,
|
|
298
|
+
sort_output=args.sort_output,
|
|
299
|
+
use_filters=args.no_filters,
|
|
300
|
+
pool_count=args.jobs,
|
|
301
|
+
ml_batch_size=args.ml_batch_size,
|
|
302
|
+
ml_threshold=args.ml_threshold,
|
|
303
|
+
ml_config=args.ml_config,
|
|
304
|
+
ml_model=args.ml_model,
|
|
305
|
+
ml_providers=args.ml_providers,
|
|
306
|
+
find_by_ext=args.find_by_ext,
|
|
307
|
+
pedantic=args.pedantic,
|
|
308
|
+
depth=args.depth,
|
|
309
|
+
doc=args.doc,
|
|
310
|
+
severity=args.severity,
|
|
311
|
+
size_limit=args.size_limit,
|
|
312
|
+
exclude_lines=denylist,
|
|
313
|
+
exclude_values=denylist,
|
|
314
|
+
thrifty=args.thrifty,
|
|
315
|
+
log_level=args.log)
|
|
316
|
+
|
|
317
|
+
|
|
271
318
|
def scan(args: Namespace, content_provider: AbstractProvider) -> int:
|
|
272
319
|
"""Scan content_provider data, print results or save them to json_filename is not None
|
|
273
320
|
|
|
@@ -283,42 +330,105 @@ def scan(args: Namespace, content_provider: AbstractProvider) -> int:
|
|
|
283
330
|
|
|
284
331
|
"""
|
|
285
332
|
try:
|
|
286
|
-
|
|
287
|
-
denylist = [line for line in Util.read_file(args.denylist_path) if line]
|
|
288
|
-
else:
|
|
289
|
-
denylist = []
|
|
290
|
-
|
|
291
|
-
credsweeper = CredSweeper(rule_path=args.rule_path,
|
|
292
|
-
config_path=args.config_path,
|
|
293
|
-
json_filename=args.json_filename,
|
|
294
|
-
xlsx_filename=args.xlsx_filename,
|
|
295
|
-
stdout=args.stdout,
|
|
296
|
-
color=args.color,
|
|
297
|
-
hashed=args.hashed,
|
|
298
|
-
subtext=args.subtext,
|
|
299
|
-
sort_output=args.sort_output,
|
|
300
|
-
use_filters=args.no_filters,
|
|
301
|
-
pool_count=args.jobs,
|
|
302
|
-
ml_batch_size=args.ml_batch_size,
|
|
303
|
-
ml_threshold=args.ml_threshold,
|
|
304
|
-
ml_config=args.ml_config,
|
|
305
|
-
ml_model=args.ml_model,
|
|
306
|
-
ml_providers=args.ml_providers,
|
|
307
|
-
find_by_ext=args.find_by_ext,
|
|
308
|
-
depth=args.depth,
|
|
309
|
-
doc=args.doc,
|
|
310
|
-
severity=args.severity,
|
|
311
|
-
size_limit=args.size_limit,
|
|
312
|
-
exclude_lines=denylist,
|
|
313
|
-
exclude_values=denylist,
|
|
314
|
-
thrifty=args.thrifty,
|
|
315
|
-
log_level=args.log)
|
|
333
|
+
credsweeper = get_credsweeper(args)
|
|
316
334
|
return credsweeper.run(content_provider=content_provider)
|
|
317
335
|
except Exception as exc:
|
|
318
336
|
logger.critical(exc, exc_info=True)
|
|
337
|
+
logger.exception(exc)
|
|
319
338
|
return -1
|
|
320
339
|
|
|
321
340
|
|
|
341
|
+
def get_commit_providers(commit: Commit, repo: Repo) -> Sequence[ByteContentProvider]:
|
|
342
|
+
"""Process a commit and for providers"""
|
|
343
|
+
result = {}
|
|
344
|
+
# use the hardcoded sha1 until sha256 objects are not supported by GitPython
|
|
345
|
+
ancestors = commit.parents or [repo.tree("4b825dc642cb6eb9a060e54bf8d69288fbee4904")]
|
|
346
|
+
for parent in ancestors:
|
|
347
|
+
for diff in parent.diff(commit):
|
|
348
|
+
# only result files
|
|
349
|
+
blob_b = diff.b_blob
|
|
350
|
+
if blob_b and blob_b.path not in result:
|
|
351
|
+
try:
|
|
352
|
+
result[blob_b.path] = ByteContentProvider(content=blob_b.data_stream.read(),
|
|
353
|
+
file_path=str(blob_b.path),
|
|
354
|
+
info=DiffRowType.ADDED.value)
|
|
355
|
+
except Exception as exc:
|
|
356
|
+
logger.warning(f"A submodule was not properly initialized or commit was removed: {exc}")
|
|
357
|
+
return list(result.values())
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def drill(args: Namespace) -> Tuple[int, int]:
|
|
361
|
+
"""Scan repository for branches and commits
|
|
362
|
+
Returns:
|
|
363
|
+
total credentials found
|
|
364
|
+
total scanned commits
|
|
365
|
+
"""
|
|
366
|
+
total_credentials = 0
|
|
367
|
+
total_commits = 0
|
|
368
|
+
try:
|
|
369
|
+
# repo init first
|
|
370
|
+
repo = Repo(args.git)
|
|
371
|
+
if args.ref:
|
|
372
|
+
commits_sha1 = set(x.commit.hexsha for x in repo.refs if x.name == args.ref)
|
|
373
|
+
if not commits_sha1:
|
|
374
|
+
commits_sha1 = {args.ref} # single commit sha1 reference
|
|
375
|
+
else:
|
|
376
|
+
commits_sha1 = set(x.commit.hexsha for x in repo.refs
|
|
377
|
+
if x.name.startswith('origin/') or x.name.startswith('refs/heads/'))
|
|
378
|
+
logger.info(f"Git repository {args.git} with commits: {commits_sha1}")
|
|
379
|
+
# then - credsweeper
|
|
380
|
+
credsweeper = get_credsweeper(args)
|
|
381
|
+
# use flat iterations to avoid recursive limits
|
|
382
|
+
to_scan = set(commits_sha1)
|
|
383
|
+
# local speedup for already scanned commits - avoid file system interactive
|
|
384
|
+
scanned = set()
|
|
385
|
+
# to avoid double-check
|
|
386
|
+
skipped = set()
|
|
387
|
+
while to_scan:
|
|
388
|
+
commit_sha1 = to_scan.pop()
|
|
389
|
+
if commit_sha1 in scanned:
|
|
390
|
+
# the commit was scanned in this launch
|
|
391
|
+
continue
|
|
392
|
+
commit = repo.commit(commit_sha1)
|
|
393
|
+
if commit.parents:
|
|
394
|
+
# add parents only when they were not skipped or scanned previously
|
|
395
|
+
to_scan.update(x.hexsha for x in commit.parents if x.hexsha not in skipped and x.hexsha not in scanned)
|
|
396
|
+
# check whether the commit has been checked and the report is present
|
|
397
|
+
skip_already_scanned = False
|
|
398
|
+
if args.json_filename:
|
|
399
|
+
json_path = Path(args.json_filename)
|
|
400
|
+
json_path = json_path.with_suffix(f".{commit_sha1}{json_path.suffix}")
|
|
401
|
+
if json_path.exists():
|
|
402
|
+
skip_already_scanned = True
|
|
403
|
+
else:
|
|
404
|
+
credsweeper.json_filename = json_path
|
|
405
|
+
if args.xlsx_filename:
|
|
406
|
+
xlsx_path = Path(args.xlsx_filename)
|
|
407
|
+
xlsx_path = xlsx_path.with_suffix(f".{commit_sha1}{xlsx_path.suffix}")
|
|
408
|
+
if xlsx_path.exists():
|
|
409
|
+
skip_already_scanned = True
|
|
410
|
+
else:
|
|
411
|
+
credsweeper.xlsx_filename = xlsx_path
|
|
412
|
+
if skip_already_scanned:
|
|
413
|
+
skipped.add(commit_sha1)
|
|
414
|
+
logger.info("Skip already scanned commit: %s %s", commit_sha1, commit.committed_datetime.isoformat())
|
|
415
|
+
continue
|
|
416
|
+
logger.info("Scan commit: %s %s", commit_sha1, commit.committed_datetime.isoformat())
|
|
417
|
+
# prepare all files to scan in the commit with bytes->IO transformation to avoid a multiprocess issue
|
|
418
|
+
if providers := get_commit_providers(commit, repo):
|
|
419
|
+
credsweeper.credential_manager.candidates.clear()
|
|
420
|
+
credsweeper.scan(providers)
|
|
421
|
+
credsweeper.post_processing()
|
|
422
|
+
credsweeper.export_results()
|
|
423
|
+
total_credentials += credsweeper.credential_manager.len_credentials()
|
|
424
|
+
total_commits += 1
|
|
425
|
+
scanned.add(commit_sha1)
|
|
426
|
+
except Exception as exc:
|
|
427
|
+
logger.critical(exc, exc_info=True)
|
|
428
|
+
return -1, total_commits
|
|
429
|
+
return total_credentials, total_commits
|
|
430
|
+
|
|
431
|
+
|
|
322
432
|
def main() -> int:
|
|
323
433
|
"""Main function"""
|
|
324
434
|
result = EXIT_FAILURE
|
|
@@ -328,7 +438,7 @@ def main() -> int:
|
|
|
328
438
|
if args.banner:
|
|
329
439
|
print(f"CredSweeper {__version__} crc32:{check_integrity():08x}")
|
|
330
440
|
Logger.init_logging(args.log, args.log_config_path)
|
|
331
|
-
logger.info(f"Init CredSweeper object with arguments: {args}")
|
|
441
|
+
logger.info(f"Init CredSweeper object with arguments: {args} CWD: {os.getcwd()}")
|
|
332
442
|
summary: Dict[str, int] = {}
|
|
333
443
|
if args.path:
|
|
334
444
|
logger.info(f"Run analyzer on path: {args.path}")
|
|
@@ -353,6 +463,12 @@ def main() -> int:
|
|
|
353
463
|
result = EXIT_SUCCESS
|
|
354
464
|
# collect number of all found credential to produce error code when necessary
|
|
355
465
|
credentials_number = add_credentials_number + del_credentials_number
|
|
466
|
+
elif args.git:
|
|
467
|
+
logger.info(f"Run analyzer on GIT: {args.git}")
|
|
468
|
+
credentials_number, commits_number = drill(args)
|
|
469
|
+
summary[f"Detected Credentials in {args.git} for {commits_number} commits "] = credentials_number
|
|
470
|
+
if 0 <= credentials_number:
|
|
471
|
+
result = EXIT_SUCCESS
|
|
356
472
|
elif args.export_config:
|
|
357
473
|
logging.info(f"Exporting default config to file: {args.export_config}")
|
|
358
474
|
config_dict = Util.json_load(APP_PATH / "secret" / "config.json")
|
credsweeper/app.py
CHANGED
|
@@ -11,18 +11,18 @@ from colorama import Style
|
|
|
11
11
|
# Directory of credsweeper sources MUST be placed before imports to avoid circular import error
|
|
12
12
|
APP_PATH = Path(__file__).resolve().parent
|
|
13
13
|
|
|
14
|
+
from credsweeper.scanner.scanner import Scanner
|
|
14
15
|
from credsweeper.common.constants import Severity, ThresholdPreset, DiffRowType, DEFAULT_ENCODING
|
|
15
|
-
from credsweeper.config import Config
|
|
16
|
-
from credsweeper.credentials import Candidate
|
|
16
|
+
from credsweeper.config.config import Config
|
|
17
|
+
from credsweeper.credentials.candidate import Candidate
|
|
18
|
+
from credsweeper.credentials.candidate_key import CandidateKey
|
|
19
|
+
from credsweeper.credentials.credential_manager import CredentialManager
|
|
17
20
|
from credsweeper.deep_scanner.deep_scanner import DeepScanner
|
|
18
21
|
from credsweeper.file_handler.content_provider import ContentProvider
|
|
19
|
-
from credsweeper.file_handler.diff_content_provider import DiffContentProvider
|
|
20
22
|
from credsweeper.file_handler.file_path_extractor import FilePathExtractor
|
|
21
23
|
from credsweeper.file_handler.abstract_provider import AbstractProvider
|
|
22
|
-
from credsweeper.file_handler.text_content_provider import TextContentProvider
|
|
23
|
-
from credsweeper.scanner import Scanner
|
|
24
24
|
from credsweeper.ml_model.ml_validator import MlValidator
|
|
25
|
-
from credsweeper.utils import Util
|
|
25
|
+
from credsweeper.utils.util import Util
|
|
26
26
|
|
|
27
27
|
logger = logging.getLogger(__name__)
|
|
28
28
|
|
|
@@ -52,11 +52,12 @@ class CredSweeper:
|
|
|
52
52
|
use_filters: bool = True,
|
|
53
53
|
pool_count: int = 1,
|
|
54
54
|
ml_batch_size: Optional[int] = None,
|
|
55
|
-
ml_threshold: Union[float, ThresholdPreset] = ThresholdPreset.medium,
|
|
55
|
+
ml_threshold: Union[int, float, ThresholdPreset] = ThresholdPreset.medium,
|
|
56
56
|
ml_config: Union[None, str, Path] = None,
|
|
57
57
|
ml_model: Union[None, str, Path] = None,
|
|
58
58
|
ml_providers: Optional[str] = None,
|
|
59
59
|
find_by_ext: bool = False,
|
|
60
|
+
pedantic: bool = False,
|
|
60
61
|
depth: int = 0,
|
|
61
62
|
doc: bool = False,
|
|
62
63
|
severity: Union[Severity, str] = Severity.INFO,
|
|
@@ -86,6 +87,7 @@ class CredSweeper:
|
|
|
86
87
|
ml_model: str or Path to set custom ml model
|
|
87
88
|
ml_providers: str - comma separated list with providers
|
|
88
89
|
find_by_ext: boolean - files will be reported by extension
|
|
90
|
+
pedantic: boolean - scan all files
|
|
89
91
|
depth: int - how deep container files will be scanned
|
|
90
92
|
doc: boolean - document-specific scanning
|
|
91
93
|
severity: Severity - minimum severity level of rule
|
|
@@ -103,6 +105,7 @@ class CredSweeper:
|
|
|
103
105
|
config_dict = self._get_config_dict(config_path=config_path,
|
|
104
106
|
use_filters=use_filters,
|
|
105
107
|
find_by_ext=find_by_ext,
|
|
108
|
+
pedantic=pedantic,
|
|
106
109
|
depth=depth,
|
|
107
110
|
doc=doc,
|
|
108
111
|
severity=_severity,
|
|
@@ -145,6 +148,7 @@ class CredSweeper:
|
|
|
145
148
|
config_path: Optional[str], #
|
|
146
149
|
use_filters: bool, #
|
|
147
150
|
find_by_ext: bool, #
|
|
151
|
+
pedantic: bool, #
|
|
148
152
|
depth: int, #
|
|
149
153
|
doc: bool, #
|
|
150
154
|
severity: Severity, #
|
|
@@ -155,6 +159,7 @@ class CredSweeper:
|
|
|
155
159
|
config_dict["use_filters"] = use_filters
|
|
156
160
|
config_dict["find_by_ext"] = find_by_ext
|
|
157
161
|
config_dict["size_limit"] = size_limit
|
|
162
|
+
config_dict["pedantic"] = pedantic
|
|
158
163
|
config_dict["depth"] = depth
|
|
159
164
|
config_dict["doc"] = doc
|
|
160
165
|
config_dict["severity"] = severity.value
|
|
@@ -169,7 +174,7 @@ class CredSweeper:
|
|
|
169
174
|
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
|
|
170
175
|
|
|
171
176
|
def _use_ml_validation(self) -> bool:
|
|
172
|
-
if isinstance(self.ml_threshold,
|
|
177
|
+
if isinstance(self.ml_threshold, int) and 0 == self.ml_threshold:
|
|
173
178
|
logger.info("ML validation is disabled")
|
|
174
179
|
return False
|
|
175
180
|
if not self.credential_manager.candidates:
|
|
@@ -215,7 +220,7 @@ class CredSweeper:
|
|
|
215
220
|
content_provider: path objects to scan
|
|
216
221
|
|
|
217
222
|
"""
|
|
218
|
-
_empty_list: Sequence[
|
|
223
|
+
_empty_list: Sequence[ContentProvider] = []
|
|
219
224
|
file_extractors = content_provider.get_scannable_files(self.config) if content_provider else _empty_list
|
|
220
225
|
if not file_extractors:
|
|
221
226
|
logger.info(f"No scannable targets for {len(content_provider.paths)} paths")
|
|
@@ -229,7 +234,7 @@ class CredSweeper:
|
|
|
229
234
|
|
|
230
235
|
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
|
|
231
236
|
|
|
232
|
-
def scan(self, content_providers: Sequence[
|
|
237
|
+
def scan(self, content_providers: Sequence[ContentProvider]) -> None:
|
|
233
238
|
"""Run scanning of files from an argument "content_providers".
|
|
234
239
|
|
|
235
240
|
Args:
|
|
@@ -243,7 +248,7 @@ class CredSweeper:
|
|
|
243
248
|
|
|
244
249
|
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
|
|
245
250
|
|
|
246
|
-
def __single_job_scan(self, content_providers: Sequence[
|
|
251
|
+
def __single_job_scan(self, content_providers: Sequence[ContentProvider]) -> None:
|
|
247
252
|
"""Performs scan in main thread"""
|
|
248
253
|
logger.info(f"Scan for {len(content_providers)} providers")
|
|
249
254
|
all_cred = self.files_scan(content_providers)
|
|
@@ -251,7 +256,7 @@ class CredSweeper:
|
|
|
251
256
|
|
|
252
257
|
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
|
|
253
258
|
|
|
254
|
-
def __multi_jobs_scan(self, content_providers: Sequence[
|
|
259
|
+
def __multi_jobs_scan(self, content_providers: Sequence[ContentProvider]) -> None:
|
|
255
260
|
"""Performs scan with multiple jobs"""
|
|
256
261
|
# use this separation to satisfy YAPF formatter
|
|
257
262
|
yapfix = "%(asctime)s | %(levelname)s | %(processName)s:%(threadName)s | %(filename)s:%(lineno)s | %(message)s"
|
|
@@ -265,7 +270,7 @@ class CredSweeper:
|
|
|
265
270
|
logger.info(f"Scan in {pool_count} processes for {len(content_providers)} providers")
|
|
266
271
|
with multiprocessing.get_context("spawn").Pool(processes=pool_count,
|
|
267
272
|
initializer=CredSweeper.pool_initializer,
|
|
268
|
-
initargs=(log_kwargs,
|
|
273
|
+
initargs=(log_kwargs,)) as pool: # yapf: disable
|
|
269
274
|
try:
|
|
270
275
|
for scan_results in pool.imap_unordered(self.files_scan,
|
|
271
276
|
(content_providers[x::pool_count] for x in range(pool_count))):
|
credsweeper/common/keyword_pattern.py
CHANGED
|
@@ -3,47 +3,48 @@ import re
|
|
|
3
3
|
|
|
4
4
|
class KeywordPattern:
|
|
5
5
|
"""Pattern set of keyword types"""
|
|
6
|
-
directive = r"(?P<directive>(?:
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
6
|
+
directive = r"(?P<directive>(?:" \
|
|
7
|
+
r"(?:[#%]define|define(?=(\s|\\{1,8}[tnr])*\()|%global)" \
|
|
8
|
+
r"(?:\s?\(|\s|\\{1,8}[tnr]){1,8}|\bset(?=\b|\w*(\s|\\{1,8}[tnr])*\()" \
|
|
9
|
+
r"))?"
|
|
10
|
+
key_left = r"(?:\\[nrt]|(\\\\*u00|%)[0-9a-f]{2}|\s)*" \
|
|
11
|
+
r"(?P<variable>(([\"'`]{1,8}[^:=\"'`}<>\\/&?]*|[^:=\"'`}<>\s()\\/&?;,%]*)"
|
|
12
|
+
# keyword will be inserted here
|
|
13
|
+
key_right = r"[^%:=\"'`<>({?!&;\n]{0,80}" \
|
|
13
14
|
r")" \
|
|
14
|
-
r"(&(quot|apos)
|
|
15
|
+
r"(&(quot|apos|#3[49]);|(\\\\*u00|%)[0-9a-f]{2}|[\"'`])*" \
|
|
15
16
|
r")" # <variable>
|
|
16
17
|
separator = r"(?(directive)|(\s|\\{1,8}[tnr])*\]?(\s|\\{1,8}[tnr])*)" \
|
|
17
18
|
r"(?P<separator>:(\s[a-z]{3,9}[?]?\s)?=|:(?!:)|=(>|>|(\\\\*u00|%)26gt;)|!==|!=|===|==|=~|=" \
|
|
18
|
-
r"|(?(directive)(
|
|
19
|
+
r"|(?(directive)(,|\\t|\s|\((?!\))){1,80}|%3d))" \
|
|
19
20
|
r"(\s|\\{1,8}[tnr])*"
|
|
20
21
|
# might be curly, square or parenthesis with words before
|
|
21
22
|
wrap = r"(?P<wrap>(" \
|
|
22
|
-
r"(
|
|
23
|
+
r"((\s|\\{1,8}[tnr]|new|byte|char|string|\[\]){1,8})?" \
|
|
23
24
|
r"(?P<get>([_a-z][0-9a-z_.\[\]]*\.)get|(os\.)?getenv)?" \
|
|
24
25
|
r"([0-9a-z_.]|::|-(>|>))*" \
|
|
25
26
|
r"\s*" \
|
|
26
27
|
r"(\[(?!\])|\((?!\))|\{(?!\}))" \
|
|
27
28
|
r"(\s|\\{1,8}[tnr])*" \
|
|
28
|
-
r"(?(get)('[^']
|
|
29
|
+
r"(?(get)('[^']{1,31}'|\"[^\"]{1,31}\")\s*,\s*|)" \
|
|
29
30
|
r"([0-9a-z_]{1,32}\s*[:=]\s*)?" \
|
|
30
31
|
r"){1,8})?"
|
|
31
|
-
string_prefix = r"(((b|r|br|rb|u|f|rf|fr|l|@)(?=(\\*[
|
|
32
|
-
left_quote = r"(?P<value_leftquote>((?P<esq>\\{1,8})?([
|
|
32
|
+
string_prefix = r"(((b|r|br|rb|u|f|rf|fr|l|@)(?=(\\*[\"'`])))?"
|
|
33
|
+
left_quote = r"(?P<value_leftquote>((?P<esq>\\{1,8})?([\"'`]|&(quot|apos|#3[49]);)){1,4}))?"
|
|
33
34
|
# Authentication scheme ( oauth | basic | bearer | apikey ) precedes to credential
|
|
34
35
|
auth_keywords = r"(\s?(oauth|bot|basic|bearer|apikey|accesskey|ssws|ntlm)\s)?"
|
|
35
36
|
value = r"(?P<value>" \
|
|
36
37
|
r"(?(value_leftquote)" \
|
|
37
38
|
r"(" \
|
|
38
39
|
r"(?!(?P=value_leftquote))" \
|
|
39
|
-
r"(?(esq)((?!(?P=esq)([
|
|
40
|
+
r"(?(esq)((?!(?P=esq)([\"'`]|&(quot|apos|#3[49]);)).)|((?!(?P=value_leftquote)).)))" \
|
|
40
41
|
r"|" \
|
|
41
|
-
r"(?!&(quot|apos);)" \
|
|
42
|
-
r"(\\{1,8}([ tnr]|[^\s
|
|
42
|
+
r"(?!&(quot|apos|#3[49]);)" \
|
|
43
|
+
r"(\\{1,8}([ tnr]|[^\s\"'`])" \
|
|
43
44
|
r"|" \
|
|
44
45
|
r"(?P<url_esc>%[0-9a-f]{2})" \
|
|
45
46
|
r"|" \
|
|
46
|
-
r"(?(url_esc)[^\s
|
|
47
|
+
r"(?(url_esc)[^\s\"'`,;\\&]|[^\s\"'`,;\\])" \
|
|
47
48
|
r")" \
|
|
48
49
|
r"){4,8000}" \
|
|
49
50
|
r"|" \
|
|
@@ -67,7 +68,7 @@ class KeywordPattern:
|
|
|
67
68
|
expression = ''.join([ #
|
|
68
69
|
cls.directive, #
|
|
69
70
|
cls.key_left, #
|
|
70
|
-
keyword, #
|
|
71
|
+
fr"(?P<keyword>{keyword})", # named group required
|
|
71
72
|
cls.key_right, #
|
|
72
73
|
cls.separator, #
|
|
73
74
|
cls.wrap, #
|