credsweeper 1.12.1__py3-none-any.whl → 1.13.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of credsweeper might be problematic. Click here for more details.
- credsweeper/__init__.py +1 -1
- credsweeper/__main__.py +23 -13
- credsweeper/app.py +7 -2
- credsweeper/common/keyword_pattern.py +6 -3
- credsweeper/common/morpheme_checklist.txt +26 -6
- credsweeper/config/config.py +1 -0
- credsweeper/credentials/line_data.py +21 -6
- credsweeper/deep_scanner/abstract_scanner.py +1 -0
- credsweeper/deep_scanner/csv_scanner.py +71 -0
- credsweeper/deep_scanner/deep_scanner.py +19 -9
- credsweeper/deep_scanner/jks_scanner.py +11 -2
- credsweeper/deep_scanner/pkcs_scanner.py +4 -0
- credsweeper/deep_scanner/rtf_scanner.py +41 -0
- credsweeper/deep_scanner/strings_scanner.py +52 -0
- credsweeper/file_handler/byte_content_provider.py +10 -1
- credsweeper/file_handler/file_path_extractor.py +2 -0
- credsweeper/file_handler/text_content_provider.py +7 -1
- credsweeper/filters/__init__.py +1 -1
- credsweeper/filters/group/token_pattern.py +2 -2
- credsweeper/filters/group/weird_base36_token.py +3 -3
- credsweeper/filters/group/weird_base64_token.py +2 -2
- credsweeper/filters/value_camel_case_check.py +2 -2
- credsweeper/filters/value_file_path_check.py +5 -3
- credsweeper/filters/value_github_check.py +3 -2
- credsweeper/filters/value_morphemes_check.py +43 -0
- credsweeper/filters/value_string_type_check.py +1 -0
- credsweeper/ml_model/features/feature.py +1 -18
- credsweeper/ml_model/features/file_extension.py +1 -1
- credsweeper/ml_model/features/has_html_tag.py +10 -8
- credsweeper/ml_model/features/is_secret_numeric.py +4 -3
- credsweeper/ml_model/features/rule_name.py +1 -1
- credsweeper/ml_model/features/word_in.py +9 -32
- credsweeper/ml_model/features/word_in_path.py +2 -3
- credsweeper/ml_model/features/word_in_postamble.py +1 -4
- credsweeper/ml_model/features/word_in_preamble.py +1 -4
- credsweeper/ml_model/features/word_in_transition.py +1 -4
- credsweeper/ml_model/features/word_in_value.py +2 -3
- credsweeper/ml_model/features/word_in_variable.py +2 -3
- credsweeper/ml_model/ml_config.json +15 -8
- credsweeper/ml_model/ml_model.onnx +0 -0
- credsweeper/ml_model/ml_validator.py +1 -1
- credsweeper/rules/config.yaml +174 -207
- credsweeper/scanner/scanner.py +12 -7
- credsweeper/secret/config.json +18 -5
- credsweeper/utils/util.py +21 -18
- {credsweeper-1.12.1.dist-info → credsweeper-1.13.3.dist-info}/METADATA +7 -7
- {credsweeper-1.12.1.dist-info → credsweeper-1.13.3.dist-info}/RECORD +50 -47
- credsweeper/filters/value_couple_keyword_check.py +0 -28
- {credsweeper-1.12.1.dist-info → credsweeper-1.13.3.dist-info}/WHEEL +0 -0
- {credsweeper-1.12.1.dist-info → credsweeper-1.13.3.dist-info}/entry_points.txt +0 -0
- {credsweeper-1.12.1.dist-info → credsweeper-1.13.3.dist-info}/licenses/LICENSE +0 -0
credsweeper/__init__.py
CHANGED
credsweeper/__main__.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import binascii
|
|
2
|
+
import contextlib
|
|
2
3
|
import logging
|
|
3
4
|
import os
|
|
4
5
|
import sys
|
|
@@ -34,24 +35,24 @@ def positive_int(value: Any) -> int:
|
|
|
34
35
|
return int_value
|
|
35
36
|
|
|
36
37
|
|
|
37
|
-
def
|
|
38
|
+
def threshold_or_float_or_zero(arg: str) -> Union[int, float, ThresholdPreset]:
|
|
38
39
|
"""Return ThresholdPreset or a float from the input string
|
|
39
40
|
|
|
40
41
|
Args:
|
|
41
42
|
arg: string that either a float or one of allowed values in ThresholdPreset
|
|
42
43
|
|
|
43
44
|
Returns:
|
|
44
|
-
float if arg convertible to float, ThresholdPreset if one of the allowed values
|
|
45
|
+
int = 0 to disable ML validator, float if arg convertible to float, ThresholdPreset if one of the allowed values
|
|
45
46
|
|
|
46
47
|
Raises:
|
|
47
48
|
ArgumentTypeError: if arg cannot be interpreted as float or ThresholdPreset
|
|
48
49
|
|
|
49
50
|
"""
|
|
50
51
|
allowed_presents = [e.value for e in ThresholdPreset]
|
|
51
|
-
|
|
52
|
+
if '0' == arg:
|
|
53
|
+
return 0
|
|
54
|
+
with contextlib.suppress(ValueError):
|
|
52
55
|
return float(arg) # try convert to float
|
|
53
|
-
except ValueError:
|
|
54
|
-
pass
|
|
55
56
|
if arg in allowed_presents:
|
|
56
57
|
return ThresholdPreset[arg]
|
|
57
58
|
raise ArgumentTypeError(f"value must be a float or one of {allowed_presents}")
|
|
@@ -158,6 +159,10 @@ def get_arguments() -> Namespace:
|
|
|
158
159
|
help="find files by predefined extension",
|
|
159
160
|
dest="find_by_ext",
|
|
160
161
|
action="store_true")
|
|
162
|
+
parser.add_argument("--pedantic",
|
|
163
|
+
help="process files without extension",
|
|
164
|
+
action=BooleanOptionalAction,
|
|
165
|
+
default=False)
|
|
161
166
|
parser.add_argument("--depth",
|
|
162
167
|
help="additional recursive search in data (experimental)",
|
|
163
168
|
type=positive_int,
|
|
@@ -172,11 +177,11 @@ def get_arguments() -> Namespace:
|
|
|
172
177
|
"The lower the threshold - the more credentials will be reported. "
|
|
173
178
|
f"Allowed values: float between 0 and 1, or any of {[e.value for e in ThresholdPreset]} "
|
|
174
179
|
"(default: medium)",
|
|
175
|
-
type=
|
|
180
|
+
type=threshold_or_float_or_zero,
|
|
176
181
|
default=ThresholdPreset.medium,
|
|
177
182
|
dest="ml_threshold",
|
|
178
183
|
required=False,
|
|
179
|
-
metavar="
|
|
184
|
+
metavar="THRESHOLD_OR_FLOAT_OR_ZERO")
|
|
180
185
|
parser.add_argument("--ml_batch_size",
|
|
181
186
|
"-b",
|
|
182
187
|
help="batch size for model inference (default: 16)",
|
|
@@ -299,6 +304,7 @@ def get_credsweeper(args: Namespace) -> CredSweeper:
|
|
|
299
304
|
ml_model=args.ml_model,
|
|
300
305
|
ml_providers=args.ml_providers,
|
|
301
306
|
find_by_ext=args.find_by_ext,
|
|
307
|
+
pedantic=args.pedantic,
|
|
302
308
|
depth=args.depth,
|
|
303
309
|
doc=args.doc,
|
|
304
310
|
severity=args.severity,
|
|
@@ -335,7 +341,8 @@ def scan(args: Namespace, content_provider: AbstractProvider) -> int:
|
|
|
335
341
|
def get_commit_providers(commit: Commit, repo: Repo) -> Sequence[ByteContentProvider]:
|
|
336
342
|
"""Process a commit and for providers"""
|
|
337
343
|
result = {}
|
|
338
|
-
|
|
344
|
+
# use the hardcoded sha1 until sha256 objects are not supported by GitPython
|
|
345
|
+
ancestors = commit.parents or [repo.tree("4b825dc642cb6eb9a060e54bf8d69288fbee4904")]
|
|
339
346
|
for parent in ancestors:
|
|
340
347
|
for diff in parent.diff(commit):
|
|
341
348
|
# only result files
|
|
@@ -372,9 +379,11 @@ def drill(args: Namespace) -> Tuple[int, int]:
|
|
|
372
379
|
# then - credsweeper
|
|
373
380
|
credsweeper = get_credsweeper(args)
|
|
374
381
|
# use flat iterations to avoid recursive limits
|
|
375
|
-
to_scan =
|
|
382
|
+
to_scan = set(commits_sha1)
|
|
376
383
|
# local speedup for already scanned commits - avoid file system interactive
|
|
377
384
|
scanned = set()
|
|
385
|
+
# to avoid double-check
|
|
386
|
+
skipped = set()
|
|
378
387
|
while to_scan:
|
|
379
388
|
commit_sha1 = to_scan.pop()
|
|
380
389
|
if commit_sha1 in scanned:
|
|
@@ -382,8 +391,8 @@ def drill(args: Namespace) -> Tuple[int, int]:
|
|
|
382
391
|
continue
|
|
383
392
|
commit = repo.commit(commit_sha1)
|
|
384
393
|
if commit.parents:
|
|
385
|
-
# add parents
|
|
386
|
-
to_scan.
|
|
394
|
+
# add parents only when they were not skipped or scanned previously
|
|
395
|
+
to_scan.update(x.hexsha for x in commit.parents if x.hexsha not in skipped and x.hexsha not in scanned)
|
|
387
396
|
# check whether the commit has been checked and the report is present
|
|
388
397
|
skip_already_scanned = False
|
|
389
398
|
if args.json_filename:
|
|
@@ -401,9 +410,10 @@ def drill(args: Namespace) -> Tuple[int, int]:
|
|
|
401
410
|
else:
|
|
402
411
|
credsweeper.xlsx_filename = xlsx_path
|
|
403
412
|
if skip_already_scanned:
|
|
404
|
-
|
|
413
|
+
skipped.add(commit_sha1)
|
|
414
|
+
logger.info("Skip already scanned commit: %s %s", commit_sha1, commit.committed_datetime.isoformat())
|
|
405
415
|
continue
|
|
406
|
-
logger.info("Scan commit: %s", commit_sha1)
|
|
416
|
+
logger.info("Scan commit: %s %s", commit_sha1, commit.committed_datetime.isoformat())
|
|
407
417
|
# prepare all files to scan in the commit with bytes->IO transformation to avoid a multiprocess issue
|
|
408
418
|
if providers := get_commit_providers(commit, repo):
|
|
409
419
|
credsweeper.credential_manager.candidates.clear()
|
credsweeper/app.py
CHANGED
|
@@ -52,11 +52,12 @@ class CredSweeper:
|
|
|
52
52
|
use_filters: bool = True,
|
|
53
53
|
pool_count: int = 1,
|
|
54
54
|
ml_batch_size: Optional[int] = None,
|
|
55
|
-
ml_threshold: Union[float, ThresholdPreset] = ThresholdPreset.medium,
|
|
55
|
+
ml_threshold: Union[int, float, ThresholdPreset] = ThresholdPreset.medium,
|
|
56
56
|
ml_config: Union[None, str, Path] = None,
|
|
57
57
|
ml_model: Union[None, str, Path] = None,
|
|
58
58
|
ml_providers: Optional[str] = None,
|
|
59
59
|
find_by_ext: bool = False,
|
|
60
|
+
pedantic: bool = False,
|
|
60
61
|
depth: int = 0,
|
|
61
62
|
doc: bool = False,
|
|
62
63
|
severity: Union[Severity, str] = Severity.INFO,
|
|
@@ -86,6 +87,7 @@ class CredSweeper:
|
|
|
86
87
|
ml_model: str or Path to set custom ml model
|
|
87
88
|
ml_providers: str - comma separated list with providers
|
|
88
89
|
find_by_ext: boolean - files will be reported by extension
|
|
90
|
+
pedantic: boolean - scan all files
|
|
89
91
|
depth: int - how deep container files will be scanned
|
|
90
92
|
doc: boolean - document-specific scanning
|
|
91
93
|
severity: Severity - minimum severity level of rule
|
|
@@ -103,6 +105,7 @@ class CredSweeper:
|
|
|
103
105
|
config_dict = self._get_config_dict(config_path=config_path,
|
|
104
106
|
use_filters=use_filters,
|
|
105
107
|
find_by_ext=find_by_ext,
|
|
108
|
+
pedantic=pedantic,
|
|
106
109
|
depth=depth,
|
|
107
110
|
doc=doc,
|
|
108
111
|
severity=_severity,
|
|
@@ -145,6 +148,7 @@ class CredSweeper:
|
|
|
145
148
|
config_path: Optional[str], #
|
|
146
149
|
use_filters: bool, #
|
|
147
150
|
find_by_ext: bool, #
|
|
151
|
+
pedantic: bool, #
|
|
148
152
|
depth: int, #
|
|
149
153
|
doc: bool, #
|
|
150
154
|
severity: Severity, #
|
|
@@ -155,6 +159,7 @@ class CredSweeper:
|
|
|
155
159
|
config_dict["use_filters"] = use_filters
|
|
156
160
|
config_dict["find_by_ext"] = find_by_ext
|
|
157
161
|
config_dict["size_limit"] = size_limit
|
|
162
|
+
config_dict["pedantic"] = pedantic
|
|
158
163
|
config_dict["depth"] = depth
|
|
159
164
|
config_dict["doc"] = doc
|
|
160
165
|
config_dict["severity"] = severity.value
|
|
@@ -169,7 +174,7 @@ class CredSweeper:
|
|
|
169
174
|
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
|
|
170
175
|
|
|
171
176
|
def _use_ml_validation(self) -> bool:
|
|
172
|
-
if isinstance(self.ml_threshold,
|
|
177
|
+
if isinstance(self.ml_threshold, int) and 0 == self.ml_threshold:
|
|
173
178
|
logger.info("ML validation is disabled")
|
|
174
179
|
return False
|
|
175
180
|
if not self.credential_manager.candidates:
|
|
@@ -3,7 +3,10 @@ import re
|
|
|
3
3
|
|
|
4
4
|
class KeywordPattern:
|
|
5
5
|
"""Pattern set of keyword types"""
|
|
6
|
-
directive = r"(?P<directive>(?:
|
|
6
|
+
directive = r"(?P<directive>(?:" \
|
|
7
|
+
r"(?:[#%]define|define(?=(\s|\\{1,8}[tnr])*\()|%global)" \
|
|
8
|
+
r"(?:\s?\(|\s|\\{1,8}[tnr]){1,8}|\bset(?=\b|\w*(\s|\\{1,8}[tnr])*\()" \
|
|
9
|
+
r"))?"
|
|
7
10
|
key_left = r"(?:\\[nrt]|(\\\\*u00|%)[0-9a-f]{2}|\s)*" \
|
|
8
11
|
r"(?P<variable>(([\"'`]{1,8}[^:=\"'`}<>\\/&?]*|[^:=\"'`}<>\s()\\/&?;,%]*)"
|
|
9
12
|
# keyword will be inserted here
|
|
@@ -13,7 +16,7 @@ class KeywordPattern:
|
|
|
13
16
|
r")" # <variable>
|
|
14
17
|
separator = r"(?(directive)|(\s|\\{1,8}[tnr])*\]?(\s|\\{1,8}[tnr])*)" \
|
|
15
18
|
r"(?P<separator>:(\s[a-z]{3,9}[?]?\s)?=|:(?!:)|=(>|>|(\\\\*u00|%)26gt;)|!==|!=|===|==|=~|=" \
|
|
16
|
-
r"|(?(directive)(
|
|
19
|
+
r"|(?(directive)(,|\\t|\s|\((?!\))){1,80}|%3d))" \
|
|
17
20
|
r"(\s|\\{1,8}[tnr])*"
|
|
18
21
|
# might be curly, square or parenthesis with words before
|
|
19
22
|
wrap = r"(?P<wrap>(" \
|
|
@@ -23,7 +26,7 @@ class KeywordPattern:
|
|
|
23
26
|
r"\s*" \
|
|
24
27
|
r"(\[(?!\])|\((?!\))|\{(?!\}))" \
|
|
25
28
|
r"(\s|\\{1,8}[tnr])*" \
|
|
26
|
-
r"(?(get)('[^']
|
|
29
|
+
r"(?(get)('[^']{1,31}'|\"[^\"]{1,31}\")\s*,\s*|)" \
|
|
27
30
|
r"([0-9a-z_]{1,32}\s*[:=]\s*)?" \
|
|
28
31
|
r"){1,8})?"
|
|
29
32
|
string_prefix = r"(((b|r|br|rb|u|f|rf|fr|l|@)(?=(\\*[\"'`])))?"
|
|
@@ -14,11 +14,15 @@
|
|
|
14
14
|
/var
|
|
15
15
|
000
|
|
16
16
|
111
|
|
17
|
+
14159265
|
|
18
|
+
18284590
|
|
17
19
|
222
|
|
18
20
|
333
|
|
19
21
|
444
|
|
20
22
|
555
|
|
23
|
+
65358979
|
|
21
24
|
666
|
|
25
|
+
71828182
|
|
22
26
|
777
|
|
23
27
|
80211
|
|
24
28
|
888
|
|
@@ -195,7 +199,7 @@ aux
|
|
|
195
199
|
avail
|
|
196
200
|
avatar
|
|
197
201
|
aver
|
|
198
|
-
|
|
202
|
+
awesom
|
|
199
203
|
axis
|
|
200
204
|
azure
|
|
201
205
|
back
|
|
@@ -227,12 +231,14 @@ bind
|
|
|
227
231
|
bio
|
|
228
232
|
bipol
|
|
229
233
|
bit
|
|
234
|
+
bixby
|
|
230
235
|
black
|
|
231
236
|
blan
|
|
232
237
|
bless
|
|
233
238
|
blic
|
|
234
239
|
blish
|
|
235
240
|
blob
|
|
241
|
+
blood
|
|
236
242
|
blue
|
|
237
243
|
board
|
|
238
244
|
bob
|
|
@@ -243,7 +249,7 @@ boost
|
|
|
243
249
|
boot
|
|
244
250
|
boss
|
|
245
251
|
bot
|
|
246
|
-
|
|
252
|
+
boun
|
|
247
253
|
box
|
|
248
254
|
branch
|
|
249
255
|
break
|
|
@@ -497,6 +503,7 @@ dust
|
|
|
497
503
|
dvb
|
|
498
504
|
dynamic
|
|
499
505
|
dynamo
|
|
506
|
+
eadbee
|
|
500
507
|
easin
|
|
501
508
|
easy
|
|
502
509
|
ecdhe
|
|
@@ -607,6 +614,7 @@ fleet
|
|
|
607
614
|
flick
|
|
608
615
|
flix
|
|
609
616
|
float
|
|
617
|
+
flood
|
|
610
618
|
floor
|
|
611
619
|
fluent
|
|
612
620
|
fluid
|
|
@@ -615,7 +623,7 @@ focus
|
|
|
615
623
|
foo
|
|
616
624
|
for
|
|
617
625
|
fossil
|
|
618
|
-
|
|
626
|
+
foun
|
|
619
627
|
fpga
|
|
620
628
|
frame
|
|
621
629
|
free
|
|
@@ -648,6 +656,7 @@ git
|
|
|
648
656
|
given
|
|
649
657
|
global
|
|
650
658
|
gobble
|
|
659
|
+
good
|
|
651
660
|
google
|
|
652
661
|
grab
|
|
653
662
|
grace
|
|
@@ -703,6 +712,7 @@ home
|
|
|
703
712
|
hook
|
|
704
713
|
horizon
|
|
705
714
|
host
|
|
715
|
+
houn
|
|
706
716
|
hours
|
|
707
717
|
html
|
|
708
718
|
http
|
|
@@ -789,6 +799,7 @@ jpg_
|
|
|
789
799
|
json
|
|
790
800
|
jump
|
|
791
801
|
justif
|
|
802
|
+
kafka
|
|
792
803
|
kerberos
|
|
793
804
|
kernel
|
|
794
805
|
key
|
|
@@ -798,6 +809,7 @@ kind
|
|
|
798
809
|
kinesis
|
|
799
810
|
kirk
|
|
800
811
|
know
|
|
812
|
+
knox
|
|
801
813
|
kris
|
|
802
814
|
lab
|
|
803
815
|
lag
|
|
@@ -854,7 +866,7 @@ local
|
|
|
854
866
|
lock
|
|
855
867
|
log
|
|
856
868
|
long
|
|
857
|
-
|
|
869
|
+
look
|
|
858
870
|
loop
|
|
859
871
|
loose
|
|
860
872
|
lost
|
|
@@ -947,6 +959,7 @@ ndow
|
|
|
947
959
|
ned
|
|
948
960
|
need
|
|
949
961
|
neigh
|
|
962
|
+
neo4j
|
|
950
963
|
ner
|
|
951
964
|
net
|
|
952
965
|
neutr
|
|
@@ -991,6 +1004,7 @@ oncat
|
|
|
991
1004
|
one
|
|
992
1005
|
onfig
|
|
993
1006
|
only
|
|
1007
|
+
ookup
|
|
994
1008
|
open
|
|
995
1009
|
opt/
|
|
996
1010
|
opted
|
|
@@ -1008,6 +1022,7 @@ ormat
|
|
|
1008
1022
|
orph
|
|
1009
1023
|
otorola
|
|
1010
1024
|
ottle
|
|
1025
|
+
ound
|
|
1011
1026
|
ously
|
|
1012
1027
|
out
|
|
1013
1028
|
over
|
|
@@ -1067,6 +1082,7 @@ pose
|
|
|
1067
1082
|
posit
|
|
1068
1083
|
possib
|
|
1069
1084
|
post
|
|
1085
|
+
poun
|
|
1070
1086
|
power
|
|
1071
1087
|
pre_
|
|
1072
1088
|
pred
|
|
@@ -1211,7 +1227,7 @@ rotat
|
|
|
1211
1227
|
rotocol
|
|
1212
1228
|
rottl
|
|
1213
1229
|
rough
|
|
1214
|
-
|
|
1230
|
+
roun
|
|
1215
1231
|
roup
|
|
1216
1232
|
row
|
|
1217
1233
|
rroga
|
|
@@ -1317,9 +1333,10 @@ sock
|
|
|
1317
1333
|
soft
|
|
1318
1334
|
solid
|
|
1319
1335
|
solve
|
|
1336
|
+
some
|
|
1320
1337
|
sony
|
|
1321
1338
|
sort
|
|
1322
|
-
|
|
1339
|
+
soun
|
|
1323
1340
|
source
|
|
1324
1341
|
space
|
|
1325
1342
|
spacing
|
|
@@ -1429,6 +1446,7 @@ tio
|
|
|
1429
1446
|
tish
|
|
1430
1447
|
title
|
|
1431
1448
|
titud
|
|
1449
|
+
tizen
|
|
1432
1450
|
tmp/
|
|
1433
1451
|
to_
|
|
1434
1452
|
tod
|
|
@@ -1440,6 +1458,7 @@ topic
|
|
|
1440
1458
|
tory
|
|
1441
1459
|
total
|
|
1442
1460
|
touch
|
|
1461
|
+
tour
|
|
1443
1462
|
trace
|
|
1444
1463
|
tract
|
|
1445
1464
|
traffic
|
|
@@ -1573,6 +1592,7 @@ yield
|
|
|
1573
1592
|
you
|
|
1574
1593
|
zeppelin
|
|
1575
1594
|
zero
|
|
1595
|
+
zigbee
|
|
1576
1596
|
zing
|
|
1577
1597
|
zona
|
|
1578
1598
|
zorro
|
credsweeper/config/config.py
CHANGED
|
@@ -35,6 +35,7 @@ class Config:
|
|
|
35
35
|
self.candidate_output: List[str] = config["candidate_output"]
|
|
36
36
|
self.find_by_ext: bool = config["find_by_ext"]
|
|
37
37
|
self.size_limit: Optional[int] = parse_size(config["size_limit"]) if config["size_limit"] is not None else None
|
|
38
|
+
self.pedantic: bool = bool(config["pedantic"])
|
|
38
39
|
self.depth: int = int(config["depth"])
|
|
39
40
|
self.doc: bool = config["doc"]
|
|
40
41
|
self.severity: Severity = Severity.get(config.get("severity"))
|
|
@@ -163,6 +163,7 @@ class LineData:
|
|
|
163
163
|
self.clean_url_parameters()
|
|
164
164
|
self.clean_bash_parameters()
|
|
165
165
|
self.clean_toml_parameters()
|
|
166
|
+
self.clean_tag_parameters()
|
|
166
167
|
if 0 <= self.value_start and 0 <= self.value_end and len(self.value) < len(_value):
|
|
167
168
|
start = _value.find(self.value)
|
|
168
169
|
self.value_start += start
|
|
@@ -196,15 +197,14 @@ class LineData:
|
|
|
196
197
|
If line seem to be a URL - split by & character.
|
|
197
198
|
Variable should be right most value after & or ? ([-1]). And value should be left most before & ([0])
|
|
198
199
|
"""
|
|
199
|
-
|
|
200
|
+
# skip sanitize in case of URL credential rule - the regex is mature enough
|
|
201
|
+
if self.check_url_part() and not self.variable.endswith("://"):
|
|
200
202
|
# all checks have passed - line before the value may be a URL
|
|
201
203
|
self.variable = self.variable.rsplit('&')[-1].rsplit('?')[-1].rsplit(';')[-1]
|
|
202
204
|
self.value = self.value.split('&', maxsplit=1)[0].split(';', maxsplit=1)[0].split('#', maxsplit=1)[0]
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
self.value = self.
|
|
206
|
-
if self._3d_escaped_separator:
|
|
207
|
-
self.value = self.url_percent_split.split(self.value)[0]
|
|
205
|
+
self.value = self.url_unicode_split.split(self.value)[0]
|
|
206
|
+
if self._3d_escaped_separator:
|
|
207
|
+
self.value = self.url_percent_split.split(self.value)[0]
|
|
208
208
|
|
|
209
209
|
def clean_bash_parameters(self) -> None:
|
|
210
210
|
"""Split variable and value by bash special characters, if line assumed to be CLI command."""
|
|
@@ -232,6 +232,21 @@ class LineData:
|
|
|
232
232
|
self.value = self.value[:-1]
|
|
233
233
|
cleaning_required = True
|
|
234
234
|
|
|
235
|
+
def clean_tag_parameters(self) -> None:
|
|
236
|
+
"""Remove closing tag from value if the opened is somewhere before in line"""
|
|
237
|
+
cleaning_required = self.value and self.value.endswith('>')
|
|
238
|
+
while cleaning_required:
|
|
239
|
+
closing_tag_pos = self.value.rfind("</")
|
|
240
|
+
if 0 <= closing_tag_pos:
|
|
241
|
+
# use `<a` to avoid tag parameters
|
|
242
|
+
opening_tag_prefix = f"<{self.value[closing_tag_pos + 2:-1]}"
|
|
243
|
+
if cleaning_required := (opening_tag_prefix not in self.value
|
|
244
|
+
and 0 <= self.line.find(opening_tag_prefix, 0, self.value_start)):
|
|
245
|
+
self.value = self.value[:closing_tag_pos]
|
|
246
|
+
cleaning_required = self.value and self.value.endswith('>')
|
|
247
|
+
else:
|
|
248
|
+
break
|
|
249
|
+
|
|
235
250
|
def sanitize_variable(self) -> None:
|
|
236
251
|
"""Remove trailing spaces, dashes and quotations around the variable. Correct position."""
|
|
237
252
|
sanitized_var_len = 0
|
|
@@ -51,6 +51,7 @@ class AbstractScanner(ABC):
|
|
|
51
51
|
@abstractmethod
|
|
52
52
|
def get_deep_scanners(data: bytes, descriptor: Descriptor, depth: int) -> Tuple[List[Any], List[Any]]:
|
|
53
53
|
"""Returns possibly scan methods for the data depends on content and fallback scanners"""
|
|
54
|
+
raise NotImplementedError(__name__)
|
|
54
55
|
|
|
55
56
|
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
|
|
56
57
|
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import io
|
|
3
|
+
import logging
|
|
4
|
+
from abc import ABC
|
|
5
|
+
from typing import List, Optional, Dict, Any
|
|
6
|
+
|
|
7
|
+
from credsweeper.common.constants import MAX_LINE_LENGTH
|
|
8
|
+
from credsweeper.credentials.candidate import Candidate
|
|
9
|
+
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
|
|
10
|
+
from credsweeper.file_handler.data_content_provider import DataContentProvider
|
|
11
|
+
from credsweeper.file_handler.struct_content_provider import StructContentProvider
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class CsvScanner(AbstractScanner, ABC):
|
|
17
|
+
"""Implements CSV scanning"""
|
|
18
|
+
|
|
19
|
+
sniffer = csv.Sniffer()
|
|
20
|
+
# do not use space as separator to avoid hallucinations
|
|
21
|
+
delimiters = ",;\t|\x1F"
|
|
22
|
+
|
|
23
|
+
@classmethod
|
|
24
|
+
def get_structure(cls, text: str) -> List[Dict[str, Any]]:
|
|
25
|
+
"""Reads a text as CSV standard with guessed dialect"""
|
|
26
|
+
# windows style \r\n
|
|
27
|
+
first_line_end = text.find('\r', 0, MAX_LINE_LENGTH)
|
|
28
|
+
line_terminator = "\r\n"
|
|
29
|
+
if 0 > first_line_end:
|
|
30
|
+
# unix style \n
|
|
31
|
+
first_line_end = text.find('\n', 0, MAX_LINE_LENGTH)
|
|
32
|
+
line_terminator = "\n"
|
|
33
|
+
if 0 > first_line_end:
|
|
34
|
+
raise ValueError(f"No suitable line end found in {MAX_LINE_LENGTH} symbols")
|
|
35
|
+
|
|
36
|
+
first_line = text[:first_line_end]
|
|
37
|
+
dialect = cls.sniffer.sniff(first_line, delimiters=cls.delimiters)
|
|
38
|
+
rows = []
|
|
39
|
+
reader = csv.DictReader(io.StringIO(text),
|
|
40
|
+
delimiter=dialect.delimiter,
|
|
41
|
+
lineterminator=line_terminator,
|
|
42
|
+
strict=True)
|
|
43
|
+
# check the constant columns number for all rows
|
|
44
|
+
fields_number = sum(1 for x in reader.fieldnames if x is not None)
|
|
45
|
+
for row in reader:
|
|
46
|
+
if not isinstance(row, dict):
|
|
47
|
+
raise ValueError(f"ERROR: wrong row '{row}'")
|
|
48
|
+
if len(row) != fields_number or any(x is None for x in row.values()):
|
|
49
|
+
# None means no separator used
|
|
50
|
+
raise ValueError(f"Different columns number in row '{row}' - mismatch {fields_number}")
|
|
51
|
+
rows.append(row)
|
|
52
|
+
return rows
|
|
53
|
+
|
|
54
|
+
def data_scan(
|
|
55
|
+
self, #
|
|
56
|
+
data_provider: DataContentProvider, #
|
|
57
|
+
depth: int, #
|
|
58
|
+
recursive_limit_size: int) -> Optional[List[Candidate]]:
|
|
59
|
+
"""Tries to scan each row as structure with column name in key"""
|
|
60
|
+
try:
|
|
61
|
+
if rows := self.get_structure(data_provider.text):
|
|
62
|
+
struct_content_provider = StructContentProvider(struct=rows,
|
|
63
|
+
file_path=data_provider.file_path,
|
|
64
|
+
file_type=data_provider.file_type,
|
|
65
|
+
info=f"{data_provider.info}|CSV")
|
|
66
|
+
new_limit = recursive_limit_size - sum(len(x) for x in rows)
|
|
67
|
+
struct_candidates = self.structure_scan(struct_content_provider, depth, new_limit)
|
|
68
|
+
return struct_candidates
|
|
69
|
+
except Exception as csv_exc:
|
|
70
|
+
logger.debug(f"{data_provider.file_path}:{csv_exc}")
|
|
71
|
+
return None
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from typing import List, Any, Tuple
|
|
3
3
|
|
|
4
|
-
from credsweeper.common.constants import MIN_DATA_LEN
|
|
5
4
|
from credsweeper.config.config import Config
|
|
6
5
|
from credsweeper.scanner.scanner import Scanner
|
|
7
6
|
from credsweeper.utils.util import Util
|
|
8
7
|
from .byte_scanner import ByteScanner
|
|
9
8
|
from .bzip2_scanner import Bzip2Scanner
|
|
9
|
+
from .csv_scanner import CsvScanner
|
|
10
10
|
from .deb_scanner import DebScanner
|
|
11
11
|
from .docx_scanner import DocxScanner
|
|
12
12
|
from .eml_scanner import EmlScanner
|
|
@@ -23,7 +23,9 @@ from .pdf_scanner import PdfScanner
|
|
|
23
23
|
from .pkcs_scanner import PkcsScanner
|
|
24
24
|
from .pptx_scanner import PptxScanner
|
|
25
25
|
from .rpm_scanner import RpmScanner
|
|
26
|
+
from .rtf_scanner import RtfScanner
|
|
26
27
|
from .sqlite3_scanner import Sqlite3Scanner
|
|
28
|
+
from .strings_scanner import StringsScanner
|
|
27
29
|
from .tar_scanner import TarScanner
|
|
28
30
|
from .tmx_scanner import TmxScanner
|
|
29
31
|
from .xlsx_scanner import XlsxScanner
|
|
@@ -38,6 +40,7 @@ class DeepScanner(
|
|
|
38
40
|
ByteScanner, #
|
|
39
41
|
Bzip2Scanner, #
|
|
40
42
|
DocxScanner, #
|
|
43
|
+
CsvScanner, #
|
|
41
44
|
EncoderScanner, #
|
|
42
45
|
GzipScanner, #
|
|
43
46
|
HtmlScanner, #
|
|
@@ -49,8 +52,10 @@ class DeepScanner(
|
|
|
49
52
|
PdfScanner, #
|
|
50
53
|
PkcsScanner, #
|
|
51
54
|
PptxScanner, #
|
|
55
|
+
RtfScanner, #
|
|
52
56
|
RpmScanner, #
|
|
53
57
|
Sqlite3Scanner, #
|
|
58
|
+
StringsScanner, #
|
|
54
59
|
TarScanner, #
|
|
55
60
|
DebScanner, #
|
|
56
61
|
XmlScanner, #
|
|
@@ -133,6 +138,9 @@ class DeepScanner(
|
|
|
133
138
|
deep_scanners.append(Sqlite3Scanner)
|
|
134
139
|
elif Util.is_asn1(data):
|
|
135
140
|
deep_scanners.append(PkcsScanner)
|
|
141
|
+
elif Util.is_rtf(data):
|
|
142
|
+
deep_scanners.append(RtfScanner)
|
|
143
|
+
fallback_scanners.append(ByteScanner)
|
|
136
144
|
elif Util.is_xml(data):
|
|
137
145
|
if Util.is_html(data):
|
|
138
146
|
deep_scanners.append(HtmlScanner)
|
|
@@ -150,24 +158,26 @@ class DeepScanner(
|
|
|
150
158
|
deep_scanners.append(XmlScanner)
|
|
151
159
|
fallback_scanners.append(ByteScanner)
|
|
152
160
|
elif Util.is_eml(data):
|
|
153
|
-
if ".eml"
|
|
161
|
+
if descriptor.extension in (".eml", ".mht"):
|
|
154
162
|
deep_scanners.append(EmlScanner)
|
|
155
163
|
else:
|
|
156
164
|
if 0 < depth:
|
|
157
|
-
# formal patch looks like an eml
|
|
165
|
+
# a formal patch looks like an eml
|
|
158
166
|
deep_scanners.append(PatchScanner)
|
|
159
167
|
fallback_scanners.append(EmlScanner)
|
|
160
168
|
fallback_scanners.append(ByteScanner)
|
|
161
|
-
elif Util.is_known(data):
|
|
162
|
-
# the format is known but cannot be scanned
|
|
163
|
-
pass
|
|
164
169
|
elif not Util.is_binary(data):
|
|
170
|
+
# keep ByteScanner first to apply real value position if possible
|
|
171
|
+
deep_scanners.append(ByteScanner)
|
|
165
172
|
if 0 < depth:
|
|
166
173
|
deep_scanners.append(PatchScanner)
|
|
167
174
|
deep_scanners.append(EncoderScanner)
|
|
168
175
|
deep_scanners.append(LangScanner)
|
|
169
|
-
|
|
176
|
+
deep_scanners.append(CsvScanner)
|
|
170
177
|
else:
|
|
171
|
-
|
|
172
|
-
|
|
178
|
+
if 0 < depth:
|
|
179
|
+
deep_scanners.append(StringsScanner)
|
|
180
|
+
else:
|
|
181
|
+
logger.warning("Cannot apply a deep scanner for type %s prefix %s %d", descriptor, repr(data[:32]),
|
|
182
|
+
len(data))
|
|
173
183
|
return deep_scanners, fallback_scanners
|
|
@@ -4,6 +4,7 @@ from typing import List, Optional
|
|
|
4
4
|
|
|
5
5
|
import jks
|
|
6
6
|
|
|
7
|
+
from credsweeper.common.constants import Severity, Confidence
|
|
7
8
|
from credsweeper.credentials.candidate import Candidate
|
|
8
9
|
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
|
|
9
10
|
from credsweeper.file_handler.data_content_provider import DataContentProvider
|
|
@@ -24,14 +25,22 @@ class JksScanner(AbstractScanner, ABC):
|
|
|
24
25
|
try:
|
|
25
26
|
keystore = jks.KeyStore.loads(data_provider.data, pw_probe, try_decrypt_keys=True)
|
|
26
27
|
# the password probe has passed, it will be the value
|
|
27
|
-
|
|
28
|
-
|
|
28
|
+
if keystore.private_keys or keystore.secret_keys:
|
|
29
|
+
severity = Severity.HIGH
|
|
30
|
+
confidence = Confidence.STRONG
|
|
31
|
+
info = f"{data_provider.info}|JKS:default password"
|
|
32
|
+
else:
|
|
33
|
+
severity = Severity.LOW
|
|
34
|
+
confidence = Confidence.WEAK
|
|
35
|
+
info = f"{data_provider.info}|JKS:sensitive data"
|
|
29
36
|
candidate = Candidate.get_dummy_candidate(
|
|
30
37
|
self.config, #
|
|
31
38
|
data_provider.file_path, #
|
|
32
39
|
data_provider.file_type, #
|
|
33
40
|
info, #
|
|
34
41
|
"Java Key Storage")
|
|
42
|
+
candidate.severity = severity
|
|
43
|
+
candidate.confidence = confidence
|
|
35
44
|
value = pw_probe or "<EMPTY PASSWORD>"
|
|
36
45
|
candidate.line_data_list[0].line = f"'{value}' is the password"
|
|
37
46
|
candidate.line_data_list[0].value = pw_probe or "<EMPTY PASSWORD>"
|
|
@@ -3,6 +3,7 @@ import logging
|
|
|
3
3
|
from abc import ABC
|
|
4
4
|
from typing import List, Optional
|
|
5
5
|
|
|
6
|
+
from credsweeper.common.constants import Severity, Confidence
|
|
6
7
|
from credsweeper.credentials.candidate import Candidate
|
|
7
8
|
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
|
|
8
9
|
from credsweeper.file_handler.data_content_provider import DataContentProvider
|
|
@@ -35,6 +36,9 @@ class PkcsScanner(AbstractScanner, ABC):
|
|
|
35
36
|
"PKCS")
|
|
36
37
|
candidate.line_data_list[0].line = base64.b64encode(data_provider.data).decode()
|
|
37
38
|
candidate.line_data_list[0].value = repr(password)
|
|
39
|
+
# high severity is assigned to private key rules
|
|
40
|
+
candidate.severity = Severity.HIGH
|
|
41
|
+
candidate.confidence = Confidence.STRONG
|
|
38
42
|
return [candidate]
|
|
39
43
|
except Exception as pkcs_exc:
|
|
40
44
|
logger.debug(f"{data_provider.file_path}:{pw_probe}:{pkcs_exc}")
|