credsweeper 1.12.1__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of credsweeper might be problematic. Click here for more details.

Files changed (51)
  1. credsweeper/__init__.py +1 -1
  2. credsweeper/__main__.py +23 -13
  3. credsweeper/app.py +7 -2
  4. credsweeper/common/keyword_pattern.py +6 -3
  5. credsweeper/common/morpheme_checklist.txt +26 -6
  6. credsweeper/config/config.py +1 -0
  7. credsweeper/credentials/line_data.py +21 -6
  8. credsweeper/deep_scanner/abstract_scanner.py +1 -0
  9. credsweeper/deep_scanner/csv_scanner.py +71 -0
  10. credsweeper/deep_scanner/deep_scanner.py +19 -9
  11. credsweeper/deep_scanner/jks_scanner.py +11 -2
  12. credsweeper/deep_scanner/pkcs_scanner.py +4 -0
  13. credsweeper/deep_scanner/rtf_scanner.py +41 -0
  14. credsweeper/deep_scanner/strings_scanner.py +52 -0
  15. credsweeper/file_handler/byte_content_provider.py +10 -1
  16. credsweeper/file_handler/file_path_extractor.py +2 -0
  17. credsweeper/file_handler/text_content_provider.py +7 -1
  18. credsweeper/filters/__init__.py +1 -1
  19. credsweeper/filters/group/token_pattern.py +2 -2
  20. credsweeper/filters/group/weird_base36_token.py +3 -3
  21. credsweeper/filters/group/weird_base64_token.py +2 -2
  22. credsweeper/filters/value_camel_case_check.py +2 -2
  23. credsweeper/filters/value_file_path_check.py +5 -3
  24. credsweeper/filters/value_github_check.py +3 -2
  25. credsweeper/filters/value_morphemes_check.py +43 -0
  26. credsweeper/filters/value_string_type_check.py +1 -0
  27. credsweeper/ml_model/features/feature.py +1 -18
  28. credsweeper/ml_model/features/file_extension.py +1 -1
  29. credsweeper/ml_model/features/has_html_tag.py +10 -8
  30. credsweeper/ml_model/features/is_secret_numeric.py +4 -3
  31. credsweeper/ml_model/features/rule_name.py +1 -1
  32. credsweeper/ml_model/features/word_in.py +9 -32
  33. credsweeper/ml_model/features/word_in_path.py +2 -3
  34. credsweeper/ml_model/features/word_in_postamble.py +1 -4
  35. credsweeper/ml_model/features/word_in_preamble.py +1 -4
  36. credsweeper/ml_model/features/word_in_transition.py +1 -4
  37. credsweeper/ml_model/features/word_in_value.py +2 -3
  38. credsweeper/ml_model/features/word_in_variable.py +2 -3
  39. credsweeper/ml_model/ml_config.json +15 -8
  40. credsweeper/ml_model/ml_model.onnx +0 -0
  41. credsweeper/ml_model/ml_validator.py +1 -1
  42. credsweeper/rules/config.yaml +174 -207
  43. credsweeper/scanner/scanner.py +12 -7
  44. credsweeper/secret/config.json +18 -5
  45. credsweeper/utils/util.py +21 -18
  46. {credsweeper-1.12.1.dist-info → credsweeper-1.13.3.dist-info}/METADATA +7 -7
  47. {credsweeper-1.12.1.dist-info → credsweeper-1.13.3.dist-info}/RECORD +50 -47
  48. credsweeper/filters/value_couple_keyword_check.py +0 -28
  49. {credsweeper-1.12.1.dist-info → credsweeper-1.13.3.dist-info}/WHEEL +0 -0
  50. {credsweeper-1.12.1.dist-info → credsweeper-1.13.3.dist-info}/entry_points.txt +0 -0
  51. {credsweeper-1.12.1.dist-info → credsweeper-1.13.3.dist-info}/licenses/LICENSE +0 -0
credsweeper/__init__.py CHANGED
@@ -24,4 +24,4 @@ __all__ = [
24
24
  "__version__"
25
25
  ]
26
26
 
27
- __version__ = "1.12.1"
27
+ __version__ = "1.13.3"
credsweeper/__main__.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import binascii
2
+ import contextlib
2
3
  import logging
3
4
  import os
4
5
  import sys
@@ -34,24 +35,24 @@ def positive_int(value: Any) -> int:
34
35
  return int_value
35
36
 
36
37
 
37
- def threshold_or_float(arg: str) -> Union[float, ThresholdPreset]:
38
+ def threshold_or_float_or_zero(arg: str) -> Union[int, float, ThresholdPreset]:
38
39
  """Return ThresholdPreset or a float from the input string
39
40
 
40
41
  Args:
41
42
  arg: string that either a float or one of allowed values in ThresholdPreset
42
43
 
43
44
  Returns:
44
- float if arg convertible to float, ThresholdPreset if one of the allowed values
45
+ int = 0 to disable ML validator, float if arg convertible to float, ThresholdPreset if one of the allowed values
45
46
 
46
47
  Raises:
47
48
  ArgumentTypeError: if arg cannot be interpreted as float or ThresholdPreset
48
49
 
49
50
  """
50
51
  allowed_presents = [e.value for e in ThresholdPreset]
51
- try:
52
+ if '0' == arg:
53
+ return 0
54
+ with contextlib.suppress(ValueError):
52
55
  return float(arg) # try convert to float
53
- except ValueError:
54
- pass
55
56
  if arg in allowed_presents:
56
57
  return ThresholdPreset[arg]
57
58
  raise ArgumentTypeError(f"value must be a float or one of {allowed_presents}")
@@ -158,6 +159,10 @@ def get_arguments() -> Namespace:
158
159
  help="find files by predefined extension",
159
160
  dest="find_by_ext",
160
161
  action="store_true")
162
+ parser.add_argument("--pedantic",
163
+ help="process files without extension",
164
+ action=BooleanOptionalAction,
165
+ default=False)
161
166
  parser.add_argument("--depth",
162
167
  help="additional recursive search in data (experimental)",
163
168
  type=positive_int,
@@ -172,11 +177,11 @@ def get_arguments() -> Namespace:
172
177
  "The lower the threshold - the more credentials will be reported. "
173
178
  f"Allowed values: float between 0 and 1, or any of {[e.value for e in ThresholdPreset]} "
174
179
  "(default: medium)",
175
- type=threshold_or_float,
180
+ type=threshold_or_float_or_zero,
176
181
  default=ThresholdPreset.medium,
177
182
  dest="ml_threshold",
178
183
  required=False,
179
- metavar="FLOAT_OR_STR")
184
+ metavar="THRESHOLD_OR_FLOAT_OR_ZERO")
180
185
  parser.add_argument("--ml_batch_size",
181
186
  "-b",
182
187
  help="batch size for model inference (default: 16)",
@@ -299,6 +304,7 @@ def get_credsweeper(args: Namespace) -> CredSweeper:
299
304
  ml_model=args.ml_model,
300
305
  ml_providers=args.ml_providers,
301
306
  find_by_ext=args.find_by_ext,
307
+ pedantic=args.pedantic,
302
308
  depth=args.depth,
303
309
  doc=args.doc,
304
310
  severity=args.severity,
@@ -335,7 +341,8 @@ def scan(args: Namespace, content_provider: AbstractProvider) -> int:
335
341
  def get_commit_providers(commit: Commit, repo: Repo) -> Sequence[ByteContentProvider]:
336
342
  """Process a commit and for providers"""
337
343
  result = {}
338
- ancestors = commit.parents or [repo.tree()]
344
+ # use the hardcoded sha1 until sha256 objects are not supported by GitPython
345
+ ancestors = commit.parents or [repo.tree("4b825dc642cb6eb9a060e54bf8d69288fbee4904")]
339
346
  for parent in ancestors:
340
347
  for diff in parent.diff(commit):
341
348
  # only result files
@@ -372,9 +379,11 @@ def drill(args: Namespace) -> Tuple[int, int]:
372
379
  # then - credsweeper
373
380
  credsweeper = get_credsweeper(args)
374
381
  # use flat iterations to avoid recursive limits
375
- to_scan = list(commits_sha1)
382
+ to_scan = set(commits_sha1)
376
383
  # local speedup for already scanned commits - avoid file system interactive
377
384
  scanned = set()
385
+ # to avoid double-check
386
+ skipped = set()
378
387
  while to_scan:
379
388
  commit_sha1 = to_scan.pop()
380
389
  if commit_sha1 in scanned:
@@ -382,8 +391,8 @@ def drill(args: Namespace) -> Tuple[int, int]:
382
391
  continue
383
392
  commit = repo.commit(commit_sha1)
384
393
  if commit.parents:
385
- # add parents anyway
386
- to_scan.extend(x.hexsha for x in commit.parents)
394
+ # add parents only when they were not skipped or scanned previously
395
+ to_scan.update(x.hexsha for x in commit.parents if x.hexsha not in skipped and x.hexsha not in scanned)
387
396
  # check whether the commit has been checked and the report is present
388
397
  skip_already_scanned = False
389
398
  if args.json_filename:
@@ -401,9 +410,10 @@ def drill(args: Namespace) -> Tuple[int, int]:
401
410
  else:
402
411
  credsweeper.xlsx_filename = xlsx_path
403
412
  if skip_already_scanned:
404
- logger.info("Skip already scanned commit: %s", commit_sha1)
413
+ skipped.add(commit_sha1)
414
+ logger.info("Skip already scanned commit: %s %s", commit_sha1, commit.committed_datetime.isoformat())
405
415
  continue
406
- logger.info("Scan commit: %s", commit_sha1)
416
+ logger.info("Scan commit: %s %s", commit_sha1, commit.committed_datetime.isoformat())
407
417
  # prepare all files to scan in the commit with bytes->IO transformation to avoid a multiprocess issue
408
418
  if providers := get_commit_providers(commit, repo):
409
419
  credsweeper.credential_manager.candidates.clear()
credsweeper/app.py CHANGED
@@ -52,11 +52,12 @@ class CredSweeper:
52
52
  use_filters: bool = True,
53
53
  pool_count: int = 1,
54
54
  ml_batch_size: Optional[int] = None,
55
- ml_threshold: Union[float, ThresholdPreset] = ThresholdPreset.medium,
55
+ ml_threshold: Union[int, float, ThresholdPreset] = ThresholdPreset.medium,
56
56
  ml_config: Union[None, str, Path] = None,
57
57
  ml_model: Union[None, str, Path] = None,
58
58
  ml_providers: Optional[str] = None,
59
59
  find_by_ext: bool = False,
60
+ pedantic: bool = False,
60
61
  depth: int = 0,
61
62
  doc: bool = False,
62
63
  severity: Union[Severity, str] = Severity.INFO,
@@ -86,6 +87,7 @@ class CredSweeper:
86
87
  ml_model: str or Path to set custom ml model
87
88
  ml_providers: str - comma separated list with providers
88
89
  find_by_ext: boolean - files will be reported by extension
90
+ pedantic: boolean - scan all files
89
91
  depth: int - how deep container files will be scanned
90
92
  doc: boolean - document-specific scanning
91
93
  severity: Severity - minimum severity level of rule
@@ -103,6 +105,7 @@ class CredSweeper:
103
105
  config_dict = self._get_config_dict(config_path=config_path,
104
106
  use_filters=use_filters,
105
107
  find_by_ext=find_by_ext,
108
+ pedantic=pedantic,
106
109
  depth=depth,
107
110
  doc=doc,
108
111
  severity=_severity,
@@ -145,6 +148,7 @@ class CredSweeper:
145
148
  config_path: Optional[str], #
146
149
  use_filters: bool, #
147
150
  find_by_ext: bool, #
151
+ pedantic: bool, #
148
152
  depth: int, #
149
153
  doc: bool, #
150
154
  severity: Severity, #
@@ -155,6 +159,7 @@ class CredSweeper:
155
159
  config_dict["use_filters"] = use_filters
156
160
  config_dict["find_by_ext"] = find_by_ext
157
161
  config_dict["size_limit"] = size_limit
162
+ config_dict["pedantic"] = pedantic
158
163
  config_dict["depth"] = depth
159
164
  config_dict["doc"] = doc
160
165
  config_dict["severity"] = severity.value
@@ -169,7 +174,7 @@ class CredSweeper:
169
174
  # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
170
175
 
171
176
  def _use_ml_validation(self) -> bool:
172
- if isinstance(self.ml_threshold, (float, int)) and 0 >= self.ml_threshold:
177
+ if isinstance(self.ml_threshold, int) and 0 == self.ml_threshold:
173
178
  logger.info("ML validation is disabled")
174
179
  return False
175
180
  if not self.credential_manager.candidates:
credsweeper/common/keyword_pattern.py CHANGED
@@ -3,7 +3,10 @@ import re
3
3
 
4
4
  class KeywordPattern:
5
5
  """Pattern set of keyword types"""
6
- directive = r"(?P<directive>(?:(?:[#%]define|%global)(?:\s|\\t)|\bset))?"
6
+ directive = r"(?P<directive>(?:" \
7
+ r"(?:[#%]define|define(?=(\s|\\{1,8}[tnr])*\()|%global)" \
8
+ r"(?:\s?\(|\s|\\{1,8}[tnr]){1,8}|\bset(?=\b|\w*(\s|\\{1,8}[tnr])*\()" \
9
+ r"))?"
7
10
  key_left = r"(?:\\[nrt]|(\\\\*u00|%)[0-9a-f]{2}|\s)*" \
8
11
  r"(?P<variable>(([\"'`]{1,8}[^:=\"'`}<>\\/&?]*|[^:=\"'`}<>\s()\\/&?;,%]*)"
9
12
  # keyword will be inserted here
@@ -13,7 +16,7 @@ class KeywordPattern:
13
16
  r")" # <variable>
14
17
  separator = r"(?(directive)|(\s|\\{1,8}[tnr])*\]?(\s|\\{1,8}[tnr])*)" \
15
18
  r"(?P<separator>:(\s[a-z]{3,9}[?]?\s)?=|:(?!:)|=(>|&gt;|(\\\\*u00|%)26gt;)|!==|!=|===|==|=~|=" \
16
- r"|(?(directive)(\\t|\s|\((?!\))){1,80}|%3d))" \
19
+ r"|(?(directive)(,|\\t|\s|\((?!\))){1,80}|%3d))" \
17
20
  r"(\s|\\{1,8}[tnr])*"
18
21
  # might be curly, square or parenthesis with words before
19
22
  wrap = r"(?P<wrap>(" \
@@ -23,7 +26,7 @@ class KeywordPattern:
23
26
  r"\s*" \
24
27
  r"(\[(?!\])|\((?!\))|\{(?!\}))" \
25
28
  r"(\s|\\{1,8}[tnr])*" \
26
- r"(?(get)('[^']+'|\"[^\"]+\")\s*,\s*|)" \
29
+ r"(?(get)('[^']{1,31}'|\"[^\"]{1,31}\")\s*,\s*|)" \
27
30
  r"([0-9a-z_]{1,32}\s*[:=]\s*)?" \
28
31
  r"){1,8})?"
29
32
  string_prefix = r"(((b|r|br|rb|u|f|rf|fr|l|@)(?=(\\*[\"'`])))?"
credsweeper/common/morpheme_checklist.txt CHANGED
@@ -14,11 +14,15 @@
14
14
  /var
15
15
  000
16
16
  111
17
+ 14159265
18
+ 18284590
17
19
  222
18
20
  333
19
21
  444
20
22
  555
23
+ 65358979
21
24
  666
25
+ 71828182
22
26
  777
23
27
  80211
24
28
  888
@@ -195,7 +199,7 @@ aux
195
199
  avail
196
200
  avatar
197
201
  aver
198
- awesome
202
+ awesom
199
203
  axis
200
204
  azure
201
205
  back
@@ -227,12 +231,14 @@ bind
227
231
  bio
228
232
  bipol
229
233
  bit
234
+ bixby
230
235
  black
231
236
  blan
232
237
  bless
233
238
  blic
234
239
  blish
235
240
  blob
241
+ blood
236
242
  blue
237
243
  board
238
244
  bob
@@ -243,7 +249,7 @@ boost
243
249
  boot
244
250
  boss
245
251
  bot
246
- bound
252
+ boun
247
253
  box
248
254
  branch
249
255
  break
@@ -497,6 +503,7 @@ dust
497
503
  dvb
498
504
  dynamic
499
505
  dynamo
506
+ eadbee
500
507
  easin
501
508
  easy
502
509
  ecdhe
@@ -607,6 +614,7 @@ fleet
607
614
  flick
608
615
  flix
609
616
  float
617
+ flood
610
618
  floor
611
619
  fluent
612
620
  fluid
@@ -615,7 +623,7 @@ focus
615
623
  foo
616
624
  for
617
625
  fossil
618
- found
626
+ foun
619
627
  fpga
620
628
  frame
621
629
  free
@@ -648,6 +656,7 @@ git
648
656
  given
649
657
  global
650
658
  gobble
659
+ good
651
660
  google
652
661
  grab
653
662
  grace
@@ -703,6 +712,7 @@ home
703
712
  hook
704
713
  horizon
705
714
  host
715
+ houn
706
716
  hours
707
717
  html
708
718
  http
@@ -789,6 +799,7 @@ jpg_
789
799
  json
790
800
  jump
791
801
  justif
802
+ kafka
792
803
  kerberos
793
804
  kernel
794
805
  key
@@ -798,6 +809,7 @@ kind
798
809
  kinesis
799
810
  kirk
800
811
  know
812
+ knox
801
813
  kris
802
814
  lab
803
815
  lag
@@ -854,7 +866,7 @@ local
854
866
  lock
855
867
  log
856
868
  long
857
- lookup
869
+ look
858
870
  loop
859
871
  loose
860
872
  lost
@@ -947,6 +959,7 @@ ndow
947
959
  ned
948
960
  need
949
961
  neigh
962
+ neo4j
950
963
  ner
951
964
  net
952
965
  neutr
@@ -991,6 +1004,7 @@ oncat
991
1004
  one
992
1005
  onfig
993
1006
  only
1007
+ ookup
994
1008
  open
995
1009
  opt/
996
1010
  opted
@@ -1008,6 +1022,7 @@ ormat
1008
1022
  orph
1009
1023
  otorola
1010
1024
  ottle
1025
+ ound
1011
1026
  ously
1012
1027
  out
1013
1028
  over
@@ -1067,6 +1082,7 @@ pose
1067
1082
  posit
1068
1083
  possib
1069
1084
  post
1085
+ poun
1070
1086
  power
1071
1087
  pre_
1072
1088
  pred
@@ -1211,7 +1227,7 @@ rotat
1211
1227
  rotocol
1212
1228
  rottl
1213
1229
  rough
1214
- round
1230
+ roun
1215
1231
  roup
1216
1232
  row
1217
1233
  rroga
@@ -1317,9 +1333,10 @@ sock
1317
1333
  soft
1318
1334
  solid
1319
1335
  solve
1336
+ some
1320
1337
  sony
1321
1338
  sort
1322
- sound
1339
+ soun
1323
1340
  source
1324
1341
  space
1325
1342
  spacing
@@ -1429,6 +1446,7 @@ tio
1429
1446
  tish
1430
1447
  title
1431
1448
  titud
1449
+ tizen
1432
1450
  tmp/
1433
1451
  to_
1434
1452
  tod
@@ -1440,6 +1458,7 @@ topic
1440
1458
  tory
1441
1459
  total
1442
1460
  touch
1461
+ tour
1443
1462
  trace
1444
1463
  tract
1445
1464
  traffic
@@ -1573,6 +1592,7 @@ yield
1573
1592
  you
1574
1593
  zeppelin
1575
1594
  zero
1595
+ zigbee
1576
1596
  zing
1577
1597
  zona
1578
1598
  zorro
credsweeper/config/config.py CHANGED
@@ -35,6 +35,7 @@ class Config:
35
35
  self.candidate_output: List[str] = config["candidate_output"]
36
36
  self.find_by_ext: bool = config["find_by_ext"]
37
37
  self.size_limit: Optional[int] = parse_size(config["size_limit"]) if config["size_limit"] is not None else None
38
+ self.pedantic: bool = bool(config["pedantic"])
38
39
  self.depth: int = int(config["depth"])
39
40
  self.doc: bool = config["doc"]
40
41
  self.severity: Severity = Severity.get(config.get("severity"))
credsweeper/credentials/line_data.py CHANGED
@@ -163,6 +163,7 @@ class LineData:
163
163
  self.clean_url_parameters()
164
164
  self.clean_bash_parameters()
165
165
  self.clean_toml_parameters()
166
+ self.clean_tag_parameters()
166
167
  if 0 <= self.value_start and 0 <= self.value_end and len(self.value) < len(_value):
167
168
  start = _value.find(self.value)
168
169
  self.value_start += start
@@ -196,15 +197,14 @@ class LineData:
196
197
  If line seem to be a URL - split by & character.
197
198
  Variable should be right most value after & or ? ([-1]). And value should be left most before & ([0])
198
199
  """
199
- if self.check_url_part():
200
+ # skip sanitize in case of URL credential rule - the regex is mature enough
201
+ if self.check_url_part() and not self.variable.endswith("://"):
200
202
  # all checks have passed - line before the value may be a URL
201
203
  self.variable = self.variable.rsplit('&')[-1].rsplit('?')[-1].rsplit(';')[-1]
202
204
  self.value = self.value.split('&', maxsplit=1)[0].split(';', maxsplit=1)[0].split('#', maxsplit=1)[0]
203
- if not self.variable.endswith("://"):
204
- # skip sanitize in case of URL credential rule
205
- self.value = self.url_unicode_split.split(self.value)[0]
206
- if self._3d_escaped_separator:
207
- self.value = self.url_percent_split.split(self.value)[0]
205
+ self.value = self.url_unicode_split.split(self.value)[0]
206
+ if self._3d_escaped_separator:
207
+ self.value = self.url_percent_split.split(self.value)[0]
208
208
 
209
209
  def clean_bash_parameters(self) -> None:
210
210
  """Split variable and value by bash special characters, if line assumed to be CLI command."""
@@ -232,6 +232,21 @@ class LineData:
232
232
  self.value = self.value[:-1]
233
233
  cleaning_required = True
234
234
 
235
+ def clean_tag_parameters(self) -> None:
236
+ """Remove closing tag from value if the opened is somewhere before in line"""
237
+ cleaning_required = self.value and self.value.endswith('>')
238
+ while cleaning_required:
239
+ closing_tag_pos = self.value.rfind("</")
240
+ if 0 <= closing_tag_pos:
241
+ # use `<a` to avoid tag parameters
242
+ opening_tag_prefix = f"<{self.value[closing_tag_pos + 2:-1]}"
243
+ if cleaning_required := (opening_tag_prefix not in self.value
244
+ and 0 <= self.line.find(opening_tag_prefix, 0, self.value_start)):
245
+ self.value = self.value[:closing_tag_pos]
246
+ cleaning_required = self.value and self.value.endswith('>')
247
+ else:
248
+ break
249
+
235
250
  def sanitize_variable(self) -> None:
236
251
  """Remove trailing spaces, dashes and quotations around the variable. Correct position."""
237
252
  sanitized_var_len = 0
credsweeper/deep_scanner/abstract_scanner.py CHANGED
@@ -51,6 +51,7 @@ class AbstractScanner(ABC):
51
51
  @abstractmethod
52
52
  def get_deep_scanners(data: bytes, descriptor: Descriptor, depth: int) -> Tuple[List[Any], List[Any]]:
53
53
  """Returns possibly scan methods for the data depends on content and fallback scanners"""
54
+ raise NotImplementedError(__name__)
54
55
 
55
56
  # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
56
57
 
credsweeper/deep_scanner/csv_scanner.py ADDED
@@ -0,0 +1,71 @@
1
+ import csv
2
+ import io
3
+ import logging
4
+ from abc import ABC
5
+ from typing import List, Optional, Dict, Any
6
+
7
+ from credsweeper.common.constants import MAX_LINE_LENGTH
8
+ from credsweeper.credentials.candidate import Candidate
9
+ from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
10
+ from credsweeper.file_handler.data_content_provider import DataContentProvider
11
+ from credsweeper.file_handler.struct_content_provider import StructContentProvider
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ class CsvScanner(AbstractScanner, ABC):
17
+ """Implements CSV scanning"""
18
+
19
+ sniffer = csv.Sniffer()
20
+ # do not use space as separator to avoid hallucinations
21
+ delimiters = ",;\t|\x1F"
22
+
23
+ @classmethod
24
+ def get_structure(cls, text: str) -> List[Dict[str, Any]]:
25
+ """Reads a text as CSV standard with guessed dialect"""
26
+ # windows style \r\n
27
+ first_line_end = text.find('\r', 0, MAX_LINE_LENGTH)
28
+ line_terminator = "\r\n"
29
+ if 0 > first_line_end:
30
+ # unix style \n
31
+ first_line_end = text.find('\n', 0, MAX_LINE_LENGTH)
32
+ line_terminator = "\n"
33
+ if 0 > first_line_end:
34
+ raise ValueError(f"No suitable line end found in {MAX_LINE_LENGTH} symbols")
35
+
36
+ first_line = text[:first_line_end]
37
+ dialect = cls.sniffer.sniff(first_line, delimiters=cls.delimiters)
38
+ rows = []
39
+ reader = csv.DictReader(io.StringIO(text),
40
+ delimiter=dialect.delimiter,
41
+ lineterminator=line_terminator,
42
+ strict=True)
43
+ # check the constant columns number for all rows
44
+ fields_number = sum(1 for x in reader.fieldnames if x is not None)
45
+ for row in reader:
46
+ if not isinstance(row, dict):
47
+ raise ValueError(f"ERROR: wrong row '{row}'")
48
+ if len(row) != fields_number or any(x is None for x in row.values()):
49
+ # None means no separator used
50
+ raise ValueError(f"Different columns number in row '{row}' - mismatch {fields_number}")
51
+ rows.append(row)
52
+ return rows
53
+
54
+ def data_scan(
55
+ self, #
56
+ data_provider: DataContentProvider, #
57
+ depth: int, #
58
+ recursive_limit_size: int) -> Optional[List[Candidate]]:
59
+ """Tries to scan each row as structure with column name in key"""
60
+ try:
61
+ if rows := self.get_structure(data_provider.text):
62
+ struct_content_provider = StructContentProvider(struct=rows,
63
+ file_path=data_provider.file_path,
64
+ file_type=data_provider.file_type,
65
+ info=f"{data_provider.info}|CSV")
66
+ new_limit = recursive_limit_size - sum(len(x) for x in rows)
67
+ struct_candidates = self.structure_scan(struct_content_provider, depth, new_limit)
68
+ return struct_candidates
69
+ except Exception as csv_exc:
70
+ logger.debug(f"{data_provider.file_path}:{csv_exc}")
71
+ return None
credsweeper/deep_scanner/deep_scanner.py CHANGED
@@ -1,12 +1,12 @@
1
1
  import logging
2
2
  from typing import List, Any, Tuple
3
3
 
4
- from credsweeper.common.constants import MIN_DATA_LEN
5
4
  from credsweeper.config.config import Config
6
5
  from credsweeper.scanner.scanner import Scanner
7
6
  from credsweeper.utils.util import Util
8
7
  from .byte_scanner import ByteScanner
9
8
  from .bzip2_scanner import Bzip2Scanner
9
+ from .csv_scanner import CsvScanner
10
10
  from .deb_scanner import DebScanner
11
11
  from .docx_scanner import DocxScanner
12
12
  from .eml_scanner import EmlScanner
@@ -23,7 +23,9 @@ from .pdf_scanner import PdfScanner
23
23
  from .pkcs_scanner import PkcsScanner
24
24
  from .pptx_scanner import PptxScanner
25
25
  from .rpm_scanner import RpmScanner
26
+ from .rtf_scanner import RtfScanner
26
27
  from .sqlite3_scanner import Sqlite3Scanner
28
+ from .strings_scanner import StringsScanner
27
29
  from .tar_scanner import TarScanner
28
30
  from .tmx_scanner import TmxScanner
29
31
  from .xlsx_scanner import XlsxScanner
@@ -38,6 +40,7 @@ class DeepScanner(
38
40
  ByteScanner, #
39
41
  Bzip2Scanner, #
40
42
  DocxScanner, #
43
+ CsvScanner, #
41
44
  EncoderScanner, #
42
45
  GzipScanner, #
43
46
  HtmlScanner, #
@@ -49,8 +52,10 @@ class DeepScanner(
49
52
  PdfScanner, #
50
53
  PkcsScanner, #
51
54
  PptxScanner, #
55
+ RtfScanner, #
52
56
  RpmScanner, #
53
57
  Sqlite3Scanner, #
58
+ StringsScanner, #
54
59
  TarScanner, #
55
60
  DebScanner, #
56
61
  XmlScanner, #
@@ -133,6 +138,9 @@ class DeepScanner(
133
138
  deep_scanners.append(Sqlite3Scanner)
134
139
  elif Util.is_asn1(data):
135
140
  deep_scanners.append(PkcsScanner)
141
+ elif Util.is_rtf(data):
142
+ deep_scanners.append(RtfScanner)
143
+ fallback_scanners.append(ByteScanner)
136
144
  elif Util.is_xml(data):
137
145
  if Util.is_html(data):
138
146
  deep_scanners.append(HtmlScanner)
@@ -150,24 +158,26 @@ class DeepScanner(
150
158
  deep_scanners.append(XmlScanner)
151
159
  fallback_scanners.append(ByteScanner)
152
160
  elif Util.is_eml(data):
153
- if ".eml" == descriptor.extension:
161
+ if descriptor.extension in (".eml", ".mht"):
154
162
  deep_scanners.append(EmlScanner)
155
163
  else:
156
164
  if 0 < depth:
157
- # formal patch looks like an eml
165
+ # a formal patch looks like an eml
158
166
  deep_scanners.append(PatchScanner)
159
167
  fallback_scanners.append(EmlScanner)
160
168
  fallback_scanners.append(ByteScanner)
161
- elif Util.is_known(data):
162
- # the format is known but cannot be scanned
163
- pass
164
169
  elif not Util.is_binary(data):
170
+ # keep ByteScanner first to apply real value position if possible
171
+ deep_scanners.append(ByteScanner)
165
172
  if 0 < depth:
166
173
  deep_scanners.append(PatchScanner)
167
174
  deep_scanners.append(EncoderScanner)
168
175
  deep_scanners.append(LangScanner)
169
- deep_scanners.append(ByteScanner)
176
+ deep_scanners.append(CsvScanner)
170
177
  else:
171
- logger.warning("Cannot apply a deep scanner for type %s prefix %s %d", descriptor,
172
- repr(data[:MIN_DATA_LEN]), len(data))
178
+ if 0 < depth:
179
+ deep_scanners.append(StringsScanner)
180
+ else:
181
+ logger.warning("Cannot apply a deep scanner for type %s prefix %s %d", descriptor, repr(data[:32]),
182
+ len(data))
173
183
  return deep_scanners, fallback_scanners
credsweeper/deep_scanner/jks_scanner.py CHANGED
@@ -4,6 +4,7 @@ from typing import List, Optional
4
4
 
5
5
  import jks
6
6
 
7
+ from credsweeper.common.constants import Severity, Confidence
7
8
  from credsweeper.credentials.candidate import Candidate
8
9
  from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
9
10
  from credsweeper.file_handler.data_content_provider import DataContentProvider
@@ -24,14 +25,22 @@ class JksScanner(AbstractScanner, ABC):
24
25
  try:
25
26
  keystore = jks.KeyStore.loads(data_provider.data, pw_probe, try_decrypt_keys=True)
26
27
  # the password probe has passed, it will be the value
27
- info = (f"{data_provider.info}|JKS:"
28
- f"{'sensitive data' if keystore.private_keys or keystore.secret_keys else 'default password'}")
28
+ if keystore.private_keys or keystore.secret_keys:
29
+ severity = Severity.HIGH
30
+ confidence = Confidence.STRONG
31
+ info = f"{data_provider.info}|JKS:sensitive data"
32
+ else:
33
+ severity = Severity.LOW
34
+ confidence = Confidence.WEAK
35
+ info = f"{data_provider.info}|JKS:default password"
29
36
  candidate = Candidate.get_dummy_candidate(
30
37
  self.config, #
31
38
  data_provider.file_path, #
32
39
  data_provider.file_type, #
33
40
  info, #
34
41
  "Java Key Storage")
42
+ candidate.severity = severity
43
+ candidate.confidence = confidence
35
44
  value = pw_probe or "<EMPTY PASSWORD>"
36
45
  candidate.line_data_list[0].line = f"'{value}' is the password"
37
46
  candidate.line_data_list[0].value = pw_probe or "<EMPTY PASSWORD>"
credsweeper/deep_scanner/pkcs_scanner.py CHANGED
@@ -3,6 +3,7 @@ import logging
3
3
  from abc import ABC
4
4
  from typing import List, Optional
5
5
 
6
+ from credsweeper.common.constants import Severity, Confidence
6
7
  from credsweeper.credentials.candidate import Candidate
7
8
  from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
8
9
  from credsweeper.file_handler.data_content_provider import DataContentProvider
@@ -35,6 +36,9 @@ class PkcsScanner(AbstractScanner, ABC):
35
36
  "PKCS")
36
37
  candidate.line_data_list[0].line = base64.b64encode(data_provider.data).decode()
37
38
  candidate.line_data_list[0].value = repr(password)
39
+ # high severity is assigned to private key rules
40
+ candidate.severity = Severity.HIGH
41
+ candidate.confidence = Confidence.STRONG
38
42
  return [candidate]
39
43
  except Exception as pkcs_exc:
40
44
  logger.debug(f"{data_provider.file_path}:{pw_probe}:{pkcs_exc}")