credsweeper 1.11.5__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of credsweeper might be problematic. Click here for more details.

Files changed (145)
  1. credsweeper/__init__.py +21 -15
  2. credsweeper/__main__.py +158 -42
  3. credsweeper/app.py +18 -13
  4. credsweeper/common/keyword_pattern.py +19 -18
  5. credsweeper/common/morpheme_checklist.txt +28 -6
  6. credsweeper/config/__init__.py +0 -1
  7. credsweeper/config/config.py +4 -3
  8. credsweeper/credentials/__init__.py +0 -5
  9. credsweeper/credentials/augment_candidates.py +1 -1
  10. credsweeper/credentials/candidate.py +1 -1
  11. credsweeper/credentials/credential_manager.py +1 -1
  12. credsweeper/credentials/line_data.py +43 -8
  13. credsweeper/deep_scanner/__init__.py +0 -1
  14. credsweeper/deep_scanner/abstract_scanner.py +4 -3
  15. credsweeper/deep_scanner/byte_scanner.py +1 -1
  16. credsweeper/deep_scanner/bzip2_scanner.py +2 -2
  17. credsweeper/deep_scanner/csv_scanner.py +71 -0
  18. credsweeper/deep_scanner/deb_scanner.py +1 -1
  19. credsweeper/deep_scanner/deep_scanner.py +22 -12
  20. credsweeper/deep_scanner/docx_scanner.py +1 -1
  21. credsweeper/deep_scanner/eml_scanner.py +1 -1
  22. credsweeper/deep_scanner/encoder_scanner.py +1 -1
  23. credsweeper/deep_scanner/gzip_scanner.py +2 -2
  24. credsweeper/deep_scanner/html_scanner.py +1 -1
  25. credsweeper/deep_scanner/jclass_scanner.py +1 -1
  26. credsweeper/deep_scanner/jks_scanner.py +12 -3
  27. credsweeper/deep_scanner/lang_scanner.py +1 -1
  28. credsweeper/deep_scanner/lzma_scanner.py +2 -2
  29. credsweeper/deep_scanner/mxfile_scanner.py +1 -1
  30. credsweeper/deep_scanner/pdf_scanner.py +1 -1
  31. credsweeper/deep_scanner/pkcs_scanner.py +6 -2
  32. credsweeper/deep_scanner/pptx_scanner.py +1 -1
  33. credsweeper/deep_scanner/rpm_scanner.py +1 -1
  34. credsweeper/deep_scanner/rtf_scanner.py +41 -0
  35. credsweeper/deep_scanner/strings_scanner.py +52 -0
  36. credsweeper/deep_scanner/tar_scanner.py +2 -2
  37. credsweeper/deep_scanner/tmx_scanner.py +2 -2
  38. credsweeper/deep_scanner/xlsx_scanner.py +2 -2
  39. credsweeper/deep_scanner/xml_scanner.py +1 -1
  40. credsweeper/deep_scanner/zip_scanner.py +2 -2
  41. credsweeper/file_handler/__init__.py +0 -15
  42. credsweeper/file_handler/abstract_provider.py +3 -4
  43. credsweeper/file_handler/byte_content_provider.py +11 -2
  44. credsweeper/file_handler/content_provider.py +1 -1
  45. credsweeper/file_handler/data_content_provider.py +1 -1
  46. credsweeper/file_handler/diff_content_provider.py +133 -3
  47. credsweeper/file_handler/file_path_extractor.py +4 -2
  48. credsweeper/file_handler/files_provider.py +4 -4
  49. credsweeper/file_handler/patches_provider.py +7 -8
  50. credsweeper/file_handler/text_content_provider.py +8 -2
  51. credsweeper/filters/__init__.py +3 -4
  52. credsweeper/filters/filter.py +5 -3
  53. credsweeper/filters/group/__init__.py +0 -2
  54. credsweeper/filters/group/general_keyword.py +2 -2
  55. credsweeper/filters/group/general_pattern.py +2 -2
  56. credsweeper/filters/group/group.py +38 -36
  57. credsweeper/filters/group/password_keyword.py +9 -8
  58. credsweeper/filters/group/token_pattern.py +5 -5
  59. credsweeper/filters/group/url_credentials_group.py +8 -8
  60. credsweeper/filters/group/weird_base36_token.py +6 -6
  61. credsweeper/filters/group/weird_base64_token.py +5 -5
  62. credsweeper/filters/line_git_binary_check.py +5 -4
  63. credsweeper/filters/line_specific_key_check.py +6 -5
  64. credsweeper/filters/line_uue_part_check.py +5 -4
  65. credsweeper/filters/value_allowlist_check.py +6 -5
  66. credsweeper/filters/value_array_dictionary_check.py +8 -6
  67. credsweeper/filters/value_atlassian_token_check.py +6 -5
  68. credsweeper/filters/value_azure_token_check.py +6 -5
  69. credsweeper/filters/value_base32_data_check.py +8 -5
  70. credsweeper/filters/value_base64_data_check.py +6 -5
  71. credsweeper/filters/value_base64_encoded_pem_check.py +6 -5
  72. credsweeper/filters/value_base64_key_check.py +6 -5
  73. credsweeper/filters/value_base64_part_check.py +6 -5
  74. credsweeper/filters/value_basic_auth_check.py +37 -0
  75. credsweeper/filters/value_blocklist_check.py +6 -4
  76. credsweeper/filters/value_camel_case_check.py +8 -7
  77. credsweeper/filters/value_dictionary_keyword_check.py +6 -4
  78. credsweeper/filters/value_discord_bot_check.py +6 -5
  79. credsweeper/filters/value_entropy_base_check.py +6 -5
  80. credsweeper/filters/value_file_path_check.py +13 -8
  81. credsweeper/filters/value_github_check.py +8 -6
  82. credsweeper/filters/value_grafana_check.py +6 -5
  83. credsweeper/filters/value_grafana_service_check.py +5 -4
  84. credsweeper/filters/value_hex_number_check.py +5 -4
  85. credsweeper/filters/value_jfrog_token_check.py +6 -5
  86. credsweeper/filters/value_json_web_key_check.py +6 -5
  87. credsweeper/filters/value_json_web_token_check.py +6 -5
  88. credsweeper/filters/value_last_word_check.py +6 -4
  89. credsweeper/filters/{value_dictionary_value_length_check.py → value_length_check.py} +12 -6
  90. credsweeper/filters/value_method_check.py +5 -4
  91. credsweeper/filters/value_morphemes_check.py +43 -0
  92. credsweeper/filters/value_not_allowed_pattern_check.py +6 -5
  93. credsweeper/filters/value_not_part_encoded_check.py +4 -4
  94. credsweeper/filters/value_number_check.py +5 -4
  95. credsweeper/filters/value_pattern_check.py +61 -41
  96. credsweeper/filters/value_similarity_check.py +6 -4
  97. credsweeper/filters/value_split_keyword_check.py +5 -4
  98. credsweeper/filters/value_string_type_check.py +10 -7
  99. credsweeper/filters/value_token_base_check.py +5 -4
  100. credsweeper/filters/value_token_check.py +6 -5
  101. credsweeper/logger/__init__.py +0 -1
  102. credsweeper/logger/logger.py +1 -1
  103. credsweeper/ml_model/__init__.py +0 -1
  104. credsweeper/ml_model/features/__init__.py +1 -0
  105. credsweeper/ml_model/features/entropy_evaluation.py +1 -1
  106. credsweeper/ml_model/features/feature.py +2 -19
  107. credsweeper/ml_model/features/file_extension.py +2 -2
  108. credsweeper/ml_model/features/has_html_tag.py +12 -10
  109. credsweeper/ml_model/features/is_secret_numeric.py +5 -4
  110. credsweeper/ml_model/features/length_of_attribute.py +1 -1
  111. credsweeper/ml_model/features/morpheme_dense.py +15 -8
  112. credsweeper/ml_model/features/rule_name.py +2 -2
  113. credsweeper/ml_model/features/rule_severity.py +21 -0
  114. credsweeper/ml_model/features/search_in_attribute.py +1 -1
  115. credsweeper/ml_model/features/word_in.py +10 -33
  116. credsweeper/ml_model/features/word_in_path.py +6 -4
  117. credsweeper/ml_model/features/word_in_postamble.py +2 -5
  118. credsweeper/ml_model/features/word_in_preamble.py +2 -5
  119. credsweeper/ml_model/features/word_in_transition.py +2 -5
  120. credsweeper/ml_model/features/word_in_value.py +3 -4
  121. credsweeper/ml_model/features/word_in_variable.py +3 -4
  122. credsweeper/ml_model/ml_config.json +140 -27
  123. credsweeper/ml_model/ml_model.onnx +0 -0
  124. credsweeper/ml_model/ml_validator.py +4 -3
  125. credsweeper/rules/__init__.py +0 -1
  126. credsweeper/rules/config.yaml +329 -239
  127. credsweeper/rules/rule.py +4 -3
  128. credsweeper/scanner/__init__.py +0 -1
  129. credsweeper/scanner/scan_type/__init__.py +0 -5
  130. credsweeper/scanner/scan_type/multi_pattern.py +4 -4
  131. credsweeper/scanner/scan_type/pem_key_pattern.py +4 -4
  132. credsweeper/scanner/scan_type/scan_type.py +4 -4
  133. credsweeper/scanner/scan_type/single_pattern.py +4 -4
  134. credsweeper/scanner/scanner.py +24 -15
  135. credsweeper/secret/config.json +19 -6
  136. credsweeper/utils/__init__.py +0 -1
  137. credsweeper/utils/pem_key_detector.py +3 -3
  138. credsweeper/utils/util.py +24 -150
  139. {credsweeper-1.11.5.dist-info → credsweeper-1.13.3.dist-info}/METADATA +7 -7
  140. credsweeper-1.13.3.dist-info/RECORD +164 -0
  141. credsweeper/filters/value_couple_keyword_check.py +0 -26
  142. credsweeper-1.11.5.dist-info/RECORD +0 -159
  143. {credsweeper-1.11.5.dist-info → credsweeper-1.13.3.dist-info}/WHEEL +0 -0
  144. {credsweeper-1.11.5.dist-info → credsweeper-1.13.3.dist-info}/entry_points.txt +0 -0
  145. {credsweeper-1.11.5.dist-info → credsweeper-1.13.3.dist-info}/licenses/LICENSE +0 -0
credsweeper/rules/rule.py CHANGED
@@ -7,9 +7,10 @@ from typing import Dict, List, Optional, Union, Set
7
7
  from credsweeper import filters
8
8
  from credsweeper.common.constants import RuleType, Severity, MAX_LINE_LENGTH, Confidence
9
9
  from credsweeper.common.keyword_pattern import KeywordPattern
10
- from credsweeper.config import Config
11
- from credsweeper.filters import Filter, group
12
- from credsweeper.filters.group import Group
10
+ from credsweeper.config.config import Config
11
+ from credsweeper.filters import group
12
+ from credsweeper.filters.filter import Filter
13
+ from credsweeper.filters.group.group import Group
13
14
 
14
15
  logger = logging.getLogger(__name__)
15
16
 
credsweeper/scanner/__init__.py CHANGED
@@ -1 +0,0 @@
1
- from credsweeper.scanner.scanner import Scanner
credsweeper/scanner/scan_type/__init__.py CHANGED
@@ -1,5 +0,0 @@
1
- from credsweeper.scanner.scan_type.scan_type import ScanType # isort:skip
2
-
3
- from credsweeper.scanner.scan_type.multi_pattern import MultiPattern
4
- from credsweeper.scanner.scan_type.pem_key_pattern import PemKeyPattern
5
- from credsweeper.scanner.scan_type.single_pattern import SinglePattern
credsweeper/scanner/scan_type/multi_pattern.py CHANGED
@@ -1,11 +1,11 @@
1
1
  from typing import List
2
2
 
3
3
  from credsweeper.common.constants import RuleType
4
- from credsweeper.config import Config
5
- from credsweeper.credentials import Candidate
4
+ from credsweeper.config.config import Config
5
+ from credsweeper.credentials.candidate import Candidate
6
6
  from credsweeper.file_handler.analysis_target import AnalysisTarget
7
- from credsweeper.rules import Rule
8
- from credsweeper.scanner.scan_type import ScanType
7
+ from credsweeper.rules.rule import Rule
8
+ from credsweeper.scanner.scan_type.scan_type import ScanType
9
9
 
10
10
 
11
11
  class MultiPattern(ScanType):
credsweeper/scanner/scan_type/pem_key_pattern.py CHANGED
@@ -2,11 +2,11 @@ import logging
2
2
  from typing import List
3
3
 
4
4
  from credsweeper.common.constants import RuleType
5
- from credsweeper.config import Config
6
- from credsweeper.credentials import Candidate
5
+ from credsweeper.config.config import Config
6
+ from credsweeper.credentials.candidate import Candidate
7
7
  from credsweeper.file_handler.analysis_target import AnalysisTarget
8
- from credsweeper.rules import Rule
9
- from credsweeper.scanner.scan_type import ScanType
8
+ from credsweeper.rules.rule import Rule
9
+ from credsweeper.scanner.scan_type.scan_type import ScanType
10
10
  from credsweeper.utils.pem_key_detector import PemKeyDetector
11
11
 
12
12
  logger = logging.getLogger(__name__)
credsweeper/scanner/scan_type/scan_type.py CHANGED
@@ -4,11 +4,11 @@ from abc import ABC, abstractmethod
4
4
  from typing import List
5
5
 
6
6
  from credsweeper.common.constants import RuleType, MIN_DATA_LEN
7
- from credsweeper.config import Config
8
- from credsweeper.credentials import Candidate, LineData
7
+ from credsweeper.config.config import Config
8
+ from credsweeper.credentials.candidate import Candidate, LineData
9
9
  from credsweeper.file_handler.analysis_target import AnalysisTarget
10
- from credsweeper.filters import Filter
11
- from credsweeper.rules import Rule
10
+ from credsweeper.filters.filter import Filter
11
+ from credsweeper.rules.rule import Rule
12
12
 
13
13
  logger = logging.getLogger(__name__)
14
14
 
credsweeper/scanner/scan_type/single_pattern.py CHANGED
@@ -1,10 +1,10 @@
1
1
  from typing import List
2
2
 
3
- from credsweeper.config import Config
4
- from credsweeper.credentials import Candidate
3
+ from credsweeper.config.config import Config
4
+ from credsweeper.credentials.candidate import Candidate
5
5
  from credsweeper.file_handler.analysis_target import AnalysisTarget
6
- from credsweeper.rules import Rule
7
- from credsweeper.scanner.scan_type import ScanType
6
+ from credsweeper.rules.rule import Rule
7
+ from credsweeper.scanner.scan_type.scan_type import ScanType
8
8
 
9
9
 
10
10
  class SinglePattern(ScanType):
credsweeper/scanner/scanner.py CHANGED
@@ -6,16 +6,21 @@ from typing import List, Type, Tuple, Union, Dict, Generator, Set
6
6
  from credsweeper.app import APP_PATH
7
7
  from credsweeper.common.constants import RuleType, MIN_VARIABLE_LENGTH, MIN_SEPARATOR_LENGTH, MIN_VALUE_LENGTH, \
8
8
  MAX_LINE_LENGTH, PEM_BEGIN_PATTERN
9
- from credsweeper.config import Config
10
- from credsweeper.credentials import Candidate
9
+ from credsweeper.config.config import Config
10
+ from credsweeper.credentials.candidate import Candidate
11
11
  from credsweeper.file_handler.analysis_target import AnalysisTarget
12
12
  from credsweeper.file_handler.content_provider import ContentProvider
13
- from credsweeper.rules import Rule
14
- from credsweeper.scanner.scan_type import PemKeyPattern, ScanType, SinglePattern, MultiPattern
15
- from credsweeper.utils import Util
13
+ from credsweeper.rules.rule import Rule
14
+ from credsweeper.scanner.scan_type.multi_pattern import MultiPattern
15
+ from credsweeper.scanner.scan_type.pem_key_pattern import PemKeyPattern
16
+ from credsweeper.scanner.scan_type.scan_type import ScanType
17
+ from credsweeper.scanner.scan_type.single_pattern import SinglePattern
18
+ from credsweeper.utils.util import Util
16
19
 
17
20
  logger = logging.getLogger(__name__)
18
21
 
22
+ RULES_PATH = APP_PATH / "rules" / "config.yaml"
23
+
19
24
 
20
25
  class Scanner:
21
26
  """Advanced Credential Scanner base class.
@@ -63,11 +68,11 @@ class Scanner:
63
68
  return True
64
69
  return False
65
70
 
66
- def _set_rules_scanners(self, rule_path: Union[None, str, Path]) -> None:
71
+ def _set_rules_scanners(self, rules_path: Union[None, str, Path]) -> None:
67
72
  """Auxiliary method to fill rules, determine min_pattern_len and set scanners"""
68
- if rule_path is None:
69
- rule_path = APP_PATH / "rules" / "config.yaml"
70
- rule_templates = Util.yaml_load(rule_path)
73
+ if rules_path is None:
74
+ rules_path = RULES_PATH
75
+ rule_templates = Util.yaml_load(rules_path)
71
76
  if rule_templates and isinstance(rule_templates, list):
72
77
  rule_names = set()
73
78
  for rule_template in rule_templates:
@@ -95,7 +100,7 @@ class Scanner:
95
100
  logger.warning(f"Unknown rule type:{rule.rule_type}")
96
101
  self.rules_scanners.append((rule, self.get_scanner(rule)))
97
102
  else:
98
- raise RuntimeError(f"Wrong rules '{rule_templates}' were read from '{rule_path}'")
103
+ raise RuntimeError(f"Wrong rules '{rule_templates}' were read from '{rules_path}'")
99
104
 
100
105
  def _is_available(self, rule: Rule) -> bool:
101
106
  """separate the method to reduce complexity"""
@@ -142,16 +147,22 @@ class Scanner:
142
147
  # Trim string from outer spaces to make future `x in str` checks faster
143
148
  target_line_stripped = target.line_strip
144
149
  target_line_stripped_len = target.line_strip_len
150
+ # use lower case for required substring
151
+ target_line_stripped_lower = target.line_lower_strip
145
152
 
146
153
  # "cache" - YAPF and pycharm formatters ...
147
154
  matched_keyword = \
148
155
  target_line_stripped_len >= self.min_keyword_len and ( #
149
156
  '=' in target_line_stripped
150
157
  or ':' in target_line_stripped
151
- or "set" in target_line_stripped
152
- or "#define" in target_line_stripped
153
- or "%define" in target_line_stripped
158
+ or ("define" in target_line_stripped
159
+ and ('(' in target_line_stripped and ',' in target_line_stripped
160
+ or "#define" in target_line_stripped
161
+ or "%define" in target_line_stripped)
162
+ )
154
163
  or "%global" in target_line_stripped
164
+ or "set" in target_line_stripped_lower
165
+ or "%3d" in target_line_stripped_lower
155
166
  ) #
156
167
  matched_pem_key = \
157
168
  target_line_stripped_len >= self.min_pem_key_len \
@@ -165,8 +176,6 @@ class Scanner:
165
176
  target.line_num)
166
177
  continue
167
178
 
168
- # use lower case for required substring
169
- target_line_stripped_lower = target.line_lower_strip
170
179
  # cached value to skip the same regex verifying
171
180
  matched_regex: Dict[re.Pattern, bool] = {}
172
181
 
credsweeper/secret/config.json CHANGED
@@ -12,18 +12,21 @@
12
12
  ".rpm",
13
13
  ".tar",
14
14
  ".war",
15
+ ".whl",
15
16
  ".xz",
16
17
  ".zip"
17
18
  ],
18
19
  "documents": [
19
- ".xlsx",
20
+ ".doc",
20
21
  ".docx",
21
- ".pptx",
22
- ".xls",
23
22
  ".odp",
24
23
  ".ods",
25
24
  ".odt",
26
- ".pdf"
25
+ ".pdf",
26
+ ".ppt",
27
+ ".pptx",
28
+ ".xls",
29
+ ".xlsx"
27
30
  ],
28
31
  "extension": [
29
32
  ".7z",
@@ -45,16 +48,23 @@
45
48
  ".info",
46
49
  ".jpeg",
47
50
  ".jpg",
51
+ ".lib",
48
52
  ".map",
49
53
  ".m4a",
50
54
  ".mat",
51
55
  ".mo",
56
+ ".mov",
52
57
  ".mp3",
53
58
  ".mp4",
59
+ ".mpg",
60
+ ".mkv",
54
61
  ".npy",
55
62
  ".npz",
56
63
  ".obj",
64
+ ".oga",
57
65
  ".ogg",
66
+ ".ogv",
67
+ ".ops",
58
68
  ".pak",
59
69
  ".png",
60
70
  ".psd",
@@ -71,8 +81,10 @@
71
81
  ".so",
72
82
  ".sum",
73
83
  ".svg",
84
+ ".swf",
74
85
  ".tif",
75
86
  ".tiff",
87
+ ".tlb",
76
88
  ".ttf",
77
89
  ".vcxproj",
78
90
  ".vdproj",
@@ -81,6 +93,7 @@
81
93
  ".webp",
82
94
  ".wma",
83
95
  ".woff",
96
+ ".woff2",
84
97
  ".yuv"
85
98
  ],
86
99
  "path": [
@@ -164,8 +177,8 @@
164
177
  "tizen"
165
178
  ],
166
179
  "check_for_literals": true,
167
- "min_pattern_value_length": 12,
168
- "min_keyword_value_length": 4,
180
+ "max_password_value_length": 64,
181
+ "max_url_cred_value_length": 80,
169
182
  "line_data_output": [
170
183
  "line",
171
184
  "line_num",
credsweeper/utils/__init__.py CHANGED
@@ -1 +0,0 @@
1
- from credsweeper.utils.util import DiffRowData, Util, DiffDict
credsweeper/utils/pem_key_detector.py CHANGED
@@ -5,10 +5,10 @@ import string
5
5
  from typing import List
6
6
 
7
7
  from credsweeper.common.constants import PEM_BEGIN_PATTERN, PEM_END_PATTERN, Chars
8
- from credsweeper.config import Config
9
- from credsweeper.credentials import LineData
8
+ from credsweeper.config.config import Config
9
+ from credsweeper.credentials.line_data import LineData
10
10
  from credsweeper.file_handler.analysis_target import AnalysisTarget
11
- from credsweeper.utils import Util
11
+ from credsweeper.utils.util import Util
12
12
 
13
13
  logger = logging.getLogger(__name__)
14
14
 
credsweeper/utils/util.py CHANGED
@@ -9,12 +9,10 @@ import random
9
9
  import re
10
10
  import string
11
11
  import tarfile
12
- from dataclasses import dataclass
13
12
  from pathlib import Path
14
13
  from typing import Any, Dict, List, Tuple, Optional, Union
15
14
 
16
15
  import numpy as np
17
- import whatthepatch
18
16
  import yaml
19
17
  from cryptography.hazmat.primitives import hashes
20
18
  from cryptography.hazmat.primitives.asymmetric import padding
@@ -29,31 +27,12 @@ from cryptography.hazmat.primitives.asymmetric.x448 import X448PublicKey, X448Pr
29
27
  from cryptography.hazmat.primitives.serialization import load_der_private_key
30
28
  from cryptography.hazmat.primitives.serialization.pkcs12 import load_key_and_certificates
31
29
  from lxml import etree
32
- from typing_extensions import TypedDict
33
30
 
34
- from credsweeper.common.constants import DiffRowType, AVAILABLE_ENCODINGS, \
31
+ from credsweeper.common.constants import AVAILABLE_ENCODINGS, \
35
32
  DEFAULT_ENCODING, LATIN_1, CHUNK_SIZE, MAX_LINE_LENGTH, CHUNK_STEP_SIZE, ASCII
36
33
 
37
34
  logger = logging.getLogger(__name__)
38
35
 
39
- DiffDict = TypedDict(
40
- "DiffDict",
41
- {
42
- "old": Optional[int], #
43
- "new": Optional[int], #
44
- "line": Union[str, bytes], # bytes are possibly since whatthepatch v1.0.4
45
- "hunk": Any # not used
46
- })
47
-
48
-
49
- @dataclass(frozen=True)
50
- class DiffRowData:
51
- """Class for keeping data of diff row."""
52
-
53
- line_type: DiffRowType
54
- line_numb: int
55
- line: str
56
-
57
36
 
58
37
  class Util:
59
38
  """Class that contains different useful methods."""
@@ -82,11 +61,11 @@ class Util:
82
61
  def get_shannon_entropy(data: Union[str, bytes]) -> float:
83
62
  """Borrowed from http://blog.dkbza.org/2007/05/scanning-data-for-entropy-anomalies.html."""
84
63
  if not data:
85
- return 0.
64
+ return 0.0
86
65
  size = len(data)
87
66
  _uniq, counts = np.unique(list(data), return_counts=True)
88
67
  probabilities = counts / size
89
- entropy = float(-np.sum(probabilities * np.log2(probabilities)))
68
+ entropy = -float(np.sum(probabilities * np.log2(probabilities)))
90
69
  return entropy
91
70
 
92
71
  # Precalculated data for speedup
@@ -162,15 +141,6 @@ class Util:
162
141
  min_entropy = Util.get_min_data_entropy(data_len)
163
142
  return entropy < min_entropy
164
143
 
165
- @staticmethod
166
- def is_known(data: Union[bytes, bytearray]) -> bool:
167
- """Returns True if any known binary format is found to prevent extra scan a file without an extension."""
168
- if isinstance(data, (bytes, bytearray)) and data.startswith(b"\x7f\x45\x4c\x46") and 127 <= len(data):
169
- # https://en.wikipedia.org/wiki/Executable_and_Linkable_Format
170
- # minimal ELF is 127 bytes https://github.com/tchajed/minimal-elf
171
- return True
172
- return False
173
-
174
144
  @staticmethod
175
145
  def is_binary(data: Union[bytes, bytearray]) -> bool:
176
146
  """
@@ -239,13 +209,12 @@ class Util:
239
209
  try:
240
210
  if binary_suggest and LATIN_1 == encoding and (Util.is_binary(content) or not Util.is_latin1(content)):
241
211
  # LATIN_1 may convert data (bytes in range 0x80:0xFF are transformed)
242
- # so skip this encoding when checking binaries
243
- logger.warning("Binary file detected %s", repr(content[:8]))
244
212
  break
245
- text = content.decode(encoding, errors="strict")
246
- if content != text.encode(encoding, errors="strict"):
213
+ _text = content.decode(encoding=encoding, errors="strict")
214
+ if content != _text.encode(encoding=encoding, errors="strict"):
247
215
  # the check helps to detect a real encoding
248
216
  raise UnicodeError
217
+ text = _text
249
218
  break
250
219
  except UnicodeError:
251
220
  binary_suggest = True
@@ -254,6 +223,11 @@ class Util:
254
223
  logger.error(f"Unexpected Error: Can't read content as {encoding}. Error message: {exc}")
255
224
  return text
256
225
 
226
+ @staticmethod
227
+ def split_text(text: str) -> List[str]:
228
+ """Splits a text into lines, handling all common line endings (e.g., LF, CRLF, CR)."""
229
+ return text.replace("\r\n", '\n').replace('\r', '\n').split('\n')
230
+
257
231
  @staticmethod
258
232
  def decode_bytes(content: bytes, encodings: Optional[List[str]] = None) -> List[str]:
259
233
  """Decode content using different encodings.
@@ -272,121 +246,11 @@ class Util:
272
246
 
273
247
  """
274
248
  if text := Util.decode_text(content, encodings):
275
- lines = text.replace('\r\n', '\n').replace('\r', '\n').split('\n')
249
+ lines = Util.split_text(text)
276
250
  else:
277
251
  lines = []
278
252
  return lines
279
253
 
280
- @staticmethod
281
- def patch2files_diff(raw_patch: List[str], change_type: DiffRowType) -> Dict[str, List[DiffDict]]:
282
- """Generate files changes from patch for added or deleted filepaths.
283
-
284
- Args:
285
- raw_patch: git patch file content
286
- change_type: change type to select, DiffRowType.ADDED or DiffRowType.DELETED
287
-
288
- Return:
289
- return dict with ``{file paths: list of file row changes}``, where
290
- elements of list of file row changes represented as::
291
-
292
- {
293
- "old": line number before diff,
294
- "new": line number after diff,
295
- "line": line text,
296
- "hunk": diff hunk number
297
- }
298
-
299
- """
300
- if not raw_patch:
301
- return {}
302
-
303
- added_files, deleted_files = {}, {}
304
- try:
305
- for patch in whatthepatch.parse_patch(raw_patch):
306
- if patch.changes is None:
307
- logger.warning(f"Patch '{str(patch.header)}' cannot be scanned")
308
- continue
309
- changes = []
310
- for change in patch.changes:
311
- change_dict = change._asdict()
312
- changes.append(change_dict)
313
-
314
- added_files[patch.header.new_path] = changes
315
- deleted_files[patch.header.old_path] = changes
316
- if change_type == DiffRowType.ADDED:
317
- return added_files
318
- elif change_type == DiffRowType.DELETED:
319
- return deleted_files
320
- else:
321
- logger.error(f"Change type should be one of: '{DiffRowType.ADDED}', '{DiffRowType.DELETED}';"
322
- f" but received {change_type}")
323
- except Exception as exc:
324
- logger.exception(exc)
325
- return {}
326
-
327
- @staticmethod
328
- def preprocess_diff_rows(
329
- added_line_number: Optional[int], #
330
- deleted_line_number: Optional[int], #
331
- line: str) -> List[DiffRowData]:
332
- """Auxiliary function to extend diff changes.
333
-
334
- Args:
335
- added_line_number: number of added line or None
336
- deleted_line_number: number of deleted line or None
337
- line: the text line
338
-
339
- Return:
340
- diff rows data with as list of row change type, line number, row content
341
-
342
- """
343
- rows_data: List[DiffRowData] = []
344
- if isinstance(added_line_number, int):
345
- # indicates line was inserted
346
- rows_data.append(DiffRowData(DiffRowType.ADDED, added_line_number, line))
347
- if isinstance(deleted_line_number, int):
348
- # indicates line was removed
349
- rows_data.append(DiffRowData(DiffRowType.DELETED, deleted_line_number, line))
350
- return rows_data
351
-
352
- @staticmethod
353
- def wrong_change(change: DiffDict) -> bool:
354
- """Returns True if the change is wrong"""
355
- for i in ["line", "new", "old"]:
356
- if i not in change:
357
- logger.error(f"Skipping wrong change {change}")
358
- return True
359
- return False
360
-
361
- @staticmethod
362
- def preprocess_file_diff(changes: List[DiffDict]) -> List[DiffRowData]:
363
- """Generate changed file rows from diff data with changed lines (e.g. marked + or - in diff).
364
-
365
- Args:
366
- changes: git diff by file rows data
367
-
368
- Return:
369
- diff rows data with as list of row change type, line number, row content
370
-
371
- """
372
- if not changes:
373
- return []
374
-
375
- rows_data = []
376
- # process diff to restore lines and their positions
377
- for change in changes:
378
- if Util.wrong_change(change):
379
- continue
380
- line = change["line"]
381
- if isinstance(line, str):
382
- rows_data.extend(Util.preprocess_diff_rows(change.get("new"), change.get("old"), line))
383
- elif isinstance(line, (bytes, bytearray)):
384
- logger.warning("The feature is available with the deep scan option")
385
- else:
386
- logger.error(f"Unknown type of line {type(line)}")
387
-
388
- return rows_data
389
-
390
254
  @staticmethod
391
255
  def is_zip(data: Union[bytes, bytearray]) -> bool:
392
256
  """According https://en.wikipedia.org/wiki/List_of_file_signatures"""
@@ -486,13 +350,20 @@ class Util:
486
350
  return True
487
351
  return False
488
352
 
489
- @classmethod
490
- def is_sqlite3(cls, data):
353
+ @staticmethod
354
+ def is_sqlite3(data: Union[bytes, bytearray]):
491
355
  """According https://en.wikipedia.org/wiki/List_of_file_signatures - SQLite Database"""
492
356
  if isinstance(data, (bytes, bytearray)) and data.startswith(b"SQLite format 3\0"):
493
357
  return True
494
358
  return False
495
359
 
360
+ @staticmethod
361
+ def is_rtf(data: Union[bytes, bytearray]):
362
+ """According https://en.wikipedia.org/wiki/List_of_file_signatures - Rich Text Format"""
363
+ if isinstance(data, (bytes, bytearray)) and data.startswith(b"{\\rtf1") and data.endswith(b"}"):
364
+ return True
365
+ return False
366
+
496
367
  @staticmethod
497
368
  def is_asn1(data: Union[bytes, bytearray]) -> int:
498
369
  """Only sequence type 0x30 and size correctness are checked
@@ -706,6 +577,7 @@ class Util:
706
577
  """decode text to bytes with / without padding detect and urlsafe symbols"""
707
578
  value = text.translate(Util.WHITESPACE_TRANS_TABLE)
708
579
  if padding_safe:
580
+ value = value.rstrip('=') # python 3.10 workaround
709
581
  pad_num = 0x3 & len(value)
710
582
  if pad_num:
711
583
  value += '=' * (4 - pad_num)
@@ -769,6 +641,8 @@ class Util:
769
641
  @staticmethod
770
642
  def subtext(text: str, pos: int, hunk_size: int) -> str:
771
643
  """cut text symmetrically for given position or use remained quota to be fitted in 2x hunk_size"""
644
+ # cut trailed whitespaces to obtain more informative data
645
+ text = text.rstrip()
772
646
  if hunk_size <= pos:
773
647
  left_quota = 0
774
648
  left_pos = pos - hunk_size
{credsweeper-1.11.5.dist-info → credsweeper-1.13.3.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: credsweeper
3
- Version: 1.11.5
3
+ Version: 1.13.3
4
4
  Summary: Credential Sweeper
5
5
  Project-URL: Homepage, https://github.com/Samsung/CredSweeper
6
6
  Project-URL: Bug Tracker, https://github.com/Samsung/CredSweeper/issues
@@ -10,13 +10,12 @@ Classifier: License :: OSI Approved :: MIT License
10
10
  Classifier: Operating System :: OS Independent
11
11
  Classifier: Programming Language :: Python :: 3
12
12
  Classifier: Programming Language :: Python :: 3 :: Only
13
- Classifier: Programming Language :: Python :: 3.9
14
13
  Classifier: Programming Language :: Python :: 3.10
15
14
  Classifier: Programming Language :: Python :: 3.11
16
15
  Classifier: Programming Language :: Python :: 3.12
17
16
  Classifier: Topic :: Security
18
17
  Classifier: Topic :: Software Development :: Quality Assurance
19
- Requires-Python: >=3.9
18
+ Requires-Python: >=3.10
20
19
  Requires-Dist: base58
21
20
  Requires-Dist: beautifulsoup4>=4.11.0
22
21
  Requires-Dist: colorama
@@ -24,10 +23,10 @@ Requires-Dist: cryptography
24
23
  Requires-Dist: gitpython
25
24
  Requires-Dist: humanfriendly
26
25
  Requires-Dist: lxml
27
- Requires-Dist: numpy<2.0.0
26
+ Requires-Dist: numpy
28
27
  Requires-Dist: odfpy
29
- Requires-Dist: onnxruntime; platform_system != 'Windows'
30
- Requires-Dist: onnxruntime==1.19.2; platform_system == 'Windows'
28
+ Requires-Dist: onnxruntime; platform_system != 'Windows' or python_version != '3.12'
29
+ Requires-Dist: onnxruntime==1.19.2; platform_system == 'Windows' and python_version == '3.12'
31
30
  Requires-Dist: openpyxl
32
31
  Requires-Dist: pandas
33
32
  Requires-Dist: pdfminer-six
@@ -38,6 +37,7 @@ Requires-Dist: python-docx
38
37
  Requires-Dist: python-pptx
39
38
  Requires-Dist: pyyaml
40
39
  Requires-Dist: rpmfile
40
+ Requires-Dist: striprtf
41
41
  Requires-Dist: whatthepatch
42
42
  Requires-Dist: xlrd
43
43
  Description-Content-Type: text/markdown
@@ -90,7 +90,7 @@ Full documentation can be found here: <https://credsweeper.readthedocs.io/>
90
90
 
91
91
  ### Main Requirements
92
92
 
93
- - Python 3.9, 3.10, 3.11, 3.12
93
+ - Python 3.10, 3.11, 3.12
94
94
 
95
95
  ### Installation
96
96