credsweeper 1.11.4__py3-none-any.whl → 1.11.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of credsweeper might be problematic.

credsweeper/__init__.py CHANGED
@@ -18,4 +18,4 @@ __all__ = [
     '__version__'
 ]
 
-__version__ = "1.11.4"
+__version__ = "1.11.5"

credsweeper/deep_scanner/abstract_scanner.py CHANGED
@@ -1,12 +1,27 @@
+import contextlib
+import datetime
+import logging
 from abc import abstractmethod, ABC
-from typing import List, Optional
+from typing import List, Optional, Tuple, Any, Generator
 
+from credsweeper.common.constants import RECURSIVE_SCAN_LIMITATION, MIN_DATA_LEN, DEFAULT_ENCODING, UTF_8, \
+    MIN_VALUE_LENGTH
 from credsweeper.config import Config
 from credsweeper.credentials import Candidate
+from credsweeper.credentials.augment_candidates import augment_candidates
+from credsweeper.file_handler.byte_content_provider import ByteContentProvider
+from credsweeper.file_handler.content_provider import ContentProvider
 from credsweeper.file_handler.data_content_provider import DataContentProvider
+from credsweeper.file_handler.descriptor import Descriptor
+from credsweeper.file_handler.diff_content_provider import DiffContentProvider
+from credsweeper.file_handler.file_path_extractor import FilePathExtractor
+from credsweeper.file_handler.string_content_provider import StringContentProvider
 from credsweeper.file_handler.struct_content_provider import StructContentProvider
+from credsweeper.file_handler.text_content_provider import TextContentProvider
 from credsweeper.scanner import Scanner
 
+logger = logging.getLogger(__name__)
+
 
 class AbstractScanner(ABC):
     """Base abstract class for all recursive scanners"""
@@ -24,28 +39,268 @@ class AbstractScanner(ABC):
         raise NotImplementedError(__name__)
 
     @abstractmethod
-    def recursive_scan(
+    def data_scan(
             self,  #
             data_provider: DataContentProvider,  #
-            depth: int = 0,  #
-            recursive_limit_size: int = 0) -> List[Candidate]:
+            depth: int,  #
+            recursive_limit_size: int) -> Optional[List[Candidate]]:
         """Abstract method to be defined in DeepScanner"""
         raise NotImplementedError(__name__)
 
+    @staticmethod
     @abstractmethod
+    def get_deep_scanners(data: bytes, descriptor: Descriptor, depth: int) -> Tuple[List[Any], List[Any]]:
+        """Returns possible scan methods for the data depending on content, plus fallback scanners"""
+
+    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+
+    def recursive_scan(
+            self,  #
+            data_provider: DataContentProvider,  #
+            depth: int = 0,  #
+            recursive_limit_size: int = 0) -> List[Candidate]:
+        """Recursive function to scan files which might be containers like ZIP archives
+
+        Args:
+            data_provider: DataContentProvider object which may be a container
+            depth: maximal level of recursion
+            recursive_limit_size: maximal bytes of opened files to prevent recursive zip-bomb attack
+        """
+        candidates: List[Candidate] = []
+        if 0 > depth:
+            # break recursion if maximal depth is reached
+            logger.debug("Bottom reached %s recursive_limit_size:%d", data_provider.file_path, recursive_limit_size)
+            return candidates
+        depth -= 1
+        if MIN_DATA_LEN > len(data_provider.data):
+            # break recursion for minimal data size
+            logger.debug("Too small data: size=%d, depth=%d, limit=%d, path=%s, info=%s", len(data_provider.data),
+                         depth, recursive_limit_size, data_provider.file_path, data_provider.info)
+            return candidates
+        logger.debug("Start data_scan: size=%d, depth=%d, limit=%d, path=%s, info=%s", len(data_provider.data), depth,
+                     recursive_limit_size, data_provider.file_path, data_provider.info)
+
+        if FilePathExtractor.is_find_by_ext_file(self.config, data_provider.file_type):
+            # Skip scanning the file and make a fake candidate because the extension is suspicious
+            dummy_candidate = Candidate.get_dummy_candidate(self.config, data_provider.file_path,
+                                                            data_provider.file_type, data_provider.info,
+                                                            FilePathExtractor.FIND_BY_EXT_RULE)
+            candidates.append(dummy_candidate)
+        else:
+            new_candidates = self.deep_scan_with_fallback(data_provider, depth, recursive_limit_size)
+            augment_candidates(candidates, new_candidates)
+
+        return candidates
+
+    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+
+    @staticmethod
+    def key_value_combination(structure: dict) -> Generator[Tuple[Any, Any], None, None]:
+        """Combine items by `key` and `value` from a dictionary for augmentation
+        {..., "key": "api_key", "value": "XXXXXXX", ...} -> ("api_key", "XXXXXXX")
+
+        """
+        for key_id in ("key", "KEY", "Key"):
+            if key_id in structure:
+                struct_key = structure.get(key_id)
+                break
+        else:
+            struct_key = None
+        if isinstance(struct_key, bytes):
+            # sqlite table may produce bytes for `key`
+            with contextlib.suppress(UnicodeError):
+                struct_key = struct_key.decode(UTF_8)
+        # only the str type is commonly used for the augmentation
+        if struct_key and isinstance(struct_key, str):
+            for value_id in ("value", "VALUE", "Value"):
+                if value_id in structure:
+                    struct_value = structure.get(value_id)
+                    if struct_value and isinstance(struct_value, (str, bytes)):
+                        yield struct_key, struct_value
+                        # break in successful case
+                        break
+
+    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+
+    @staticmethod
+    def structure_processing(structure: Any) -> Generator[Tuple[Any, Any], None, None]:
+        """Yields pair `key, value` from given structure if applicable"""
+        if isinstance(structure, dict):
+            # transform dictionary to list
+            for key, value in structure.items():
+                if not value:
+                    # skip empty values
+                    continue
+                if isinstance(value, (list, tuple)):
+                    if 1 == len(value):
+                        # simplify some structures like YAML when a single item on a new line is a value
+                        yield key, value[0]
+                        continue
+                # all other data will be processed in the next code
+                yield key, value
+            yield from AbstractScanner.key_value_combination(structure)
+        elif isinstance(structure, (list, tuple)):
+            # enumerate the items to fit the return structure
+            for key, value in enumerate(structure):
+                yield key, value
+        else:
+            logger.error("Not supported type:%s val:%s", str(type(structure)), repr(structure))
+
+    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+
     def structure_scan(
             self,  #
             struct_provider: StructContentProvider,  #
             depth: int,  #
             recursive_limit_size: int) -> List[Candidate]:
-        """Abstract method to be defined in DeepScanner"""
-        raise NotImplementedError(__name__)
+        """Recursive function to scan structured data
 
-    @abstractmethod
-    def data_scan(
-            self,  #
-            data_provider: DataContentProvider,  #
-            depth: int,  #
-            recursive_limit_size: int) -> Optional[List[Candidate]]:
-        """Abstract method to be defined in DeepScanner"""
-        raise NotImplementedError(__name__)
+        Args:
+            struct_provider: StructContentProvider object which may be a container
+            depth: maximal level of recursion
+            recursive_limit_size: maximal bytes of opened files to prevent recursive zip-bomb attack
+        """
+        candidates: List[Candidate] = []
+        logger.debug("Start struct_scan: depth=%d, limit=%d, path=%s, info=%s", depth, recursive_limit_size,
+                     struct_provider.file_path, struct_provider.info)
+
+        if 0 > depth:
+            # break recursion if maximal depth is reached
+            logger.debug("bottom reached %s recursive_limit_size:%d", struct_provider.file_path, recursive_limit_size)
+            return candidates
+
+        depth -= 1
+
+        augmented_lines_for_keyword_rules = []
+        for key, value in AbstractScanner.structure_processing(struct_provider.struct):
+            # a keyword rule may be applicable for `key` (str only) and `value` (str, bytes)
+            keyword_match = bool(isinstance(key, str) and self.scanner.keywords_required_substrings_check(key.lower()))
+
+            if isinstance(value, (dict, list, tuple)) and value:
+                # recursive scan for a non-empty structured `value`
+                val_struct_provider = StructContentProvider(struct=value,
+                                                            file_path=struct_provider.file_path,
+                                                            file_type=struct_provider.file_type,
+                                                            info=f"{struct_provider.info}|STRUCT:{key}")
+                new_candidates = self.structure_scan(val_struct_provider, depth, recursive_limit_size)
+                candidates.extend(new_candidates)
+            elif isinstance(value, bytes):
+                # recursive data scan
+                if MIN_DATA_LEN <= len(value):
+                    bytes_struct_provider = DataContentProvider(data=value,
+                                                                file_path=struct_provider.file_path,
+                                                                file_type=struct_provider.file_type,
+                                                                info=f"{struct_provider.info}|BYTES:{key}")
+                    new_limit = recursive_limit_size - len(value)
+                    new_candidates = self.recursive_scan(bytes_struct_provider, depth, new_limit)
+                    candidates.extend(new_candidates)
+                if keyword_match and MIN_VALUE_LENGTH <= len(value):
+                    augmented_lines_for_keyword_rules.append(f"{key} = {repr(value)}")
+            elif isinstance(value, str):
+                # recursive text scan with transformation into bytes
+                stripped_value = value.strip()
+                if MIN_DATA_LEN <= len(stripped_value):
+                    # recursive scan only for data which may be decoded at least
+                    with contextlib.suppress(UnicodeError):
+                        data = stripped_value.encode(encoding=DEFAULT_ENCODING, errors='strict')
+                        str_struct_provider = DataContentProvider(data=data,
+                                                                  file_path=struct_provider.file_path,
+                                                                  file_type=struct_provider.file_type,
+                                                                  info=f"{struct_provider.info}|STRING:{key}")
+                        new_limit = recursive_limit_size - len(str_struct_provider.data)
+                        new_candidates = self.recursive_scan(str_struct_provider, depth, new_limit)
+                        candidates.extend(new_candidates)
+                if keyword_match and MIN_VALUE_LENGTH <= len(stripped_value):
+                    augmented_lines_for_keyword_rules.append(f"{key} = {repr(stripped_value)}")
+            elif value is None or isinstance(value, (int, float, datetime.date, datetime.datetime)):
+                # skip useless types
+                pass
+            else:
+                logger.warning("Not supported type:%s value(%s)", str(type(value)), str(value))
+
+        if augmented_lines_for_keyword_rules:
+            str_provider = StringContentProvider(augmented_lines_for_keyword_rules,
                                                 file_path=struct_provider.file_path,
+                                                 file_type=struct_provider.file_type,
+                                                 info=f"{struct_provider.info}|KEYWORD")
+            new_candidates = self.scanner.scan(str_provider)
+            augment_candidates(candidates, new_candidates)
+
+        return candidates
+
+    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+
+    def deep_scan_with_fallback(self, data_provider: DataContentProvider, depth: int,
+                                recursive_limit_size: int) -> List[Candidate]:
+        """Scans with deep scanners, and with fallback scanners if necessary
+
+        Args:
+            data_provider: DataContentProvider with raw data
+            depth: maximal level of recursion
+            recursive_limit_size: maximal bytes of opened files to prevent recursive zip-bomb attack
+
+        Returns: list with candidates
+
+        """
+        candidates: List[Candidate] = []
+        deep_scanners, fallback_scanners = self.get_deep_scanners(data_provider.data, data_provider.descriptor, depth)
+        fallback = True
+        for scan_class in deep_scanners:
+            new_candidates = scan_class.data_scan(self, data_provider, depth, recursive_limit_size)
+            if new_candidates is None:
+                # the scanner did not recognise the content type
+                continue
+            augment_candidates(candidates, new_candidates)
+            # this scan was successful, so fallback is not necessary
+            fallback = False
+        if fallback:
+            for scan_class in fallback_scanners:
+                fallback_candidates = scan_class.data_scan(self, data_provider, depth, recursive_limit_size)
+                if fallback_candidates is None:
+                    continue
+                augment_candidates(candidates, fallback_candidates)
+                # use only the first successful fallback scanner
+                break
+        return candidates
+
+    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+
+    def scan(self,
+             content_provider: ContentProvider,
+             depth: int,
+             recursive_limit_size: Optional[int] = None) -> List[Candidate]:
+        """Initial scan method to launch the recursive scan. Skips ByteScanner to prevent an extra scan
+
+        Args:
+            content_provider: ContentProvider that might contain raw data
+            depth: maximal level of recursion
+            recursive_limit_size: maximal bytes of opened files to prevent recursive zip-bomb attack
+        """
+        recursive_limit_size = recursive_limit_size if isinstance(recursive_limit_size,
+                                                                  int) else RECURSIVE_SCAN_LIMITATION
+        candidates: List[Candidate] = []
+        data: Optional[bytes] = None
+        if isinstance(content_provider, (TextContentProvider, ByteContentProvider)):
+            # Feature to scan files which might be containers
+            data = content_provider.data
+            info = f"FILE:{content_provider.file_path}"
+        elif isinstance(content_provider, DiffContentProvider) and content_provider.diff:
+            candidates = self.scanner.scan(content_provider)
+            # Feature to scan binary diffs
+            diff = content_provider.diff[0].get("line")
+            # the type check is legal and fixes a mypy issue
+            if isinstance(diff, bytes):
+                data = diff
+            info = f"DIFF:{content_provider.file_path}"
+        else:
+            logger.warning(f"Content provider {type(content_provider)} does not support deep scan")
+            info = "NA"
+
+        if data:
+            data_provider = DataContentProvider(data=data,
+                                                file_path=content_provider.file_path,
+                                                file_type=content_provider.file_type,
+                                                info=content_provider.info or info)
+            new_candidates = self.deep_scan_with_fallback(data_provider, depth, recursive_limit_size - len(data))
+            augment_candidates(candidates, new_candidates)
+        return candidates
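
Note on the new key_value_combination() helper above: it folds a record that stores a credential name under a "key" item and its secret under a "value" item into a single (name, secret) pair, so that keyword rules can match records such as sqlite rows or parsed JSON/YAML objects. A minimal standalone sketch of the same pairing logic (the function name key_value_pairs is illustrative, not part of the package):

import contextlib
from typing import Any, Generator, Tuple

def key_value_pairs(structure: dict) -> Generator[Tuple[str, Any], None, None]:
    # look up the credential name under common spellings of "key"
    for key_id in ("key", "KEY", "Key"):
        if key_id in structure:
            struct_key = structure[key_id]
            break
    else:
        struct_key = None
    if isinstance(struct_key, bytes):
        # a sqlite table may produce bytes for the key
        with contextlib.suppress(UnicodeError):
            struct_key = struct_key.decode("utf-8")
    if struct_key and isinstance(struct_key, str):
        for value_id in ("value", "VALUE", "Value"):
            struct_value = structure.get(value_id)
            if struct_value and isinstance(struct_value, (str, bytes)):
                yield struct_key, struct_value
                break

print(list(key_value_pairs({"key": "api_key", "value": "XXXXXXX"})))
# prints [('api_key', 'XXXXXXX')]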

credsweeper/deep_scanner/deb_scanner.py CHANGED
@@ -1,11 +1,13 @@
 import logging
+import struct
 from abc import ABC
-from typing import List, Optional
+from typing import List, Optional, Generator, Tuple
 
-from credsweeper.common.constants import ASCII, MIN_DATA_LEN
+from credsweeper.common.constants import MIN_DATA_LEN, UTF_8
 from credsweeper.credentials import Candidate
 from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
 from credsweeper.file_handler.data_content_provider import DataContentProvider
+from credsweeper.utils.util import Util
 
 logger = logging.getLogger(__name__)
 
@@ -13,36 +15,41 @@ logger = logging.getLogger(__name__)
 class DebScanner(AbstractScanner, ABC):
     """Implements deb (ar) scanning"""
 
+    __header_size = 60
+
+    @staticmethod
+    def walk_deb(data: bytes) -> Generator[Tuple[int, str, bytes], None, None]:
+        """Processes the member sequence of a DEB (ar) archive and yields offset, name and data"""
+        offset = 8  # b"!<arch>\n"
+        data_limit = len(data) - DebScanner.__header_size
+        while offset <= data_limit:
+            _data = data[offset:offset + DebScanner.__header_size]
+            offset += DebScanner.__header_size
+            # basic header structure
+            _name, _, _size, __ = struct.unpack('16s32s10s2s', _data)
+            file_size = int(_size)
+            if MIN_DATA_LEN < file_size <= len(data) - offset:
+                _data = data[offset:offset + file_size]
+                yield offset, _name.decode(encoding=UTF_8).strip().rstrip('/'), _data
+            offset += file_size if 0 == 1 & file_size else file_size + 1
+
     def data_scan(
             self,  #
             data_provider: DataContentProvider,  #
             depth: int,  #
             recursive_limit_size: int) -> Optional[List[Candidate]]:
         """Extracts data files from an .ar (debian) archive and launches the recursive scan"""
-        candidates: Optional[List[Candidate]] = None
-        offset = 8  # b"!<arch>\n"
-        while offset < len(data_provider.data):
-            try:
-                file_size_data = data_provider.data[offset + 48:offset + 58]
-                file_size = int(file_size_data.decode(ASCII))
-                offset += 60
-                if file_size < MIN_DATA_LEN:
-                    offset += file_size
-                    continue
-                data = data_provider.data[offset:offset + file_size]
+        try:
+            candidates: List[Candidate] = []
+            for offset, name, data in DebScanner.walk_deb(data_provider.data):
                 deb_content_provider = DataContentProvider(data=data,
-                                                           file_path=data_provider.file_path,
-                                                           file_type=data_provider.file_type,
+                                                           file_path=f"{data_provider.file_path}/{name}",
+                                                           file_type=Util.get_extension(name),
                                                            info=f"{data_provider.info}|DEB:0x{offset:x}")
-                new_limit = recursive_limit_size - file_size
+                new_limit = recursive_limit_size - len(data)
                 deb_candidates = self.recursive_scan(deb_content_provider, depth, new_limit)
-                if deb_candidates is not None:
-                    if candidates:
-                        candidates.extend(deb_candidates)
-                    else:
-                        candidates = deb_candidates
-                # data padding = 2
-                offset += 1 + file_size if 1 & file_size else file_size
-            except Exception as exc:
-                logger.error(exc)
-        return candidates
+                candidates.extend(deb_candidates)
+            return candidates
+        except Exception as exc:
+            logger.error(exc)
+            return None
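
For context on the unpack pattern above: the 60-byte block that walk_deb() reads with struct.unpack('16s32s10s2s', ...) is the standard Unix ar member header used by .deb packages: a 16-byte file identifier, 32 bytes of metadata (12-byte timestamp, 6-byte owner ID, 6-byte group ID, 8-byte mode), a 10-byte decimal size, and a 2-byte terminator. Member data is aligned to 2 bytes, which explains the final offset adjustment. A small self-contained sketch that builds one ar member in memory and parses it back the same way (the member name and payload are made up for illustration):

import struct

# a minimal in-memory ar archive with a single member
payload = b"hello from an ar member!"
header = b"demo.txt/".ljust(16)                 # file identifier, '/'-terminated
header += b"0".ljust(12)                        # modification timestamp
header += b"0".ljust(6)                         # owner ID
header += b"0".ljust(6)                         # group ID
header += b"100644".ljust(8)                    # file mode
header += str(len(payload)).encode().ljust(10)  # decimal file size
header += b"`\n"                                # header terminator magic
archive = b"!<arch>\n" + header + payload
if len(payload) % 2:
    archive += b"\n"                            # 2-byte alignment padding

# parse it back with the same pattern the scanner uses
offset = 8  # skip the b"!<arch>\n" global signature
head = archive[offset:offset + 60]
name, _meta, size, _magic = struct.unpack('16s32s10s2s', head)
print(name.decode().strip().rstrip('/'), int(size))  # demo.txt 24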