scanoss 1.18.0__py3-none-any.whl → 1.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,525 @@
1
+ """
2
+ SPDX-License-Identifier: MIT
3
+
4
+ Copyright (c) 2024, SCANOSS
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in
14
+ all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ THE SOFTWARE.
23
+ """
24
+
25
+ import os
26
+ import sys
27
+ from pathlib import Path
28
+ from typing import List
29
+
30
+ from pathspec import GitIgnoreSpec
31
+
32
+ from .scanoss_settings import ScanossSettings
33
+ from .scanossbase import ScanossBase
34
+
35
# Exact file names that are never processed.
# Entries are lower case because they are compared against the lower-cased file name.
DEFAULT_SKIPPED_FILES = {
    # Gradle / Maven wrapper scripts and jars
    'gradlew', 'gradlew.bat', 'gradle-wrapper.jar',
    'mvnw', 'mvnw.cmd', 'maven-wrapper.jar',
    # Licence texts
    'license.txt', 'license.md', 'copying.lib',
    # Miscellaneous
    'thumbs.db', 'babel.config.js', 'makefile',
}
50
# Directory names that are never descended into.
# Entries are lower case because they are compared against the lower-cased directory name.
DEFAULT_SKIPPED_DIRS = {
    'nbproject', 'nbbuild', 'nbdist',
    '__pycache__', '__pypackages__', 'venv', 'eggs', 'wheels', 'htmlcov',
    '_yardoc',
}
63
# Directory name suffixes to skip (checked with ``endswith`` on the lower-cased directory name)
DEFAULT_SKIPPED_DIR_EXT = {'.egg-info'}
67
# File name suffixes to skip.
# Each entry is tested with ``endswith`` against the lower-cased file name, so the set
# holds both dotted extensions and bare name endings.
DEFAULT_SKIPPED_EXT = {
    # Dotted extensions
    '.1', '.2', '.3', '.4', '.5', '.6', '.7', '.8', '.9',
    '.ac', '.adoc', '.am', '.asciidoc', '.bmp', '.build',
    '.cfg', '.chm', '.class', '.cmake', '.cnf', '.conf', '.config',
    '.contributors', '.copying', '.crt', '.csproj', '.css', '.csv',
    '.dat', '.data', '.doc', '.docx', '.dtd', '.dts', '.iws', '.c9', '.c9revisions', '.dtsi', '.dump',
    '.eot', '.eps', '.geojson', '.gdoc', '.gif', '.glif', '.gmo', '.gradle', '.guess',
    '.hex', '.htm', '.html', '.ico', '.iml', '.in', '.inc', '.info', '.ini', '.ipynb',
    '.jpeg', '.jpg', '.json', '.jsonld', '.lock', '.log',
    '.m4', '.map', '.markdown', '.md', '.md5', '.meta', '.mk', '.mxml',
    '.o', '.otf', '.out', '.pbtxt', '.pdf', '.pem', '.phtml', '.plist', '.png', '.po', '.ppt',
    '.prefs', '.properties', '.pyc',
    '.qdoc', '.result', '.rgb', '.rst',
    '.scss', '.sha', '.sha1', '.sha2', '.sha256', '.sln', '.spec', '.sql', '.sub', '.svg', '.svn-base',
    '.tab', '.template', '.test', '.tex', '.tiff', '.toml', '.ttf', '.txt', '.utf-8',
    '.vim', '.wav', '.woff', '.woff2',
    '.xht', '.xhtml', '.xls', '.xlsx', '.xml', '.xpm', '.xsd', '.xul', '.yaml', '.yml',
    '.wfp', '.editorconfig', '.dotcover', '.pid', '.lcov', '.egg', '.manifest', '.cache',
    '.coverage', '.cover', '.gem', '.lst', '.pickle', '.pdb', '.gml', '.pot', '.plt',
    # Bare file name endings
    '-doc', 'changelog', 'config', 'copying', 'license', 'authors', 'news', 'licenses', 'notice',
    'readme', 'swiftdoc', 'texidoc', 'todo', 'version', 'ignore', 'manifest', 'sqlite', 'sqlite3',
}
230
+
231
+
232
class FileFilters(ScanossBase):
    """
    Filter for determining which files to process during scanning, fingerprinting, etc.

    Handles both inclusion and exclusion rules based on file paths, extensions, and sizes.
    """

    def __init__(
        self,
        debug: bool = False,
        trace: bool = False,
        quiet: bool = False,
        scanoss_settings: 'ScanossSettings | None' = None,
        all_extensions: bool = False,
        all_folders: bool = False,
        hidden_files_folders: bool = False,
        operation_type: str = 'scanning',
        skip_size: int = 0,
        skip_extensions: 'List[str] | None' = None,
        skip_folders: 'List[str] | None' = None,
    ):
        """
        Initialize scan filters based on default settings. Optionally append custom settings.

        Args:
            debug (bool): Enable debug output
            trace (bool): Enable trace output
            quiet (bool): Suppress output
            scanoss_settings (ScanossSettings): Custom settings to override defaults
            all_extensions (bool): Include all file extensions
            all_folders (bool): Include all folders
            hidden_files_folders (bool): Include hidden files and folders
            operation_type: operation type. can be either 'scanning' or 'fingerprinting'
            skip_size (int): Global minimum file size. When greater than zero it replaces any
                per-pattern size rules from the settings
            skip_extensions (List[str]): Extra file name endings to skip (on top of the defaults)
            skip_folders (List[str]): Extra folder names to skip (on top of the defaults)
        """
        super().__init__(debug, trace, quiet)

        if skip_folders is None:
            skip_folders = []
        if skip_extensions is None:
            skip_extensions = []
        self.hidden_files_folders = hidden_files_folders
        self.scanoss_settings = scanoss_settings
        self.all_extensions = all_extensions
        self.all_folders = all_folders
        self.skip_folders = skip_folders
        self.skip_size = skip_size
        self.skip_extensions = skip_extensions
        # Compiled size-rule specs, keyed by the rule's pattern tuple (see _get_size_rule_spec)
        self._size_spec_cache = {}
        self.file_folder_pat_spec = self._get_file_folder_pattern_spec(operation_type)
        self.size_pat_rules = self._get_size_limit_pattern_rules(operation_type)

    def get_filtered_files_from_folder(self, root: str) -> List[str]:
        """
        Retrieve a list of files to scan or fingerprint from a given directory root based on filter settings.

        Args:
            root (str): Root directory to scan or fingerprint

        Returns:
            list[str]: Filtered list of files (paths relative to the resolved root) to scan or fingerprint.
                Empty if the root does not exist or is not a directory.
        """
        if self.debug:
            if self.file_folder_pat_spec:
                self.print_stderr(f'Running with {len(self.file_folder_pat_spec)} pattern filters.')
            if self.size_pat_rules:
                self.print_stderr(f'Running with {len(self.size_pat_rules)} size pattern rules.')
            if self.skip_size:
                self.print_stderr(f'Running with global skip size: {self.skip_size}')
            if self.skip_extensions:
                self.print_stderr(f'Running with extra global skip extensions: {self.skip_extensions}')
            if self.skip_folders:
                self.print_stderr(f'Running with extra global skip folders: {self.skip_folders}')
        all_files = []
        root_path = Path(root).resolve()
        if not root_path.exists() or not root_path.is_dir():
            self.print_stderr(f'ERROR: Specified root directory {root} does not exist or is not a directory.')
            return all_files
        # Walk the tree looking for files to process. While taking into account files/folders to skip
        for dirpath, dirnames, filenames in os.walk(root_path):
            dirpath = Path(dirpath)
            rel_path = dirpath.relative_to(root_path)
            if dirpath.is_symlink():  # TODO should we skip symlink folders?
                self.print_msg(f'WARNING: Found symbolic link folder: {dirpath}')

            if self._should_skip_dir(str(rel_path)):  # Current directory should be skipped
                # Clearing dirnames in place stops os.walk from descending any further
                dirnames.clear()
                continue
            all_files.extend(str(dirpath / filename) for filename in filenames)
        # End os.walk loop
        # Now filter the files and return the reduced list
        return self.get_filtered_files_from_files(all_files, str(root_path))

    def get_filtered_files_from_files(self, files: List[str], scan_root: str = None) -> List[str]:
        """
        Retrieve a list of files to scan or fingerprint from a given list of files based on filter settings.

        Args:
            files (List[str]): List of files to scan or fingerprint
            scan_root (str): Root directory to scan or fingerprint. Returned paths are relative to it;
                when omitted they are relative to the current working directory

        Returns:
            list[str]: Filtered list of files to scan or fingerprint
        """
        filtered_files = []
        for file_path in files:
            if not os.path.exists(file_path) or not os.path.isfile(file_path) or os.path.islink(file_path):
                self.print_debug(
                    f'WARNING: File {file_path} does not exist, is not a file, or is a symbolic link. Ignoring.'
                )
                continue
            try:
                if scan_root:
                    rel_path = os.path.relpath(file_path, scan_root)
                else:
                    rel_path = os.path.relpath(file_path)
            except ValueError:
                # relpath cannot build a relative path (e.g. on Windows when the file and the
                # start directory are on different drives), so the file is ignored
                self.print_debug(f'Ignoring file: {file_path} (broken symlink)')
                continue
            if self._should_skip_file(rel_path):
                continue
            try:
                file_size = os.path.getsize(file_path)
                if file_size == 0:
                    self.print_debug(f'Skipping file: {rel_path} (empty file)')
                    continue
                # Match size rules against the same root-relative path used for the skip patterns
                min_size, max_size = self._get_operation_size_limits(file_path, scan_root)
                if min_size <= file_size <= max_size:
                    filtered_files.append(rel_path)
                else:
                    self.print_debug(
                        f'Skipping file: {rel_path} (size {file_size} outside limits {min_size}-{max_size})'
                    )
            except OSError as e:
                self.print_debug(f'Error getting size for {rel_path}: {e}')
        # End file loop
        return filtered_files

    def _get_file_folder_pattern_spec(self, operation_type: str = 'scanning'):
        """
        Get file path pattern specification.

        :param operation_type: which operation is being performed
        :return: GitIgnoreSpec compiled from the skip patterns, or None when there are no patterns
        """
        patterns = self._get_operation_patterns(operation_type)
        if patterns:
            return GitIgnoreSpec.from_lines(patterns)
        return None

    def _get_size_limit_pattern_rules(self, operation_type: str = 'scanning'):
        """
        Get size limit pattern rules.

        :param operation_type: which operation is being performed
        :return: List of size limit rules that carry at least one pattern, or None when no
                 settings or no size rules are available
        """
        if self.scanoss_settings:
            size_rules = self.scanoss_settings.get_skip_sizes(operation_type)
            if size_rules:
                # A rule without patterns can never match a file, so drop it up front
                return [rule for rule in size_rules if rule.get('patterns', [])]
        return None

    def _get_operation_patterns(self, operation_type: str) -> List[str]:
        """
        Get the skip patterns configured in the settings for the given operation type.

        Args:
            operation_type (str): Type of operation ('scanning' or 'fingerprinting')

        Returns:
            List[str]: List of patterns to skip (empty when no settings are loaded)
        """
        patterns = []
        if self.scanoss_settings:
            # Tolerate a settings object that reports "no patterns" as None
            patterns.extend(self.scanoss_settings.get_skip_patterns(operation_type) or [])
        return patterns

    def _get_size_rule_spec(self, patterns):
        """
        Return the compiled GitIgnoreSpec for a size rule's patterns, compiling it only once.

        Args:
            patterns: Iterable of pattern strings taken from a size rule

        Returns:
            GitIgnoreSpec: Compiled specification for the given patterns
        """
        key = tuple(patterns)
        spec = self._size_spec_cache.get(key)
        if spec is None:
            spec = GitIgnoreSpec.from_lines(patterns)
            self._size_spec_cache[key] = spec
        return spec

    def _get_operation_size_limits(self, file_path: str = None, scan_root: str = None) -> tuple:
        """
        Get the size limits that apply to a file path.

        Args:
            file_path (str, optional): Path to the file to check against patterns. If None, returns default limits.
            scan_root (str, optional): Directory the size-rule patterns are relative to. If None, the
                path is taken relative to the current working directory.

        Returns:
            tuple: (min_size, max_size) tuple for the given file path
        """
        min_size = 0
        max_size = sys.maxsize
        # A global minimum file size takes precedence over any per-pattern rule
        if self.skip_size > 0:
            return self.skip_size, max_size
        # Return default size limits if no settings specified
        if not self.scanoss_settings or not file_path or not self.size_pat_rules:
            return min_size, max_size
        try:
            if scan_root:
                rel_path = os.path.relpath(file_path, scan_root)
            else:
                rel_path = os.path.relpath(file_path)
        except ValueError:
            rel_path = os.path.basename(file_path)
        # NOTE(review): the path is lower-cased before matching but the patterns are not,
        # so patterns containing upper-case characters will not match - confirm this is intended
        rel_path_lower = rel_path.lower()
        # Cycle through each rule looking for a match; the first matching rule wins
        for rule in self.size_pat_rules:
            patterns = rule.get('patterns', [])
            if not patterns:
                continue
            if self._get_size_rule_spec(patterns).match_file(rel_path_lower):
                rule_min = rule.get('min')
                rule_max = rule.get('max')
                # A missing or null bound falls back to the default for that side
                return (
                    min_size if rule_min is None else rule_min,
                    max_size if rule_max is None else rule_max,
                )
        # End rules loop
        return min_size, max_size

    def _should_skip_dir(self, dir_rel_path: str) -> bool:
        """
        Check if a directory should be skipped based on the configured and default rules.

        Args:
            dir_rel_path (str): Relative path to the directory

        Returns:
            bool: True if directory should be skipped, False otherwise
        """
        dir_name = os.path.basename(dir_rel_path)
        dir_path = Path(dir_rel_path)
        # Hidden directories are rejected first, even when all_folders is set
        if (
            not self.hidden_files_folders
            and dir_path != Path('.')
            and any(part.startswith('.') for part in dir_path.parts)
        ):
            self.print_debug(f'Skipping directory: {dir_rel_path} (hidden directory)')
            return True
        if self.all_folders:
            return False
        dir_name_lower = dir_name.lower()
        if dir_name_lower in DEFAULT_SKIPPED_DIRS:
            self.print_debug(f'Skipping directory: {dir_rel_path} (matches default skip directory)')
            return True
        if self.skip_folders and dir_name in self.skip_folders:
            self.print_debug(f'Skipping directory: {dir_rel_path} (matches skip folder)')
            return True
        for ext in DEFAULT_SKIPPED_DIR_EXT:
            if dir_name_lower.endswith(ext):
                self.print_debug(f'Skipping directory: {dir_rel_path} (matches default skip extension: {ext})')
                return True

        # NOTE(review): the path is matched without a trailing separator - confirm that
        # directory-only patterns (e.g. `build/`) are expected to match here
        if self.file_folder_pat_spec and self.file_folder_pat_spec.match_file(dir_rel_path):
            self.print_debug(f'Skipping directory: {dir_rel_path} (matches custom pattern)')
            return True
        return False

    def _should_skip_file(self, file_rel_path: str) -> bool:
        """
        Check if a file should be skipped based on the configured and default rules.

        Args:
            file_rel_path (str): Relative path to the file

        Returns:
            bool: True if file should be skipped, False otherwise
        """
        file_name = os.path.basename(file_rel_path)

        # Hidden files are rejected first, even when all_extensions is set
        if not self.hidden_files_folders and file_name.startswith('.'):
            self.print_debug(f'Skipping file: {file_rel_path} (hidden file)')
            return True
        if self.all_extensions:
            return False
        file_name_lower = file_name.lower()
        # Look for exact files
        if file_name_lower in DEFAULT_SKIPPED_FILES:
            self.print_debug(f'Skipping file: {file_rel_path} (matches default skip file)')
            return True
        # Look for file endings
        for ending in DEFAULT_SKIPPED_EXT:
            if file_name_lower.endswith(ending):
                self.print_debug(f'Skipping file: {file_rel_path} (matches default skip ending: {ending})')
                return True
        # Look for custom (extra) endings
        if self.skip_extensions:
            for ending in self.skip_extensions:
                # Empty endings would match every file; compare case-insensitively because
                # the file name has already been lower-cased
                if ending and file_name_lower.endswith(ending.lower()):
                    self.print_debug(f'Skipping file: {file_rel_path} (matches skip extension)')
                    return True
        # Check for file patterns
        if self.file_folder_pat_spec and self.file_folder_pat_spec.match_file(file_rel_path):
            self.print_debug(f'Skipping file: {file_rel_path} (matches custom pattern)')
            return True
        return False