scanoss 1.12.2__py3-none-any.whl → 1.43.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- protoc_gen_swagger/__init__.py +13 -13
- protoc_gen_swagger/options/__init__.py +13 -13
- protoc_gen_swagger/options/annotations_pb2.py +18 -12
- protoc_gen_swagger/options/annotations_pb2.pyi +48 -0
- protoc_gen_swagger/options/annotations_pb2_grpc.py +20 -0
- protoc_gen_swagger/options/openapiv2_pb2.py +110 -99
- protoc_gen_swagger/options/openapiv2_pb2.pyi +1317 -0
- protoc_gen_swagger/options/openapiv2_pb2_grpc.py +20 -0
- scanoss/__init__.py +18 -18
- scanoss/api/__init__.py +17 -17
- scanoss/api/common/__init__.py +17 -17
- scanoss/api/common/v2/__init__.py +17 -17
- scanoss/api/common/v2/scanoss_common_pb2.py +49 -20
- scanoss/api/common/v2/scanoss_common_pb2_grpc.py +25 -0
- scanoss/api/components/__init__.py +17 -17
- scanoss/api/components/v2/__init__.py +17 -17
- scanoss/api/components/v2/scanoss_components_pb2.py +68 -43
- scanoss/api/components/v2/scanoss_components_pb2_grpc.py +83 -22
- scanoss/api/cryptography/v2/scanoss_cryptography_pb2.py +136 -21
- scanoss/api/cryptography/v2/scanoss_cryptography_pb2_grpc.py +766 -13
- scanoss/api/dependencies/__init__.py +17 -17
- scanoss/api/dependencies/v2/__init__.py +17 -17
- scanoss/api/dependencies/v2/scanoss_dependencies_pb2.py +56 -29
- scanoss/api/dependencies/v2/scanoss_dependencies_pb2_grpc.py +94 -8
- scanoss/api/geoprovenance/__init__.py +23 -0
- scanoss/api/geoprovenance/v2/__init__.py +23 -0
- scanoss/api/geoprovenance/v2/scanoss_geoprovenance_pb2.py +92 -0
- scanoss/api/geoprovenance/v2/scanoss_geoprovenance_pb2_grpc.py +381 -0
- scanoss/api/licenses/__init__.py +23 -0
- scanoss/api/licenses/v2/__init__.py +23 -0
- scanoss/api/licenses/v2/scanoss_licenses_pb2.py +84 -0
- scanoss/api/licenses/v2/scanoss_licenses_pb2_grpc.py +302 -0
- scanoss/api/scanning/__init__.py +17 -17
- scanoss/api/scanning/v2/__init__.py +17 -17
- scanoss/api/scanning/v2/scanoss_scanning_pb2.py +42 -13
- scanoss/api/scanning/v2/scanoss_scanning_pb2_grpc.py +86 -7
- scanoss/api/semgrep/__init__.py +17 -17
- scanoss/api/semgrep/v2/__init__.py +17 -17
- scanoss/api/semgrep/v2/scanoss_semgrep_pb2.py +50 -23
- scanoss/api/semgrep/v2/scanoss_semgrep_pb2_grpc.py +151 -16
- scanoss/api/vulnerabilities/__init__.py +17 -17
- scanoss/api/vulnerabilities/v2/__init__.py +17 -17
- scanoss/api/vulnerabilities/v2/scanoss_vulnerabilities_pb2.py +78 -31
- scanoss/api/vulnerabilities/v2/scanoss_vulnerabilities_pb2_grpc.py +282 -18
- scanoss/cli.py +2359 -370
- scanoss/components.py +187 -94
- scanoss/constants.py +22 -0
- scanoss/cryptography.py +308 -0
- scanoss/csvoutput.py +91 -58
- scanoss/cyclonedx.py +221 -63
- scanoss/data/build_date.txt +1 -1
- scanoss/data/osadl-copyleft.json +133 -0
- scanoss/data/scanoss-settings-schema.json +254 -0
- scanoss/delta.py +197 -0
- scanoss/export/__init__.py +23 -0
- scanoss/export/dependency_track.py +227 -0
- scanoss/file_filters.py +582 -0
- scanoss/filecount.py +75 -69
- scanoss/gitlabqualityreport.py +214 -0
- scanoss/header_filter.py +563 -0
- scanoss/inspection/__init__.py +23 -0
- scanoss/inspection/policy_check/__init__.py +0 -0
- scanoss/inspection/policy_check/dependency_track/__init__.py +0 -0
- scanoss/inspection/policy_check/dependency_track/project_violation.py +479 -0
- scanoss/inspection/policy_check/policy_check.py +222 -0
- scanoss/inspection/policy_check/scanoss/__init__.py +0 -0
- scanoss/inspection/policy_check/scanoss/copyleft.py +243 -0
- scanoss/inspection/policy_check/scanoss/undeclared_component.py +309 -0
- scanoss/inspection/summary/__init__.py +0 -0
- scanoss/inspection/summary/component_summary.py +170 -0
- scanoss/inspection/summary/license_summary.py +191 -0
- scanoss/inspection/summary/match_summary.py +341 -0
- scanoss/inspection/utils/file_utils.py +44 -0
- scanoss/inspection/utils/license_utils.py +123 -0
- scanoss/inspection/utils/markdown_utils.py +63 -0
- scanoss/inspection/utils/scan_result_processor.py +417 -0
- scanoss/osadl.py +125 -0
- scanoss/results.py +275 -0
- scanoss/scancodedeps.py +87 -38
- scanoss/scanner.py +431 -539
- scanoss/scanners/__init__.py +23 -0
- scanoss/scanners/container_scanner.py +476 -0
- scanoss/scanners/folder_hasher.py +358 -0
- scanoss/scanners/scanner_config.py +73 -0
- scanoss/scanners/scanner_hfh.py +252 -0
- scanoss/scanoss_settings.py +337 -0
- scanoss/scanossapi.py +140 -101
- scanoss/scanossbase.py +59 -22
- scanoss/scanossgrpc.py +799 -251
- scanoss/scanpostprocessor.py +294 -0
- scanoss/scantype.py +22 -21
- scanoss/services/dependency_track_service.py +132 -0
- scanoss/spdxlite.py +532 -174
- scanoss/threadeddependencies.py +148 -47
- scanoss/threadedscanning.py +53 -37
- scanoss/utils/__init__.py +23 -0
- scanoss/utils/abstract_presenter.py +103 -0
- scanoss/utils/crc64.py +96 -0
- scanoss/utils/file.py +84 -0
- scanoss/utils/scanoss_scan_results_utils.py +41 -0
- scanoss/utils/simhash.py +198 -0
- scanoss/winnowing.py +241 -63
- {scanoss-1.12.2.dist-info → scanoss-1.43.1.dist-info}/METADATA +18 -9
- scanoss-1.43.1.dist-info/RECORD +110 -0
- {scanoss-1.12.2.dist-info → scanoss-1.43.1.dist-info}/WHEEL +1 -1
- scanoss-1.12.2.dist-info/RECORD +0 -58
- {scanoss-1.12.2.dist-info → scanoss-1.43.1.dist-info}/entry_points.txt +0 -0
- {scanoss-1.12.2.dist-info → scanoss-1.43.1.dist-info/licenses}/LICENSE +0 -0
- {scanoss-1.12.2.dist-info → scanoss-1.43.1.dist-info}/top_level.txt +0 -0
scanoss/file_filters.py
ADDED
|
@@ -0,0 +1,582 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SPDX-License-Identifier: MIT
|
|
3
|
+
|
|
4
|
+
Copyright (c) 2024, SCANOSS
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be included in
|
|
14
|
+
all copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
22
|
+
THE SOFTWARE.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
import os
|
|
26
|
+
import sys
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
from typing import List, Optional
|
|
29
|
+
|
|
30
|
+
from pathspec import GitIgnoreSpec
|
|
31
|
+
|
|
32
|
+
from .scanossbase import ScanossBase
|
|
33
|
+
|
|
34
|
+
# Files to skip.
# Entries are exact file names. FileFilters._should_skip_file() compares them against the
# lower-cased base name of each candidate file, so every entry here must be lower case.
DEFAULT_SKIPPED_FILES = {
    'gradlew',
    'gradlew.bat',
    'mvnw',
    'mvnw.cmd',
    'gradle-wrapper.jar',
    'maven-wrapper.jar',
    'thumbs.db',
    'babel.config.js',
    'license.txt',
    'license.md',
    'copying.lib',
    'makefile',
}

# Variant of DEFAULT_SKIPPED_FILES that FileFilters selects when is_folder_hashing_scan is set.
DEFAULT_SKIPPED_FILES_HFH = {
    'gradlew',
    'gradlew.bat',
    'mvnw',
    'mvnw.cmd',
    'gradle-wrapper.jar',
    'maven-wrapper.jar',
    'thumbs.db',
    'babel.config.js',
}
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# Folders to skip.
# Entries are exact directory names. FileFilters.should_skip_dir() compares them against the
# lower-cased base name of a directory, and FileFilters._get_operation_patterns() turns each
# entry into a '<name>/' skip pattern. Ignored entirely when all_folders is set.
DEFAULT_SKIPPED_DIRS = {
    'nbproject',
    'nbbuild',
    'nbdist',
    '__pycache__',
    'venv',
    '_yardoc',
    'eggs',
    'wheels',
    'htmlcov',
    '__pypackages__',
    'example',
    'examples'
}

# Variant of DEFAULT_SKIPPED_DIRS that _get_operation_patterns() selects when
# is_folder_hashing_scan is set.
DEFAULT_SKIPPED_DIRS_HFH = {
    'nbproject',
    'nbbuild',
    'nbdist',
    '__pycache__',
    'venv',
    '_yardoc',
    'eggs',
    'wheels',
    'htmlcov',
    '__pypackages__',
    'example',
    'examples',
}
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# Folder endings to skip.
# A directory is skipped when its lower-cased base name ends with one of these suffixes;
# _get_operation_patterns() also emits each one as a '*<suffix>/' skip pattern.
DEFAULT_SKIPPED_DIR_EXT = {'.egg-info'}
# Variant selected by _get_operation_patterns() when is_folder_hashing_scan is set.
DEFAULT_SKIPPED_DIR_EXT_HFH = {'.egg-info'}
|
|
97
|
+
|
|
98
|
+
# File extensions to skip.
# FileFilters._should_skip_file() tests each entry with str.endswith() against the lower-cased
# base name of a file, so entries are really *suffixes*: the dotted ones behave like extensions,
# while the bare words in the "File endings" group match any file name that ends with that word.
# All entries must be lower case. The set is not applied when all_extensions is set or when
# is_folder_hashing_scan is set.
DEFAULT_SKIPPED_EXT = {
    '.1',
    '.2',
    '.3',
    '.4',
    '.5',
    '.6',
    '.7',
    '.8',
    '.9',
    '.ac',
    '.adoc',
    '.am',
    '.asciidoc',
    '.bmp',
    '.build',
    '.cfg',
    '.chm',
    '.class',
    '.cmake',
    '.cnf',
    '.conf',
    '.config',
    '.contributors',
    '.copying',
    '.crt',
    '.csproj',
    '.css',
    '.csv',
    '.dat',
    '.data',
    '.doc',
    '.docx',
    '.dtd',
    '.dts',
    '.iws',
    '.c9',
    '.c9revisions',
    '.dtsi',
    '.dump',
    '.eot',
    '.eps',
    '.geojson',
    '.gdoc',
    '.gif',
    '.glif',
    '.gmo',
    '.gradle',
    '.guess',
    '.hex',
    '.htm',
    '.html',
    '.ico',
    '.iml',
    '.in',
    '.inc',
    '.info',
    '.ini',
    '.ipynb',
    '.jpeg',
    '.jpg',
    '.json',
    '.jsonld',
    '.lock',
    '.log',
    '.m4',
    '.map',
    '.markdown',
    '.md',
    '.md5',
    '.meta',
    '.mk',
    '.mxml',
    '.o',
    '.otf',
    '.out',
    '.pbtxt',
    '.pdf',
    '.pem',
    '.phtml',
    '.plist',
    '.png',
    '.po',
    '.ppt',
    '.prefs',
    '.properties',
    '.pyc',
    '.qdoc',
    '.result',
    '.rgb',
    '.rst',
    '.scss',
    '.sha',
    '.sha1',
    '.sha2',
    '.sha256',
    '.sln',
    '.spec',
    '.sql',
    '.sub',
    '.svg',
    '.svn-base',
    '.tab',
    '.template',
    '.test',
    '.tex',
    '.tiff',
    '.toml',
    '.ttf',
    '.txt',
    '.utf-8',
    '.vim',
    '.wav',
    '.woff',
    '.woff2',
    '.xht',
    '.xhtml',
    '.xls',
    '.xlsx',
    '.xml',
    '.xpm',
    '.xsd',
    '.xul',
    '.yaml',
    '.yml',
    '.wfp',
    '.editorconfig',
    '.dotcover',
    '.pid',
    '.lcov',
    '.egg',
    '.manifest',
    '.cache',
    '.coverage',
    '.cover',
    '.gem',
    '.lst',
    '.pickle',
    '.pdb',
    '.gml',
    '.pot',
    '.plt',
    '.whml',
    '.pom',
    '.smtml',
    '.min.js',
    '.mf',
    '.base64',
    '.s',
    '.diff',
    '.patch',
    '.rules',
    # File endings (no leading dot: matched as a plain suffix of the file name)
    '-doc',
    'changelog',
    'config',
    'copying',
    'license',
    'authors',
    'news',
    'licenses',
    'notice',
    'readme',
    'swiftdoc',
    'texidoc',
    'todo',
    'version',
    'ignore',
    'manifest',
    'sqlite',
    'sqlite3',
}
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
class FileFilters(ScanossBase):
|
|
274
|
+
"""
|
|
275
|
+
Filter for determining which files to process during scanning, fingerprinting, etc.
|
|
276
|
+
Handles both inclusion and exclusion rules based on file paths, extensions, and sizes.
|
|
277
|
+
"""
|
|
278
|
+
|
|
279
|
+
def __init__(self, debug: bool = False, trace: bool = False, quiet: bool = False, **kwargs):
    """
    Initialize scan filters based on default settings. Optionally append custom settings.

    Optional values passed explicitly as None are treated the same as omitted ones, so callers
    can forward unset CLI options without pre-filtering them.

    Args:
        debug (bool): Enable debug output
        trace (bool): Enable trace output
        quiet (bool): Suppress output
        **kwargs: Additional arguments including:
            scanoss_settings (ScanossSettings): Custom settings to override defaults
            all_extensions (bool): Include all file extensions
            all_folders (bool): Include all folders
            hidden_files_folders (bool): Include hidden files and folders
            operation_type (str): Operation type ('scanning' or 'fingerprinting')
            skip_size (int): Size to skip
            skip_extensions (list): Extensions to skip
            skip_folders (list): Folders to skip
            is_folder_hashing_scan (bool): Whether the operation is a folder hashing scan
    """
    super().__init__(debug, trace, quiet)

    self.hidden_files_folders = kwargs.get('hidden_files_folders', False)
    self.scanoss_settings = kwargs.get('scanoss_settings')
    self.all_extensions = kwargs.get('all_extensions', False)
    self.all_folders = kwargs.get('all_folders', False)
    # `or <default>` instead of a .get() default: an explicit None would otherwise be stored
    # as-is and break later comparisons such as `self.skip_size > 0`.
    self.skip_folders = kwargs.get('skip_folders') or []
    self.skip_size = kwargs.get('skip_size') or 0
    self.skip_extensions = kwargs.get('skip_extensions') or []
    self.is_folder_hashing_scan = kwargs.get('is_folder_hashing_scan', False)
    # Both derived members must be built for the same operation type, so resolve it once.
    # They read the attributes assigned above, hence they come last.
    operation_type = kwargs.get('operation_type') or 'scanning'
    self.file_folder_pat_spec = self._get_file_folder_pattern_spec(operation_type)
    self.size_pat_rules = self._get_size_limit_pattern_rules(operation_type)
|
|
310
|
+
|
|
311
|
+
def get_filtered_files_from_folder(self, root: str) -> List[str]:
|
|
312
|
+
"""
|
|
313
|
+
Retrieve a list of files to scan or fingerprint from a given directory root based on filter settings.
|
|
314
|
+
|
|
315
|
+
Args:
|
|
316
|
+
root (str): Root directory to scan or fingerprint
|
|
317
|
+
|
|
318
|
+
Returns:
|
|
319
|
+
list[str]: Filtered list of files to scan or fingerprint
|
|
320
|
+
"""
|
|
321
|
+
if self.debug:
|
|
322
|
+
if self.file_folder_pat_spec:
|
|
323
|
+
self.print_stderr(f'Running with {len(self.file_folder_pat_spec)} pattern filters.')
|
|
324
|
+
if self.size_pat_rules:
|
|
325
|
+
self.print_stderr(f'Running with {len(self.size_pat_rules)} size pattern rules.')
|
|
326
|
+
if self.skip_size:
|
|
327
|
+
self.print_stderr(f'Running with global skip size: {self.skip_size}')
|
|
328
|
+
if self.skip_extensions:
|
|
329
|
+
self.print_stderr(f'Running with extra global skip extensions: {self.skip_extensions}')
|
|
330
|
+
if self.skip_folders:
|
|
331
|
+
self.print_stderr(f'Running with extra global skip folders: {self.skip_folders}')
|
|
332
|
+
all_files = []
|
|
333
|
+
root_path = Path(root).resolve()
|
|
334
|
+
if not root_path.exists() or not root_path.is_dir():
|
|
335
|
+
self.print_stderr(f'ERROR: Specified root directory {root} does not exist or is not a directory.')
|
|
336
|
+
return all_files
|
|
337
|
+
# Walk the tree looking for files to process. While taking into account files/folders to skip
|
|
338
|
+
for dirpath, dirnames, filenames in os.walk(root_path):
|
|
339
|
+
dir_path = Path(dirpath)
|
|
340
|
+
rel_path = dir_path.relative_to(root_path)
|
|
341
|
+
if dir_path.is_symlink(): # TODO should we skip symlink folders?
|
|
342
|
+
self.print_msg(f'WARNING: Found symbolic link folder: {dir_path}')
|
|
343
|
+
|
|
344
|
+
if self.should_skip_dir(str(rel_path)): # Current directory should be skipped
|
|
345
|
+
dirnames.clear()
|
|
346
|
+
continue
|
|
347
|
+
for filename in filenames:
|
|
348
|
+
file_path = dir_path / filename
|
|
349
|
+
all_files.append(str(file_path))
|
|
350
|
+
# End os.walk loop
|
|
351
|
+
# Now filter the files and return the reduced list
|
|
352
|
+
return self.get_filtered_files_from_files(all_files, str(root_path))
|
|
353
|
+
|
|
354
|
+
def get_filtered_files_from_files(self, files: List[str], scan_root: Optional[str] = None) -> List[str]:
|
|
355
|
+
"""
|
|
356
|
+
Retrieve a list of files to scan or fingerprint from a given list of files based on filter settings.
|
|
357
|
+
|
|
358
|
+
Args:
|
|
359
|
+
files (List[str]): List of files to scan or fingerprint
|
|
360
|
+
scan_root (str): Root directory to scan or fingerprint
|
|
361
|
+
|
|
362
|
+
Returns:
|
|
363
|
+
list[str]: Filtered list of files to scan or fingerprint
|
|
364
|
+
"""
|
|
365
|
+
filtered_files = []
|
|
366
|
+
for file_path in files:
|
|
367
|
+
path_obj = Path(file_path)
|
|
368
|
+
try:
|
|
369
|
+
if scan_root:
|
|
370
|
+
rel_path = path_obj.relative_to(scan_root)
|
|
371
|
+
else:
|
|
372
|
+
rel_path = str(path_obj)
|
|
373
|
+
except ValueError:
|
|
374
|
+
self.print_debug(f'Ignoring file: {file_path} (broken symlink)')
|
|
375
|
+
continue
|
|
376
|
+
|
|
377
|
+
if not path_obj.exists() or not path_obj.is_file() or path_obj.is_symlink():
|
|
378
|
+
self.print_debug(
|
|
379
|
+
f'WARNING: File {rel_path} does not exist, is not a file, or is a symbolic link. Ignoring.'
|
|
380
|
+
)
|
|
381
|
+
continue
|
|
382
|
+
|
|
383
|
+
if not self.hidden_files_folders and any(part.startswith('.') for part in path_obj.parts):
|
|
384
|
+
self.print_debug(f'Skipping file: {rel_path} (in hidden directory or is hidden file)')
|
|
385
|
+
continue
|
|
386
|
+
|
|
387
|
+
if self._should_skip_file(rel_path):
|
|
388
|
+
continue
|
|
389
|
+
try:
|
|
390
|
+
file_size = path_obj.stat().st_size
|
|
391
|
+
if file_size == 0:
|
|
392
|
+
self.print_debug(f'Skipping file: {rel_path} (empty file)')
|
|
393
|
+
continue
|
|
394
|
+
min_size, max_size = self._get_operation_size_limits(file_path)
|
|
395
|
+
if min_size <= file_size <= max_size:
|
|
396
|
+
filtered_files.append(str(rel_path))
|
|
397
|
+
else:
|
|
398
|
+
self.print_debug(
|
|
399
|
+
f'Skipping file: {rel_path} (size {file_size} outside limits {min_size}-{max_size})'
|
|
400
|
+
)
|
|
401
|
+
except OSError as e:
|
|
402
|
+
self.print_debug(f'Error getting size for {rel_path}: {e}')
|
|
403
|
+
# End file loop
|
|
404
|
+
return filtered_files
|
|
405
|
+
|
|
406
|
+
def _get_file_folder_pattern_spec(self, operation_type: str = 'scanning'):
|
|
407
|
+
"""
|
|
408
|
+
Get file path pattern specification.
|
|
409
|
+
|
|
410
|
+
Args:
|
|
411
|
+
operation_type (str): Type of operation ('scanning' or 'fingerprinting')
|
|
412
|
+
|
|
413
|
+
Returns:
|
|
414
|
+
GitIgnoreSpec: GitIgnoreSpec object containing the file path patterns
|
|
415
|
+
"""
|
|
416
|
+
patterns = self._get_operation_patterns(operation_type)
|
|
417
|
+
if patterns:
|
|
418
|
+
return GitIgnoreSpec.from_lines(patterns)
|
|
419
|
+
return None
|
|
420
|
+
|
|
421
|
+
def _get_size_limit_pattern_rules(self, operation_type: str = 'scanning'):
|
|
422
|
+
"""
|
|
423
|
+
Get size limit pattern rules.
|
|
424
|
+
|
|
425
|
+
Args:
|
|
426
|
+
operation_type (str): Type of operation ('scanning' or 'fingerprinting')
|
|
427
|
+
|
|
428
|
+
Returns:
|
|
429
|
+
List of size limit pattern rules
|
|
430
|
+
"""
|
|
431
|
+
if self.scanoss_settings:
|
|
432
|
+
size_rules = self.scanoss_settings.get_skip_sizes(operation_type)
|
|
433
|
+
if size_rules:
|
|
434
|
+
size_rules_with_patterns = []
|
|
435
|
+
for rule in size_rules:
|
|
436
|
+
patterns = rule.get('patterns', [])
|
|
437
|
+
if not patterns:
|
|
438
|
+
continue
|
|
439
|
+
size_rules_with_patterns.append(rule)
|
|
440
|
+
return size_rules_with_patterns
|
|
441
|
+
return None
|
|
442
|
+
|
|
443
|
+
def _get_operation_patterns(self, operation_type: str) -> List[str]:
|
|
444
|
+
"""
|
|
445
|
+
Get patterns specific to the operation type, combining defaults with settings.
|
|
446
|
+
|
|
447
|
+
Args:
|
|
448
|
+
operation_type (str): Type of operation ('scanning' or 'fingerprinting')
|
|
449
|
+
|
|
450
|
+
Returns:
|
|
451
|
+
List[str]: Combined list of patterns to skip
|
|
452
|
+
"""
|
|
453
|
+
patterns = []
|
|
454
|
+
|
|
455
|
+
# Default patterns for skipping directories
|
|
456
|
+
if not self.all_folders:
|
|
457
|
+
DEFAULT_SKIPPED_DIR_LIST = DEFAULT_SKIPPED_DIRS_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_DIRS
|
|
458
|
+
DEFAULT_SKIPPED_DIR_EXT_LIST = (
|
|
459
|
+
DEFAULT_SKIPPED_DIR_EXT_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_DIR_EXT
|
|
460
|
+
)
|
|
461
|
+
for dir_name in DEFAULT_SKIPPED_DIR_LIST:
|
|
462
|
+
patterns.append(f'{dir_name}/')
|
|
463
|
+
for dir_extension in DEFAULT_SKIPPED_DIR_EXT_LIST:
|
|
464
|
+
patterns.append(f'*{dir_extension}/')
|
|
465
|
+
|
|
466
|
+
# Custom patterns added in SCANOSS settings file
|
|
467
|
+
if self.scanoss_settings:
|
|
468
|
+
patterns.extend(self.scanoss_settings.get_skip_patterns(operation_type))
|
|
469
|
+
return patterns
|
|
470
|
+
|
|
471
|
+
def _get_operation_size_limits(self, file_path: str = None) -> tuple:
|
|
472
|
+
"""
|
|
473
|
+
Get size limits specific to the operation type and file path.
|
|
474
|
+
|
|
475
|
+
Args:
|
|
476
|
+
file_path (str, optional): Path to the file to check against patterns. If None, returns default limits.
|
|
477
|
+
|
|
478
|
+
Returns:
|
|
479
|
+
tuple: (min_size, max_size) tuple for the given file path and operation type
|
|
480
|
+
"""
|
|
481
|
+
min_size = 0
|
|
482
|
+
max_size = sys.maxsize
|
|
483
|
+
# Apply global minimum file size if specified
|
|
484
|
+
if self.skip_size > 0:
|
|
485
|
+
min_size = self.skip_size
|
|
486
|
+
return min_size, max_size
|
|
487
|
+
# Return default size limits if no settings specified
|
|
488
|
+
if not self.scanoss_settings or not file_path or not self.size_pat_rules:
|
|
489
|
+
return min_size, max_size
|
|
490
|
+
try:
|
|
491
|
+
rel_path = os.path.relpath(file_path)
|
|
492
|
+
except ValueError:
|
|
493
|
+
rel_path = os.path.basename(file_path)
|
|
494
|
+
rel_path_lower = rel_path.lower()
|
|
495
|
+
# Cycle through each rule looking for a match
|
|
496
|
+
for rule in self.size_pat_rules:
|
|
497
|
+
patterns = rule.get('patterns', [])
|
|
498
|
+
if patterns:
|
|
499
|
+
path_spec = GitIgnoreSpec.from_lines(patterns)
|
|
500
|
+
if path_spec.match_file(rel_path_lower):
|
|
501
|
+
return rule.get('min', min_size), rule.get('max', max_size)
|
|
502
|
+
# End rules loop
|
|
503
|
+
return min_size, max_size
|
|
504
|
+
|
|
505
|
+
def should_skip_dir(self, dir_rel_path: str) -> bool: # noqa: PLR0911
|
|
506
|
+
"""
|
|
507
|
+
Check if a directory should be skipped based on operation type and default rules.
|
|
508
|
+
|
|
509
|
+
Args:
|
|
510
|
+
dir_rel_path (str): Relative path to the directory
|
|
511
|
+
|
|
512
|
+
Returns:
|
|
513
|
+
bool: True if directory should be skipped, False otherwise
|
|
514
|
+
"""
|
|
515
|
+
dir_name = os.path.basename(dir_rel_path)
|
|
516
|
+
dir_path = Path(dir_rel_path)
|
|
517
|
+
if (
|
|
518
|
+
not self.hidden_files_folders
|
|
519
|
+
and dir_path != Path('.')
|
|
520
|
+
and any(part.startswith('.') for part in dir_path.parts)
|
|
521
|
+
):
|
|
522
|
+
self.print_debug(f'Skipping directory: {dir_rel_path} (hidden directory)')
|
|
523
|
+
return True
|
|
524
|
+
if self.all_folders:
|
|
525
|
+
return False
|
|
526
|
+
dir_name_lower = dir_name.lower()
|
|
527
|
+
if dir_name_lower in DEFAULT_SKIPPED_DIRS:
|
|
528
|
+
self.print_debug(f'Skipping directory: {dir_rel_path} (matches default skip directory)')
|
|
529
|
+
return True
|
|
530
|
+
if self.skip_folders and dir_name in self.skip_folders:
|
|
531
|
+
self.print_debug(f'Skipping directory: {dir_rel_path} (matches skip folder)')
|
|
532
|
+
return True
|
|
533
|
+
for ext in DEFAULT_SKIPPED_DIR_EXT:
|
|
534
|
+
if dir_name_lower.endswith(ext):
|
|
535
|
+
self.print_debug(f'Skipping directory: {dir_rel_path} (matches default skip extension: {ext})')
|
|
536
|
+
return True
|
|
537
|
+
|
|
538
|
+
if self.file_folder_pat_spec and self.file_folder_pat_spec.match_file(dir_rel_path):
|
|
539
|
+
self.print_debug(f'Skipping directory: {dir_rel_path} (matches custom pattern)')
|
|
540
|
+
return True
|
|
541
|
+
return False
|
|
542
|
+
|
|
543
|
+
def _should_skip_file(self, file_rel_path: str) -> bool: # noqa: PLR0911
|
|
544
|
+
"""
|
|
545
|
+
Check if a file should be skipped based on operation type and default rules.
|
|
546
|
+
|
|
547
|
+
Args:
|
|
548
|
+
file_rel_path (str): Relative path to the file
|
|
549
|
+
|
|
550
|
+
Returns:
|
|
551
|
+
bool: True if file should be skipped, False otherwise
|
|
552
|
+
"""
|
|
553
|
+
file_name = os.path.basename(file_rel_path)
|
|
554
|
+
DEFAULT_SKIPPED_EXT_LIST = {} if self.is_folder_hashing_scan else DEFAULT_SKIPPED_EXT
|
|
555
|
+
DEFAULT_SKIPPED_FILES_LIST = DEFAULT_SKIPPED_FILES_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_FILES
|
|
556
|
+
|
|
557
|
+
if not self.hidden_files_folders and file_name.startswith('.'):
|
|
558
|
+
self.print_debug(f'Skipping file: {file_rel_path} (hidden file)')
|
|
559
|
+
return True
|
|
560
|
+
if self.all_extensions:
|
|
561
|
+
return False
|
|
562
|
+
file_name_lower = file_name.lower()
|
|
563
|
+
# Look for exact files
|
|
564
|
+
if file_name_lower in DEFAULT_SKIPPED_FILES_LIST:
|
|
565
|
+
self.print_debug(f'Skipping file: {file_rel_path} (matches default skip file)')
|
|
566
|
+
return True
|
|
567
|
+
# Look for file endings
|
|
568
|
+
for ending in DEFAULT_SKIPPED_EXT_LIST:
|
|
569
|
+
if file_name_lower.endswith(ending):
|
|
570
|
+
self.print_debug(f'Skipping file: {file_rel_path} (matches default skip ending: {ending})')
|
|
571
|
+
return True
|
|
572
|
+
# Look for custom (extra) endings
|
|
573
|
+
if self.skip_extensions:
|
|
574
|
+
for ending in self.skip_extensions:
|
|
575
|
+
if file_name_lower.endswith(ending):
|
|
576
|
+
self.print_debug(f'Skipping file: {file_rel_path} (matches skip extension)')
|
|
577
|
+
return True
|
|
578
|
+
# Check for file patterns
|
|
579
|
+
if self.file_folder_pat_spec and self.file_folder_pat_spec.match_file(file_rel_path):
|
|
580
|
+
self.print_debug(f'Skipping file: {file_rel_path} (matches custom pattern)')
|
|
581
|
+
return True
|
|
582
|
+
return False
|