scanoss 1.12.2__py3-none-any.whl → 1.43.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- protoc_gen_swagger/__init__.py +13 -13
- protoc_gen_swagger/options/__init__.py +13 -13
- protoc_gen_swagger/options/annotations_pb2.py +18 -12
- protoc_gen_swagger/options/annotations_pb2.pyi +48 -0
- protoc_gen_swagger/options/annotations_pb2_grpc.py +20 -0
- protoc_gen_swagger/options/openapiv2_pb2.py +110 -99
- protoc_gen_swagger/options/openapiv2_pb2.pyi +1317 -0
- protoc_gen_swagger/options/openapiv2_pb2_grpc.py +20 -0
- scanoss/__init__.py +18 -18
- scanoss/api/__init__.py +17 -17
- scanoss/api/common/__init__.py +17 -17
- scanoss/api/common/v2/__init__.py +17 -17
- scanoss/api/common/v2/scanoss_common_pb2.py +49 -20
- scanoss/api/common/v2/scanoss_common_pb2_grpc.py +25 -0
- scanoss/api/components/__init__.py +17 -17
- scanoss/api/components/v2/__init__.py +17 -17
- scanoss/api/components/v2/scanoss_components_pb2.py +68 -43
- scanoss/api/components/v2/scanoss_components_pb2_grpc.py +83 -22
- scanoss/api/cryptography/v2/scanoss_cryptography_pb2.py +136 -21
- scanoss/api/cryptography/v2/scanoss_cryptography_pb2_grpc.py +766 -13
- scanoss/api/dependencies/__init__.py +17 -17
- scanoss/api/dependencies/v2/__init__.py +17 -17
- scanoss/api/dependencies/v2/scanoss_dependencies_pb2.py +56 -29
- scanoss/api/dependencies/v2/scanoss_dependencies_pb2_grpc.py +94 -8
- scanoss/api/geoprovenance/__init__.py +23 -0
- scanoss/api/geoprovenance/v2/__init__.py +23 -0
- scanoss/api/geoprovenance/v2/scanoss_geoprovenance_pb2.py +92 -0
- scanoss/api/geoprovenance/v2/scanoss_geoprovenance_pb2_grpc.py +381 -0
- scanoss/api/licenses/__init__.py +23 -0
- scanoss/api/licenses/v2/__init__.py +23 -0
- scanoss/api/licenses/v2/scanoss_licenses_pb2.py +84 -0
- scanoss/api/licenses/v2/scanoss_licenses_pb2_grpc.py +302 -0
- scanoss/api/scanning/__init__.py +17 -17
- scanoss/api/scanning/v2/__init__.py +17 -17
- scanoss/api/scanning/v2/scanoss_scanning_pb2.py +42 -13
- scanoss/api/scanning/v2/scanoss_scanning_pb2_grpc.py +86 -7
- scanoss/api/semgrep/__init__.py +17 -17
- scanoss/api/semgrep/v2/__init__.py +17 -17
- scanoss/api/semgrep/v2/scanoss_semgrep_pb2.py +50 -23
- scanoss/api/semgrep/v2/scanoss_semgrep_pb2_grpc.py +151 -16
- scanoss/api/vulnerabilities/__init__.py +17 -17
- scanoss/api/vulnerabilities/v2/__init__.py +17 -17
- scanoss/api/vulnerabilities/v2/scanoss_vulnerabilities_pb2.py +78 -31
- scanoss/api/vulnerabilities/v2/scanoss_vulnerabilities_pb2_grpc.py +282 -18
- scanoss/cli.py +2359 -370
- scanoss/components.py +187 -94
- scanoss/constants.py +22 -0
- scanoss/cryptography.py +308 -0
- scanoss/csvoutput.py +91 -58
- scanoss/cyclonedx.py +221 -63
- scanoss/data/build_date.txt +1 -1
- scanoss/data/osadl-copyleft.json +133 -0
- scanoss/data/scanoss-settings-schema.json +254 -0
- scanoss/delta.py +197 -0
- scanoss/export/__init__.py +23 -0
- scanoss/export/dependency_track.py +227 -0
- scanoss/file_filters.py +582 -0
- scanoss/filecount.py +75 -69
- scanoss/gitlabqualityreport.py +214 -0
- scanoss/header_filter.py +563 -0
- scanoss/inspection/__init__.py +23 -0
- scanoss/inspection/policy_check/__init__.py +0 -0
- scanoss/inspection/policy_check/dependency_track/__init__.py +0 -0
- scanoss/inspection/policy_check/dependency_track/project_violation.py +479 -0
- scanoss/inspection/policy_check/policy_check.py +222 -0
- scanoss/inspection/policy_check/scanoss/__init__.py +0 -0
- scanoss/inspection/policy_check/scanoss/copyleft.py +243 -0
- scanoss/inspection/policy_check/scanoss/undeclared_component.py +309 -0
- scanoss/inspection/summary/__init__.py +0 -0
- scanoss/inspection/summary/component_summary.py +170 -0
- scanoss/inspection/summary/license_summary.py +191 -0
- scanoss/inspection/summary/match_summary.py +341 -0
- scanoss/inspection/utils/file_utils.py +44 -0
- scanoss/inspection/utils/license_utils.py +123 -0
- scanoss/inspection/utils/markdown_utils.py +63 -0
- scanoss/inspection/utils/scan_result_processor.py +417 -0
- scanoss/osadl.py +125 -0
- scanoss/results.py +275 -0
- scanoss/scancodedeps.py +87 -38
- scanoss/scanner.py +431 -539
- scanoss/scanners/__init__.py +23 -0
- scanoss/scanners/container_scanner.py +476 -0
- scanoss/scanners/folder_hasher.py +358 -0
- scanoss/scanners/scanner_config.py +73 -0
- scanoss/scanners/scanner_hfh.py +252 -0
- scanoss/scanoss_settings.py +337 -0
- scanoss/scanossapi.py +140 -101
- scanoss/scanossbase.py +59 -22
- scanoss/scanossgrpc.py +799 -251
- scanoss/scanpostprocessor.py +294 -0
- scanoss/scantype.py +22 -21
- scanoss/services/dependency_track_service.py +132 -0
- scanoss/spdxlite.py +532 -174
- scanoss/threadeddependencies.py +148 -47
- scanoss/threadedscanning.py +53 -37
- scanoss/utils/__init__.py +23 -0
- scanoss/utils/abstract_presenter.py +103 -0
- scanoss/utils/crc64.py +96 -0
- scanoss/utils/file.py +84 -0
- scanoss/utils/scanoss_scan_results_utils.py +41 -0
- scanoss/utils/simhash.py +198 -0
- scanoss/winnowing.py +241 -63
- {scanoss-1.12.2.dist-info → scanoss-1.43.1.dist-info}/METADATA +18 -9
- scanoss-1.43.1.dist-info/RECORD +110 -0
- {scanoss-1.12.2.dist-info → scanoss-1.43.1.dist-info}/WHEEL +1 -1
- scanoss-1.12.2.dist-info/RECORD +0 -58
- {scanoss-1.12.2.dist-info → scanoss-1.43.1.dist-info}/entry_points.txt +0 -0
- {scanoss-1.12.2.dist-info → scanoss-1.43.1.dist-info/licenses}/LICENSE +0 -0
- {scanoss-1.12.2.dist-info → scanoss-1.43.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,358 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Dict, List, Literal, Optional
|
|
6
|
+
|
|
7
|
+
from progress.bar import Bar
|
|
8
|
+
|
|
9
|
+
from scanoss.constants import DEFAULT_HFH_DEPTH
|
|
10
|
+
from scanoss.file_filters import FileFilters
|
|
11
|
+
from scanoss.scanoss_settings import ScanossSettings
|
|
12
|
+
from scanoss.scanossbase import ScanossBase
|
|
13
|
+
from scanoss.utils.abstract_presenter import AbstractPresenter
|
|
14
|
+
from scanoss.utils.crc64 import CRC64
|
|
15
|
+
from scanoss.utils.simhash import WordFeatureSet, fingerprint, simhash, vectorize_bytes
|
|
16
|
+
|
|
17
|
+
# Folders containing fewer files than this yield no hash (_hash_calc returns all-None values).
MINIMUM_FILE_COUNT = 8
# Minimum length, in UTF-8 bytes, of all file names concatenated; below this no hash is produced.
MINIMUM_CONCATENATED_NAME_LENGTH = 32
|
|
19
|
+
|
|
20
|
+
class DirectoryNode:
    """A directory entry in the folder-hashing tree.

    Holds the directory's path, the files collected beneath it, and its
    child directory nodes keyed by their full path.
    """

    def __init__(self, path: str):
        self.is_dir = True
        self.path = path
        self.files: List['DirectoryFile'] = []
        self.children: Dict[str, 'DirectoryNode'] = {}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class DirectoryFile:
    """A single file record used during folder hashing.

    Stores the file path (relative to the scan root) together with its
    CRC64 key, both as raw bytes and as a hex string.
    """

    def __init__(self, path: str, key: List[bytes], key_str: str):
        self.key_str = key_str
        self.key = key
        self.path = path
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class FolderHasherConfig:
    """Configuration options for :class:`FolderHasher`.

    Field order and defaults are part of the public constructor signature.
    """

    # Verbosity flags passed through to ScanossBase / FileFilters.
    debug: bool = False
    trace: bool = False
    quiet: bool = False
    # Destination file for the presenter output; None means stdout.
    output_file: Optional[str] = None
    # Only 'json' output is currently supported.
    output_format: Literal['json'] = 'json'
    # Optional SCANOSS settings file location.
    settings_file: Optional[str] = None
    # When True, ignore any settings file entirely.
    skip_settings_file: bool = False
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def create_folder_hasher_config_from_args(args) -> FolderHasherConfig:
    """Build a :class:`FolderHasherConfig` from parsed CLI arguments.

    Attributes that may be absent from ``args`` are read with ``getattr``
    and fall back to the dataclass-style defaults.
    """
    optional_fields = {
        'output_file': getattr(args, 'output', None),
        'output_format': getattr(args, 'format', 'json'),
        'settings_file': getattr(args, 'settings', None),
        'skip_settings_file': getattr(args, 'skip_settings_file', False),
    }
    return FolderHasherConfig(
        debug=args.debug,
        trace=args.trace,
        quiet=args.quiet,
        **optional_fields,
    )
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class FolderHasher:
    """
    Folder Hasher.

    This class is used to produce a folder hash for a given directory.

    It builds a directory tree (DirectoryNode) and computes the associated
    hash data for the folder: simhashes of the file names, of the directory
    names and of the file contents, plus a per-extension file count.

    Args:
        scan_dir (str): The directory to be hashed.
        config (FolderHasherConfig): Configuration parameters for the folder hasher.
        scanoss_settings (Optional[ScanossSettings]): Optional settings for Scanoss.
        depth (int): How many levels to hash from the root directory (default: 1).
    """

    def __init__(
        self,
        scan_dir: str,
        config: FolderHasherConfig,
        scanoss_settings: Optional[ScanossSettings] = None,
        depth: int = DEFAULT_HFH_DEPTH,
    ):
        self.base = ScanossBase(
            debug=config.debug,
            trace=config.trace,
            quiet=config.quiet,
        )
        self.file_filters = FileFilters(
            debug=config.debug,
            trace=config.trace,
            quiet=config.quiet,
            scanoss_settings=scanoss_settings,
            is_folder_hashing_scan=True,
        )
        self.presenter = FolderHasherPresenter(
            self,
            debug=config.debug,
            trace=config.trace,
            quiet=config.quiet,
        )

        self.scan_dir = scan_dir
        # Cached result of the last hash_directory() call; read by the presenter.
        self.tree = None
        self.depth = depth

    def hash_directory(self, path: str) -> dict:
        """
        Generate the folder hashing request structure from a directory path.

        This method builds a directory tree (DirectoryNode) and computes the
        associated hash data for the folder. The result is also cached on
        ``self.tree`` for later presentation.

        Args:
            path (str): The root directory path.

        Returns:
            dict: The folder hash request structure.
        """
        root_node = self._build_root_node(path)
        tree = self._hash_calc_from_node(root_node)

        self.tree = tree

        return tree

    def _build_root_node(
        self,
        path: str,
    ) -> DirectoryNode:
        """
        Build a directory tree from the given path with file information.

        The tree includes DirectoryNode objects populated with filtered file items,
        each containing their relative path and CRC64 hash key.

        Args:
            path (str): The directory path to build the tree from.

        Returns:
            DirectoryNode: The root node representing the directory.
        """
        root = Path(path).resolve()
        root_node = DirectoryNode(str(root))

        all_files = [f for f in root.rglob('*') if f.is_file()]
        filtered_files = self.file_filters.get_filtered_files_from_files(all_files, str(root))

        # Sort the files by name to ensure the hash is the same for the same folder
        filtered_files.sort()

        with Bar('Hashing files...', max=len(filtered_files)) as bar:
            full_file_path = ''
            for file_path in filtered_files:
                try:
                    # The filter may yield either str or Path entries; normalise to an absolute Path.
                    file_path_obj = Path(file_path) if isinstance(file_path, str) else file_path
                    full_file_path = file_path_obj if file_path_obj.is_absolute() else root / file_path_obj

                    self.base.print_debug(f'\nHashing file {str(full_file_path)}')

                    file_bytes = full_file_path.read_bytes()
                    key = CRC64.get_hash_buff(file_bytes)
                    key_str = ''.join(f'{b:02x}' for b in key)
                    rel_path = str(full_file_path.relative_to(root))

                    file_item = DirectoryFile(rel_path, key, key_str)

                    # Register the file on every intermediate directory node along its
                    # relative path, creating child nodes on demand.
                    current_node = root_node
                    for part in Path(rel_path).parent.parts:
                        child_path = str(Path(current_node.path) / part)
                        if child_path not in current_node.children:
                            current_node.children[child_path] = DirectoryNode(child_path)
                        current_node = current_node.children[child_path]
                        current_node.files.append(file_item)

                    root_node.files.append(file_item)

                except Exception as e:
                    # Best-effort: unreadable or unresolvable files are skipped, not fatal.
                    self.base.print_debug(f'Skipping file {full_file_path}: {str(e)}')

                bar.next()
        return root_node

    def _hash_calc_from_node(self, node: DirectoryNode, current_depth: int = 1) -> dict:
        """
        Recursively compute folder hash data for a directory node.

        The hash data includes the path identifier, simhash for file names,
        simhash for file content, directory hash, language extensions, and children node hash information.

        Args:
            node (DirectoryNode): The directory node to compute the hash for.
            current_depth (int): The current depth level (1-based, root is depth 1).

        Returns:
            dict: The computed hash data for the node.
        """
        hash_data = self._hash_calc(node)

        # Safely calculate relative path
        try:
            node_path = Path(node.path).resolve()
            scan_dir_path = Path(self.scan_dir).resolve()
            rel_path = node_path.relative_to(scan_dir_path)
        except ValueError:
            # If relative_to fails, use the node's base name as a fallback identifier
            rel_path = Path(node.path).name if node.path else Path('.')

        # Only process children if we haven't reached the depth limit
        children = []
        if current_depth < self.depth:
            children = [self._hash_calc_from_node(child, current_depth + 1) for child in node.children.values()]

        return {
            'path_id': str(rel_path),
            'sim_hash_names': f'{hash_data["name_hash"]:02x}' if hash_data['name_hash'] is not None else None,
            'sim_hash_content': f'{hash_data["content_hash"]:02x}' if hash_data['content_hash'] is not None else None,
            'sim_hash_dir_names': f'{hash_data["dir_hash"]:02x}' if hash_data['dir_hash'] is not None else None,
            'lang_extensions': hash_data['lang_extensions'],
            'children': children,
        }

    def _hash_calc(self, node: DirectoryNode) -> dict:
        """
        Compute folder hash values for a given directory node.

        The method aggregates file keys and sorted file/directory names to
        generate simhash-based hash values for file names, directory names
        and file contents. Folders with too few files or too little file-name
        data yield all-None hash values.

        Args:
            node (DirectoryNode): The directory node containing file items.

        Returns:
            dict: A dictionary with 'name_hash', 'content_hash', 'dir_hash', and 'lang_extensions' keys.
        """
        processed_hashes = set()
        unique_file_names = set()
        unique_directories = set()
        extension_map = {}
        file_hashes = []
        selected_names = []

        for file in node.files:
            key_str = file.key_str

            file_name = os.path.basename(file.path)

            file_name_without_extension, extension = os.path.splitext(file_name)
            current_directory = os.path.dirname(file.path)

            if extension and len(extension) > 1:
                ext_without_dot = extension[1:]
                extension_map[ext_without_dot] = extension_map.get(ext_without_dot, 0) + 1

            # NOTE: file.path is already relative to the scan root (see _build_root_node),
            # so no scan_dir prefix stripping is needed here. A previous statement called
            # str.replace()/lstrip() and discarded the immutable-string result (a no-op);
            # it has been removed.
            parts = current_directory.split(os.path.sep)
            for d in parts:
                if d in {'', '.', '..'}:
                    continue
                unique_directories.add(d)

            # NOTE(review): processed_hashes is collected but never consulted to
            # de-duplicate entries — confirm whether duplicate file keys should be skipped.
            processed_hashes.add(key_str)
            unique_file_names.add(file_name_without_extension)
            selected_names.append(file_name)
            file_hashes.append(file.key)

        if len(selected_names) < MINIMUM_FILE_COUNT:
            return {'name_hash': None, 'content_hash': None, 'dir_hash': None, 'lang_extensions': None}

        selected_names.sort()
        concatenated_names = ''.join(selected_names)

        if len(concatenated_names.encode('utf-8')) < MINIMUM_CONCATENATED_NAME_LENGTH:
            return {'name_hash': None, 'content_hash': None, 'dir_hash': None, 'lang_extensions': None}

        # Concatenate the unique file names without the extensions, adding a space and sorting them alphabetically
        unique_file_names_list = list(unique_file_names)
        unique_file_names_list.sort()
        concatenated_names = ' '.join(unique_file_names_list)

        # We do the same for the directory names, adding a space and sorting them alphabetically
        unique_directories_list = list(unique_directories)
        unique_directories_list.sort()
        concatenated_directories = ' '.join(unique_directories_list)

        names_simhash = simhash(WordFeatureSet(concatenated_names.encode('utf-8')))
        dir_simhash = simhash(WordFeatureSet(concatenated_directories.encode('utf-8')))
        content_simhash = fingerprint(vectorize_bytes(file_hashes))

        # Debug logging similar to Go implementation
        self.base.print_debug(f'Unique file names: {unique_file_names_list}')
        self.base.print_debug(f'Unique directories: {unique_directories_list}')
        self.base.print_debug(f'{dir_simhash:x}/{names_simhash:x} - {content_simhash:x} - {extension_map}')

        return {
            'name_hash': names_simhash,
            'content_hash': content_simhash,
            'dir_hash': dir_simhash,
            'lang_extensions': extension_map,
        }

    def present(self, output_format: Optional[str] = None, output_file: Optional[str] = None):
        """Present the hashed tree in the selected format"""
        self.presenter.present(output_format=output_format, output_file=output_file)
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
class FolderHasherPresenter(AbstractPresenter):
    """
    FolderHasher presenter class.

    Handles the presentation of the folder hashing scan results held on the
    associated :class:`FolderHasher` instance.
    """

    def __init__(self, folder_hasher: FolderHasher, **kwargs):
        super().__init__(**kwargs)
        self.folder_hasher = folder_hasher

    def _format_json_output(self) -> str:
        """Serialize the hashed tree as a pretty-printed JSON string."""
        return json.dumps(self.folder_hasher.tree, indent=2)

    def _format_plain_output(self) -> str:
        """Render the hashed tree as plain text (pretty JSON when it is a dict)."""
        tree = self.folder_hasher.tree
        if isinstance(tree, dict):
            return json.dumps(tree, indent=2)
        return str(tree)

    def _format_cyclonedx_output(self) -> str:
        raise NotImplementedError('CycloneDX output is not implemented')

    def _format_spdxlite_output(self) -> str:
        raise NotImplementedError('SPDXlite output is not implemented')

    def _format_csv_output(self) -> str:
        raise NotImplementedError('CSV output is not implemented')

    def _format_raw_output(self) -> str:
        raise NotImplementedError('Raw output is not implemented')
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SPDX-License-Identifier: MIT
|
|
3
|
+
|
|
4
|
+
Copyright (c) 2025, SCANOSS
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be included in
|
|
14
|
+
all copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
22
|
+
THE SOFTWARE.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from dataclasses import dataclass
|
|
26
|
+
from typing import Optional
|
|
27
|
+
|
|
28
|
+
from pypac.parser import PACFile
|
|
29
|
+
|
|
30
|
+
from scanoss.constants import (
|
|
31
|
+
DEFAULT_NB_THREADS,
|
|
32
|
+
DEFAULT_POST_SIZE,
|
|
33
|
+
DEFAULT_SC_TIMEOUT,
|
|
34
|
+
DEFAULT_TIMEOUT,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class ScannerConfig:
    """Common configuration shared by the SCANOSS scanners.

    Field order and defaults are part of the public constructor signature.
    """

    # Verbosity flags.
    debug: bool = False
    trace: bool = False
    quiet: bool = False
    # SCANOSS API key; None for anonymous access.
    api_key: Optional[str] = None
    # REST API endpoint URL.
    url: Optional[str] = None
    # gRPC endpoint URL.
    grpc_url: Optional[str] = None
    # Maximum POST payload size (see scanoss.constants for the default).
    post_size: int = DEFAULT_POST_SIZE
    # General request timeout in seconds.
    timeout: int = DEFAULT_TIMEOUT
    # Scancode (dependency scan) timeout in seconds.
    sc_timeout: int = DEFAULT_SC_TIMEOUT
    # Number of worker threads to use.
    nb_threads: int = DEFAULT_NB_THREADS
    # HTTP and gRPC proxy URLs.
    proxy: Optional[str] = None
    grpc_proxy: Optional[str] = None

    # TLS certificate bundle path and optional PAC proxy auto-config file.
    ca_cert: Optional[str] = None
    pac: Optional[PACFile] = None
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def create_scanner_config_from_args(args) -> ScannerConfig:
    """Build a :class:`ScannerConfig` from parsed CLI arguments.

    Attributes that may be absent from ``args`` fall back to ``None`` or the
    package-level defaults.
    """

    def _opt(name, default=None):
        # Read an optional attribute from the args namespace.
        return getattr(args, name, default)

    return ScannerConfig(
        debug=args.debug,
        trace=args.trace,
        quiet=args.quiet,
        api_key=_opt('key'),
        url=_opt('api_url'),
        grpc_url=_opt('grpc_url'),
        post_size=_opt('post_size', DEFAULT_POST_SIZE),
        timeout=_opt('timeout', DEFAULT_TIMEOUT),
        sc_timeout=_opt('sc_timeout', DEFAULT_SC_TIMEOUT),
        nb_threads=_opt('nb_threads', DEFAULT_NB_THREADS),
        proxy=_opt('proxy'),
        grpc_proxy=_opt('grpc_proxy'),
        ca_cert=_opt('ca_cert'),
        pac=_opt('pac'),
    )
|
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SPDX-License-Identifier: MIT
|
|
3
|
+
|
|
4
|
+
Copyright (c) 2025, SCANOSS
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be included in
|
|
14
|
+
all copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
22
|
+
THE SOFTWARE.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
import json
|
|
26
|
+
import threading
|
|
27
|
+
import time
|
|
28
|
+
from typing import Dict, Optional
|
|
29
|
+
|
|
30
|
+
from progress.spinner import Spinner
|
|
31
|
+
|
|
32
|
+
from scanoss.constants import (
|
|
33
|
+
DEFAULT_HFH_DEPTH,
|
|
34
|
+
DEFAULT_HFH_MIN_ACCEPTED_SCORE,
|
|
35
|
+
DEFAULT_HFH_RANK_THRESHOLD,
|
|
36
|
+
DEFAULT_HFH_RECURSIVE_THRESHOLD,
|
|
37
|
+
)
|
|
38
|
+
from scanoss.cyclonedx import CycloneDx
|
|
39
|
+
from scanoss.file_filters import FileFilters
|
|
40
|
+
from scanoss.scanners.folder_hasher import FolderHasher
|
|
41
|
+
from scanoss.scanners.scanner_config import ScannerConfig
|
|
42
|
+
from scanoss.scanoss_settings import ScanossSettings
|
|
43
|
+
from scanoss.scanossbase import ScanossBase
|
|
44
|
+
from scanoss.scanossgrpc import ScanossGrpc
|
|
45
|
+
from scanoss.utils.abstract_presenter import AbstractPresenter
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class ScannerHFH:
    """
    Folder Hashing Scanner.

    This scanner processes a directory, computes CRC64 hashes for the files,
    and calculates simhash values based on file names and content to detect folder-level similarities.
    """

    def __init__(  # noqa: PLR0913
        self,
        scan_dir: str,
        config: ScannerConfig,
        client: Optional[ScanossGrpc] = None,
        scanoss_settings: Optional[ScanossSettings] = None,
        rank_threshold: int = DEFAULT_HFH_RANK_THRESHOLD,
        depth: int = DEFAULT_HFH_DEPTH,
        recursive_threshold: float = DEFAULT_HFH_RECURSIVE_THRESHOLD,
        min_accepted_score: float = DEFAULT_HFH_MIN_ACCEPTED_SCORE,
        use_grpc: bool = False,
    ):
        """
        Initialize the ScannerHFH.

        Args:
            scan_dir (str): The directory to be scanned.
            config (ScannerConfig): Configuration parameters for the scanner.
            client (ScanossGrpc): gRPC client for communicating with the scanning service.
            scanoss_settings (Optional[ScanossSettings]): Optional settings for Scanoss.
            rank_threshold (int): Get results with rank below this threshold (default: 5).
            depth (int): How many levels to scan (default: 1).
            recursive_threshold (float): Minimum score threshold to consider a match (default: 0.25).
            min_accepted_score (float): Only show results with a score at or above this threshold (default: 0.15).
        """
        self.base = ScanossBase(
            debug=config.debug,
            trace=config.trace,
            quiet=config.quiet,
        )
        self.presenter = ScannerHFHPresenter(
            self,
            debug=config.debug,
            trace=config.trace,
            quiet=config.quiet,
        )
        self.file_filters = FileFilters(
            debug=config.debug,
            trace=config.trace,
            quiet=config.quiet,
            scanoss_settings=scanoss_settings,
        )
        # Delegate folder-tree construction and hash computation to FolderHasher.
        self.folder_hasher = FolderHasher(
            scan_dir=scan_dir,
            config=config,
            scanoss_settings=scanoss_settings,
            depth=depth,
        )

        self.scan_dir = scan_dir
        self.client = client
        # Populated by _execute_grpc_scan(); None until scan() completes (or on error).
        self.scan_results = None
        self.rank_threshold = rank_threshold
        self.recursive_threshold = recursive_threshold
        self.min_accepted_score = min_accepted_score
        self.use_grpc = use_grpc

    def _execute_grpc_scan(self, hfh_request: Dict) -> None:
        """
        Execute folder hash scan.

        Runs on a worker thread (see scan()); results are communicated via
        self.scan_results rather than a return value.

        Args:
            hfh_request: Request dictionary for the gRPC call
        """
        try:
            self.scan_results = self.client.folder_hash_scan(hfh_request, self.use_grpc)
        except Exception as e:
            # Boundary handler: report the failure and signal it via a None result.
            self.base.print_stderr(f'Error during folder hash scan: {e}')
            self.scan_results = None

    def scan(self) -> Optional[Dict]:
        """
        Scan the provided directory using the folder hashing algorithm.

        Returns:
            Optional[Dict]: The folder hash response from the gRPC client, or None if an error occurs.
        """
        hfh_request = {
            'root': self.folder_hasher.hash_directory(path=self.scan_dir),
            'rank_threshold': self.rank_threshold,
            'recursive_threshold': self.recursive_threshold,
            'min_accepted_score': self.min_accepted_score,
        }

        spinner_ctx = Spinner('Scanning folder...')

        # Run the gRPC call on a worker thread so the spinner can animate while waiting.
        with spinner_ctx as spinner:
            grpc_thread = threading.Thread(target=self._execute_grpc_scan, args=(hfh_request,))
            grpc_thread.start()

            while grpc_thread.is_alive():
                spinner.next()
                time.sleep(0.1)

            grpc_thread.join()

        return self.scan_results

    def present(self, output_format: Optional[str] = None, output_file: Optional[str] = None):
        """Present the results in the selected format"""
        self.presenter.present(output_format=output_format, output_file=output_file)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
class ScannerHFHPresenter(AbstractPresenter):
    """
    ScannerHFH presenter class
    Handles the presentation of the folder hashing scan results
    """

    def __init__(self, scanner: ScannerHFH, **kwargs):
        super().__init__(**kwargs)
        self.scanner = scanner

    def _format_json_output(self) -> str:
        """
        Format the scan output data into a JSON object

        Returns:
            str: The formatted JSON string
        """
        return json.dumps(self.scanner.scan_results, indent=2)

    def _format_plain_output(self) -> str:
        """
        Format the scan output data into a plain text string
        """
        return (
            json.dumps(self.scanner.scan_results, indent=2)
            if isinstance(self.scanner.scan_results, dict)
            else str(self.scanner.scan_results)
        )

    def _format_cyclonedx_output(self) -> str:  # noqa: PLR0911
        """
        Build a CycloneDX document for the best-ranked match of the first scan result.

        Fetches dependency and vulnerability details for the best match (the
        component with order == 1) via the scanner's gRPC client.

        Returns:
            str: The CycloneDX JSON string, or '' when results are missing or
                 any step fails (errors are reported on stderr).
        """
        if not self.scanner.scan_results:
            return ''
        try:
            if 'results' not in self.scanner.scan_results or not self.scanner.scan_results['results']:
                self.base.print_stderr('ERROR: No scan results found')
                return ''

            first_result = self.scanner.scan_results['results'][0]

            # order == 1 marks the best-ranked match returned by the service.
            best_match_components = [c for c in first_result.get('components', []) if c.get('order') == 1]
            if not best_match_components:
                self.base.print_stderr('ERROR: No best match component found')
                return ''

            best_match_component = best_match_components[0]
            if not best_match_component.get('versions'):
                self.base.print_stderr('ERROR: No versions found for best match component')
                return ''

            best_match_version = best_match_component['versions'][0]
            purl = best_match_component['purl']

            get_dependencies_json_request = {
                'files': [
                    {
                        'file': f'{best_match_component["name"]}:{best_match_version["version"]}',
                        'purls': [{'purl': purl, 'requirement': best_match_version['version']}],
                    }
                ]
            }

            get_vulnerabilities_json_request = {
                'components': [{'purl': purl, 'requirement': best_match_version['version']}],
            }

            decorated_scan_results = self.scanner.client.get_dependencies(get_dependencies_json_request)
            vulnerabilities = self.scanner.client.get_vulnerabilities_json(get_vulnerabilities_json_request)

            cdx = CycloneDx(self.base.debug)
            # CycloneDx expects a mapping of file name -> list of match entries.
            scan_results = {}
            for f in decorated_scan_results['files']:
                scan_results[f['file']] = [f]
            success, cdx_output = cdx.produce_from_json(scan_results)
            if not success:
                self.base.print_stderr('ERROR: Failed to produce CycloneDX output')
                # Return '' (not None) to honour the declared -> str contract,
                # consistent with the other failure branches above.
                return ''

            if vulnerabilities:
                cdx_output = cdx.append_vulnerabilities(cdx_output, vulnerabilities, purl)

            return json.dumps(cdx_output, indent=2)
        except Exception as e:
            # Message corrected: this method produces CycloneDX output, not license info.
            self.base.print_stderr(f'ERROR: Failed to produce CycloneDX output: {e}')
            return ''

    def _format_spdxlite_output(self) -> str:
        raise NotImplementedError('SPDXlite output is not implemented')

    def _format_csv_output(self) -> str:
        raise NotImplementedError('CSV output is not implemented')

    def _format_raw_output(self) -> str:
        raise NotImplementedError('Raw output is not implemented')
|