scanoss 1.12.2__py3-none-any.whl → 1.43.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. protoc_gen_swagger/__init__.py +13 -13
  2. protoc_gen_swagger/options/__init__.py +13 -13
  3. protoc_gen_swagger/options/annotations_pb2.py +18 -12
  4. protoc_gen_swagger/options/annotations_pb2.pyi +48 -0
  5. protoc_gen_swagger/options/annotations_pb2_grpc.py +20 -0
  6. protoc_gen_swagger/options/openapiv2_pb2.py +110 -99
  7. protoc_gen_swagger/options/openapiv2_pb2.pyi +1317 -0
  8. protoc_gen_swagger/options/openapiv2_pb2_grpc.py +20 -0
  9. scanoss/__init__.py +18 -18
  10. scanoss/api/__init__.py +17 -17
  11. scanoss/api/common/__init__.py +17 -17
  12. scanoss/api/common/v2/__init__.py +17 -17
  13. scanoss/api/common/v2/scanoss_common_pb2.py +49 -20
  14. scanoss/api/common/v2/scanoss_common_pb2_grpc.py +25 -0
  15. scanoss/api/components/__init__.py +17 -17
  16. scanoss/api/components/v2/__init__.py +17 -17
  17. scanoss/api/components/v2/scanoss_components_pb2.py +68 -43
  18. scanoss/api/components/v2/scanoss_components_pb2_grpc.py +83 -22
  19. scanoss/api/cryptography/v2/scanoss_cryptography_pb2.py +136 -21
  20. scanoss/api/cryptography/v2/scanoss_cryptography_pb2_grpc.py +766 -13
  21. scanoss/api/dependencies/__init__.py +17 -17
  22. scanoss/api/dependencies/v2/__init__.py +17 -17
  23. scanoss/api/dependencies/v2/scanoss_dependencies_pb2.py +56 -29
  24. scanoss/api/dependencies/v2/scanoss_dependencies_pb2_grpc.py +94 -8
  25. scanoss/api/geoprovenance/__init__.py +23 -0
  26. scanoss/api/geoprovenance/v2/__init__.py +23 -0
  27. scanoss/api/geoprovenance/v2/scanoss_geoprovenance_pb2.py +92 -0
  28. scanoss/api/geoprovenance/v2/scanoss_geoprovenance_pb2_grpc.py +381 -0
  29. scanoss/api/licenses/__init__.py +23 -0
  30. scanoss/api/licenses/v2/__init__.py +23 -0
  31. scanoss/api/licenses/v2/scanoss_licenses_pb2.py +84 -0
  32. scanoss/api/licenses/v2/scanoss_licenses_pb2_grpc.py +302 -0
  33. scanoss/api/scanning/__init__.py +17 -17
  34. scanoss/api/scanning/v2/__init__.py +17 -17
  35. scanoss/api/scanning/v2/scanoss_scanning_pb2.py +42 -13
  36. scanoss/api/scanning/v2/scanoss_scanning_pb2_grpc.py +86 -7
  37. scanoss/api/semgrep/__init__.py +17 -17
  38. scanoss/api/semgrep/v2/__init__.py +17 -17
  39. scanoss/api/semgrep/v2/scanoss_semgrep_pb2.py +50 -23
  40. scanoss/api/semgrep/v2/scanoss_semgrep_pb2_grpc.py +151 -16
  41. scanoss/api/vulnerabilities/__init__.py +17 -17
  42. scanoss/api/vulnerabilities/v2/__init__.py +17 -17
  43. scanoss/api/vulnerabilities/v2/scanoss_vulnerabilities_pb2.py +78 -31
  44. scanoss/api/vulnerabilities/v2/scanoss_vulnerabilities_pb2_grpc.py +282 -18
  45. scanoss/cli.py +2359 -370
  46. scanoss/components.py +187 -94
  47. scanoss/constants.py +22 -0
  48. scanoss/cryptography.py +308 -0
  49. scanoss/csvoutput.py +91 -58
  50. scanoss/cyclonedx.py +221 -63
  51. scanoss/data/build_date.txt +1 -1
  52. scanoss/data/osadl-copyleft.json +133 -0
  53. scanoss/data/scanoss-settings-schema.json +254 -0
  54. scanoss/delta.py +197 -0
  55. scanoss/export/__init__.py +23 -0
  56. scanoss/export/dependency_track.py +227 -0
  57. scanoss/file_filters.py +582 -0
  58. scanoss/filecount.py +75 -69
  59. scanoss/gitlabqualityreport.py +214 -0
  60. scanoss/header_filter.py +563 -0
  61. scanoss/inspection/__init__.py +23 -0
  62. scanoss/inspection/policy_check/__init__.py +0 -0
  63. scanoss/inspection/policy_check/dependency_track/__init__.py +0 -0
  64. scanoss/inspection/policy_check/dependency_track/project_violation.py +479 -0
  65. scanoss/inspection/policy_check/policy_check.py +222 -0
  66. scanoss/inspection/policy_check/scanoss/__init__.py +0 -0
  67. scanoss/inspection/policy_check/scanoss/copyleft.py +243 -0
  68. scanoss/inspection/policy_check/scanoss/undeclared_component.py +309 -0
  69. scanoss/inspection/summary/__init__.py +0 -0
  70. scanoss/inspection/summary/component_summary.py +170 -0
  71. scanoss/inspection/summary/license_summary.py +191 -0
  72. scanoss/inspection/summary/match_summary.py +341 -0
  73. scanoss/inspection/utils/file_utils.py +44 -0
  74. scanoss/inspection/utils/license_utils.py +123 -0
  75. scanoss/inspection/utils/markdown_utils.py +63 -0
  76. scanoss/inspection/utils/scan_result_processor.py +417 -0
  77. scanoss/osadl.py +125 -0
  78. scanoss/results.py +275 -0
  79. scanoss/scancodedeps.py +87 -38
  80. scanoss/scanner.py +431 -539
  81. scanoss/scanners/__init__.py +23 -0
  82. scanoss/scanners/container_scanner.py +476 -0
  83. scanoss/scanners/folder_hasher.py +358 -0
  84. scanoss/scanners/scanner_config.py +73 -0
  85. scanoss/scanners/scanner_hfh.py +252 -0
  86. scanoss/scanoss_settings.py +337 -0
  87. scanoss/scanossapi.py +140 -101
  88. scanoss/scanossbase.py +59 -22
  89. scanoss/scanossgrpc.py +799 -251
  90. scanoss/scanpostprocessor.py +294 -0
  91. scanoss/scantype.py +22 -21
  92. scanoss/services/dependency_track_service.py +132 -0
  93. scanoss/spdxlite.py +532 -174
  94. scanoss/threadeddependencies.py +148 -47
  95. scanoss/threadedscanning.py +53 -37
  96. scanoss/utils/__init__.py +23 -0
  97. scanoss/utils/abstract_presenter.py +103 -0
  98. scanoss/utils/crc64.py +96 -0
  99. scanoss/utils/file.py +84 -0
  100. scanoss/utils/scanoss_scan_results_utils.py +41 -0
  101. scanoss/utils/simhash.py +198 -0
  102. scanoss/winnowing.py +241 -63
  103. {scanoss-1.12.2.dist-info → scanoss-1.43.1.dist-info}/METADATA +18 -9
  104. scanoss-1.43.1.dist-info/RECORD +110 -0
  105. {scanoss-1.12.2.dist-info → scanoss-1.43.1.dist-info}/WHEEL +1 -1
  106. scanoss-1.12.2.dist-info/RECORD +0 -58
  107. {scanoss-1.12.2.dist-info → scanoss-1.43.1.dist-info}/entry_points.txt +0 -0
  108. {scanoss-1.12.2.dist-info → scanoss-1.43.1.dist-info/licenses}/LICENSE +0 -0
  109. {scanoss-1.12.2.dist-info → scanoss-1.43.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,358 @@
1
+ import json
2
+ import os
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Dict, List, Literal, Optional
6
+
7
+ from progress.bar import Bar
8
+
9
+ from scanoss.constants import DEFAULT_HFH_DEPTH
10
+ from scanoss.file_filters import FileFilters
11
+ from scanoss.scanoss_settings import ScanossSettings
12
+ from scanoss.scanossbase import ScanossBase
13
+ from scanoss.utils.abstract_presenter import AbstractPresenter
14
+ from scanoss.utils.crc64 import CRC64
15
+ from scanoss.utils.simhash import WordFeatureSet, fingerprint, simhash, vectorize_bytes
16
+
17
# A folder must contain at least this many (filtered) files before hash values
# are computed for it; smaller folders yield empty hash data (see FolderHasher._hash_calc).
MINIMUM_FILE_COUNT = 8
# The sorted, concatenated file names must be at least this many UTF-8 bytes;
# shorter inputs are considered too small to produce a meaningful simhash.
MINIMUM_CONCATENATED_NAME_LENGTH = 32
19
+
20
class DirectoryNode:
    """A single directory in the folder-hashing tree.

    Holds the directory path, its sub-directory nodes and the files
    registered against it.
    """

    def __init__(self, path: str):
        # Full path string identifying this directory node
        self.path = path
        self.is_dir = True
        # Child directory nodes, keyed by their full path string
        self.children: Dict[str, DirectoryNode] = {}
        # File entries attached to this node
        self.files: List[DirectoryFile] = []
30
+
31
+
32
class DirectoryFile:
    """A single file entry in the folder-hashing tree.

    Stores the file's tree-relative path together with its CRC64 key,
    both in raw form and as a hex string.
    """

    def __init__(self, path: str, key: List[bytes], key_str: str):
        self.path = path  # tree-relative file path
        self.key = key  # CRC64 hash key (raw)
        self.key_str = key_str  # hex-encoded form of the key
41
+
42
+
43
@dataclass
class FolderHasherConfig:
    """Runtime options for FolderHasher, typically built from CLI arguments."""

    # Logging verbosity flags
    debug: bool = False
    trace: bool = False
    quiet: bool = False
    # Where and how results are written
    output_file: Optional[str] = None
    output_format: Literal['json'] = 'json'
    # SCANOSS settings file handling
    settings_file: Optional[str] = None
    skip_settings_file: bool = False
52
+
53
+
54
def create_folder_hasher_config_from_args(args) -> FolderHasherConfig:
    """Build a FolderHasherConfig from parsed CLI arguments.

    The verbosity flags are read directly; the remaining attributes may be
    absent from the namespace and fall back to their defaults.
    """
    # Config field -> (argparse attribute, default when the attribute is missing)
    optional_fields = {
        'output_file': ('output', None),
        'output_format': ('format', 'json'),
        'settings_file': ('settings', None),
        'skip_settings_file': ('skip_settings_file', False),
    }
    extra = {field: getattr(args, attr, default) for field, (attr, default) in optional_fields.items()}
    return FolderHasherConfig(debug=args.debug, trace=args.trace, quiet=args.quiet, **extra)
64
+
65
+
66
class FolderHasher:
    """
    Folder Hasher.

    This class is used to produce a folder hash for a given directory.

    It builds a directory tree (DirectoryNode) and computes the associated
    hash data for the folder.

    Args:
        scan_dir (str): The directory to be hashed.
        config (FolderHasherConfig): Configuration parameters for the folder hasher.
        scanoss_settings (Optional[ScanossSettings]): Optional settings for Scanoss.
        depth (int): How many levels to hash from the root directory (default: 1).
    """

    def __init__(
        self,
        scan_dir: str,
        config: FolderHasherConfig,
        scanoss_settings: Optional[ScanossSettings] = None,
        depth: int = DEFAULT_HFH_DEPTH,
    ):
        self.base = ScanossBase(
            debug=config.debug,
            trace=config.trace,
            quiet=config.quiet,
        )
        self.file_filters = FileFilters(
            debug=config.debug,
            trace=config.trace,
            quiet=config.quiet,
            scanoss_settings=scanoss_settings,
            is_folder_hashing_scan=True,
        )
        self.presenter = FolderHasherPresenter(
            self,
            debug=config.debug,
            trace=config.trace,
            quiet=config.quiet,
        )

        self.scan_dir = scan_dir
        self.tree = None  # Populated by hash_directory()
        self.depth = depth

    def hash_directory(self, path: str) -> dict:
        """
        Generate the folder hashing request structure from a directory path.

        This method builds a directory tree (DirectoryNode) and computes the associated
        hash data for the folder.

        Args:
            path (str): The root directory path.

        Returns:
            dict: The folder hash request structure.
        """
        root_node = self._build_root_node(path)
        tree = self._hash_calc_from_node(root_node)

        self.tree = tree

        return tree

    def _build_root_node(
        self,
        path: str,
    ) -> DirectoryNode:
        """
        Build a directory tree from the given path with file information.

        The tree includes DirectoryNode objects populated with filtered file items,
        each containing their relative path and CRC64 hash key.

        Args:
            path (str): The directory path to build the tree from.

        Returns:
            DirectoryNode: The root node representing the directory.
        """
        root = Path(path).resolve()
        root_node = DirectoryNode(str(root))

        all_files = [f for f in root.rglob('*') if f.is_file()]
        filtered_files = self.file_filters.get_filtered_files_from_files(all_files, str(root))

        # Sort the files by name to ensure the hash is the same for the same folder
        filtered_files.sort()

        bar_ctx = Bar('Hashing files...', max=len(filtered_files))

        with bar_ctx as bar:
            # Pre-set so the except clause below can always report something
            full_file_path = ''
            for file_path in filtered_files:
                try:
                    file_path_obj = Path(file_path) if isinstance(file_path, str) else file_path
                    full_file_path = file_path_obj if file_path_obj.is_absolute() else root / file_path_obj

                    self.base.print_debug(f'\nHashing file {str(full_file_path)}')

                    file_bytes = full_file_path.read_bytes()
                    key = CRC64.get_hash_buff(file_bytes)
                    key_str = ''.join(f'{b:02x}' for b in key)
                    rel_path = str(full_file_path.relative_to(root))

                    file_item = DirectoryFile(rel_path, key, key_str)

                    # Walk/create the intermediate directory nodes and attach the
                    # file to the node of its own directory.
                    current_node = root_node
                    for part in Path(rel_path).parent.parts:
                        child_path = str(Path(current_node.path) / part)
                        if child_path not in current_node.children:
                            current_node.children[child_path] = DirectoryNode(child_path)
                        current_node = current_node.children[child_path]
                    current_node.files.append(file_item)

                    # The root node also collects every file so the root hash covers
                    # the whole tree. Guard against double-adding files that live
                    # directly in the root (current_node is the root itself then).
                    if current_node is not root_node:
                        root_node.files.append(file_item)

                except Exception as e:
                    # Best-effort: unreadable/racy files are skipped, not fatal
                    self.base.print_debug(f'Skipping file {full_file_path}: {str(e)}')

                bar.next()
        return root_node

    def _hash_calc_from_node(self, node: DirectoryNode, current_depth: int = 1) -> dict:
        """
        Recursively compute folder hash data for a directory node.

        The hash data includes the path identifier, simhash for file names,
        simhash for file content, directory hash, language extensions, and children node hash information.

        Args:
            node (DirectoryNode): The directory node to compute the hash for.
            current_depth (int): The current depth level (1-based, root is depth 1).

        Returns:
            dict: The computed hash data for the node.
        """
        hash_data = self._hash_calc(node)

        # Safely calculate relative path
        try:
            node_path = Path(node.path).resolve()
            scan_dir_path = Path(self.scan_dir).resolve()
            rel_path = node_path.relative_to(scan_dir_path)
        except ValueError:
            # If relative_to fails, use the node path as is or a fallback
            rel_path = Path(node.path).name if node.path else Path('.')

        # Only process children if we haven't reached the depth limit
        children = []
        if current_depth < self.depth:
            children = [self._hash_calc_from_node(child, current_depth + 1) for child in node.children.values()]

        return {
            'path_id': str(rel_path),
            'sim_hash_names': f'{hash_data["name_hash"]:02x}' if hash_data['name_hash'] is not None else None,
            'sim_hash_content': f'{hash_data["content_hash"]:02x}' if hash_data['content_hash'] is not None else None,
            'sim_hash_dir_names': f'{hash_data["dir_hash"]:02x}' if hash_data['dir_hash'] is not None else None,
            'lang_extensions': hash_data['lang_extensions'],
            'children': children,
        }

    def _hash_calc(self, node: DirectoryNode) -> dict:
        """
        Compute folder hash values for a given directory node.

        The method aggregates unique file keys and sorted file names to generate
        simhash-based hash values for both file names and file contents.

        Args:
            node (DirectoryNode): The directory node containing file items.

        Returns:
            dict: A dictionary with 'name_hash', 'content_hash', 'dir_hash', and 'lang_extensions' keys.
        """
        processed_hashes = set()
        unique_file_names = set()
        unique_directories = set()
        extension_map = {}
        file_hashes = []
        selected_names = []

        for file in node.files:
            key_str = file.key_str

            file_name = os.path.basename(file.path)

            file_name_without_extension, extension = os.path.splitext(file_name)
            current_directory = os.path.dirname(file.path)

            # Count files per extension (without the leading dot)
            if extension and len(extension) > 1:
                ext_without_dot = extension[1:]
                extension_map[ext_without_dot] = extension_map.get(ext_without_dot, 0) + 1

            # Strip the scan root prefix (if present) so only tree-relative
            # directory names are collected. The result must be assigned —
            # str.replace/lstrip return new strings.
            current_directory = current_directory.replace(self.scan_dir, '', 1).lstrip(os.path.sep)
            parts = current_directory.split(os.path.sep)
            for d in parts:
                if d in {'', '.', '..'}:
                    continue
                unique_directories.add(d)

            # NOTE(review): processed_hashes is collected but never consulted to
            # skip duplicate file contents — confirm against the reference (Go)
            # implementation whether duplicates should be excluded here.
            processed_hashes.add(key_str)
            unique_file_names.add(file_name_without_extension)
            selected_names.append(file_name)
            file_hashes.append(file.key)

        # Too few files: not enough signal to produce a meaningful hash
        if len(selected_names) < MINIMUM_FILE_COUNT:
            return {'name_hash': None, 'content_hash': None, 'dir_hash': None, 'lang_extensions': None}

        selected_names.sort()
        concatenated_names = ''.join(selected_names)

        if len(concatenated_names.encode('utf-8')) < MINIMUM_CONCATENATED_NAME_LENGTH:
            return {'name_hash': None, 'content_hash': None, 'dir_hash': None, 'lang_extensions': None}

        # Concatenate the unique file names without the extensions, adding a space and sorting them alphabetically
        unique_file_names_list = list(unique_file_names)
        unique_file_names_list.sort()
        concatenated_names = ' '.join(unique_file_names_list)

        # We do the same for the directory names, adding a space and sorting them alphabetically
        unique_directories_list = list(unique_directories)
        unique_directories_list.sort()
        concatenated_directories = ' '.join(unique_directories_list)

        names_simhash = simhash(WordFeatureSet(concatenated_names.encode('utf-8')))
        dir_simhash = simhash(WordFeatureSet(concatenated_directories.encode('utf-8')))
        content_simhash = fingerprint(vectorize_bytes(file_hashes))

        # Debug logging similar to Go implementation
        self.base.print_debug(f'Unique file names: {unique_file_names_list}')
        self.base.print_debug(f'Unique directories: {unique_directories_list}')
        self.base.print_debug(f'{dir_simhash:x}/{names_simhash:x} - {content_simhash:x} - {extension_map}')

        return {
            'name_hash': names_simhash,
            'content_hash': content_simhash,
            'dir_hash': dir_simhash,
            'lang_extensions': extension_map,
        }

    def present(self, output_format: Optional[str] = None, output_file: Optional[str] = None):
        """Present the hashed tree in the selected format"""
        self.presenter.present(output_format=output_format, output_file=output_file)
317
+
318
+
319
class FolderHasherPresenter(AbstractPresenter):
    """Presenter for folder-hashing results.

    Serializes the tree produced by a FolderHasher into the supported
    output formats (JSON / plain text); the remaining formats are not
    supported for folder hashing.
    """

    def __init__(self, folder_hasher: FolderHasher, **kwargs):
        super().__init__(**kwargs)
        self.folder_hasher = folder_hasher

    def _format_json_output(self) -> str:
        """Return the hashed tree serialized as pretty-printed JSON."""
        return json.dumps(self.folder_hasher.tree, indent=2)

    def _format_plain_output(self) -> str:
        """Return the hashed tree as text (pretty JSON when it is a dict)."""
        tree = self.folder_hasher.tree
        if isinstance(tree, dict):
            return json.dumps(tree, indent=2)
        return str(tree)

    def _format_cyclonedx_output(self) -> str:
        raise NotImplementedError('CycloneDX output is not implemented')

    def _format_spdxlite_output(self) -> str:
        raise NotImplementedError('SPDXlite output is not implemented')

    def _format_csv_output(self) -> str:
        raise NotImplementedError('CSV output is not implemented')

    def _format_raw_output(self) -> str:
        raise NotImplementedError('Raw output is not implemented')
@@ -0,0 +1,73 @@
1
+ """
2
+ SPDX-License-Identifier: MIT
3
+
4
+ Copyright (c) 2025, SCANOSS
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in
14
+ all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ THE SOFTWARE.
23
+ """
24
+
25
+ from dataclasses import dataclass
26
+ from typing import Optional
27
+
28
+ from pypac.parser import PACFile
29
+
30
+ from scanoss.constants import (
31
+ DEFAULT_NB_THREADS,
32
+ DEFAULT_POST_SIZE,
33
+ DEFAULT_SC_TIMEOUT,
34
+ DEFAULT_TIMEOUT,
35
+ )
36
+
37
+
38
@dataclass
class ScannerConfig:
    """Connection/behaviour options shared by SCANOSS scanner instances."""

    # Logging verbosity flags
    debug: bool = False
    trace: bool = False
    quiet: bool = False
    # Service endpoints and credentials
    api_key: Optional[str] = None
    url: Optional[str] = None
    grpc_url: Optional[str] = None
    # Request sizing / timeouts / parallelism
    post_size: int = DEFAULT_POST_SIZE
    timeout: int = DEFAULT_TIMEOUT
    sc_timeout: int = DEFAULT_SC_TIMEOUT
    nb_threads: int = DEFAULT_NB_THREADS
    # Proxy configuration (HTTP and gRPC)
    proxy: Optional[str] = None
    grpc_proxy: Optional[str] = None

    # TLS certificate and proxy auto-config
    ca_cert: Optional[str] = None
    pac: Optional[PACFile] = None
55
+
56
+
57
def create_scanner_config_from_args(args) -> ScannerConfig:
    """Build a ScannerConfig from parsed CLI arguments.

    The verbosity flags are read directly; every other attribute may be
    missing from the namespace and falls back to the config's default.
    """
    # Config field -> (argparse attribute, default when the attribute is missing)
    optional_fields = {
        'api_key': ('key', None),
        'url': ('api_url', None),
        'grpc_url': ('grpc_url', None),
        'post_size': ('post_size', DEFAULT_POST_SIZE),
        'timeout': ('timeout', DEFAULT_TIMEOUT),
        'sc_timeout': ('sc_timeout', DEFAULT_SC_TIMEOUT),
        'nb_threads': ('nb_threads', DEFAULT_NB_THREADS),
        'proxy': ('proxy', None),
        'grpc_proxy': ('grpc_proxy', None),
        'ca_cert': ('ca_cert', None),
        'pac': ('pac', None),
    }
    extra = {field: getattr(args, attr, default) for field, (attr, default) in optional_fields.items()}
    return ScannerConfig(debug=args.debug, trace=args.trace, quiet=args.quiet, **extra)
@@ -0,0 +1,252 @@
1
+ """
2
+ SPDX-License-Identifier: MIT
3
+
4
+ Copyright (c) 2025, SCANOSS
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in
14
+ all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ THE SOFTWARE.
23
+ """
24
+
25
+ import json
26
+ import threading
27
+ import time
28
+ from typing import Dict, Optional
29
+
30
+ from progress.spinner import Spinner
31
+
32
+ from scanoss.constants import (
33
+ DEFAULT_HFH_DEPTH,
34
+ DEFAULT_HFH_MIN_ACCEPTED_SCORE,
35
+ DEFAULT_HFH_RANK_THRESHOLD,
36
+ DEFAULT_HFH_RECURSIVE_THRESHOLD,
37
+ )
38
+ from scanoss.cyclonedx import CycloneDx
39
+ from scanoss.file_filters import FileFilters
40
+ from scanoss.scanners.folder_hasher import FolderHasher
41
+ from scanoss.scanners.scanner_config import ScannerConfig
42
+ from scanoss.scanoss_settings import ScanossSettings
43
+ from scanoss.scanossbase import ScanossBase
44
+ from scanoss.scanossgrpc import ScanossGrpc
45
+ from scanoss.utils.abstract_presenter import AbstractPresenter
46
+
47
+
48
class ScannerHFH:
    """
    Folder Hashing Scanner.

    Processes a directory, computes CRC64 hashes for its files, and derives
    simhash values from file names and content to detect folder-level
    similarities via the SCANOSS service.
    """

    def __init__(  # noqa: PLR0913
        self,
        scan_dir: str,
        config: ScannerConfig,
        client: Optional[ScanossGrpc] = None,
        scanoss_settings: Optional[ScanossSettings] = None,
        rank_threshold: int = DEFAULT_HFH_RANK_THRESHOLD,
        depth: int = DEFAULT_HFH_DEPTH,
        recursive_threshold: float = DEFAULT_HFH_RECURSIVE_THRESHOLD,
        min_accepted_score: float = DEFAULT_HFH_MIN_ACCEPTED_SCORE,
        use_grpc: bool = False,
    ):
        """
        Initialize the ScannerHFH.

        Args:
            scan_dir (str): The directory to be scanned.
            config (ScannerConfig): Configuration parameters for the scanner.
            client (ScanossGrpc): gRPC client for communicating with the scanning service.
            scanoss_settings (Optional[ScanossSettings]): Optional settings for Scanoss.
            rank_threshold (int): Get results with rank below this threshold (default: 5).
            depth (int): How many levels to scan (default: 1).
            recursive_threshold (float): Minimum score threshold to consider a match (default: 0.25).
            min_accepted_score (float): Only show results with a score at or above this threshold (default: 0.15).
        """
        verbosity = {'debug': config.debug, 'trace': config.trace, 'quiet': config.quiet}
        self.base = ScanossBase(**verbosity)
        self.presenter = ScannerHFHPresenter(self, **verbosity)
        self.file_filters = FileFilters(scanoss_settings=scanoss_settings, **verbosity)
        self.folder_hasher = FolderHasher(
            scan_dir=scan_dir,
            config=config,
            scanoss_settings=scanoss_settings,
            depth=depth,
        )

        self.scan_dir = scan_dir
        self.client = client
        self.scan_results = None  # Populated by scan()
        self.rank_threshold = rank_threshold
        self.recursive_threshold = recursive_threshold
        self.min_accepted_score = min_accepted_score
        self.use_grpc = use_grpc

    def _execute_grpc_scan(self, hfh_request: Dict) -> None:
        """
        Execute the folder hash scan request (run on a worker thread).

        Args:
            hfh_request: Request dictionary for the gRPC call
        """
        try:
            self.scan_results = self.client.folder_hash_scan(hfh_request, self.use_grpc)
        except Exception as e:
            self.base.print_stderr(f'Error during folder hash scan: {e}')
            self.scan_results = None

    def scan(self) -> Optional[Dict]:
        """
        Scan the provided directory using the folder hashing algorithm.

        Returns:
            Optional[Dict]: The folder hash response from the gRPC client, or None if an error occurs.
        """
        request = {
            'root': self.folder_hasher.hash_directory(path=self.scan_dir),
            'rank_threshold': self.rank_threshold,
            'recursive_threshold': self.recursive_threshold,
            'min_accepted_score': self.min_accepted_score,
        }

        # Run the remote call on a worker thread so the spinner stays animated
        worker = threading.Thread(target=self._execute_grpc_scan, args=(request,))
        with Spinner('Scanning folder...') as spinner:
            worker.start()
            while worker.is_alive():
                spinner.next()
                time.sleep(0.1)
            worker.join()

        return self.scan_results

    def present(self, output_format: Optional[str] = None, output_file: Optional[str] = None):
        """Present the results in the selected format"""
        self.presenter.present(output_format=output_format, output_file=output_file)
157
+
158
+
159
class ScannerHFHPresenter(AbstractPresenter):
    """
    ScannerHFH presenter class
    Handles the presentation of the folder hashing scan results
    """

    def __init__(self, scanner: ScannerHFH, **kwargs):
        # Keep a back-reference to the scanner so formatters can read its
        # scan_results and reuse its gRPC client.
        super().__init__(**kwargs)
        self.scanner = scanner

    def _format_json_output(self) -> str:
        """
        Format the scan output data into a JSON object

        Returns:
            str: The formatted JSON string
        """
        return json.dumps(self.scanner.scan_results, indent=2)

    def _format_plain_output(self) -> str:
        """
        Format the scan output data into a plain text string
        """
        # Dict results are pretty-printed as JSON; anything else is stringified.
        return (
            json.dumps(self.scanner.scan_results, indent=2)
            if isinstance(self.scanner.scan_results, dict)
            else str(self.scanner.scan_results)
        )

    def _format_cyclonedx_output(self) -> str:  # noqa: PLR0911
        """
        Build a CycloneDX SBOM for the best-match component of the scan results.

        Picks the component with order == 1 from the first result, then queries
        the gRPC client for its dependency and vulnerability details and feeds
        them through CycloneDx.produce_from_json.

        Returns:
            str: CycloneDX JSON, or '' when no usable match is present.
            NOTE(review): the failure paths below return None while the hint
            says str — confirm callers handle both '' and None.
        """
        if not self.scanner.scan_results:
            return ''
        try:
            # assumes scan_results follows the folder-hash response schema
            # ({'results': [{'components': [...]}]}) — TODO confirm against the API
            if 'results' not in self.scanner.scan_results or not self.scanner.scan_results['results']:
                self.base.print_stderr('ERROR: No scan results found')
                return ''

            first_result = self.scanner.scan_results['results'][0]

            # 'order' == 1 marks the best-ranked match for the folder
            best_match_components = [c for c in first_result.get('components', []) if c.get('order') == 1]
            if not best_match_components:
                self.base.print_stderr('ERROR: No best match component found')
                return ''

            best_match_component = best_match_components[0]
            if not best_match_component.get('versions'):
                self.base.print_stderr('ERROR: No versions found for best match component')
                return ''

            # Only the first (presumably best) version of the best match is used
            best_match_version = best_match_component['versions'][0]
            purl = best_match_component['purl']

            # Request payloads keyed on the best match's purl/version
            get_dependencies_json_request = {
                'files': [
                    {
                        'file': f'{best_match_component["name"]}:{best_match_version["version"]}',
                        'purls': [{'purl': purl, 'requirement': best_match_version['version']}],
                    }
                ]
            }

            get_vulnerabilities_json_request = {
                'components': [{'purl': purl, 'requirement': best_match_version['version']}],
            }

            decorated_scan_results = self.scanner.client.get_dependencies(get_dependencies_json_request)
            vulnerabilities = self.scanner.client.get_vulnerabilities_json(get_vulnerabilities_json_request)

            # Re-shape the dependency response into the per-file mapping that
            # CycloneDx.produce_from_json expects (file -> list of results)
            cdx = CycloneDx(self.base.debug)
            scan_results = {}
            for f in decorated_scan_results['files']:
                scan_results[f['file']] = [f]
            success, cdx_output = cdx.produce_from_json(scan_results)
            if not success:
                error_msg = 'ERROR: Failed to produce CycloneDX output'
                self.base.print_stderr(error_msg)
                return None

            # Attach vulnerability data to the SBOM when any was returned
            if vulnerabilities:
                cdx_output = cdx.append_vulnerabilities(cdx_output, vulnerabilities, purl)

            return json.dumps(cdx_output, indent=2)
        except Exception as e:
            # Best-effort: any failure (missing keys, client errors) is reported
            # on stderr rather than raised
            self.base.print_stderr(f'ERROR: Failed to get license information: {e}')
            return None

    def _format_spdxlite_output(self) -> str:
        raise NotImplementedError('SPDXlite output is not implemented')

    def _format_csv_output(self) -> str:
        raise NotImplementedError('CSV output is not implemented')

    def _format_raw_output(self) -> str:
        raise NotImplementedError('Raw output is not implemented')