scanoss 1.24.0__tar.gz → 1.25.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scanoss-1.24.0/src/scanoss.egg-info → scanoss-1.25.0}/PKG-INFO +1 -1
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/__init__.py +1 -1
- scanoss-1.25.0/src/scanoss/data/build_date.txt +1 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/inspection/policy_check.py +77 -47
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/winnowing.py +64 -7
- {scanoss-1.24.0 → scanoss-1.25.0/src/scanoss.egg-info}/PKG-INFO +1 -1
- scanoss-1.25.0/tests/test_winnowing.py +393 -0
- scanoss-1.24.0/src/scanoss/data/build_date.txt +0 -1
- scanoss-1.24.0/tests/test_winnowing.py +0 -82
- {scanoss-1.24.0 → scanoss-1.25.0}/LICENSE +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/PACKAGE.md +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/README.md +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/pyproject.toml +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/setup.cfg +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/protoc_gen_swagger/__init__.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/protoc_gen_swagger/options/__init__.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/protoc_gen_swagger/options/annotations_pb2.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/protoc_gen_swagger/options/annotations_pb2_grpc.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/protoc_gen_swagger/options/openapiv2_pb2.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/protoc_gen_swagger/options/openapiv2_pb2_grpc.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/__init__.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/common/__init__.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/common/v2/__init__.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/common/v2/scanoss_common_pb2.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/common/v2/scanoss_common_pb2_grpc.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/components/__init__.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/components/v2/__init__.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/components/v2/scanoss_components_pb2.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/components/v2/scanoss_components_pb2_grpc.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/cryptography/v2/scanoss_cryptography_pb2.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/cryptography/v2/scanoss_cryptography_pb2_grpc.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/dependencies/__init__.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/dependencies/v2/__init__.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/dependencies/v2/scanoss_dependencies_pb2.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/dependencies/v2/scanoss_dependencies_pb2_grpc.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/geoprovenance/__init__.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/geoprovenance/v2/__init__.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/geoprovenance/v2/scanoss_geoprovenance_pb2.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/geoprovenance/v2/scanoss_geoprovenance_pb2_grpc.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/scanning/__init__.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/scanning/v2/__init__.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/scanning/v2/scanoss_scanning_pb2.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/scanning/v2/scanoss_scanning_pb2_grpc.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/semgrep/__init__.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/semgrep/v2/__init__.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/semgrep/v2/scanoss_semgrep_pb2.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/semgrep/v2/scanoss_semgrep_pb2_grpc.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/vulnerabilities/__init__.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/vulnerabilities/v2/__init__.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/vulnerabilities/v2/scanoss_vulnerabilities_pb2.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/vulnerabilities/v2/scanoss_vulnerabilities_pb2_grpc.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/cli.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/components.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/constants.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/cryptography.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/csvoutput.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/cyclonedx.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/data/scanoss-settings-schema.json +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/data/spdx-exceptions.json +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/data/spdx-licenses.json +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/file_filters.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/filecount.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/inspection/__init__.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/inspection/copyleft.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/inspection/undeclared_component.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/inspection/utils/license_utils.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/results.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/scancodedeps.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/scanner.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/scanners/__init__.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/scanners/container_scanner.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/scanners/folder_hasher.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/scanners/scanner_config.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/scanners/scanner_hfh.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/scanoss_settings.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/scanossapi.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/scanossbase.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/scanossgrpc.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/scanpostprocessor.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/scantype.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/spdxlite.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/threadeddependencies.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/threadedscanning.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/utils/__init__.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/utils/abstract_presenter.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/utils/crc64.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/utils/file.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/utils/simhash.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss.egg-info/SOURCES.txt +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss.egg-info/dependency_links.txt +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss.egg-info/entry_points.txt +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss.egg-info/requires.txt +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss.egg-info/top_level.txt +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/tests/test_csv_output.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/tests/test_file_filters.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/tests/test_policy_inspect.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/tests/test_scan_post_processor.py +0 -0
- {scanoss-1.24.0 → scanoss-1.25.0}/tests/test_spdxlite.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
date: 20250610161304, utime: 1749571984
|
|
@@ -26,9 +26,10 @@ import json
|
|
|
26
26
|
import os.path
|
|
27
27
|
from abc import abstractmethod
|
|
28
28
|
from enum import Enum
|
|
29
|
-
from typing import
|
|
30
|
-
|
|
29
|
+
from typing import Any, Callable, Dict, List
|
|
30
|
+
|
|
31
31
|
from ..scanossbase import ScanossBase
|
|
32
|
+
from .utils.license_utils import LicenseUtil
|
|
32
33
|
|
|
33
34
|
|
|
34
35
|
class PolicyStatus(Enum):
|
|
@@ -87,7 +88,7 @@ class PolicyCheck(ScanossBase):
|
|
|
87
88
|
|
|
88
89
|
VALID_FORMATS = {'md', 'json', 'jira_md'}
|
|
89
90
|
|
|
90
|
-
def __init__(
|
|
91
|
+
def __init__( # noqa: PLR0913
|
|
91
92
|
self,
|
|
92
93
|
debug: bool = False,
|
|
93
94
|
trace: bool = True,
|
|
@@ -181,10 +182,9 @@ class PolicyCheck(ScanossBase):
|
|
|
181
182
|
:param status: The new component status
|
|
182
183
|
:return: The updated components dictionary
|
|
183
184
|
"""
|
|
184
|
-
|
|
185
185
|
# Determine the component key and purl based on component type
|
|
186
186
|
if id in [ComponentID.FILE.value, ComponentID.SNIPPET.value]:
|
|
187
|
-
purl = new_component['purl'][0] # Take first purl for these component types
|
|
187
|
+
purl = new_component['purl'][0] # Take the first purl for these component types
|
|
188
188
|
else:
|
|
189
189
|
purl = new_component['purl']
|
|
190
190
|
|
|
@@ -195,14 +195,13 @@ class PolicyCheck(ScanossBase):
|
|
|
195
195
|
'licenses': {},
|
|
196
196
|
'status': status,
|
|
197
197
|
}
|
|
198
|
-
|
|
199
198
|
if not new_component.get('licenses'):
|
|
200
|
-
self.
|
|
199
|
+
self.print_debug(f'WARNING: Results missing licenses. Skipping: {new_component}')
|
|
201
200
|
return components
|
|
202
201
|
# Process licenses for this component
|
|
203
|
-
for
|
|
204
|
-
if
|
|
205
|
-
spdxid =
|
|
202
|
+
for license_item in new_component['licenses']:
|
|
203
|
+
if license_item.get('name'):
|
|
204
|
+
spdxid = license_item['name']
|
|
206
205
|
components[component_key]['licenses'][spdxid] = {
|
|
207
206
|
'spdxid': spdxid,
|
|
208
207
|
'copyleft': self.license_util.is_copyleft(spdxid),
|
|
@@ -210,71 +209,103 @@ class PolicyCheck(ScanossBase):
|
|
|
210
209
|
}
|
|
211
210
|
return components
|
|
212
211
|
|
|
213
|
-
def
|
|
212
|
+
def _get_components_data(self, results: Dict[str, Any], components: Dict[str, Any]) -> Dict[str, Any]:
|
|
214
213
|
"""
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
This function iterates through the results dictionary, identifying components from
|
|
218
|
-
different sources (files, snippets, and dependencies). It consolidates this information
|
|
219
|
-
into a list of unique components, each with its associated licenses and other details.
|
|
214
|
+
Extract and process file and snippet components from results.
|
|
220
215
|
|
|
221
216
|
:param results: A dictionary containing the raw results of a component scan
|
|
222
|
-
:
|
|
217
|
+
:param components: Existing components dictionary to update
|
|
218
|
+
:return: Updated components dictionary with file and snippet data
|
|
223
219
|
"""
|
|
224
|
-
if results is None:
|
|
225
|
-
self.print_stderr(f'ERROR: Results cannot be empty')
|
|
226
|
-
return None
|
|
227
|
-
components = {}
|
|
228
220
|
for component in results.values():
|
|
229
221
|
for c in component:
|
|
230
222
|
component_id = c.get('id')
|
|
231
223
|
if not component_id:
|
|
232
|
-
self.
|
|
224
|
+
self.print_debug(f'WARNING: Result missing id. Skipping: {c}')
|
|
233
225
|
continue
|
|
234
226
|
status = c.get('status')
|
|
235
|
-
if not
|
|
236
|
-
self.
|
|
227
|
+
if not status:
|
|
228
|
+
self.print_debug(f'WARNING: Result missing status. Skipping: {c}')
|
|
237
229
|
continue
|
|
238
230
|
if component_id in [ComponentID.FILE.value, ComponentID.SNIPPET.value]:
|
|
239
231
|
if not c.get('purl'):
|
|
240
|
-
self.
|
|
232
|
+
self.print_debug(f'WARNING: Result missing purl. Skipping: {c}')
|
|
241
233
|
continue
|
|
242
234
|
if len(c.get('purl')) <= 0:
|
|
243
|
-
self.
|
|
235
|
+
self.print_debug(f'WARNING: Result missing purls. Skipping: {c}')
|
|
244
236
|
continue
|
|
245
237
|
if not c.get('version'):
|
|
246
|
-
self.
|
|
238
|
+
self.print_msg(f'WARNING: Result missing version. Skipping: {c}')
|
|
247
239
|
continue
|
|
248
240
|
component_key = f'{c["purl"][0]}@{c["version"]}'
|
|
249
|
-
# Initialize or update the component entry
|
|
250
241
|
if component_key not in components:
|
|
251
242
|
components = self._append_component(components, c, component_id, status)
|
|
243
|
+
# End component loop
|
|
244
|
+
# End components loop
|
|
245
|
+
return components
|
|
252
246
|
|
|
253
|
-
|
|
247
|
+
def _get_dependencies_data(self, results: Dict[str, Any], components: Dict[str, Any]) -> Dict[str, Any]:
|
|
248
|
+
"""
|
|
249
|
+
Extract and process dependency components from results.
|
|
250
|
+
|
|
251
|
+
:param results: A dictionary containing the raw results of a component scan
|
|
252
|
+
:param components: Existing components dictionary to update
|
|
253
|
+
:return: Updated components dictionary with dependency data
|
|
254
|
+
"""
|
|
255
|
+
for component in results.values():
|
|
256
|
+
for c in component:
|
|
257
|
+
component_id = c.get('id')
|
|
258
|
+
if not component_id:
|
|
259
|
+
self.print_debug(f'WARNING: Result missing id. Skipping: {c}')
|
|
260
|
+
continue
|
|
261
|
+
status = c.get('status')
|
|
262
|
+
if not status:
|
|
263
|
+
self.print_debug(f'WARNING: Result missing status. Skipping: {c}')
|
|
264
|
+
continue
|
|
265
|
+
if component_id == ComponentID.DEPENDENCY.value:
|
|
254
266
|
if c.get('dependencies') is None:
|
|
255
267
|
continue
|
|
256
|
-
for
|
|
257
|
-
if not
|
|
258
|
-
self.
|
|
259
|
-
continue
|
|
260
|
-
if len(d.get('purl')) <= 0:
|
|
261
|
-
self.print_stderr(f'WARNING: Result missing purls. Skipping.')
|
|
268
|
+
for dependency in c['dependencies']:
|
|
269
|
+
if not dependency.get('purl'):
|
|
270
|
+
self.print_debug(f'WARNING: Dependency result missing purl. Skipping: {dependency}')
|
|
262
271
|
continue
|
|
263
|
-
if not
|
|
264
|
-
self.
|
|
272
|
+
if not dependency.get('version'):
|
|
273
|
+
self.print_msg(f'WARNING: Dependency result missing version. Skipping: {dependency}')
|
|
265
274
|
continue
|
|
266
|
-
component_key = f'{
|
|
275
|
+
component_key = f'{dependency["purl"]}@{dependency["version"]}'
|
|
267
276
|
if component_key not in components:
|
|
268
|
-
components = self._append_component(components,
|
|
269
|
-
# End
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
277
|
+
components = self._append_component(components, dependency, component_id, status)
|
|
278
|
+
# End dependency loop
|
|
279
|
+
# End component loop
|
|
280
|
+
# End of result loop
|
|
281
|
+
return components
|
|
282
|
+
|
|
283
|
+
def _get_components_from_results(self, results: Dict[str, Any]) -> list or None:
|
|
284
|
+
"""
|
|
285
|
+
Process the results dictionary to extract and format component information.
|
|
286
|
+
|
|
287
|
+
This function iterates through the results dictionary, identifying components from
|
|
288
|
+
different sources (files, snippets, and dependencies). It consolidates this information
|
|
289
|
+
into a list of unique components, each with its associated licenses and other details.
|
|
290
|
+
|
|
291
|
+
:param results: A dictionary containing the raw results of a component scan
|
|
292
|
+
:return: A list of dictionaries, each representing a unique component with its details
|
|
293
|
+
"""
|
|
294
|
+
if results is None:
|
|
295
|
+
self.print_stderr('ERROR: Results cannot be empty')
|
|
296
|
+
return None
|
|
297
|
+
|
|
298
|
+
components = {}
|
|
299
|
+
# Extract file and snippet components
|
|
300
|
+
components = self._get_components_data(results, components)
|
|
301
|
+
# Extract dependency components
|
|
302
|
+
components = self._get_dependencies_data(results, components)
|
|
303
|
+
# Convert to list and process licenses
|
|
304
|
+
results_list = list(components.values())
|
|
305
|
+
for component in results_list:
|
|
275
306
|
component['licenses'] = list(component['licenses'].values())
|
|
276
307
|
|
|
277
|
-
return
|
|
308
|
+
return results_list
|
|
278
309
|
|
|
279
310
|
def generate_table(self, headers, rows, centered_columns=None):
|
|
280
311
|
"""
|
|
@@ -403,7 +434,6 @@ class PolicyCheck(ScanossBase):
|
|
|
403
434
|
components = self._get_components_from_results(self.results)
|
|
404
435
|
return components
|
|
405
436
|
|
|
406
|
-
|
|
407
437
|
#
|
|
408
438
|
# End of PolicyCheck Class
|
|
409
439
|
#
|
|
@@ -32,9 +32,10 @@ import hashlib
|
|
|
32
32
|
import pathlib
|
|
33
33
|
import platform
|
|
34
34
|
import re
|
|
35
|
+
from typing import Tuple
|
|
35
36
|
|
|
36
|
-
from crc32c import crc32c
|
|
37
37
|
from binaryornot.check import is_binary
|
|
38
|
+
from crc32c import crc32c
|
|
38
39
|
|
|
39
40
|
from .scanossbase import ScanossBase
|
|
40
41
|
|
|
@@ -157,7 +158,7 @@ class Winnowing(ScanossBase):
|
|
|
157
158
|
a list of WFP fingerprints with their corresponding line numbers.
|
|
158
159
|
"""
|
|
159
160
|
|
|
160
|
-
def __init__(
|
|
161
|
+
def __init__( # noqa: PLR0913
|
|
161
162
|
self,
|
|
162
163
|
size_limit: bool = False,
|
|
163
164
|
debug: bool = False,
|
|
@@ -197,6 +198,7 @@ class Winnowing(ScanossBase):
|
|
|
197
198
|
self.strip_hpsm_ids = strip_hpsm_ids
|
|
198
199
|
self.strip_snippet_ids = strip_snippet_ids
|
|
199
200
|
self.hpsm = hpsm
|
|
201
|
+
self.is_windows = platform.system() == 'Windows'
|
|
200
202
|
if hpsm:
|
|
201
203
|
self.crc8_maxim_dow_table = []
|
|
202
204
|
self.crc8_generate_table()
|
|
@@ -218,11 +220,11 @@ class Winnowing(ScanossBase):
|
|
|
218
220
|
return byte
|
|
219
221
|
if byte >= ASCII_a:
|
|
220
222
|
return byte
|
|
221
|
-
if (byte >=
|
|
223
|
+
if (byte >= ASCII_A) and (byte <= ASCII_Z):
|
|
222
224
|
return byte + 32
|
|
223
225
|
return 0
|
|
224
226
|
|
|
225
|
-
def __skip_snippets(self, file: str, src: str) -> bool:
|
|
227
|
+
def __skip_snippets(self, file: str, src: str) -> bool: # noqa: PLR0911
|
|
226
228
|
"""
|
|
227
229
|
Determine files that are not of interest based on their content or file extension
|
|
228
230
|
Parameters
|
|
@@ -351,7 +353,55 @@ class Winnowing(ScanossBase):
|
|
|
351
353
|
self.print_debug(f'Stripped snippet ids from {file}')
|
|
352
354
|
return wfp
|
|
353
355
|
|
|
354
|
-
def
|
|
356
|
+
def __detect_line_endings(self, contents: bytes) -> Tuple[bool, bool, bool]:
|
|
357
|
+
"""Detect the types of line endings present in file contents.
|
|
358
|
+
|
|
359
|
+
Args:
|
|
360
|
+
contents: File contents as bytes.
|
|
361
|
+
|
|
362
|
+
Returns:
|
|
363
|
+
Tuple of (has_crlf, has_standalone_lf, has_standalone_cr) indicating which line ending types are present.
|
|
364
|
+
"""
|
|
365
|
+
has_crlf = b'\r\n' in contents
|
|
366
|
+
# For LF detection, we need to find LF that's not part of CRLF
|
|
367
|
+
content_without_crlf = contents.replace(b'\r\n', b'')
|
|
368
|
+
has_standalone_lf = b'\n' in content_without_crlf
|
|
369
|
+
# For CR detection, we need to find CR that's not part of CRLF
|
|
370
|
+
has_standalone_cr = b'\r' in content_without_crlf
|
|
371
|
+
|
|
372
|
+
return has_crlf, has_standalone_lf, has_standalone_cr
|
|
373
|
+
|
|
374
|
+
def __calculate_opposite_line_ending_hash(self, contents: bytes):
|
|
375
|
+
"""Calculate hash for contents with opposite line endings.
|
|
376
|
+
|
|
377
|
+
If the file contains LF, CR, or mixed line endings, calculates the Windows (CRLF) hash.
|
|
378
|
+
If the file contains only Windows (CRLF) line endings, calculates the Unix (LF) hash.
|
|
379
|
+
|
|
380
|
+
Args:
|
|
381
|
+
contents: File contents as bytes.
|
|
382
|
+
|
|
383
|
+
Returns:
|
|
384
|
+
Hash with opposite line endings as hex string, or None if no line endings detected.
|
|
385
|
+
"""
|
|
386
|
+
has_crlf, has_standalone_lf, has_standalone_cr = self.__detect_line_endings(contents)
|
|
387
|
+
|
|
388
|
+
if not has_crlf and not has_standalone_lf and not has_standalone_cr:
|
|
389
|
+
return None
|
|
390
|
+
|
|
391
|
+
# Normalize all line endings to LF first
|
|
392
|
+
normalized = contents.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
|
|
393
|
+
|
|
394
|
+
# Check whether the file uses CRLF line endings exclusively
|
|
395
|
+
if has_crlf and not has_standalone_lf and not has_standalone_cr:
|
|
396
|
+
# File is Windows (CRLF) - produce Unix (LF) hash
|
|
397
|
+
opposite_contents = normalized
|
|
398
|
+
else:
|
|
399
|
+
# File is Unix (LF/CR) or mixed - produce Windows (CRLF) hash
|
|
400
|
+
opposite_contents = normalized.replace(b'\n', b'\r\n')
|
|
401
|
+
|
|
402
|
+
return hashlib.md5(opposite_contents).hexdigest()
|
|
403
|
+
|
|
404
|
+
def wfp_for_contents(self, file: str, bin_file: bool, contents: bytes) -> str: # noqa: PLR0912, PLR0915
|
|
355
405
|
"""
|
|
356
406
|
Generate a Winnowing fingerprint (WFP) for the given file contents
|
|
357
407
|
Parameters
|
|
@@ -371,7 +421,7 @@ class Winnowing(ScanossBase):
|
|
|
371
421
|
content_length = len(contents)
|
|
372
422
|
original_filename = file
|
|
373
423
|
|
|
374
|
-
if
|
|
424
|
+
if self.is_windows:
|
|
375
425
|
original_filename = file.replace('\\', '/')
|
|
376
426
|
wfp_filename = repr(original_filename).strip("'") # return a utf-8 compatible version of the filename
|
|
377
427
|
if self.obfuscate: # hide the real size of the file and its name, but keep the suffix
|
|
@@ -380,6 +430,13 @@ class Winnowing(ScanossBase):
|
|
|
380
430
|
self.file_map[wfp_filename] = original_filename # Save the file name map for later (reverse lookup)
|
|
381
431
|
|
|
382
432
|
wfp = 'file={0},{1},{2}\n'.format(file_md5, content_length, wfp_filename)
|
|
433
|
+
|
|
434
|
+
# Add opposite line ending hash based on line ending analysis
|
|
435
|
+
if not bin_file:
|
|
436
|
+
opposite_hash = self.__calculate_opposite_line_ending_hash(contents)
|
|
437
|
+
if opposite_hash is not None:
|
|
438
|
+
wfp += f'fh2={opposite_hash}\n'
|
|
439
|
+
|
|
383
440
|
# We don't process snippets for binaries, or other uninteresting files, or if we're requested to skip
|
|
384
441
|
if bin_file or self.skip_snippets or self.__skip_snippets(file, contents.decode('utf-8', 'ignore')):
|
|
385
442
|
return wfp
|
|
@@ -467,7 +524,7 @@ class Winnowing(ScanossBase):
|
|
|
467
524
|
for i, byte in enumerate(content):
|
|
468
525
|
c = byte
|
|
469
526
|
if c == ASCII_LF: # When there is a new line
|
|
470
|
-
if
|
|
527
|
+
if list_normalized:
|
|
471
528
|
crc_lines.append(self.crc8_buffer(list_normalized))
|
|
472
529
|
list_normalized = []
|
|
473
530
|
elif last_line + 1 == i:
|
|
@@ -0,0 +1,393 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SPDX-License-Identifier: MIT
|
|
3
|
+
|
|
4
|
+
Copyright (c) 2021, SCANOSS
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be included in
|
|
14
|
+
all copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
22
|
+
THE SOFTWARE.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
import platform
|
|
26
|
+
import unittest
|
|
27
|
+
from unittest.mock import patch
|
|
28
|
+
|
|
29
|
+
from scanoss.winnowing import Winnowing
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class MyTestCase(unittest.TestCase):
|
|
33
|
+
"""
|
|
34
|
+
Exercise the Winnowing class
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
def test_winnowing(self):
|
|
38
|
+
winnowing = Winnowing(debug=True)
|
|
39
|
+
filename = 'test-file.c'
|
|
40
|
+
contents = 'c code contents'
|
|
41
|
+
content_types = bytes(contents, encoding='raw_unicode_escape')
|
|
42
|
+
wfp = winnowing.wfp_for_contents(filename, False, content_types)
|
|
43
|
+
print(f'WFP for {filename}: {wfp}')
|
|
44
|
+
self.assertIsNotNone(wfp)
|
|
45
|
+
filename = __file__
|
|
46
|
+
wfp = winnowing.wfp_for_file(filename, filename)
|
|
47
|
+
print(f'WFP for {filename}: {wfp}')
|
|
48
|
+
self.assertIsNotNone(wfp)
|
|
49
|
+
|
|
50
|
+
def test_snippet_skip(self):
|
|
51
|
+
winnowing = Winnowing(debug=True)
|
|
52
|
+
filename = 'test-file.jar'
|
|
53
|
+
contents = 'jar file contents'
|
|
54
|
+
content_types = bytes(contents, encoding='raw_unicode_escape')
|
|
55
|
+
wfp = winnowing.wfp_for_contents(filename, False, content_types)
|
|
56
|
+
print(f'WFP for {filename}: {wfp}')
|
|
57
|
+
self.assertIsNotNone(wfp)
|
|
58
|
+
|
|
59
|
+
def test_snippet_strip(self):
|
|
60
|
+
winnowing = Winnowing(
|
|
61
|
+
debug=True, hpsm=True, strip_snippet_ids=['d5e54c33,b03faabe'], strip_hpsm_ids=['0d2fffaffc62d18']
|
|
62
|
+
)
|
|
63
|
+
filename = 'test-file.py'
|
|
64
|
+
with open(__file__, 'rb') as f:
|
|
65
|
+
contents = f.read()
|
|
66
|
+
print('--- Test snippet and HPSM strip ---')
|
|
67
|
+
wfp = winnowing.wfp_for_contents(filename, False, contents)
|
|
68
|
+
found = 0
|
|
69
|
+
print(f'WFP for {filename}: {wfp}')
|
|
70
|
+
try:
|
|
71
|
+
found = wfp.index('d5e54c33,b03faabe')
|
|
72
|
+
except ValueError:
|
|
73
|
+
found = -1
|
|
74
|
+
self.assertEqual(found, -1)
|
|
75
|
+
|
|
76
|
+
try:
|
|
77
|
+
found = wfp.index('0d2fffaffc62d18')
|
|
78
|
+
except ValueError:
|
|
79
|
+
found = -1
|
|
80
|
+
self.assertEqual(found, -1)
|
|
81
|
+
|
|
82
|
+
def test_windows_hash_calculation(self):
|
|
83
|
+
"""Test Windows-specific hash calculation with CRLF line endings."""
|
|
84
|
+
import hashlib
|
|
85
|
+
|
|
86
|
+
# Test content with LF line endings
|
|
87
|
+
content_lf = b'line1\nline2\nline3\n'
|
|
88
|
+
# Expected content with CRLF line endings for Windows hash
|
|
89
|
+
content_crlf = b'line1\r\nline2\r\nline3\r\n'
|
|
90
|
+
|
|
91
|
+
# Calculate the expected Windows hash manually
|
|
92
|
+
expected_windows_hash = hashlib.md5(content_crlf).hexdigest()
|
|
93
|
+
lf_hash = hashlib.md5(content_lf).hexdigest()
|
|
94
|
+
|
|
95
|
+
print(f'LF content hash: {lf_hash}')
|
|
96
|
+
print(f'CRLF content hash (Windows): {expected_windows_hash}')
|
|
97
|
+
|
|
98
|
+
# They should be different
|
|
99
|
+
self.assertNotEqual(lf_hash, expected_windows_hash)
|
|
100
|
+
|
|
101
|
+
@patch('platform.system')
|
|
102
|
+
def test_windows_wfp_includes_fh2(self, mock_platform):
|
|
103
|
+
"""Test that WFP includes fh2 hash when running on Windows."""
|
|
104
|
+
# Mock Windows environment
|
|
105
|
+
mock_platform.return_value = 'Windows'
|
|
106
|
+
winnowing = Winnowing(debug=True)
|
|
107
|
+
|
|
108
|
+
filename = 'test-file.c'
|
|
109
|
+
content = b'int main() {\n return 0;\n}\n'
|
|
110
|
+
|
|
111
|
+
wfp = winnowing.wfp_for_contents(filename, False, content)
|
|
112
|
+
|
|
113
|
+
print(f'Windows WFP output:\n{wfp}')
|
|
114
|
+
|
|
115
|
+
# Check that WFP contains fh2 line
|
|
116
|
+
self.assertIn('fh2=', wfp)
|
|
117
|
+
|
|
118
|
+
# Extract the fh2 hash from WFP
|
|
119
|
+
lines = wfp.split('\n')
|
|
120
|
+
fh2_line = [line for line in lines if line.startswith('fh2=')]
|
|
121
|
+
self.assertEqual(len(fh2_line), 1)
|
|
122
|
+
|
|
123
|
+
fh2_hash = fh2_line[0].split('=')[1]
|
|
124
|
+
|
|
125
|
+
# Verify it matches expected CRLF conversion
|
|
126
|
+
import hashlib
|
|
127
|
+
content_crlf = content.replace(b'\n', b'\r\n')
|
|
128
|
+
expected_hash = hashlib.md5(content_crlf).hexdigest()
|
|
129
|
+
self.assertEqual(fh2_hash, expected_hash)
|
|
130
|
+
|
|
131
|
+
def test_line_ending_detection(self):
|
|
132
|
+
"""Test line ending detection logic."""
|
|
133
|
+
winnowing = Winnowing(debug=True)
|
|
134
|
+
|
|
135
|
+
# Test LF only
|
|
136
|
+
content_lf = b'line1\nline2\nline3\n'
|
|
137
|
+
has_crlf, has_lf, has_cr = winnowing._Winnowing__detect_line_endings(content_lf)
|
|
138
|
+
self.assertFalse(has_crlf)
|
|
139
|
+
self.assertTrue(has_lf)
|
|
140
|
+
self.assertFalse(has_cr)
|
|
141
|
+
|
|
142
|
+
# Test CRLF only
|
|
143
|
+
content_crlf = b'line1\r\nline2\r\nline3\r\n'
|
|
144
|
+
has_crlf, has_lf, has_cr = winnowing._Winnowing__detect_line_endings(content_crlf)
|
|
145
|
+
self.assertTrue(has_crlf)
|
|
146
|
+
self.assertFalse(has_lf)
|
|
147
|
+
self.assertFalse(has_cr)
|
|
148
|
+
|
|
149
|
+
# Test CR only (old Mac style)
|
|
150
|
+
content_cr = b'line1\rline2\rline3\r'
|
|
151
|
+
has_crlf, has_lf, has_cr = winnowing._Winnowing__detect_line_endings(content_cr)
|
|
152
|
+
self.assertFalse(has_crlf)
|
|
153
|
+
self.assertFalse(has_lf)
|
|
154
|
+
self.assertTrue(has_cr)
|
|
155
|
+
|
|
156
|
+
# Test mixed CRLF and LF
|
|
157
|
+
content_mixed = b'line1\r\nline2\nline3\r\n'
|
|
158
|
+
has_crlf, has_lf, has_cr = winnowing._Winnowing__detect_line_endings(content_mixed)
|
|
159
|
+
self.assertTrue(has_crlf)
|
|
160
|
+
self.assertTrue(has_lf)
|
|
161
|
+
self.assertFalse(has_cr)
|
|
162
|
+
|
|
163
|
+
def test_opposite_hash_logic(self):
|
|
164
|
+
"""Test the logic of opposite hash calculation."""
|
|
165
|
+
winnowing = Winnowing(debug=True)
|
|
166
|
+
|
|
167
|
+
# Test different line ending scenarios
|
|
168
|
+
content_lf = b'line1\nline2\nline3\n'
|
|
169
|
+
content_crlf = b'line1\r\nline2\r\nline3\r\n'
|
|
170
|
+
content_cr = b'line1\rline2\rline3\r'
|
|
171
|
+
content_mixed = b'line1\r\nline2\nline3\r'
|
|
172
|
+
|
|
173
|
+
hash_lf = winnowing._Winnowing__calculate_opposite_line_ending_hash(content_lf)
|
|
174
|
+
hash_crlf = winnowing._Winnowing__calculate_opposite_line_ending_hash(content_crlf)
|
|
175
|
+
hash_cr = winnowing._Winnowing__calculate_opposite_line_ending_hash(content_cr)
|
|
176
|
+
hash_mixed = winnowing._Winnowing__calculate_opposite_line_ending_hash(content_mixed)
|
|
177
|
+
|
|
178
|
+
print(f'LF opposite hash: {hash_lf}')
|
|
179
|
+
print(f'CRLF opposite hash: {hash_crlf}')
|
|
180
|
+
print(f'CR opposite hash: {hash_cr}')
|
|
181
|
+
print(f'Mixed opposite hash: {hash_mixed}')
|
|
182
|
+
|
|
183
|
+
# LF, CR, and mixed content should all produce CRLF hash (same result)
|
|
184
|
+
self.assertEqual(hash_lf, hash_cr)
|
|
185
|
+
self.assertEqual(hash_lf, hash_mixed)
|
|
186
|
+
|
|
187
|
+
# CRLF content should produce LF hash (different from the others)
|
|
188
|
+
self.assertNotEqual(hash_crlf, hash_lf)
|
|
189
|
+
|
|
190
|
+
@unittest.skipUnless(platform.system() == 'Windows', 'Windows-specific test')
|
|
191
|
+
def test_actual_windows_behavior(self):
|
|
192
|
+
"""Test actual Windows behavior when running on Windows."""
|
|
193
|
+
winnowing = Winnowing(debug=True)
|
|
194
|
+
filename = 'test-file.c'
|
|
195
|
+
content = b'int main() {\n return 0;\n}\n'
|
|
196
|
+
|
|
197
|
+
wfp = winnowing.wfp_for_contents(filename, False, content)
|
|
198
|
+
|
|
199
|
+
print(f'Actual Windows WFP:\n{wfp}')
|
|
200
|
+
|
|
201
|
+
# On actual Windows with LF content, should include fh2
|
|
202
|
+
# Should always generate fh2 when line endings are present
|
|
203
|
+
self.assertIn('fh2=', wfp)
|
|
204
|
+
|
|
205
|
+
def test_empty_file_fh2(self):
|
|
206
|
+
"""Test fh2 behavior with empty files."""
|
|
207
|
+
winnowing = Winnowing(debug=True)
|
|
208
|
+
content = b''
|
|
209
|
+
wfp = winnowing.wfp_for_contents('empty.txt', False, content)
|
|
210
|
+
|
|
211
|
+
print(f'Empty file WFP:\n{wfp}')
|
|
212
|
+
|
|
213
|
+
# Empty files should not generate fh2
|
|
214
|
+
self.assertNotIn('fh2=', wfp)
|
|
215
|
+
|
|
216
|
+
def test_no_line_endings_fh2(self):
|
|
217
|
+
"""Test files without any line endings."""
|
|
218
|
+
winnowing = Winnowing(debug=True)
|
|
219
|
+
content = b'no line endings here'
|
|
220
|
+
wfp = winnowing.wfp_for_contents('noline.txt', False, content)
|
|
221
|
+
|
|
222
|
+
print(f'No line endings WFP:\n{wfp}')
|
|
223
|
+
|
|
224
|
+
# Files without line endings should not generate fh2
|
|
225
|
+
self.assertNotIn('fh2=', wfp)
|
|
226
|
+
|
|
227
|
+
def test_all_platforms_generate_fh2(self):
|
|
228
|
+
"""Test that all platforms generate fh2 when line endings are present."""
|
|
229
|
+
winnowing = Winnowing(debug=True)
|
|
230
|
+
content = b'line1\nline2\n'
|
|
231
|
+
wfp = winnowing.wfp_for_contents('test.txt', False, content)
|
|
232
|
+
|
|
233
|
+
print(f'Platform-independent WFP:\n{wfp}')
|
|
234
|
+
|
|
235
|
+
# Any platform should generate fh2 when line endings are present
|
|
236
|
+
self.assertIn('fh2=', wfp)
|
|
237
|
+
|
|
238
|
+
def test_verify_opposite_hash_calculation(self):
|
|
239
|
+
"""Test that the opposite hash calculation works correctly."""
|
|
240
|
+
winnowing = Winnowing(debug=True)
|
|
241
|
+
|
|
242
|
+
# Test LF -> CRLF conversion
|
|
243
|
+
content_lf = b'line1\nline2\nline3\n'
|
|
244
|
+
wfp_lf = winnowing.wfp_for_contents('test_lf.txt', False, content_lf)
|
|
245
|
+
|
|
246
|
+
# Test CRLF -> LF conversion
|
|
247
|
+
content_crlf = b'line1\r\nline2\r\nline3\r\n'
|
|
248
|
+
wfp_crlf = winnowing.wfp_for_contents('test_crlf.txt', False, content_crlf)
|
|
249
|
+
|
|
250
|
+
print(f'LF content WFP:\n{wfp_lf}')
|
|
251
|
+
print(f'CRLF content WFP:\n{wfp_crlf}')
|
|
252
|
+
|
|
253
|
+
# Both should generate fh2
|
|
254
|
+
self.assertIn('fh2=', wfp_lf)
|
|
255
|
+
self.assertIn('fh2=', wfp_crlf)
|
|
256
|
+
|
|
257
|
+
# Extract fh2 values
|
|
258
|
+
lf_fh2 = wfp_lf.split('fh2=')[1].split('\n')[0]
|
|
259
|
+
crlf_fh2 = wfp_crlf.split('fh2=')[1].split('\n')[0]
|
|
260
|
+
|
|
261
|
+
# The fh2 values should be swapped (LF file gets CRLF hash, CRLF file gets LF hash)
|
|
262
|
+
import hashlib
|
|
263
|
+
expected_lf_to_crlf = hashlib.md5(content_lf.replace(b'\n', b'\r\n')).hexdigest()
|
|
264
|
+
expected_crlf_to_lf = hashlib.md5(content_crlf.replace(b'\r\n', b'\n')).hexdigest()
|
|
265
|
+
|
|
266
|
+
self.assertEqual(lf_fh2, expected_lf_to_crlf)
|
|
267
|
+
self.assertEqual(crlf_fh2, expected_crlf_to_lf)
|
|
268
|
+
|
|
269
|
+
def test_binary_file_with_line_endings(self):
|
|
270
|
+
"""Test binary files with embedded line endings."""
|
|
271
|
+
winnowing = Winnowing(debug=True)
|
|
272
|
+
# Binary content with embedded newlines
|
|
273
|
+
content = b'\x00\x01\n\x02\x03\r\n\x04'
|
|
274
|
+
wfp = winnowing.wfp_for_contents('binary.bin', True, content)
|
|
275
|
+
|
|
276
|
+
print(f'Binary file WFP:\n{wfp}')
|
|
277
|
+
|
|
278
|
+
# Binary files should not generate fh2
|
|
279
|
+
self.assertNotIn('fh2=', wfp)
|
|
280
|
+
|
|
281
|
+
def test_cr_only_line_endings(self):
|
|
282
|
+
"""Test classic Mac CR-only line endings."""
|
|
283
|
+
winnowing = Winnowing(debug=True)
|
|
284
|
+
content = b'line1\rline2\rline3\r'
|
|
285
|
+
wfp = winnowing.wfp_for_contents('mac.txt', False, content)
|
|
286
|
+
|
|
287
|
+
print(f'CR-only WFP:\n{wfp}')
|
|
288
|
+
|
|
289
|
+
# Should generate fh2 (platform independent)
|
|
290
|
+
self.assertIn('fh2=', wfp)
|
|
291
|
+
|
|
292
|
+
# Should normalize CR to CRLF for the opposite hash
|
|
293
|
+
import hashlib
|
|
294
|
+
expected = content.replace(b'\r', b'\r\n')
|
|
295
|
+
expected_hash = hashlib.md5(expected).hexdigest()
|
|
296
|
+
self.assertIn(f'fh2={expected_hash}', wfp)
|
|
297
|
+
|
|
298
|
+
def test_whitespace_only_file(self):
|
|
299
|
+
"""Test files with only whitespace characters."""
|
|
300
|
+
winnowing = Winnowing(debug=True)
|
|
301
|
+
content = b' \n\t\n \n'
|
|
302
|
+
wfp = winnowing.wfp_for_contents('whitespace.txt', False, content)
|
|
303
|
+
|
|
304
|
+
print(f'Whitespace-only WFP:\n{wfp}')
|
|
305
|
+
|
|
306
|
+
# Should generate fh2 since it has line endings
|
|
307
|
+
self.assertIn('fh2=', wfp)
|
|
308
|
+
|
|
309
|
+
def test_mixed_complex_line_endings(self):
|
|
310
|
+
"""Test complex mixed line ending scenarios."""
|
|
311
|
+
winnowing = Winnowing(debug=True)
|
|
312
|
+
# Mix of CRLF, LF, and CR
|
|
313
|
+
content = b'line1\r\nline2\nline3\rline4\r\nline5\n'
|
|
314
|
+
wfp = winnowing.wfp_for_contents('mixed.txt', False, content)
|
|
315
|
+
|
|
316
|
+
print(f'Mixed line endings WFP:\n{wfp}')
|
|
317
|
+
|
|
318
|
+
# Should generate fh2
|
|
319
|
+
self.assertIn('fh2=', wfp)
|
|
320
|
+
|
|
321
|
+
# Verify the hash calculation
|
|
322
|
+
import hashlib
|
|
323
|
+
normalized = content.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
|
|
324
|
+
expected_crlf = normalized.replace(b'\n', b'\r\n')
|
|
325
|
+
expected_hash = hashlib.md5(expected_crlf).hexdigest()
|
|
326
|
+
self.assertIn(f'fh2={expected_hash}', wfp)
|
|
327
|
+
|
|
328
|
+
def test_fh2_with_skip_snippets(self):
|
|
329
|
+
"""Test fh2 generation when skip_snippets is enabled."""
|
|
330
|
+
winnowing = Winnowing(debug=True, skip_snippets=True)
|
|
331
|
+
content = b'line1\nline2\nline3\n'
|
|
332
|
+
wfp = winnowing.wfp_for_contents('test.txt', False, content)
|
|
333
|
+
|
|
334
|
+
print(f'Skip snippets WFP:\n{wfp}')
|
|
335
|
+
|
|
336
|
+
# Should still generate fh2 even when skipping snippets
|
|
337
|
+
self.assertIn('fh2=', wfp)
|
|
338
|
+
# But should not contain snippet fingerprints (line numbers)
|
|
339
|
+
lines = wfp.split('\n')
|
|
340
|
+
snippet_lines = [line for line in lines if '=' in line and line[0].isdigit()]
|
|
341
|
+
self.assertEqual(len(snippet_lines), 0)
|
|
342
|
+
|
|
343
|
+
def test_fh2_with_obfuscation(self):
|
|
344
|
+
"""Test fh2 generation with obfuscation enabled."""
|
|
345
|
+
winnowing = Winnowing(debug=True, obfuscate=True)
|
|
346
|
+
content = b'line1\nline2\nline3\n'
|
|
347
|
+
wfp = winnowing.wfp_for_contents('test.txt', False, content)
|
|
348
|
+
|
|
349
|
+
print(f'Obfuscated WFP:\n{wfp}')
|
|
350
|
+
|
|
351
|
+
# Should still generate fh2 with obfuscation
|
|
352
|
+
self.assertIn('fh2=', wfp)
|
|
353
|
+
# Filename should be obfuscated
|
|
354
|
+
self.assertIn('1.txt', wfp)
|
|
355
|
+
self.assertNotIn('test.txt', wfp)
|
|
356
|
+
|
|
357
|
+
def test_large_file_with_line_endings(self):
|
|
358
|
+
"""Test large files with many line endings."""
|
|
359
|
+
winnowing = Winnowing(debug=True, size_limit=True, post_size=1) # 1KB limit
|
|
360
|
+
# Create content larger than the limit
|
|
361
|
+
content = b'line\n' * 1000 # Should exceed 1KB
|
|
362
|
+
wfp = winnowing.wfp_for_contents('large.txt', False, content)
|
|
363
|
+
|
|
364
|
+
print(f'Large file WFP length: {len(wfp)}')
|
|
365
|
+
|
|
366
|
+
# Should still generate fh2 even with size limits
|
|
367
|
+
self.assertIn('fh2=', wfp)
|
|
368
|
+
|
|
369
|
+
def test_single_line_no_newline(self):
|
|
370
|
+
"""Test single line files without trailing newline."""
|
|
371
|
+
winnowing = Winnowing(debug=True)
|
|
372
|
+
content = b'single line without newline'
|
|
373
|
+
wfp = winnowing.wfp_for_contents('single.txt', False, content)
|
|
374
|
+
|
|
375
|
+
print(f'Single line no newline WFP:\n{wfp}')
|
|
376
|
+
|
|
377
|
+
# Should not generate fh2 (no line endings)
|
|
378
|
+
self.assertNotIn('fh2=', wfp)
|
|
379
|
+
|
|
380
|
+
def test_file_with_null_bytes_and_newlines(self):
|
|
381
|
+
"""Test files with null bytes mixed with newlines."""
|
|
382
|
+
winnowing = Winnowing(debug=True)
|
|
383
|
+
content = b'line1\x00\nline2\x00\x00\nline3\n'
|
|
384
|
+
wfp = winnowing.wfp_for_contents('nullbytes.txt', False, content)
|
|
385
|
+
|
|
386
|
+
print(f'Null bytes with newlines WFP:\n{wfp}')
|
|
387
|
+
|
|
388
|
+
# Should generate fh2 (has line endings)
|
|
389
|
+
self.assertIn('fh2=', wfp)
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
if __name__ == '__main__':
|
|
393
|
+
unittest.main()
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
date: 20250528081438, utime: 1748420078
|
|
@@ -1,82 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
SPDX-License-Identifier: MIT
|
|
3
|
-
|
|
4
|
-
Copyright (c) 2021, SCANOSS
|
|
5
|
-
|
|
6
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
-
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
-
in the Software without restriction, including without limitation the rights
|
|
9
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
-
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
-
furnished to do so, subject to the following conditions:
|
|
12
|
-
|
|
13
|
-
The above copyright notice and this permission notice shall be included in
|
|
14
|
-
all copies or substantial portions of the Software.
|
|
15
|
-
|
|
16
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
22
|
-
THE SOFTWARE.
|
|
23
|
-
"""
|
|
24
|
-
|
|
25
|
-
import unittest
|
|
26
|
-
|
|
27
|
-
from scanoss.winnowing import Winnowing
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
class MyTestCase(unittest.TestCase):
|
|
31
|
-
"""
|
|
32
|
-
Exercise the Winnowing class
|
|
33
|
-
"""
|
|
34
|
-
|
|
35
|
-
def test_winnowing(self):
|
|
36
|
-
winnowing = Winnowing(debug=True)
|
|
37
|
-
filename = 'test-file.c'
|
|
38
|
-
contents = 'c code contents'
|
|
39
|
-
content_types = bytes(contents, encoding='raw_unicode_escape')
|
|
40
|
-
wfp = winnowing.wfp_for_contents(filename, False, content_types)
|
|
41
|
-
print(f'WFP for {filename}: {wfp}')
|
|
42
|
-
self.assertIsNotNone(wfp)
|
|
43
|
-
filename = __file__
|
|
44
|
-
wfp = winnowing.wfp_for_file(filename, filename)
|
|
45
|
-
print(f'WFP for {filename}: {wfp}')
|
|
46
|
-
self.assertIsNotNone(wfp)
|
|
47
|
-
|
|
48
|
-
def test_snippet_skip(self):
|
|
49
|
-
winnowing = Winnowing(debug=True)
|
|
50
|
-
filename = 'test-file.jar'
|
|
51
|
-
contents = 'jar file contents'
|
|
52
|
-
content_types = bytes(contents, encoding='raw_unicode_escape')
|
|
53
|
-
wfp = winnowing.wfp_for_contents(filename, False, content_types)
|
|
54
|
-
print(f'WFP for {filename}: {wfp}')
|
|
55
|
-
self.assertIsNotNone(wfp)
|
|
56
|
-
|
|
57
|
-
def test_snippet_strip(self):
|
|
58
|
-
winnowing = Winnowing(
|
|
59
|
-
debug=True, hpsm=True, strip_snippet_ids=['d5e54c33,b03faabe'], strip_hpsm_ids=['0d2fffaffc62d18']
|
|
60
|
-
)
|
|
61
|
-
filename = 'test-file.py'
|
|
62
|
-
with open(__file__, 'rb') as f:
|
|
63
|
-
contents = f.read()
|
|
64
|
-
print('--- Test snippet and HPSM strip ---')
|
|
65
|
-
wfp = winnowing.wfp_for_contents(filename, False, contents)
|
|
66
|
-
found = 0
|
|
67
|
-
print(f'WFP for {filename}: {wfp}')
|
|
68
|
-
try:
|
|
69
|
-
found = wfp.index('d5e54c33,b03faabe')
|
|
70
|
-
except ValueError:
|
|
71
|
-
found = -1
|
|
72
|
-
self.assertEqual(found, -1)
|
|
73
|
-
|
|
74
|
-
try:
|
|
75
|
-
found = wfp.index('0d2fffaffc62d18')
|
|
76
|
-
except ValueError:
|
|
77
|
-
found = -1
|
|
78
|
-
self.assertEqual(found, -1)
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
if __name__ == '__main__':
|
|
82
|
-
unittest.main()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/components/v2/scanoss_components_pb2_grpc.py
RENAMED
|
File without changes
|
{scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/cryptography/v2/scanoss_cryptography_pb2.py
RENAMED
|
File without changes
|
{scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/cryptography/v2/scanoss_cryptography_pb2_grpc.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/dependencies/v2/scanoss_dependencies_pb2.py
RENAMED
|
File without changes
|
{scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/dependencies/v2/scanoss_dependencies_pb2_grpc.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/geoprovenance/v2/scanoss_geoprovenance_pb2.py
RENAMED
|
File without changes
|
{scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/geoprovenance/v2/scanoss_geoprovenance_pb2_grpc.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/api/vulnerabilities/v2/scanoss_vulnerabilities_pb2.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|