scanoss 1.20.6__py3-none-any.whl → 1.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- protoc_gen_swagger/options/annotations_pb2.py +9 -12
- protoc_gen_swagger/options/annotations_pb2_grpc.py +1 -1
- protoc_gen_swagger/options/openapiv2_pb2.py +96 -98
- protoc_gen_swagger/options/openapiv2_pb2_grpc.py +1 -1
- scanoss/__init__.py +1 -1
- scanoss/api/common/v2/scanoss_common_pb2.py +20 -18
- scanoss/api/common/v2/scanoss_common_pb2_grpc.py +1 -1
- scanoss/api/components/v2/scanoss_components_pb2.py +38 -48
- scanoss/api/components/v2/scanoss_components_pb2_grpc.py +96 -142
- scanoss/api/cryptography/v2/scanoss_cryptography_pb2.py +42 -22
- scanoss/api/cryptography/v2/scanoss_cryptography_pb2_grpc.py +185 -75
- scanoss/api/dependencies/v2/scanoss_dependencies_pb2.py +32 -30
- scanoss/api/dependencies/v2/scanoss_dependencies_pb2_grpc.py +83 -75
- scanoss/api/geoprovenance/v2/scanoss_geoprovenance_pb2.py +49 -0
- scanoss/api/geoprovenance/v2/scanoss_geoprovenance_pb2_grpc.py +142 -0
- scanoss/api/scanning/v2/scanoss_scanning_pb2.py +20 -10
- scanoss/api/scanning/v2/scanoss_scanning_pb2_grpc.py +70 -40
- scanoss/api/semgrep/v2/scanoss_semgrep_pb2.py +18 -22
- scanoss/api/semgrep/v2/scanoss_semgrep_pb2_grpc.py +49 -71
- scanoss/api/vulnerabilities/v2/scanoss_vulnerabilities_pb2.py +27 -37
- scanoss/api/vulnerabilities/v2/scanoss_vulnerabilities_pb2_grpc.py +72 -109
- scanoss/cli.py +393 -84
- scanoss/components.py +21 -11
- scanoss/constants.py +12 -0
- scanoss/data/build_date.txt +1 -1
- scanoss/file_filters.py +272 -57
- scanoss/results.py +92 -109
- scanoss/scanners/__init__.py +23 -0
- scanoss/scanners/container_scanner.py +474 -0
- scanoss/scanners/folder_hasher.py +302 -0
- scanoss/scanners/scanner_config.py +73 -0
- scanoss/scanners/scanner_hfh.py +173 -0
- scanoss/scanoss_settings.py +9 -5
- scanoss/scanossbase.py +9 -3
- scanoss/scanossgrpc.py +143 -18
- scanoss/threadedscanning.py +6 -6
- scanoss/utils/abstract_presenter.py +103 -0
- scanoss/utils/crc64.py +96 -0
- scanoss/utils/simhash.py +198 -0
- {scanoss-1.20.6.dist-info → scanoss-1.23.0.dist-info}/METADATA +2 -1
- scanoss-1.23.0.dist-info/RECORD +83 -0
- {scanoss-1.20.6.dist-info → scanoss-1.23.0.dist-info}/WHEEL +1 -1
- scanoss/api/provenance/v2/scanoss_provenance_pb2.py +0 -42
- scanoss/api/provenance/v2/scanoss_provenance_pb2_grpc.py +0 -108
- scanoss-1.20.6.dist-info/RECORD +0 -74
- /scanoss/api/{provenance → geoprovenance}/__init__.py +0 -0
- /scanoss/api/{provenance → geoprovenance}/v2/__init__.py +0 -0
- {scanoss-1.20.6.dist-info → scanoss-1.23.0.dist-info}/entry_points.txt +0 -0
- {scanoss-1.20.6.dist-info → scanoss-1.23.0.dist-info}/licenses/LICENSE +0 -0
- {scanoss-1.20.6.dist-info → scanoss-1.23.0.dist-info}/top_level.txt +0 -0
scanoss/scanossgrpc.py
CHANGED
|
@@ -26,6 +26,9 @@ import concurrent.futures
|
|
|
26
26
|
import json
|
|
27
27
|
import os
|
|
28
28
|
import uuid
|
|
29
|
+
from dataclasses import dataclass
|
|
30
|
+
from enum import IntEnum
|
|
31
|
+
from typing import Dict, Optional
|
|
29
32
|
from urllib.parse import urlparse
|
|
30
33
|
|
|
31
34
|
import grpc
|
|
@@ -33,6 +36,9 @@ from google.protobuf.json_format import MessageToDict, ParseDict
|
|
|
33
36
|
from pypac.parser import PACFile
|
|
34
37
|
from pypac.resolver import ProxyResolver
|
|
35
38
|
|
|
39
|
+
from scanoss.api.scanning.v2.scanoss_scanning_pb2_grpc import ScanningStub
|
|
40
|
+
from scanoss.constants import DEFAULT_TIMEOUT
|
|
41
|
+
|
|
36
42
|
from . import __version__
|
|
37
43
|
from .api.common.v2.scanoss_common_pb2 import (
|
|
38
44
|
EchoRequest,
|
|
@@ -52,8 +58,9 @@ from .api.cryptography.v2.scanoss_cryptography_pb2 import AlgorithmResponse
|
|
|
52
58
|
from .api.cryptography.v2.scanoss_cryptography_pb2_grpc import CryptographyStub
|
|
53
59
|
from .api.dependencies.v2.scanoss_dependencies_pb2 import DependencyRequest
|
|
54
60
|
from .api.dependencies.v2.scanoss_dependencies_pb2_grpc import DependenciesStub
|
|
55
|
-
from .api.
|
|
56
|
-
from .api.
|
|
61
|
+
from .api.geoprovenance.v2.scanoss_geoprovenance_pb2 import ContributorResponse
|
|
62
|
+
from .api.geoprovenance.v2.scanoss_geoprovenance_pb2_grpc import GeoProvenanceStub
|
|
63
|
+
from .api.scanning.v2.scanoss_scanning_pb2 import HFHRequest
|
|
57
64
|
from .api.semgrep.v2.scanoss_semgrep_pb2 import SemgrepResponse
|
|
58
65
|
from .api.semgrep.v2.scanoss_semgrep_pb2_grpc import SemgrepStub
|
|
59
66
|
from .api.vulnerabilities.v2.scanoss_vulnerabilities_pb2 import VulnerabilityResponse
|
|
@@ -68,6 +75,23 @@ SCANOSS_API_KEY = os.environ.get('SCANOSS_API_KEY') if os.environ.get('SCANOSS_A
|
|
|
68
75
|
MAX_CONCURRENT_REQUESTS = 5
|
|
69
76
|
|
|
70
77
|
|
|
78
|
+
class ScanossGrpcError(Exception):
    """Exception type used to report SCANOSS gRPC errors."""
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class ScanossGrpcStatusCode(IntEnum):
    """
    Status codes for SCANOSS gRPC responses.

    Members are plain integers (IntEnum), so they compare directly against the
    numeric status carried in a response.
    """

    # Request completed without issues
    SUCCESS = 1
    # Request completed, but the service attached warnings
    SUCCESS_WITH_WARNINGS = 2
    # Request did not complete and the service attached warnings
    FAILED_WITH_WARNINGS = 3
    # Request did not complete
    FAILED = 4
|
|
93
|
+
|
|
94
|
+
|
|
71
95
|
class ScanossGrpc(ScanossBase):
|
|
72
96
|
"""
|
|
73
97
|
Client for gRPC functionality
|
|
@@ -113,7 +137,6 @@ class ScanossGrpc(ScanossBase):
|
|
|
113
137
|
self.req_headers = req_headers
|
|
114
138
|
self.metadata = []
|
|
115
139
|
|
|
116
|
-
|
|
117
140
|
if self.api_key:
|
|
118
141
|
self.metadata.append(('x-api-key', api_key)) # Set API key if we have one
|
|
119
142
|
if ver_details:
|
|
@@ -146,7 +169,8 @@ class ScanossGrpc(ScanossBase):
|
|
|
146
169
|
self.dependencies_stub = DependenciesStub(grpc.insecure_channel(self.url))
|
|
147
170
|
self.semgrep_stub = SemgrepStub(grpc.insecure_channel(self.url))
|
|
148
171
|
self.vuln_stub = VulnerabilitiesStub(grpc.insecure_channel(self.url))
|
|
149
|
-
self.provenance_stub =
|
|
172
|
+
self.provenance_stub = GeoProvenanceStub(grpc.insecure_channel(self.url))
|
|
173
|
+
self.scanning_stub = ScanningStub(grpc.insecure_channel(self.url))
|
|
150
174
|
else:
|
|
151
175
|
if ca_cert is not None:
|
|
152
176
|
credentials = grpc.ssl_channel_credentials(cert_data) # secure with specified certificate
|
|
@@ -157,7 +181,8 @@ class ScanossGrpc(ScanossBase):
|
|
|
157
181
|
self.dependencies_stub = DependenciesStub(grpc.secure_channel(self.url, credentials))
|
|
158
182
|
self.semgrep_stub = SemgrepStub(grpc.secure_channel(self.url, credentials))
|
|
159
183
|
self.vuln_stub = VulnerabilitiesStub(grpc.secure_channel(self.url, credentials))
|
|
160
|
-
self.provenance_stub =
|
|
184
|
+
self.provenance_stub = GeoProvenanceStub(grpc.secure_channel(self.url, credentials))
|
|
185
|
+
self.scanning_stub = ScanningStub(grpc.secure_channel(self.url, credentials))
|
|
161
186
|
|
|
162
187
|
@classmethod
|
|
163
188
|
def _load_cert(cls, cert_file: str) -> bytes:
|
|
@@ -437,6 +462,59 @@ class ScanossGrpc(ScanossBase):
|
|
|
437
462
|
return resp_dict
|
|
438
463
|
return None
|
|
439
464
|
|
|
465
|
+
def folder_hash_scan(self, request: Dict) -> Optional[Dict]:
    """
    Send a Folder Hashing Scan request through the scanning stub.

    Args:
        request (Dict): Folder Hash Request (converted to an HFHRequest message)

    Returns:
        Optional[Dict]: Folder Hash Response, or None if the request was not successful
    """
    debug_template = 'Sending folder hash scan data (rqId: {rqId})...'
    rpc = self.scanning_stub.FolderHashScan
    return self._call_rpc(rpc, request, HFHRequest, debug_template)
|
|
481
|
+
|
|
482
|
+
def _call_rpc(self, rpc_method, request_input, request_type, debug_msg: Optional[str] = None) -> Optional[Dict]:
    """
    Call a gRPC method and return the response as a dictionary

    Args:
        rpc_method (): The gRPC stub method
        request_input (): Either a dict or a gRPC request object.
        request_type (): The type of the gRPC request object (only instantiated when request_input is a dict).
        debug_msg (str, optional): Debug message template that can include {rqId} placeholder.
            When omitted (None), no debug message is printed.

    Returns:
        dict: The parsed gRPC response as a dictionary, or None if the status check failed

    Raises:
        ScanossGrpcError: If the gRPC call itself raises grpc.RpcError (original error kept as __cause__)
    """
    request_id = str(uuid.uuid4())

    if isinstance(request_input, dict):
        request_obj = ParseDict(request_input, request_type())
    else:
        request_obj = request_input

    # Copy the shared metadata so the per-request ID never leaks into other calls
    metadata = self.metadata[:] + [('x-request-id', request_id)]

    # debug_msg defaults to None; formatting it unconditionally would raise AttributeError
    if debug_msg is not None:
        self.print_debug(debug_msg.format(rqId=request_id))
    try:
        resp = rpc_method(request_obj, metadata=metadata, timeout=self.timeout)
    except grpc.RpcError as e:
        # Chain the original error so the gRPC status/traceback stay available to callers
        raise ScanossGrpcError(
            f'{e.__class__.__name__} while sending gRPC message (rqId: {request_id}): {e.details()}'
        ) from e

    if resp and not self._check_status_response(resp.status, request_id):
        return None

    resp_dict = MessageToDict(resp, preserving_proto_field_name=True)
    return resp_dict
|
|
517
|
+
|
|
440
518
|
def _check_status_response(self, status_response: StatusResponse, request_id: str = None) -> bool:
|
|
441
519
|
"""
|
|
442
520
|
Check the response object to see if the command was successful or not
|
|
@@ -444,21 +522,18 @@ class ScanossGrpc(ScanossBase):
|
|
|
444
522
|
:return: True if successful, False otherwise
|
|
445
523
|
"""
|
|
446
524
|
|
|
447
|
-
SUCCEDED_WITH_WARNINGS_STATUS_CODE = 2
|
|
448
|
-
FAILED_STATUS_CODE = 3
|
|
449
|
-
|
|
450
525
|
if not status_response:
|
|
451
526
|
self.print_stderr(f'Warning: No status response supplied (rqId: {request_id}). Assuming it was ok.')
|
|
452
527
|
return True
|
|
453
528
|
self.print_debug(f'Checking response status (rqId: {request_id}): {status_response}')
|
|
454
529
|
status_code: StatusCode = status_response.status
|
|
455
|
-
if status_code >
|
|
530
|
+
if status_code > ScanossGrpcStatusCode.SUCCESS:
|
|
456
531
|
ret_val = False # default to failed
|
|
457
532
|
msg = 'Unsuccessful'
|
|
458
|
-
if status_code ==
|
|
533
|
+
if status_code == ScanossGrpcStatusCode.SUCCESS_WITH_WARNINGS:
|
|
459
534
|
msg = 'Succeeded with warnings'
|
|
460
535
|
ret_val = True # No need to fail as it succeeded with warnings
|
|
461
|
-
elif status_code ==
|
|
536
|
+
elif status_code == ScanossGrpcStatusCode.FAILED_WITH_WARNINGS:
|
|
462
537
|
msg = 'Failed with warnings'
|
|
463
538
|
self.print_stderr(f'{msg} (rqId: {request_id} - status: {status_code}): {status_response.message}')
|
|
464
539
|
return ret_val
|
|
@@ -496,13 +571,13 @@ class ScanossGrpc(ScanossBase):
|
|
|
496
571
|
self.print_stderr('ERROR: No message supplied to send to gRPC service.')
|
|
497
572
|
return None
|
|
498
573
|
request_id = str(uuid.uuid4())
|
|
499
|
-
resp:
|
|
574
|
+
resp: ContributorResponse
|
|
500
575
|
try:
|
|
501
576
|
request = ParseDict(purls, PurlRequest()) # Parse the JSON/Dict into the purl request object
|
|
502
577
|
metadata = self.metadata[:]
|
|
503
578
|
metadata.append(('x-request-id', request_id)) # Set a Request ID
|
|
504
579
|
self.print_debug(f'Sending data for provenance decoration (rqId: {request_id})...')
|
|
505
|
-
resp = self.provenance_stub.
|
|
580
|
+
resp = self.provenance_stub.GetComponentContributors(request, metadata=metadata, timeout=self.timeout)
|
|
506
581
|
except Exception as e:
|
|
507
582
|
self.print_stderr(
|
|
508
583
|
f'ERROR: {e.__class__.__name__} Problem encountered sending gRPC message (rqId: {request_id}): {e}'
|
|
@@ -515,20 +590,70 @@ class ScanossGrpc(ScanossBase):
|
|
|
515
590
|
return resp_dict
|
|
516
591
|
return None
|
|
517
592
|
|
|
593
|
+
def get_provenance_origin(self, request: Dict) -> Optional[Dict]:
    """
    Send a GetComponentOrigin request through the provenance stub.

    Args:
        request (Dict): GetComponentOrigin Request (converted to a PurlRequest message)

    Returns:
        Optional[Dict]: OriginResponse, or None if the request was not successful
    """
    debug_template = 'Sending data for provenance origin decoration (rqId: {rqId})...'
    rpc = self.provenance_stub.GetComponentOrigin
    return self._call_rpc(rpc, request, PurlRequest, debug_template)
|
|
609
|
+
|
|
518
610
|
def load_generic_headers(self):
    """
    Copy every entry of req_headers into the gRPC metadata list.

    An 'x-api-key' header is also stored as the API key. If no URL has been
    configured (neither on the instance nor through the SCANOSS_GRPC_URL
    environment variable), the URL is switched to the premium endpoint (DEFAULT_URL2).
    """
    # NOTE(review): nesting reconstructed from a diff view that strips indentation - confirm against the file
    if not self.req_headers:
        return
    for key, value in self.req_headers.items():
        if key == 'x-api-key':  # Set premium URL if x-api-key header is set
            no_url_configured = not self.url and not os.environ.get('SCANOSS_GRPC_URL')
            if no_url_configured:
                self.url = DEFAULT_URL2  # API key specific and no alternative URL, so use the default premium
            self.api_key = value
        self.metadata.append((key, value))
|
|
624
|
+
|
|
625
|
+
|
|
532
626
|
#
|
|
533
627
|
# End of ScanossGrpc Class
|
|
534
628
|
#
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
@dataclass
class GrpcConfig:
    """
    Container for gRPC client settings.

    Field defaults come from module-level values (DEFAULT_URL, SCANOSS_API_KEY,
    DEFAULT_TIMEOUT) and are bound when the class is defined.
    """

    url: str = DEFAULT_URL
    api_key: Optional[str] = SCANOSS_API_KEY
    debug: Optional[bool] = False
    trace: Optional[bool] = False
    quiet: Optional[bool] = False
    ver_details: Optional[str] = None
    ca_cert: Optional[str] = None
    pac: Optional[PACFile] = None
    timeout: Optional[int] = DEFAULT_TIMEOUT
    proxy: Optional[str] = None
    grpc_proxy: Optional[str] = None
|
|
644
|
+
|
|
645
|
+
|
|
646
|
+
def create_grpc_config_from_args(args) -> GrpcConfig:
    """
    Build a GrpcConfig from an arguments object.

    Each field is read from ``args`` with getattr; the fallback is used only when
    the attribute does not exist. An attribute that exists with value None is
    passed through as None.
    """
    # (GrpcConfig field, attribute name on args, fallback when the attribute is missing)
    field_sources = (
        ('url', 'api2url', DEFAULT_URL),
        ('api_key', 'key', SCANOSS_API_KEY),
        ('debug', 'debug', False),
        ('trace', 'trace', False),
        ('quiet', 'quiet', False),
        ('ver_details', 'ver_details', None),
        ('ca_cert', 'ca_cert', None),
        ('pac', 'pac', None),
        ('timeout', 'timeout', DEFAULT_TIMEOUT),
        ('proxy', 'proxy', None),
        ('grpc_proxy', 'grpc_proxy', None),
    )
    settings = {field: getattr(args, attr, fallback) for field, attr, fallback in field_sources}
    return GrpcConfig(**settings)
|
scanoss/threadedscanning.py
CHANGED
|
@@ -23,13 +23,13 @@ SPDX-License-Identifier: MIT
|
|
|
23
23
|
"""
|
|
24
24
|
|
|
25
25
|
import os
|
|
26
|
+
import queue
|
|
26
27
|
import sys
|
|
27
28
|
import threading
|
|
28
|
-
import queue
|
|
29
29
|
import time
|
|
30
|
-
|
|
31
|
-
from typing import Dict, List
|
|
32
30
|
from dataclasses import dataclass
|
|
31
|
+
from typing import Dict, List
|
|
32
|
+
|
|
33
33
|
from progress.bar import Bar
|
|
34
34
|
|
|
35
35
|
from .scanossapi import ScanossApi
|
|
@@ -49,8 +49,6 @@ class ThreadedScanning(ScanossBase):
|
|
|
49
49
|
Multiple threads pull messages off this queue, process the request and put the results into an output queue
|
|
50
50
|
"""
|
|
51
51
|
|
|
52
|
-
inputs: queue.Queue = queue.Queue()
|
|
53
|
-
output: queue.Queue = queue.Queue()
|
|
54
52
|
bar: Bar = None
|
|
55
53
|
|
|
56
54
|
def __init__(
|
|
@@ -65,6 +63,8 @@ class ThreadedScanning(ScanossBase):
|
|
|
65
63
|
:param nb_threads: Number of thread to run (default 5)
|
|
66
64
|
"""
|
|
67
65
|
super().__init__(debug, trace, quiet)
|
|
66
|
+
self.inputs = queue.Queue()
|
|
67
|
+
self.output = queue.Queue()
|
|
68
68
|
self.scanapi = scanapi
|
|
69
69
|
self.nb_threads = nb_threads
|
|
70
70
|
self._isatty = sys.stderr.isatty()
|
|
@@ -134,7 +134,7 @@ class ThreadedScanning(ScanossBase):
|
|
|
134
134
|
:param wfp: WFP to add to queue
|
|
135
135
|
"""
|
|
136
136
|
if wfp is None or wfp == '':
|
|
137
|
-
self.print_stderr(
|
|
137
|
+
self.print_stderr('Warning: empty WFP. Skipping from scan...')
|
|
138
138
|
else:
|
|
139
139
|
self.inputs.put(wfp)
|
|
140
140
|
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
|
|
3
|
+
from scanoss.scanossbase import ScanossBase
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class AbstractPresenter(ABC):
    """
    Base class for presenters that render output in a chosen format.

    Concrete subclasses must implement every ``_format_*_output`` method:
    json, plain, cyclonedx, spdxlite, csv and raw.
    """

    def __init__(
        self,
        debug: bool = False,
        trace: bool = False,
        quiet: bool = False,
        output_file: str = None,
        output_format: str = None,
    ):
        """
        Store the default output file/format and create the ScanossBase helper used for printing.
        """
        self.AVAILABLE_OUTPUT_FORMATS = ['json', 'plain', 'cyclonedx', 'spdxlite', 'csv', 'raw']
        self.base = ScanossBase(debug=debug, trace=trace, quiet=quiet)
        self.output_file = output_file
        self.output_format = output_format

    def present(self, output_format: str = None, output_file: str = None):
        """
        Format the data and hand it to _present_output.

        Arguments given here take precedence over the values stored on the instance.
        With no format at all, the plain format is used.

        Raises:
            ValueError: if a format is given that is not in AVAILABLE_OUTPUT_FORMATS
        """
        fmt = output_format or self.output_format
        file_path = output_file or self.output_file

        if fmt and fmt not in self.AVAILABLE_OUTPUT_FORMATS:
            valid_values = ', '.join(self.AVAILABLE_OUTPUT_FORMATS)
            raise ValueError(f"ERROR: Invalid output format '{fmt}'. Valid values are: {valid_values}")

        formatter = self._format_plain_output  # fallback when no (or an unmapped) format is selected
        if fmt:
            formatters = {
                'json': self._format_json_output,
                'plain': self._format_plain_output,
                'cyclonedx': self._format_cyclonedx_output,
                'spdxlite': self._format_spdxlite_output,
                'csv': self._format_csv_output,
                'raw': self._format_raw_output,
            }
            formatter = formatters.get(fmt, formatter)

        self._present_output(formatter(), file_path)

    def _present_output(self, content: str, file_path: str = None):
        """
        Delegate to ScanossBase.print_to_file_or_stdout with the content and optional file path.
        """
        self.base.print_to_file_or_stdout(content, file_path)

    @abstractmethod
    def _format_cyclonedx_output(self) -> str:
        """Return a CycloneDX string representation of the data."""

    @abstractmethod
    def _format_spdxlite_output(self) -> str:
        """Return a SPDX-Lite string representation of the data."""

    @abstractmethod
    def _format_csv_output(self) -> str:
        """Return a CSV string representation of the data."""

    @abstractmethod
    def _format_json_output(self) -> str:
        """Return a JSON string representation of the data."""

    @abstractmethod
    def _format_plain_output(self) -> str:
        """Return a plain text string representation of the data."""

    @abstractmethod
    def _format_raw_output(self) -> str:
        """Return a raw string representation of the data."""
|
scanoss/utils/crc64.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SPDX-License-Identifier: MIT
|
|
3
|
+
|
|
4
|
+
Copyright (c) 2025, SCANOSS
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be included in
|
|
14
|
+
all copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
22
|
+
THE SOFTWARE.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
import struct
|
|
26
|
+
from typing import List
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class CRC64:
    """
    CRC64 ECMA implementation matching Go's hash/crc64 package.
    Uses polynomial: 0xC96C5795D7870F42

    The register starts as all ones and the digest is XORed with all ones,
    so the checksum of empty input is 0.
    """

    POLY = 0xC96C5795D7870F42
    # 256-entry lookup table; built on first instantiation and then shared by every instance
    _TABLE = None

    def __init__(self):
        if CRC64._TABLE is None:
            CRC64._TABLE = self._make_table()
        self.crc = 0xFFFFFFFFFFFFFFFF  # Initial value

    def _make_table(self) -> list:
        """Generate the CRC64 lookup table (one entry per possible byte value)."""
        table = []
        for i in range(256):
            crc = i
            for _ in range(8):
                if crc & 1:
                    crc = (crc >> 1) ^ self.POLY
                else:
                    crc >>= 1
            table.append(crc)
        return table

    def update(self, data: bytes) -> None:
        """
        Update the CRC with new data.

        Args:
            data (bytes): Data to feed into the CRC. A str is also accepted and is encoded as UTF-8 first.
        """
        if isinstance(data, str):
            data = data.encode('utf-8')

        crc = self.crc
        for b in data:
            crc = (crc >> 8) ^ CRC64._TABLE[(crc ^ b) & 0xFF]  # Use class-level table
        self.crc = crc

    def digest(self) -> int:
        """Get the current CRC value (does not reset the running state)."""
        return self.crc ^ 0xFFFFFFFFFFFFFFFF  # Final XOR value

    def hexdigest(self) -> str:
        """Get the current CRC value as a 16-character, zero-padded lowercase hexadecimal string."""
        return format(self.digest(), '016x')

    @classmethod
    def checksum(cls, data: bytes) -> int:
        """Calculate CRC64 checksum for the given data."""
        crc = cls()
        crc.update(data)
        return crc.digest()

    @classmethod
    def get_hash_buff(cls, buff: bytes) -> List[int]:
        """
        Get the hash value of the given buffer as its 8 bytes in big-endian order.

        Args:
            buff (bytes): The buffer to get the hash value of.

        Returns:
            List[int]: The 8 byte values (each 0-255) of the CRC64, most significant byte first.
        """
        crc = cls()
        crc.update(buff)
        hash_val = crc.digest()

        # list() over a bytes object yields ints, not single-byte bytes objects
        return list(struct.pack('>Q', hash_val))
|
scanoss/utils/simhash.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SPDX-License-Identifier: MIT
|
|
3
|
+
|
|
4
|
+
Copyright (c) 2025, SCANOSS
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be included in
|
|
14
|
+
all copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
22
|
+
THE SOFTWARE.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
import re
|
|
26
|
+
import unicodedata
|
|
27
|
+
|
|
28
|
+
FNV64_OFFSET_BASIS = 14695981039346656037
FNV64_PRIME = 1099511628211
MASK64 = 0xFFFFFFFFFFFFFFFF


def fnv1_64(data: bytes) -> int:
    """
    Return the 64-bit FNV-1 hash of ``data``.

    For every byte the running hash is multiplied by the prime (kept to 64 bits)
    and then XORed with the byte.
    """
    digest = FNV64_OFFSET_BASIS
    for byte in data:
        digest = ((digest * FNV64_PRIME) & MASK64) ^ byte
    return digest
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class SimhashFeature:
    """A simhash feature: a 64-bit hash value paired with a weight."""

    def __init__(self, hash_value: int, weight: int = 1):
        self.hash_value, self.weight = hash_value, weight

    def sum(self) -> int:
        """Return this feature's 64-bit hash value."""
        return self.hash_value

    def get_weight(self) -> int:
        """Return this feature's weight."""
        return self.weight
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def new_feature(f: bytes) -> SimhashFeature:
    """Build a weight-1 feature whose hash is the FNV-1 hash of the given bytes."""
    hash_value = fnv1_64(f)
    return SimhashFeature(hash_value, 1)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def new_feature_with_weight(f: bytes, weight: int) -> SimhashFeature:
    """Build a feature with the given weight whose hash is the FNV-1 hash of the given bytes."""
    hash_value = fnv1_64(f)
    return SimhashFeature(hash_value, weight)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def vectorize(features: list) -> list:
    """
    Turn a list of features into a 64-element vector.

    For each of the 64 bit positions, a feature adds its weight when that bit of
    its hash is set and subtracts its weight when it is clear.
    """
    vector = [0] * 64
    for feat in features:
        hash_bits = feat.sum()
        weight = feat.get_weight()
        for bit in range(64):
            if (hash_bits >> bit) & 1:
                vector[bit] += weight
            else:
                vector[bit] -= weight
    return vector
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def vectorize_bytes(features: list) -> list:
    """
    Turn a list of byte slices into a 64-element vector.

    Each slice is hashed with FNV-1 and counted with weight 1: +1 for every set
    bit of the hash, -1 for every clear bit.
    """
    vector = [0] * 64
    for chunk in features:
        hash_bits = fnv1_64(chunk)
        for bit in range(64):
            if (hash_bits >> bit) & 1:
                vector[bit] += 1
            else:
                vector[bit] -= 1
    return vector
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def fingerprint(v: list) -> int:
    """
    Collapse a 64-element vector into a 64-bit fingerprint.

    Bit i of the result is 1 when v[i] >= 0 and 0 otherwise.
    """
    return sum(1 << bit for bit in range(64) if v[bit] >= 0)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def compare(a: int, b: int) -> int:
    """
    Return the Hamming distance between two 64-bit integers,
    i.e. the number of bit positions in which they differ.
    """
    differing_bits = a ^ b
    return bin(differing_bits).count('1')
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def simhash(fs) -> int:
    """
    Return the 64-bit simhash of a feature set.

    ``fs`` is any object exposing a get_features() method.
    """
    features = fs.get_features()
    vector = vectorize(features)
    return fingerprint(vector)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def simhash_bytes(b: list) -> int:
    """
    Return the simhash of a list of byte slices.
    """
    vector = vectorize_bytes(b)
    return fingerprint(vector)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
# Tokenizer for bytes input: a run of word characters/apostrophes, optionally
# followed by "://" and a run of word characters, dots or slashes.
boundaries = re.compile(rb"[\w']+(?:\://[\w\./]+){0,1}")
# Tokenizer for str input: a run of word characters, apostrophes or hyphens.
unicode_boundaries = re.compile(r"[\w'-]+", re.UNICODE)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# --- Helper Functions for Feature Extraction ---
|
|
145
|
+
def _get_features_bytes(b: bytes, pattern: re.Pattern) -> list:
|
|
146
|
+
"""
|
|
147
|
+
Split the given byte string using the given regex pattern,
|
|
148
|
+
and return a list of features (each created with new_feature).
|
|
149
|
+
"""
|
|
150
|
+
words = pattern.findall(b)
|
|
151
|
+
return [new_feature(word) for word in words]
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _get_features_str(s: str, pattern) -> list:
|
|
155
|
+
"""
|
|
156
|
+
Split the given string using the given regex pattern,
|
|
157
|
+
and return a list of features (each created by encoding to UTF-8).
|
|
158
|
+
"""
|
|
159
|
+
words = pattern.findall(s)
|
|
160
|
+
return [new_feature(word.encode('utf-8')) for word in words]
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
class WordFeatureSet:
    """Feature set over a lowercased byte string, tokenized with the ``boundaries`` pattern."""

    def __init__(self, b: bytes):
        # Lowercase once up front so feature extraction is case-insensitive.
        lowered = b.lower()
        self.b = lowered

    def get_features(self) -> list:
        """Return one feature per token found in the stored bytes."""
        return _get_features_bytes(self.b, boundaries)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
class UnicodeWordFeatureSet:
    """Feature set over UTF-8 text that is Unicode-normalized and lowercased before tokenizing."""

    def __init__(self, b: bytes, norm_form: str = 'NFC'):
        # Decode -> normalize with the requested form -> lowercase, in that order.
        self.text = unicodedata.normalize(norm_form, b.decode('utf-8')).lower()

    def get_features(self) -> list:
        """Return one feature per token found in the stored text."""
        return _get_features_str(self.text, unicode_boundaries)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def shingle(w: int, b: list) -> list:
    """
    Build the w-shingling of a list of byte slices.

    Every run of ``w`` consecutive slices is joined with a single space, e.g.
    [b"this", b"is", b"a", b"test"] with w == 2 gives
    [b"this is", b"is a", b"a test"]. With w == 1 the input list itself is
    returned; a ``w`` larger than the list is reduced to the list length.

    Raises:
        ValueError: if ``w`` is smaller than 1
    """
    if w < 1:
        raise ValueError('simhash.shingle(): k must be a positive integer')
    if w == 1:
        return b
    width = min(w, len(b))
    window_count = len(b) - width + 1
    return [b' '.join(b[start : start + width]) for start in range(window_count)]
|