scanoss 1.20.6__py3-none-any.whl → 1.22.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- protoc_gen_swagger/options/annotations_pb2.py +9 -12
- protoc_gen_swagger/options/annotations_pb2_grpc.py +1 -1
- protoc_gen_swagger/options/openapiv2_pb2.py +96 -98
- protoc_gen_swagger/options/openapiv2_pb2_grpc.py +1 -1
- scanoss/__init__.py +1 -1
- scanoss/api/common/v2/scanoss_common_pb2.py +20 -18
- scanoss/api/common/v2/scanoss_common_pb2_grpc.py +1 -1
- scanoss/api/components/v2/scanoss_components_pb2.py +38 -48
- scanoss/api/components/v2/scanoss_components_pb2_grpc.py +96 -142
- scanoss/api/cryptography/v2/scanoss_cryptography_pb2.py +42 -22
- scanoss/api/cryptography/v2/scanoss_cryptography_pb2_grpc.py +185 -75
- scanoss/api/dependencies/v2/scanoss_dependencies_pb2.py +32 -30
- scanoss/api/dependencies/v2/scanoss_dependencies_pb2_grpc.py +83 -75
- scanoss/api/provenance/v2/scanoss_provenance_pb2.py +20 -21
- scanoss/api/provenance/v2/scanoss_provenance_pb2_grpc.py +1 -1
- scanoss/api/scanning/v2/scanoss_scanning_pb2.py +20 -10
- scanoss/api/scanning/v2/scanoss_scanning_pb2_grpc.py +70 -40
- scanoss/api/semgrep/v2/scanoss_semgrep_pb2.py +18 -22
- scanoss/api/semgrep/v2/scanoss_semgrep_pb2_grpc.py +49 -71
- scanoss/api/vulnerabilities/v2/scanoss_vulnerabilities_pb2.py +27 -37
- scanoss/api/vulnerabilities/v2/scanoss_vulnerabilities_pb2_grpc.py +72 -109
- scanoss/cli.py +384 -80
- scanoss/constants.py +12 -0
- scanoss/data/build_date.txt +1 -1
- scanoss/file_filters.py +272 -57
- scanoss/results.py +92 -109
- scanoss/scanners/__init__.py +23 -0
- scanoss/scanners/container_scanner.py +474 -0
- scanoss/scanners/folder_hasher.py +302 -0
- scanoss/scanners/scanner_config.py +73 -0
- scanoss/scanners/scanner_hfh.py +172 -0
- scanoss/scanoss_settings.py +9 -5
- scanoss/scanossbase.py +9 -3
- scanoss/scanossgrpc.py +124 -13
- scanoss/threadedscanning.py +6 -6
- scanoss/utils/abstract_presenter.py +103 -0
- scanoss/utils/crc64.py +96 -0
- scanoss/utils/simhash.py +198 -0
- {scanoss-1.20.6.dist-info → scanoss-1.22.0.dist-info}/METADATA +2 -1
- {scanoss-1.20.6.dist-info → scanoss-1.22.0.dist-info}/RECORD +44 -35
- {scanoss-1.20.6.dist-info → scanoss-1.22.0.dist-info}/WHEEL +1 -1
- {scanoss-1.20.6.dist-info → scanoss-1.22.0.dist-info}/entry_points.txt +0 -0
- {scanoss-1.20.6.dist-info → scanoss-1.22.0.dist-info}/licenses/LICENSE +0 -0
- {scanoss-1.20.6.dist-info → scanoss-1.22.0.dist-info}/top_level.txt +0 -0
scanoss/scanossgrpc.py
CHANGED
|
@@ -26,6 +26,9 @@ import concurrent.futures
|
|
|
26
26
|
import json
|
|
27
27
|
import os
|
|
28
28
|
import uuid
|
|
29
|
+
from dataclasses import dataclass
|
|
30
|
+
from enum import IntEnum
|
|
31
|
+
from typing import Dict, Optional
|
|
29
32
|
from urllib.parse import urlparse
|
|
30
33
|
|
|
31
34
|
import grpc
|
|
@@ -33,6 +36,10 @@ from google.protobuf.json_format import MessageToDict, ParseDict
|
|
|
33
36
|
from pypac.parser import PACFile
|
|
34
37
|
from pypac.resolver import ProxyResolver
|
|
35
38
|
|
|
39
|
+
from scanoss.api.provenance.v2.scanoss_provenance_pb2_grpc import ProvenanceStub
|
|
40
|
+
from scanoss.api.scanning.v2.scanoss_scanning_pb2_grpc import ScanningStub
|
|
41
|
+
from scanoss.constants import DEFAULT_TIMEOUT
|
|
42
|
+
|
|
36
43
|
from . import __version__
|
|
37
44
|
from .api.common.v2.scanoss_common_pb2 import (
|
|
38
45
|
EchoRequest,
|
|
@@ -53,7 +60,7 @@ from .api.cryptography.v2.scanoss_cryptography_pb2_grpc import CryptographyStub
|
|
|
53
60
|
from .api.dependencies.v2.scanoss_dependencies_pb2 import DependencyRequest
|
|
54
61
|
from .api.dependencies.v2.scanoss_dependencies_pb2_grpc import DependenciesStub
|
|
55
62
|
from .api.provenance.v2.scanoss_provenance_pb2 import ProvenanceResponse
|
|
56
|
-
from .api.
|
|
63
|
+
from .api.scanning.v2.scanoss_scanning_pb2 import HFHRequest
|
|
57
64
|
from .api.semgrep.v2.scanoss_semgrep_pb2 import SemgrepResponse
|
|
58
65
|
from .api.semgrep.v2.scanoss_semgrep_pb2_grpc import SemgrepStub
|
|
59
66
|
from .api.vulnerabilities.v2.scanoss_vulnerabilities_pb2 import VulnerabilityResponse
|
|
@@ -68,6 +75,23 @@ SCANOSS_API_KEY = os.environ.get('SCANOSS_API_KEY') if os.environ.get('SCANOSS_A
|
|
|
68
75
|
MAX_CONCURRENT_REQUESTS = 5
|
|
69
76
|
|
|
70
77
|
|
|
78
|
+
class ScanossGrpcError(Exception):
    """
    Error type raised for failures in SCANOSS gRPC calls.

    Adds no behaviour of its own; it only gives callers a dedicated exception class to catch.
    """
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class ScanossGrpcStatusCode(IntEnum):
    """
    Integer status codes carried in SCANOSS gRPC status responses.

    Being an IntEnum, members compare directly against plain integers.
    """

    SUCCESS = 1
    SUCCESS_WITH_WARNINGS = 2
    FAILED_WITH_WARNINGS = 3
    FAILED = 4
|
|
93
|
+
|
|
94
|
+
|
|
71
95
|
class ScanossGrpc(ScanossBase):
|
|
72
96
|
"""
|
|
73
97
|
Client for gRPC functionality
|
|
@@ -113,7 +137,6 @@ class ScanossGrpc(ScanossBase):
|
|
|
113
137
|
self.req_headers = req_headers
|
|
114
138
|
self.metadata = []
|
|
115
139
|
|
|
116
|
-
|
|
117
140
|
if self.api_key:
|
|
118
141
|
self.metadata.append(('x-api-key', api_key)) # Set API key if we have one
|
|
119
142
|
if ver_details:
|
|
@@ -147,6 +170,7 @@ class ScanossGrpc(ScanossBase):
|
|
|
147
170
|
self.semgrep_stub = SemgrepStub(grpc.insecure_channel(self.url))
|
|
148
171
|
self.vuln_stub = VulnerabilitiesStub(grpc.insecure_channel(self.url))
|
|
149
172
|
self.provenance_stub = ProvenanceStub(grpc.insecure_channel(self.url))
|
|
173
|
+
self.scanning_stub = ScanningStub(grpc.insecure_channel(self.url))
|
|
150
174
|
else:
|
|
151
175
|
if ca_cert is not None:
|
|
152
176
|
credentials = grpc.ssl_channel_credentials(cert_data) # secure with specified certificate
|
|
@@ -158,6 +182,7 @@ class ScanossGrpc(ScanossBase):
|
|
|
158
182
|
self.semgrep_stub = SemgrepStub(grpc.secure_channel(self.url, credentials))
|
|
159
183
|
self.vuln_stub = VulnerabilitiesStub(grpc.secure_channel(self.url, credentials))
|
|
160
184
|
self.provenance_stub = ProvenanceStub(grpc.secure_channel(self.url, credentials))
|
|
185
|
+
self.scanning_stub = ScanningStub(grpc.secure_channel(self.url, credentials))
|
|
161
186
|
|
|
162
187
|
@classmethod
|
|
163
188
|
def _load_cert(cls, cert_file: str) -> bytes:
|
|
@@ -437,6 +462,62 @@ class ScanossGrpc(ScanossBase):
|
|
|
437
462
|
return resp_dict
|
|
438
463
|
return None
|
|
439
464
|
|
|
465
|
+
def folder_hash_scan(self, request: Dict) -> Dict:
    """
    Send a Folder Hashing Scan request over gRPC.

    Args:
        request (Dict): Folder Hash Request

    Returns:
        Dict: Folder Hash Response
    """
    rpc = self.scanning_stub.FolderHashScan
    progress_msg = 'Sending folder hash scan data (rqId: {rqId})...'
    # _call_rpc converts the dict into an HFHRequest and fills in {rqId}
    return self._call_rpc(rpc, request, HFHRequest, progress_msg)
|
|
481
|
+
|
|
482
|
+
def _call_rpc(self, rpc_method, request_input, request_type, debug_msg: Optional[str] = None) -> dict:
    """
    Call a gRPC method and return the response as a dictionary

    Args:
        rpc_method (): The gRPC stub method
        request_input (): Either a dict or a gRPC request object.
        request_type (): The type of the gRPC request object.
        debug_msg (str, optional): Debug message template that can include {rqId} placeholder.
            Nothing is logged when it is omitted.

    Returns:
        dict: The parsed gRPC response as a dictionary, with the 'status' entry removed.

    Raises:
        ScanossGrpcError: If the RPC raises grpc.RpcError, or the response status check fails.
    """
    request_id = str(uuid.uuid4())

    if isinstance(request_input, dict):
        request_obj = ParseDict(request_input, request_type())
    else:
        request_obj = request_input

    # Copy the shared metadata so the per-request id never leaks into self.metadata
    metadata = self.metadata[:] + [('x-request-id', request_id)]

    # debug_msg defaults to None, so only format it when a template was supplied
    if debug_msg:
        self.print_debug(debug_msg.format(rqId=request_id))

    try:
        resp = rpc_method(request_obj, metadata=metadata, timeout=self.timeout)
    except grpc.RpcError as e:
        # Chain the original error so its traceback stays available to callers
        raise ScanossGrpcError(
            f'{e.__class__.__name__} while sending gRPC message (rqId: {request_id}): {e.details()}'
        ) from e

    if resp and not self._check_status_response(resp.status, request_id):
        raise ScanossGrpcError(f'Unsuccessful status response (rqId: {request_id}).')

    resp_dict = MessageToDict(resp, preserving_proto_field_name=True)
    resp_dict.pop('status', None)  # status was already checked above; callers only get the payload
    return resp_dict
|
|
520
|
+
|
|
440
521
|
def _check_status_response(self, status_response: StatusResponse, request_id: str = None) -> bool:
|
|
441
522
|
"""
|
|
442
523
|
Check the response object to see if the command was successful or not
|
|
@@ -444,21 +525,18 @@ class ScanossGrpc(ScanossBase):
|
|
|
444
525
|
:return: True if successful, False otherwise
|
|
445
526
|
"""
|
|
446
527
|
|
|
447
|
-
SUCCEDED_WITH_WARNINGS_STATUS_CODE = 2
|
|
448
|
-
FAILED_STATUS_CODE = 3
|
|
449
|
-
|
|
450
528
|
if not status_response:
|
|
451
529
|
self.print_stderr(f'Warning: No status response supplied (rqId: {request_id}). Assuming it was ok.')
|
|
452
530
|
return True
|
|
453
531
|
self.print_debug(f'Checking response status (rqId: {request_id}): {status_response}')
|
|
454
532
|
status_code: StatusCode = status_response.status
|
|
455
|
-
if status_code >
|
|
533
|
+
if status_code > ScanossGrpcStatusCode.SUCCESS:
|
|
456
534
|
ret_val = False # default to failed
|
|
457
535
|
msg = 'Unsuccessful'
|
|
458
|
-
if status_code ==
|
|
536
|
+
if status_code == ScanossGrpcStatusCode.SUCCESS_WITH_WARNINGS:
|
|
459
537
|
msg = 'Succeeded with warnings'
|
|
460
538
|
ret_val = True # No need to fail as it succeeded with warnings
|
|
461
|
-
elif status_code ==
|
|
539
|
+
elif status_code == ScanossGrpcStatusCode.FAILED_WITH_WARNINGS:
|
|
462
540
|
msg = 'Failed with warnings'
|
|
463
541
|
self.print_stderr(f'{msg} (rqId: {request_id} - status: {status_code}): {status_response.message}')
|
|
464
542
|
return ret_val
|
|
@@ -517,18 +595,51 @@ class ScanossGrpc(ScanossBase):
|
|
|
517
595
|
|
|
518
596
|
def load_generic_headers(self):
|
|
519
597
|
"""
|
|
520
|
-
|
|
598
|
+
Adds custom headers from req_headers to metadata.
|
|
521
599
|
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
600
|
+
If x-api-key is present and no URL is configured (directly or via
|
|
601
|
+
environment), sets URL to the premium endpoint (DEFAULT_URL2).
|
|
602
|
+
"""
|
|
525
603
|
if self.req_headers: # Load generic headers
|
|
526
604
|
for key, value in self.req_headers.items():
|
|
527
|
-
if key == 'x-api-key':
|
|
605
|
+
if key == 'x-api-key': # Set premium URL if x-api-key header is set
|
|
528
606
|
if not self.url and not os.environ.get('SCANOSS_GRPC_URL'):
|
|
529
607
|
self.url = DEFAULT_URL2 # API key specific and no alternative URL, so use the default premium
|
|
530
608
|
self.api_key = value
|
|
531
609
|
self.metadata.append((key, value))
|
|
610
|
+
|
|
611
|
+
|
|
532
612
|
#
|
|
533
613
|
# End of ScanossGrpc Class
|
|
534
614
|
#
|
|
615
|
+
|
|
616
|
+
|
|
617
|
+
@dataclass
class GrpcConfig:
    """
    Bundle of gRPC client settings; every field has a default.
    """

    # Endpoint and credentials (defaults come from module-level values)
    url: str = DEFAULT_URL
    api_key: Optional[str] = SCANOSS_API_KEY
    # Output verbosity flags
    debug: Optional[bool] = False
    trace: Optional[bool] = False
    quiet: Optional[bool] = False
    ver_details: Optional[str] = None
    # Transport options
    ca_cert: Optional[str] = None
    pac: Optional[PACFile] = None
    timeout: Optional[int] = DEFAULT_TIMEOUT
    proxy: Optional[str] = None
    grpc_proxy: Optional[str] = None
|
|
630
|
+
|
|
631
|
+
|
|
632
|
+
def create_grpc_config_from_args(args) -> GrpcConfig:
    """
    Build a GrpcConfig from an object exposing settings as attributes.

    An attribute missing from ``args`` falls back to the default listed below.
    An attribute that is present is passed through unchanged, even when it is None.
    """
    # GrpcConfig field -> (attribute name on args, fallback when the attribute is absent)
    sources = {
        'url': ('api2url', DEFAULT_URL),
        'api_key': ('key', SCANOSS_API_KEY),
        'debug': ('debug', False),
        'trace': ('trace', False),
        'quiet': ('quiet', False),
        'ver_details': ('ver_details', None),
        'ca_cert': ('ca_cert', None),
        'pac': ('pac', None),
        'timeout': ('timeout', DEFAULT_TIMEOUT),
        'proxy': ('proxy', None),
        'grpc_proxy': ('grpc_proxy', None),
    }
    settings = {field: getattr(args, attr, fallback) for field, (attr, fallback) in sources.items()}
    return GrpcConfig(**settings)
|
scanoss/threadedscanning.py
CHANGED
|
@@ -23,13 +23,13 @@ SPDX-License-Identifier: MIT
|
|
|
23
23
|
"""
|
|
24
24
|
|
|
25
25
|
import os
|
|
26
|
+
import queue
|
|
26
27
|
import sys
|
|
27
28
|
import threading
|
|
28
|
-
import queue
|
|
29
29
|
import time
|
|
30
|
-
|
|
31
|
-
from typing import Dict, List
|
|
32
30
|
from dataclasses import dataclass
|
|
31
|
+
from typing import Dict, List
|
|
32
|
+
|
|
33
33
|
from progress.bar import Bar
|
|
34
34
|
|
|
35
35
|
from .scanossapi import ScanossApi
|
|
@@ -49,8 +49,6 @@ class ThreadedScanning(ScanossBase):
|
|
|
49
49
|
Multiple threads pull messages off this queue, process the request and put the results into an output queue
|
|
50
50
|
"""
|
|
51
51
|
|
|
52
|
-
inputs: queue.Queue = queue.Queue()
|
|
53
|
-
output: queue.Queue = queue.Queue()
|
|
54
52
|
bar: Bar = None
|
|
55
53
|
|
|
56
54
|
def __init__(
|
|
@@ -65,6 +63,8 @@ class ThreadedScanning(ScanossBase):
|
|
|
65
63
|
:param nb_threads: Number of thread to run (default 5)
|
|
66
64
|
"""
|
|
67
65
|
super().__init__(debug, trace, quiet)
|
|
66
|
+
self.inputs = queue.Queue()
|
|
67
|
+
self.output = queue.Queue()
|
|
68
68
|
self.scanapi = scanapi
|
|
69
69
|
self.nb_threads = nb_threads
|
|
70
70
|
self._isatty = sys.stderr.isatty()
|
|
@@ -134,7 +134,7 @@ class ThreadedScanning(ScanossBase):
|
|
|
134
134
|
:param wfp: WFP to add to queue
|
|
135
135
|
"""
|
|
136
136
|
if wfp is None or wfp == '':
|
|
137
|
-
self.print_stderr(
|
|
137
|
+
self.print_stderr('Warning: empty WFP. Skipping from scan...')
|
|
138
138
|
else:
|
|
139
139
|
self.inputs.put(wfp)
|
|
140
140
|
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
|
|
3
|
+
from scanoss.scanossbase import ScanossBase
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class AbstractPresenter(ABC):
    """
    Base class for presenters that render their data in a chosen output format.
    Subclasses must implement every abstract _format_*_output method declared below.
    """

    def __init__(
        self,
        debug: bool = False,
        trace: bool = False,
        quiet: bool = False,
        output_file: str = None,
        output_format: str = None,
    ):
        """
        Store the default output file/format and create the ScanossBase helper used for printing.
        """
        self.AVAILABLE_OUTPUT_FORMATS = ['json', 'plain', 'cyclonedx', 'spdxlite', 'csv', 'raw']
        self.base = ScanossBase(debug=debug, trace=trace, quiet=quiet)
        self.output_file = output_file
        self.output_format = output_format

    def present(self, output_format: str = None, output_file: str = None):
        """
        Format the data and hand it to _present_output.

        Arguments given here take precedence over the values stored at construction.
        With no format set, the plain formatter is used.

        Raises:
            ValueError: If a format is given that is not in AVAILABLE_OUTPUT_FORMATS.
        """
        file_path = output_file or self.output_file
        fmt = output_format or self.output_format

        if fmt and fmt not in self.AVAILABLE_OUTPUT_FORMATS:
            raise ValueError(
                f"ERROR: Invalid output format '{fmt}'. Valid values are: {', '.join(self.AVAILABLE_OUTPUT_FORMATS)}"
            )

        formatters = {
            'json': self._format_json_output,
            'plain': self._format_plain_output,
            'cyclonedx': self._format_cyclonedx_output,
            'spdxlite': self._format_spdxlite_output,
            'csv': self._format_csv_output,
            'raw': self._format_raw_output,
        }
        # Anything without a dedicated formatter (including "no format") renders as plain text
        fallback = self._format_plain_output
        render = formatters.get(fmt, fallback) if fmt else fallback
        self._present_output(render(), file_path)

    def _present_output(self, content: str, file_path: str = None):
        """
        Pass the content and optional file path to ScanossBase.print_to_file_or_stdout.
        """
        self.base.print_to_file_or_stdout(content, file_path)

    @abstractmethod
    def _format_cyclonedx_output(self) -> str:
        """
        Return a CycloneDX string representation of the data.
        """

    @abstractmethod
    def _format_spdxlite_output(self) -> str:
        """
        Return a SPDX-Lite string representation of the data.
        """

    @abstractmethod
    def _format_csv_output(self) -> str:
        """
        Return a CSV string representation of the data.
        """

    @abstractmethod
    def _format_json_output(self) -> str:
        """
        Return a JSON string representation of the data.
        """

    @abstractmethod
    def _format_plain_output(self) -> str:
        """
        Return a plain text string representation of the data.
        """

    @abstractmethod
    def _format_raw_output(self) -> str:
        """
        Return a raw string representation of the data.
        """
|
scanoss/utils/crc64.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SPDX-License-Identifier: MIT
|
|
3
|
+
|
|
4
|
+
Copyright (c) 2025, SCANOSS
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be included in
|
|
14
|
+
all copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
22
|
+
THE SOFTWARE.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
import struct
|
|
26
|
+
from typing import List
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class CRC64:
    """
    CRC64 ECMA implementation matching Go's hash/crc64 package.
    Uses polynomial: 0xC96C5795D7870F42

    The 256-entry lookup table is built once, on first instantiation, and shared by all instances.
    """

    POLY = 0xC96C5795D7870F42
    _TABLE = None

    def __init__(self):
        if CRC64._TABLE is None:
            CRC64._TABLE = self._make_table()
        self.crc = 0xFFFFFFFFFFFFFFFF  # Initial value

    def _make_table(self) -> list:
        """Generate the CRC64 lookup table (one entry per possible byte value)."""
        table = []
        for i in range(256):
            crc = i
            for _ in range(8):
                if crc & 1:
                    crc = (crc >> 1) ^ self.POLY
                else:
                    crc >>= 1
            table.append(crc)
        return table

    def update(self, data: bytes) -> None:
        """Update the CRC with new data. A str is accepted too and is encoded as UTF-8 first."""
        if isinstance(data, str):
            data = data.encode('utf-8')

        table = CRC64._TABLE  # Use class-level table
        crc = self.crc
        for b in data:
            crc = (crc >> 8) ^ table[(crc ^ b) & 0xFF]
        self.crc = crc

    def digest(self) -> int:
        """Get the current CRC value."""
        return self.crc ^ 0xFFFFFFFFFFFFFFFF  # Final XOR value

    def hexdigest(self) -> str:
        """Get the current CRC value as a 16-character lowercase hexadecimal string."""
        return format(self.digest(), '016x')

    @classmethod
    def checksum(cls, data: bytes) -> int:
        """Calculate CRC64 checksum for the given data."""
        crc = cls()
        crc.update(data)
        return crc.digest()

    @classmethod
    def get_hash_buff(cls, buff: bytes) -> List[int]:
        """
        Get the hash value of the given buffer as 8 byte values in big-endian order.

        Args:
            buff (bytes): The buffer to get the hash value of.

        Returns:
            List[int]: The 8 bytes of the CRC64, most significant first, each as an int in 0..255.
        """
        return list(struct.pack('>Q', cls.checksum(buff)))
|
scanoss/utils/simhash.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SPDX-License-Identifier: MIT
|
|
3
|
+
|
|
4
|
+
Copyright (c) 2025, SCANOSS
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be included in
|
|
14
|
+
all copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
22
|
+
THE SOFTWARE.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
import re
|
|
26
|
+
import unicodedata
|
|
27
|
+
|
|
28
|
+
# 64-bit FNV parameters, plus the mask used to keep intermediate products within 64 bits
FNV64_OFFSET_BASIS = 0xCBF29CE484222325
FNV64_PRIME = 0x100000001B3
MASK64 = (1 << 64) - 1
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def fnv1_64(data: bytes) -> int:
    """Compute the 64-bit FNV-1 hash of data (multiply by the prime, then XOR in each byte)."""
    digest = FNV64_OFFSET_BASIS
    for octet in data:
        digest = ((digest * FNV64_PRIME) & MASK64) ^ octet
    return digest
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class SimhashFeature:
    """A hashed feature together with the weight it contributes to a simhash vector."""

    def __init__(self, hash_value: int, weight: int = 1):
        self.hash_value, self.weight = hash_value, weight

    def sum(self) -> int:
        """Return the stored hash value of this feature."""
        return self.hash_value

    def get_weight(self) -> int:
        """Return the stored weight of this feature."""
        return self.weight
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def new_feature(f: bytes) -> SimhashFeature:
    """Hash the given bytes with FNV-1 and wrap the result in a feature of weight 1."""
    hashed = fnv1_64(f)
    return SimhashFeature(hashed, 1)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def new_feature_with_weight(f: bytes, weight: int) -> SimhashFeature:
    """Hash the given bytes with FNV-1 and wrap the result in a feature carrying the given weight."""
    hashed = fnv1_64(f)
    return SimhashFeature(hashed, weight)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def vectorize(features: list) -> list:
    """
    Build a 64-element vector from a list of features.
    For every feature and every bit position 0..63, the feature's weight is
    added to that coordinate when the bit is set in its hash, and subtracted when it is not.
    """
    vector = [0] * 64
    for feat in features:
        bits = feat.sum()
        weight = feat.get_weight()
        for pos in range(64):
            vector[pos] += weight if (bits >> pos) & 1 else -weight
    return vector
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def vectorize_bytes(features: list) -> list:
    """
    Build a 64-element vector from a list of byte slices.
    Each slice is hashed with FNV-1 and counted with weight 1:
    +1 per coordinate whose bit is set in the hash, -1 otherwise.
    """
    vector = [0] * 64
    for chunk in features:
        bits = fnv1_64(chunk)
        for pos in range(64):
            vector[pos] += 1 if (bits >> pos) & 1 else -1
    return vector
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def fingerprint(v: list) -> int:
    """
    Collapse a 64-element vector into a 64-bit fingerprint.
    Bit i of the result is 1 when v[i] >= 0 and 0 otherwise.
    """
    # Each position contributes a distinct power of two, so summing equals OR-ing the bits
    return sum(1 << pos for pos in range(64) if v[pos] >= 0)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def compare(a: int, b: int) -> int:
    """
    Calculate the Hamming distance between two 64-bit integers.
    (The number of differing bits.)

    If the XOR of the inputs is negative (exactly one operand is negative), it is
    reduced to its low 64 bits first. Python integers are unbounded, so a negative
    value has no finite count of set bits and a clear-lowest-bit loop would never end.
    """
    diff = a ^ b
    if diff < 0:
        diff &= 0xFFFFFFFFFFFFFFFF
    return bin(diff).count('1')
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def simhash(fs) -> int:
    """
    Compute the 64-bit simhash of a feature set.
    ``fs`` is any object exposing a get_features() method.
    """
    features = fs.get_features()
    vector = vectorize(features)
    return fingerprint(vector)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def simhash_bytes(b: list) -> int:
    """
    Compute the simhash of a list of byte slices.
    """
    vector = vectorize_bytes(b)
    return fingerprint(vector)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
# Byte pattern: a run of word characters/apostrophes, optionally followed by "://" and a path-like tail
boundaries = re.compile(rb"[\w']+(?:\://[\w\./]+){0,1}")
# Text pattern: a run of word characters, apostrophes or hyphens
unicode_boundaries = re.compile(r"[\w'-]+", re.UNICODE)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# --- Helper Functions for Feature Extraction ---
|
|
145
|
+
def _get_features_bytes(b: bytes, pattern: re.Pattern) -> list:
|
|
146
|
+
"""
|
|
147
|
+
Split the given byte string using the given regex pattern,
|
|
148
|
+
and return a list of features (each created with new_feature).
|
|
149
|
+
"""
|
|
150
|
+
words = pattern.findall(b)
|
|
151
|
+
return [new_feature(word) for word in words]
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _get_features_str(s: str, pattern) -> list:
|
|
155
|
+
"""
|
|
156
|
+
Split the given string using the given regex pattern,
|
|
157
|
+
and return a list of features (each created by encoding to UTF-8).
|
|
158
|
+
"""
|
|
159
|
+
words = pattern.findall(s)
|
|
160
|
+
return [new_feature(word.encode('utf-8')) for word in words]
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
class WordFeatureSet:
    """Feature set built from the matches of the byte-level ``boundaries`` pattern."""

    def __init__(self, b: bytes):
        # Lowercase once up front so feature extraction is case-insensitive.
        lowered = b.lower()
        self.b = lowered

    def get_features(self) -> list:
        """Return one feature per match of ``boundaries`` in the stored bytes."""
        return _get_features_bytes(self.b, boundaries)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
class UnicodeWordFeatureSet:
    """Feature set built from the matches of ``unicode_boundaries`` in decoded text."""

    def __init__(self, b: bytes, norm_form: str = 'NFC'):
        # Decode as UTF-8, apply the requested Unicode normalization form, then lowercase.
        self.text = unicodedata.normalize(norm_form, b.decode('utf-8')).lower()

    def get_features(self) -> list:
        """Return one feature per match of ``unicode_boundaries`` in the stored text."""
        return _get_features_str(self.text, unicode_boundaries)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def shingle(w: int, b: list) -> list:
    """
    Return the w-shingling of the given set of byte slices.
    For example, if b is [b"this", b"is", b"a", b"test"]
    and w == 2, the result is [b"this is", b"is a", b"a test"].

    When w == 1 the input list itself is returned (not a copy). When w exceeds
    len(b) it is clamped to len(b), so the result is a single shingle joining
    every slice (for an empty list that single shingle is b"").

    Raises:
        ValueError: If w is less than 1.
    """
    if w < 1:
        # The message names the actual parameter (w)
        raise ValueError('simhash.shingle(): w must be a positive integer')
    if w == 1:
        return b
    w = min(w, len(b))
    return [b' '.join(b[i : i + w]) for i in range(len(b) - w + 1)]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: scanoss
|
|
3
|
-
Version: 1.20.6
|
|
3
|
+
Version: 1.22.0
|
|
4
4
|
Summary: Simple Python library to leverage the SCANOSS APIs
|
|
5
5
|
Home-page: https://scanoss.com
|
|
6
6
|
Author: SCANOSS
|
|
@@ -29,6 +29,7 @@ Requires-Dist: importlib_resources
|
|
|
29
29
|
Requires-Dist: packageurl-python
|
|
30
30
|
Requires-Dist: pathspec
|
|
31
31
|
Requires-Dist: jsonschema
|
|
32
|
+
Requires-Dist: crc
|
|
32
33
|
Provides-Extra: fast-winnowing
|
|
33
34
|
Requires-Dist: scanoss_winnowing>=0.5.0; extra == "fast-winnowing"
|
|
34
35
|
Dynamic: license-file
|