scanoss 1.20.6__py3-none-any.whl → 1.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. protoc_gen_swagger/options/annotations_pb2.py +9 -12
  2. protoc_gen_swagger/options/annotations_pb2_grpc.py +1 -1
  3. protoc_gen_swagger/options/openapiv2_pb2.py +96 -98
  4. protoc_gen_swagger/options/openapiv2_pb2_grpc.py +1 -1
  5. scanoss/__init__.py +1 -1
  6. scanoss/api/common/v2/scanoss_common_pb2.py +20 -18
  7. scanoss/api/common/v2/scanoss_common_pb2_grpc.py +1 -1
  8. scanoss/api/components/v2/scanoss_components_pb2.py +38 -48
  9. scanoss/api/components/v2/scanoss_components_pb2_grpc.py +96 -142
  10. scanoss/api/cryptography/v2/scanoss_cryptography_pb2.py +42 -22
  11. scanoss/api/cryptography/v2/scanoss_cryptography_pb2_grpc.py +185 -75
  12. scanoss/api/dependencies/v2/scanoss_dependencies_pb2.py +32 -30
  13. scanoss/api/dependencies/v2/scanoss_dependencies_pb2_grpc.py +83 -75
  14. scanoss/api/geoprovenance/v2/scanoss_geoprovenance_pb2.py +49 -0
  15. scanoss/api/geoprovenance/v2/scanoss_geoprovenance_pb2_grpc.py +142 -0
  16. scanoss/api/scanning/v2/scanoss_scanning_pb2.py +20 -10
  17. scanoss/api/scanning/v2/scanoss_scanning_pb2_grpc.py +70 -40
  18. scanoss/api/semgrep/v2/scanoss_semgrep_pb2.py +18 -22
  19. scanoss/api/semgrep/v2/scanoss_semgrep_pb2_grpc.py +49 -71
  20. scanoss/api/vulnerabilities/v2/scanoss_vulnerabilities_pb2.py +27 -37
  21. scanoss/api/vulnerabilities/v2/scanoss_vulnerabilities_pb2_grpc.py +72 -109
  22. scanoss/cli.py +393 -84
  23. scanoss/components.py +21 -11
  24. scanoss/constants.py +12 -0
  25. scanoss/data/build_date.txt +1 -1
  26. scanoss/file_filters.py +272 -57
  27. scanoss/results.py +92 -109
  28. scanoss/scanners/__init__.py +23 -0
  29. scanoss/scanners/container_scanner.py +474 -0
  30. scanoss/scanners/folder_hasher.py +302 -0
  31. scanoss/scanners/scanner_config.py +73 -0
  32. scanoss/scanners/scanner_hfh.py +173 -0
  33. scanoss/scanoss_settings.py +9 -5
  34. scanoss/scanossbase.py +9 -3
  35. scanoss/scanossgrpc.py +143 -18
  36. scanoss/threadedscanning.py +6 -6
  37. scanoss/utils/abstract_presenter.py +103 -0
  38. scanoss/utils/crc64.py +96 -0
  39. scanoss/utils/simhash.py +198 -0
  40. {scanoss-1.20.6.dist-info → scanoss-1.23.0.dist-info}/METADATA +2 -1
  41. scanoss-1.23.0.dist-info/RECORD +83 -0
  42. {scanoss-1.20.6.dist-info → scanoss-1.23.0.dist-info}/WHEEL +1 -1
  43. scanoss/api/provenance/v2/scanoss_provenance_pb2.py +0 -42
  44. scanoss/api/provenance/v2/scanoss_provenance_pb2_grpc.py +0 -108
  45. scanoss-1.20.6.dist-info/RECORD +0 -74
  46. /scanoss/api/{provenance → geoprovenance}/__init__.py +0 -0
  47. /scanoss/api/{provenance → geoprovenance}/v2/__init__.py +0 -0
  48. {scanoss-1.20.6.dist-info → scanoss-1.23.0.dist-info}/entry_points.txt +0 -0
  49. {scanoss-1.20.6.dist-info → scanoss-1.23.0.dist-info}/licenses/LICENSE +0 -0
  50. {scanoss-1.20.6.dist-info → scanoss-1.23.0.dist-info}/top_level.txt +0 -0
scanoss/scanossgrpc.py CHANGED
@@ -26,6 +26,9 @@ import concurrent.futures
26
26
  import json
27
27
  import os
28
28
  import uuid
29
+ from dataclasses import dataclass
30
+ from enum import IntEnum
31
+ from typing import Dict, Optional
29
32
  from urllib.parse import urlparse
30
33
 
31
34
  import grpc
@@ -33,6 +36,9 @@ from google.protobuf.json_format import MessageToDict, ParseDict
33
36
  from pypac.parser import PACFile
34
37
  from pypac.resolver import ProxyResolver
35
38
 
39
+ from scanoss.api.scanning.v2.scanoss_scanning_pb2_grpc import ScanningStub
40
+ from scanoss.constants import DEFAULT_TIMEOUT
41
+
36
42
  from . import __version__
37
43
  from .api.common.v2.scanoss_common_pb2 import (
38
44
  EchoRequest,
@@ -52,8 +58,9 @@ from .api.cryptography.v2.scanoss_cryptography_pb2 import AlgorithmResponse
52
58
  from .api.cryptography.v2.scanoss_cryptography_pb2_grpc import CryptographyStub
53
59
  from .api.dependencies.v2.scanoss_dependencies_pb2 import DependencyRequest
54
60
  from .api.dependencies.v2.scanoss_dependencies_pb2_grpc import DependenciesStub
55
- from .api.provenance.v2.scanoss_provenance_pb2 import ProvenanceResponse
56
- from .api.provenance.v2.scanoss_provenance_pb2_grpc import ProvenanceStub
61
+ from .api.geoprovenance.v2.scanoss_geoprovenance_pb2 import ContributorResponse
62
+ from .api.geoprovenance.v2.scanoss_geoprovenance_pb2_grpc import GeoProvenanceStub
63
+ from .api.scanning.v2.scanoss_scanning_pb2 import HFHRequest
57
64
  from .api.semgrep.v2.scanoss_semgrep_pb2 import SemgrepResponse
58
65
  from .api.semgrep.v2.scanoss_semgrep_pb2_grpc import SemgrepStub
59
66
  from .api.vulnerabilities.v2.scanoss_vulnerabilities_pb2 import VulnerabilityResponse
@@ -68,6 +75,23 @@ SCANOSS_API_KEY = os.environ.get('SCANOSS_API_KEY') if os.environ.get('SCANOSS_A
68
75
  MAX_CONCURRENT_REQUESTS = 5
69
76
 
70
77
 
78
class ScanossGrpcError(Exception):
    """Exception type dedicated to errors coming from SCANOSS gRPC calls."""
84
+
85
+
86
class ScanossGrpcStatusCode(IntEnum):
    """
    Numeric status codes carried in SCANOSS gRPC status responses.

    Members are plain integers, so they compare directly against the raw
    status value found on a response.
    """

    SUCCESS = 1
    SUCCESS_WITH_WARNINGS = 2
    FAILED_WITH_WARNINGS = 3
    FAILED = 4
93
+
94
+
71
95
  class ScanossGrpc(ScanossBase):
72
96
  """
73
97
  Client for gRPC functionality
@@ -113,7 +137,6 @@ class ScanossGrpc(ScanossBase):
113
137
  self.req_headers = req_headers
114
138
  self.metadata = []
115
139
 
116
-
117
140
  if self.api_key:
118
141
  self.metadata.append(('x-api-key', api_key)) # Set API key if we have one
119
142
  if ver_details:
@@ -146,7 +169,8 @@ class ScanossGrpc(ScanossBase):
146
169
  self.dependencies_stub = DependenciesStub(grpc.insecure_channel(self.url))
147
170
  self.semgrep_stub = SemgrepStub(grpc.insecure_channel(self.url))
148
171
  self.vuln_stub = VulnerabilitiesStub(grpc.insecure_channel(self.url))
149
- self.provenance_stub = ProvenanceStub(grpc.insecure_channel(self.url))
172
+ self.provenance_stub = GeoProvenanceStub(grpc.insecure_channel(self.url))
173
+ self.scanning_stub = ScanningStub(grpc.insecure_channel(self.url))
150
174
  else:
151
175
  if ca_cert is not None:
152
176
  credentials = grpc.ssl_channel_credentials(cert_data) # secure with specified certificate
@@ -157,7 +181,8 @@ class ScanossGrpc(ScanossBase):
157
181
  self.dependencies_stub = DependenciesStub(grpc.secure_channel(self.url, credentials))
158
182
  self.semgrep_stub = SemgrepStub(grpc.secure_channel(self.url, credentials))
159
183
  self.vuln_stub = VulnerabilitiesStub(grpc.secure_channel(self.url, credentials))
160
- self.provenance_stub = ProvenanceStub(grpc.secure_channel(self.url, credentials))
184
+ self.provenance_stub = GeoProvenanceStub(grpc.secure_channel(self.url, credentials))
185
+ self.scanning_stub = ScanningStub(grpc.secure_channel(self.url, credentials))
161
186
 
162
187
  @classmethod
163
188
  def _load_cert(cls, cert_file: str) -> bytes:
@@ -437,6 +462,59 @@ class ScanossGrpc(ScanossBase):
437
462
  return resp_dict
438
463
  return None
439
464
 
465
def folder_hash_scan(self, request: Dict) -> Optional[Dict]:
    """
    Invoke the FolderHashScan rpc on the scanning stub.

    Args:
        request (Dict): Folder Hash Request

    Returns:
        Optional[Dict]: Folder Hash Response, or None if the request was not successful
    """
    rpc = self.scanning_stub.FolderHashScan
    debug_template = 'Sending folder hash scan data (rqId: {rqId})...'
    # Parsing, request-id handling and status checking are all delegated to _call_rpc.
    return self._call_rpc(rpc, request, HFHRequest, debug_template)
481
+
482
+ def _call_rpc(self, rpc_method, request_input, request_type, debug_msg: Optional[str] = None) -> Optional[Dict]:
483
+ """
484
+ Call a gRPC method and return the response as a dictionary
485
+
486
+ Args:
487
+ rpc_method (): The gRPC stub method
488
+ request_input (): Either a dict or a gRPC request object.
489
+ request_type (): The type of the gRPC request object.
490
+ debug_msg (str, optional): Debug message template that can include {rqId} placeholder.
491
+
492
+ Returns:
493
+ dict: The parsed gRPC response as a dictionary, or None if something went wrong
494
+ """
495
+ request_id = str(uuid.uuid4())
496
+
497
+ if isinstance(request_input, dict):
498
+ request_obj = ParseDict(request_input, request_type())
499
+ else:
500
+ request_obj = request_input
501
+
502
+ metadata = self.metadata[:] + [('x-request-id', request_id)]
503
+
504
+ self.print_debug(debug_msg.format(rqId=request_id))
505
+ try:
506
+ resp = rpc_method(request_obj, metadata=metadata, timeout=self.timeout)
507
+ except grpc.RpcError as e:
508
+ raise ScanossGrpcError(
509
+ f'{e.__class__.__name__} while sending gRPC message (rqId: {request_id}): {e.details()}'
510
+ )
511
+
512
+ if resp and not self._check_status_response(resp.status, request_id):
513
+ return None
514
+
515
+ resp_dict = MessageToDict(resp, preserving_proto_field_name=True)
516
+ return resp_dict
517
+
440
518
  def _check_status_response(self, status_response: StatusResponse, request_id: str = None) -> bool:
441
519
  """
442
520
  Check the response object to see if the command was successful or not
@@ -444,21 +522,18 @@ class ScanossGrpc(ScanossBase):
444
522
  :return: True if successful, False otherwise
445
523
  """
446
524
 
447
- SUCCEDED_WITH_WARNINGS_STATUS_CODE = 2
448
- FAILED_STATUS_CODE = 3
449
-
450
525
  if not status_response:
451
526
  self.print_stderr(f'Warning: No status response supplied (rqId: {request_id}). Assuming it was ok.')
452
527
  return True
453
528
  self.print_debug(f'Checking response status (rqId: {request_id}): {status_response}')
454
529
  status_code: StatusCode = status_response.status
455
- if status_code > 1:
530
+ if status_code > ScanossGrpcStatusCode.SUCCESS:
456
531
  ret_val = False # default to failed
457
532
  msg = 'Unsuccessful'
458
- if status_code == SUCCEDED_WITH_WARNINGS_STATUS_CODE:
533
+ if status_code == ScanossGrpcStatusCode.SUCCESS_WITH_WARNINGS:
459
534
  msg = 'Succeeded with warnings'
460
535
  ret_val = True # No need to fail as it succeeded with warnings
461
- elif status_code == FAILED_STATUS_CODE:
536
+ elif status_code == ScanossGrpcStatusCode.FAILED_WITH_WARNINGS:
462
537
  msg = 'Failed with warnings'
463
538
  self.print_stderr(f'{msg} (rqId: {request_id} - status: {status_code}): {status_response.message}')
464
539
  return ret_val
@@ -496,13 +571,13 @@ class ScanossGrpc(ScanossBase):
496
571
  self.print_stderr('ERROR: No message supplied to send to gRPC service.')
497
572
  return None
498
573
  request_id = str(uuid.uuid4())
499
- resp: ProvenanceResponse
574
+ resp: ContributorResponse
500
575
  try:
501
576
  request = ParseDict(purls, PurlRequest()) # Parse the JSON/Dict into the purl request object
502
577
  metadata = self.metadata[:]
503
578
  metadata.append(('x-request-id', request_id)) # Set a Request ID
504
579
  self.print_debug(f'Sending data for provenance decoration (rqId: {request_id})...')
505
- resp = self.provenance_stub.GetComponentProvenance(request, metadata=metadata, timeout=self.timeout)
580
+ resp = self.provenance_stub.GetComponentContributors(request, metadata=metadata, timeout=self.timeout)
506
581
  except Exception as e:
507
582
  self.print_stderr(
508
583
  f'ERROR: {e.__class__.__name__} Problem encountered sending gRPC message (rqId: {request_id}): {e}'
@@ -515,20 +590,70 @@ class ScanossGrpc(ScanossBase):
515
590
  return resp_dict
516
591
  return None
517
592
 
593
def get_provenance_origin(self, request: Dict) -> Optional[Dict]:
    """
    Invoke the GetComponentOrigin rpc on the provenance stub.

    Args:
        request (Dict): GetComponentOrigin Request

    Returns:
        Optional[Dict]: OriginResponse, or None if the request was not successful
    """
    rpc = self.provenance_stub.GetComponentOrigin
    debug_template = 'Sending data for provenance origin decoration (rqId: {rqId})...'
    # The dict request is parsed into a PurlRequest by _call_rpc.
    return self._call_rpc(rpc, request, PurlRequest, debug_template)
609
+
518
610
  def load_generic_headers(self):
519
611
  """
520
- Adds custom headers from req_headers to metadata.
612
+ Adds custom headers from req_headers to metadata.
521
613
 
522
- If x-api-key is present and no URL is configured (directly or via
523
- environment), sets URL to the premium endpoint (DEFAULT_URL2).
524
- """
614
+ If x-api-key is present and no URL is configured (directly or via
615
+ environment), sets URL to the premium endpoint (DEFAULT_URL2).
616
+ """
525
617
  if self.req_headers: # Load generic headers
526
618
  for key, value in self.req_headers.items():
527
- if key == 'x-api-key': # Set premium URL if x-api-key header is set
619
+ if key == 'x-api-key': # Set premium URL if x-api-key header is set
528
620
  if not self.url and not os.environ.get('SCANOSS_GRPC_URL'):
529
621
  self.url = DEFAULT_URL2 # API key specific and no alternative URL, so use the default premium
530
622
  self.api_key = value
531
623
  self.metadata.append((key, value))
624
+
625
+
532
626
  #
533
627
  # End of ScanossGrpc Class
534
628
  #
629
+
630
+
631
@dataclass
class GrpcConfig:
    """
    Bundle of settings used to configure the gRPC client.

    Every field has a default, so an instance can be created without arguments.
    """

    # Endpoint and credentials; defaults come from module-level constants.
    url: str = DEFAULT_URL
    api_key: Optional[str] = SCANOSS_API_KEY
    # Output verbosity switches.
    debug: Optional[bool] = False
    trace: Optional[bool] = False
    quiet: Optional[bool] = False
    ver_details: Optional[str] = None
    # Transport options.
    ca_cert: Optional[str] = None
    pac: Optional[PACFile] = None
    timeout: Optional[int] = DEFAULT_TIMEOUT
    proxy: Optional[str] = None
    grpc_proxy: Optional[str] = None
644
+
645
+
646
def create_grpc_config_from_args(args) -> GrpcConfig:
    """
    Build a GrpcConfig from an object exposing CLI-style attributes.

    Each config field is read with ``getattr`` from ``args``; when the attribute is
    missing the listed default is used. An attribute that exists but holds ``None``
    is passed through unchanged.
    """
    # config field -> (attribute name on args, fallback when the attribute is absent)
    field_sources = {
        'url': ('api2url', DEFAULT_URL),
        'api_key': ('key', SCANOSS_API_KEY),
        'debug': ('debug', False),
        'trace': ('trace', False),
        'quiet': ('quiet', False),
        'ver_details': ('ver_details', None),
        'ca_cert': ('ca_cert', None),
        'pac': ('pac', None),
        'timeout': ('timeout', DEFAULT_TIMEOUT),
        'proxy': ('proxy', None),
        'grpc_proxy': ('grpc_proxy', None),
    }
    settings = {
        field: getattr(args, attr_name, fallback) for field, (attr_name, fallback) in field_sources.items()
    }
    return GrpcConfig(**settings)
@@ -23,13 +23,13 @@ SPDX-License-Identifier: MIT
23
23
  """
24
24
 
25
25
  import os
26
+ import queue
26
27
  import sys
27
28
  import threading
28
- import queue
29
29
  import time
30
-
31
- from typing import Dict, List
32
30
  from dataclasses import dataclass
31
+ from typing import Dict, List
32
+
33
33
  from progress.bar import Bar
34
34
 
35
35
  from .scanossapi import ScanossApi
@@ -49,8 +49,6 @@ class ThreadedScanning(ScanossBase):
49
49
  Multiple threads pull messages off this queue, process the request and put the results into an output queue
50
50
  """
51
51
 
52
- inputs: queue.Queue = queue.Queue()
53
- output: queue.Queue = queue.Queue()
54
52
  bar: Bar = None
55
53
 
56
54
  def __init__(
@@ -65,6 +63,8 @@ class ThreadedScanning(ScanossBase):
65
63
  :param nb_threads: Number of thread to run (default 5)
66
64
  """
67
65
  super().__init__(debug, trace, quiet)
66
+ self.inputs = queue.Queue()
67
+ self.output = queue.Queue()
68
68
  self.scanapi = scanapi
69
69
  self.nb_threads = nb_threads
70
70
  self._isatty = sys.stderr.isatty()
@@ -134,7 +134,7 @@ class ThreadedScanning(ScanossBase):
134
134
  :param wfp: WFP to add to queue
135
135
  """
136
136
  if wfp is None or wfp == '':
137
- self.print_stderr(f'Warning: empty WFP. Skipping from scan...')
137
+ self.print_stderr('Warning: empty WFP. Skipping from scan...')
138
138
  else:
139
139
  self.inputs.put(wfp)
140
140
 
@@ -0,0 +1,103 @@
1
+ from abc import ABC, abstractmethod
2
+
3
+ from scanoss.scanossbase import ScanossBase
4
+
5
+
6
class AbstractPresenter(ABC):
    """
    Base class for presenting output in a chosen format.

    Concrete subclasses must implement every ``_format_*_output`` hook declared here.
    """

    def __init__(
        self,
        debug: bool = False,
        trace: bool = False,
        quiet: bool = False,
        output_file: str = None,
        output_format: str = None,
    ):
        """
        Store the default output file/format and create the ScanossBase used for printing.
        """
        self.AVAILABLE_OUTPUT_FORMATS = ['json', 'plain', 'cyclonedx', 'spdxlite', 'csv', 'raw']
        self.base = ScanossBase(debug=debug, trace=trace, quiet=quiet)
        self.output_file = output_file
        self.output_format = output_format

    def present(self, output_format: str = None, output_file: str = None):
        """
        Format the data and hand it to ``_present_output``.

        Arguments given here take precedence over the values stored at construction.
        An empty/None format falls back to plain output.

        Raises:
            ValueError: If a format is given that is not in AVAILABLE_OUTPUT_FORMATS.
        """
        destination = output_file or self.output_file
        chosen = output_format or self.output_format

        if chosen and chosen not in self.AVAILABLE_OUTPUT_FORMATS:
            raise ValueError(
                f"ERROR: Invalid output format '{chosen}'. Valid values are: {', '.join(self.AVAILABLE_OUTPUT_FORMATS)}"
            )

        renderers = {
            'json': self._format_json_output,
            'plain': self._format_plain_output,
            'cyclonedx': self._format_cyclonedx_output,
            'spdxlite': self._format_spdxlite_output,
            'csv': self._format_csv_output,
            'raw': self._format_raw_output,
        }
        render = renderers.get(chosen) if chosen else None
        if render is None:
            # No format (or one without a dedicated renderer) means plain output.
            render = self._format_plain_output

        self._present_output(render(), destination)

    def _present_output(self, content: str, file_path: str = None):
        """
        Delegate to ``ScanossBase.print_to_file_or_stdout`` with the content and optional path.
        """
        self.base.print_to_file_or_stdout(content, file_path)

    @abstractmethod
    def _format_cyclonedx_output(self) -> str:
        """
        Return a CycloneDX string representation of the data.
        """

    @abstractmethod
    def _format_spdxlite_output(self) -> str:
        """
        Return a SPDX-Lite string representation of the data.
        """

    @abstractmethod
    def _format_csv_output(self) -> str:
        """
        Return a CSV string representation of the data.
        """

    @abstractmethod
    def _format_json_output(self) -> str:
        """
        Return a JSON string representation of the data.
        """

    @abstractmethod
    def _format_plain_output(self) -> str:
        """
        Return a plain text string representation of the data.
        """

    @abstractmethod
    def _format_raw_output(self) -> str:
        """
        Return a raw string representation of the data.
        """
scanoss/utils/crc64.py ADDED
@@ -0,0 +1,96 @@
1
+ """
2
+ SPDX-License-Identifier: MIT
3
+
4
+ Copyright (c) 2025, SCANOSS
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in
14
+ all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ THE SOFTWARE.
23
+ """
24
+
25
+ import struct
26
+ from typing import List
27
+
28
+
29
class CRC64:
    """
    CRC64 ECMA implementation intended to match Go's hash/crc64 package.
    Uses polynomial: 0xC96C5795D7870F42

    The running value starts at all ones and is XOR-ed with all ones again
    when the digest is read.
    """

    POLY = 0xC96C5795D7870F42
    _TABLE = None  # 256-entry lookup table, built once and shared by all instances

    def __init__(self):
        if CRC64._TABLE is None:
            CRC64._TABLE = self._make_table()
        self.crc = 0xFFFFFFFFFFFFFFFF  # Initial value

    def _make_table(self) -> List[int]:
        """Generate the 256-entry CRC64 lookup table (bit-by-bit, LSB first)."""
        table = []
        for i in range(256):
            crc = i
            for _ in range(8):
                if crc & 1:
                    crc = (crc >> 1) ^ self.POLY
                else:
                    crc >>= 1
            table.append(crc)
        return table

    def update(self, data: bytes) -> None:
        """
        Update the CRC with new data.

        A ``str`` is also accepted and is encoded as UTF-8 before hashing.
        """
        if isinstance(data, str):
            data = data.encode('utf-8')

        crc = self.crc
        table = CRC64._TABLE  # class-level table, bound once outside the loop
        for b in data:
            crc = (crc >> 8) ^ table[(crc ^ b) & 0xFF]
        self.crc = crc

    def digest(self) -> int:
        """Get the current CRC value (running value with the final XOR applied)."""
        return self.crc ^ 0xFFFFFFFFFFFFFFFF  # Final XOR value

    def hexdigest(self) -> str:
        """Get the current CRC value as a 16-character lowercase hexadecimal string."""
        return format(self.digest(), '016x')

    @classmethod
    def checksum(cls, data: bytes) -> int:
        """Calculate CRC64 checksum for the given data."""
        crc = cls()
        crc.update(data)
        return crc.digest()

    @classmethod
    def get_hash_buff(cls, buff: bytes) -> List[int]:
        """
        Get the hash value of the given buffer as its 8 bytes in big-endian order.

        Args:
            buff (bytes): The buffer to get the hash value of.

        Returns:
            List[int]: Eight integers (0-255), most significant byte first. The previous
            annotation said ``List[bytes]``, but iterating packed bytes yields ints.
        """
        crc = cls()
        crc.update(buff)
        hash_val = crc.digest()

        return list(struct.pack('>Q', hash_val))
@@ -0,0 +1,198 @@
1
+ """
2
+ SPDX-License-Identifier: MIT
3
+
4
+ Copyright (c) 2025, SCANOSS
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in
14
+ all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ THE SOFTWARE.
23
+ """
24
+
25
+ import re
26
+ import unicodedata
27
+
28
FNV64_OFFSET_BASIS = 14695981039346656037
FNV64_PRIME = 1099511628211
MASK64 = 0xFFFFFFFFFFFFFFFF


def fnv1_64(data: bytes) -> int:
    """
    Return the 64-bit FNV-1 hash of ``data``.

    For each byte the running hash is multiplied by the FNV prime (kept to
    64 bits) and then XOR-ed with the byte.
    """
    digest = FNV64_OFFSET_BASIS
    for octet in data:
        digest = ((digest * FNV64_PRIME) & MASK64) ^ octet
    return digest
40
+
41
+
42
class SimhashFeature:
    """A single simhash feature: a hash value paired with a weight."""

    def __init__(self, hash_value: int, weight: int = 1):
        self.hash_value, self.weight = hash_value, weight

    def sum(self) -> int:
        """Return the stored hash value of this feature."""
        return self.hash_value

    def get_weight(self) -> int:
        """Return the stored weight of this feature."""
        return self.weight
54
+
55
+
56
def new_feature(f: bytes) -> SimhashFeature:
    """Build a weight-1 feature whose hash is the FNV-1 hash of ``f``."""
    hashed = fnv1_64(f)
    return SimhashFeature(hashed, 1)
59
+
60
+
61
def new_feature_with_weight(f: bytes, weight: int) -> SimhashFeature:
    """Build a feature with the given weight whose hash is the FNV-1 hash of ``f``."""
    hashed = fnv1_64(f)
    return SimhashFeature(hashed, weight)
64
+
65
+
66
def vectorize(features: list) -> list:
    """
    Fold a list of features into a 64-element vector.

    For every feature, its weight is added at each position whose bit is set
    in the feature's hash and subtracted at every other position.
    """
    totals = [0] * 64
    for feat in features:
        bits = feat.sum()
        weight = feat.get_weight()
        for pos in range(64):
            totals[pos] += weight if (bits >> pos) & 1 else -weight
    return totals
82
+
83
+
84
def vectorize_bytes(features: list) -> list:
    """
    Fold a list of byte slices into a 64-element vector.

    Each slice is hashed with FNV-1 and counted with weight 1: +1 at every
    position whose bit is set in the hash, -1 elsewhere.
    """
    totals = [0] * 64
    for chunk in features:
        digest = fnv1_64(chunk)
        for pos in range(64):
            totals[pos] += 1 if (digest >> pos) & 1 else -1
    return totals
98
+
99
+
100
def fingerprint(v: list) -> int:
    """
    Collapse a 64-element vector into a 64-bit fingerprint.

    Bit i of the result is 1 when v[i] >= 0 and 0 otherwise.
    """
    # Each term is a distinct power of two, so summing is the same as OR-ing.
    return sum(1 << pos for pos in range(64) if v[pos] >= 0)
110
+
111
+
112
def compare(a: int, b: int) -> int:
    """
    Calculate the Hamming distance between two 64-bit integers.
    (The number of differing bits.)

    The XOR is masked to 64 bits first. Python integers are unbounded and signed,
    so a negative XOR result (for example when only one operand is negative) would
    never reach zero under the clear-lowest-bit loop used previously and the call
    would hang. Masking treats the operands as unsigned 64-bit values.
    """
    diff = (a ^ b) & 0xFFFFFFFFFFFFFFFF
    return bin(diff).count('1')
123
+
124
+
125
def simhash(fs) -> int:
    """
    Compute the 64-bit simhash of a feature set.

    ``fs`` must expose ``get_features()``; the features are vectorized and the
    resulting vector is turned into a fingerprint.
    """
    feature_list = fs.get_features()
    vector = vectorize(feature_list)
    return fingerprint(vector)
131
+
132
+
133
def simhash_bytes(b: list) -> int:
    """
    Compute the simhash of a list of byte slices (each counted with weight 1).
    """
    vector = vectorize_bytes(b)
    return fingerprint(vector)
138
+
139
+
140
+ boundaries = re.compile(rb"[\w']+(?:\://[\w\./]+){0,1}")
141
+ unicode_boundaries = re.compile(r"[\w'-]+", re.UNICODE)
142
+
143
+
144
+ # --- Helper Functions for Feature Extraction ---
145
def _get_features_bytes(b: bytes, pattern: re.Pattern) -> list:
    """
    Find every match of ``pattern`` in the byte string and wrap each match
    in a feature created with new_feature.
    """
    return list(map(new_feature, pattern.findall(b)))
152
+
153
+
154
def _get_features_str(s: str, pattern) -> list:
    """
    Find every match of ``pattern`` in the string, encode each match as UTF-8,
    and wrap it in a feature created with new_feature.
    """
    features = []
    for token in pattern.findall(s):
        features.append(new_feature(token.encode('utf-8')))
    return features
161
+
162
+
163
class WordFeatureSet:
    """Feature set built from the matches of ``boundaries`` in a lowercased byte string."""

    def __init__(self, b: bytes):
        # Lowercase once up front so feature extraction is case-insensitive.
        lowered = b.lower()
        self.b = lowered

    def get_features(self) -> list:
        """Return one feature per match of the byte-level ``boundaries`` pattern."""
        return _get_features_bytes(self.b, boundaries)
170
+
171
+
172
class UnicodeWordFeatureSet:
    """Feature set built from UTF-8 text after Unicode normalization and lowercasing."""

    def __init__(self, b: bytes, norm_form: str = 'NFC'):
        # Decode as UTF-8, normalize with the requested form, then lowercase.
        decoded = b.decode('utf-8')
        self.text = unicodedata.normalize(norm_form, decoded).lower()

    def get_features(self) -> list:
        """Return one feature per match of the ``unicode_boundaries`` pattern."""
        return _get_features_str(self.text, unicode_boundaries)
181
+
182
+
183
def shingle(w: int, b: list) -> list:
    """
    Build the w-shingling of a list of byte slices.

    Consecutive runs of ``w`` slices are joined with a single space, e.g.
    [b"this", b"is", b"a", b"test"] with w == 2 gives
    [b"this is", b"is a", b"a test"]. With w == 1 the input list itself is
    returned; a ``w`` larger than the list is reduced to the list length.

    Raises:
        ValueError: If ``w`` is smaller than 1.
    """
    if w < 1:
        raise ValueError('simhash.shingle(): k must be a positive integer')
    if w == 1:
        return b
    width = min(w, len(b))
    last_start = len(b) - width
    return [b' '.join(b[start : start + width]) for start in range(last_start + 1)]