scanoss 1.26.2__py3-none-any.whl → 1.27.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scanoss/__init__.py CHANGED
@@ -22,4 +22,4 @@ SPDX-License-Identifier: MIT
22
22
  THE SOFTWARE.
23
23
  """
24
24
 
25
- __version__ = '1.26.2'
25
+ __version__ = '1.27.0'
@@ -16,28 +16,34 @@ from google.api import annotations_pb2 as google_dot_api_dot_annotations__pb2
16
16
  from protoc_gen_swagger.options import annotations_pb2 as protoc__gen__swagger_dot_options_dot_annotations__pb2
17
17
 
18
18
 
19
- DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n.scanoss/api/scanning/v2/scanoss-scanning.proto\x12\x17scanoss.api.scanning.v2\x1a*scanoss/api/common/v2/scanoss-common.proto\x1a\x1cgoogle/api/annotations.proto\x1a,protoc-gen-swagger/options/annotations.proto\"\xff\x01\n\nHFHRequest\x12\x12\n\nbest_match\x18\x01 \x01(\x08\x12\x11\n\tthreshold\x18\x02 \x01(\x05\x12:\n\x04root\x18\x03 \x01(\x0b\x32,.scanoss.api.scanning.v2.HFHRequest.Children\x1a\x8d\x01\n\x08\x43hildren\x12\x0f\n\x07path_id\x18\x01 \x01(\t\x12\x16\n\x0esim_hash_names\x18\x02 \x01(\t\x12\x18\n\x10sim_hash_content\x18\x03 \x01(\t\x12>\n\x08\x63hildren\x18\x04 \x03(\x0b\x32,.scanoss.api.scanning.v2.HFHRequest.Children\"\xc1\x02\n\x0bHFHResponse\x12<\n\x07results\x18\x01 \x03(\x0b\x32+.scanoss.api.scanning.v2.HFHResponse.Result\x12\x35\n\x06status\x18\x02 \x01(\x0b\x32%.scanoss.api.common.v2.StatusResponse\x1a\x39\n\tComponent\x12\x0c\n\x04purl\x18\x01 \x01(\t\x12\x10\n\x08versions\x18\x02 \x03(\t\x12\x0c\n\x04rank\x18\x03 \x01(\x05\x1a\x81\x01\n\x06Result\x12\x0f\n\x07path_id\x18\x01 \x01(\t\x12\x42\n\ncomponents\x18\x02 \x03(\x0b\x32..scanoss.api.scanning.v2.HFHResponse.Component\x12\x13\n\x0bprobability\x18\x03 \x01(\x02\x12\r\n\x05stage\x18\x04 \x01(\x05\x32\x81\x02\n\x08Scanning\x12q\n\x04\x45\x63ho\x12\".scanoss.api.common.v2.EchoRequest\x1a#.scanoss.api.common.v2.EchoResponse\" \x82\xd3\xe4\x93\x02\x1a\"\x15/api/v2/scanning/echo:\x01*\x12\x81\x01\n\x0e\x46olderHashScan\x12#.scanoss.api.scanning.v2.HFHRequest\x1a$.scanoss.api.scanning.v2.HFHResponse\"$\x82\xd3\xe4\x93\x02\x1e\"\x19/api/v2/scanning/hfh/scan:\x01*B\x8a\x02Z1github.com/scanoss/papi/api/scanningv2;scanningv2\x92\x41\xd3\x01\x12m\n\x18SCANOSS Scanning Service\"L\n\x10scanoss-scanning\x12#https://github.com/scanoss/scanning\x1a\x13support@scanoss.com2\x03\x32.0*\x01\x01\x32\x10\x61pplication/json:\x10\x61pplication/jsonR;\n\x03\x34\x30\x34\x12\x34\n*Returned when the resource does not exist.\x12\x06\n\x04\x9a\x02\x01\x07\x62\x06proto3')
19
+ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n.scanoss/api/scanning/v2/scanoss-scanning.proto\x12\x17scanoss.api.scanning.v2\x1a*scanoss/api/common/v2/scanoss-common.proto\x1a\x1cgoogle/api/annotations.proto\x1a,protoc-gen-swagger/options/annotations.proto\"\xc5\x03\n\nHFHRequest\x12:\n\x04root\x18\x01 \x01(\x0b\x32,.scanoss.api.scanning.v2.HFHRequest.Children\x12\x16\n\x0erank_threshold\x18\x02 \x01(\x05\x12\x10\n\x08\x63\x61tegory\x18\x03 \x01(\t\x12\x13\n\x0bquery_limit\x18\x04 \x01(\x05\x1a\xbb\x02\n\x08\x43hildren\x12\x0f\n\x07path_id\x18\x01 \x01(\t\x12\x16\n\x0esim_hash_names\x18\x02 \x01(\t\x12\x18\n\x10sim_hash_content\x18\x03 \x01(\t\x12>\n\x08\x63hildren\x18\x04 \x03(\x0b\x32,.scanoss.api.scanning.v2.HFHRequest.Children\x12\x1a\n\x12sim_hash_dir_names\x18\x05 \x01(\t\x12Y\n\x0flang_extensions\x18\x06 \x03(\x0b\x32@.scanoss.api.scanning.v2.HFHRequest.Children.LangExtensionsEntry\x1a\x35\n\x13LangExtensionsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\x05:\x02\x38\x01\"\xa3\x03\n\x0bHFHResponse\x12<\n\x07results\x18\x01 \x03(\x0b\x32+.scanoss.api.scanning.v2.HFHResponse.Result\x12\x35\n\x06status\x18\x02 \x01(\x0b\x32%.scanoss.api.common.v2.StatusResponse\x1a)\n\x07Version\x12\x0f\n\x07version\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x1a\x94\x01\n\tComponent\x12\x0c\n\x04purl\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x0e\n\x06vendor\x18\x03 \x01(\t\x12>\n\x08versions\x18\x04 \x03(\x0b\x32,.scanoss.api.scanning.v2.HFHResponse.Version\x12\x0c\n\x04rank\x18\x05 \x01(\x05\x12\r\n\x05order\x18\x06 \x01(\x05\x1a]\n\x06Result\x12\x0f\n\x07path_id\x18\x01 \x01(\t\x12\x42\n\ncomponents\x18\x02 \x03(\x0b\x32..scanoss.api.scanning.v2.HFHResponse.Component2\x81\x02\n\x08Scanning\x12q\n\x04\x45\x63ho\x12\".scanoss.api.common.v2.EchoRequest\x1a#.scanoss.api.common.v2.EchoResponse\" \x82\xd3\xe4\x93\x02\x1a\"\x15/api/v2/scanning/echo:\x01*\x12\x81\x01\n\x0e\x46olderHashScan\x12#.scanoss.api.scanning.v2.HFHRequest\x1a$.scanoss.api.scanning.v2.HFHResponse\"$\x82\xd3\xe4\x93\x02\x1e\"\x19/api/v2/scanning/hfh/scan:\x01*B\x8a\x02Z1github.com/scanoss/papi/api/scanningv2;scanningv2\x92\x41\xd3\x01\x12m\n\x18SCANOSS Scanning Service\"L\n\x10scanoss-scanning\x12#https://github.com/scanoss/scanning\x1a\x13support@scanoss.com2\x03\x32.0*\x01\x01\x32\x10\x61pplication/json:\x10\x61pplication/jsonR;\n\x03\x34\x30\x34\x12\x34\n*Returned when the resource does not exist.\x12\x06\n\x04\x9a\x02\x01\x07\x62\x06proto3')
20
20
 
21
- _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
22
- _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'scanoss.api.scanning.v2.scanoss_scanning_pb2', globals())
23
- if _descriptor._USE_C_DESCRIPTORS == False:
24
-
25
- DESCRIPTOR._options = None
26
- DESCRIPTOR._serialized_options = b'Z1github.com/scanoss/papi/api/scanningv2;scanningv2\222A\323\001\022m\n\030SCANOSS Scanning Service\"L\n\020scanoss-scanning\022#https://github.com/scanoss/scanning\032\023support@scanoss.com2\0032.0*\001\0012\020application/json:\020application/jsonR;\n\003404\0224\n*Returned when the resource does not exist.\022\006\n\004\232\002\001\007'
27
- _SCANNING.methods_by_name['Echo']._options = None
28
- _SCANNING.methods_by_name['Echo']._serialized_options = b'\202\323\344\223\002\032\"\025/api/v2/scanning/echo:\001*'
29
- _SCANNING.methods_by_name['FolderHashScan']._options = None
30
- _SCANNING.methods_by_name['FolderHashScan']._serialized_options = b'\202\323\344\223\002\036\"\031/api/v2/scanning/hfh/scan:\001*'
31
- _HFHREQUEST._serialized_start=196
32
- _HFHREQUEST._serialized_end=451
33
- _HFHREQUEST_CHILDREN._serialized_start=310
34
- _HFHREQUEST_CHILDREN._serialized_end=451
35
- _HFHRESPONSE._serialized_start=454
36
- _HFHRESPONSE._serialized_end=775
37
- _HFHRESPONSE_COMPONENT._serialized_start=586
38
- _HFHRESPONSE_COMPONENT._serialized_end=643
39
- _HFHRESPONSE_RESULT._serialized_start=646
40
- _HFHRESPONSE_RESULT._serialized_end=775
41
- _SCANNING._serialized_start=778
42
- _SCANNING._serialized_end=1035
21
+ _globals = globals()
22
+ _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
23
+ _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'scanoss.api.scanning.v2.scanoss_scanning_pb2', _globals)
24
+ if not _descriptor._USE_C_DESCRIPTORS:
25
+ _globals['DESCRIPTOR']._loaded_options = None
26
+ _globals['DESCRIPTOR']._serialized_options = b'Z1github.com/scanoss/papi/api/scanningv2;scanningv2\222A\323\001\022m\n\030SCANOSS Scanning Service\"L\n\020scanoss-scanning\022#https://github.com/scanoss/scanning\032\023support@scanoss.com2\0032.0*\001\0012\020application/json:\020application/jsonR;\n\003404\0224\n*Returned when the resource does not exist.\022\006\n\004\232\002\001\007'
27
+ _globals['_HFHREQUEST_CHILDREN_LANGEXTENSIONSENTRY']._loaded_options = None
28
+ _globals['_HFHREQUEST_CHILDREN_LANGEXTENSIONSENTRY']._serialized_options = b'8\001'
29
+ _globals['_SCANNING'].methods_by_name['Echo']._loaded_options = None
30
+ _globals['_SCANNING'].methods_by_name['Echo']._serialized_options = b'\202\323\344\223\002\032\"\025/api/v2/scanning/echo:\001*'
31
+ _globals['_SCANNING'].methods_by_name['FolderHashScan']._loaded_options = None
32
+ _globals['_SCANNING'].methods_by_name['FolderHashScan']._serialized_options = b'\202\323\344\223\002\036\"\031/api/v2/scanning/hfh/scan:\001*'
33
+ _globals['_HFHREQUEST']._serialized_start=196
34
+ _globals['_HFHREQUEST']._serialized_end=649
35
+ _globals['_HFHREQUEST_CHILDREN']._serialized_start=334
36
+ _globals['_HFHREQUEST_CHILDREN']._serialized_end=649
37
+ _globals['_HFHREQUEST_CHILDREN_LANGEXTENSIONSENTRY']._serialized_start=596
38
+ _globals['_HFHREQUEST_CHILDREN_LANGEXTENSIONSENTRY']._serialized_end=649
39
+ _globals['_HFHRESPONSE']._serialized_start=652
40
+ _globals['_HFHRESPONSE']._serialized_end=1071
41
+ _globals['_HFHRESPONSE_VERSION']._serialized_start=784
42
+ _globals['_HFHRESPONSE_VERSION']._serialized_end=825
43
+ _globals['_HFHRESPONSE_COMPONENT']._serialized_start=828
44
+ _globals['_HFHRESPONSE_COMPONENT']._serialized_end=976
45
+ _globals['_HFHRESPONSE_RESULT']._serialized_start=978
46
+ _globals['_HFHRESPONSE_RESULT']._serialized_end=1071
47
+ _globals['_SCANNING']._serialized_start=1074
48
+ _globals['_SCANNING']._serialized_end=1331
43
49
  # @@protoc_insertion_point(module_scope)
scanoss/cli.py CHANGED
@@ -54,6 +54,7 @@ from . import __version__
54
54
  from .components import Components
55
55
  from .constants import (
56
56
  DEFAULT_API_TIMEOUT,
57
+ DEFAULT_HFH_RANK_THRESHOLD,
57
58
  DEFAULT_POST_SIZE,
58
59
  DEFAULT_RETRY,
59
60
  DEFAULT_TIMEOUT,
@@ -623,24 +624,16 @@ def setup_args() -> None: # noqa: PLR0912, PLR0915
623
624
  '--format',
624
625
  '-f',
625
626
  type=str,
626
- choices=['json'],
627
+ choices=['json', 'cyclonedx'],
627
628
  default='json',
628
629
  help='Result output format (optional - default: json)',
629
630
  )
630
631
  p_folder_scan.add_argument(
631
- '--best-match',
632
- '-bm',
633
- action='store_true',
634
- default=False,
635
- help='Enable best match mode (optional - default: False)',
636
- )
637
- p_folder_scan.add_argument(
638
- '--threshold',
632
+ '--rank-threshold',
639
633
  type=int,
640
- choices=range(1, 101),
641
- metavar='1-100',
642
- default=100,
643
- help='Threshold for result matching (optional - default: 100)',
634
+ default=DEFAULT_HFH_RANK_THRESHOLD,
635
+ help='Filter results to only show those with rank value at or below this threshold (e.g., --rank-threshold 3 '
636
+ 'returns results with rank 1, 2, or 3). Lower rank values indicate higher quality matches.',
644
637
  )
645
638
  p_folder_scan.set_defaults(func=folder_hashing_scan)
646
639
 
@@ -1455,7 +1448,7 @@ def utils_certloc(*_):
1455
1448
  Run the "utils certloc" sub-command
1456
1449
  :param _: ignored/unused
1457
1450
  """
1458
- import certifi # noqa: PLC0415,I001
1451
+ import certifi # noqa: PLC0415,I001
1459
1452
 
1460
1453
  print(f'CA Cert File: {certifi.where()}')
1461
1454
 
@@ -1466,11 +1459,11 @@ def utils_cert_download(_, args): # pylint: disable=PLR0912 # noqa: PLR0912
1466
1459
  :param _: ignore/unused
1467
1460
  :param args: Parsed arguments
1468
1461
  """
1469
- import socket # noqa: PLC0415,I001
1470
- import traceback # noqa: PLC0415,I001
1471
- from urllib.parse import urlparse # noqa: PLC0415,I001
1462
+ import socket # noqa: PLC0415,I001
1463
+ import traceback # noqa: PLC0415,I001
1464
+ from urllib.parse import urlparse # noqa: PLC0415,I001
1472
1465
 
1473
- from OpenSSL import SSL, crypto # noqa: PLC0415,I001
1466
+ from OpenSSL import SSL, crypto # noqa: PLC0415,I001
1474
1467
 
1475
1468
  file = sys.stdout
1476
1469
  if args.output:
@@ -1518,7 +1511,7 @@ def utils_pac_proxy(_, args):
1518
1511
  :param _: ignore/unused
1519
1512
  :param args: Parsed arguments
1520
1513
  """
1521
- from pypac.resolver import ProxyResolver # noqa: PLC0415,I001
1514
+ from pypac.resolver import ProxyResolver # noqa: PLC0415,I001
1522
1515
 
1523
1516
  if not args.pac:
1524
1517
  print_stderr('Error: No pac file option specified.')
@@ -1592,7 +1585,7 @@ def crypto_algorithms(parser, args):
1592
1585
  sys.exit(1)
1593
1586
  except Exception as e:
1594
1587
  if args.debug:
1595
- import traceback # noqa: PLC0415,I001
1588
+ import traceback # noqa: PLC0415,I001
1596
1589
 
1597
1590
  traceback.print_exc()
1598
1591
  print_stderr(f'ERROR: {e}')
@@ -1634,7 +1627,7 @@ def crypto_hints(parser, args):
1634
1627
  sys.exit(1)
1635
1628
  except Exception as e:
1636
1629
  if args.debug:
1637
- import traceback # noqa: PLC0415,I001
1630
+ import traceback # noqa: PLC0415,I001
1638
1631
 
1639
1632
  traceback.print_exc()
1640
1633
  print_stderr(f'ERROR: {e}')
@@ -1676,7 +1669,7 @@ def crypto_versions_in_range(parser, args):
1676
1669
  sys.exit(1)
1677
1670
  except Exception as e:
1678
1671
  if args.debug:
1679
- import traceback # noqa: PLC0415,I001
1672
+ import traceback # noqa: PLC0415,I001
1680
1673
 
1681
1674
  traceback.print_exc()
1682
1675
  print_stderr(f'ERROR: {e}')
@@ -1965,11 +1958,9 @@ def folder_hashing_scan(parser, args):
1965
1958
  config=scanner_config,
1966
1959
  client=client,
1967
1960
  scanoss_settings=scanoss_settings,
1961
+ rank_threshold=args.rank_threshold,
1968
1962
  )
1969
1963
 
1970
- scanner.best_match = args.best_match
1971
- scanner.threshold = args.threshold
1972
-
1973
1964
  if scanner.scan():
1974
1965
  scanner.present(output_file=args.output, output_format=args.format)
1975
1966
  except ScanossGrpcError as e:
scanoss/constants.py CHANGED
@@ -12,3 +12,5 @@ DEFAULT_URL = 'https://api.osskb.org' # default free service URL
12
12
  DEFAULT_URL2 = 'https://api.scanoss.com' # default premium service URL
13
13
 
14
14
  DEFAULT_API_TIMEOUT = 600
15
+
16
+ DEFAULT_HFH_RANK_THRESHOLD = 5
@@ -1 +1 @@
1
- date: 20250624181246, utime: 1750788766
1
+ date: 20250708100043, utime: 1751968843
scanoss/file_filters.py CHANGED
@@ -25,7 +25,7 @@ SPDX-License-Identifier: MIT
25
25
  import os
26
26
  import sys
27
27
  from pathlib import Path
28
- from typing import List
28
+ from typing import List, Optional
29
29
 
30
30
  from pathspec import GitIgnoreSpec
31
31
 
@@ -511,7 +511,7 @@ class FileFilters(ScanossBase):
511
511
  # Now filter the files and return the reduced list
512
512
  return self.get_filtered_files_from_files(all_files, str(root_path))
513
513
 
514
- def get_filtered_files_from_files(self, files: List[str], scan_root: str = None) -> List[str]:
514
+ def get_filtered_files_from_files(self, files: List[str], scan_root: Optional[str] = None) -> List[str]:
515
515
  """
516
516
  Retrieve a list of files to scan or fingerprint from a given list of files based on filter settings.
517
517
 
@@ -615,8 +615,13 @@ class FileFilters(ScanossBase):
615
615
  # Default patterns for skipping directories
616
616
  if not self.all_folders:
617
617
  DEFAULT_SKIPPED_DIR_LIST = DEFAULT_SKIPPED_DIRS_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_DIRS
618
+ DEFAULT_SKIPPED_DIR_EXT_LIST = (
619
+ DEFAULT_SKIPPED_DIR_EXT_HFH if self.is_folder_hashing_scan else DEFAULT_SKIPPED_DIR_EXT
620
+ )
618
621
  for dir_name in DEFAULT_SKIPPED_DIR_LIST:
619
622
  patterns.append(f'{dir_name}/')
623
+ for dir_extension in DEFAULT_SKIPPED_DIR_EXT_LIST:
624
+ patterns.append(f'*{dir_extension}/')
620
625
 
621
626
  # Custom patterns added in SCANOSS settings file
622
627
  if self.scanoss_settings:
@@ -37,7 +37,7 @@ class Copyleft(PolicyCheck):
37
37
  def __init__( # noqa: PLR0913
38
38
  self,
39
39
  debug: bool = False,
40
- trace: bool = True,
40
+ trace: bool = False,
41
41
  quiet: bool = False,
42
42
  filepath: str = None,
43
43
  format_type: str = 'json',
@@ -66,7 +66,7 @@ class InspectBase(ScanossBase):
66
66
  def __init__( # noqa: PLR0913
67
67
  self,
68
68
  debug: bool = False,
69
- trace: bool = True,
69
+ trace: bool = False,
70
70
  quiet: bool = False,
71
71
  filepath: str = None,
72
72
  output: str = None,
@@ -152,9 +152,6 @@ class InspectBase(ScanossBase):
152
152
  'declared': 1 if status == 'identified' else 0,
153
153
  'undeclared': 1 if status == 'pending' else 0
154
154
  }
155
- if not new_component.get('licenses'):
156
- self.print_debug(f'WARNING: Results missing licenses. Skipping: {new_component}')
157
- return components
158
155
 
159
156
  ## Append license to component
160
157
  self._append_license_to_component(components, new_component, component_key)
@@ -179,6 +176,11 @@ class InspectBase(ScanossBase):
179
176
  new_component: Component whose licenses need to be processed
180
177
  component_key: purl + version of the component to be updated
181
178
  """
179
+ # If not licenses are present
180
+ if not new_component.get('licenses'):
181
+ self.print_debug(f'WARNING: Results missing licenses. Skipping: {new_component}')
182
+ return
183
+
182
184
  licenses_order_by_source_priority = self._get_licenses_order_by_source_priority(new_component['licenses'])
183
185
  # Process licenses for this component
184
186
  for license_item in licenses_order_by_source_priority:
@@ -42,7 +42,7 @@ class LicenseSummary(InspectBase):
42
42
  def __init__( # noqa: PLR0913
43
43
  self,
44
44
  debug: bool = False,
45
- trace: bool = True,
45
+ trace: bool = False,
46
46
  quiet: bool = False,
47
47
  filepath: str = None,
48
48
  status: str = None,
@@ -64,7 +64,7 @@ class PolicyCheck(InspectBase):
64
64
  def __init__( # noqa: PLR0913
65
65
  self,
66
66
  debug: bool = False,
67
- trace: bool = True,
67
+ trace: bool = False,
68
68
  quiet: bool = False,
69
69
  filepath: str = None,
70
70
  format_type: str = None,
@@ -37,7 +37,7 @@ class UndeclaredComponent(PolicyCheck):
37
37
  def __init__( # noqa: PLR0913
38
38
  self,
39
39
  debug: bool = False,
40
- trace: bool = True,
40
+ trace: bool = False,
41
41
  quiet: bool = False,
42
42
  filepath: str = None,
43
43
  format_type: str = 'json',
@@ -15,7 +15,7 @@ from scanoss.utils.simhash import WordFeatureSet, fingerprint, simhash, vectoriz
15
15
 
16
16
  MINIMUM_FILE_COUNT = 8
17
17
  MINIMUM_CONCATENATED_NAME_LENGTH = 32
18
- MINIMUM_FILE_NAME_LENGTH = 32
18
+ MAXIMUM_FILE_NAME_LENGTH = 32
19
19
 
20
20
 
21
21
  class DirectoryNode:
@@ -35,7 +35,7 @@ class DirectoryFile:
35
35
  Represents a file in the directory tree for folder hashing.
36
36
  """
37
37
 
38
- def __init__(self, path: str, key: bytes, key_str: str):
38
+ def __init__(self, path: str, key: List[bytes], key_str: str):
39
39
  self.path = path
40
40
  self.key = key
41
41
  self.key_str = key_str
@@ -77,7 +77,7 @@ class FolderHasher:
77
77
  def __init__(
78
78
  self,
79
79
  scan_dir: str,
80
- config: Optional[FolderHasherConfig] = None,
80
+ config: FolderHasherConfig,
81
81
  scanoss_settings: Optional[ScanossSettings] = None,
82
82
  ):
83
83
  self.base = ScanossBase(
@@ -140,7 +140,7 @@ class FolderHasher:
140
140
  root_node = DirectoryNode(str(root))
141
141
 
142
142
  all_files = [
143
- f for f in root.rglob('*') if f.is_file() and len(f.name.encode('utf-8')) <= MINIMUM_FILE_NAME_LENGTH
143
+ f for f in root.rglob('*') if f.is_file() and len(f.name.encode('utf-8')) <= MAXIMUM_FILE_NAME_LENGTH
144
144
  ]
145
145
  filtered_files = self.file_filters.get_filtered_files_from_files(all_files, str(root))
146
146
 
@@ -185,7 +185,7 @@ class FolderHasher:
185
185
  Recursively compute folder hash data for a directory node.
186
186
 
187
187
  The hash data includes the path identifier, simhash for file names,
188
- simhash for file content, and children node hash information.
188
+ simhash for file content, directory hash, language extensions, and children node hash information.
189
189
 
190
190
  Args:
191
191
  node (DirectoryNode): The directory node to compute the hash for.
@@ -194,11 +194,22 @@ class FolderHasher:
194
194
  dict: The computed hash data for the node.
195
195
  """
196
196
  hash_data = self._hash_calc(node)
197
+
198
+ # Safely calculate relative path
199
+ try:
200
+ node_path = Path(node.path).resolve()
201
+ scan_dir_path = Path(self.scan_dir).resolve()
202
+ rel_path = node_path.relative_to(scan_dir_path)
203
+ except ValueError:
204
+ # If relative_to fails, use the node path as is or a fallback
205
+ rel_path = Path(node.path).name if node.path else Path('.')
197
206
 
198
207
  return {
199
- 'path_id': node.path,
208
+ 'path_id': str(rel_path),
200
209
  'sim_hash_names': f'{hash_data["name_hash"]:02x}' if hash_data['name_hash'] is not None else None,
201
210
  'sim_hash_content': f'{hash_data["content_hash"]:02x}' if hash_data['content_hash'] is not None else None,
211
+ 'sim_hash_dir_names': f'{hash_data["dir_hash"]:02x}' if hash_data['dir_hash'] is not None else None,
212
+ 'lang_extensions': hash_data['lang_extensions'],
202
213
  'children': [self._hash_calc_from_node(child) for child in node.children.values()],
203
214
  }
204
215
 
@@ -215,9 +226,12 @@ class FolderHasher:
215
226
  node (DirectoryNode): The directory node containing file items.
216
227
 
217
228
  Returns:
218
- dict: A dictionary with 'name_hash' and 'content_hash' keys.
229
+ dict: A dictionary with 'name_hash', 'content_hash', 'dir_hash', and 'lang_extensions' keys.
219
230
  """
220
231
  processed_hashes = set()
232
+ unique_file_names = set()
233
+ unique_directories = set()
234
+ extension_map = {}
221
235
  file_hashes = []
222
236
  selected_names = []
223
237
 
@@ -225,37 +239,64 @@ class FolderHasher:
225
239
  key_str = file.key_str
226
240
  if key_str in processed_hashes:
227
241
  continue
228
- processed_hashes.add(key_str)
229
242
 
230
- selected_names.append(os.path.basename(file.path))
243
+ file_name = os.path.basename(file.path)
244
+
245
+ file_name_without_extension, extension = os.path.splitext(file_name)
246
+ current_directory = os.path.dirname(file.path)
247
+
248
+ if extension and len(extension) > 1:
249
+ ext_without_dot = extension[1:]
250
+ extension_map[ext_without_dot] = extension_map.get(ext_without_dot, 0) + 1
251
+
252
+ current_directory.replace(self.scan_dir, '', 1).lstrip(os.path.sep)
253
+ parts = current_directory.split(os.path.sep)
254
+ for d in parts:
255
+ if d in {'', '.', '..'}:
256
+ continue
257
+ unique_directories.add(d)
231
258
 
232
- file_key = bytes(file.key)
233
- file_hashes.append(file_key)
259
+ processed_hashes.add(key_str)
260
+ unique_file_names.add(file_name_without_extension)
261
+ selected_names.append(file_name)
262
+ file_hashes.append(file.key)
234
263
 
235
264
  if len(selected_names) < MINIMUM_FILE_COUNT:
236
- return {
237
- 'name_hash': None,
238
- 'content_hash': None,
239
- }
265
+ return {'name_hash': None, 'content_hash': None, 'dir_hash': None, 'lang_extensions': None}
240
266
 
241
267
  selected_names.sort()
242
268
  concatenated_names = ''.join(selected_names)
243
269
 
244
270
  if len(concatenated_names.encode('utf-8')) < MINIMUM_CONCATENATED_NAME_LENGTH:
245
- return {
246
- 'name_hash': None,
247
- 'content_hash': None,
248
- }
271
+ return {'name_hash': None, 'content_hash': None, 'dir_hash': None, 'lang_extensions': None}
272
+
273
+ # Concatenate the unique file names without the extensions, adding a space and sorting them alphabetically
274
+ unique_file_names_list = list(unique_file_names)
275
+ unique_file_names_list.sort()
276
+ concatenated_names = ' '.join(unique_file_names_list)
277
+
278
+ # We do the same for the directory names, adding a space and sorting them alphabetically
279
+ unique_directories_list = list(unique_directories)
280
+ unique_directories_list.sort()
281
+ concatenated_directories = ' '.join(unique_directories_list)
249
282
 
250
283
  names_simhash = simhash(WordFeatureSet(concatenated_names.encode('utf-8')))
284
+ dir_simhash = simhash(WordFeatureSet(concatenated_directories.encode('utf-8')))
251
285
  content_simhash = fingerprint(vectorize_bytes(file_hashes))
252
286
 
287
+ # Debug logging similar to Go implementation
288
+ self.base.print_debug(f'Unique file names: {unique_file_names_list}')
289
+ self.base.print_debug(f'Unique directories: {unique_directories_list}')
290
+ self.base.print_debug(f'{dir_simhash:x}/{names_simhash:x} - {content_simhash:x} - {extension_map}')
291
+
253
292
  return {
254
293
  'name_hash': names_simhash,
255
294
  'content_hash': content_simhash,
295
+ 'dir_hash': dir_simhash,
296
+ 'lang_extensions': extension_map,
256
297
  }
257
298
 
258
- def present(self, output_format: str = None, output_file: str = None):
299
+ def present(self, output_format: Optional[str] = None, output_file: Optional[str] = None):
259
300
  """Present the hashed tree in the selected format"""
260
301
  self.presenter.present(output_format=output_format, output_file=output_file)
261
302
 
@@ -29,6 +29,8 @@ from typing import Dict, Optional
29
29
 
30
30
  from progress.spinner import Spinner
31
31
 
32
+ from scanoss.constants import DEFAULT_HFH_RANK_THRESHOLD
33
+ from scanoss.cyclonedx import CycloneDx
32
34
  from scanoss.file_filters import FileFilters
33
35
  from scanoss.scanners.folder_hasher import FolderHasher
34
36
  from scanoss.scanners.scanner_config import ScannerConfig
@@ -52,6 +54,7 @@ class ScannerHFH:
52
54
  config: ScannerConfig,
53
55
  client: Optional[ScanossGrpc] = None,
54
56
  scanoss_settings: Optional[ScanossSettings] = None,
57
+ rank_threshold: int = DEFAULT_HFH_RANK_THRESHOLD,
55
58
  ):
56
59
  """
57
60
  Initialize the ScannerHFH.
@@ -61,6 +64,7 @@ class ScannerHFH:
61
64
  config (ScannerConfig): Configuration parameters for the scanner.
62
65
  client (ScanossGrpc): gRPC client for communicating with the scanning service.
63
66
  scanoss_settings (Optional[ScanossSettings]): Optional settings for Scanoss.
67
+ rank_threshold (int): Get results with rank below this threshold (default: 5).
64
68
  """
65
69
  self.base = ScanossBase(
66
70
  debug=config.debug,
@@ -88,8 +92,7 @@ class ScannerHFH:
88
92
  self.scan_dir = scan_dir
89
93
  self.client = client
90
94
  self.scan_results = None
91
- self.best_match = False
92
- self.threshold = 100
95
+ self.rank_threshold = rank_threshold
93
96
 
94
97
  def scan(self) -> Optional[Dict]:
95
98
  """
@@ -100,8 +103,7 @@ class ScannerHFH:
100
103
  """
101
104
  hfh_request = {
102
105
  'root': self.folder_hasher.hash_directory(self.scan_dir),
103
- 'threshold': self.threshold,
104
- 'best_match': self.best_match,
106
+ 'rank_threshold': self.rank_threshold,
105
107
  }
106
108
 
107
109
  spinner = Spinner('Scanning folder...')
@@ -161,7 +163,50 @@ class ScannerHFHPresenter(AbstractPresenter):
161
163
  )
162
164
 
163
165
  def _format_cyclonedx_output(self) -> str:
164
- raise NotImplementedError('CycloneDX output is not implemented')
166
+ if not self.scanner.scan_results:
167
+ return ''
168
+ try:
169
+ if 'results' not in self.scanner.scan_results or not self.scanner.scan_results['results']:
170
+ self.base.print_stderr('ERROR: No scan results found')
171
+ return ''
172
+
173
+ first_result = self.scanner.scan_results['results'][0]
174
+
175
+ best_match_components = [c for c in first_result.get('components', []) if c.get('order') == 1]
176
+ if not best_match_components:
177
+ self.base.print_stderr('ERROR: No best match component found')
178
+ return ''
179
+
180
+ best_match_component = best_match_components[0]
181
+ if not best_match_component.get('versions'):
182
+ self.base.print_stderr('ERROR: No versions found for best match component')
183
+ return ''
184
+
185
+ best_match_version = best_match_component['versions'][0]
186
+ purl = best_match_component['purl']
187
+
188
+ get_dependencies_json_request = {
189
+ 'files': [
190
+ {
191
+ 'file': f'{best_match_component["name"]}:{best_match_version["version"]}',
192
+ 'purls': [{'purl': purl, 'requirement': best_match_version['version']}],
193
+ }
194
+ ]
195
+ }
196
+
197
+ decorated_scan_results = self.scanner.client.get_dependencies(get_dependencies_json_request)
198
+
199
+ cdx = CycloneDx(self.base.debug, self.output_file)
200
+ scan_results = {}
201
+ for f in decorated_scan_results['files']:
202
+ scan_results[f['file']] = [f]
203
+ if not cdx.produce_from_json(scan_results, self.output_file):
204
+ error_msg = 'ERROR: Failed to produce CycloneDX output'
205
+ self.base.print_stderr(error_msg)
206
+ raise ValueError(error_msg)
207
+ except Exception as e:
208
+ self.base.print_stderr(f'ERROR: Failed to get license information: {e}')
209
+ return None
165
210
 
166
211
  def _format_spdxlite_output(self) -> str:
167
212
  raise NotImplementedError('SPDXlite output is not implemented')
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scanoss
3
- Version: 1.26.2
3
+ Version: 1.27.0
4
4
  Summary: Simple Python library to leverage the SCANOSS APIs
5
5
  Home-page: https://scanoss.com
6
6
  Author: SCANOSS
@@ -4,14 +4,14 @@ protoc_gen_swagger/options/annotations_pb2.py,sha256=b25EDD6gssUWnFby9gxgcpLIROT
4
4
  protoc_gen_swagger/options/annotations_pb2_grpc.py,sha256=1oboBPFxaTEXt9Aw7EAj8gXHDCNMhZD2VXqocC9l_gk,159
5
5
  protoc_gen_swagger/options/openapiv2_pb2.py,sha256=vYElGp8E1vGHszvWqX97zNG9GFJ7u2QcdK9ouq0XdyI,14939
6
6
  protoc_gen_swagger/options/openapiv2_pb2_grpc.py,sha256=1oboBPFxaTEXt9Aw7EAj8gXHDCNMhZD2VXqocC9l_gk,159
7
- scanoss/__init__.py,sha256=xDDQJcWC9AfGANHfK3qAJabNjabXJyODs8piC_PNc6U,1146
8
- scanoss/cli.py,sha256=yjK4oawNzecarQYYlkElOiHFDDAZx_zKdSXf_gQvqXk,72678
7
+ scanoss/__init__.py,sha256=YH4I-lAz5Zn3nEU1mwGqNZPPhcS1o4Lu6itgmXKlV0c,1146
8
+ scanoss/cli.py,sha256=9ELIAJy06g4KyvnALzPSQ_Rh1ypALbyQGGKrjb4sCOk,72615
9
9
  scanoss/components.py,sha256=b0R9DdKuXqyQiw5nZZwjQ6NJXBr1U9gyx1RI2FP9ozA,14511
10
- scanoss/constants.py,sha256=FWCZG8gQputKwV7XwvW1GuwDXL4wDLQyVRGdwygg578,320
10
+ scanoss/constants.py,sha256=On8mQ-8ardVMHSJ7WOJqeTvGXIOWPLCgUanjE7Wk-wE,351
11
11
  scanoss/cryptography.py,sha256=Q39MOCscP-OFvrnPXaPOMFFkc8OKnf3mC3SgZYEtCog,9407
12
12
  scanoss/csvoutput.py,sha256=qNKRwcChSkgIwLm00kZiVX6iHVQUF4Apl-sMbzJ5Taw,10192
13
13
  scanoss/cyclonedx.py,sha256=UktDuqZUbXSggdt864Pg8ziTD7sdEQtLxfYL7vd_ZCE,12756
14
- scanoss/file_filters.py,sha256=_1Ehb_rLnHw_-6N5Zhh4Es2lz6rlx0LozGPn-u52cok,20338
14
+ scanoss/file_filters.py,sha256=2DzyvSVR7We7U36UurtJj3cdQturUjDl8j3OIqmv4Pg,20638
15
15
  scanoss/filecount.py,sha256=RZjKQ6M5P_RQg0_PMD2tsRe5Z8f98ke0sxYVjPDN8iQ,6538
16
16
  scanoss/results.py,sha256=47ZXXuU2sDjYa5vhtbWTmikit9jHhA0rsYKwkvZFI5w,9252
17
17
  scanoss/scancodedeps.py,sha256=JbpoGW1POtPMmowzfwa4oh8sSBeeQCqaW9onvc4UFYM,11517
@@ -47,7 +47,7 @@ scanoss/api/geoprovenance/v2/scanoss_geoprovenance_pb2.py,sha256=Z4k9qvU2klesnPR
47
47
  scanoss/api/geoprovenance/v2/scanoss_geoprovenance_pb2_grpc.py,sha256=B6sv2Taagt05CMWaw7T6silg7PW8E6xQVbqn_4-v14U,6854
48
48
  scanoss/api/scanning/__init__.py,sha256=hx-P78xbDsh6WQIigewkJ7Y7y1fqc_eYnyHC5IZTKmo,1122
49
49
  scanoss/api/scanning/v2/__init__.py,sha256=hx-P78xbDsh6WQIigewkJ7Y7y1fqc_eYnyHC5IZTKmo,1122
50
- scanoss/api/scanning/v2/scanoss_scanning_pb2.py,sha256=rHScTNN5_jsgLu_y3UIPTjeBX74GUw0kyjA-IMoz7mQ,4324
50
+ scanoss/api/scanning/v2/scanoss_scanning_pb2.py,sha256=-TLfUUVCytOSc1-PtGl2g2-IlOrYuktQ2lRxFq5vM6A,5493
51
51
  scanoss/api/scanning/v2/scanoss_scanning_pb2_grpc.py,sha256=kyP1JRjyHlUR9vc0MXSJDvEGBiROEu5WvHvt737g27Q,4670
52
52
  scanoss/api/semgrep/__init__.py,sha256=UAhvL2dFNZsG4g3I8HCauwQK6e0QoEFhMGqZ_9GgGhI,1122
53
53
  scanoss/api/semgrep/v2/__init__.py,sha256=UAhvL2dFNZsG4g3I8HCauwQK6e0QoEFhMGqZ_9GgGhI,1122
@@ -57,31 +57,31 @@ scanoss/api/vulnerabilities/__init__.py,sha256=IFrDk_DTJgKSZmmU-nuLXuq_s8sQZlrSC
57
57
  scanoss/api/vulnerabilities/v2/__init__.py,sha256=IFrDk_DTJgKSZmmU-nuLXuq_s8sQZlrSCHhIDMJT4r0,1122
58
58
  scanoss/api/vulnerabilities/v2/scanoss_vulnerabilities_pb2.py,sha256=CFhF80av8tenGvn9AIsGEtRJPuV2dC_syA5JLZb2lDw,5464
59
59
  scanoss/api/vulnerabilities/v2/scanoss_vulnerabilities_pb2_grpc.py,sha256=HlS4k4Zmx6RIAqaO9I96jD-eyF5yU6Xx04pVm7pdqOg,6864
60
- scanoss/data/build_date.txt,sha256=JZiS5Od8jOg1I9NAinAP5k79mVNoARNi3bvCVs5X7XU,40
60
+ scanoss/data/build_date.txt,sha256=MYvdFBxu-jVdZOEyHhyOpGsXpUjQK19aUZUWJivaIgU,40
61
61
  scanoss/data/scanoss-settings-schema.json,sha256=ClkRYAkjAN0Sk704G8BE_Ok006oQ6YnIGmX84CF8h9w,8798
62
62
  scanoss/data/spdx-exceptions.json,sha256=s7UTYxC7jqQXr11YBlIWYCNwN6lRDFTR33Y8rpN_dA4,17953
63
63
  scanoss/data/spdx-licenses.json,sha256=A6Z0q82gaTLtnopBfzeIVZjJFxkdRW1g2TuumQc-lII,228794
64
64
  scanoss/inspection/__init__.py,sha256=D4C0lWLuNp8k_BjQZEc07WZcUgAvriVwQWOk063b0ZU,1122
65
65
  scanoss/inspection/component_summary.py,sha256=h1l3rF6NnoK0wMkS4ib6rDfcza2aqunyoMDbN2lw2G4,4049
66
- scanoss/inspection/copyleft.py,sha256=iCArNWZo5TOX7K68e-YD_6mQNRDOE9V35UoSMs23_qY,9236
67
- scanoss/inspection/inspect_base.py,sha256=wnE2KdATJFC2HqcPUTaKXUnM-3hBK0GZ98-wJE0VE-c,18170
68
- scanoss/inspection/license_summary.py,sha256=T3I8E6ljqYF0ngIcY3Ke2WaNeCzrdpN0RQ02RG_3Thk,5763
69
- scanoss/inspection/policy_check.py,sha256=NS39dvePZbpusGRnEUKN9JFiMdSyva0msFE6On6bK8Q,8329
70
- scanoss/inspection/undeclared_component.py,sha256=qjc30agrIXU_07CJCgLvvFT-sQvJa1Hb0lS1UjM6aj8,11495
66
+ scanoss/inspection/copyleft.py,sha256=ZSA97Vc3o06e66r4SCxwLKjGZOjv6lo92sWbvpzKHvo,9237
67
+ scanoss/inspection/inspect_base.py,sha256=buvJ9l3VJatzX5tNj7GOC5kSFQiFMpcYm8e1Iqolho0,18199
68
+ scanoss/inspection/license_summary.py,sha256=1iSVkjNa3oj-XEA-tNNqpwBOLb-i_jkXRTuu9Fcr0q4,5764
69
+ scanoss/inspection/policy_check.py,sha256=R9-7PxDHGzXCDVF8sWE3KcORgICDuZbx1-xvSot_C-g,8330
70
+ scanoss/inspection/undeclared_component.py,sha256=HGto8-ZBccrtczIARughG298Cwqb4k1BLCihkbmiFnk,11496
71
71
  scanoss/inspection/utils/license_utils.py,sha256=Zb6QLmVJb86lKCwZyBsmwakyAtY1SXa54kUyyKmWMqA,5093
72
72
  scanoss/scanners/__init__.py,sha256=D4C0lWLuNp8k_BjQZEc07WZcUgAvriVwQWOk063b0ZU,1122
73
73
  scanoss/scanners/container_scanner.py,sha256=leP4roes6B9B95F49mJ0P_F8WcKCQkvJgk9azWyJrjg,16294
74
- scanoss/scanners/folder_hasher.py,sha256=ePWinOTN3neSVz7T81TAF7GZVAGNYGJ8SfhM5LBYWb8,9824
74
+ scanoss/scanners/folder_hasher.py,sha256=-qvTtMC0iPj7zS8nMSZZJyt9d62MeQIK0LcrNDkt7yc,12267
75
75
  scanoss/scanners/scanner_config.py,sha256=egG7cw3S2akU-D9M1aLE5jLrfz_c8e7_DIotMnnpM84,2601
76
- scanoss/scanners/scanner_hfh.py,sha256=KksrC1XnOv7mXSlGyo_AJXwtPfKjqffkttdPoNDc-AQ,5802
76
+ scanoss/scanners/scanner_hfh.py,sha256=CGTRzg9Epyyi7DCvQXVY91A8P0GGl8bzfr0zRCaM3XA,7906
77
77
  scanoss/utils/__init__.py,sha256=0hjb5ktavp7utJzFhGMPImPaZiHWgilM2HwvTp5lXJE,1122
78
78
  scanoss/utils/abstract_presenter.py,sha256=teiDTxBj5jBMCk2T8i4l1BJPf_u4zBLWrtCTFHSSECM,3148
79
79
  scanoss/utils/crc64.py,sha256=TMrwQimSdE6imhFOUL7oAG6Kxu-8qMpGWMuMg8QpSVs,3169
80
80
  scanoss/utils/file.py,sha256=62cA9a17TU9ZvfA3FY5HY4-QOajJeSrc8S6xLA_f-3M,2980
81
81
  scanoss/utils/simhash.py,sha256=6iu8DOcecPAY36SZjCOzrrLMT9oIE7-gI6QuYwUQ7B0,5793
82
- scanoss-1.26.2.dist-info/licenses/LICENSE,sha256=LLUaXoiyOroIbr5ubAyrxBOwSRLTm35ETO2FmLpy8QQ,1074
83
- scanoss-1.26.2.dist-info/METADATA,sha256=ReKdCqWwRMsZl2IHLb2VpX7wxJPHfF4sXafR4uT94ak,6060
84
- scanoss-1.26.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
85
- scanoss-1.26.2.dist-info/entry_points.txt,sha256=Uy28xnaDL5KQ7V77sZD5VLDXPNxYYzSr5tsqtiXVzAs,48
86
- scanoss-1.26.2.dist-info/top_level.txt,sha256=V11PrQ6Pnrc-nDF9xnisnJ8e6-i7HqSIKVNqduRWcL8,27
87
- scanoss-1.26.2.dist-info/RECORD,,
82
+ scanoss-1.27.0.dist-info/licenses/LICENSE,sha256=LLUaXoiyOroIbr5ubAyrxBOwSRLTm35ETO2FmLpy8QQ,1074
83
+ scanoss-1.27.0.dist-info/METADATA,sha256=TB02dYgadlHHeQhCDWJiSRDJxQ52lT10TuWFTdE6W1E,6060
84
+ scanoss-1.27.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
85
+ scanoss-1.27.0.dist-info/entry_points.txt,sha256=Uy28xnaDL5KQ7V77sZD5VLDXPNxYYzSr5tsqtiXVzAs,48
86
+ scanoss-1.27.0.dist-info/top_level.txt,sha256=V11PrQ6Pnrc-nDF9xnisnJ8e6-i7HqSIKVNqduRWcL8,27
87
+ scanoss-1.27.0.dist-info/RECORD,,