PyPI - scanoss - Versions diffs - 1.24.0__tar.gz → 1.25.0__tar.gz - Mend

scanoss 1.24.0.tar.gz → 1.25.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (98) hide show

{scanoss-1.24.0/src/scanoss.egg-info → scanoss-1.25.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scanoss
-Version: 1.24.0
+Version: 1.25.0
 Summary: Simple Python library to leverage the SCANOSS APIs
 Home-page: https://scanoss.com
 Author: SCANOSS

{scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/__init__.py RENAMED Viewed

@@ -22,4 +22,4 @@ SPDX-License-Identifier: MIT
   THE SOFTWARE.
 """
-__version__ = '1.24.0'
+__version__ = '1.25.0'

scanoss-1.25.0/src/scanoss/data/build_date.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ date: 20250610161304, utime: 1749571984

{scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/inspection/policy_check.py RENAMED Viewed

@@ -26,9 +26,10 @@ import json
 import os.path
 from abc import abstractmethod
 from enum import Enum
-from typing import Callable, List, Dict, Any
-from .utils.license_utils import LicenseUtil
+from typing import Any, Callable, Dict, List
 from ..scanossbase import ScanossBase
+from .utils.license_utils import LicenseUtil
 class PolicyStatus(Enum):
@@ -87,7 +88,7 @@ class PolicyCheck(ScanossBase):
     VALID_FORMATS = {'md', 'json', 'jira_md'}
-    def __init__(
+    def __init__( # noqa: PLR0913
         self,
         debug: bool = False,
         trace: bool = True,
@@ -181,10 +182,9 @@ class PolicyCheck(ScanossBase):
         :param status: The new component status
         :return: The updated components dictionary
         """
         # Determine the component key and purl based on component type
         if id in [ComponentID.FILE.value, ComponentID.SNIPPET.value]:
-            purl = new_component['purl'][0]  # Take first purl for these component types
+            purl = new_component['purl'][0]  # Take the first purl for these component types
         else:
             purl = new_component['purl']
@@ -195,14 +195,13 @@ class PolicyCheck(ScanossBase):
             'licenses': {},
             'status': status,
         }
         if not new_component.get('licenses'):
-            self.print_stderr(f'WARNING: Results missing licenses. Skipping.')
+            self.print_debug(f'WARNING: Results missing licenses. Skipping: {new_component}')
             return components
         # Process licenses for this component
-        for l in new_component['licenses']:
-            if l.get('name'):
-                spdxid = l['name']
+        for license_item in new_component['licenses']:
+            if license_item.get('name'):
+                spdxid = license_item['name']
                 components[component_key]['licenses'][spdxid] = {
                     'spdxid': spdxid,
                     'copyleft': self.license_util.is_copyleft(spdxid),
@@ -210,71 +209,103 @@ class PolicyCheck(ScanossBase):
                 }
         return components
-    def _get_components_from_results(self, results: Dict[str, Any]) -> list or None:
+    def _get_components_data(self, results: Dict[str, Any], components: Dict[str, Any]) -> Dict[str, Any]:
         """
-        Process the results dictionary to extract and format component information.
-        This function iterates through the results dictionary, identifying components from
-        different sources (files, snippets, and dependencies). It consolidates this information
-        into a list of unique components, each with its associated licenses and other details.
+        Extract and process file and snippet components from results.
         :param results: A dictionary containing the raw results of a component scan
-        :return: A list of dictionaries, each representing a unique component with its details
+        :param components: Existing components dictionary to update
+        :return: Updated components dictionary with file and snippet data
         """
-        if results is None:
-            self.print_stderr(f'ERROR: Results cannot be empty')
-            return None
-        components = {}
         for component in results.values():
             for c in component:
                 component_id = c.get('id')
                 if not component_id:
-                    self.print_stderr(f'WARNING: Result missing id. Skipping.')
+                    self.print_debug(f'WARNING: Result missing id. Skipping: {c}')
                     continue
                 status = c.get('status')
-                if not component_id:
-                    self.print_stderr(f'WARNING: Result missing status. Skipping.')
+                if not status:
+                    self.print_debug(f'WARNING: Result missing status. Skipping: {c}')
                     continue
                 if component_id in [ComponentID.FILE.value, ComponentID.SNIPPET.value]:
                     if not c.get('purl'):
-                        self.print_stderr(f'WARNING: Result missing purl. Skipping.')
+                        self.print_debug(f'WARNING: Result missing purl. Skipping: {c}')
                         continue
                     if len(c.get('purl')) <= 0:
-                        self.print_stderr(f'WARNING: Result missing purls. Skipping.')
+                        self.print_debug(f'WARNING: Result missing purls. Skipping: {c}')
                         continue
                     if not c.get('version'):
-                        self.print_stderr(f'WARNING: Result missing version. Skipping.')
+                        self.print_msg(f'WARNING: Result missing version. Skipping: {c}')
                         continue
                     component_key = f'{c["purl"][0]}@{c["version"]}'
-                    # Initialize or update the component entry
                     if component_key not in components:
                         components = self._append_component(components, c, component_id, status)
+            # End component loop
+        # End components loop
+        return components
-                if c['id'] == ComponentID.DEPENDENCY.value:
+    def _get_dependencies_data(self, results: Dict[str, Any], components: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Extract and process dependency components from results.
+        :param results: A dictionary containing the raw results of a component scan
+        :param components: Existing components dictionary to update
+        :return: Updated components dictionary with dependency data
+        """
+        for component in results.values():
+            for c in component:
+                component_id = c.get('id')
+                if not component_id:
+                    self.print_debug(f'WARNING: Result missing id. Skipping: {c}')
+                    continue
+                status = c.get('status')
+                if not status:
+                    self.print_debug(f'WARNING: Result missing status. Skipping: {c}')
+                    continue
+                if component_id == ComponentID.DEPENDENCY.value:
                     if c.get('dependencies') is None:
                         continue
-                    for d in c['dependencies']:
-                        if not d.get('purl'):
-                            self.print_stderr(f'WARNING: Result missing purl. Skipping.')
-                            continue
-                        if len(d.get('purl')) <= 0:
-                            self.print_stderr(f'WARNING: Result missing purls. Skipping.')
+                    for dependency in c['dependencies']:
+                        if not dependency.get('purl'):
+                            self.print_debug(f'WARNING: Dependency result missing purl. Skipping: {dependency}')
                             continue
-                        if not d.get('version'):
-                            self.print_stderr(f'WARNING: Result missing version. Skipping.')
+                        if not dependency.get('version'):
+                            self.print_msg(f'WARNING: Dependency result missing version. Skipping: {dependency}')
                             continue
-                        component_key = f'{d["purl"]}@{d["version"]}'
+                        component_key = f'{dependency["purl"]}@{dependency["version"]}'
                         if component_key not in components:
-                            components = self._append_component(components, d, component_id, status)
-                    # End of dependencies loop
-                # End if
-            # End of component loop
-        # End of results loop
-        results = list(components.values())
-        for component in results:
+                            components = self._append_component(components, dependency, component_id, status)
+                    # End dependency loop
+            # End component loop
+        # End of result loop
+        return components
+    def _get_components_from_results(self, results: Dict[str, Any]) -> list or None:
+        """
+        Process the results dictionary to extract and format component information.
+        This function iterates through the results dictionary, identifying components from
+        different sources (files, snippets, and dependencies). It consolidates this information
+        into a list of unique components, each with its associated licenses and other details.
+        :param results: A dictionary containing the raw results of a component scan
+        :return: A list of dictionaries, each representing a unique component with its details
+        """
+        if results is None:
+            self.print_stderr('ERROR: Results cannot be empty')
+            return None
+        components = {}
+        # Extract file and snippet components
+        components = self._get_components_data(results, components)
+        # Extract dependency components
+        components = self._get_dependencies_data(results, components)
+        # Convert to list and process licenses
+        results_list = list(components.values())
+        for component in results_list:
             component['licenses'] = list(component['licenses'].values())
-        return results
+        return results_list
     def generate_table(self, headers, rows, centered_columns=None):
         """
@@ -403,7 +434,6 @@ class PolicyCheck(ScanossBase):
         components = self._get_components_from_results(self.results)
         return components
 #
 # End of PolicyCheck Class
 #

{scanoss-1.24.0 → scanoss-1.25.0}/src/scanoss/winnowing.py RENAMED Viewed

@@ -32,9 +32,10 @@ import hashlib
 import pathlib
 import platform
 import re
+from typing import Tuple
-from crc32c import crc32c
 from binaryornot.check import is_binary
+from crc32c import crc32c
 from .scanossbase import ScanossBase
@@ -157,7 +158,7 @@ class Winnowing(ScanossBase):
     a list of WFP fingerprints with their corresponding line numbers.
     """
-    def __init__(
+    def __init__(  # noqa: PLR0913
         self,
         size_limit: bool = False,
         debug: bool = False,
@@ -197,6 +198,7 @@ class Winnowing(ScanossBase):
         self.strip_hpsm_ids = strip_hpsm_ids
         self.strip_snippet_ids = strip_snippet_ids
         self.hpsm = hpsm
+        self.is_windows = platform.system() == 'Windows'
         if hpsm:
             self.crc8_maxim_dow_table = []
             self.crc8_generate_table()
@@ -218,11 +220,11 @@ class Winnowing(ScanossBase):
             return byte
         if byte >= ASCII_a:
             return byte
-        if (byte >= 65) and (byte <= 90):
+        if (byte >= ASCII_A) and (byte <= ASCII_Z):
             return byte + 32
         return 0
-    def __skip_snippets(self, file: str, src: str) -> bool:
+    def __skip_snippets(self, file: str, src: str) -> bool:  # noqa: PLR0911
         """
         Determine files that are not of interest based on their content or file extension
         Parameters
@@ -351,7 +353,55 @@ class Winnowing(ScanossBase):
             self.print_debug(f'Stripped snippet ids from {file}')
         return wfp
-    def wfp_for_contents(self, file: str, bin_file: bool, contents: bytes) -> str:
+    def __detect_line_endings(self, contents: bytes) -> Tuple[bool, bool, bool]:
+        """Detect the types of line endings present in file contents.
+        Args:
+            contents: File contents as bytes.
+        Returns:
+            Tuple of (has_crlf, has_standalone_lf, has_standalone_cr) indicating which line ending types are present.
+        """
+        has_crlf = b'\r\n' in contents
+        # For LF detection, we need to find LF that's not part of CRLF
+        content_without_crlf = contents.replace(b'\r\n', b'')
+        has_standalone_lf = b'\n' in content_without_crlf
+        # For CR detection, we need to find CR that's not part of CRLF
+        has_standalone_cr = b'\r' in content_without_crlf
+        return has_crlf, has_standalone_lf, has_standalone_cr
+    def __calculate_opposite_line_ending_hash(self, contents: bytes):
+        """Calculate hash for contents with opposite line endings.
+        If the file is primarily Unix (LF), calculates Windows (CRLF) hash.
+        If the file is primarily Windows (CRLF), calculates Unix (LF) hash.
+        Args:
+            contents: File contents as bytes.
+        Returns:
+            Hash with opposite line endings as hex string, or None if no line endings detected.
+        """
+        has_crlf, has_standalone_lf, has_standalone_cr = self.__detect_line_endings(contents)
+        if not has_crlf and not has_standalone_lf and not has_standalone_cr:
+            return None
+        # Normalize all line endings to LF first
+        normalized = contents.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
+        # Determine the dominant line ending type
+        if has_crlf and not has_standalone_lf and not has_standalone_cr:
+            # File is Windows (CRLF) - produce Unix (LF) hash
+            opposite_contents = normalized
+        else:
+            # File is Unix (LF/CR) or mixed - produce Windows (CRLF) hash
+            opposite_contents = normalized.replace(b'\n', b'\r\n')
+        return hashlib.md5(opposite_contents).hexdigest()
+    def wfp_for_contents(self, file: str, bin_file: bool, contents: bytes) -> str:  # noqa: PLR0912, PLR0915
         """
         Generate a Winnowing fingerprint (WFP) for the given file contents
         Parameters
@@ -371,7 +421,7 @@ class Winnowing(ScanossBase):
         content_length = len(contents)
         original_filename = file
-        if platform.system() == 'Windows':
+        if self.is_windows:
             original_filename = file.replace('\\', '/')
         wfp_filename = repr(original_filename).strip("'")  # return a utf-8 compatible version of the filename
         if self.obfuscate:  # hide the real size of the file and its name, but keep the suffix
@@ -380,6 +430,13 @@ class Winnowing(ScanossBase):
             self.file_map[wfp_filename] = original_filename  # Save the file name map for later (reverse lookup)
         wfp = 'file={0},{1},{2}\n'.format(file_md5, content_length, wfp_filename)
+        # Add opposite line ending hash based on line ending analysis
+        if not bin_file:
+            opposite_hash = self.__calculate_opposite_line_ending_hash(contents)
+            if opposite_hash is not None:
+                wfp += f'fh2={opposite_hash}\n'
         # We don't process snippets for binaries, or other uninteresting files, or if we're requested to skip
         if bin_file or self.skip_snippets or self.__skip_snippets(file, contents.decode('utf-8', 'ignore')):
             return wfp
@@ -467,7 +524,7 @@ class Winnowing(ScanossBase):
         for i, byte in enumerate(content):
             c = byte
             if c == ASCII_LF:  # When there is a new line
-                if len(list_normalized):
+                if list_normalized:
                     crc_lines.append(self.crc8_buffer(list_normalized))
                     list_normalized = []
                 elif last_line + 1 == i:

{scanoss-1.24.0 → scanoss-1.25.0/src/scanoss.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scanoss
-Version: 1.24.0
+Version: 1.25.0
 Summary: Simple Python library to leverage the SCANOSS APIs
 Home-page: https://scanoss.com
 Author: SCANOSS

scanoss-1.25.0/tests/test_winnowing.py ADDED Viewed

@@ -0,0 +1,393 @@
+"""
+SPDX-License-Identifier: MIT
+  Copyright (c) 2021, SCANOSS
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+"""
+import platform
+import unittest
+from unittest.mock import patch
+from scanoss.winnowing import Winnowing
+class MyTestCase(unittest.TestCase):
+    """
+    Exercise the Winnowing class
+    """
+    def test_winnowing(self):
+        winnowing = Winnowing(debug=True)
+        filename = 'test-file.c'
+        contents = 'c code contents'
+        content_types = bytes(contents, encoding='raw_unicode_escape')
+        wfp = winnowing.wfp_for_contents(filename, False, content_types)
+        print(f'WFP for {filename}: {wfp}')
+        self.assertIsNotNone(wfp)
+        filename = __file__
+        wfp = winnowing.wfp_for_file(filename, filename)
+        print(f'WFP for {filename}: {wfp}')
+        self.assertIsNotNone(wfp)
+    def test_snippet_skip(self):
+        winnowing = Winnowing(debug=True)
+        filename = 'test-file.jar'
+        contents = 'jar file contents'
+        content_types = bytes(contents, encoding='raw_unicode_escape')
+        wfp = winnowing.wfp_for_contents(filename, False, content_types)
+        print(f'WFP for {filename}: {wfp}')
+        self.assertIsNotNone(wfp)
+    def test_snippet_strip(self):
+        winnowing = Winnowing(
+            debug=True, hpsm=True, strip_snippet_ids=['d5e54c33,b03faabe'], strip_hpsm_ids=['0d2fffaffc62d18']
+        )
+        filename = 'test-file.py'
+        with open(__file__, 'rb') as f:
+            contents = f.read()
+        print('--- Test snippet and HPSM strip ---')
+        wfp = winnowing.wfp_for_contents(filename, False, contents)
+        found = 0
+        print(f'WFP for {filename}: {wfp}')
+        try:
+            found = wfp.index('d5e54c33,b03faabe')
+        except ValueError:
+            found = -1
+        self.assertEqual(found, -1)
+        try:
+            found = wfp.index('0d2fffaffc62d18')
+        except ValueError:
+            found = -1
+        self.assertEqual(found, -1)
+    def test_windows_hash_calculation(self):
+        """Test Windows-specific hash calculation with CRLF line endings."""
+        import hashlib
+        # Test content with LF line endings
+        content_lf = b'line1\nline2\nline3\n'
+        # Expected content with CRLF line endings for Windows hash
+        content_crlf = b'line1\r\nline2\r\nline3\r\n'
+        # Calculate the expected Windows hash manually
+        expected_windows_hash = hashlib.md5(content_crlf).hexdigest()
+        lf_hash = hashlib.md5(content_lf).hexdigest()
+        print(f'LF content hash: {lf_hash}')
+        print(f'CRLF content hash (Windows): {expected_windows_hash}')
+        # They should be different
+        self.assertNotEqual(lf_hash, expected_windows_hash)
+    @patch('platform.system')
+    def test_windows_wfp_includes_fh2(self, mock_platform):
+        """Test that WFP includes fh2 hash when running on Windows."""
+        # Mock Windows environment
+        mock_platform.return_value = 'Windows'
+        winnowing = Winnowing(debug=True)
+        filename = 'test-file.c'
+        content = b'int main() {\n    return 0;\n}\n'
+        wfp = winnowing.wfp_for_contents(filename, False, content)
+        print(f'Windows WFP output:\n{wfp}')
+        # Check that WFP contains fh2 line
+        self.assertIn('fh2=', wfp)
+        # Extract the fh2 hash from WFP
+        lines = wfp.split('\n')
+        fh2_line = [line for line in lines if line.startswith('fh2=')]
+        self.assertEqual(len(fh2_line), 1)
+        fh2_hash = fh2_line[0].split('=')[1]
+        # Verify it matches expected CRLF conversion
+        import hashlib
+        content_crlf = content.replace(b'\n', b'\r\n')
+        expected_hash = hashlib.md5(content_crlf).hexdigest()
+        self.assertEqual(fh2_hash, expected_hash)
+    def test_line_ending_detection(self):
+        """Test line ending detection logic."""
+        winnowing = Winnowing(debug=True)
+        # Test LF only
+        content_lf = b'line1\nline2\nline3\n'
+        has_crlf, has_lf, has_cr = winnowing._Winnowing__detect_line_endings(content_lf)
+        self.assertFalse(has_crlf)
+        self.assertTrue(has_lf)
+        self.assertFalse(has_cr)
+        # Test CRLF only
+        content_crlf = b'line1\r\nline2\r\nline3\r\n'
+        has_crlf, has_lf, has_cr = winnowing._Winnowing__detect_line_endings(content_crlf)
+        self.assertTrue(has_crlf)
+        self.assertFalse(has_lf)
+        self.assertFalse(has_cr)
+        # Test CR only (old Mac style)
+        content_cr = b'line1\rline2\rline3\r'
+        has_crlf, has_lf, has_cr = winnowing._Winnowing__detect_line_endings(content_cr)
+        self.assertFalse(has_crlf)
+        self.assertFalse(has_lf)
+        self.assertTrue(has_cr)
+        # Test mixed CRLF and LF
+        content_mixed = b'line1\r\nline2\nline3\r\n'
+        has_crlf, has_lf, has_cr = winnowing._Winnowing__detect_line_endings(content_mixed)
+        self.assertTrue(has_crlf)
+        self.assertTrue(has_lf)
+        self.assertFalse(has_cr)
+    def test_opposite_hash_logic(self):
+        """Test the logic of opposite hash calculation."""
+        winnowing = Winnowing(debug=True)
+        # Test different line ending scenarios
+        content_lf = b'line1\nline2\nline3\n'
+        content_crlf = b'line1\r\nline2\r\nline3\r\n'
+        content_cr = b'line1\rline2\rline3\r'
+        content_mixed = b'line1\r\nline2\nline3\r'
+        hash_lf = winnowing._Winnowing__calculate_opposite_line_ending_hash(content_lf)
+        hash_crlf = winnowing._Winnowing__calculate_opposite_line_ending_hash(content_crlf)
+        hash_cr = winnowing._Winnowing__calculate_opposite_line_ending_hash(content_cr)
+        hash_mixed = winnowing._Winnowing__calculate_opposite_line_ending_hash(content_mixed)
+        print(f'LF opposite hash: {hash_lf}')
+        print(f'CRLF opposite hash: {hash_crlf}')
+        print(f'CR opposite hash: {hash_cr}')
+        print(f'Mixed opposite hash: {hash_mixed}')
+        # LF, CR, and mixed content should all produce CRLF hash (same result)
+        self.assertEqual(hash_lf, hash_cr)
+        self.assertEqual(hash_lf, hash_mixed)
+        # CRLF content should produce LF hash (different from the others)
+        self.assertNotEqual(hash_crlf, hash_lf)
+    @unittest.skipUnless(platform.system() == 'Windows', 'Windows-specific test')
+    def test_actual_windows_behavior(self):
+        """Test actual Windows behavior when running on Windows."""
+        winnowing = Winnowing(debug=True)
+        filename = 'test-file.c'
+        content = b'int main() {\n    return 0;\n}\n'
+        wfp = winnowing.wfp_for_contents(filename, False, content)
+        print(f'Actual Windows WFP:\n{wfp}')
+        # On actual Windows with LF content, should include fh2
+        # Should always generate fh2 when line endings are present
+        self.assertIn('fh2=', wfp)
+    def test_empty_file_fh2(self):
+        """Test fh2 behavior with empty files."""
+        winnowing = Winnowing(debug=True)
+        content = b''
+        wfp = winnowing.wfp_for_contents('empty.txt', False, content)
+        print(f'Empty file WFP:\n{wfp}')
+        # Empty files should not generate fh2
+        self.assertNotIn('fh2=', wfp)
+    def test_no_line_endings_fh2(self):
+        """Test files without any line endings."""
+        winnowing = Winnowing(debug=True)
+        content = b'no line endings here'
+        wfp = winnowing.wfp_for_contents('noline.txt', False, content)
+        print(f'No line endings WFP:\n{wfp}')
+        # Files without line endings should not generate fh2
+        self.assertNotIn('fh2=', wfp)
+    def test_all_platforms_generate_fh2(self):
+        """Test that all platforms generate fh2 when line endings are present."""
+        winnowing = Winnowing(debug=True)
+        content = b'line1\nline2\n'
+        wfp = winnowing.wfp_for_contents('test.txt', False, content)
+        print(f'Platform-independent WFP:\n{wfp}')
+        # Any platform should generate fh2 when line endings are present
+        self.assertIn('fh2=', wfp)
+    def test_verify_opposite_hash_calculation(self):
+        """Test that the opposite hash calculation works correctly."""
+        winnowing = Winnowing(debug=True)
+        # Test LF -> CRLF conversion
+        content_lf = b'line1\nline2\nline3\n'
+        wfp_lf = winnowing.wfp_for_contents('test_lf.txt', False, content_lf)
+        # Test CRLF -> LF conversion
+        content_crlf = b'line1\r\nline2\r\nline3\r\n'
+        wfp_crlf = winnowing.wfp_for_contents('test_crlf.txt', False, content_crlf)
+        print(f'LF content WFP:\n{wfp_lf}')
+        print(f'CRLF content WFP:\n{wfp_crlf}')
+        # Both should generate fh2
+        self.assertIn('fh2=', wfp_lf)
+        self.assertIn('fh2=', wfp_crlf)
+        # Extract fh2 values
+        lf_fh2 = wfp_lf.split('fh2=')[1].split('\n')[0]
+        crlf_fh2 = wfp_crlf.split('fh2=')[1].split('\n')[0]
+        # The fh2 values should be swapped (LF file gets CRLF hash, CRLF file gets LF hash)
+        import hashlib
+        expected_lf_to_crlf = hashlib.md5(content_lf.replace(b'\n', b'\r\n')).hexdigest()
+        expected_crlf_to_lf = hashlib.md5(content_crlf.replace(b'\r\n', b'\n')).hexdigest()
+        self.assertEqual(lf_fh2, expected_lf_to_crlf)
+        self.assertEqual(crlf_fh2, expected_crlf_to_lf)
+    def test_binary_file_with_line_endings(self):
+        """Test binary files with embedded line endings."""
+        winnowing = Winnowing(debug=True)
+        # Binary content with embedded newlines
+        content = b'\x00\x01\n\x02\x03\r\n\x04'
+        wfp = winnowing.wfp_for_contents('binary.bin', True, content)
+        print(f'Binary file WFP:\n{wfp}')
+        # Binary files should not generate fh2
+        self.assertNotIn('fh2=', wfp)
+    def test_cr_only_line_endings(self):
+        """Test classic Mac CR-only line endings."""
+        winnowing = Winnowing(debug=True)
+        content = b'line1\rline2\rline3\r'
+        wfp = winnowing.wfp_for_contents('mac.txt', False, content)
+        print(f'CR-only WFP:\n{wfp}')
+        # Should generate fh2 (platform independent)
+        self.assertIn('fh2=', wfp)
+        # Should normalize CR to CRLF for the opposite hash
+        import hashlib
+        expected = content.replace(b'\r', b'\r\n')
+        expected_hash = hashlib.md5(expected).hexdigest()
+        self.assertIn(f'fh2={expected_hash}', wfp)
+    def test_whitespace_only_file(self):
+        """Test files with only whitespace characters."""
+        winnowing = Winnowing(debug=True)
+        content = b'   \n\t\n   \n'
+        wfp = winnowing.wfp_for_contents('whitespace.txt', False, content)
+        print(f'Whitespace-only WFP:\n{wfp}')
+        # Should generate fh2 since it has line endings
+        self.assertIn('fh2=', wfp)
+    def test_mixed_complex_line_endings(self):
+        """Test complex mixed line ending scenarios."""
+        winnowing = Winnowing(debug=True)
+        # Mix of CRLF, LF, and CR
+        content = b'line1\r\nline2\nline3\rline4\r\nline5\n'
+        wfp = winnowing.wfp_for_contents('mixed.txt', False, content)
+        print(f'Mixed line endings WFP:\n{wfp}')
+        # Should generate fh2
+        self.assertIn('fh2=', wfp)
+        # Verify the hash calculation
+        import hashlib
+        normalized = content.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
+        expected_crlf = normalized.replace(b'\n', b'\r\n')
+        expected_hash = hashlib.md5(expected_crlf).hexdigest()
+        self.assertIn(f'fh2={expected_hash}', wfp)
+    def test_fh2_with_skip_snippets(self):
+        """Test fh2 generation when skip_snippets is enabled."""
+        winnowing = Winnowing(debug=True, skip_snippets=True)
+        content = b'line1\nline2\nline3\n'
+        wfp = winnowing.wfp_for_contents('test.txt', False, content)
+        print(f'Skip snippets WFP:\n{wfp}')
+        # Should still generate fh2 even when skipping snippets
+        self.assertIn('fh2=', wfp)
+        # But should not contain snippet fingerprints (line numbers)
+        lines = wfp.split('\n')
+        snippet_lines = [line for line in lines if '=' in line and line[0].isdigit()]
+        self.assertEqual(len(snippet_lines), 0)
+    def test_fh2_with_obfuscation(self):
+        """Test fh2 generation with obfuscation enabled."""
+        winnowing = Winnowing(debug=True, obfuscate=True)
+        content = b'line1\nline2\nline3\n'
+        wfp = winnowing.wfp_for_contents('test.txt', False, content)
+        print(f'Obfuscated WFP:\n{wfp}')
+        # Should still generate fh2 with obfuscation
+        self.assertIn('fh2=', wfp)
+        # Filename should be obfuscated
+        self.assertIn('1.txt', wfp)
+        self.assertNotIn('test.txt', wfp)
+    def test_large_file_with_line_endings(self):
+        """Test large files with many line endings."""
+        winnowing = Winnowing(debug=True, size_limit=True, post_size=1)  # 1KB limit
+        # Create content larger than the limit
+        content = b'line\n' * 1000  # Should exceed 1KB
+        wfp = winnowing.wfp_for_contents('large.txt', False, content)
+        print(f'Large file WFP length: {len(wfp)}')
+        # Should still generate fh2 even with size limits
+        self.assertIn('fh2=', wfp)
+    def test_single_line_no_newline(self):
+        """Test single line files without trailing newline."""
+        winnowing = Winnowing(debug=True)
+        content = b'single line without newline'
+        wfp = winnowing.wfp_for_contents('single.txt', False, content)
+        print(f'Single line no newline WFP:\n{wfp}')
+        # Should not generate fh2 (no line endings)
+        self.assertNotIn('fh2=', wfp)
+    def test_file_with_null_bytes_and_newlines(self):
+        """Test files with null bytes mixed with newlines."""
+        winnowing = Winnowing(debug=True)
+        content = b'line1\x00\nline2\x00\x00\nline3\n'
+        wfp = winnowing.wfp_for_contents('nullbytes.txt', False, content)
+        print(f'Null bytes with newlines WFP:\n{wfp}')
+        # Should generate fh2 (has line endings)
+        self.assertIn('fh2=', wfp)
+if __name__ == '__main__':
+    unittest.main()

scanoss-1.24.0/src/scanoss/data/build_date.txt DELETED Viewed

	@@ -1 +0,0 @@
1	- date: 20250528081438, utime: 1748420078

scanoss-1.24.0/tests/test_winnowing.py DELETED Viewed

@@ -1,82 +0,0 @@
-"""
-SPDX-License-Identifier: MIT
-  Copyright (c) 2021, SCANOSS
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so, subject to the following conditions:
-  The above copyright notice and this permission notice shall be included in
-  all copies or substantial portions of the Software.
-  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-  THE SOFTWARE.
-"""
-import unittest
-from scanoss.winnowing import Winnowing
-class MyTestCase(unittest.TestCase):
-    """
-    Exercise the Winnowing class
-    """
-    def test_winnowing(self):
-        winnowing = Winnowing(debug=True)
-        filename = 'test-file.c'
-        contents = 'c code contents'
-        content_types = bytes(contents, encoding='raw_unicode_escape')
-        wfp = winnowing.wfp_for_contents(filename, False, content_types)
-        print(f'WFP for {filename}: {wfp}')
-        self.assertIsNotNone(wfp)
-        filename = __file__
-        wfp = winnowing.wfp_for_file(filename, filename)
-        print(f'WFP for {filename}: {wfp}')
-        self.assertIsNotNone(wfp)
-    def test_snippet_skip(self):
-        winnowing = Winnowing(debug=True)
-        filename = 'test-file.jar'
-        contents = 'jar file contents'
-        content_types = bytes(contents, encoding='raw_unicode_escape')
-        wfp = winnowing.wfp_for_contents(filename, False, content_types)
-        print(f'WFP for {filename}: {wfp}')
-        self.assertIsNotNone(wfp)
-    def test_snippet_strip(self):
-        winnowing = Winnowing(
-            debug=True, hpsm=True, strip_snippet_ids=['d5e54c33,b03faabe'], strip_hpsm_ids=['0d2fffaffc62d18']
-        )
-        filename = 'test-file.py'
-        with open(__file__, 'rb') as f:
-            contents = f.read()
-        print('--- Test snippet and HPSM strip ---')
-        wfp = winnowing.wfp_for_contents(filename, False, contents)
-        found = 0
-        print(f'WFP for {filename}: {wfp}')
-        try:
-            found = wfp.index('d5e54c33,b03faabe')
-        except ValueError:
-            found = -1
-        self.assertEqual(found, -1)
-        try:
-            found = wfp.index('0d2fffaffc62d18')
-        except ValueError:
-            found = -1
-        self.assertEqual(found, -1)
-if __name__ == '__main__':
-    unittest.main()