scanoss 1.41.1__tar.gz → 1.43.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125) hide show
  1. {scanoss-1.41.1/src/scanoss.egg-info → scanoss-1.43.0}/PKG-INFO +1 -1
  2. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/__init__.py +1 -1
  3. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/cli.py +17 -5
  4. scanoss-1.43.0/src/scanoss/data/build_date.txt +1 -0
  5. scanoss-1.43.0/src/scanoss/header_filter.py +563 -0
  6. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/scanner.py +14 -137
  7. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/scanossapi.py +1 -1
  8. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/scanossbase.py +1 -1
  9. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/winnowing.py +71 -19
  10. {scanoss-1.41.1 → scanoss-1.43.0/src/scanoss.egg-info}/PKG-INFO +1 -1
  11. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss.egg-info/SOURCES.txt +2 -0
  12. scanoss-1.43.0/tests/test_headers_filter.py +370 -0
  13. {scanoss-1.41.1 → scanoss-1.43.0}/tests/test_winnowing.py +144 -2
  14. scanoss-1.41.1/src/scanoss/data/build_date.txt +0 -1
  15. {scanoss-1.41.1 → scanoss-1.43.0}/LICENSE +0 -0
  16. {scanoss-1.41.1 → scanoss-1.43.0}/PACKAGE.md +0 -0
  17. {scanoss-1.41.1 → scanoss-1.43.0}/README.md +0 -0
  18. {scanoss-1.41.1 → scanoss-1.43.0}/pyproject.toml +0 -0
  19. {scanoss-1.41.1 → scanoss-1.43.0}/setup.cfg +0 -0
  20. {scanoss-1.41.1 → scanoss-1.43.0}/src/protoc_gen_swagger/__init__.py +0 -0
  21. {scanoss-1.41.1 → scanoss-1.43.0}/src/protoc_gen_swagger/options/__init__.py +0 -0
  22. {scanoss-1.41.1 → scanoss-1.43.0}/src/protoc_gen_swagger/options/annotations_pb2.py +0 -0
  23. {scanoss-1.41.1 → scanoss-1.43.0}/src/protoc_gen_swagger/options/annotations_pb2.pyi +0 -0
  24. {scanoss-1.41.1 → scanoss-1.43.0}/src/protoc_gen_swagger/options/annotations_pb2_grpc.py +0 -0
  25. {scanoss-1.41.1 → scanoss-1.43.0}/src/protoc_gen_swagger/options/openapiv2_pb2.py +0 -0
  26. {scanoss-1.41.1 → scanoss-1.43.0}/src/protoc_gen_swagger/options/openapiv2_pb2.pyi +0 -0
  27. {scanoss-1.41.1 → scanoss-1.43.0}/src/protoc_gen_swagger/options/openapiv2_pb2_grpc.py +0 -0
  28. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/__init__.py +0 -0
  29. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/common/__init__.py +0 -0
  30. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/common/v2/__init__.py +0 -0
  31. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/common/v2/scanoss_common_pb2.py +0 -0
  32. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/common/v2/scanoss_common_pb2_grpc.py +0 -0
  33. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/components/__init__.py +0 -0
  34. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/components/v2/__init__.py +0 -0
  35. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/components/v2/scanoss_components_pb2.py +0 -0
  36. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/components/v2/scanoss_components_pb2_grpc.py +0 -0
  37. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/cryptography/v2/scanoss_cryptography_pb2.py +0 -0
  38. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/cryptography/v2/scanoss_cryptography_pb2_grpc.py +0 -0
  39. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/dependencies/__init__.py +0 -0
  40. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/dependencies/v2/__init__.py +0 -0
  41. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/dependencies/v2/scanoss_dependencies_pb2.py +0 -0
  42. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/dependencies/v2/scanoss_dependencies_pb2_grpc.py +0 -0
  43. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/geoprovenance/__init__.py +0 -0
  44. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/geoprovenance/v2/__init__.py +0 -0
  45. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/geoprovenance/v2/scanoss_geoprovenance_pb2.py +0 -0
  46. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/geoprovenance/v2/scanoss_geoprovenance_pb2_grpc.py +0 -0
  47. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/licenses/__init__.py +0 -0
  48. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/licenses/v2/__init__.py +0 -0
  49. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/licenses/v2/scanoss_licenses_pb2.py +0 -0
  50. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/licenses/v2/scanoss_licenses_pb2_grpc.py +0 -0
  51. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/scanning/__init__.py +0 -0
  52. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/scanning/v2/__init__.py +0 -0
  53. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/scanning/v2/scanoss_scanning_pb2.py +0 -0
  54. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/scanning/v2/scanoss_scanning_pb2_grpc.py +0 -0
  55. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/semgrep/__init__.py +0 -0
  56. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/semgrep/v2/__init__.py +0 -0
  57. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/semgrep/v2/scanoss_semgrep_pb2.py +0 -0
  58. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/semgrep/v2/scanoss_semgrep_pb2_grpc.py +0 -0
  59. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/vulnerabilities/__init__.py +0 -0
  60. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/vulnerabilities/v2/__init__.py +0 -0
  61. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/vulnerabilities/v2/scanoss_vulnerabilities_pb2.py +0 -0
  62. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/api/vulnerabilities/v2/scanoss_vulnerabilities_pb2_grpc.py +0 -0
  63. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/components.py +0 -0
  64. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/constants.py +0 -0
  65. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/cryptography.py +0 -0
  66. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/csvoutput.py +0 -0
  67. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/cyclonedx.py +0 -0
  68. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/data/osadl-copyleft.json +0 -0
  69. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/data/scanoss-settings-schema.json +0 -0
  70. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/data/spdx-exceptions.json +0 -0
  71. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/data/spdx-licenses.json +0 -0
  72. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/delta.py +0 -0
  73. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/export/__init__.py +0 -0
  74. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/export/dependency_track.py +0 -0
  75. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/file_filters.py +0 -0
  76. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/filecount.py +0 -0
  77. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/gitlabqualityreport.py +0 -0
  78. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/inspection/__init__.py +0 -0
  79. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/inspection/policy_check/__init__.py +0 -0
  80. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/inspection/policy_check/dependency_track/__init__.py +0 -0
  81. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/inspection/policy_check/dependency_track/project_violation.py +0 -0
  82. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/inspection/policy_check/policy_check.py +0 -0
  83. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/inspection/policy_check/scanoss/__init__.py +0 -0
  84. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/inspection/policy_check/scanoss/copyleft.py +0 -0
  85. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/inspection/policy_check/scanoss/undeclared_component.py +0 -0
  86. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/inspection/summary/__init__.py +0 -0
  87. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/inspection/summary/component_summary.py +0 -0
  88. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/inspection/summary/license_summary.py +0 -0
  89. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/inspection/summary/match_summary.py +0 -0
  90. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/inspection/utils/file_utils.py +0 -0
  91. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/inspection/utils/license_utils.py +0 -0
  92. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/inspection/utils/markdown_utils.py +0 -0
  93. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/inspection/utils/scan_result_processor.py +0 -0
  94. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/osadl.py +0 -0
  95. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/results.py +0 -0
  96. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/scancodedeps.py +0 -0
  97. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/scanners/__init__.py +0 -0
  98. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/scanners/container_scanner.py +0 -0
  99. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/scanners/folder_hasher.py +0 -0
  100. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/scanners/scanner_config.py +0 -0
  101. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/scanners/scanner_hfh.py +0 -0
  102. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/scanoss_settings.py +0 -0
  103. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/scanossgrpc.py +0 -0
  104. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/scanpostprocessor.py +0 -0
  105. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/scantype.py +0 -0
  106. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/services/dependency_track_service.py +0 -0
  107. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/spdxlite.py +0 -0
  108. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/threadeddependencies.py +0 -0
  109. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/threadedscanning.py +0 -0
  110. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/utils/__init__.py +0 -0
  111. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/utils/abstract_presenter.py +0 -0
  112. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/utils/crc64.py +0 -0
  113. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/utils/file.py +0 -0
  114. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/utils/scanoss_scan_results_utils.py +0 -0
  115. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss/utils/simhash.py +0 -0
  116. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss.egg-info/dependency_links.txt +0 -0
  117. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss.egg-info/entry_points.txt +0 -0
  118. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss.egg-info/requires.txt +0 -0
  119. {scanoss-1.41.1 → scanoss-1.43.0}/src/scanoss.egg-info/top_level.txt +0 -0
  120. {scanoss-1.41.1 → scanoss-1.43.0}/tests/test_csv_output.py +0 -0
  121. {scanoss-1.41.1 → scanoss-1.43.0}/tests/test_file_filters.py +0 -0
  122. {scanoss-1.41.1 → scanoss-1.43.0}/tests/test_osadl.py +0 -0
  123. {scanoss-1.41.1 → scanoss-1.43.0}/tests/test_policy_inspect.py +0 -0
  124. {scanoss-1.41.1 → scanoss-1.43.0}/tests/test_scan_post_processor.py +0 -0
  125. {scanoss-1.41.1 → scanoss-1.43.0}/tests/test_spdxlite.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scanoss
3
- Version: 1.41.1
3
+ Version: 1.43.0
4
4
  Summary: Simple Python library to leverage the SCANOSS APIs
5
5
  Home-page: https://scanoss.com
6
6
  Author: SCANOSS
@@ -22,4 +22,4 @@ SPDX-License-Identifier: MIT
22
22
  THE SOFTWARE.
23
23
  """
24
24
 
25
- __version__ = '1.41.1'
25
+ __version__ = '1.43.0'
@@ -170,7 +170,6 @@ def setup_args() -> None: # noqa: PLR0912, PLR0915
170
170
  default=DEFAULT_RETRY,
171
171
  help='Retry limit for API communication (optional - default 5)',
172
172
  )
173
- p_scan.add_argument('--no-wfp-output', action='store_true', help='Skip WFP file generation')
174
173
  p_scan.add_argument('--dependencies', '-D', action='store_true', help='Add Dependency scanning')
175
174
  p_scan.add_argument('--dependencies-only', action='store_true', help='Run Dependency scanning only')
176
175
  p_scan.add_argument(
@@ -1096,6 +1095,19 @@ def setup_args() -> None: # noqa: PLR0912, PLR0915
1096
1095
  p.add_argument('--skip-md5', '-5', type=str, action='append', help='Skip files matching MD5.')
1097
1096
  p.add_argument('--strip-hpsm', '-G', type=str, action='append', help='Strip HPSM string from WFP.')
1098
1097
  p.add_argument('--strip-snippet', '-N', type=str, action='append', help='Strip Snippet ID string from WFP.')
1098
+ p.add_argument(
1099
+ '--skip-headers',
1100
+ '-skh',
1101
+ action='store_true',
1102
+ help='Skip license headers, comments and imports at the beginning of files.',
1103
+ )
1104
+ p.add_argument(
1105
+ '--skip-headers-limit',
1106
+ '-shl',
1107
+ type=int,
1108
+ default=0,
1109
+ help='Maximum number of lines to skip when filtering headers (default: 0 = no limit).',
1110
+ )
1099
1111
 
1100
1112
  # Global Scan/GRPC options
1101
1113
  for p in [
@@ -1388,6 +1400,8 @@ def wfp(parser, args):
1388
1400
  strip_hpsm_ids=args.strip_hpsm,
1389
1401
  strip_snippet_ids=args.strip_snippet,
1390
1402
  scan_settings=scan_settings,
1403
+ skip_headers=args.skip_headers,
1404
+ skip_headers_limit=args.skip_headers_limit,
1391
1405
  )
1392
1406
  if args.stdin:
1393
1407
  contents = sys.stdin.buffer.read()
@@ -1537,9 +1551,6 @@ def scan(parser, args): # noqa: PLR0912, PLR0915
1537
1551
  if args.retry < 0:
1538
1552
  print_stderr(f'POST retry (--retry) too small: {args.retry}. Reverting to default.')
1539
1553
 
1540
- if not os.access(os.getcwd(), os.W_OK): # Make sure the current directory is writable. If not disable saving WFP
1541
- print_stderr(f'Warning: Current directory is not writable: {os.getcwd()}')
1542
- args.no_wfp_output = True
1543
1554
  if args.ca_cert and not os.path.exists(args.ca_cert):
1544
1555
  print_stderr(f'Error: Certificate file does not exist: {args.ca_cert}.')
1545
1556
  sys.exit(1)
@@ -1558,7 +1569,6 @@ def scan(parser, args): # noqa: PLR0912, PLR0915
1558
1569
  nb_threads=args.threads,
1559
1570
  post_size=args.post_size,
1560
1571
  timeout=args.timeout,
1561
- no_wfp_file=args.no_wfp_output,
1562
1572
  all_extensions=args.all_extensions,
1563
1573
  all_folders=args.all_folders,
1564
1574
  hidden_files_folders=args.all_hidden,
@@ -1583,6 +1593,8 @@ def scan(parser, args): # noqa: PLR0912, PLR0915
1583
1593
  scan_settings=scan_settings,
1584
1594
  req_headers=process_req_headers(args.header),
1585
1595
  use_grpc=args.grpc,
1596
+ skip_headers=args.skip_headers,
1597
+ skip_headers_limit=args.skip_headers_limit,
1586
1598
  )
1587
1599
  if args.wfp:
1588
1600
  if not scanner.is_file_or_snippet_scan():
@@ -0,0 +1 @@
1
+ date: 20260105093002, utime: 1767605402
@@ -0,0 +1,563 @@
1
+ """
2
+ SPDX-License-Identifier: MIT
3
+
4
+ Copyright (c) 2025, SCANOSS
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in
14
+ all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ THE SOFTWARE.
23
+
24
+ Line Filter Module - Identifies where real source code implementation begins.
25
+
26
+ This module analyzes source code files and determines which lines are:
27
+ - License headers
28
+ - Documentation comments
29
+ - Imports/includes
30
+ - Blank lines
31
+
32
+ And returns the content from where the real implementation begins.
33
+ """
34
+
35
+ import re
36
+ from pathlib import Path
37
+ from typing import Optional, Tuple
38
+
39
+ from .scanossbase import ScanossBase
40
+
41
+
42
class LanguagePatterns:
    """
    Static tables of regular expressions and keywords used for header detection.

    Nothing here is computed at runtime; the class is only a namespace for
    three lookup tables.

    :cvar COMMENT_PATTERNS: Maps a comment-style family name to the regex
        patterns recognising its single-line comments and the start/end of its
        multi-line comments (or docstrings).
    :cvar IMPORT_PATTERNS: Maps a language name to the list of regex patterns
        recognising its import/include/package statements.
    :cvar LICENSE_KEYWORDS: Lower-case keywords whose presence in a comment
        line suggests the line belongs to a license header.
    """

    # ------------------------------------------------------------------
    # Comment syntax, grouped by style family
    # ------------------------------------------------------------------
    COMMENT_PATTERNS = {
        # C family: C, C++, Java, JavaScript, TypeScript, Go, Rust, C#, PHP,
        # Kotlin, Scala, Dart, Objective-C
        'c_style': {
            'single_line': r'^\s*//.*$',
            'multi_start': r'^\s*/\*',
            'multi_end': r'\*/\s*$',
            'multi_single': r'^\s*/\*.*\*/\s*$',
        },
        # Hash family: Python, shell scripts, Ruby, Perl, R, Julia, YAML
        'python_style': {
            'single_line': r'^\s*#.*$',
            'doc_string_start': r'^\s*"""',
            'doc_string_end': r'"""\s*$',
        },
        # Double-dash family: Lua, SQL, Haskell
        'lua_style': {
            'single_line': r'^\s*--.*$',
            'multi_start': r'^\s*--\[\[',
            'multi_end': r'\]\]\s*$',
        },
        # Markup family: HTML, XML (no single-line form)
        'html_style': {
            'multi_start': r'^\s*<!--',
            'multi_end': r'-->\s*$',
            'multi_single': r'^\s*<!--.*-->\s*$',
        },
    }

    # ------------------------------------------------------------------
    # Import / include / package statements, keyed by language
    # ------------------------------------------------------------------
    IMPORT_PATTERNS = {
        'python': [r'^\s*import\s+', r'^\s*from\s+.*\s+import\s+'],
        'javascript': [
            r'^\s*import\s+.*\s+from\s+',
            r'^\s*import\s+["\']',
            r'^\s*import\s+type\s+',
            r'^\s*export\s+\*\s+from\s+',
            r'^\s*export\s+\{.*\}\s+from\s+',
            r'^\s*const\s+.*\s*=\s*require\(',
            r'^\s*var\s+.*\s*=\s*require\(',
            r'^\s*let\s+.*\s*=\s*require\(',
        ],
        'typescript': [
            r'^\s*import\s+',
            r'^\s*export\s+.*\s+from\s+',
            r'^\s*import\s+type\s+',
            r'^\s*import\s+\{.*\}\s+from\s+',
        ],
        'java': [r'^\s*import\s+', r'^\s*package\s+'],
        'kotlin': [r'^\s*import\s+', r'^\s*package\s+'],
        'scala': [r'^\s*import\s+', r'^\s*package\s+'],
        'go': [
            r'^\s*import\s+\(',
            r'^\s*import\s+"',
            r'^\s*package\s+',
            # A bare quoted path, as found inside an import ( ... ) block
            r'^\s*"[^"]*"\s*$',
            # An aliased import: name "package"
            r'^\s*[a-zA-Z_][a-zA-Z0-9_]*\s+"[^"]*"\s*$',
            # A blank-identifier import: _ "package"
            r'^\s*_\s+"[^"]*"\s*$',
        ],
        'rust': [r'^\s*use\s+', r'^\s*extern\s+crate\s+', r'^\s*mod\s+'],
        'cpp': [
            r'^\s*#include\s+',
            r'^\s*#pragma\s+',
            # Header guards: #ifndef FOO_H / #define FOO_H
            r'^\s*#ifndef\s+.*_H.*',
            r'^\s*#define\s+.*_H.*',
            # Closing #endif, optionally followed by a // comment
            r'^\s*#endif\s+(//.*)?\s*$',
        ],
        'csharp': [r'^\s*using\s+', r'^\s*namespace\s+'],
        'php': [
            r'^\s*use\s+',
            r'^\s*require\s+',
            r'^\s*require_once\s+',
            r'^\s*include\s+',
            r'^\s*include_once\s+',
            r'^\s*namespace\s+',
        ],
        'swift': [r'^\s*import\s+'],
        'ruby': [r'^\s*require\s+', r'^\s*require_relative\s+', r'^\s*load\s+'],
        'perl': [r'^\s*use\s+', r'^\s*require\s+'],
        'r': [r'^\s*library\(', r'^\s*require\(', r'^\s*source\('],
        'lua': [r'^\s*require\s+', r'^\s*local\s+.*\s*=\s*require\('],
        'dart': [r'^\s*import\s+', r'^\s*export\s+', r'^\s*part\s+'],
        'haskell': [r'^\s*import\s+', r'^\s*module\s+'],
        'elixir': [
            r'^\s*import\s+',
            r'^\s*alias\s+',
            r'^\s*require\s+',
            r'^\s*use\s+',
        ],
        'clojure': [
            r'^\s*\(\s*ns\s+',
            r'^\s*\(\s*require\s+',
            r'^\s*\(\s*import\s+',
        ],
    }

    # ------------------------------------------------------------------
    # Lower-case keywords hinting at license text
    # ------------------------------------------------------------------
    LICENSE_KEYWORDS = [
        'copyright',
        'license',
        'licensed',
        'all rights reserved',
        'permission',
        'redistribution',
        'warranty',
        'liability',
        'apache',
        'mit',
        'gpl',
        'bsd',
        'mozilla',
        'author:',
        'spdx-license',
        'contributors',
        'licensee',
    ]
205
+
206
# Number of triple-quote markers found on a line holding a complete one-line docstring
COMPLETE_DOCSTRING_QUOTE_COUNT = 2
# Line number below which comment lines following a license keyword are still
# treated as part of the license header
LICENSE_HEADER_MAX_LINES = 50

# File extension (with leading dot) -> language name used as the key into the
# import pattern table
EXT_MAP = {
    # --- Python / JavaScript / TypeScript ---
    '.py': 'python',
    '.js': 'javascript',
    '.mjs': 'javascript',
    '.cjs': 'javascript',
    '.ts': 'typescript',
    '.tsx': 'typescript',
    '.jsx': 'javascript',
    # --- JVM languages ---
    '.java': 'java',
    '.kt': 'kotlin',
    '.kts': 'kotlin',
    '.scala': 'scala',
    '.sc': 'scala',
    # --- Go / Rust ---
    '.go': 'go',
    '.rs': 'rust',
    # --- C / C++ sources and headers ---
    '.cpp': 'cpp',
    '.cc': 'cpp',
    '.cxx': 'cpp',
    '.c': 'cpp',
    '.h': 'cpp',
    '.hpp': 'cpp',
    '.hxx': 'cpp',
    # --- Assorted languages ---
    '.cs': 'csharp',
    '.php': 'php',
    '.swift': 'swift',
    '.rb': 'ruby',
    '.pl': 'perl',
    '.pm': 'perl',
    '.r': 'r',
    '.R': 'r',
    '.lua': 'lua',
    '.dart': 'dart',
    '.hs': 'haskell',
    '.ex': 'elixir',
    '.exs': 'elixir',
    '.clj': 'clojure',
    '.cljs': 'clojure',
    # --- Objective-C / Objective-C++ are handled with the C/C++ rules ---
    '.m': 'cpp',
    '.mm': 'cpp',
    # --- Shell scripts: same '#' comment style as Python, but there are no
    #     dedicated import patterns, so source/. commands are not filtered ---
    '.sh': 'python',
    '.bash': 'python',
    '.zsh': 'python',
    '.fish': 'python',
}
255
+
256
+
257
def is_blank_line(stripped_line: str) -> bool:
    """
    Check if a line is blank.

    A line is blank when it is empty or consists entirely of whitespace.
    Callers normally pass an already-stripped line, but unstripped input is
    accepted as well, so a whitespace-only string is reported as blank.

    :param stripped_line: The line to evaluate (stripped or not).
    :return: True if the line is empty or whitespace-only, otherwise False.
    """
    # strip() is a no-op on already-stripped input, so existing callers see
    # exactly the same result as before.
    return not stripped_line.strip()
268
+
269
+
270
def is_shebang(stripped_line: str) -> bool:
    """
    Tell whether a line is an interpreter (shebang) directive.

    :param stripped_line: The line to inspect, with leading whitespace removed.
    :return: True when the first two characters are '#!', otherwise False.
    """
    shebang_marker = '#!'
    leading_chars = stripped_line[:len(shebang_marker)]
    return leading_chars == shebang_marker
282
+
283
+
284
class HeaderFilter(ScanossBase):
    """
    Source code file analyser that locates where the real implementation starts.

    The filter walks a file from the top and passes over the shebang, blank
    lines, comments (including license headers and docstrings) and
    import/include statements. It reports how many leading lines precede the
    first implementation line; it does not modify the content itself.
    """

    def __init__(
        self,
        debug: bool = False,
        trace: bool = False,
        quiet: bool = False,
        skip_limit: Optional[int] = None
    ):
        """
        Initialise HeaderFilter
        Parameters
        ----------
            debug: bool
                Passed through to the base class constructor.
            trace: bool
                Passed through to the base class constructor.
            quiet: bool
                Passed through to the base class constructor.
            skip_limit: int
                Maximum number of lines to skip when analysing a file.
                If set, then stop stripping data after this number of lines.
                (None/0 = unlimited by default)
        """
        super().__init__(debug, trace, quiet)
        self.patterns = LanguagePatterns()
        self.max_lines = skip_limit

    def filter(self, file: str, decoded_contents: str) -> int:
        """
        Work out how many leading lines of a file can be skipped.

        :param file: File path (only its extension is used, to detect the language)
        :param decoded_contents: File contents as decoded text
        :return: Number of lines to skip from the beginning of the file.
                 0 means no filtering: missing input, unsupported language,
                 no implementation line found, or implementation on line 1.
        """
        if not decoded_contents or not file:
            self.print_msg(f'No file or contents provided, skipping line filter for: {file}')
            return 0
        self.print_debug(f'HeaderFilter processing file: {file}')
        # Detect language
        language = self.detect_language(file)
        # If language is not supported, report that nothing should be skipped
        if not language:
            self.print_debug(f'Skipping line filter for unsupported language: {file}')
            return 0
        lines = decoded_contents.splitlines(keepends=True)
        num_lines = len(lines)
        if num_lines == 0:
            self.print_msg(f'No lines in file: {file}')
            return 0
        self.print_debug(f'Analysing {num_lines} lines for file: {file}')

        # Find the first implementation line (stops at the first match)
        implementation_start = self.find_first_implementation_line(lines, language)
        # The whole file is header material (comments/imports/blank lines):
        # skip nothing rather than stripping the entire file
        if implementation_start is None:
            self.print_debug(f'No implementation found in file: {file}')
            return 0
        # implementation_start is 1-indexed, so the lines before it are the offset
        line_offset = implementation_start - 1
        # Apply max_lines limit if configured (None or 0 means unlimited)
        if self.max_lines is not None and 0 < self.max_lines < line_offset:
            self.print_trace(
                f'Line offset {line_offset} exceeds max_lines {self.max_lines}, '
                f'capping at {self.max_lines} for: {file}'
            )
            line_offset = self.max_lines

        if line_offset > 0:
            self.print_debug(f'Filtered out {line_offset} lines from beginning of {file} (language: {language})')
        return line_offset

    def detect_language(self, file_path: str) -> Optional[str]:
        """
        Detect the programming language from the file extension.

        The extension is lower-cased before it is looked up in EXT_MAP, so the
        match is case-insensitive.

        :param file_path: Path to the file whose programming language needs to be detected.
        :return: The language mapped to the file extension, or None when the
                 file has no extension or the extension is not mapped.
        """
        path = Path(file_path)
        extension = path.suffix.lower()
        if extension:
            detected_language = EXT_MAP.get(extension)
            if detected_language:
                self.print_debug(f'Detected language "{detected_language}" for extension "{extension}"')
            else:
                self.print_debug(f'No language mapping found for extension "{extension}"')
        else:
            self.print_debug(f'No file extension found, skipping language detection for: {file_path}')
            detected_language = None
        return detected_language

    def is_license_header(self, line: str) -> bool:
        """
        Check if the line appears to be part of a license header.

        The check is a case-insensitive substring search against the
        LICENSE_KEYWORDS list, so a keyword embedded in a longer word also
        counts as a match.

        :param line: The line of text to check.
        :return: True if the line contains any license keyword; False otherwise.
        """
        line_lower = line.lower()
        return any(keyword in line_lower for keyword in self.patterns.LICENSE_KEYWORDS)

    def get_comment_style(self, language: str) -> str:
        """
        Return the comment style family associated with a programming language.

        :param language: The name of the programming language.
        :return: One of 'c_style', 'python_style' or 'lua_style'. Languages
                 without an explicit mapping fall back to 'c_style'.
        """
        if language:
            if language in ('cpp', 'java', 'kotlin', 'scala', 'javascript', 'typescript',
                            'go', 'rust', 'csharp', 'php', 'swift', 'dart'):
                return 'c_style'
            # Elixir line comments start with '#', so it belongs to the hash
            # family rather than the C-style default.
            if language in ('python', 'ruby', 'perl', 'r', 'elixir'):
                return 'python_style'
            if language in ('lua', 'haskell'):
                return 'lua_style'
        # NOTE: Clojure (';' comments) has no matching style family and still
        # ends up here.
        self.print_debug(f'No comment style defined for language "{language}", using default: "c_style"')
        return 'c_style'  # Default

    def is_comment(self, line: str, in_multiline: bool, patterns: dict) -> Tuple[bool, bool]:  # noqa: PLR0911
        """
        Check if a line is a comment

        Only triple double-quote docstrings are recognised; triple single-quote
        strings are not treated as docstrings.

        :param line: Line to check
        :param in_multiline: Whether we're currently in a multiline comment
        :param patterns: Comment patterns for the language's comment style
        :return: Tuple of (is_comment, still_in_multiline)
        """
        if not patterns:
            self.print_msg('No comment patterns defined, skipping comment check')
            return False, in_multiline
        # If we're in a multiline comment, every line is a comment until the end marker
        if in_multiline:
            # Check if the comment ends
            if 'multi_end' in patterns and re.search(patterns['multi_end'], line):
                return True, False
            if 'doc_string_end' in patterns and re.search(patterns['doc_string_end'], line):
                return True, False
            return True, True
        # Single-line comment
        if 'single_line' in patterns and re.match(patterns['single_line'], line):
            return True, False
        # Multiline comment complete in one line
        if 'multi_single' in patterns and re.match(patterns['multi_single'], line):
            return True, False
        # Start of multiline comment (C-style)
        if 'multi_start' in patterns and re.search(patterns['multi_start'], line):
            # If it also ends on the same line
            if 'multi_end' in patterns and re.search(patterns['multi_end'], line):
                return True, False
            return True, True
        # Start of docstring (Python)
        if 'doc_string_start' in patterns and '"""' in line:
            # Count how many triple-quote markers there are
            count = line.count('"""')
            if count == COMPLETE_DOCSTRING_QUOTE_COUNT:  # Complete docstring in one line
                return True, False
            if count == 1:  # Start of a multiline docstring
                return True, True
        # Default response: not a comment
        return False, in_multiline

    def is_import(self, line: str, patterns: dict) -> bool:
        """
        Check if a line of code is an import or include statement.

        :param line: A single line of code to check.
        :param patterns: Import patterns (regex strings) for the file's language.
        :return: True if the line matches any of the patterns. False when no
                 pattern matches or when no patterns are supplied.
        """
        if not patterns:
            self.print_debug('No import patterns defined, skipping import check')
            # Without patterns nothing can match; returning here also avoids
            # iterating over None.
            return False
        return any(re.match(pattern, line) for pattern in patterns)

    def find_first_implementation_line(self, lines: list[str], language: str) -> Optional[int]:  # noqa: PLR0912
        """
        Find the line number where the implementation begins.
        Returns as soon as the first implementation line is found.

        :param lines: List of code lines
        :param language: Programming language
        :return: Line number (1-indexed) where implementation starts, or None if not found
        """
        if not lines or not language:
            self.print_debug('No lines or language provided, skipping implementation line detection')
            return None
        in_multiline_comment = False
        in_license_section = False
        in_import_block = False  # To handle import blocks in Go
        consecutive_imports_count = 0
        # Get comment & import patterns for the language
        comment_patterns = self.patterns.COMMENT_PATTERNS[self.get_comment_style(language)]
        # A language without import patterns simply has no import lines to skip
        import_patterns = self.patterns.IMPORT_PATTERNS.get(language, [])
        # Iterate through lines trying to find the first implementation line
        for i, line in enumerate(lines):
            line_number = i + 1
            stripped = line.strip()
            # Shebang (only first line) or blank line
            if (i == 0 and is_shebang(stripped)) or is_blank_line(stripped):
                continue
            # Check if it's a comment
            is_a_comment, in_multiline_comment = self.is_comment(line, in_multiline_comment, comment_patterns)
            if is_a_comment:
                # The license bookkeeping only drives trace output; every
                # comment line is skipped regardless.
                if self.is_license_header(line):
                    if not in_license_section:
                        self.print_trace(f'Line {line_number}: Detected license header section')
                    in_license_section = True
                # If still in the license section (first lines)
                elif in_license_section and line_number < LICENSE_HEADER_MAX_LINES:
                    pass  # Still in the license section. Keep looking.
                else:
                    if in_license_section:
                        self.print_trace(f'Line {line_number}: End of license header section')
                    in_license_section = False
                continue
            # A non-comment, non-blank line ends any license section
            in_license_section = False
            # Handle import blocks in Go
            if language == 'go':
                if stripped.startswith('import ('):
                    self.print_trace(f'Line {line_number}: Detected Go import block start')
                    in_import_block = True
                    continue
                if in_import_block:
                    if stripped == ')':
                        self.print_trace(f'Line {line_number}: Detected Go import block end')
                        in_import_block = False
                        continue
                    if (stripped.startswith('"') or stripped.startswith('_') or
                            re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*\s+"', stripped)):
                        # It's part of the import block
                        continue
            # Check if it's an import
            if self.is_import(line, import_patterns):
                if consecutive_imports_count == 0:
                    self.print_trace(f'Line {line_number}: Detected import section')
                consecutive_imports_count += 1
                continue
            # If we get here, it's implementation code - return immediately!
            self.print_trace(f'Line {line_number}: First implementation line detected')
            return line_number
        # Every line was a shebang, blank, comment or import
        return None
561
+ #
562
+ # End of HeaderFilter Class
563
+ #