fosslight-source 2.1.19__py3-none-any.whl → 2.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fosslight_source/_help.py CHANGED
@@ -27,7 +27,7 @@ _HELP_MESSAGE_SOURCE_SCANNER = f"""
27
27
  \t\t\t ({', '.join(SUPPORT_FORMAT)})
28
28
  \t\t\t Multiple formats can be specified separated by space.
29
29
  Options only for FOSSLight Source Scanner
30
- -s <scanner>\t Select which scanner to be run (scancode, scanoss, all)
30
+ -s <scanner>\t Select which scanner to be run (scancode, scanoss, kb, all)
31
31
  -j\t\t\t Generate raw result of scanners in json format
32
32
  -t <float>\t\t Stop scancode scanning if scanning takes longer than a timeout in seconds.
33
33
  -c <core>\t\t Select the number of cores to be scanned with ScanCode or threads with SCANOSS.
@@ -10,12 +10,9 @@ import fosslight_util.constant as constant
10
10
  from fosslight_util.get_pom_license import get_license_from_pom
11
11
  from ._license_matched import MatchedLicense
12
12
  from ._scan_item import SourceItem
13
- from ._scan_item import is_exclude_dir
14
- from ._scan_item import is_exclude_file
15
13
  from ._scan_item import replace_word
16
14
  from ._scan_item import is_notice_file
17
15
  from ._scan_item import is_manifest_file
18
- from ._scan_item import is_package_dir
19
16
  from typing import Tuple
20
17
 
21
18
  logger = logging.getLogger(constant.LOGGER_NAME)
@@ -29,6 +26,14 @@ SPDX_REPLACE_WORDS = ["(", ")"]
29
26
  KEY_AND = r"(?<=\s)and(?=\s)"
30
27
  KEY_OR = r"(?<=\s)or(?=\s)"
31
28
  GPL_LICENSE_PATTERN = r'((a|l)?gpl|gfdl)' # GPL, LGPL, AGPL, GFDL
29
+ SOURCE_EXTENSIONS = [
30
+ '.java', '.cpp', '.c', '.cc', '.cxx', '.c++', '.h', '.hh', '.hpp', '.hxx', '.h++',
31
+ '.cs', '.py', '.pyw', '.js', '.jsx', '.mjs', '.cjs', '.ts', '.tsx',
32
+ '.go', '.rs', '.rb', '.php', '.swift', '.kt', '.kts', '.scala', '.sc',
33
+ '.m', '.mm', '.dart', '.lua', '.pl', '.pm', '.r', '.R',
34
+ '.hs', '.clj', '.cljs', '.ex', '.exs', '.groovy', '.gradle',
35
+ '.vue', '.svelte', '.asm', '.s', '.i', '.ii'
36
+ ]
32
37
 
33
38
 
34
39
  def is_gpl_family_license(licenses: list) -> bool:
@@ -75,8 +80,6 @@ def parsing_scancode_32_earlier(scancode_file_list: list, has_error: bool = Fals
75
80
  msg = []
76
81
  scancode_file_item = []
77
82
  license_list = {} # Key :[license]+[matched_text], value: MatchedLicense()
78
- prev_dir = ""
79
- prev_dir_value = False
80
83
 
81
84
  if scancode_file_list:
82
85
  for file in scancode_file_list:
@@ -88,22 +91,11 @@ def parsing_scancode_32_earlier(scancode_file_list: list, has_error: bool = Fals
88
91
  is_binary = file.get("is_binary", False)
89
92
  if "type" in file:
90
93
  is_dir = file["type"] == "directory"
91
- if is_dir:
92
- prev_dir_value = is_exclude_dir(file_path)
93
- prev_dir = file_path
94
-
95
94
  if not is_binary and not is_dir:
96
95
  licenses = file.get("licenses", [])
97
96
  copyright_list = file.get("copyrights", [])
98
97
 
99
98
  result_item = SourceItem(file_path)
100
- is_pkg, pkg_path = is_package_dir(os.path.dirname(file_path))
101
- if is_pkg:
102
- result_item.source_name_or_path = pkg_path
103
- if not any(x.source_name_or_path == result_item.source_name_or_path for x in scancode_file_item):
104
- result_item.exclude = True
105
- scancode_file_item.append(result_item)
106
- continue
107
99
 
108
100
  if has_error and "scan_errors" in file:
109
101
  error_msg = file.get("scan_errors", [])
@@ -230,8 +222,6 @@ def parsing_scancode_32_earlier(scancode_file_list: list, has_error: bool = Fals
230
222
  set(license_expression_list))
231
223
  result_item.comment = ','.join(license_expression_list)
232
224
 
233
- if is_exclude_file(file_path, prev_dir, prev_dir_value):
234
- result_item.exclude = True
235
225
  scancode_file_item.append(result_item)
236
226
  except Exception as ex:
237
227
  msg.append(f"Error Parsing item: {ex}")
@@ -263,17 +253,9 @@ def parsing_scancode_32_later(
263
253
  is_binary = file.get("is_binary", False)
264
254
  is_dir = file.get("type", "") == "directory"
265
255
  if (not file_path) or is_binary or is_dir:
256
+ logger.info(f"Skipping {file_path} because it is binary or directory")
266
257
  continue
267
-
268
258
  result_item = SourceItem(file_path)
269
- is_pkg, pkg_path = is_package_dir(os.path.dirname(file_path))
270
- if is_pkg:
271
- result_item.source_name_or_path = pkg_path
272
- if not any(x.source_name_or_path == result_item.source_name_or_path for x in scancode_file_item):
273
- result_item.exclude = True
274
- scancode_file_item.append(result_item)
275
- continue
276
-
277
259
  if has_error:
278
260
  error_msg = file.get("scan_errors", [])
279
261
  if error_msg:
@@ -326,9 +308,11 @@ def parsing_scancode_32_later(
326
308
  license_list[lic_matched_key] = lic_info
327
309
  license_detected.append(found_lic)
328
310
  result_item.licenses = license_detected
329
-
330
- result_item.exclude = is_exclude_file(file_path)
331
- result_item.is_license_text = file.get("percentage_of_license_text", 0) > 90 or is_notice_file(file_path)
311
+ file_ext = os.path.splitext(file_path)[1].lower()
312
+ is_source_file = file_ext and file_ext in SOURCE_EXTENSIONS
313
+ result_item.is_license_text = is_notice_file(file_path) or (
314
+ file.get("percentage_of_license_text", 0) > 90 and not is_source_file
315
+ )
332
316
 
333
317
  detected_without_pom = []
334
318
  if is_manifest_file(file_path) and len(license_detected) > 0:
@@ -371,7 +355,7 @@ def parsing_scancode_32_later(
371
355
  license_expression = file.get("detected_license_expression", "")
372
356
  if license_expression_spdx:
373
357
  license_expression = license_expression_spdx
374
- if license_expression:
358
+ if license_expression and "OR" in license_expression:
375
359
  result_item.comment = license_expression
376
360
 
377
361
  scancode_file_item.append(result_item)
@@ -3,12 +3,9 @@
3
3
  # Copyright (c) 2020 LG Electronics Inc.
4
4
  # SPDX-License-Identifier: Apache-2.0
5
5
 
6
- import os
7
6
  import logging
8
7
  import fosslight_util.constant as constant
9
8
  from ._scan_item import SourceItem
10
- from ._scan_item import is_exclude_file
11
- from ._scan_item import is_package_dir
12
9
  from ._scan_item import replace_word
13
10
  from typing import Tuple
14
11
 
@@ -18,7 +15,7 @@ SCANOSS_INFO_HEADER = ['No', 'Source Path', 'Component Declared', 'SPDX Tag',
18
15
  'Matched Rate (line number)', 'scanoss_fileURL']
19
16
 
20
17
 
21
- def parsing_extraInfo(scanned_result: dict) -> list:
18
+ def parsing_extra_info(scanned_result: dict) -> list:
22
19
  scanoss_extra_info = []
23
20
  for scan_item in scanned_result:
24
21
  license_w_source = scan_item.scanoss_reference
@@ -37,22 +34,14 @@ def parsing_extraInfo(scanned_result: dict) -> list:
37
34
  return scanoss_extra_info
38
35
 
39
36
 
40
- def parsing_scanResult(scanoss_report: dict, path_to_scan: str = "", path_to_exclude: list = []) -> Tuple[bool, list]:
37
+ def parsing_scan_result(scanoss_report: dict, excluded_files: set = None) -> Tuple[bool, list]:
41
38
  scanoss_file_item = []
42
- abs_path_to_exclude = [os.path.abspath(os.path.join(path_to_scan, path)) for path in path_to_exclude]
43
39
 
44
40
  for file_path, findings in scanoss_report.items():
45
- abs_file_path = os.path.abspath(os.path.join(path_to_scan, file_path))
46
- if any(os.path.commonpath([abs_file_path, exclude_path]) == exclude_path for exclude_path in abs_path_to_exclude):
41
+ file_path_normalized = file_path.replace('\\', '/')
42
+ if file_path_normalized in excluded_files:
47
43
  continue
48
44
  result_item = SourceItem(file_path)
49
- is_pkg, pkg_path = is_package_dir(os.path.dirname(file_path))
50
- if is_pkg:
51
- result_item.source_name_or_path = pkg_path
52
- if not any(x.source_name_or_path == result_item.source_name_or_path for x in scanoss_file_item):
53
- result_item.exclude = True
54
- scanoss_file_item.append(result_item)
55
- continue
56
45
 
57
46
  if 'id' in findings[0]:
58
47
  if "none" == findings[0]['id']:
@@ -86,9 +75,6 @@ def parsing_scanResult(scanoss_report: dict, path_to_scan: str = "", path_to_exc
86
75
  result_item.licenses = license_detected
87
76
  result_item.scanoss_reference = license_w_source
88
77
 
89
- if is_exclude_file(file_path):
90
- result_item.exclude = True
91
-
92
78
  if 'file_url' in findings[0]:
93
79
  result_item.fileURL = findings[0]['file_url']
94
80
  if 'matched' in findings[0]:
@@ -6,6 +6,10 @@
6
6
  import os
7
7
  import logging
8
8
  import re
9
+ import json
10
+ import hashlib
11
+ import urllib.request
12
+ import urllib.error
9
13
  import fosslight_util.constant as constant
10
14
  from fosslight_util.oss_item import FileItem, OssItem, get_checksum_sha1
11
15
 
@@ -15,17 +19,10 @@ _notice_filename = ['licen[cs]e[s]?', 'notice[s]?', 'legal', 'copyright[s]?', 'c
15
19
  '[a,l]?gpl[-]?[1-3]?[.,-,_]?[0-1]?', 'mit', 'bsd[-]?[0-4]?', 'bsd[-]?[0-4][-]?clause[s]?',
16
20
  'apache[-,_]?[1-2]?[.,-,_]?[0-2]?']
17
21
  _manifest_filename = [r'.*\.pom$', r'package\.json$', r'setup\.py$', r'pubspec\.yaml$', r'.*\.podspec$', r'Cargo\.toml$']
18
- _exclude_filename = ["changelog", "config.guess", "config.sub", "changes", "ltmain.sh",
19
- "configure", "configure.ac", "depcomp", "compile", "missing", "makefile"]
20
- _exclude_extension = [".m4", ".in", ".po"]
21
- _exclude_directory = ["test", "tests", "doc", "docs"]
22
- _exclude_directory = [os.path.sep + dir_name +
23
- os.path.sep for dir_name in _exclude_directory]
24
- _exclude_directory.append("/.")
25
- _package_directory = ["node_modules", "venv", "Pods", "Carthage"]
26
22
  MAX_LICENSE_LENGTH = 200
27
23
  MAX_LICENSE_TOTAL_LENGTH = 600
28
24
  SUBSTRING_LICENSE_COMMENT = "Maximum character limit (License)"
25
+ KB_URL = "http://fosslight-kb.lge.com/query"
29
26
 
30
27
 
31
28
  class SourceItem(FileItem):
@@ -77,7 +74,90 @@ class SourceItem(FileItem):
77
74
  else:
78
75
  self._licenses = value
79
76
 
80
- def set_oss_item(self) -> None:
77
+ def _get_md5_hash(self, path_to_scan: str = "") -> str:
78
+ try:
79
+ file_path = self.source_name_or_path
80
+ if path_to_scan and not os.path.isabs(file_path):
81
+ file_path = os.path.join(path_to_scan, file_path)
82
+ file_path = os.path.normpath(file_path)
83
+
84
+ if os.path.isfile(file_path):
85
+ md5_hash = hashlib.md5()
86
+ with open(file_path, "rb") as f:
87
+ for chunk in iter(lambda: f.read(4096), b""):
88
+ md5_hash.update(chunk)
89
+ return md5_hash.hexdigest()
90
+ except FileNotFoundError:
91
+ logger.warning(f"File not found: {self.source_name_or_path}")
92
+ except PermissionError:
93
+ logger.warning(f"Permission denied: {self.source_name_or_path}")
94
+ except Exception as e:
95
+ logger.warning(f"Failed to compute MD5 for {self.source_name_or_path}: {e}")
96
+ return ""
97
+
98
+ def _get_origin_url_from_md5_hash(self, md5_hash: str) -> str:
99
+ try:
100
+ request = urllib.request.Request(KB_URL, data=json.dumps({"file_hash": md5_hash}).encode('utf-8'), method='POST')
101
+ request.add_header('Accept', 'application/json')
102
+ request.add_header('Content-Type', 'application/json')
103
+
104
+ with urllib.request.urlopen(request, timeout=10) as response:
105
+ data = json.loads(response.read().decode())
106
+ if isinstance(data, dict):
107
+ # Only extract output if return_code is 0 (success)
108
+ return_code = data.get('return_code', -1)
109
+ if return_code == 0:
110
+ output = data.get('output', '')
111
+ if output:
112
+ return output
113
+ except urllib.error.URLError as e:
114
+ logger.warning(f"Failed to fetch origin_url from API for MD5 hash {md5_hash}: {e}")
115
+ except json.JSONDecodeError as e:
116
+ logger.warning(f"Failed to parse API response for MD5 hash {md5_hash}: {e}")
117
+ except Exception as e:
118
+ logger.warning(f"Error getting origin_url for MD5 hash {md5_hash}: {e}")
119
+ return ""
120
+
121
+ def _extract_oss_info_from_url(self, url: str) -> tuple:
122
+ """
123
+ Extract OSS name, version, and repository URL from GitHub URL.
124
+
125
+ Supported patterns:
126
+ - https://github.com/{owner}/{repo}/archive/{version}.zip
127
+ - https://github.com/{owner}/{repo}/archive/{tag}/{version}.zip
128
+ - https://github.com/{owner}/{repo}/releases/download/{version}/{filename}
129
+
130
+ :param url: GitHub URL to extract information from
131
+ :return: tuple of (repo_name, version, repo_url)
132
+ """
133
+ try:
134
+ repo_match = re.search(r'github\.com/([^/]+)/([^/]+)/', url)
135
+ if not repo_match:
136
+ return "", "", ""
137
+
138
+ owner = repo_match.group(1)
139
+ repo_name = repo_match.group(2)
140
+ repo_url = f"https://github.com/{owner}/{repo_name}"
141
+ version = ""
142
+ # Extract version from releases pattern first: /releases/download/{version}/
143
+ releases_match = re.search(r'/releases/download/([^/]+)/', url)
144
+ if releases_match:
145
+ version = releases_match.group(1)
146
+ else:
147
+ # Extract version from archive pattern: /archive/{version}.zip or /archive/{tag}/{version}.zip
148
+ # For pattern with tag, take the last segment before .zip
149
+ archive_match = re.search(r'/archive/(.+?)(?:\.zip|\.tar\.gz)?(?:[?#]|$)', url)
150
+ if archive_match:
151
+ version_path = archive_match.group(1)
152
+ version = version_path.split('/')[-1] if '/' in version_path else version_path
153
+ if re.match(r'^[0-9a-f]{7,40}$', version, re.IGNORECASE):
154
+ version = ""
155
+ return repo_name, version, repo_url
156
+ except Exception as e:
157
+ logger.debug(f"Failed to extract OSS info from URL {url}: {e}")
158
+ return "", "", ""
159
+
160
+ def set_oss_item(self, path_to_scan: str = "", run_kb: bool = False) -> None:
81
161
  self.oss_items = []
82
162
  if self.download_location:
83
163
  for url in self.download_location:
@@ -87,6 +167,20 @@ class SourceItem(FileItem):
87
167
  self.oss_items.append(item)
88
168
  else:
89
169
  item = OssItem(self.oss_name, self.oss_version, self.licenses)
170
+ if run_kb and not self.is_license_text:
171
+ md5_hash = self._get_md5_hash(path_to_scan)
172
+ if md5_hash:
173
+ origin_url = self._get_origin_url_from_md5_hash(md5_hash)
174
+ if origin_url:
175
+ extracted_name, extracted_version, repo_url = self._extract_oss_info_from_url(origin_url)
176
+ if extracted_name:
177
+ self.oss_name = extracted_name
178
+ if extracted_version:
179
+ self.oss_version = extracted_version
180
+ download_url = repo_url if repo_url else origin_url
181
+ self.download_location = [download_url]
182
+ item = OssItem(self.oss_name, self.oss_version, self.licenses, download_url)
183
+
90
184
  item.copyright = "\n".join(self.copyright)
91
185
  item.comment = self.comment
92
186
  self.oss_items.append(item)
@@ -107,39 +201,6 @@ class SourceItem(FileItem):
107
201
  return self.source_name_or_path == other.source_name_or_path
108
202
 
109
203
 
110
- def is_exclude_dir(dir_path: str) -> bool:
111
- if dir_path:
112
- dir_path = dir_path.lower()
113
- dir_path = dir_path if dir_path.endswith(
114
- os.path.sep) else dir_path + os.path.sep
115
- dir_path = dir_path if dir_path.startswith(
116
- os.path.sep) else os.path.sep + dir_path
117
- return any(dir_name in dir_path for dir_name in _exclude_directory)
118
- return False
119
-
120
-
121
- def is_exclude_file(file_path: str, prev_dir: str = None, prev_dir_exclude_value: bool = None) -> bool:
122
- file_path = file_path.lower()
123
- filename = os.path.basename(file_path)
124
- if os.path.splitext(filename)[1] in _exclude_extension:
125
- return True
126
- if filename.startswith('.') or filename in _exclude_filename:
127
- return True
128
-
129
- dir_path = os.path.dirname(file_path)
130
- if prev_dir is not None: # running ScanCode
131
- if dir_path == prev_dir:
132
- return prev_dir_exclude_value
133
- else:
134
- # There will be no execution of this else statement.
135
- # Because scancode json output results are sorted by path,
136
- # most of them will match the previous if statement.
137
- return is_exclude_dir(dir_path)
138
- else: # running SCANOSS
139
- return is_exclude_dir(dir_path)
140
- return False
141
-
142
-
143
204
  def is_notice_file(file_path: str) -> bool:
144
205
  pattern = r"({})(?<!w)".format("|".join(_notice_filename))
145
206
  filename = os.path.basename(file_path)
@@ -150,34 +211,3 @@ def is_manifest_file(file_path: str) -> bool:
150
211
  pattern = r"({})$".format("|".join(_manifest_filename))
151
212
  filename = os.path.basename(file_path)
152
213
  return bool(re.match(pattern, filename, re.IGNORECASE))
153
-
154
-
155
- def is_package_dir(dir_path: str) -> bool:
156
- # scancode and scanoss use '/' as path separator regardless of OS
157
- dir_path = dir_path.replace('\\', '/')
158
- path_parts = dir_path.split('/')
159
-
160
- for pkg_dir in _package_directory:
161
- if pkg_dir in path_parts:
162
- pkg_index = path_parts.index(pkg_dir)
163
- pkg_path = '/'.join(path_parts[:pkg_index + 1])
164
- return True, pkg_path
165
- return False, ""
166
-
167
-
168
- def get_excluded_paths(path_to_scan: str, custom_excluded_paths: list = []) -> list:
169
- path_to_exclude = custom_excluded_paths.copy()
170
- abs_path_to_scan = os.path.abspath(path_to_scan)
171
-
172
- for root, dirs, files in os.walk(path_to_scan):
173
- for dir_name in dirs:
174
- dir_path = os.path.join(root, dir_name)
175
- rel_path = os.path.relpath(dir_path, abs_path_to_scan)
176
- if dir_name in _package_directory:
177
- if rel_path not in path_to_exclude:
178
- path_to_exclude.append(rel_path)
179
- elif is_exclude_dir(rel_path):
180
- if rel_path not in path_to_exclude:
181
- path_to_exclude.append(rel_path)
182
-
183
- return path_to_exclude
fosslight_source/cli.py CHANGED
@@ -8,23 +8,24 @@ import os
8
8
  import platform
9
9
  import warnings
10
10
  import logging
11
+ import urllib.request
12
+ import urllib.error
11
13
  from datetime import datetime
12
14
  import fosslight_util.constant as constant
13
15
  from fosslight_util.set_log import init_log
14
16
  from fosslight_util.timer_thread import TimerThread
15
- from fosslight_util.exclude import excluding_files
16
17
  from ._help import print_version, print_help_msg_source_scanner
17
18
  from ._license_matched import get_license_list_to_print
18
19
  from fosslight_util.output_format import check_output_formats_v2, write_output_file
19
20
  from fosslight_util.correct import correct_with_yaml
20
21
  from .run_scancode import run_scan
21
- from ._scan_item import get_excluded_paths
22
+ from fosslight_util.exclude import get_excluded_paths
22
23
  from .run_scanoss import run_scanoss_py
23
24
  from .run_scanoss import get_scanoss_extra_info
24
25
  import yaml
25
26
  import argparse
26
27
  from .run_spdx_extractor import get_spdx_downloads
27
- from ._scan_item import SourceItem
28
+ from ._scan_item import SourceItem, KB_URL
28
29
  from fosslight_util.oss_item import ScannerItem
29
30
  from typing import Tuple
30
31
 
@@ -35,7 +36,10 @@ SCANOSS_HEADER = {SRC_SHEET_NAME: ['ID', 'Source Path', 'OSS Name',
35
36
  MERGED_HEADER = {SRC_SHEET_NAME: ['ID', 'Source Path', 'OSS Name',
36
37
  'OSS Version', 'License', 'Download Location',
37
38
  'Homepage', 'Copyright Text', 'Exclude', 'Comment', 'license_reference']}
38
- SCANNER_TYPE = ['scancode', 'scanoss', 'all', '']
39
+ SCANNER_TYPE = ['kb', 'scancode', 'scanoss', 'all']
40
+ EXCLUDE_FILENAME = ["changelog", "config.guess", "config.sub", "changes", "ltmain.sh",
41
+ "configure", "configure.ac", "depcomp", "compile", "missing", "Makefile"]
42
+ EXCLUDE_FILE_EXTENSION = [".m4", ".in", ".po"]
39
43
 
40
44
  logger = logging.getLogger(constant.LOGGER_NAME)
41
45
  warnings.filterwarnings("ignore", category=FutureWarning)
@@ -124,23 +128,6 @@ def main() -> None:
124
128
  sys.exit(1)
125
129
 
126
130
 
127
- def count_files(path_to_scan: str, path_to_exclude: list) -> Tuple[int, int]:
128
- total_files = 0
129
- excluded_files = 0
130
- abs_path_to_exclude = [os.path.abspath(os.path.join(path_to_scan, path)) for path in path_to_exclude]
131
-
132
- for root, _, files in os.walk(path_to_scan):
133
- for file in files:
134
- file_path = os.path.join(root, file)
135
- abs_file_path = os.path.abspath(file_path)
136
- if any(os.path.commonpath([abs_file_path, exclude_path]) == exclude_path
137
- for exclude_path in abs_path_to_exclude):
138
- excluded_files += 1
139
- total_files += 1
140
-
141
- return total_files, excluded_files
142
-
143
-
144
131
  def create_report_file(
145
132
  _start_time: str, merged_result: list,
146
133
  license_list: list, scanoss_result: list,
@@ -148,7 +135,7 @@ def create_report_file(
148
135
  output_path: str = "", output_files: list = [],
149
136
  output_extensions: list = [], correct_mode: bool = True,
150
137
  correct_filepath: str = "", path_to_scan: str = "", path_to_exclude: list = [],
151
- formats: list = [], excluded_file_list: list = [], api_limit_exceed: bool = False
138
+ formats: list = [], api_limit_exceed: bool = False, files_count: int = 0
152
139
  ) -> 'ScannerItem':
153
140
  """
154
141
  Create report files for given scanned result.
@@ -207,7 +194,6 @@ def create_report_file(
207
194
 
208
195
  scan_item = ScannerItem(PKG_NAME, _start_time)
209
196
  scan_item.set_cover_pathinfo(path_to_scan, path_to_exclude)
210
- files_count, _ = count_files(path_to_scan, path_to_exclude)
211
197
  scan_item.set_cover_comment(f"Scanned files: {files_count}")
212
198
 
213
199
  if api_limit_exceed:
@@ -221,12 +207,6 @@ def create_report_file(
221
207
 
222
208
  if merged_result:
223
209
  sheet_list = {}
224
- # Remove results that are in excluding file list
225
- for i in range(len(merged_result) - 1, -1, -1): # Iterate from last to first
226
- item_path = merged_result[i].source_name_or_path # Assuming SourceItem has 'file_path' attribute
227
- if item_path in excluded_file_list:
228
- del merged_result[i] # Delete matching item
229
-
230
210
  scan_item.append_file_items(merged_result, PKG_NAME)
231
211
 
232
212
  if selected_scanner == 'scanoss':
@@ -269,17 +249,35 @@ def create_report_file(
269
249
  return scan_item
270
250
 
271
251
 
272
- def merge_results(scancode_result: list = [], scanoss_result: list = [], spdx_downloads: dict = {}) -> list:
252
+ def check_kb_server_reachable() -> bool:
253
+ try:
254
+ request = urllib.request.Request(KB_URL, method='HEAD')
255
+ with urllib.request.urlopen(request, timeout=5) as response:
256
+ logger.debug(f"KB server is reachable. Response status: {response.status}")
257
+ return response.status != 404
258
+ except urllib.error.HTTPError as e:
259
+ return e.code != 404
260
+ except urllib.error.URLError:
261
+ return False
262
+ except Exception:
263
+ return False
264
+
265
+
266
+ def merge_results(
267
+ scancode_result: list = [], scanoss_result: list = [], spdx_downloads: dict = {},
268
+ path_to_scan: str = "", run_kb: bool = False
269
+ ) -> list:
273
270
 
274
271
  """
275
272
  Merge scanner results and spdx parsing result.
276
273
  :param scancode_result: list of scancode results in SourceItem.
277
274
  :param scanoss_result: list of scanoss results in SourceItem.
278
275
  :param spdx_downloads: dictionary of spdx parsed results.
276
+ :param path_to_scan: path to the scanned directory for constructing absolute file paths.
277
+ :param run_kb: if True, load kb result.
279
278
  :return merged_result: list of merged result in SourceItem.
280
279
  """
281
280
 
282
- # If anything that is found at SCANOSS only exist, add it to result.
283
281
  scancode_result.extend([item for item in scanoss_result if item not in scancode_result])
284
282
 
285
283
  # If download loc. in SPDX form found, overwrite the scanner result.
@@ -293,9 +291,15 @@ def merge_results(scancode_result: list = [], scanoss_result: list = [], spdx_do
293
291
  new_result_item = SourceItem(file_name)
294
292
  new_result_item.download_location = download_location
295
293
  scancode_result.append(new_result_item)
294
+ if run_kb and not check_kb_server_reachable():
295
+ run_kb = False
296
+ if run_kb:
297
+ logger.info("KB server is reachable. Loading data from OSS KB.")
298
+ else:
299
+ logger.info("Skipping KB lookup.")
296
300
 
297
301
  for item in scancode_result:
298
- item.set_oss_item()
302
+ item.set_oss_item(path_to_scan, run_kb)
299
303
 
300
304
  return scancode_result
301
305
 
@@ -339,30 +343,38 @@ def run_scanners(
339
343
 
340
344
  logger, result_log = init_log(os.path.join(output_path, f"fosslight_log_src_{start_time}.txt"),
341
345
  True, logging.INFO, logging.DEBUG, PKG_NAME, path_to_scan, path_to_exclude)
342
- excluded_file_list = excluding_files(path_to_exclude, path_to_scan)
343
346
 
344
347
  if '.xlsx' not in output_extensions and print_matched_text:
345
348
  logger.warning("-m option is only available for excel.")
346
349
  print_matched_text = False
347
350
 
348
351
  if success:
349
- excluded_path_with_default_exclusion = get_excluded_paths(path_to_scan, path_to_exclude)
350
- if selected_scanner == 'scancode' or selected_scanner == 'all' or selected_scanner == '':
352
+ path_to_exclude_with_filename = path_to_exclude + EXCLUDE_FILENAME
353
+ excluded_path_with_default_exclusion, excluded_path_without_dot, excluded_files, cnt_file_except_skipped = (
354
+ get_excluded_paths(path_to_scan, path_to_exclude_with_filename, EXCLUDE_FILE_EXTENSION))
355
+ logger.debug(f"Skipped paths: {excluded_path_with_default_exclusion}")
356
+
357
+ if not selected_scanner:
358
+ selected_scanner = 'all'
359
+ if selected_scanner in ['scancode', 'all', 'kb']:
351
360
  success, result_log[RESULT_KEY], scancode_result, license_list = run_scan(path_to_scan, output_file_name,
352
361
  write_json_file, num_cores, True,
353
362
  print_matched_text, formats, called_by_cli,
354
363
  time_out, correct_mode, correct_filepath,
355
- excluded_path_with_default_exclusion)
356
- if selected_scanner == 'scanoss' or selected_scanner == 'all' or selected_scanner == '':
364
+ excluded_path_with_default_exclusion,
365
+ excluded_files)
366
+ excluded_files = set(excluded_files) if excluded_files else set()
367
+ if selected_scanner in ['scanoss', 'all']:
357
368
  scanoss_result, api_limit_exceed = run_scanoss_py(path_to_scan, output_file_name, formats, True, write_json_file,
358
- num_cores, excluded_path_with_default_exclusion)
369
+ num_cores, excluded_path_with_default_exclusion, excluded_files)
359
370
  if selected_scanner in SCANNER_TYPE:
360
- spdx_downloads = get_spdx_downloads(path_to_scan, excluded_path_with_default_exclusion)
361
- merged_result = merge_results(scancode_result, scanoss_result, spdx_downloads)
371
+ run_kb = True if selected_scanner in ['kb', 'all'] else False
372
+ spdx_downloads = get_spdx_downloads(path_to_scan, excluded_files)
373
+ merged_result = merge_results(scancode_result, scanoss_result, spdx_downloads, path_to_scan, run_kb)
362
374
  scan_item = create_report_file(start_time, merged_result, license_list, scanoss_result, selected_scanner,
363
375
  print_matched_text, output_path, output_files, output_extensions, correct_mode,
364
- correct_filepath, path_to_scan, path_to_exclude, formats, excluded_file_list,
365
- api_limit_exceed)
376
+ correct_filepath, path_to_scan, excluded_path_without_dot, formats,
377
+ api_limit_exceed, cnt_file_except_skipped)
366
378
  else:
367
379
  print_help_msg_source_scanner()
368
380
  result_log[RESULT_KEY] = "Unsupported scanner"
@@ -29,7 +29,8 @@ def run_scan(
29
29
  return_results: bool = False, need_license: bool = False,
30
30
  formats: list = [], called_by_cli: bool = False,
31
31
  time_out: int = 120, correct_mode: bool = True,
32
- correct_filepath: str = "", path_to_exclude: list = []
32
+ correct_filepath: str = "", path_to_exclude: list = [],
33
+ excluded_files: list = []
33
34
  ) -> Tuple[bool, str, list, list]:
34
35
  if not called_by_cli:
35
36
  global logger
@@ -74,7 +75,6 @@ def run_scan(
74
75
  if os.path.isdir(path_to_scan):
75
76
  try:
76
77
  time_out = float(time_out)
77
- logger.debug(f"Skipped by Scancode: {path_to_exclude}")
78
78
  pretty_params = {}
79
79
  pretty_params["path_to_scan"] = path_to_scan
80
80
  pretty_params["path_to_exclude"] = path_to_exclude
@@ -91,39 +91,40 @@ def run_scan(
91
91
  exclude_path_normalized = os.path.normpath(exclude_path).replace("\\", "/")
92
92
 
93
93
  if exclude_path_normalized.endswith("/**"):
94
- exclude_path_normalized = exclude_path_normalized[:-3]
95
- elif exclude_path_normalized.endswith("**"):
96
- exclude_path_normalized = exclude_path_normalized.rstrip("*")
97
-
98
- if exclude_path_normalized.startswith("**/"):
99
- exclude_path_normalized = exclude_path_normalized[3:]
100
-
101
- full_exclude_path = os.path.join(abs_path_to_scan, exclude_path)
102
- is_dir = os.path.isdir(full_exclude_path)
103
- is_file = os.path.isfile(full_exclude_path)
104
- if is_dir:
105
- dir_name = os.path.basename(exclude_path_normalized.rstrip("/"))
106
- base_path = exclude_path_normalized.rstrip("/")
107
-
108
- if dir_name:
109
- total_files_to_excluded.append(dir_name)
110
- max_depth = 0
111
- for root, dirs, files in os.walk(full_exclude_path):
112
- depth = root[len(full_exclude_path):].count(os.sep)
113
- max_depth = max(max_depth, depth)
114
- for depth in range(1, max_depth + 2):
115
- pattern = base_path + "/*" * depth
116
- total_files_to_excluded.append(pattern)
94
+ base_dir = exclude_path_normalized[:-3].rstrip("/")
95
+ if base_dir:
96
+ full_exclude_path = os.path.join(abs_path_to_scan, base_dir)
97
+ if os.path.isdir(full_exclude_path):
98
+ total_files_to_excluded.append(base_dir)
99
+ total_files_to_excluded.append(exclude_path_normalized)
100
+ else:
101
+ total_files_to_excluded.append(exclude_path_normalized)
117
102
  else:
118
103
  total_files_to_excluded.append(exclude_path_normalized)
119
- elif is_file:
120
- total_files_to_excluded.append(exclude_path_normalized)
121
104
  else:
122
- if "/" in exclude_path_normalized:
123
- dir_name = os.path.basename(exclude_path_normalized.rstrip("/"))
124
- if dir_name:
125
- total_files_to_excluded.append(dir_name)
126
- total_files_to_excluded.append(exclude_path_normalized)
105
+ has_glob_chars = any(char in exclude_path_normalized for char in ['*', '?', '['])
106
+ if not has_glob_chars:
107
+ full_exclude_path = os.path.join(abs_path_to_scan, exclude_path_normalized)
108
+ is_dir = os.path.isdir(full_exclude_path)
109
+ is_file = os.path.isfile(full_exclude_path)
110
+ else:
111
+ is_dir = False
112
+ is_file = False
113
+
114
+ if is_dir:
115
+ base_path = exclude_path_normalized.rstrip("/")
116
+ if base_path:
117
+ total_files_to_excluded.append(base_path)
118
+ total_files_to_excluded.append(f"{base_path}/**")
119
+ else:
120
+ total_files_to_excluded.append(exclude_path_normalized)
121
+ elif is_file:
122
+ total_files_to_excluded.append(f"**/{exclude_path_normalized}")
123
+ else:
124
+ total_files_to_excluded.append(exclude_path_normalized)
125
+
126
+ if excluded_files:
127
+ total_files_to_excluded.extend(f"**/{file_path}" for file_path in excluded_files)
127
128
 
128
129
  total_files_to_excluded = sorted(list(set(total_files_to_excluded)))
129
130
  ignore_tuple = tuple(total_files_to_excluded)
@@ -12,8 +12,8 @@ from datetime import datetime
12
12
  import fosslight_util.constant as constant
13
13
  from fosslight_util.set_log import init_log
14
14
  from fosslight_util.output_format import check_output_formats_v2 # , write_output_file
15
- from ._parsing_scanoss_file import parsing_scanResult # scanoss
16
- from ._parsing_scanoss_file import parsing_extraInfo # scanoss
15
+ from ._parsing_scanoss_file import parsing_scan_result # scanoss
16
+ from ._parsing_scanoss_file import parsing_extra_info # scanoss
17
17
  import shutil
18
18
  from pathlib import Path
19
19
  from scanoss.scanner import Scanner, ScanType
@@ -28,11 +28,12 @@ SCANOSS_OUTPUT_FILE = "scanoss_raw_result.json"
28
28
 
29
29
 
30
30
  def get_scanoss_extra_info(scanned_result: dict) -> list:
31
- return parsing_extraInfo(scanned_result)
31
+ return parsing_extra_info(scanned_result)
32
32
 
33
33
 
34
- def run_scanoss_py(path_to_scan: str, output_file_name: str = "", format: list = [], called_by_cli: bool = False,
35
- write_json_file: bool = False, num_threads: int = -1, path_to_exclude: list = []) -> list:
34
+ def run_scanoss_py(path_to_scan: str, output_file_name: str = "", format: list = [],
35
+ called_by_cli: bool = False, write_json_file: bool = False, num_threads: int = -1,
36
+ path_to_exclude: list = [], excluded_files: set = None) -> list:
36
37
  """
37
38
  Run scanoss.py for the given path.
38
39
 
@@ -72,7 +73,7 @@ def run_scanoss_py(path_to_scan: str, output_file_name: str = "", format: list =
72
73
  try:
73
74
  scanner = Scanner(
74
75
  ignore_cert_errors=True,
75
- skip_folders=path_to_exclude,
76
+ skip_folders=list(path_to_exclude) if path_to_exclude else [],
76
77
  scan_output=output_json_file,
77
78
  scan_options=ScanType.SCAN_SNIPPETS.value,
78
79
  nb_threads=num_threads if num_threads > 0 else 10
@@ -86,30 +87,16 @@ def run_scanoss_py(path_to_scan: str, output_file_name: str = "", format: list =
86
87
  logger.debug(f"{captured_output}")
87
88
 
88
89
  if os.path.isfile(output_json_file):
89
- total_files_to_excluded = []
90
- if path_to_exclude:
91
- for path in path_to_exclude:
92
- path = os.path.join(path_to_scan, os.path.relpath(path, os.path.abspath(path_to_scan))) \
93
- if not os.path.isabs(path_to_scan) and os.path.isabs(path) else os.path.join(path_to_scan, path)
94
- if os.path.isdir(path):
95
- for root, _, files in os.walk(path):
96
- root = root[len(path_to_scan) + 1:]
97
- total_files_to_excluded.extend([os.path.normpath(os.path.join(root, file)).replace('\\', '/')
98
- for file in files])
99
- elif os.path.isfile(path):
100
- path = path[len(path_to_scan) + 1:]
101
- total_files_to_excluded.append(os.path.normpath(path).replace('\\', '/'))
102
-
103
90
  with open(output_json_file, "r") as st_json:
104
91
  st_python = json.load(st_json)
105
- for key_to_exclude in total_files_to_excluded:
92
+ for key_to_exclude in excluded_files:
106
93
  if key_to_exclude in st_python:
107
94
  del st_python[key_to_exclude]
108
95
  with open(output_json_file, 'w') as st_json:
109
96
  json.dump(st_python, st_json, indent=4)
110
97
  with open(output_json_file, "r") as st_json:
111
98
  st_python = json.load(st_json)
112
- scanoss_file_list = parsing_scanResult(st_python, path_to_scan, path_to_exclude)
99
+ scanoss_file_list = parsing_scan_result(st_python, excluded_files)
113
100
 
114
101
  except Exception as error:
115
102
  logger.debug(f"SCANOSS Parsing {path_to_scan}: {error}")
@@ -12,39 +12,26 @@ import mmap
12
12
  logger = logging.getLogger(constant.LOGGER_NAME)
13
13
 
14
14
 
15
- def get_file_list(path_to_scan: str, path_to_exclude: list = []) -> list:
16
- file_list = []
17
- abs_path_to_exclude = [os.path.abspath(os.path.join(path_to_scan, path)) for path in path_to_exclude]
15
+ def get_spdx_downloads(path_to_scan: str, path_to_exclude: set = None) -> dict:
16
+ download_dict = {}
17
+ find_word = re.compile(rb"SPDX-PackageDownloadLocation\s*:\s*(\S+)", re.IGNORECASE)
18
+ abs_path_to_scan = os.path.abspath(path_to_scan)
19
+
18
20
  for root, dirs, files in os.walk(path_to_scan):
19
21
  for file in files:
20
22
  file_path = os.path.join(root, file)
21
- abs_file_path = os.path.abspath(file_path)
22
- if any(os.path.commonpath([abs_file_path, exclude_path]) == exclude_path
23
- for exclude_path in abs_path_to_exclude):
23
+ rel_path_file = os.path.relpath(file_path, abs_path_to_scan).replace('\\', '/')
24
+ if rel_path_file in path_to_exclude:
24
25
  continue
25
- file_list.append(file_path)
26
- return file_list
27
-
28
-
29
- def get_spdx_downloads(path_to_scan: str, path_to_exclude: list = []) -> dict:
30
- download_dict = {}
31
- find_word = re.compile(rb"SPDX-PackageDownloadLocation\s*:\s*(\S+)", re.IGNORECASE)
32
-
33
- file_list = get_file_list(path_to_scan, path_to_exclude)
34
-
35
- for file in file_list:
36
- try:
37
- rel_path_file = os.path.relpath(file, path_to_scan)
38
- # remove the path_to_scan from the file paths
39
- if os.path.getsize(file) > 0:
40
- with open(file, "r") as f:
41
- with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mmap_obj:
42
- for word in find_word.findall(mmap_obj):
43
- if rel_path_file in download_dict:
44
- download_dict[rel_path_file].append(word.decode('utf-8'))
45
- else:
46
- download_dict[rel_path_file] = [word.decode('utf-8')]
47
- except Exception as ex:
48
- msg = str(ex)
49
- logger.warning(f"Failed to extract SPDX download location. {rel_path_file}, {msg}")
26
+ try:
27
+ if os.path.getsize(file_path) > 0:
28
+ with open(file_path, "r") as f:
29
+ with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mmap_obj:
30
+ for word in find_word.findall(mmap_obj):
31
+ if rel_path_file in download_dict:
32
+ download_dict[rel_path_file].append(word.decode('utf-8'))
33
+ else:
34
+ download_dict[rel_path_file] = [word.decode('utf-8')]
35
+ except Exception as ex:
36
+ logger.warning(f"Failed to extract SPDX download location. {rel_path_file}, {ex}")
50
37
  return download_dict
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fosslight_source
3
- Version: 2.1.19
3
+ Version: 2.2.1
4
4
  Summary: FOSSLight Source Scanner
5
5
  Home-page: https://github.com/fosslight/fosslight_source_scanner
6
6
  Download-URL: https://github.com/fosslight/fosslight_source_scanner
@@ -17,7 +17,7 @@ License-File: LICENSE
17
17
  Requires-Dist: pyparsing
18
18
  Requires-Dist: scanoss>=1.18.0
19
19
  Requires-Dist: XlsxWriter
20
- Requires-Dist: fosslight_util>=2.1.31
20
+ Requires-Dist: fosslight_util>=2.1.34
21
21
  Requires-Dist: PyYAML
22
22
  Requires-Dist: wheel>=0.38.1
23
23
  Requires-Dist: intbitset
@@ -0,0 +1,16 @@
1
+ fosslight_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ fosslight_source/_help.py,sha256=Ge6g9GKWGza11E74PFnBMqsj40UCUz-a_xArDZ1FClU,2316
3
+ fosslight_source/_license_matched.py,sha256=-3H881XQjFDafRttBsuboS3VbCPYEvPH1pwWXptknE4,2164
4
+ fosslight_source/_parsing_scancode_file_item.py,sha256=-shPakF0oQWDzxWFylE2dQ93O4tgCudYM2zvX4K5glQ,19386
5
+ fosslight_source/_parsing_scanoss_file.py,sha256=L3iHqmQF2jeSpHYuYSre44doXKy-BoX0u1Lm2IfJSU8,3866
6
+ fosslight_source/_scan_item.py,sha256=NMmYaqdpNM-yeJxXPVPmoPo_thOnaAGRXYDEcpD6s2Y,9415
7
+ fosslight_source/cli.py,sha256=qbp87Rhe5c2hIcF1-5TR6btPeOCe32Ffq1pxJM9ADcY,17303
8
+ fosslight_source/run_scancode.py,sha256=TFyNLV6P9rSBo9royDoG6az4l7Tkpl8Gr66IFK1DBU8,9021
9
+ fosslight_source/run_scanoss.py,sha256=_gdA4kOByI4saT4bDvMwIabpxtpH4f_yruHdBtb_g-o,4852
10
+ fosslight_source/run_spdx_extractor.py,sha256=LLyYKpkpD5Qxkv_qTNBrFtKvrmnXw9SFYZCkpishb_c,1537
11
+ fosslight_source-2.2.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
12
+ fosslight_source-2.2.1.dist-info/METADATA,sha256=dXxCjJqwvuDy1Yz4KTltcIeO6etLzTx0AbwF_x0aGkU,3557
13
+ fosslight_source-2.2.1.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
14
+ fosslight_source-2.2.1.dist-info/entry_points.txt,sha256=G4bBRWqSrJ68g-2M-JtNDrSZsdym_M7_KohQ2qR1vG8,113
15
+ fosslight_source-2.2.1.dist-info/top_level.txt,sha256=C2vw-0OIent84Vq-UEk1gt_kK1EL8dIItzBzp3WNyA4,17
16
+ fosslight_source-2.2.1.dist-info/RECORD,,
@@ -1,16 +0,0 @@
1
- fosslight_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- fosslight_source/_help.py,sha256=QuoQvxBPV00IfuD2ft88uRQXMSdrL2rJB7CQr05w3Ng,2312
3
- fosslight_source/_license_matched.py,sha256=-3H881XQjFDafRttBsuboS3VbCPYEvPH1pwWXptknE4,2164
4
- fosslight_source/_parsing_scancode_file_item.py,sha256=VZf_-5f7DZi8Zkj9Bx9LETTxcW-9f0KyNQD_DVOUNes,20024
5
- fosslight_source/_parsing_scanoss_file.py,sha256=0f5JzjnFU-kcPZRX7OKnextyvANjKwwNZeyCJVC7eME,4624
6
- fosslight_source/_scan_item.py,sha256=5HWJ8j58snEjTqzYtKRB8RVfywVrzivkJQ6WMh7nBwA,7299
7
- fosslight_source/cli.py,sha256=ApdTDaLEuH1LskLtcMrLyeRDIgIJUZlOp3RrWaju2Pc,16854
8
- fosslight_source/run_scancode.py,sha256=h8HWoZr5R17kXCYjiR56ZTdpDwpFKPAurpUpjTvT424,9006
9
- fosslight_source/run_scanoss.py,sha256=8wu3sa-YBqjfb5x2dbDJuAdw3rrExueOW23WdzqDCaU,5721
10
- fosslight_source/run_spdx_extractor.py,sha256=Hr9sTv06cJaVITy8amwexIW2FV8_rUcFw6hKmR9ZYws,1990
11
- fosslight_source-2.1.19.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
12
- fosslight_source-2.1.19.dist-info/METADATA,sha256=E_y220fkjF5KUrZUglR9MscDfD9AnBX5h63kO1FkAOg,3558
13
- fosslight_source-2.1.19.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
14
- fosslight_source-2.1.19.dist-info/entry_points.txt,sha256=G4bBRWqSrJ68g-2M-JtNDrSZsdym_M7_KohQ2qR1vG8,113
15
- fosslight_source-2.1.19.dist-info/top_level.txt,sha256=C2vw-0OIent84Vq-UEk1gt_kK1EL8dIItzBzp3WNyA4,17
16
- fosslight_source-2.1.19.dist-info/RECORD,,