fosslight-source 2.2.17__tar.gz → 2.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {fosslight_source-2.2.17/src/fosslight_source.egg-info → fosslight_source-2.3.1}/PKG-INFO +2 -1
  2. {fosslight_source-2.2.17 → fosslight_source-2.3.1}/pyproject.toml +3 -1
  3. fosslight_source-2.3.1/src/fosslight_source/_kb_client.py +239 -0
  4. {fosslight_source-2.2.17 → fosslight_source-2.3.1}/src/fosslight_source/_parsing_scancode_file_item.py +1 -1
  5. {fosslight_source-2.2.17 → fosslight_source-2.3.1}/src/fosslight_source/_scan_item.py +24 -50
  6. {fosslight_source-2.2.17 → fosslight_source-2.3.1}/src/fosslight_source/cli.py +179 -86
  7. {fosslight_source-2.2.17 → fosslight_source-2.3.1}/src/fosslight_source/run_scancode.py +11 -2
  8. {fosslight_source-2.2.17 → fosslight_source-2.3.1/src/fosslight_source.egg-info}/PKG-INFO +2 -1
  9. {fosslight_source-2.2.17 → fosslight_source-2.3.1}/src/fosslight_source.egg-info/SOURCES.txt +1 -0
  10. {fosslight_source-2.2.17 → fosslight_source-2.3.1}/src/fosslight_source.egg-info/requires.txt +3 -0
  11. {fosslight_source-2.2.17 → fosslight_source-2.3.1}/tests/test_tox.py +65 -2
  12. {fosslight_source-2.2.17 → fosslight_source-2.3.1}/LICENSE +0 -0
  13. {fosslight_source-2.2.17 → fosslight_source-2.3.1}/MANIFEST.in +0 -0
  14. {fosslight_source-2.2.17 → fosslight_source-2.3.1}/README.md +0 -0
  15. {fosslight_source-2.2.17 → fosslight_source-2.3.1}/setup.cfg +0 -0
  16. {fosslight_source-2.2.17 → fosslight_source-2.3.1}/src/fosslight_source/__init__.py +0 -0
  17. {fosslight_source-2.2.17 → fosslight_source-2.3.1}/src/fosslight_source/_help.py +0 -0
  18. {fosslight_source-2.2.17 → fosslight_source-2.3.1}/src/fosslight_source/_license_matched.py +0 -0
  19. {fosslight_source-2.2.17 → fosslight_source-2.3.1}/src/fosslight_source/_parsing_scanoss_file.py +0 -0
  20. {fosslight_source-2.2.17 → fosslight_source-2.3.1}/src/fosslight_source/run_manifest_extractor.py +0 -0
  21. {fosslight_source-2.2.17 → fosslight_source-2.3.1}/src/fosslight_source/run_scanoss.py +0 -0
  22. {fosslight_source-2.2.17 → fosslight_source-2.3.1}/src/fosslight_source/run_spdx_extractor.py +0 -0
  23. {fosslight_source-2.2.17 → fosslight_source-2.3.1}/src/fosslight_source.egg-info/dependency_links.txt +0 -0
  24. {fosslight_source-2.2.17 → fosslight_source-2.3.1}/src/fosslight_source.egg-info/entry_points.txt +0 -0
  25. {fosslight_source-2.2.17 → fosslight_source-2.3.1}/src/fosslight_source.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fosslight_source
3
- Version: 2.2.17
3
+ Version: 2.3.1
4
4
  Summary: FOSSLight Source Scanner
5
5
  Author: LG Electronics
6
6
  License-Expression: Apache-2.0
@@ -26,6 +26,7 @@ Requires-Dist: wheel>=0.38.1
26
26
  Requires-Dist: intbitset
27
27
  Requires-Dist: fosslight_binary>=5.1.22
28
28
  Requires-Dist: scancode-toolkit>=32.0.2
29
+ Requires-Dist: cryptography<49; platform_system == "Darwin" and platform_machine == "x86_64"
29
30
  Requires-Dist: fingerprints==1.2.3
30
31
  Requires-Dist: normality==2.6.1
31
32
  Requires-Dist: psycopg2-binary>=2.9.10; python_version >= "3.13"
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
7
7
 
8
8
  [project]
9
9
  name = "fosslight_source"
10
- version = "2.2.17"
10
+ version = "2.3.1"
11
11
  description = "FOSSLight Source Scanner"
12
12
  readme = "README.md"
13
13
  license = "Apache-2.0"
@@ -35,6 +35,8 @@ dependencies = [
35
35
  "intbitset",
36
36
  "fosslight_binary>=5.1.22",
37
37
  "scancode-toolkit>=32.0.2",
38
+ # cryptography 49.x does not provide macOS x86_64 wheels, causing source builds to require OpenSSL/pkg-config.
39
+ "cryptography<49; platform_system == 'Darwin' and platform_machine == 'x86_64'",
38
40
  "fingerprints==1.2.3",
39
41
  "normality==2.6.1",
40
42
  # Python 3.13+ needs psycopg2-binary 2.9.10+ (has wheels; 2.9.9 builds fail with _PyInterpreterState_Get)
@@ -0,0 +1,239 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # Copyright (c) 2020 LG Electronics Inc.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ import json
7
+ import logging
8
+ import time
9
+ import urllib.error
10
+ import urllib.request
11
+ from typing import Dict, List, NamedTuple, Optional
12
+
13
+ import fosslight_util.constant as constant
14
+
15
+ logger = logging.getLogger(constant.LOGGER_NAME)
16
+
17
+ _SCAN_JOB_POLL_INTERVAL_SEC = 1.0
18
+ _SCAN_JOB_POLL_MAX_INTERVAL_SEC = 10.0
19
+ _SCAN_JOB_REQUEST_TIMEOUT_SEC = 30
20
+ _SCAN_JOB_MIN_WAIT_SEC = 300
21
+ _SCAN_JOB_PER_HASH_SEC = 35
22
+
23
+
24
def _kb_request(
    kb_url: str,
    path: str,
    *,
    method: str = "GET",
    payload: Optional[dict] = None,
    kb_token: str = "",
    timeout: int = _SCAN_JOB_REQUEST_TIMEOUT_SEC,
) -> dict:
    """
    Send one JSON request to the KB API and return the decoded JSON object.

    :param kb_url: KB API base URL (a trailing slash is tolerated).
    :param path: endpoint path relative to kb_url (a leading slash is tolerated).
    :param method: HTTP method.
    :param payload: optional body, serialized as JSON when given.
    :param kb_token: optional bearer token; no Authorization header is sent when empty.
    :param timeout: per-request timeout in seconds.
    :return: decoded JSON object, or {} when the response body is empty.
    :raises urllib.error.HTTPError: propagated from urlopen on HTTP error statuses.
    :raises urllib.error.URLError: propagated from urlopen on connection failures.
    :raises ValueError: when the body is not valid JSON or is not a JSON object.
    """
    # Optional[...] rather than "dict | None": the annotation is evaluated at
    # definition time and the union syntax fails on interpreters before 3.10.
    data = None if payload is None else json.dumps(payload).encode("utf-8")
    url = f"{kb_url.rstrip('/')}/{path.lstrip('/')}"

    request = urllib.request.Request(url, data=data, method=method)
    request.add_header("Accept", "application/json")
    if data is not None:
        request.add_header("Content-Type", "application/json")
    if kb_token:
        request.add_header("Authorization", f"Bearer {kb_token}")

    with urllib.request.urlopen(request, timeout=timeout) as response:
        body = response.read().decode("utf-8")

    if not body:
        return {}
    parsed = json.loads(body)
    # Callers use .get() on the result; reject arrays/scalars here so they hit
    # the callers' generic exception handling instead of an AttributeError.
    if not isinstance(parsed, dict):
        raise ValueError(f"unexpected non-object JSON response: {type(parsed).__name__}")
    return parsed
46
+
47
+
48
def _estimate_job_wait_timeout(file_hash_count: int) -> float:
    """
    Return how long (seconds) to wait for a scan job covering file_hash_count hashes.

    The wait scales with the hash count but never drops below the minimum wait.
    """
    scaled_wait = file_hash_count * _SCAN_JOB_PER_HASH_SEC
    return float(max(_SCAN_JOB_MIN_WAIT_SEC, scaled_wait))
50
+
51
+
52
+ def _coerce_count(value, default: int) -> int:
53
+ if value is None:
54
+ return default
55
+ try:
56
+ count = int(value)
57
+ except (TypeError, ValueError):
58
+ return default
59
+ return count if count >= 0 else default
60
+
61
+
62
+ def _extract_response_message(response_body: dict) -> Optional[str]:
63
+ message = response_body.get("message")
64
+ if isinstance(message, str):
65
+ message = message.strip()
66
+ if message:
67
+ return message
68
+ return None
69
+
70
+
71
+ def _scan_job_failure_message(response_body: dict) -> Optional[str]:
72
+ """Return server message when a scan/jobs response indicates failure."""
73
+ message = _extract_response_message(response_body)
74
+ if not message:
75
+ return None
76
+
77
+ status = response_body.get("status")
78
+ if status is None or str(status).lower() == "failed":
79
+ return message
80
+
81
+ if not response_body.get("job_id"):
82
+ return message
83
+
84
+ return None
85
+
86
+
87
+ def _parse_http_error_body(error: urllib.error.HTTPError) -> dict:
88
+ try:
89
+ raw = error.read().decode()
90
+ return json.loads(raw) if raw else {}
91
+ except (json.JSONDecodeError, UnicodeDecodeError, OSError):
92
+ return {}
93
+
94
+
95
class KbScanJobResult(NamedTuple):
    """Outcome of one KB scan-job lookup."""

    # file_hash -> origin URL for the hashes the KB matched
    origin_urls: Dict[str, str]
    # failure description, or None when no failure message applies
    failure_message: Optional[str]
    # number of unique file hashes submitted
    requested_count: int
    # number of entries in origin_urls
    returned_count: int
100
+
101
+
102
def _kb_scan_job_result(
    origin_urls: Dict[str, str],
    failure_message: Optional[str],
    requested_count: int,
) -> KbScanJobResult:
    """
    Build a KbScanJobResult, deriving returned_count from the size of origin_urls.

    :param origin_urls: file_hash -> origin URL map.
    :param failure_message: failure description, or None.
    :param requested_count: number of unique file hashes that were requested.
    :return: the assembled result tuple.
    """
    return KbScanJobResult(
        origin_urls=origin_urls,
        failure_message=failure_message,
        requested_count=requested_count,
        returned_count=len(origin_urls),
    )
113
+
114
+
115
def _failure_message_from_http_error(error: urllib.error.HTTPError) -> Optional[str]:
    """Return the server-provided failure message carried by an HTTP error body, if any."""
    body = _parse_http_error_body(error)
    # Error bodies are not guaranteed to be JSON objects; only objects carry a message.
    if not isinstance(body, dict):
        return None
    return _scan_job_failure_message(body)


def _collect_origin_urls(results) -> Dict[str, str]:
    """Build a file_hash -> origin URL map from the "results" rows of a completed job."""
    origin_urls: Dict[str, str] = {}
    # "results" may be missing or null in the response.
    if not isinstance(results, list):
        return origin_urls
    for row in results:
        if not isinstance(row, dict):
            continue
        file_hash = row.get("file_hash", "")
        output = row.get("output")
        # Keep only successful rows whose hash and URL are non-empty strings, so
        # a malformed row cannot raise (unhashable key) or leak a non-str URL.
        if (
            row.get("success")
            and isinstance(file_hash, str) and file_hash
            and isinstance(output, str) and output
        ):
            origin_urls[file_hash] = output
    return origin_urls


def _wait_before_next_poll(interval: float, deadline: float) -> float:
    """Sleep for interval (capped at the time left before deadline) and return the next interval."""
    remaining = deadline - time.monotonic()
    if remaining > 0:
        # Never sleep past the overall deadline.
        time.sleep(min(interval, remaining))
    return min(interval * 1.5, _SCAN_JOB_POLL_MAX_INTERVAL_SEC)


def fetch_origin_urls_via_scan_job(
    file_hashes: List[str],
    kb_url: str,
    kb_token: str,
) -> KbScanJobResult:
    """
    Create a POST /scan/jobs request, poll until completion, and return a file_hash -> origin_url map.

    Failures are reported through the result (and logged as warnings) rather than raised.

    :param file_hashes: list of MD5 file hashes to look up.
    :param kb_url: KB API base URL.
    :param kb_token: KB API bearer token.
    :return: origin URLs, optional failure message, and requested/returned file_hash counts.
    """
    # Drop empty entries and duplicates while keeping first-seen order.
    unique_hashes = list(dict.fromkeys(h for h in file_hashes if h))
    requested_count = len(unique_hashes)
    if not unique_hashes:
        return _kb_scan_job_result({}, None, 0)

    # --- Create the job -----------------------------------------------------
    try:
        created = _kb_request(
            kb_url,
            "scan/jobs",
            method="POST",
            payload={"file_hashes": unique_hashes},
            kb_token=kb_token,
        )
    except urllib.error.HTTPError as e:
        failure_message = _failure_message_from_http_error(e)
        if failure_message:
            logger.warning(f"KB scan job create failed: {failure_message}")
            return _kb_scan_job_result({}, failure_message, requested_count)
        logger.warning(f"KB scan job create failed: HTTP {e.code} {e.reason}")
        return _kb_scan_job_result({}, None, requested_count)
    except Exception as e:
        # Connection errors (URLError), timeouts and malformed responses.
        logger.warning(f"KB scan job create failed: {e}")
        return _kb_scan_job_result({}, None, requested_count)

    # A non-object response carries neither a job_id nor a message; treat it as
    # empty so it is reported as "missing job_id" below instead of crashing.
    if not isinstance(created, dict):
        created = {}

    failure_message = _scan_job_failure_message(created)
    if failure_message:
        logger.warning(f"KB scan job create failed: {failure_message}")
        return _kb_scan_job_result({}, failure_message, requested_count)

    if str(created.get("status", "")).lower() == "failed":
        logger.warning("KB scan job create failed")
        return _kb_scan_job_result({}, None, requested_count)

    job_id = created.get("job_id", "")
    if not job_id:
        logger.warning("KB scan job create response missing job_id")
        return _kb_scan_job_result({}, None, requested_count)

    fallback_count = len(unique_hashes)
    # Prefer "accepted", then "total", then the number of hashes we sent.
    accepted = _coerce_count(
        created.get("accepted"),
        _coerce_count(created.get("total"), fallback_count),
    )
    skipped = _coerce_count(created.get("skipped"), 0)
    logger.info(
        f"KB scan job created: job_id={job_id}, total={created.get('total', fallback_count)}, "
        f"accepted={accepted}, skipped={skipped}"
    )
    if skipped:
        logger.warning(f"KB scan job rate-limited: {skipped} file_hash(es) skipped by server")
    if accepted == 0:
        failure_message = (
            f"rate-limited: {skipped} file_hash(es) skipped by server"
            if skipped
            else "scan job accepted no file_hashes"
        )
        return _kb_scan_job_result({}, failure_message, requested_count)

    # --- Poll until the job finishes or the deadline passes -----------------
    deadline = time.monotonic() + _estimate_job_wait_timeout(accepted)
    interval = _SCAN_JOB_POLL_INTERVAL_SEC

    while time.monotonic() < deadline:
        status = None
        try:
            status = _kb_request(kb_url, f"scan/jobs/{job_id}", kb_token=kb_token)
        except urllib.error.HTTPError as e:
            if e.code == 404:
                logger.warning(f"KB scan job not found: {job_id}")
                return _kb_scan_job_result({}, "scan job not found", requested_count)
            failure_message = _failure_message_from_http_error(e)
            if failure_message:
                logger.warning(f"KB scan job status failed: {failure_message}")
                return _kb_scan_job_result({}, failure_message, requested_count)
            # No server message: treat as transient and retry after backoff.
            logger.warning(f"KB scan job status failed: HTTP {e.code}")
        except urllib.error.URLError as e:
            logger.warning(f"KB scan job status failed: {e}")
        except Exception as e:
            logger.warning(f"KB scan job status parse failed: {e}")

        if isinstance(status, dict):
            # Compare case-insensitively, as is done for the create response.
            job_status = str(status.get("status", "")).lower()

            if job_status == "completed":
                origin_urls = _collect_origin_urls(status.get("results"))
                logger.info(
                    f"KB scan job completed: job_id={job_id}, "
                    f"matched={len(origin_urls)}, failed={status.get('failed', 0)}"
                )
                return _kb_scan_job_result(origin_urls, None, requested_count)

            if job_status == "failed":
                failure_message = _scan_job_failure_message(status)
                if failure_message:
                    logger.warning(f"KB scan job failed: job_id={job_id}, message={failure_message}")
                else:
                    logger.warning(f"KB scan job failed: job_id={job_id}")
                return _kb_scan_job_result({}, failure_message or "scan job failed", requested_count)
        elif status is not None:
            logger.warning(
                f"KB scan job status parse failed: unexpected response type {type(status).__name__}"
            )

        # Job still pending, or a retryable error occurred: back off, then poll again.
        interval = _wait_before_next_poll(interval, deadline)

    logger.warning(f"KB scan job timed out: job_id={job_id}")
    return _kb_scan_job_result({}, "scan job timed out", requested_count)
@@ -15,7 +15,7 @@ from typing import Tuple
15
15
 
16
16
  logger = logging.getLogger(constant.LOGGER_NAME)
17
17
  REMOVE_LICENSE = ["warranty-disclaimer"]
18
- regex = re.compile(r'licenseref-(\S+)', re.IGNORECASE)
18
+ regex = re.compile(r'licenseref-([a-z0-9\.\-]+)', re.IGNORECASE)
19
19
  find_word = re.compile(rb"SPDX-PackageDownloadLocation\s*:\s*(\S+)", re.IGNORECASE)
20
20
  KEYWORD_SPDX_ID = r'SPDX-License-Identifier\s*[\S]+'
21
21
  KEYWORD_DOWNLOAD_LOC = r'DownloadLocation\s*[\S]+'
@@ -6,11 +6,7 @@
6
6
  import os
7
7
  import logging
8
8
  import re
9
- import json
10
- import base64
11
9
  import hashlib
12
- import urllib.request
13
- import urllib.error
14
10
  import fosslight_util.constant as constant
15
11
  from fosslight_util.oss_item import FileItem, OssItem, get_checksum_sha1
16
12
 
@@ -63,8 +59,9 @@ class SourceItem(FileItem):
63
59
  self.oss_version = ""
64
60
 
65
61
  self.checksum = get_checksum_sha1(value)
66
- self.kb_origin_url = "" # URL from OSS KB (_get_origin_url_from_md5_hash)
62
+ self.kb_origin_url = "" # URL from OSS KB
67
63
  self.kb_evidence = "" # Evidence from KB API (exact_match or code snippet)
64
+ self._cached_kb_md5 = "" # MD5 precomputed for KB lookup (set by _collect_kb_file_hashes)
68
65
 
69
66
  def __del__(self) -> None:
70
67
  pass
@@ -124,37 +121,18 @@ class SourceItem(FileItem):
124
121
  logger.debug(f"Failed to compute MD5 for {self.source_name_or_path}: {e}")
125
122
  return md5_hex, wfp
126
123
 
127
- def _get_origin_url_from_md5_hash(
128
- self, md5_hash: str, wfp: str = "", kb_url: str = DEFAULT_KB_URL, kb_token: str = ""
129
- ) -> str:
130
- """Return origin_url from KB API."""
131
- try:
132
- payload = {"file_hash": md5_hash}
133
- if wfp and wfp.strip():
134
- payload["wfp_base64"] = base64.b64encode(wfp.strip().encode("utf-8")).decode("ascii")
135
- request = urllib.request.Request(
136
- f"{kb_url}query", data=json.dumps(payload).encode('utf-8'), method='POST'
137
- )
138
- request.add_header('Accept', 'application/json')
139
- request.add_header('Content-Type', 'application/json')
140
- if kb_token:
141
- request.add_header('Authorization', f'Bearer {kb_token}')
142
-
143
- with urllib.request.urlopen(request, timeout=10) as response:
144
- data = json.loads(response.read().decode())
145
- if isinstance(data, dict):
146
- return_code = data.get('return_code', -1)
147
- if return_code == 0:
148
- output = data.get('output', '')
149
- if output:
150
- return output
151
- except urllib.error.URLError as e:
152
- logger.debug(f"Failed to fetch origin_url from API for MD5 hash {md5_hash}: {e}")
153
- except json.JSONDecodeError as e:
154
- logger.debug(f"Failed to parse API response for MD5 hash {md5_hash}: {e}")
155
- except Exception as e:
156
- logger.debug(f"Error getting origin_url for MD5 hash {md5_hash}: {e}")
157
- return ""
124
+ def _apply_kb_origin_url(self, origin_url: str) -> tuple[str, str, str]:
125
+ """Apply KB origin URL and return (oss_name, oss_version, download_url)."""
126
+ self.kb_origin_url = origin_url
127
+ self.kb_evidence = "exact_match"
128
+ extracted_name, extracted_version, repo_url = self._extract_oss_info_from_url(origin_url)
129
+ if extracted_name:
130
+ self.oss_name = extracted_name
131
+ if extracted_version:
132
+ self.oss_version = extracted_version
133
+ download_url = repo_url if repo_url else origin_url
134
+ self.download_location = [download_url]
135
+ return self.oss_name, self.oss_version, download_url
158
136
 
159
137
  def _extract_oss_info_from_url(self, url: str) -> tuple:
160
138
  """
@@ -196,7 +174,9 @@ class SourceItem(FileItem):
196
174
  return "", "", ""
197
175
 
198
176
  def set_oss_item(
199
- self, path_to_scan: str = "", run_kb: bool = False, kb_url: str = DEFAULT_KB_URL, kb_token: str = ""
177
+ self,
178
+ path_to_scan: str = "",
179
+ kb_origin_urls: dict[str, str] | None = None,
200
180
  ) -> None:
201
181
  self.oss_items = []
202
182
  if self.download_location:
@@ -207,21 +187,15 @@ class SourceItem(FileItem):
207
187
  self.oss_items.append(item)
208
188
  else:
209
189
  item = OssItem(self.oss_name, self.oss_version, self.licenses)
210
- if run_kb and not self.is_license_text:
211
- md5_hash, wfp = self._get_hash(path_to_scan)
190
+ if kb_origin_urls and not self.is_license_text:
191
+ md5_hash = self._cached_kb_md5
192
+ if not md5_hash:
193
+ md5_hash, _wfp = self._get_hash(path_to_scan)
212
194
  if md5_hash:
213
- origin_url = self._get_origin_url_from_md5_hash(md5_hash, wfp, kb_url, kb_token)
195
+ origin_url = kb_origin_urls.get(md5_hash, "")
214
196
  if origin_url:
215
- self.kb_origin_url = origin_url
216
- self.kb_evidence = "exact_match"
217
- extracted_name, extracted_version, repo_url = self._extract_oss_info_from_url(origin_url)
218
- if extracted_name:
219
- self.oss_name = extracted_name
220
- if extracted_version:
221
- self.oss_version = extracted_version
222
- download_url = repo_url if repo_url else origin_url
223
- self.download_location = [download_url]
224
- item = OssItem(self.oss_name, self.oss_version, self.licenses, download_url)
197
+ oss_name, oss_version, download_url = self._apply_kb_origin_url(origin_url)
198
+ item = OssItem(oss_name, oss_version, self.licenses, download_url)
225
199
 
226
200
  item.copyright = "\n".join(self.copyright)
227
201
  item.comment = self.comment
@@ -25,12 +25,14 @@ from fosslight_util.exclude import get_excluded_paths
25
25
  from .run_scanoss import run_scanoss_py
26
26
  from .run_scanoss import get_scanoss_extra_info
27
27
  import yaml
28
+ import tqdm
28
29
  import argparse
29
30
  from .run_spdx_extractor import get_spdx_downloads
30
31
  from .run_manifest_extractor import get_manifest_licenses
31
- from ._scan_item import SourceItem, resolve_kb_config
32
+ from ._scan_item import SourceItem, resolve_kb_config, is_notice_file
33
+ from ._kb_client import fetch_origin_urls_via_scan_job
32
34
  from fosslight_util.oss_item import ScannerItem
33
- from typing import Tuple
35
+ from typing import Optional, Tuple
34
36
  from ._scan_item import is_manifest_file
35
37
  import shutil
36
38
 
@@ -330,11 +332,57 @@ def mark_oss_info_correction_files_as_excluded(scan_results: list) -> None:
330
332
  item.comment = OSS_INFO_CORRECTION_COMMENT
331
333
 
332
334
 
335
def _collect_kb_file_hashes(
    scancode_result: list,
    path_to_scan: str,
    excluded_files: set,
    hide_progress: bool,
) -> tuple[list[str], list[tuple[SourceItem, str]]]:
    """Collect MD5 hashes from scancode results and walk targets, plus (extra_item, md5) candidates.

    Skips license/notice files and scancode_result items that already have download_location.
    ScanOSS/SPDX results are merged into scancode_result before this runs.

    :param scancode_result: SourceItem list produced by the scanners.
    :param path_to_scan: root directory being scanned.
    :param excluded_files: relative paths to leave out of the directory walk.
    :param hide_progress: disable the tqdm progress bar when True.
    :return: (all collected MD5 hashes, [(SourceItem for a file not in scancode_result, its MD5)]).
    """
    file_hashes: list[str] = []
    extra_candidates: list[tuple[SourceItem, str]] = []

    # Pass 1: files the scanners already reported.
    for item in scancode_result:
        if item.is_license_text or is_notice_file(item.source_name_or_path):
            continue
        if item.download_location:
            continue
        md5_hash, _wfp = item._get_hash(path_to_scan)
        if md5_hash:
            # Cache the hash on the item so set_oss_item can reuse it.
            item._cached_kb_md5 = md5_hash
            file_hashes.append(md5_hash)

    # Pass 2: files on disk that no scanner reported.
    abs_path_to_scan = os.path.abspath(path_to_scan)
    scancode_paths = {item.source_name_or_path for item in scancode_result}

    # Materialize the walk first so tqdm knows the total.
    files_to_scan = [
        os.path.join(root, file_name)
        for root, _dirs, files in os.walk(path_to_scan)
        for file_name in files
    ]

    for file_path in tqdm.tqdm(files_to_scan, desc="KB Hashing", disable=hide_progress):
        rel_path = os.path.relpath(file_path, abs_path_to_scan).replace("\\", "/")
        if rel_path in scancode_paths or rel_path in excluded_files or is_notice_file(file_path):
            continue
        extra_item = SourceItem(rel_path)
        md5_hash, _wfp = extra_item._get_hash(path_to_scan)
        if md5_hash:
            extra_item._cached_kb_md5 = md5_hash
            file_hashes.append(md5_hash)
            extra_candidates.append((extra_item, md5_hash))

    return file_hashes, extra_candidates
379
+
380
+
333
381
  def merge_results(
334
382
  scancode_result: list = [], scanoss_result: list = [], spdx_downloads: dict = {},
335
383
  path_to_scan: str = "", run_kb: bool = False, manifest_licenses: dict = {},
336
384
  excluded_files: set = None, hide_progress: bool = False, kb_url: str = "", kb_token: str = ""
337
- ) -> list:
385
+ ) -> tuple[list, Optional[str], int, int]:
338
386
 
339
387
  """
340
388
  Merge scanner results and spdx parsing result.
@@ -346,7 +394,7 @@ def merge_results(
346
394
  :param excluded_files: set of relative paths to exclude from KB-only file discovery.
347
395
  :param kb_url: KB API base URL.
348
396
  :param kb_token: KB API bearer token.
349
- :return merged_result: list of merged result in SourceItem.
397
+ :return: (merged_result, kb failure message, requested file_hash count, returned match count).
350
398
  """
351
399
  if excluded_files is None:
352
400
  excluded_files = set()
@@ -381,32 +429,60 @@ def merge_results(
381
429
  new_result_item.is_manifest_file = True
382
430
  scancode_result.append(new_result_item)
383
431
 
432
+ kb_origin_urls: dict[str, str] = {}
433
+ kb_status_message: Optional[str] = None
434
+ kb_requested_count = 0
435
+ kb_returned_count = 0
436
+ extra_candidates: list[tuple[SourceItem, str]] = []
437
+ if run_kb:
438
+ file_hashes, extra_candidates = _collect_kb_file_hashes(
439
+ scancode_result, path_to_scan, excluded_files, hide_progress
440
+ )
441
+ if file_hashes:
442
+ kb_result = fetch_origin_urls_via_scan_job(file_hashes, kb_url, kb_token)
443
+ kb_origin_urls = kb_result.origin_urls
444
+ kb_status_message = kb_result.failure_message
445
+ kb_requested_count = kb_result.requested_count
446
+ kb_returned_count = kb_result.returned_count
447
+
384
448
  for item in scancode_result:
385
- item.set_oss_item(path_to_scan, run_kb, kb_url, kb_token)
449
+ item.set_oss_item(path_to_scan, kb_origin_urls=kb_origin_urls)
386
450
 
387
451
  # Add OSSItem for files in path_to_scan that are not in scancode_result
388
452
  # when KB returns an origin URL for their MD5 hash (skip excluded_files)
389
453
  if run_kb:
390
- import tqdm
391
- abs_path_to_scan = os.path.abspath(path_to_scan)
392
- scancode_paths = {item.source_name_or_path for item in scancode_result}
393
-
394
- files_to_scan = []
395
- for root, _dirs, files in os.walk(path_to_scan):
396
- for file in files:
397
- files_to_scan.append(os.path.join(root, file))
398
-
399
- for file_path in tqdm.tqdm(files_to_scan, desc="KB Scanning", disable=hide_progress):
400
- rel_path = os.path.relpath(file_path, abs_path_to_scan).replace("\\", "/")
401
- if rel_path in scancode_paths or rel_path in excluded_files:
402
- continue
403
- extra_item = SourceItem(rel_path)
404
- extra_item.set_oss_item(path_to_scan, run_kb, kb_url, kb_token)
454
+ for extra_item, _md5_hash in extra_candidates:
455
+ extra_item.set_oss_item(path_to_scan, kb_origin_urls=kb_origin_urls)
405
456
  if extra_item.download_location:
406
457
  scancode_result.append(extra_item)
407
- scancode_paths.add(rel_path)
408
458
 
409
- return scancode_result
459
+ return scancode_result, kb_status_message, kb_requested_count, kb_returned_count
460
+
461
+
462
+ def _finalize_temp_output(
463
+ temp_output_path: str,
464
+ final_output_path: str,
465
+ publish: bool,
466
+ log: Optional[logging.Logger] = None,
467
+ ) -> bool:
468
+ """Copy scan artifacts from temp dir, then always remove the temp directory."""
469
+ if not temp_output_path or not os.path.isdir(temp_output_path):
470
+ return True
471
+ publish_ok = True
472
+ try:
473
+ if publish:
474
+ shutil.copytree(temp_output_path, final_output_path, dirs_exist_ok=True)
475
+ except Exception as ex:
476
+ publish_ok = False
477
+ if log:
478
+ log.error(f"Failed to publish scan artifacts: {ex}")
479
+ finally:
480
+ try:
481
+ shutil.rmtree(temp_output_path)
482
+ except Exception as ex:
483
+ if log:
484
+ log.debug(f"Failed to cleanup temp output directory: {ex}")
485
+ return publish_ok
410
486
 
411
487
 
412
488
  def run_scanners(
@@ -454,77 +530,94 @@ def run_scanners(
454
530
  output_path = os.getcwd()
455
531
  final_output_path = output_path
456
532
  output_path = os.path.join(os.path.dirname(output_path), f'.fosslight_temp_{start_time}')
533
+ publish_temp_output = False
534
+ logger = None
535
+ publish_ok = True
457
536
 
458
- logger, result_log = init_log(os.path.join(output_path, f"fosslight_log_src_{start_time}.txt"),
459
- True, logging.INFO, logging.DEBUG, PKG_NAME, path_to_scan, path_to_exclude)
537
+ try:
538
+ logger, result_log = init_log(os.path.join(output_path, f"fosslight_log_src_{start_time}.txt"),
539
+ True, logging.INFO, logging.DEBUG, PKG_NAME, path_to_scan, path_to_exclude)
460
540
 
461
- logger.info(f"Tool Info : {result_log['Tool Info']}")
541
+ logger.info(f"Tool Info : {result_log['Tool Info']}")
462
542
 
463
- if '.xlsx' not in output_extensions and print_matched_text:
464
- logger.warning("-m option is only available for excel.")
465
- print_matched_text = False
543
+ if '.xlsx' not in output_extensions and print_matched_text:
544
+ logger.warning("-m option is only available for excel.")
545
+ print_matched_text = False
466
546
 
467
- if success:
468
- if all_exclude_mode and len(all_exclude_mode) == 4:
469
- (excluded_path_with_default_exclusion,
470
- excluded_path_without_dot,
471
- excluded_files,
472
- cnt_file_except_skipped) = all_exclude_mode
473
- else:
474
- path_to_exclude_with_filename = path_to_exclude
475
- (excluded_path_with_default_exclusion,
476
- excluded_path_without_dot,
477
- excluded_files,
478
- cnt_file_except_skipped) = get_excluded_paths(path_to_scan, path_to_exclude_with_filename)
479
- logger.debug(f"Skipped paths: {excluded_path_with_default_exclusion}")
480
-
481
- if not selected_scanner:
482
- selected_scanner = ALL_MODE
483
- if selected_scanner in ['scancode', ALL_MODE]:
484
- success, result_log[RESULT_KEY], scancode_result, license_list = run_scan(path_to_scan, output_file_name,
485
- write_json_file, num_cores, True,
486
- print_matched_text, formats, called_by_cli,
487
- time_out, correct_mode, correct_filepath,
488
- excluded_path_with_default_exclusion,
489
- excluded_files, hide_progress)
490
- excluded_files = set(excluded_files) if excluded_files else set()
491
- if selected_scanner in ['scanoss', ALL_MODE]:
492
- scanoss_result, api_limit_exceed = run_scanoss_py(path_to_scan, output_path, formats, True, num_cores,
493
- excluded_path_with_default_exclusion, excluded_files,
494
- write_json_file, hide_progress)
495
-
496
- run_kb_msg = ""
497
- if selected_scanner in SCANNER_TYPE:
498
- run_kb = True if selected_scanner in ['kb', ALL_MODE] else False
499
- if run_kb:
500
- if not check_kb_server_reachable(kb_url, kb_token):
501
- run_kb = False
502
- run_kb_msg = f"KB({kb_url}) Unreachable"
503
- else:
504
- run_kb_msg = f"KB({kb_url}) Enabled"
505
-
506
- spdx_downloads, manifest_licenses = metadata_collector(path_to_scan, excluded_files)
507
- merged_result = merge_results(scancode_result, scanoss_result, spdx_downloads,
508
- path_to_scan, run_kb, manifest_licenses, excluded_files,
509
- hide_progress, kb_url, kb_token)
510
- mark_oss_info_correction_files_as_excluded(merged_result)
511
- scan_item = create_report_file(start_time, merged_result, license_list, scanoss_result, selected_scanner,
512
- print_matched_text, output_path, output_files, output_extensions, correct_mode,
513
- correct_filepath, path_to_scan, excluded_path_without_dot, formats,
514
- api_limit_exceed, cnt_file_except_skipped, final_output_path, run_kb_msg)
547
+ if success:
548
+ if all_exclude_mode and len(all_exclude_mode) == 4:
549
+ (excluded_path_with_default_exclusion,
550
+ excluded_path_without_dot,
551
+ excluded_files,
552
+ cnt_file_except_skipped) = all_exclude_mode
553
+ else:
554
+ path_to_exclude_with_filename = path_to_exclude
555
+ (excluded_path_with_default_exclusion,
556
+ excluded_path_without_dot,
557
+ excluded_files,
558
+ cnt_file_except_skipped) = get_excluded_paths(path_to_scan, path_to_exclude_with_filename)
559
+ logger.debug(f"Skipped paths: {excluded_path_with_default_exclusion}")
560
+
561
+ if not selected_scanner:
562
+ selected_scanner = ALL_MODE
563
+ if selected_scanner in ['scancode', ALL_MODE]:
564
+ success, result_log[RESULT_KEY], scancode_result, license_list = run_scan(
565
+ path_to_scan, output_file_name, write_json_file, num_cores, True,
566
+ print_matched_text, formats, called_by_cli, time_out, correct_mode,
567
+ correct_filepath, excluded_path_with_default_exclusion,
568
+ excluded_files, hide_progress,
569
+ )
570
+ excluded_files = set(excluded_files) if excluded_files else set()
571
+ if selected_scanner in ['scanoss', ALL_MODE]:
572
+ scanoss_result, api_limit_exceed = run_scanoss_py(path_to_scan, output_path, formats, True, num_cores,
573
+ excluded_path_with_default_exclusion, excluded_files,
574
+ write_json_file, hide_progress)
575
+
576
+ run_kb_msg = ""
577
+ if selected_scanner in SCANNER_TYPE:
578
+ run_kb = True if selected_scanner in ['kb', ALL_MODE] else False
579
+ if run_kb:
580
+ if not check_kb_server_reachable(kb_url, kb_token):
581
+ run_kb = False
582
+ run_kb_msg = f"KB({kb_url}) Unreachable"
583
+
584
+ spdx_downloads, manifest_licenses = metadata_collector(path_to_scan, excluded_files)
585
+ merged_result, kb_status_message, kb_requested_count, kb_returned_count = merge_results(
586
+ scancode_result, scanoss_result, spdx_downloads,
587
+ path_to_scan, run_kb, manifest_licenses, excluded_files,
588
+ hide_progress, kb_url, kb_token,
589
+ )
590
+ if kb_status_message:
591
+ run_kb_msg = f"KB({kb_url}) {kb_status_message}"
592
+ elif run_kb and kb_requested_count > 0:
593
+ run_kb_msg = (
594
+ f"KB({kb_url}) response : {kb_returned_count}/"
595
+ f" requested: {kb_requested_count}"
596
+ )
597
+ mark_oss_info_correction_files_as_excluded(merged_result)
598
+ scan_item = create_report_file(start_time, merged_result, license_list, scanoss_result, selected_scanner,
599
+ print_matched_text, output_path, output_files, output_extensions, correct_mode,
600
+ correct_filepath, path_to_scan, excluded_path_without_dot, formats,
601
+ api_limit_exceed, cnt_file_except_skipped, final_output_path, run_kb_msg)
602
+ else:
603
+ print_help_msg_source_scanner()
604
+ result_log[RESULT_KEY] = "Unsupported scanner"
605
+ success = False
515
606
  else:
516
- print_help_msg_source_scanner()
517
- result_log[RESULT_KEY] = "Unsupported scanner"
607
+ result_log[RESULT_KEY] = f"Format error. {msg}"
518
608
  success = False
519
- else:
520
- result_log[RESULT_KEY] = f"Format error. {msg}"
521
- success = False
522
609
 
523
- try:
524
- shutil.copytree(output_path, final_output_path, dirs_exist_ok=True)
525
- shutil.rmtree(output_path)
526
- except Exception as ex:
527
- logger.debug(f"Failed to move temp files: {ex}")
610
+ publish_temp_output = True
611
+ finally:
612
+ publish_ok = _finalize_temp_output(output_path, final_output_path, publish_temp_output, logger)
613
+
614
+ if publish_temp_output and not publish_ok:
615
+ success = False
616
+ prev_msg = result_log.get(RESULT_KEY, "")
617
+ result_log[RESULT_KEY] = (
618
+ f"{prev_msg}, Failed to publish scan artifacts" if prev_msg
619
+ else "Failed to publish scan artifacts"
620
+ )
528
621
 
529
622
  return success, result_log.get(RESULT_KEY, ""), scan_item, license_list, scanoss_result
530
623
 
@@ -63,14 +63,23 @@ def _apply_scancode_unset_workaround(kwargs: dict) -> None:
63
63
  logger.debug("scancode UNSET workaround skipped: %s", ex)
64
64
 
65
65
 
66
+ def _directory_ignore_pattern(dir_name: str) -> str:
67
+ """Path-based glob for a directory name (avoids matching the scan root itself)."""
68
+ normalized = dir_name.strip().strip("/").replace("\\", "/")
69
+ if not normalized:
70
+ return dir_name
71
+ return f"**/{normalized}/**"
72
+
73
+
66
74
  def _default_scancode_coarse_ignore_patterns() -> frozenset:
67
75
  """
68
76
  Coarse ignore patterns aligned with fosslight_util.get_excluded_paths() rules.
69
- Uses segment-style globs so scancode does not need one pattern per file.
77
+ Directory names use path-based globs (e.g. **/tests/**) so they do not match
78
+ the scan root directory name itself.
70
79
  """
71
80
  patterns = {".*"}
72
81
  for name in PACKAGE_DIRECTORY + EXCLUDE_DIRECTORY:
73
- patterns.add(name)
82
+ patterns.add(_directory_ignore_pattern(name))
74
83
  for ext in EXCLUDE_FILE_EXTENSION:
75
84
  patterns.add(f"*.{ext}")
76
85
  for name in EXCLUDE_FILENAME:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fosslight_source
3
- Version: 2.2.17
3
+ Version: 2.3.1
4
4
  Summary: FOSSLight Source Scanner
5
5
  Author: LG Electronics
6
6
  License-Expression: Apache-2.0
@@ -26,6 +26,7 @@ Requires-Dist: wheel>=0.38.1
26
26
  Requires-Dist: intbitset
27
27
  Requires-Dist: fosslight_binary>=5.1.22
28
28
  Requires-Dist: scancode-toolkit>=32.0.2
29
+ Requires-Dist: cryptography<49; platform_system == "Darwin" and platform_machine == "x86_64"
29
30
  Requires-Dist: fingerprints==1.2.3
30
31
  Requires-Dist: normality==2.6.1
31
32
  Requires-Dist: psycopg2-binary>=2.9.10; python_version >= "3.13"
@@ -4,6 +4,7 @@ README.md
4
4
  pyproject.toml
5
5
  src/fosslight_source/__init__.py
6
6
  src/fosslight_source/_help.py
7
+ src/fosslight_source/_kb_client.py
7
8
  src/fosslight_source/_license_matched.py
8
9
  src/fosslight_source/_parsing_scancode_file_item.py
9
10
  src/fosslight_source/_parsing_scanoss_file.py
@@ -12,6 +12,9 @@ fingerprints==1.2.3
12
12
  normality==2.6.1
13
13
  tqdm
14
14
 
15
+ [:platform_system == "Darwin" and platform_machine == "x86_64"]
16
+ cryptography<49
17
+
15
18
  [:python_version < "3.11"]
16
19
  tomli
17
20
 
@@ -3,10 +3,13 @@
3
3
  # Copyright (c) 2020 LG Electronics Inc.
4
4
  # SPDX-License-Identifier: Apache-2.0
5
5
  import os
6
+ import shlex
6
7
  import subprocess
7
8
  import pytest
8
9
  import shutil
9
10
  import sys
11
+ import csv
12
+ import glob
10
13
 
11
14
  # Add project root to sys.path for importing FL Source modules
12
15
  sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
@@ -18,6 +21,26 @@ from fosslight_source._parsing_scancode_file_item import (
18
21
  )
19
22
 
20
23
  remove_directories = ["test_scan", "test_scan2", "test_scan3"]
24
+ TEST_FILES_SCAN_DIR = "test_scan"
25
+
26
+
27
+ def _parse_license_tokens(license_value: str) -> set[str]:
28
+ return {token.strip().lower() for token in (license_value or "").split(",") if token.strip()}
29
+
30
+
31
+ def _read_src_csv_rows(csv_path: str) -> list[dict]:
32
+ with open(csv_path, "r", encoding="utf-8") as file:
33
+ return list(csv.DictReader(file, delimiter="\t"))
34
+
35
+
36
+ def _rows_for_source(rows: list[dict], source_name: str) -> list[dict]:
37
+ return [row for row in rows if row.get("Source Path") == source_name]
38
+
39
+
40
+ def _find_scan_csv(output_dir: str) -> str:
41
+ csv_files = sorted(glob.glob(os.path.join(output_dir, "*.csv")))
42
+ assert csv_files, f"No CSV report found under {output_dir}"
43
+ return csv_files[-1]
21
44
 
22
45
 
23
46
  @pytest.fixture(scope="module", autouse=True)
@@ -31,8 +54,22 @@ def setup_test_result_dir():
31
54
 
32
55
 
33
56
  def run_command(command):
34
- process = subprocess.run(command, shell=True, capture_output=True, text=True)
35
- success = (process.returncode == 0)
57
+ command = command.strip()
58
+ if command.startswith("fosslight_source"):
59
+ args = shlex.split(command, posix=(os.name != "nt"))[1:]
60
+ if os.environ.get("FOSSLIGHT_USE_LOCAL_SRC"):
61
+ src_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src"))
62
+ env = os.environ.copy()
63
+ existing = env.get("PYTHONPATH", "")
64
+ env["PYTHONPATH"] = src_path if not existing else f"{src_path}{os.pathsep}{existing}"
65
+ cmd = [sys.executable, "-m", "fosslight_source.cli", *args]
66
+ process = subprocess.run(cmd, capture_output=True, text=True, env=env)
67
+ else:
68
+ cmd = ["fosslight_source", *args]
69
+ process = subprocess.run(cmd, capture_output=True, text=True)
70
+ else:
71
+ process = subprocess.run(command, shell=True, capture_output=True, text=True)
72
+ success = process.returncode == 0
36
73
  return success, process.stdout if success else process.stderr
37
74
 
38
75
 
@@ -112,6 +149,32 @@ def test_run():
112
149
  assert len(scan2_files) > 0, "Test Run: No scan files created in test_scan2 directory"
113
150
 
114
151
 
152
def test_test_files_scan_results():
    """Scan tests/test_files with scancode and check the CSV report contents.

    Every sample.cpp row must list both Apache-2.0 and MIT; the first temp.cpp
    row must list Apache-2.0 and carry a non-empty copyright text.
    """
    os.makedirs(TEST_FILES_SCAN_DIR, exist_ok=True)

    scan_command = f"fosslight_source -p tests/test_files -s scancode -f csv -o {TEST_FILES_SCAN_DIR}/"
    success, msg = run_command(scan_command)
    assert success is True, f"Test Run: test_files scan failed: {msg}"

    report_rows = _read_src_csv_rows(_find_scan_csv(TEST_FILES_SCAN_DIR))

    # sample.cpp: every reported row has to carry both licenses.
    sample_entries = _rows_for_source(report_rows, "sample.cpp")
    assert sample_entries, "Test Run: sample.cpp not found in scan result"
    for entry in sample_entries:
        found = _parse_license_tokens(entry.get("License", ""))
        assert "apache-2.0" in found, f"sample.cpp missing Apache-2.0 license: {entry.get('License')}"
        assert "mit" in found, f"sample.cpp missing MIT license: {entry.get('License')}"

    # temp.cpp: only the first reported row is inspected.
    temp_entries = _rows_for_source(report_rows, "temp.cpp")
    assert temp_entries, "Test Run: temp.cpp not found in scan result"
    first_temp = temp_entries[0]
    temp_found = _parse_license_tokens(first_temp.get("License", ""))
    assert "apache-2.0" in temp_found, f"temp.cpp missing Apache-2.0 license: {first_temp.get('License')}"
    copyright_text = (first_temp.get("Copyright Text") or "").strip()
    assert copyright_text, "Test Run: temp.cpp copyright not extracted"
176
+
177
+
115
178
  def test_help_command():
116
179
  success, msg = run_command("fosslight_source -h")
117
180
  assert success is True, f"Test Release: Help command failed :{msg}"