skip-trace 0.1.0-py3-none-any.whl → 0.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skip_trace/__about__.py +13 -3
- skip_trace/__init__.py +0 -2
- skip_trace/analysis/content_scanner.py +189 -0
- skip_trace/analysis/evidence.py +1 -1
- skip_trace/analysis/scoring.py +46 -1
- skip_trace/analysis/source_scanner.py +1 -1
- skip_trace/cli.py +1 -1
- skip_trace/collectors/__init__.py +2 -2
- skip_trace/collectors/github_files.py +359 -0
- skip_trace/collectors/package_files.py +232 -41
- skip_trace/collectors/pypi.py +1 -1
- skip_trace/collectors/pypi_attestations.py +160 -0
- skip_trace/collectors/sigstore.py +160 -0
- skip_trace/collectors/urls.py +96 -0
- skip_trace/m.py +287 -0
- skip_trace/main.py +103 -85
- skip_trace/reporting/md_reporter.py +68 -4
- skip_trace/schemas.py +21 -0
- skip_trace/utils/http_client.py +18 -0
- {skip_trace-0.1.0.dist-info → skip_trace-0.1.1.dist-info}/METADATA +7 -3
- skip_trace-0.1.1.dist-info/RECORD +39 -0
- skip_trace-0.1.0.dist-info/RECORD +0 -33
- {skip_trace-0.1.0.dist-info → skip_trace-0.1.1.dist-info}/WHEEL +0 -0
- {skip_trace-0.1.0.dist-info → skip_trace-0.1.1.dist-info}/entry_points.txt +0 -0
- {skip_trace-0.1.0.dist-info → skip_trace-0.1.1.dist-info}/licenses/LICENSE +0 -0
skip_trace/collectors/urls.py
ADDED

```diff
@@ -0,0 +1,96 @@
+# skip_trace/collectors/urls.py
+from __future__ import annotations
+
+import datetime
+import logging
+import os
+from typing import List, Set
+
+from bs4 import BeautifulSoup
+
+from ..analysis.content_scanner import scan_text
+from ..analysis.evidence import generate_evidence_id
+from ..schemas import EvidenceKind, EvidenceRecord, EvidenceSource
+from ..utils import http_client
+from ..utils.cache import get_cached_data, set_cached_data
+
+logger = logging.getLogger(__name__)
+URL_CACHE_DIR = ".urls"
+
+
+def _ensure_download_dir():
+    """Ensures the URL cache directory and .gitignore exist."""
+    os.makedirs(URL_CACHE_DIR, exist_ok=True)
+    gitignore_path = os.path.join(URL_CACHE_DIR, ".gitignore")
+    if not os.path.exists(gitignore_path):
+        with open(gitignore_path, "w", encoding="utf-8") as f:
+            f.write("*\n")
+
+
+def collect_from_urls(urls: Set[str]) -> List[EvidenceRecord]:
+    """
+    Downloads, caches, and scans a list of URLs for evidence.
+
+    Args:
+        urls: A set of unique URLs to scan.
+
+    Returns:
+        A list of EvidenceRecord objects from the URLs.
+    """
+    _ensure_download_dir()
+    all_evidence: List[EvidenceRecord] = []
+    now = datetime.datetime.now(datetime.timezone.utc)
+
+    for url in urls:
+        logger.info(f"Analyzing URL: {url}")
+        cached_data = get_cached_data("url", url)
+
+        status_code = -1
+        content = ""
+
+        if cached_data:
+            logger.debug(f"Using cached content for {url}")
+            status_code = cached_data.get("status_code", -1)
+            content = cached_data.get("content", "")
+        else:
+            response = http_client.make_request_safe(url)
+            if response:
+                status_code = response.status_code
+                if status_code == 200:
+                    content = response.text
+                set_cached_data(
+                    "url", url, {"status_code": status_code, "content": content}
+                )
+            else:
+                set_cached_data(
+                    "url", url, {"status_code": -1, "content": ""}
+                )  # Cache connection failure
+
+        # Create an evidence record for the URL status itself
+        status_value = {"status_code": status_code}
+        status_record = EvidenceRecord(
+            id=generate_evidence_id(
+                EvidenceSource.URL, EvidenceKind.URL_STATUS, url, str(status_value), url
+            ),
+            source=EvidenceSource.URL,
+            locator=url,
+            kind=EvidenceKind.URL_STATUS,
+            value=status_value,
+            observed_at=now,
+            confidence=0.0,  # This is informational, not for scoring
+            notes=f"HTTP status for {url} was {status_code}.",
+        )
+        all_evidence.append(status_record)
+
+        if content:
+            try:
+                soup = BeautifulSoup(content, "html.parser")
+                text_content = soup.get_text(separator=" ", strip=True)
+                url_evidence = scan_text(text_content, url, EvidenceSource.URL)
+                if url_evidence:
+                    logger.info(f"Found {len(url_evidence)} evidence records on {url}")
+                    all_evidence.extend(url_evidence)
+            except Exception as e:
+                logger.warning(f"Could not parse or scan HTML from {url}: {e}")
+
+    return all_evidence
```
skip_trace/m.py
ADDED
```diff
@@ -0,0 +1,287 @@
+# skip_trace/main.py
+from __future__ import annotations
+
+import argparse
+import dataclasses
+import json
+import logging
+import sys
+from typing import Set
+from urllib.parse import urlparse
+
+import tldextract
+from rich.logging import RichHandler
+
+from . import schemas
+from .analysis import evidence as evidence_analyzer
+from .analysis import scoring
+from .collectors import (
+    github,
+    github_files,
+    package_files,
+    pypi,
+    pypi_attestations,
+    urls,
+    whois,
+)
+from .config import CONFIG
+from .exceptions import CollectorError, NetworkError, NoEvidenceError
+from .reporting import json_reporter, md_reporter
+
+# Create a logger instance for this module
+logger = logging.getLogger(__name__)
+
+
+def setup_logging(level: str = "INFO"):
+    """Configures the application's logger.
+
+    Args:
+        level: The minimum logging level to display (e.g., "INFO", "DEBUG").
+    """
+    logging.basicConfig(
+        level=level,
+        format="%(message)s",
+        datefmt="[%X]",
+        handlers=[RichHandler(rich_tracebacks=True, show_path=False)],
+    )
+
+
+def run_who_owns(args: argparse.Namespace) -> int:
+    """Handler for the 'who-owns' command."""
+    logger.info(f"Executing 'who-owns' for package: {args.package}")
+
+    try:
+        # Collect initial data from PyPI
+        metadata = pypi.fetch_package_metadata(args.package, args.version)
+        package_name = metadata.get("info", {}).get("name", args.package)
+        package_version = metadata.get("info", {}).get("version")
+        logger.info(
+            f"Successfully fetched metadata for {package_name} v{package_version}"
+        )
+
+        # Analyze primary package metadata
+        evidence_records, pypi_maintainers = evidence_analyzer.extract_from_pypi(
+            metadata
+        )
+
+        logger.info(f"Evidence records so far {len(evidence_records)} -- pypi metadata")
+
+        # Check for PyPI attestations
+        attestation_evidence = pypi_attestations.collect(metadata)
+        evidence_records.extend(attestation_evidence)
+        logger.info(
+            f"Evidence records so far {len(evidence_records)} -- collected from PyPI attestations"
+        )
+
+        # Analyze package contents for deep evidence
+        try:
+            package_files_evidence = package_files.collect_from_package_files(metadata)
+            evidence_records.extend(package_files_evidence)
+            logger.info(
+                f"Evidence records so far {len(evidence_records)} -- collected from source code in package"
+            )
+        except CollectorError as e:
+            logger.warning(f"Could not analyze package files for {package_name}: {e}")
+
+        # Cross-Reference for more PyPI evidence
+        cross_ref_evidence = pypi.cross_reference_by_user(package_name)
+        evidence_records.extend(cross_ref_evidence)
+        logger.info(
+            f"Evidence records so far {len(evidence_records)} -- user cross ref"
+        )
+
+        # Fetch evidence from code repositories found in PyPI evidence
+        repo_urls = set()
+        for record in evidence_records:
+            if (
+                record.source == schemas.EvidenceSource.PYPI
+                and record.kind == schemas.EvidenceKind.ORGANIZATION
+            ):
+                url = record.value.get("url")
+                if url and "github.com" in url:
+                    repo_urls.add(url)
+
+        for url in repo_urls:
+            logger.info(f"Analyzing GitHub repository: {url}")
+            try:
+                github_evidence = github.extract_from_repo_url(url)
+                evidence_records.extend(github_evidence)
+                logger.info(
+                    f"Evidence records so far {len(evidence_records)} -- collected from github"
+                )
+            except CollectorError as e:
+                logger.warning(f"Could not fully analyze GitHub repo {url}: {e}")
+
+            # NEW: Collect evidence from GitHub files (SECURITY.md, FUNDING.yml, contributors)
+            try:
+                github_files_evidence = github_files.collect_from_repo_url(url)
+                evidence_records.extend(github_files_evidence)
+                logger.info(
+                    f"Evidence records so far {len(evidence_records)} -- collected from github files"
+                )
+            except CollectorError as e:
+                logger.warning(f"Could not collect GitHub files for {url}: {e}")
+
+        # Extract domains and perform WHOIS lookups
+        domains_to_check: Set[str] = set()
+        urls_to_scan: Set[str] = set()
+        ignored_domains = set(CONFIG.get("whois_ignored_domains", []))
+
+        for record in evidence_records:
+            # Extract domains for WHOIS
+            if email := record.value.get("email"):
+                if "@" in email:
+                    domain = email.split("@")[1]
+                    if domain not in ignored_domains:
+                        domains_to_check.add(domain)
+            # Extract domains and full URLs
+            if url := record.value.get("url"):
+                urls_to_scan.add(url)
+
+                # If it's a GitHub repo URL, also scan the user/org URL.
+                try:
+                    parsed_url = urlparse(url)
+                    if "github.com" in parsed_url.netloc:
+                        path_parts = [p for p in parsed_url.path.split("/") if p]
+                        if len(path_parts) >= 2:  # e.g., /owner/repo
+                            user_url = f"{parsed_url.scheme}://{parsed_url.netloc}/{path_parts[0]}"
+                            urls_to_scan.add(user_url)
+                except Exception as e:
+                    logger.debug(f"Could not parse user URL from {url}: {e}")
+
+                # Gather domains from URLs for WHOIS, respecting the ignore list
+                extracted = tldextract.extract(url)
+                if extracted.registered_domain:
+                    if extracted.registered_domain not in ignored_domains:
+                        domains_to_check.add(extracted.registered_domain)
+                        urls_to_scan.add(url)
+
+        # Perform WHOIS lookups
+        logger.info(f"Domains for WHOIS: {', '.join(sorted(list(domains_to_check)))}")
+        if domains_to_check:
+            for domain in domains_to_check:
+                try:
+                    evidence_records.extend(whois.collect_from_domain(domain))
+                    logger.info(
+                        f"Evidence records so far {len(evidence_records)} -- collected from domains/whois"
+                    )
+                except CollectorError as e:
+                    logger.warning(f"WHOIS failed for {domain}: {e}")
+
+        # Scan homepage URLs
+        logger.info(f"URLs to scan: {', '.join(sorted(list(urls_to_scan)))}")
+        if urls_to_scan:
+            try:
+                evidence_records.extend(urls.collect_from_urls(urls_to_scan))
+                logger.info(
+                    f"Evidence records so far {len(evidence_records)} -- collected from urls"
+                )
+            except CollectorError as e:
+                logger.warning(f"URL scanning failed: {e}")
+
+        # Score all collected evidence
+        owner_candidates = scoring.score_owners(evidence_records)
+
+        # Assemble final result object
+        package_result = schemas.PackageResult(
+            package=package_name,
+            version=package_version,
+            owners=owner_candidates,
+            maintainers=pypi_maintainers,
+            evidence=evidence_records,
+        )
+
+        # 10. Report
+        if args.output_format == "json":
+            json_reporter.render(package_result)
+        else:
+            md_reporter.render(package_result)
+
+        # Exit codes
+        top_score = owner_candidates[0].score if owner_candidates else 0
+        return 0 if top_score >= 0.5 else 101
+    except NoEvidenceError as e:
+        logger.error(f"{type(e).__name__}: {e}")
+        return 101  # As per the PEP for "No usable evidence"
+    except NetworkError as e:
+        print(f"Error: A network problem occurred: {e}", file=sys.stderr)
+        return 101
+
+
+# --- Handler for the `explain` command ---
+def run_explain(args: argparse.Namespace) -> int:
+    """Handler for the 'explain' command."""
+    logger.info(f"Explaining evidence for package: {args.package}")
+    try:
+        metadata = pypi.fetch_package_metadata(args.package)
+        evidence_records, _ = evidence_analyzer.extract_from_pypi(metadata)
+
+        if args.id:
+            record = next(
+                (r for r in evidence_records if r.id.startswith(args.id)), None
+            )
+            if record:
+                print(json.dumps(dataclasses.asdict(record), indent=2, default=str))
+                return 0
+            logger.error(f"Evidence ID matching '{args.id}' not found.")
+            return 1
+        # Show all evidence
+        output: list[dict[str, str | None]] = [
+            dataclasses.asdict(r) for r in evidence_records
+        ]
+        print(json.dumps(output, indent=2, default=str))
+        return 0
+
+    except (NoEvidenceError, NetworkError) as e:
+        logger.error(f"{type(e).__name__}: {e}")
+        return 101
+
+
+def run_venv(args: argparse.Namespace) -> int:
+    """Handler for the 'venv' command."""
+    print("Executing 'venv' command...")
+    print(f"  Path: {args.path or 'current environment'}")
+    # TODO: Implement the actual logic
+    return 200  # Placeholder for "No anonymous"
+
+
+def run_reqs(args: argparse.Namespace) -> int:
+    """Handler for the 'reqs' command."""
+    print("Executing 'reqs' command...")
+    print(f"  Requirements File: {args.requirements_file}")
+    # TODO: Implement the actual logic
+    return 200  # Placeholder for "No anonymous"
+
+
+# ... Add placeholder functions for other commands ...
+
+
+def run_command(args: argparse.Namespace) -> int:
+    """
+    Dispatches the parsed arguments to the appropriate handler function.
+
+    Args:
+        args: The parsed arguments from argparse.
+
+    Returns:
+        An exit code.
+    """
+    # Prefer --verbose if set
+    log_level = "DEBUG" if args.log_level == "DEBUG" else args.log_level
+    setup_logging(log_level)
+    command_handlers = {
+        "who-owns": run_who_owns,
+        "explain": run_explain,
+        "venv": run_venv,
+        "reqs": run_reqs,
+        # "graph": run_graph,
+        # "cache": run_cache,
+        # "policy": run_policy,
+    }
+
+    handler = command_handlers.get(args.command)
+
+    if handler:
+        return handler(args)
+    print(f"Error: Command '{args.command}' is not yet implemented.", file=sys.stderr)
+    return 2
```
skip_trace/main.py
CHANGED
```diff
@@ -7,6 +7,7 @@ import json
 import logging
 import sys
 from typing import Set
+from urllib.parse import urlparse
 
 import tldextract
 from rich.logging import RichHandler
@@ -14,11 +15,18 @@ from rich.logging import RichHandler
 from . import schemas
 from .analysis import evidence as evidence_analyzer
 from .analysis import scoring
-from .collectors import
+from .collectors import (
+    github,
+    github_files,
+    package_files,
+    pypi,
+    pypi_attestations,
+    urls,
+    whois,
+)
 from .config import CONFIG
 from .exceptions import CollectorError, NetworkError, NoEvidenceError
 from .reporting import json_reporter, md_reporter
-from .utils.validation import is_valid_email
 
 # Create a logger instance for this module
 logger = logging.getLogger(__name__)
@@ -43,24 +51,46 @@ def run_who_owns(args: argparse.Namespace) -> int:
     logger.info(f"Executing 'who-owns' for package: {args.package}")
 
     try:
-        #
+        # Collect initial data from PyPI
         metadata = pypi.fetch_package_metadata(args.package, args.version)
         package_name = metadata.get("info", {}).get("name", args.package)
         package_version = metadata.get("info", {}).get("version")
-        logger.
+        logger.info(
             f"Successfully fetched metadata for {package_name} v{package_version}"
         )
 
-        #
+        # Analyze primary package metadata
        evidence_records, pypi_maintainers = evidence_analyzer.extract_from_pypi(
             metadata
         )
 
-
+        logger.info(f"Evidence records so far {len(evidence_records)} -- pypi metadata")
+
+        # Check for PyPI attestations
+        attestation_evidence = pypi_attestations.collect(metadata)
+        evidence_records.extend(attestation_evidence)
+        logger.info(
+            f"Evidence records so far {len(evidence_records)} -- collected from PyPI attestations"
+        )
+
+        # Analyze package contents for deep evidence
+        try:
+            package_files_evidence = package_files.collect_from_package_files(metadata)
+            evidence_records.extend(package_files_evidence)
+            logger.info(
+                f"Evidence records so far {len(evidence_records)} -- collected from source code in package"
+            )
+        except CollectorError as e:
+            logger.warning(f"Could not analyze package files for {package_name}: {e}")
+
+        # Cross-Reference for more PyPI evidence
         cross_ref_evidence = pypi.cross_reference_by_user(package_name)
         evidence_records.extend(cross_ref_evidence)
+        logger.info(
+            f"Evidence records so far {len(evidence_records)} -- user cross ref"
+        )
 
-        #
+        # Fetch evidence from code repositories found in PyPI evidence
         repo_urls = set()
         for record in evidence_records:
             if (
@@ -76,76 +106,83 @@ def run_who_owns(args: argparse.Namespace) -> int:
             try:
                 github_evidence = github.extract_from_repo_url(url)
                 evidence_records.extend(github_evidence)
+                logger.info(
+                    f"Evidence records so far {len(evidence_records)} -- collected from github"
+                )
             except CollectorError as e:
                 logger.warning(f"Could not fully analyze GitHub repo {url}: {e}")
 
-
+            # NEW: Collect evidence from GitHub files (SECURITY.md, FUNDING.yml, contributors)
+            try:
+                github_files_evidence = github_files.collect_from_repo_url(url)
+                evidence_records.extend(github_files_evidence)
+                logger.info(
+                    f"Evidence records so far {len(evidence_records)} -- collected from github files"
+                )
+            except CollectorError as e:
+                logger.warning(f"Could not collect GitHub files for {url}: {e}")
+
+        # Extract domains and perform WHOIS lookups
         domains_to_check: Set[str] = set()
+        urls_to_scan: Set[str] = set()
         ignored_domains = set(CONFIG.get("whois_ignored_domains", []))
 
         for record in evidence_records:
-
-
-
-
-
-
-
-            ):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            elif contact_value and "://" in contact_value:
-                extracted = tldextract.extract(contact_value)
-                if extracted.registered_domain:
-                    potential_domains.add(extracted.registered_domain)
-
-            # Add valid domains to the main set to be checked
-            for domain in potential_domains:
-                if domain not in ignored_domains:
-                    domains_to_check.add(domain)
-
+            # Extract domains for WHOIS
+            if email := record.value.get("email"):
+                if "@" in email:
+                    domain = email.split("@")[1]
+                    if domain not in ignored_domains:
+                        domains_to_check.add(domain)
+            # Extract domains and full URLs
+            if url := record.value.get("url"):
+                urls_to_scan.add(url)
+
+                # If it's a GitHub repo URL, also scan the user/org URL.
+                try:
+                    parsed_url = urlparse(url)
+                    if "github.com" in parsed_url.netloc:
+                        path_parts = [p for p in parsed_url.path.split("/") if p]
+                        if len(path_parts) >= 2:  # e.g., /owner/repo
+                            user_url = f"{parsed_url.scheme}://{parsed_url.netloc}/{path_parts[0]}"
+                            urls_to_scan.add(user_url)
+                except Exception as e:
+                    logger.debug(f"Could not parse user URL from {url}: {e}")
+
+                # Gather domains from URLs for WHOIS, respecting the ignore list
+                extracted = tldextract.extract(url)
+                if extracted.registered_domain:
+                    if extracted.registered_domain not in ignored_domains:
+                        domains_to_check.add(extracted.registered_domain)
+                        urls_to_scan.add(url)
+
+        # Perform WHOIS lookups
+        logger.info(f"Domains for WHOIS: {', '.join(sorted(list(domains_to_check)))}")
         if domains_to_check:
-            logger.info(
-                f"Found domains for WHOIS lookup: {', '.join(sorted(list(domains_to_check)))}"
-            )
             for domain in domains_to_check:
                 try:
-
-
+                    evidence_records.extend(whois.collect_from_domain(domain))
+                    logger.info(
+                        f"Evidence records so far {len(evidence_records)} -- collected from domains/whois"
+                    )
                 except CollectorError as e:
-                    logger.warning(f"
+                    logger.warning(f"WHOIS failed for {domain}: {e}")
 
-        #
-
-
-
-
-
+        # Scan homepage URLs
+        logger.info(f"URLs to scan: {', '.join(sorted(list(urls_to_scan)))}")
+        if urls_to_scan:
+            try:
+                evidence_records.extend(urls.collect_from_urls(urls_to_scan))
+                logger.info(
+                    f"Evidence records so far {len(evidence_records)} -- collected from urls"
+                )
+            except CollectorError as e:
+                logger.warning(f"URL scanning failed: {e}")
 
-        #
+        # Score all collected evidence
         owner_candidates = scoring.score_owners(evidence_records)
 
-        #
+        # Assemble final result object
         package_result = schemas.PackageResult(
             package=package_name,
             version=package_version,
@@ -154,30 +191,15 @@ def run_who_owns(args: argparse.Namespace) -> int:
             evidence=evidence_records,
         )
 
-        #
+        # 10. Report
         if args.output_format == "json":
             json_reporter.render(package_result)
         else:
             md_reporter.render(package_result)
 
-        #
-        # Using placeholder thresholds for now
+        # Exit codes
         top_score = owner_candidates[0].score if owner_candidates else 0
-        if top_score >= 0.
-            return 0  # Success
-        if top_score >= 0.5:
-            return 0  # Indeterminate  # The tool didn't fail
-        return 101  # No usable evidence
-
-        # TODO: Pass evidence_records to the scoring engine
-        # Later, this will be replaced by a call to the analysis and reporting modules.
-        # For example:
-        #
-        # evidence = analysis.evidence.extract_from_pypi(metadata)
-        # owners = analysis.scoring.score_owners(evidence)
-        # package_result = schemas.PackageResult(package=args.package, owners=owners, evidence=evidence)
-        # reporting.json_reporter.render(package_result)
-        # return 0
+        return 0 if top_score >= 0.5 else 101
     except NoEvidenceError as e:
         logger.error(f"{type(e).__name__}: {e}")
         return 101  # As per the PEP for "No usable evidence"
@@ -195,13 +217,11 @@ def run_explain(args: argparse.Namespace) -> int:
         evidence_records, _ = evidence_analyzer.extract_from_pypi(metadata)
 
         if args.id:
-            # Filter for a specific evidence ID
             record = next(
                 (r for r in evidence_records if r.id.startswith(args.id)), None
             )
             if record:
-
-                print(json.dumps(output_record, indent=2, default=str))
+                print(json.dumps(dataclasses.asdict(record), indent=2, default=str))
                 return 0
             logger.error(f"Evidence ID matching '{args.id}' not found.")
             return 1
@@ -209,7 +229,6 @@ def run_explain(args: argparse.Namespace) -> int:
         output: list[dict[str, str | None]] = [
             dataclasses.asdict(r) for r in evidence_records
         ]
-
         print(json.dumps(output, indent=2, default=str))
         return 0
 
@@ -255,7 +274,6 @@ def run_command(args: argparse.Namespace) -> int:
         "explain": run_explain,
         "venv": run_venv,
         "reqs": run_reqs,
-        # "explain": run_explain,
         # "graph": run_graph,
         # "cache": run_cache,
         # "policy": run_policy,
```