skip_trace-0.1.0-py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the package versions as they appear in their public registries.
- skip_trace/__about__.py +19 -0
- skip_trace/__init__.py +6 -0
- skip_trace/__main__.py +9 -0
- skip_trace/analysis/__init__.py +4 -0
- skip_trace/analysis/evidence.py +312 -0
- skip_trace/analysis/ner.py +58 -0
- skip_trace/analysis/scoring.py +282 -0
- skip_trace/analysis/source_scanner.py +411 -0
- skip_trace/cli.py +177 -0
- skip_trace/collectors/__init__.py +4 -0
- skip_trace/collectors/github.py +241 -0
- skip_trace/collectors/package_files.py +150 -0
- skip_trace/collectors/pypi.py +158 -0
- skip_trace/collectors/whois.py +202 -0
- skip_trace/config.py +165 -0
- skip_trace/exceptions.py +22 -0
- skip_trace/main.py +269 -0
- skip_trace/py.typed.py +0 -0
- skip_trace/reporting/__init__.py +0 -0
- skip_trace/reporting/json_reporter.py +22 -0
- skip_trace/reporting/md_reporter.py +115 -0
- skip_trace/schemas.py +131 -0
- skip_trace/utils/__init__.py +4 -0
- skip_trace/utils/cache.py +77 -0
- skip_trace/utils/cli_suggestions.py +91 -0
- skip_trace/utils/http_client.py +45 -0
- skip_trace/utils/safe_targz.py +161 -0
- skip_trace/utils/validation.py +52 -0
- skip_trace-0.1.0.dist-info/METADATA +125 -0
- skip_trace-0.1.0.dist-info/RECORD +33 -0
- skip_trace-0.1.0.dist-info/WHEEL +4 -0
- skip_trace-0.1.0.dist-info/entry_points.txt +2 -0
- skip_trace-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,241 @@
# skip_trace/collectors/github.py
from __future__ import annotations

import datetime
import logging
from typing import Dict, List, Optional, Set
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from github import Github, GithubException
from github.NamedUser import NamedUser

from ..analysis.evidence import generate_evidence_id
from ..config import CONFIG
from ..exceptions import CollectorError, NetworkError
from ..schemas import EvidenceKind, EvidenceRecord, EvidenceSource
from ..utils import http_client

logger = logging.getLogger(__name__)

_github_client: Optional[Github] = None


def get_github_client() -> Optional[Github]:
    """
    Initializes and returns a singleton PyGithub client instance.

    Uses the GITHUB_TOKEN from the config if available.

    Returns:
        An authenticated Github client instance, or None if the token is missing.
    """
    global _github_client
    if _github_client:
        return _github_client

    github_config = CONFIG.get("github", {})
    api_key = github_config.get("api_key")

    if not api_key:
        logger.warning(
            "GITHUB_TOKEN not found in environment. GitHub API requests will be unauthenticated and rate-limited."
        )
        _github_client = Github()
    else:
        logger.debug("Authenticating to GitHub API with token.")
        _github_client = Github(api_key)

    return _github_client


def _parse_repo_url(url: str) -> Optional[str]:
    """Parses a GitHub URL to extract the 'owner/repo' string."""
    try:
        parsed = urlparse(url)
        if parsed.hostname and "github.com" in parsed.hostname:
            path = parsed.path.strip("/")
            if ".git" in path:
                path = path.replace(".git", "")
            if len(path.split("/")) >= 2:
                return "/".join(path.split("/")[:2])
    except Exception:
        pass  # nosec # noqa
    logger.debug(f"Could not parse a valid GitHub repository from URL: {url}")
    return None


def _scrape_socials_from_html(html_url: str) -> Dict[str, str]:
    """Scrapes a user's GitHub profile page for social media and blog links."""
    contacts: Dict[str, str] = {}
    try:
        logger.debug(f"Scraping GitHub profile HTML page for social links: {html_url}")
        response = http_client.make_request(html_url)
        soup = BeautifulSoup(response.text, "html.parser")

        # Find all links within the user profile section
        # profile_links = soup.select('div[data-bio-টারের] a[href], ul.vcard-details a[href]')
        # brittle!
        profile_links = soup.select(
            "div.user-profile-bio a[href], ul.vcard-details a[href]"
        )
        for link in profile_links:
            href = link.get("href")
            if not href:
                continue

            # Simple heuristic mapping of domain to platform name
            if "linkedin.com/in" in href and "linkedin" not in contacts:
                contacts["linkedin"] = href  # type: ignore[assignment]
            elif (
                "mastodon.social" in href or "fosstodon.org" in href
            ) and "mastodon" not in contacts:
                contacts["mastodon"] = href  # type: ignore[assignment]
            elif "twitter.com" in href and "twitter" not in contacts:
                # Prefer the twitter_username from API, but take this if needed
                contacts["twitter"] = href  # type: ignore[assignment]
            elif ("blog." in href or "medium.com" in href) and "blog" not in contacts:
                contacts["blog"] = href  # type: ignore[assignment]

    except NetworkError as e:
        logger.warning(f"Could not scrape GitHub profile page {html_url}: {e}")
    except Exception as e:
        logger.error(f"Error during social scraping for {html_url}: {e}")

    return contacts


def _create_records_from_user_profile(user: NamedUser) -> List[EvidenceRecord]:
    """Creates evidence records from a full GitHub user profile."""
    records = []
    name = user.name or user.login
    now = datetime.datetime.now(datetime.timezone.utc)

    # Evidence for company affiliation
    if user.company:
        value: dict[str, str | None] = {"user_name": name, "company_name": user.company}
        records.append(
            EvidenceRecord(
                id=generate_evidence_id(
                    EvidenceSource.REPO,
                    EvidenceKind.USER_COMPANY,
                    user.html_url,
                    str(value),
                    name,
                    hint="company",
                ),
                source=EvidenceSource.REPO,
                locator=user.html_url,
                kind=EvidenceKind.USER_COMPANY,
                value=value,
                observed_at=now,
                confidence=0.8,
                notes=f"User '{name}' lists company affiliation as '{user.company}'.",
            )
        )

    # Evidence for other profile contacts
    profile_contacts = {
        "email": user.email,
        "twitter": (
            f"https://twitter.com/{user.twitter_username}"
            if user.twitter_username
            else None
        ),
        "blog": user.blog,
    }

    # Scrape HTML for links not available in the API
    scraped_contacts = _scrape_socials_from_html(user.html_url)
    # The scraped contacts take precedence if they exist
    profile_contacts.update(scraped_contacts)

    # Filter out empty values
    profile_contacts = {k: v for k, v in profile_contacts.items() if v}
    if profile_contacts:
        profile_info = {"user_name": name, "contacts": profile_contacts}
        records.append(
            EvidenceRecord(
                id=generate_evidence_id(
                    EvidenceSource.REPO,
                    EvidenceKind.USER_PROFILE,
                    user.html_url,
                    str(profile_info),  # TODO: stringify this better?
                    name,
                    hint="profile",
                ),
                source=EvidenceSource.REPO,
                locator=user.html_url,
                kind=EvidenceKind.USER_PROFILE,
                value=profile_info,
                observed_at=now,
                confidence=0.9,
                notes=f"Found contact details on GitHub user profile for '{name}'.",
            )
        )

    return records


def extract_from_repo_url(repo_url: str) -> List[EvidenceRecord]:
    """
    Extracts ownership evidence from a GitHub repository URL.

    Args:
        repo_url: The full URL of the GitHub repository.

    Returns:
        A list of EvidenceRecord objects.
    """
    evidence = []
    processed_users: Set[str] = set()
    repo_full_name = _parse_repo_url(repo_url)
    if not repo_full_name:
        return []

    client = get_github_client()
    if not client:
        return []

    try:
        logger.debug(f"Fetching repository details for '{repo_full_name}'")
        repo = client.get_repo(repo_full_name)

        # 1. Process the repository owner's full profile
        owner = repo.owner
        if owner.login not in processed_users:
            evidence.extend(_create_records_from_user_profile(owner))
            processed_users.add(owner.login)

        # 2. Process recent commit authors' full profiles
        logger.debug(f"Fetching recent commits for '{repo_full_name}'")
        commits = repo.get_commits()
        # Limit to the most recent commits to avoid excessive API usage
        for i, commit in enumerate(commits):
            if i >= 10:  # Limit to recent 10 to reduce API calls
                break
            # commit.author is a full NamedUser if available
            if (
                isinstance(commit.author, NamedUser)
                and commit.author.login not in processed_users
            ):
                evidence.extend(_create_records_from_user_profile(commit.author))
                processed_users.add(commit.author.login)

    except GithubException as e:
        logger.error(f"GitHub API error for '{repo_full_name}': {e.status} {e.data}")
        raise CollectorError(
            f"Could not access GitHub repository '{repo_full_name}'"
        ) from e
    except Exception as e:
        logger.error(
            f"An unexpected error occurred while processing GitHub repo '{repo_full_name}': {e}"
        )
        raise CollectorError(
            f"Unexpected error for GitHub repo '{repo_full_name}'"
        ) from e

    logger.info(
        f"Extracted {len(evidence)} evidence records from GitHub user profiles for repo '{repo_full_name}'."
    )
    return evidence
@@ -0,0 +1,150 @@
# skip_trace/collectors/package_files.py
from __future__ import annotations

import logging
import os
import shutil
import tarfile
import zipfile
from typing import Any, Dict, List, Optional

from ..analysis import source_scanner
from ..exceptions import CollectorError, NetworkError
from ..schemas import EvidenceRecord
from ..utils import http_client
from ..utils.safe_targz import safe_extract_auto

logger = logging.getLogger(__name__)
PACKAGE_DOWNLOAD_DIR = ".packages"


def _ensure_download_dir():
    """Ensures the package download directory and .gitignore exist."""
    os.makedirs(PACKAGE_DOWNLOAD_DIR, exist_ok=True)
    gitignore_path = os.path.join(PACKAGE_DOWNLOAD_DIR, ".gitignore")
    if not os.path.exists(gitignore_path):
        with open(gitignore_path, "w", encoding="utf-8") as f:
            f.write("*\n")


def _find_download_url(metadata: Dict[str, Any]) -> Optional[str]:
    """Finds the best distribution URL from PyPI metadata."""
    urls = metadata.get("urls", [])
    if not urls:
        return None

    # Prioritize wheels, then sdist, then anything else
    wheel_url = None
    sdist_url = None
    for url_info in urls:
        packagetype = url_info.get("packagetype")
        if packagetype == "bdist_wheel":
            wheel_url = url_info.get("url")
        elif packagetype == "sdist":
            sdist_url = url_info.get("url")

    # Return in order of preference: wheel, then sdist, then fallback
    return wheel_url or sdist_url or (urls[0].get("url") if urls else None)


def collect_from_package_files(metadata: Dict[str, Any]) -> List[EvidenceRecord]:
    """
    Downloads, extracts, and scans a package's files for evidence.

    Args:
        metadata: The PyPI JSON metadata for the package.

    Returns:
        A list of EvidenceRecord objects found within the package files.
    """
    info = metadata.get("info", {})
    package_name = info.get("name", "unknown")
    package_version = info.get("version", "latest")
    logger.info(f"Starting file analysis for {package_name} v{package_version}")

    download_url = _find_download_url(metadata)
    if not download_url:
        logger.warning(
            f"No download URL found for {package_name}. Skipping file analysis."
        )
        return []

    _ensure_download_dir()
    filename = os.path.basename(download_url)
    download_path = os.path.join(PACKAGE_DOWNLOAD_DIR, filename)

    # Download the file if it doesn't already exist
    if not os.path.exists(download_path):
        logger.info(f"Downloading {filename} from {download_url}")
        try:
            with http_client.get_client().stream("GET", download_url) as response:
                response.raise_for_status()
                with open(download_path, "wb") as f:
                    for chunk in response.iter_bytes():
                        f.write(chunk)
        except (
            NetworkError,
            http_client.httpx.RequestError,
            http_client.httpx.HTTPStatusError,
        ) as e:
            raise CollectorError(f"Failed to download package {filename}: {e}") from e

    # Determine the persistent extraction directory path from the filename
    base_filename = filename
    for ext in [".whl", ".zip", ".tar.gz", ".tgz", ".tar.bz2"]:
        if base_filename.endswith(ext):
            base_filename = base_filename[: -len(ext)]
            break
    extract_dir = os.path.join(PACKAGE_DOWNLOAD_DIR, base_filename)

    # Extract the archive ONLY if the destination directory doesn't already exist
    if not os.path.exists(extract_dir):
        logger.info(f"Extracting {download_path} to {extract_dir}")
        os.makedirs(extract_dir, exist_ok=True)
        try:
            if download_path.endswith((".whl", ".zip")):
                with zipfile.ZipFile(download_path, "r") as zf:  # nosec # noqa
                    zf.extractall(extract_dir)  # nosec # noqa
            elif download_path.endswith(
                (".tar.gz", ".tgz", ".tar.bz2", ".tar.xz", ".tar")
            ):
                safe_extract_auto(download_path, extract_dir)
            # elif download_path.endswith((".tar.gz", ".tgz")):
            #     with tarfile.open(download_path, "r:gz") as tf:  # nosec # noqa
            #         tf.extractall(extract_dir)  # nosec # noqa
            # elif download_path.endswith(".tar.bz2"):
            #     with tarfile.open(download_path, "r:bz2") as tf:  # nosec # noqa
            #         tf.extractall(extract_dir)  # nosec # noqa
            else:
                logger.warning(
                    f"Unsupported archive format for {filename}. Skipping file scan."
                )
                shutil.rmtree(extract_dir)  # Clean up the empty dir
                return []
        except (zipfile.BadZipFile, tarfile.TarError, PermissionError) as e:
            logger.error(f"Failed to extract archive {download_path}: {e}")
            # Clean up potentially corrupted extraction on error
            shutil.rmtree(extract_dir, ignore_errors=True)
            return []
    else:
        logger.info(f"Using cached package files from {extract_dir}")

    # Determine the actual directory to scan (handles sdists with a single top-level folder)
    scan_target_dir = extract_dir
    # This logic applies to sdists, which often have a root folder. Wheels do not.
    if filename.endswith((".tar.gz", ".tgz", ".tar.bz2")):
        try:
            dir_contents = os.listdir(extract_dir)
            if len(dir_contents) == 1 and os.path.isdir(
                os.path.join(extract_dir, dir_contents[0])
            ):
                scan_target_dir = os.path.join(extract_dir, dir_contents[0])
        except FileNotFoundError:
            logger.error(
                f"Extraction directory {extract_dir} not found after apparent success. Check permissions."
            )
            return []

    locator_prefix = f"{package_name}-{package_version}"
    evidence = source_scanner.scan_directory(scan_target_dir, locator_prefix)
    return evidence
@@ -0,0 +1,158 @@
# skip_trace/collectors/pypi.py
from __future__ import annotations

import datetime
import logging
from typing import Any, Dict, List, Optional, Set

from bs4 import BeautifulSoup

from ..analysis.evidence import extract_from_pypi as analyze_pypi_metadata
from ..analysis.evidence import generate_evidence_id
from ..exceptions import NetworkError, NoEvidenceError
from ..schemas import EvidenceKind, EvidenceRecord, EvidenceSource
from ..utils import http_client

logger = logging.getLogger(__name__)
PYPI_JSON_API_URL = "https://pypi.org/pypi"
PYPI_PROJECT_URL = "https://pypi.org/project"


def fetch_package_metadata(
    package_name: str, version: Optional[str] = None
) -> Dict[str, Any]:
    """
    Fetches package metadata from the PyPI JSON API.

    :param package_name: The name of the package.
    :param version: The optional specific version of the package.
    :raises NoEvidenceError: If the package is not found (404).
    :raises NetworkError: For other network or HTTP errors.
    :return: A dictionary containing the package's JSON metadata.
    """
    if version:
        url = f"{PYPI_JSON_API_URL}/{package_name}/{version}/json"
    else:
        url = f"{PYPI_JSON_API_URL}/{package_name}/json"

    try:
        response = http_client.make_request(url)
        return response.json()
    except NetworkError as e:
        if "404" in str(e):
            raise NoEvidenceError(
                f"Package '{package_name}'"
                f"{f' version {version}' if version else ''} not found on PyPI."
            ) from e
        raise


def _scrape_user_profile_url(package_name: str) -> Optional[str]:
    """Scrapes the PyPI project page to find the user profile URL."""
    try:
        url = f"{PYPI_PROJECT_URL}/{package_name}/"
        logger.debug(f"Scraping project page for user link: {url}")
        response = http_client.make_request(url)
        soup = BeautifulSoup(response.text, "html.parser")

        # The user link is typically in a `p` tag with the class 'sidebar-section__user-gravatar-text'
        user_link = soup.find("a", href=lambda href: href and href.startswith("/user/"))
        if user_link and user_link.has_attr("href"):
            profile_url = f"https://pypi.org{user_link['href']}"
            logger.debug(f"Found user profile URL: {profile_url}")
            return profile_url
    except NetworkError as e:
        logger.warning(f"Could not scrape project page for '{package_name}': {e}")
    return None


def _fetch_other_package_urls(user_profile_url: str) -> Set[str]:
    """Scrapes a user's profile page to find their other packages."""
    packages = set()
    try:
        logger.debug(f"Scraping user profile for other packages: {user_profile_url}")
        response = http_client.make_request(user_profile_url)
        soup = BeautifulSoup(response.text, "html.parser")

        # Links to packages are in a 'package-snippet' class
        for link in soup.find_all("a", class_="package-snippet"):
            if link.has_attr("href") and link["href"].startswith("/project/"):  # type: ignore[union-attr]
                packages.add(link["href"].split("/")[2])  # type: ignore[union-attr]
        logger.debug(f"Found {len(packages)} other packages by user.")
        return packages
    except NetworkError as e:
        logger.warning(f"Could not scrape user profile page '{user_profile_url}': {e}")
    return packages


def cross_reference_by_user(package_name: str) -> List[EvidenceRecord]:
    """
    Finds other packages by the same user to uncover more evidence.
    Also creates an evidence record for the PyPI user itself.

    Args:
        package_name: The name of the starting package.

    Returns:
        A list of new EvidenceRecord objects found from related packages.
    """
    new_evidence: List[EvidenceRecord] = []
    profile_url = _scrape_user_profile_url(package_name)

    # --- NEW: Always create evidence for the PyPI user if found ---
    if profile_url:
        try:
            username = profile_url.strip("/").rsplit("/", maxsplit=1)[-1]
            value = {"name": username, "url": profile_url}
            record = EvidenceRecord(
                id=generate_evidence_id(
                    EvidenceSource.PYPI,
                    EvidenceKind.PYPI_USER,
                    profile_url,
                    str(value),
                    username,
                ),
                source=EvidenceSource.PYPI,
                locator=profile_url,
                kind=EvidenceKind.PYPI_USER,
                value=value,
                observed_at=datetime.datetime.now(datetime.timezone.utc),
                confidence=0.50,  # This is a strong signal
                notes=f"Package is published by PyPI user '{username}'.",
            )
            new_evidence.append(record)
            logger.debug(f"Created evidence record for PyPI user '{username}'.")
        except (IndexError, TypeError) as e:
            logger.warning(
                f"Could not parse username from profile URL '{profile_url}': {e}"
            )

    # --- Continue with existing cross-referencing logic ---
    if not profile_url:
        return []

    other_packages = _fetch_other_package_urls(profile_url)
    if not other_packages:
        return new_evidence

    # Limit to analyzing a few other packages to avoid excessive requests
    for other_pkg in list(other_packages)[:3]:
        if other_pkg == package_name:
            continue
        try:
            logger.info(f"Cross-referencing with related package: '{other_pkg}'")
            metadata = fetch_package_metadata(other_pkg)
            # We only care about strong signals (like repo URLs) from other packages
            evidence, _ = analyze_pypi_metadata(metadata)
            for record in evidence:
                if "repository URL" in record.notes:
                    new_evidence.append(record)
        except NoEvidenceError:
            logger.debug(f"Skipping related package '{other_pkg}', not found.")
            continue

    logger.info(
        f"Found {len(new_evidence)} new evidence records via user cross-reference."
    )
    return new_evidence