skip_trace-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,241 @@
+# skip_trace/collectors/github.py
+from __future__ import annotations
+
+import datetime
+import logging
+from typing import Dict, List, Optional, Set
+from urllib.parse import urlparse
+
+from bs4 import BeautifulSoup
+from github import Github, GithubException
+from github.NamedUser import NamedUser
+
+from ..analysis.evidence import generate_evidence_id
+from ..config import CONFIG
+from ..exceptions import CollectorError, NetworkError
+from ..schemas import EvidenceKind, EvidenceRecord, EvidenceSource
+from ..utils import http_client
+
+logger = logging.getLogger(__name__)
+
+_github_client: Optional[Github] = None
+
+
+def get_github_client() -> Optional[Github]:
+    """
+    Initializes and returns a singleton PyGithub client instance.
+
+    Uses the GITHUB_TOKEN from the config if available.
+
+    Returns:
+        An authenticated Github client if a token is configured, otherwise an unauthenticated (rate-limited) client.
+    """
+    global _github_client
+    if _github_client:
+        return _github_client
+
+    github_config = CONFIG.get("github", {})
+    api_key = github_config.get("api_key")
+
+    if not api_key:
+        logger.warning(
+            "GITHUB_TOKEN not found in environment. GitHub API requests will be unauthenticated and rate-limited."
+        )
+        _github_client = Github()
+    else:
+        logger.debug("Authenticating to GitHub API with token.")
+        _github_client = Github(api_key)
+
+    return _github_client
+
+
+def _parse_repo_url(url: str) -> Optional[str]:
+    """Parses a GitHub URL to extract the 'owner/repo' string."""
+    try:
+        parsed = urlparse(url)
+        if parsed.hostname and "github.com" in parsed.hostname:
+            path = parsed.path.strip("/")
+            if path.endswith(".git"):
+                path = path[: -len(".git")]
+            if len(path.split("/")) >= 2:
+                return "/".join(path.split("/")[:2])
+    except Exception:
+        pass  # nosec # noqa
+    logger.debug(f"Could not parse a valid GitHub repository from URL: {url}")
+    return None
+
+
+def _scrape_socials_from_html(html_url: str) -> Dict[str, str]:
+    """Scrapes a user's GitHub profile page for social media and blog links."""
+    contacts: Dict[str, str] = {}
+    try:
+        logger.debug(f"Scraping GitHub profile HTML page for social links: {html_url}")
+        response = http_client.make_request(html_url)
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        # Find all links within the user profile section.
+        # Note: GitHub's profile markup changes periodically, so these
+        # selectors are best-effort and may need updating.
+        profile_links = soup.select(
+            "div.user-profile-bio a[href], ul.vcard-details a[href]"
+        )
+        for link in profile_links:
+            href = link.get("href")
+            if not href:
+                continue
+
+            # Simple heuristic mapping of domain to platform name
+            if "linkedin.com/in" in href and "linkedin" not in contacts:
+                contacts["linkedin"] = href  # type: ignore[assignment]
+            elif (
+                "mastodon.social" in href or "fosstodon.org" in href
+            ) and "mastodon" not in contacts:
+                contacts["mastodon"] = href  # type: ignore[assignment]
+            elif "twitter.com" in href and "twitter" not in contacts:
+                # Prefer the twitter_username from the API, but take this if needed
+                contacts["twitter"] = href  # type: ignore[assignment]
+            elif ("blog." in href or "medium.com" in href) and "blog" not in contacts:
+                contacts["blog"] = href  # type: ignore[assignment]
+
+    except NetworkError as e:
+        logger.warning(f"Could not scrape GitHub profile page {html_url}: {e}")
+    except Exception as e:
+        logger.error(f"Error during social scraping for {html_url}: {e}")
+
+    return contacts
+
+
+def _create_records_from_user_profile(user: NamedUser) -> List[EvidenceRecord]:
+    """Creates evidence records from a full GitHub user profile."""
+    records = []
+    name = user.name or user.login
+    now = datetime.datetime.now(datetime.timezone.utc)
+
+    # Evidence for company affiliation
+    if user.company:
+        value: dict[str, str | None] = {"user_name": name, "company_name": user.company}
+        records.append(
+            EvidenceRecord(
+                id=generate_evidence_id(
+                    EvidenceSource.REPO,
+                    EvidenceKind.USER_COMPANY,
+                    user.html_url,
+                    str(value),
+                    name,
+                    hint="company",
+                ),
+                source=EvidenceSource.REPO,
+                locator=user.html_url,
+                kind=EvidenceKind.USER_COMPANY,
+                value=value,
+                observed_at=now,
+                confidence=0.8,
+                notes=f"User '{name}' lists company affiliation as '{user.company}'.",
+            )
+        )
+
+    # Evidence for other profile contacts
+    profile_contacts = {
+        "email": user.email,
+        "twitter": (
+            f"https://twitter.com/{user.twitter_username}"
+            if user.twitter_username
+            else None
+        ),
+        "blog": user.blog,
+    }
+
+    # Scrape HTML for links not available in the API
+    scraped_contacts = _scrape_socials_from_html(user.html_url)
+    # The scraped contacts take precedence if they exist
+    profile_contacts.update(scraped_contacts)
+
+    # Filter out empty values
+    profile_contacts = {k: v for k, v in profile_contacts.items() if v}
+    if profile_contacts:
+        profile_info = {"user_name": name, "contacts": profile_contacts}
+        records.append(
+            EvidenceRecord(
+                id=generate_evidence_id(
+                    EvidenceSource.REPO,
+                    EvidenceKind.USER_PROFILE,
+                    user.html_url,
+                    str(profile_info),  # TODO: stringify this better?
+                    name,
+                    hint="profile",
+                ),
+                source=EvidenceSource.REPO,
+                locator=user.html_url,
+                kind=EvidenceKind.USER_PROFILE,
+                value=profile_info,
+                observed_at=now,
+                confidence=0.9,
+                notes=f"Found contact details on GitHub user profile for '{name}'.",
+            )
+        )
+
+    return records
+
+
+def extract_from_repo_url(repo_url: str) -> List[EvidenceRecord]:
+    """
+    Extracts ownership evidence from a GitHub repository URL.
+
+    Args:
+        repo_url: The full URL of the GitHub repository.
+
+    Returns:
+        A list of EvidenceRecord objects.
+    """
+    evidence = []
+    processed_users: Set[str] = set()
+    repo_full_name = _parse_repo_url(repo_url)
+    if not repo_full_name:
+        return []
+
+    client = get_github_client()
+    if not client:
+        return []
+
+    try:
+        logger.debug(f"Fetching repository details for '{repo_full_name}'")
+        repo = client.get_repo(repo_full_name)
+
+        # 1. Process the repository owner's full profile
+        owner = repo.owner
+        if owner.login not in processed_users:
+            evidence.extend(_create_records_from_user_profile(owner))
+            processed_users.add(owner.login)
+
+        # 2. Process recent commit authors' full profiles
+        logger.debug(f"Fetching recent commits for '{repo_full_name}'")
+        commits = repo.get_commits()
+        # Limit the number of commits inspected to avoid excessive API usage
+        for i, commit in enumerate(commits):
+            if i >= 10:  # Stop after the 10 most recent commits to reduce API calls
+                break
+            # commit.author is a full NamedUser if available
+            if (
+                isinstance(commit.author, NamedUser)
+                and commit.author.login not in processed_users
+            ):
+                evidence.extend(_create_records_from_user_profile(commit.author))
+                processed_users.add(commit.author.login)
+
+    except GithubException as e:
+        logger.error(f"GitHub API error for '{repo_full_name}': {e.status} {e.data}")
+        raise CollectorError(
+            f"Could not access GitHub repository '{repo_full_name}'"
+        ) from e
+    except Exception as e:
+        logger.error(
+            f"An unexpected error occurred while processing GitHub repo '{repo_full_name}': {e}"
+        )
+        raise CollectorError(
+            f"Unexpected error for GitHub repo '{repo_full_name}'"
+        ) from e
+
+    logger.info(
+        f"Extracted {len(evidence)} evidence records from GitHub user profiles for repo '{repo_full_name}'."
+    )
+    return evidence
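
The module's public entry point is extract_from_repo_url(); the other functions are internal helpers. A minimal, illustrative sketch of driving it directly (the repository URL is a placeholder, and a CollectorError should be anticipated for private or deleted repositories):

# Illustrative driver only; not part of the package diff above.
from skip_trace.collectors import github
from skip_trace.exceptions import CollectorError

try:
    records = github.extract_from_repo_url("https://github.com/example/project")  # placeholder URL
    for record in records:
        # Each EvidenceRecord carries source, kind, locator, confidence, and notes.
        print(record.kind, record.confidence, record.notes)
except CollectorError as exc:
    print(f"GitHub collection failed: {exc}")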
@@ -0,0 +1,144 @@
+# skip_trace/collectors/package_files.py
+from __future__ import annotations
+
+import logging
+import os
+import shutil
+import tarfile
+import zipfile
+from typing import Any, Dict, List, Optional
+
+from ..analysis import source_scanner
+from ..exceptions import CollectorError, NetworkError
+from ..schemas import EvidenceRecord
+from ..utils import http_client
+from ..utils.safe_targz import safe_extract_auto
+
+logger = logging.getLogger(__name__)
+PACKAGE_DOWNLOAD_DIR = ".packages"
+
+
+def _ensure_download_dir():
+    """Ensures the package download directory and .gitignore exist."""
+    os.makedirs(PACKAGE_DOWNLOAD_DIR, exist_ok=True)
+    gitignore_path = os.path.join(PACKAGE_DOWNLOAD_DIR, ".gitignore")
+    if not os.path.exists(gitignore_path):
+        with open(gitignore_path, "w", encoding="utf-8") as f:
+            f.write("*\n")
+
+
+def _find_download_url(metadata: Dict[str, Any]) -> Optional[str]:
+    """Finds the best distribution URL from PyPI metadata."""
+    urls = metadata.get("urls", [])
+    if not urls:
+        return None
+
+    # Prioritize wheels, then sdists, then anything else
+    wheel_url = None
+    sdist_url = None
+    for url_info in urls:
+        packagetype = url_info.get("packagetype")
+        if packagetype == "bdist_wheel":
+            wheel_url = url_info.get("url")
+        elif packagetype == "sdist":
+            sdist_url = url_info.get("url")
+
+    # Return in order of preference: wheel, then sdist, then fallback
+    return wheel_url or sdist_url or (urls[0].get("url") if urls else None)
+
+
+def collect_from_package_files(metadata: Dict[str, Any]) -> List[EvidenceRecord]:
+    """
+    Downloads, extracts, and scans a package's files for evidence.
+
+    Args:
+        metadata: The PyPI JSON metadata for the package.
+
+    Returns:
+        A list of EvidenceRecord objects found within the package files.
+    """
+    info = metadata.get("info", {})
+    package_name = info.get("name", "unknown")
+    package_version = info.get("version", "latest")
+    logger.info(f"Starting file analysis for {package_name} v{package_version}")
+
+    download_url = _find_download_url(metadata)
+    if not download_url:
+        logger.warning(
+            f"No download URL found for {package_name}. Skipping file analysis."
+        )
+        return []
+
+    _ensure_download_dir()
+    filename = os.path.basename(download_url)
+    download_path = os.path.join(PACKAGE_DOWNLOAD_DIR, filename)
+
+    # Download the file if it doesn't already exist
+    if not os.path.exists(download_path):
+        logger.info(f"Downloading {filename} from {download_url}")
+        try:
+            with http_client.get_client().stream("GET", download_url) as response:
+                response.raise_for_status()
+                with open(download_path, "wb") as f:
+                    for chunk in response.iter_bytes():
+                        f.write(chunk)
+        except (
+            NetworkError,
+            http_client.httpx.RequestError,
+            http_client.httpx.HTTPStatusError,
+        ) as e:
+            raise CollectorError(f"Failed to download package {filename}: {e}") from e
+
+    # Determine the persistent extraction directory path from the filename
+    base_filename = filename
+    for ext in [".whl", ".zip", ".tar.gz", ".tgz", ".tar.bz2", ".tar.xz", ".tar"]:
+        if base_filename.endswith(ext):
+            base_filename = base_filename[: -len(ext)]
+            break
+    extract_dir = os.path.join(PACKAGE_DOWNLOAD_DIR, base_filename)
+
+    # Extract the archive ONLY if the destination directory doesn't already exist
+    if not os.path.exists(extract_dir):
+        logger.info(f"Extracting {download_path} to {extract_dir}")
+        os.makedirs(extract_dir, exist_ok=True)
+        try:
+            if download_path.endswith((".whl", ".zip")):
+                with zipfile.ZipFile(download_path, "r") as zf:  # nosec # noqa
+                    zf.extractall(extract_dir)  # nosec # noqa
+            elif download_path.endswith(
+                (".tar.gz", ".tgz", ".tar.bz2", ".tar.xz", ".tar")
+            ):
+                safe_extract_auto(download_path, extract_dir)
+            else:
+                logger.warning(
+                    f"Unsupported archive format for {filename}. Skipping file scan."
+                )
+                shutil.rmtree(extract_dir)  # Clean up the empty dir
+                return []
+        except (zipfile.BadZipFile, tarfile.TarError, PermissionError) as e:
+            logger.error(f"Failed to extract archive {download_path}: {e}")
+            # Clean up a potentially corrupted extraction on error
+            shutil.rmtree(extract_dir, ignore_errors=True)
+            return []
+    else:
+        logger.info(f"Using cached package files from {extract_dir}")
+
+    # Determine the actual directory to scan (handles sdists with a single top-level folder)
+    scan_target_dir = extract_dir
+    # This logic applies to sdists, which usually have a root folder. Wheels do not.
+    if filename.endswith((".tar.gz", ".tgz", ".tar.bz2", ".tar.xz", ".tar")):
+        try:
+            dir_contents = os.listdir(extract_dir)
+            if len(dir_contents) == 1 and os.path.isdir(
+                os.path.join(extract_dir, dir_contents[0])
+            ):
+                scan_target_dir = os.path.join(extract_dir, dir_contents[0])
+        except FileNotFoundError:
+            logger.error(
+                f"Extraction directory {extract_dir} not found after apparent success. Check permissions."
+            )
+            return []
+
+    locator_prefix = f"{package_name}-{package_version}"
+    evidence = source_scanner.scan_directory(scan_target_dir, locator_prefix)
+    return evidence
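
As a rough usage sketch, assuming the PyPI metadata has already been fetched (for example via fetch_package_metadata() from the pypi collector later in this diff), the file scanner can be run on its own; downloads and extractions are cached under the .packages/ directory:

# Illustrative only; assumes the collectors shipped in this wheel.
from skip_trace.collectors import package_files, pypi

metadata = pypi.fetch_package_metadata("example-package")  # placeholder name
records = package_files.collect_from_package_files(metadata)
print(f"Found {len(records)} evidence records in the distribution files")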
@@ -0,0 +1,157 @@
+# skip_trace/collectors/pypi.py
+from __future__ import annotations
+
+import datetime
+import logging
+from typing import Any, Dict, List, Optional, Set
+
+from bs4 import BeautifulSoup
+
+from ..analysis.evidence import extract_from_pypi as analyze_pypi_metadata
+from ..analysis.evidence import generate_evidence_id
+from ..exceptions import NetworkError, NoEvidenceError
+from ..schemas import EvidenceKind, EvidenceRecord, EvidenceSource
+from ..utils import http_client
+
+logger = logging.getLogger(__name__)
+PYPI_JSON_API_URL = "https://pypi.org/pypi"
+PYPI_PROJECT_URL = "https://pypi.org/project"
+
+
+def fetch_package_metadata(
+    package_name: str, version: Optional[str] = None
+) -> Dict[str, Any]:
+    """
+    Fetches package metadata from the PyPI JSON API.
+
+    :param package_name: The name of the package.
+    :param version: The optional specific version of the package.
+    :raises NoEvidenceError: If the package is not found (404).
+    :raises NetworkError: For other network or HTTP errors.
+    :return: A dictionary containing the package's JSON metadata.
+    """
+    if version:
+        url = f"{PYPI_JSON_API_URL}/{package_name}/{version}/json"
+    else:
+        url = f"{PYPI_JSON_API_URL}/{package_name}/json"
+
+    try:
+        response = http_client.make_request(url)
+        return response.json()
+    except NetworkError as e:
+        if "404" in str(e):
+            raise NoEvidenceError(
+                f"Package '{package_name}'"
+                f"{f' version {version}' if version else ''} not found on PyPI."
+            ) from e
+        raise
+
+
+def _scrape_user_profile_url(package_name: str) -> Optional[str]:
+    """Scrapes the PyPI project page to find the publisher's user profile URL."""
+    try:
+        url = f"{PYPI_PROJECT_URL}/{package_name}/"
+        logger.debug(f"Scraping project page for user link: {url}")
+        response = http_client.make_request(url)
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        # Look for the first link that points at a /user/<name>/ profile (shown in the project sidebar)
+        user_link = soup.find("a", href=lambda href: href and href.startswith("/user/"))
+        if user_link and user_link.has_attr("href"):
+            profile_url = f"https://pypi.org{user_link['href']}"
+            logger.debug(f"Found user profile URL: {profile_url}")
+            return profile_url
+    except NetworkError as e:
+        logger.warning(f"Could not scrape project page for '{package_name}': {e}")
+    return None
+
+
+def _fetch_other_package_urls(user_profile_url: str) -> Set[str]:
+    """Scrapes a user's profile page to find their other packages."""
+    packages = set()
+    try:
+        logger.debug(f"Scraping user profile for other packages: {user_profile_url}")
+        response = http_client.make_request(user_profile_url)
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        # Links to packages are in a 'package-snippet' class
+        for link in soup.find_all("a", class_="package-snippet"):
+            if link.has_attr("href") and link["href"].startswith("/project/"):  # type: ignore[union-attr]
+                packages.add(link["href"].split("/")[2])  # type: ignore[union-attr]
+        logger.debug(f"Found {len(packages)} other packages by user.")
+        return packages
+    except NetworkError as e:
+        logger.warning(f"Could not scrape user profile page '{user_profile_url}': {e}")
+        return packages
+
+
+def cross_reference_by_user(package_name: str) -> List[EvidenceRecord]:
+    """
+    Finds other packages by the same user to uncover more evidence.
+    Also creates an evidence record for the PyPI user itself.
+
+    Args:
+        package_name: The name of the starting package.
+
+    Returns:
+        A list of new EvidenceRecord objects found from related packages.
+    """
+    new_evidence: List[EvidenceRecord] = []
+    profile_url = _scrape_user_profile_url(package_name)
+
+    # Always create an evidence record for the PyPI user if a profile was found
+    if profile_url:
+        try:
+            username = profile_url.strip("/").rsplit("/", maxsplit=1)[-1]
+            value = {"name": username, "url": profile_url}
+            record = EvidenceRecord(
+                id=generate_evidence_id(
+                    EvidenceSource.PYPI,
+                    EvidenceKind.PYPI_USER,
+                    profile_url,
+                    str(value),
+                    username,
+                ),
+                source=EvidenceSource.PYPI,
+                locator=profile_url,
+                kind=EvidenceKind.PYPI_USER,
+                value=value,
+                observed_at=datetime.datetime.now(datetime.timezone.utc),
+                confidence=0.50,  # Useful signal, but not conclusive on its own
+                notes=f"Package is published by PyPI user '{username}'.",
+            )
+            new_evidence.append(record)
+            logger.debug(f"Created evidence record for PyPI user '{username}'.")
+        except (IndexError, TypeError) as e:
+            logger.warning(
+                f"Could not parse username from profile URL '{profile_url}': {e}"
+            )
+
+    # Cross-reference the user's other packages
+    if not profile_url:
+        return []
+
+    other_packages = _fetch_other_package_urls(profile_url)
+    if not other_packages:
+        return new_evidence
+
+    # Limit to analyzing a few other packages to avoid excessive requests
+    for other_pkg in list(other_packages)[:3]:
+        if other_pkg == package_name:
+            continue
+        try:
+            logger.info(f"Cross-referencing with related package: '{other_pkg}'")
+            metadata = fetch_package_metadata(other_pkg)
+            # We only care about strong signals (like repo URLs) from other packages
+            evidence, _ = analyze_pypi_metadata(metadata)
+            for record in evidence:
+                if "repository URL" in record.notes:
+                    new_evidence.append(record)
+        except NoEvidenceError:
+            logger.debug(f"Skipping related package '{other_pkg}', not found.")
+            continue
+
+    logger.info(
+        f"Found {len(new_evidence)} new evidence records via user cross-reference."
+    )
+    return new_evidence
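
The three collectors complement each other: pypi.py ties the package to a publisher account and sibling projects, package_files.py scans the distribution itself, and github.py expands a repository link into user-profile evidence. A rough, illustrative sketch of combining them follows (the package name and repository URL are placeholders; a real caller would take the repository URL from the evidence produced by the metadata analysis):

# Illustrative only; error handling and de-duplication omitted.
from skip_trace.collectors import github, package_files, pypi

package = "example-package"  # placeholder
metadata = pypi.fetch_package_metadata(package)

evidence = []
evidence += package_files.collect_from_package_files(metadata)  # scan the files
evidence += pypi.cross_reference_by_user(package)  # publisher and sibling packages
evidence += github.extract_from_repo_url("https://github.com/example/project")  # placeholder URL

print(f"Collected {len(evidence)} evidence records for {package}")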