skip-trace 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skip_trace/__about__.py +13 -3
- skip_trace/__init__.py +0 -2
- skip_trace/analysis/content_scanner.py +189 -0
- skip_trace/analysis/evidence.py +1 -1
- skip_trace/analysis/scoring.py +46 -1
- skip_trace/analysis/source_scanner.py +1 -1
- skip_trace/cli.py +1 -1
- skip_trace/collectors/__init__.py +2 -2
- skip_trace/collectors/github_files.py +359 -0
- skip_trace/collectors/package_files.py +232 -41
- skip_trace/collectors/pypi.py +1 -1
- skip_trace/collectors/pypi_attestations.py +160 -0
- skip_trace/collectors/sigstore.py +160 -0
- skip_trace/collectors/urls.py +96 -0
- skip_trace/m.py +287 -0
- skip_trace/main.py +103 -85
- skip_trace/reporting/md_reporter.py +68 -4
- skip_trace/schemas.py +21 -0
- skip_trace/utils/http_client.py +18 -0
- {skip_trace-0.1.0.dist-info → skip_trace-0.1.1.dist-info}/METADATA +7 -3
- skip_trace-0.1.1.dist-info/RECORD +39 -0
- skip_trace-0.1.0.dist-info/RECORD +0 -33
- {skip_trace-0.1.0.dist-info → skip_trace-0.1.1.dist-info}/WHEEL +0 -0
- {skip_trace-0.1.0.dist-info → skip_trace-0.1.1.dist-info}/entry_points.txt +0 -0
- {skip_trace-0.1.0.dist-info → skip_trace-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,359 @@
|
|
1
|
+
# skip_trace/collectors/github_files.py
|
2
|
+
from __future__ import annotations
|
3
|
+
|
4
|
+
import datetime
|
5
|
+
import logging
|
6
|
+
import re
|
7
|
+
from typing import List, Optional, Set
|
8
|
+
from urllib.parse import urlparse
|
9
|
+
|
10
|
+
from github import GithubException
|
11
|
+
|
12
|
+
from ..analysis.evidence import generate_evidence_id
|
13
|
+
from ..schemas import EvidenceKind, EvidenceRecord, EvidenceSource
|
14
|
+
from ..utils import http_client
|
15
|
+
from .github import _create_records_from_user_profile, get_github_client
|
16
|
+
|
17
|
+
logger = logging.getLogger(__name__)
|
18
|
+
|
19
|
+
|
20
|
+
def _parse_repo_url(url: str) -> Optional[str]:
|
21
|
+
"""Parses a GitHub URL to extract the 'owner/repo' string."""
|
22
|
+
try:
|
23
|
+
parsed = urlparse(url)
|
24
|
+
if parsed.hostname and "github.com" in parsed.hostname:
|
25
|
+
path = parsed.path.strip("/")
|
26
|
+
if ".git" in path:
|
27
|
+
path = path.replace(".git", "")
|
28
|
+
if len(path.split("/")) >= 2:
|
29
|
+
return "/".join(path.split("/")[:2])
|
30
|
+
except Exception: # nosec
|
31
|
+
pass
|
32
|
+
logger.warning(f"Could not parse a valid GitHub repository from URL: {url}")
|
33
|
+
return None
|
34
|
+
|
35
|
+
|
36
|
+
def collect_security_policy(repo_url: str) -> List[EvidenceRecord]:
    """
    Fetches a SECURITY.md file from a GitHub repo and extracts contact emails.

    Each candidate path is requested on the ``main`` and then the ``master``
    branch through ``{repo_url}/raw/{branch}/{path}``. The first location that
    answers with HTTP 200 ends the search, even when it yields no usable
    email address.

    Args:
        repo_url: The full URL of the GitHub repository.

    Returns:
        A list of EvidenceRecord objects, one per distinct validated email
        found in the security policy. Empty when no policy file is found.
    """
    # Function-local import, as in the original code.
    # NOTE(review): presumably local to avoid an import cycle - confirm.
    from ..utils.validation import is_valid_email

    evidence: List[EvidenceRecord] = []
    now = datetime.datetime.now(datetime.timezone.utc)

    # Compiled once instead of per response. The TLD class is [A-Za-z]; the
    # earlier [A-Z|a-z] also matched a literal "|" inside the TLD.
    email_re = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")

    # Try common locations for security policy
    security_paths = [
        "SECURITY.md",
        ".github/SECURITY.md",
        "docs/SECURITY.md",
        "security.md",
        ".github/security.md",
    ]

    repo_url = repo_url.rstrip("/")

    for path in security_paths:
        # Try both main and master branches
        for branch in ["main", "master"]:
            raw_url = f"{repo_url}/raw/{branch}/{path}"
            response = http_client.make_request_safe(raw_url)

            if not (response and response.status_code == 200):
                continue

            content = response.text or ""
            logger.info(f"Found security policy at {raw_url}")

            # De-duplicate on the validated form returned by is_valid_email.
            seen_emails: Set[str] = set()
            for candidate in email_re.findall(content):
                valid_email = is_valid_email(candidate)
                if not valid_email or valid_email in seen_emails:
                    continue
                seen_emails.add(valid_email)

                value = {
                    "email": valid_email,
                    "context": "security contact",
                    "source_file": path,
                }
                evidence.append(
                    EvidenceRecord(
                        id=generate_evidence_id(
                            EvidenceSource.REPO,
                            EvidenceKind.CONTACT,
                            raw_url,
                            str(value),
                            valid_email,
                            hint="security",
                        ),
                        source=EvidenceSource.REPO,
                        locator=raw_url,
                        kind=EvidenceKind.CONTACT,
                        value=value,
                        observed_at=now,
                        confidence=0.85,
                        notes=f"Security contact email found in {path}.",
                    )
                )

            # Found a security file, no need to check other locations
            return evidence

    logger.debug(f"No security policy found for {repo_url}")
    return evidence
|
115
|
+
|
116
|
+
|
117
|
+
def _funding_entries(raw: object) -> List[str]:
    """Normalizes one FUNDING.yml value into a list of non-empty strings.

    A value may be a single scalar or a list of scalars. Entries that are not
    strings or integers (``None``, booleans, nested containers) are dropped.
    """
    candidates = raw if isinstance(raw, (list, tuple)) else [raw]
    entries: List[str] = []
    for item in candidates:
        # bool is a subclass of int; a YAML true/false is never a handle.
        if isinstance(item, bool) or not isinstance(item, (str, int)):
            continue
        text = str(item).strip()
        if text:
            entries.append(text)
    return entries


def _funding_record(
    kind: EvidenceKind,
    locator: str,
    value: dict,
    key: str,
    hint: str,
    confidence: float,
    notes: str,
    observed_at: datetime.datetime,
) -> EvidenceRecord:
    """Builds one REPO-sourced EvidenceRecord for a FUNDING.yml entry."""
    return EvidenceRecord(
        id=generate_evidence_id(
            EvidenceSource.REPO,
            kind,
            locator,
            str(value),
            key,
            hint=hint,
        ),
        source=EvidenceSource.REPO,
        locator=locator,
        kind=kind,
        value=value,
        observed_at=observed_at,
        confidence=confidence,
        notes=notes,
    )


def collect_funding_info(repo_url: str) -> List[EvidenceRecord]:
    """
    Parses .github/FUNDING.yml for sponsor/funding identities.

    The file is requested on the ``main`` and then the ``master`` branch. The
    first branch whose file parses into a YAML mapping ends the search. Each
    handle under ``github`` or one of the known platform keys becomes a
    CONTACT record, and each ``custom`` entry becomes a PROJECT_URL record.

    Args:
        repo_url: The full URL of the GitHub repository.

    Returns:
        A list of EvidenceRecord objects from funding configuration. Empty
        when no usable FUNDING.yml is found.
    """
    evidence: List[EvidenceRecord] = []
    now = datetime.datetime.now(datetime.timezone.utc)

    # Platform key in FUNDING.yml -> profile URL template.
    platform_configs = {
        "patreon": "https://www.patreon.com/{}",
        "ko_fi": "https://ko-fi.com/{}",
        "open_collective": "https://opencollective.com/{}",
        "tidelift": "https://tidelift.com/funding/github/{}",
        "community_bridge": "https://funding.communitybridge.org/projects/{}",
        "liberapay": "https://liberapay.com/{}",
        "issuehunt": "https://issuehunt.io/r/{}",
        "buy_me_a_coffee": "https://buymeacoffee.com/{}",
    }

    repo_url = repo_url.rstrip("/")

    # Try both main and master branches
    for branch in ["main", "master"]:
        funding_url = f"{repo_url}/raw/{branch}/.github/FUNDING.yml"
        response = http_client.make_request_safe(funding_url)

        if not (response and response.status_code == 200):
            continue

        logger.info(f"Found funding configuration at {funding_url}")

        try:
            import yaml

            data = yaml.safe_load(response.text)

            # safe_load yields None for an empty document and may yield a
            # list or scalar; only a mapping carries funding keys.
            if not isinstance(data, dict):
                logger.warning(
                    f"FUNDING.yml at {funding_url} is empty or not a mapping; skipping"
                )
                continue

            # GitHub sponsors
            for username in _funding_entries(data.get("github")):
                sponsor_value = {
                    "username": username,
                    "platform": "github_sponsors",
                    "url": f"https://github.com/sponsors/{username}",
                }
                evidence.append(
                    _funding_record(
                        EvidenceKind.CONTACT,
                        funding_url,
                        sponsor_value,
                        username,
                        "sponsor",
                        0.75,
                        f"GitHub Sponsors profile: {username}",
                        now,
                    )
                )

            # Other funding platforms
            for platform, url_template in platform_configs.items():
                for username in _funding_entries(data.get(platform)):
                    contact_value = {
                        "username": username,
                        "platform": platform,
                        "url": url_template.format(username),
                    }
                    evidence.append(
                        _funding_record(
                            EvidenceKind.CONTACT,
                            funding_url,
                            contact_value,
                            username,
                            platform,
                            0.70,
                            f"Funding platform {platform}: {username}",
                            now,
                        )
                    )

            # Custom URLs (often personal websites or donation pages)
            for url in _funding_entries(data.get("custom")):
                custom_value = {
                    "url": url,
                    "platform": "custom_funding",
                    "label": "Custom funding URL",
                }
                evidence.append(
                    _funding_record(
                        EvidenceKind.PROJECT_URL,
                        funding_url,
                        custom_value,
                        url,
                        "funding",
                        0.60,
                        f"Custom funding URL: {url}",
                        now,
                    )
                )

            # Found funding file, return
            return evidence

        except Exception as e:
            # Best effort: a broken file on one branch must not abort the scan.
            logger.warning(f"Failed to parse FUNDING.yml from {funding_url}: {e}")

    logger.debug(f"No funding configuration found for {repo_url}")
    return evidence
|
255
|
+
|
256
|
+
|
257
|
+
def collect_top_contributors(repo_url: str) -> List[EvidenceRecord]:
    """
    Collects profile evidence for a repository's contributors via the API.

    Contributors are read in the order the GitHub client returns them.
    Accounts whose ``type`` is ``"Bot"`` are skipped and do not count toward
    the limit of 10 processed profiles.

    Args:
        repo_url: The full URL of the GitHub repository.

    Returns:
        A list of EvidenceRecord objects from contributor profiles. Empty when
        the URL cannot be parsed or no GitHub client is available; possibly
        partial when the API fails part-way through.
    """
    records: List[EvidenceRecord] = []

    slug = _parse_repo_url(repo_url)
    if not slug:
        return []

    gh = get_github_client()
    if not gh:
        logger.warning("GitHub client not available, skipping contributor analysis")
        return []

    # Cap on processed profiles, to avoid excessive API usage.
    max_profiles = 10

    try:
        logger.info(f"Fetching contributors for {slug}")
        handled = 0
        for person in gh.get_repo(slug).get_contributors():
            if handled >= max_profiles:
                break
            if person.type != "Bot":
                # Profile extraction is shared with github.py.
                records.extend(_create_records_from_user_profile(person))
                handled += 1

        logger.info(f"Extracted evidence from {handled} contributors")

    except GithubException as err:
        logger.warning(
            f"GitHub API error for contributors of '{slug}': {err.status}"
        )
    except Exception as err:
        logger.error(
            f"Unexpected error fetching contributors for '{slug}': {err}"
        )

    return records
|
313
|
+
|
314
|
+
|
315
|
+
def collect_from_repo_url(repo_url: str) -> List[EvidenceRecord]:
    """
    Main entry point: runs every GitHub file collector for one repository.

    The collectors run in this order, each isolated from the others' failures:
    - collect_security_policy (SECURITY.md contacts)
    - collect_funding_info (FUNDING.yml sponsor information)
    - collect_top_contributors (contributor profiles)

    Args:
        repo_url: The full URL of the GitHub repository.

    Returns:
        A combined list of all EvidenceRecord objects found.
    """
    logger.info(f"Collecting evidence from GitHub files for {repo_url}")

    # (collector, label for the success log, label for the failure log)
    stages = (
        (collect_security_policy, "security policy", "security policy"),
        (collect_funding_info, "funding config", "funding info"),
        (collect_top_contributors, "contributors", "contributors"),
    )

    combined: List[EvidenceRecord] = []
    for collector, found_label, error_label in stages:
        try:
            batch = collector(repo_url)
            combined.extend(batch)
            logger.debug(f"Found {len(batch)} records from {found_label}")
        except Exception as err:
            # One failing collector must not prevent the others from running.
            logger.warning(f"Error collecting {error_label}: {err}")

    logger.info(f"Total evidence from GitHub files: {len(combined)} records")
    return combined
|