skip-trace 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,359 @@
1
+ # skip_trace/collectors/github_files.py
2
+ from __future__ import annotations
3
+
4
+ import datetime
5
+ import logging
6
+ import re
7
+ from typing import List, Optional, Set
8
+ from urllib.parse import urlparse
9
+
10
+ from github import GithubException
11
+
12
+ from ..analysis.evidence import generate_evidence_id
13
+ from ..schemas import EvidenceKind, EvidenceRecord, EvidenceSource
14
+ from ..utils import http_client
15
+ from .github import _create_records_from_user_profile, get_github_client
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ def _parse_repo_url(url: str) -> Optional[str]:
21
+ """Parses a GitHub URL to extract the 'owner/repo' string."""
22
+ try:
23
+ parsed = urlparse(url)
24
+ if parsed.hostname and "github.com" in parsed.hostname:
25
+ path = parsed.path.strip("/")
26
+ if ".git" in path:
27
+ path = path.replace(".git", "")
28
+ if len(path.split("/")) >= 2:
29
+ return "/".join(path.split("/")[:2])
30
+ except Exception: # nosec
31
+ pass
32
+ logger.warning(f"Could not parse a valid GitHub repository from URL: {url}")
33
+ return None
34
+
35
+
36
def collect_security_policy(repo_url: str) -> List[EvidenceRecord]:
    """
    Fetches and parses SECURITY.md from a GitHub repo.

    Probes a fixed list of conventional policy locations on the ``main`` and
    ``master`` branches. The first file that is served with HTTP 200 is
    scanned for email addresses and the search stops there, even if that
    file yields no addresses.

    Args:
        repo_url: The full URL of the GitHub repository.

    Returns:
        A list of EvidenceRecord objects, one per distinct validated email
        found in the security policy. Empty when no policy file is found.
    """
    # Kept as a function-scope import (as in the original code), but hoisted
    # out of the per-email loop so it runs once per call.
    from ..utils.validation import is_valid_email

    evidence: List[EvidenceRecord] = []
    now = datetime.datetime.now(datetime.timezone.utc)

    # Candidate matcher only; is_valid_email makes the final decision.
    # The TLD class is [A-Za-z]: writing "[A-Z|a-z]" would admit a literal
    # "|" and glue pipe-separated addresses together.
    email_re = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")

    # Try common locations for security policy
    security_paths = [
        "SECURITY.md",
        ".github/SECURITY.md",
        "docs/SECURITY.md",
        "security.md",
        ".github/security.md",
    ]

    repo_url = repo_url.rstrip("/")

    for path in security_paths:
        # Try both main and master branches
        for branch in ["main", "master"]:
            raw_url = f"{repo_url}/raw/{branch}/{path}"
            response = http_client.make_request_safe(raw_url)

            if not response or response.status_code != 200:
                continue

            content = response.text
            logger.info(f"Found security policy at {raw_url}")

            # Track validated addresses so each one is reported once per file.
            seen_emails: Set[str] = set()
            for email in email_re.findall(content):
                valid_email = is_valid_email(email)
                if not valid_email or valid_email in seen_emails:
                    continue
                seen_emails.add(valid_email)

                value = {
                    "email": valid_email,
                    "context": "security contact",
                    "source_file": path,
                }
                evidence.append(
                    EvidenceRecord(
                        id=generate_evidence_id(
                            EvidenceSource.REPO,
                            EvidenceKind.CONTACT,
                            raw_url,
                            str(value),
                            valid_email,
                            hint="security",
                        ),
                        source=EvidenceSource.REPO,
                        locator=raw_url,
                        kind=EvidenceKind.CONTACT,
                        value=value,
                        observed_at=now,
                        confidence=0.85,
                        notes=f"Security contact email found in {path}.",
                    )
                )

            # Found a security file, no need to check other locations
            return evidence

    logger.debug(f"No security policy found for {repo_url}")
    return evidence
115
+
116
+
117
def _funding_entries(raw: object) -> List[str]:
    """Normalise one FUNDING.yml value into a list of non-empty strings.

    Accepts a single scalar or a list of scalars. Integers are converted to
    strings; ``None``, booleans, blank strings and nested containers are
    dropped.
    """
    candidates = raw if isinstance(raw, (list, tuple)) else [raw]
    entries: List[str] = []
    for item in candidates:
        if isinstance(item, int) and not isinstance(item, bool):
            item = str(item)
        if isinstance(item, str) and item.strip():
            entries.append(item.strip())
    return entries


def _funding_record(
    *,
    kind: EvidenceKind,
    locator: str,
    value: dict,
    key: str,
    hint: str,
    confidence: float,
    notes: str,
    observed_at: datetime.datetime,
) -> EvidenceRecord:
    """Build one REPO-sourced EvidenceRecord for a FUNDING.yml entry."""
    return EvidenceRecord(
        id=generate_evidence_id(
            EvidenceSource.REPO,
            kind,
            locator,
            str(value),
            key,
            hint=hint,
        ),
        source=EvidenceSource.REPO,
        locator=locator,
        kind=kind,
        value=value,
        observed_at=observed_at,
        confidence=confidence,
        notes=notes,
    )


def collect_funding_info(repo_url: str) -> List[EvidenceRecord]:
    """
    Parses .github/FUNDING.yml for sponsor/funding identities.

    The file is looked up on the ``main`` and then the ``master`` branch.
    The first branch whose file is fetched and processed without error
    determines the result. A file that cannot be parsed, or whose top level
    is not a mapping, is logged and the next branch is tried.

    Args:
        repo_url: The full URL of the GitHub repository.

    Returns:
        A list of EvidenceRecord objects from funding configuration:
        CONTACT records for GitHub Sponsors and the known funding platforms,
        PROJECT_URL records for ``custom`` links.
    """
    evidence: List[EvidenceRecord] = []
    now = datetime.datetime.now(datetime.timezone.utc)

    repo_url = repo_url.rstrip("/")

    # FUNDING.yml key -> profile URL template for that platform.
    platform_configs = {
        "patreon": "https://www.patreon.com/{}",
        "ko_fi": "https://ko-fi.com/{}",
        "open_collective": "https://opencollective.com/{}",
        "tidelift": "https://tidelift.com/funding/github/{}",
        "community_bridge": "https://funding.communitybridge.org/projects/{}",
        "liberapay": "https://liberapay.com/{}",
        "issuehunt": "https://issuehunt.io/r/{}",
        "buy_me_a_coffee": "https://buymeacoffee.com/{}",
    }

    # Try both main and master branches
    for branch in ["main", "master"]:
        funding_url = f"{repo_url}/raw/{branch}/.github/FUNDING.yml"
        response = http_client.make_request_safe(funding_url)

        if not response or response.status_code != 200:
            continue

        logger.info(f"Found funding configuration at {funding_url}")

        try:
            import yaml

            data = yaml.safe_load(response.text)

            # An empty file loads as None and a malformed one may load as a
            # list or scalar; none of those support ``.get``.
            if not isinstance(data, dict):
                logger.warning(
                    f"Failed to parse FUNDING.yml from {funding_url}: "
                    f"expected a mapping, got {type(data).__name__}"
                )
                continue

            # GitHub sponsors
            for username in _funding_entries(data.get("github")):
                value = {
                    "username": username,
                    "platform": "github_sponsors",
                    "url": f"https://github.com/sponsors/{username}",
                }
                evidence.append(
                    _funding_record(
                        kind=EvidenceKind.CONTACT,
                        locator=funding_url,
                        value=value,
                        key=username,
                        hint="sponsor",
                        confidence=0.75,
                        notes=f"GitHub Sponsors profile: {username}",
                        observed_at=now,
                    )
                )

            # Other funding platforms
            for platform, url_template in platform_configs.items():
                for username in _funding_entries(data.get(platform)):
                    contact_value = {
                        "username": username,
                        "platform": platform,
                        "url": url_template.format(username),
                    }
                    evidence.append(
                        _funding_record(
                            kind=EvidenceKind.CONTACT,
                            locator=funding_url,
                            value=contact_value,
                            key=username,
                            hint=platform,
                            confidence=0.70,
                            notes=f"Funding platform {platform}: {username}",
                            observed_at=now,
                        )
                    )

            # Custom URLs (often personal websites or donation pages)
            for url in _funding_entries(data.get("custom")):
                value = {
                    "url": url,
                    "platform": "custom_funding",
                    "label": "Custom funding URL",
                }
                evidence.append(
                    _funding_record(
                        kind=EvidenceKind.PROJECT_URL,
                        locator=funding_url,
                        value=value,
                        key=url,
                        hint="funding",
                        confidence=0.60,
                        notes=f"Custom funding URL: {url}",
                        observed_at=now,
                    )
                )

            # Found funding file, return
            return evidence

        except Exception as e:
            # Best-effort collector: log and fall through to the next branch.
            logger.warning(f"Failed to parse FUNDING.yml from {funding_url}: {e}")

    logger.debug(f"No funding configuration found for {repo_url}")
    return evidence
255
+
256
+
257
def collect_top_contributors(repo_url: str) -> List[EvidenceRecord]:
    """
    Gather profile evidence for a repository's leading contributors.

    The repository is resolved through the GitHub API client and each
    contributor whose type is not "Bot" is handed to the shared profile
    extraction helper. Processing stops once ten contributors have been
    handled, to keep API usage bounded.

    Args:
        repo_url: The full URL of the GitHub repository.

    Returns:
        Evidence records derived from contributor profiles. An empty list is
        returned when the URL cannot be parsed or no GitHub client is
        available; records gathered before an API error are still returned.
    """
    records: List[EvidenceRecord] = []

    slug = _parse_repo_url(repo_url)
    if not slug:
        return []

    gh = get_github_client()
    if not gh:
        logger.warning("GitHub client not available, skipping contributor analysis")
        return []

    try:
        logger.info(f"Fetching contributors for {slug}")
        repository = gh.get_repo(slug)
        listing = repository.get_contributors()

        handled = 0
        for account in listing:
            # Cap at ten processed (non-bot) contributors.
            if handled >= 10:
                break

            # Bots and automated accounts are skipped and do not count
            # towards the cap.
            if account.type != "Bot":
                # Profile extraction is delegated to the helper from github.py.
                records.extend(_create_records_from_user_profile(account))
                handled += 1

        logger.info(f"Extracted evidence from {handled} contributors")

    except GithubException as api_err:
        logger.warning(
            f"GitHub API error for contributors of '{slug}': {api_err.status}"
        )
    except Exception as err:
        logger.error(
            f"Unexpected error fetching contributors for '{slug}': {err}"
        )

    return records
313
+
314
+
315
def collect_from_repo_url(repo_url: str) -> List[EvidenceRecord]:
    """
    Main entry point: run every GitHub file collector for one repository.

    The collectors run in a fixed order:
    - SECURITY.md files (security contacts)
    - FUNDING.yml (funding/sponsor information)
    - Contributors API (contributor profiles)

    Each collector is isolated: if one raises, the error is logged as a
    warning and the remaining collectors still run.

    Args:
        repo_url: The full URL of the GitHub repository.

    Returns:
        A combined list of all EvidenceRecord objects found.
    """
    combined: List[EvidenceRecord] = []

    logger.info(f"Collecting evidence from GitHub files for {repo_url}")

    # (collector, label for the success log, label for the failure log)
    stages = (
        (collect_security_policy, "security policy", "security policy"),
        (collect_funding_info, "funding config", "funding info"),
        (collect_top_contributors, "contributors", "contributors"),
    )

    for collector, found_label, error_label in stages:
        try:
            found = collector(repo_url)
            combined.extend(found)
            logger.debug(f"Found {len(found)} records from {found_label}")
        except Exception as exc:
            logger.warning(f"Error collecting {error_label}: {exc}")

    logger.info(f"Total evidence from GitHub files: {len(combined)} records")
    return combined