ossuary-risk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,495 @@
1
+ """GitHub API collector - maintainer info, issues, sponsors status."""
2
+
3
import asyncio
import logging
import os
import re
import time
from dataclasses import dataclass, field
from typing import Any, Optional

import httpx

from ossuary.collectors.base import BaseCollector
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ @dataclass
18
+ class IssueData:
19
+ """Extracted issue/PR data."""
20
+
21
+ number: int
22
+ title: str
23
+ body: str
24
+ state: str
25
+ is_pull_request: bool
26
+ author_login: str
27
+ created_at: str
28
+ updated_at: str
29
+ closed_at: Optional[str]
30
+ comments: list[dict[str, Any]] = field(default_factory=list)
31
+
32
+
33
+ @dataclass
34
+ class GitHubData:
35
+ """Data collected from GitHub API."""
36
+
37
+ # Repository info
38
+ owner: str = ""
39
+ repo: str = ""
40
+ owner_type: str = "" # User or Organization
41
+
42
+ # Maintainer info
43
+ maintainer_username: str = ""
44
+ maintainer_public_repos: int = 0
45
+ maintainer_total_stars: int = 0
46
+ maintainer_account_created: str = "" # ISO date string
47
+ maintainer_repos: list[dict] = field(default_factory=list) # Full repo data for reputation
48
+ maintainer_sponsor_count: int = 0
49
+ maintainer_orgs: list[str] = field(default_factory=list)
50
+ is_tier1_maintainer: bool = False # Deprecated, use reputation scorer
51
+ has_github_sponsors: bool = False
52
+
53
+ # Organization info
54
+ is_org_owned: bool = False
55
+ org_admin_count: int = 0
56
+
57
+ # CII badge
58
+ cii_badge_level: str = "none"
59
+
60
+ # Issues and PRs
61
+ issues: list[IssueData] = field(default_factory=list)
62
+
63
+
64
+ class GitHubCollector(BaseCollector):
65
+ """Collector for GitHub API data."""
66
+
67
+ API_BASE = "https://api.github.com"
68
+ GRAPHQL_URL = "https://api.github.com/graphql"
69
+
70
+ # Rate limiting
71
+ REQUEST_DELAY = 0.5
72
+ RATE_LIMIT_PAUSE = 60
73
+
74
+ # Tier-1 thresholds
75
+ TIER1_REPOS = 500
76
+ TIER1_STARS = 100_000
77
+
78
+ def __init__(self, token: Optional[str] = None):
79
+ """
80
+ Initialize GitHub collector.
81
+
82
+ Args:
83
+ token: GitHub personal access token. Defaults to GITHUB_TOKEN env var.
84
+ """
85
+ self.token = token or os.getenv("GITHUB_TOKEN")
86
+ self.client = httpx.AsyncClient(timeout=30.0)
87
+
88
+ if self.token:
89
+ self.client.headers["Authorization"] = f"Bearer {self.token}"
90
+ self.client.headers["Accept"] = "application/vnd.github.v3+json"
91
+
92
+ def is_available(self) -> bool:
93
+ """Check if GitHub token is available."""
94
+ return bool(self.token)
95
+
96
+ @staticmethod
97
+ def parse_repo_url(repo_url: str) -> tuple[Optional[str], Optional[str]]:
98
+ """
99
+ Parse owner and repo from GitHub URL.
100
+
101
+ Args:
102
+ repo_url: GitHub repository URL
103
+
104
+ Returns:
105
+ Tuple of (owner, repo) or (None, None) if parsing fails
106
+ """
107
+ patterns = [
108
+ r"github\.com[:/]([^/]+)/([^/]+?)(?:\.git)?/?$",
109
+ r"github\.com[:/]([^/]+)/([^/]+)",
110
+ ]
111
+
112
+ for pattern in patterns:
113
+ match = re.search(pattern, repo_url)
114
+ if match:
115
+ return match.group(1), match.group(2).replace(".git", "")
116
+
117
+ return None, None
118
+
119
+ async def _request(self, method: str, url: str, **kwargs) -> Optional[dict]:
120
+ """Make a rate-limit-aware request."""
121
+ time.sleep(self.REQUEST_DELAY)
122
+
123
+ try:
124
+ response = await self.client.request(method, url, **kwargs)
125
+
126
+ # Check rate limit
127
+ remaining = int(response.headers.get("X-RateLimit-Remaining", 1))
128
+ if remaining == 0:
129
+ reset_time = int(response.headers.get("X-RateLimit-Reset", 0))
130
+ wait_time = max(reset_time - time.time(), self.RATE_LIMIT_PAUSE)
131
+ logger.warning(f"Rate limited. Waiting {wait_time:.0f} seconds...")
132
+ time.sleep(wait_time)
133
+ return await self._request(method, url, **kwargs)
134
+
135
+ if response.status_code == 404:
136
+ return None
137
+
138
+ response.raise_for_status()
139
+ return response.json()
140
+
141
+ except httpx.HTTPError as e:
142
+ logger.error(f"GitHub API error: {e}")
143
+ return None
144
+
145
+ async def _get(self, endpoint: str, params: Optional[dict] = None) -> Optional[dict]:
146
+ """GET request to GitHub REST API."""
147
+ url = f"{self.API_BASE}{endpoint}" if not endpoint.startswith("http") else endpoint
148
+ return await self._request("GET", url, params=params)
149
+
150
+ async def _graphql(self, query: str, variables: Optional[dict] = None) -> Optional[dict]:
151
+ """Execute GraphQL query."""
152
+ payload = {"query": query}
153
+ if variables:
154
+ payload["variables"] = variables
155
+
156
+ time.sleep(self.REQUEST_DELAY)
157
+
158
+ try:
159
+ response = await self.client.post(self.GRAPHQL_URL, json=payload)
160
+ response.raise_for_status()
161
+ data = response.json()
162
+
163
+ if "errors" in data:
164
+ logger.error(f"GraphQL errors: {data['errors']}")
165
+ return None
166
+
167
+ return data.get("data")
168
+
169
+ except httpx.HTTPError as e:
170
+ logger.error(f"GraphQL error: {e}")
171
+ return None
172
+
173
+ async def get_user(self, username: str) -> Optional[dict]:
174
+ """Get GitHub user profile."""
175
+ return await self._get(f"/users/{username}")
176
+
177
+ async def get_user_repos(self, username: str, max_pages: int = 10) -> list[dict]:
178
+ """Get all public repos for a user."""
179
+ repos = []
180
+ page = 1
181
+
182
+ while page <= max_pages:
183
+ data = await self._get(
184
+ f"/users/{username}/repos",
185
+ params={"per_page": 100, "page": page, "type": "owner"},
186
+ )
187
+
188
+ if not data:
189
+ break
190
+
191
+ repos.extend(data)
192
+
193
+ if len(data) < 100:
194
+ break
195
+
196
+ page += 1
197
+
198
+ return repos
199
+
200
+ async def get_maintainer_reputation(self, username: str) -> dict:
201
+ """Get maintainer reputation metrics."""
202
+ user = await self.get_user(username)
203
+ if not user:
204
+ return {"public_repos": 0, "total_stars": 0, "is_tier1": False}
205
+
206
+ public_repos = user.get("public_repos", 0)
207
+
208
+ # Calculate total stars
209
+ repos = await self.get_user_repos(username)
210
+ total_stars = sum(r.get("stargazers_count", 0) for r in repos)
211
+
212
+ is_tier1 = public_repos > self.TIER1_REPOS or total_stars > self.TIER1_STARS
213
+
214
+ return {
215
+ "public_repos": public_repos,
216
+ "total_stars": total_stars,
217
+ "is_tier1": is_tier1,
218
+ }
219
+
220
+ async def get_sponsors_status(self, username: str) -> bool:
221
+ """Check if user has GitHub Sponsors enabled."""
222
+ query = """
223
+ query($login: String!) {
224
+ user(login: $login) {
225
+ hasSponsorsListing
226
+ }
227
+ }
228
+ """
229
+
230
+ data = await self._graphql(query, {"login": username})
231
+ if not data:
232
+ return False
233
+
234
+ user = data.get("user", {})
235
+ return user.get("hasSponsorsListing", False)
236
+
237
+ async def get_sponsor_count(self, username: str) -> int:
238
+ """Get count of sponsors for a user."""
239
+ query = """
240
+ query($login: String!) {
241
+ user(login: $login) {
242
+ sponsors {
243
+ totalCount
244
+ }
245
+ }
246
+ }
247
+ """
248
+
249
+ data = await self._graphql(query, {"login": username})
250
+ if not data:
251
+ return 0
252
+
253
+ user = data.get("user", {})
254
+ sponsors = user.get("sponsors", {})
255
+ return sponsors.get("totalCount", 0)
256
+
257
+ async def get_user_orgs(self, username: str) -> list[str]:
258
+ """Get list of organizations a user belongs to."""
259
+ orgs_data = await self._get(f"/users/{username}/orgs")
260
+ if not orgs_data or not isinstance(orgs_data, list):
261
+ return []
262
+ return [org.get("login", "") for org in orgs_data if org.get("login")]
263
+
264
+ async def search_user_by_email(self, email: str) -> Optional[str]:
265
+ """
266
+ Search for GitHub username by email address.
267
+
268
+ Args:
269
+ email: Email address to search for
270
+
271
+ Returns:
272
+ GitHub username if found, None otherwise
273
+ """
274
+ if not email:
275
+ return None
276
+
277
+ # GitHub search API for users by email
278
+ result = await self._get("/search/users", params={"q": f"{email} in:email"})
279
+ if result and result.get("total_count", 0) > 0:
280
+ items = result.get("items", [])
281
+ if items:
282
+ return items[0].get("login")
283
+
284
+ return None
285
+
286
+ async def get_repo_contributors(self, owner: str, repo: str, limit: int = 10) -> list[dict]:
287
+ """
288
+ Get top contributors for a repository.
289
+
290
+ Args:
291
+ owner: Repository owner
292
+ repo: Repository name
293
+ limit: Maximum number of contributors to return
294
+
295
+ Returns:
296
+ List of contributor dicts with login, contributions count
297
+ """
298
+ contributors = await self._get(
299
+ f"/repos/{owner}/{repo}/contributors",
300
+ params={"per_page": limit}
301
+ )
302
+ if not contributors or not isinstance(contributors, list):
303
+ return []
304
+ return contributors
305
+
306
+ async def get_repo_info(self, owner: str, repo: str) -> Optional[dict]:
307
+ """Get repository information."""
308
+ return await self._get(f"/repos/{owner}/{repo}")
309
+
310
+ async def get_org_admins(self, owner: str, repo: str) -> dict:
311
+ """Check if repo is org-owned and estimate admin count."""
312
+ repo_data = await self.get_repo_info(owner, repo)
313
+ if not repo_data:
314
+ return {"is_org": False, "admin_count": 0}
315
+
316
+ owner_type = repo_data.get("owner", {}).get("type")
317
+
318
+ if owner_type != "Organization":
319
+ return {"is_org": False, "admin_count": 0}
320
+
321
+ # Try to get org members (may require permissions)
322
+ members = await self._get(f"/orgs/{owner}/members", params={"role": "admin"})
323
+ admin_count = len(members) if isinstance(members, list) else 1
324
+
325
+ return {"is_org": True, "admin_count": max(admin_count, 1)}
326
+
327
+ async def get_issues(
328
+ self,
329
+ owner: str,
330
+ repo: str,
331
+ state: str = "all",
332
+ per_page: int = 100,
333
+ include_comments: bool = True,
334
+ ) -> list[IssueData]:
335
+ """
336
+ Get issues and PRs from a repository.
337
+
338
+ Args:
339
+ owner: Repository owner
340
+ repo: Repository name
341
+ state: Issue state filter (all, open, closed)
342
+ per_page: Number of issues per page
343
+ include_comments: Whether to fetch comments for each issue
344
+
345
+ Returns:
346
+ List of IssueData objects
347
+ """
348
+ issues_data = await self._get(
349
+ f"/repos/{owner}/{repo}/issues",
350
+ params={"state": state, "per_page": per_page, "sort": "updated"},
351
+ )
352
+
353
+ if not issues_data:
354
+ return []
355
+
356
+ issues = []
357
+ for issue in issues_data:
358
+ issue_obj = IssueData(
359
+ number=issue.get("number"),
360
+ title=issue.get("title", ""),
361
+ body=issue.get("body", "") or "",
362
+ state=issue.get("state", ""),
363
+ is_pull_request="pull_request" in issue,
364
+ author_login=issue.get("user", {}).get("login", ""),
365
+ created_at=issue.get("created_at", ""),
366
+ updated_at=issue.get("updated_at", ""),
367
+ closed_at=issue.get("closed_at"),
368
+ )
369
+
370
+ # Fetch comments if requested
371
+ if include_comments and issue.get("comments", 0) > 0:
372
+ comments = await self._get(f"/repos/{owner}/{repo}/issues/{issue['number']}/comments")
373
+ if comments:
374
+ issue_obj.comments = [
375
+ {
376
+ "id": c.get("id"),
377
+ "author": c.get("user", {}).get("login", ""),
378
+ "body": c.get("body", ""),
379
+ "created_at": c.get("created_at", ""),
380
+ }
381
+ for c in comments
382
+ ]
383
+
384
+ issues.append(issue_obj)
385
+
386
+ return issues
387
+
388
+ async def collect(
389
+ self,
390
+ repo_url: str,
391
+ top_contributor_username: str = None,
392
+ top_contributor_email: str = None,
393
+ ) -> GitHubData:
394
+ """
395
+ Collect all GitHub data for a repository.
396
+
397
+ Args:
398
+ repo_url: GitHub repository URL
399
+ top_contributor_username: Override maintainer username (e.g., from git history)
400
+ top_contributor_email: Top contributor's email for GitHub lookup
401
+
402
+ Returns:
403
+ GitHubData with all collected information
404
+ """
405
+ owner, repo = self.parse_repo_url(repo_url)
406
+ if not owner or not repo:
407
+ logger.error(f"Could not parse repository URL: {repo_url}")
408
+ return GitHubData()
409
+
410
+ data = GitHubData(owner=owner, repo=repo)
411
+
412
+ # Get repo info
413
+ repo_info = await self.get_repo_info(owner, repo)
414
+ if repo_info:
415
+ data.owner_type = repo_info.get("owner", {}).get("type", "")
416
+
417
+ # Determine the actual maintainer username
418
+ maintainer_username = None
419
+
420
+ # Priority 1: Provided username
421
+ if top_contributor_username:
422
+ maintainer_username = top_contributor_username
423
+ logger.info(f"Using provided top contributor: {maintainer_username}")
424
+
425
+ # Priority 2: For orgs, get top contributor from GitHub API
426
+ if not maintainer_username and data.owner_type == "Organization":
427
+ logger.info(f"Repo is org-owned, finding top contributor...")
428
+ contributors = await self.get_repo_contributors(owner, repo, limit=1)
429
+ if contributors:
430
+ maintainer_username = contributors[0].get("login")
431
+ logger.info(f"Top contributor from GitHub API: {maintainer_username}")
432
+
433
+ # Priority 3: Search by email
434
+ if not maintainer_username and top_contributor_email:
435
+ logger.info(f"Searching GitHub for user with email: {top_contributor_email}")
436
+ maintainer_username = await self.search_user_by_email(top_contributor_email)
437
+ if maintainer_username:
438
+ logger.info(f"Found user by email: {maintainer_username}")
439
+
440
+ # Priority 4: Fall back to repo owner (if it's a user, not org)
441
+ if not maintainer_username:
442
+ if data.owner_type != "Organization":
443
+ maintainer_username = owner
444
+ logger.info(f"Using repo owner as maintainer: {maintainer_username}")
445
+ else:
446
+ # Last resort for orgs: try to get from repo info
447
+ maintainer_username = repo_info.get("owner", {}).get("login", owner) if repo_info else owner
448
+ logger.warning(f"Could not determine maintainer for org repo, using: {maintainer_username}")
449
+
450
+ data.maintainer_username = maintainer_username
451
+ logger.info(f"Final maintainer: {data.maintainer_username}")
452
+
453
+ # Get user profile (for account age)
454
+ user_profile = await self.get_user(data.maintainer_username)
455
+ if user_profile:
456
+ data.maintainer_account_created = user_profile.get("created_at", "")
457
+ data.maintainer_public_repos = user_profile.get("public_repos", 0)
458
+
459
+ # Get full repo list for reputation scoring
460
+ logger.info(f"Fetching repos for {data.maintainer_username}...")
461
+ repos = await self.get_user_repos(data.maintainer_username)
462
+ data.maintainer_repos = repos
463
+ data.maintainer_total_stars = sum(r.get("stargazers_count", 0) for r in repos)
464
+
465
+ # Check sponsors status and count
466
+ logger.info(f"Checking sponsors for {data.maintainer_username}...")
467
+ data.has_github_sponsors = await self.get_sponsors_status(data.maintainer_username)
468
+ if data.has_github_sponsors:
469
+ data.maintainer_sponsor_count = await self.get_sponsor_count(data.maintainer_username)
470
+
471
+ # Get user's organizations
472
+ logger.info(f"Fetching orgs for {data.maintainer_username}...")
473
+ data.maintainer_orgs = await self.get_user_orgs(data.maintainer_username)
474
+
475
+ # Legacy tier-1 check (deprecated, use ReputationScorer instead)
476
+ data.is_tier1_maintainer = (
477
+ data.maintainer_public_repos > self.TIER1_REPOS
478
+ or data.maintainer_total_stars > self.TIER1_STARS
479
+ )
480
+
481
+ # Check organization ownership of repo
482
+ logger.info(f"Checking organization status for {owner}/{repo}...")
483
+ org_info = await self.get_org_admins(owner, repo)
484
+ data.is_org_owned = org_info["is_org"]
485
+ data.org_admin_count = org_info["admin_count"]
486
+
487
+ # Get issues
488
+ logger.info(f"Fetching issues for {owner}/{repo}...")
489
+ data.issues = await self.get_issues(owner, repo)
490
+
491
+ return data
492
+
493
+ async def close(self):
494
+ """Close the HTTP client."""
495
+ await self.client.aclose()
@@ -0,0 +1,113 @@
1
+ """npm registry collector."""
2
+
3
+ import logging
4
+ from dataclasses import dataclass
5
+ from typing import Optional
6
+
7
+ import httpx
8
+
9
+ from ossuary.collectors.base import BaseCollector
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ @dataclass
15
+ class NpmData:
16
+ """Data collected from npm registry."""
17
+
18
+ name: str = ""
19
+ version: str = ""
20
+ description: str = ""
21
+ homepage: str = ""
22
+ repository_url: str = ""
23
+ weekly_downloads: int = 0
24
+ maintainers: list[str] = None
25
+
26
+ def __post_init__(self):
27
+ if self.maintainers is None:
28
+ self.maintainers = []
29
+
30
+
31
+ class NpmCollector(BaseCollector):
32
+ """Collector for npm registry data."""
33
+
34
+ REGISTRY_URL = "https://registry.npmjs.org"
35
+ DOWNLOADS_URL = "https://api.npmjs.org/downloads"
36
+
37
+ def __init__(self):
38
+ """Initialize npm collector."""
39
+ self.client = httpx.AsyncClient(timeout=30.0)
40
+
41
+ def is_available(self) -> bool:
42
+ """npm collector is always available."""
43
+ return True
44
+
45
+ async def get_package_info(self, package_name: str) -> Optional[dict]:
46
+ """Get package metadata from npm registry."""
47
+ try:
48
+ response = await self.client.get(f"{self.REGISTRY_URL}/{package_name}")
49
+ if response.status_code == 200:
50
+ return response.json()
51
+ except httpx.HTTPError as e:
52
+ logger.error(f"npm registry error: {e}")
53
+ return None
54
+
55
+ async def get_weekly_downloads(self, package_name: str) -> int:
56
+ """Get weekly download count for a package."""
57
+ try:
58
+ response = await self.client.get(f"{self.DOWNLOADS_URL}/point/last-week/{package_name}")
59
+ if response.status_code == 200:
60
+ return response.json().get("downloads", 0)
61
+ except httpx.HTTPError as e:
62
+ logger.error(f"npm downloads API error: {e}")
63
+ return 0
64
+
65
+ async def collect(self, package_name: str) -> NpmData:
66
+ """
67
+ Collect npm package data.
68
+
69
+ Args:
70
+ package_name: npm package name
71
+
72
+ Returns:
73
+ NpmData with package information
74
+ """
75
+ data = NpmData(name=package_name)
76
+
77
+ # Get package metadata
78
+ pkg_info = await self.get_package_info(package_name)
79
+ if pkg_info:
80
+ latest = pkg_info.get("dist-tags", {}).get("latest", "")
81
+ data.version = latest
82
+ data.description = pkg_info.get("description", "")
83
+ data.homepage = pkg_info.get("homepage", "")
84
+
85
+ # Get repository URL
86
+ repo = pkg_info.get("repository", {})
87
+ if isinstance(repo, dict):
88
+ data.repository_url = repo.get("url", "")
89
+ elif isinstance(repo, str):
90
+ data.repository_url = repo
91
+
92
+ # Clean up repository URL
93
+ if data.repository_url:
94
+ data.repository_url = (
95
+ data.repository_url.replace("git+", "")
96
+ .replace("git://", "https://")
97
+ .replace(".git", "")
98
+ )
99
+ if data.repository_url.startswith("ssh://"):
100
+ data.repository_url = data.repository_url.replace("ssh://git@", "https://")
101
+
102
+ # Get maintainers
103
+ maintainers = pkg_info.get("maintainers", [])
104
+ data.maintainers = [m.get("name", "") for m in maintainers if isinstance(m, dict)]
105
+
106
+ # Get download stats
107
+ data.weekly_downloads = await self.get_weekly_downloads(package_name)
108
+
109
+ return data
110
+
111
+ async def close(self):
112
+ """Close the HTTP client."""
113
+ await self.client.aclose()