ossuary-risk 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ossuary/__init__.py +7 -0
- ossuary/api/__init__.py +1 -0
- ossuary/api/main.py +173 -0
- ossuary/cli.py +309 -0
- ossuary/collectors/__init__.py +8 -0
- ossuary/collectors/base.py +26 -0
- ossuary/collectors/git.py +231 -0
- ossuary/collectors/github.py +495 -0
- ossuary/collectors/npm.py +113 -0
- ossuary/collectors/pypi.py +118 -0
- ossuary/db/__init__.py +15 -0
- ossuary/db/models.py +197 -0
- ossuary/db/session.py +49 -0
- ossuary/scoring/__init__.py +16 -0
- ossuary/scoring/engine.py +318 -0
- ossuary/scoring/factors.py +175 -0
- ossuary/scoring/reputation.py +326 -0
- ossuary/sentiment/__init__.py +5 -0
- ossuary/sentiment/analyzer.py +232 -0
- ossuary_risk-0.1.0.dist-info/METADATA +241 -0
- ossuary_risk-0.1.0.dist-info/RECORD +23 -0
- ossuary_risk-0.1.0.dist-info/WHEEL +4 -0
- ossuary_risk-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
"""Git repository collector - extracts commit history and metadata."""
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
import shutil
|
|
7
|
+
import tempfile
|
|
8
|
+
from collections import defaultdict
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from datetime import datetime, timedelta
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
from git import Repo
|
|
15
|
+
from git.exc import GitCommandError, InvalidGitRepositoryError
|
|
16
|
+
|
|
17
|
+
from ossuary.collectors.base import BaseCollector
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class CommitData:
    """A single commit's identity, authorship and message.

    Instances are plain value records; field order is part of the
    positional constructor signature and must not be rearranged.
    """

    # Full hexadecimal commit hash.
    sha: str
    # Person who originally wrote the change.
    author_name: str
    author_email: str
    # When the change was authored.
    authored_date: datetime
    # Person who applied the change to the repository (may differ from
    # the author, e.g. after a rebase or when applying a patch).
    committer_name: str
    committer_email: str
    # When the change was committed.
    committed_date: datetime
    # Complete commit message.
    message: str
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class GitMetrics:
    """Aggregate statistics derived from a repository's commit history.

    Every field has a default, so ``GitMetrics()`` represents "no data".
    The per-field notes below describe the values as they are filled in
    by ``GitCollector.calculate_metrics``.
    """

    # Number of commits that were analysed in total.
    total_commits: int = 0
    # Number of commits authored in the 365 days up to the cutoff.
    commits_last_year: int = 0
    # Distinct (lower-cased) author emails within that 365-day window.
    unique_contributors: int = 0
    # Share of the window's commits made by the top contributor, as a
    # percentage (0-100).
    maintainer_concentration: float = 0.0
    # Identity and commit count of the most active author in the window.
    top_contributor_email: str = ""
    top_contributor_name: str = ""
    top_contributor_commits: int = 0
    # Authored dates of the newest and oldest analysed commits.
    last_commit_date: Optional[datetime] = None
    first_commit_date: Optional[datetime] = None
    # Commits from the 365-day window. ``None`` is only a constructor
    # sentinel (a list literal cannot be used as a dataclass default);
    # ``__post_init__`` swaps it for a fresh empty list.
    commits: list[CommitData] = None

    def __post_init__(self):
        """Give each instance its own empty ``commits`` list when none was passed."""
        self.commits = [] if self.commits is None else self.commits
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class GitCollector(BaseCollector):
    """Collector that clones git repositories and derives commit metrics.

    Repositories are cached on disk below ``repos_path`` so that repeated
    collections only need a fetch instead of a full clone.
    """

    def __init__(self, repos_path: Optional[str] = None):
        """
        Initialize the git collector.

        Args:
            repos_path: Path to store cloned repositories. When omitted, the
                ``REPOS_PATH`` environment variable is used, falling back
                to ``./repos``. The directory is created if missing.
        """
        self.repos_path = Path(repos_path or os.getenv("REPOS_PATH", "./repos"))
        self.repos_path.mkdir(parents=True, exist_ok=True)

    def is_available(self) -> bool:
        """Git collector is always available."""
        return True

    def _get_repo_path(self, repo_url: str) -> Path:
        """
        Get local path for a repository.

        Args:
            repo_url: Git repository URL

        Returns:
            ``<repos_path>/<repo name>_<12 hex chars of the URL's MD5>``
        """
        # The digest is only a stable cache key, not a security measure;
        # saying so keeps this working on FIPS-restricted Python builds.
        url_hash = hashlib.md5(repo_url.encode(), usedforsecurity=False).hexdigest()[:12]
        # Readable prefix: last URL segment without a trailing ".git".
        # Only the suffix is removed so names such as "my.github.io" are
        # left intact instead of being mangled to "myhub.io".
        repo_name = repo_url.rstrip("/").split("/")[-1].removesuffix(".git")
        return self.repos_path / f"{repo_name}_{url_hash}"

    def clone_or_update(self, repo_url: str) -> Path:
        """
        Clone a repository or update if it already exists.

        An existing checkout is refreshed with a fetch from ``origin``. If
        it is not a valid repository or the fetch fails, the directory is
        deleted and the repository is cloned again from scratch.

        Args:
            repo_url: Git repository URL

        Returns:
            Path to the local repository

        Raises:
            GitCommandError: If the clone fails.
        """
        repo_path = self._get_repo_path(repo_url)

        if repo_path.exists():
            try:
                logger.info(f"Updating existing repository: {repo_path}")
                # Context manager closes the handle and releases the
                # resources GitPython holds for it.
                with Repo(repo_path) as repo:
                    repo.remotes.origin.fetch()
                return repo_path
            except (InvalidGitRepositoryError, GitCommandError) as e:
                logger.warning(f"Failed to update repository, re-cloning: {e}")
                shutil.rmtree(repo_path)

        logger.info(f"Cloning repository: {repo_url}")
        try:
            # Full clone for history; the returned handle is not needed,
            # so close it straight away.
            Repo.clone_from(repo_url, repo_path, depth=None).close()
            return repo_path
        except GitCommandError as e:
            logger.error(f"Failed to clone repository: {e}")
            raise

    def extract_commits(
        self,
        repo_path: Path,
        since: Optional[datetime] = None,
        until: Optional[datetime] = None,
    ) -> list[CommitData]:
        """
        Extract commit data from a repository.

        Commits reachable from any ref are included. Timestamps are
        converted with ``datetime.fromtimestamp`` and are therefore naive
        datetimes in the local timezone; ``since`` and ``until`` are
        compared against the authored date and should be naive as well.

        Args:
            repo_path: Path to the local repository
            since: Only include commits authored at or after this date
            until: Only include commits authored at or before this date

        Returns:
            List of CommitData objects
        """
        commits: list[CommitData] = []

        # Close the repository once iteration is finished so the handle
        # does not outlive this call.
        with Repo(repo_path) as repo:
            for commit in repo.iter_commits("--all"):
                authored_date = datetime.fromtimestamp(commit.authored_date)
                committed_date = datetime.fromtimestamp(commit.committed_date)

                # Filter by date range
                if since and authored_date < since:
                    continue
                if until and authored_date > until:
                    continue

                commits.append(
                    CommitData(
                        sha=commit.hexsha,
                        author_name=commit.author.name or "",
                        author_email=commit.author.email or "",
                        authored_date=authored_date,
                        committer_name=commit.committer.name or "",
                        committer_email=commit.committer.email or "",
                        committed_date=committed_date,
                        message=commit.message,
                    )
                )

        return commits

    def calculate_metrics(
        self,
        commits: list[CommitData],
        cutoff_date: Optional[datetime] = None,
    ) -> GitMetrics:
        """
        Calculate metrics from commit data.

        Activity figures (commit count, contributors, concentration) cover
        the 365 days ending at the cutoff. ``total_commits`` and the
        first/last commit dates cover every commit passed in.

        Args:
            commits: List of commits to analyze
            cutoff_date: Date to use as "now" for calculations (for T-1 analysis)

        Returns:
            GitMetrics with calculated values; a default ``GitMetrics()``
            when ``commits`` is empty
        """
        if not commits:
            return GitMetrics()

        cutoff = cutoff_date or datetime.now()
        one_year_ago = cutoff - timedelta(days=365)

        # Filter commits for last year
        recent_commits = [c for c in commits if one_year_ago <= c.authored_date <= cutoff]

        # Count commits by author email (case-insensitive); the display
        # name is taken from the last commit seen for each email.
        author_counts: dict[str, int] = defaultdict(int)
        author_names: dict[str, str] = {}

        for commit in recent_commits:
            email = commit.author_email.lower()
            author_counts[email] += 1
            author_names[email] = commit.author_name

        total_recent = len(recent_commits)

        if author_counts:
            # author_counts is non-empty, so total_recent >= 1 here.
            top_email = max(author_counts, key=author_counts.get)
            top_commits = author_counts[top_email]
            concentration = top_commits / total_recent * 100
        else:
            top_email = ""
            top_commits = 0
            concentration = 100.0  # No commits = maximum concentration (abandoned)

        # commits is non-empty at this point, so min/max are safe and
        # avoid sorting the whole history just to read both ends.
        first_commit_date = min(c.authored_date for c in commits)
        last_commit_date = max(c.authored_date for c in commits)

        return GitMetrics(
            total_commits=len(commits),
            commits_last_year=total_recent,
            unique_contributors=len(author_counts),
            maintainer_concentration=concentration,
            top_contributor_email=top_email,
            top_contributor_name=author_names.get(top_email, ""),
            top_contributor_commits=top_commits,
            last_commit_date=last_commit_date,
            first_commit_date=first_commit_date,
            commits=recent_commits,  # Store only recent commits for sentiment analysis
        )

    async def collect(self, repo_url: str, cutoff_date: Optional[datetime] = None) -> GitMetrics:
        """
        Collect git data for a repository.

        Clones or updates the repository, extracts its commits and
        calculates metrics. The git work is synchronous, so this
        coroutine does not yield while it runs.

        Args:
            repo_url: Git repository URL
            cutoff_date: Date to use as "now" for T-1 analysis. Commits
                authored after it are ignored entirely, so totals and
                first/last commit dates reflect the repository as of
                that date.

        Returns:
            GitMetrics with all calculated values
        """
        repo_path = self.clone_or_update(repo_url)
        # Drop commits newer than the cutoff before computing anything;
        # otherwise total_commits and last_commit_date would include
        # history from after the point in time being analysed.
        commits = self.extract_commits(repo_path, until=cutoff_date)
        return self.calculate_metrics(commits, cutoff_date)
|