ossuary-risk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,231 @@
1
+ """Git repository collector - extracts commit history and metadata."""
2
+
3
+ import hashlib
4
+ import logging
5
+ import os
6
+ import shutil
7
+ import tempfile
8
+ from collections import defaultdict
9
+ from dataclasses import dataclass
10
+ from datetime import datetime, timedelta
11
+ from pathlib import Path
12
+ from typing import Optional
13
+
14
+ from git import Repo
15
+ from git.exc import GitCommandError, InvalidGitRepositoryError
16
+
17
+ from ossuary.collectors.base import BaseCollector
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
@dataclass
class CommitData:
    """One commit's identity, people and timestamps.

    Attributes:
        sha: Full hex SHA of the commit.
        author_name: Name of the person who authored the change.
        author_email: E-mail address of the author.
        authored_date: When the change was authored.
        committer_name: Name of the person who created the commit.
        committer_email: E-mail address of the committer.
        committed_date: When the commit was created.
        message: Commit message.
    """

    # Identity
    sha: str

    # Author side
    author_name: str
    author_email: str
    authored_date: datetime

    # Committer side
    committer_name: str
    committer_email: str
    committed_date: datetime

    # Free text
    message: str
34
+
35
+
36
@dataclass
class GitMetrics:
    """Metrics extracted from git history.

    Attributes:
        total_commits: Number of commits that were analysed.
        commits_last_year: Number of those commits inside the one-year
            analysis window.
        unique_contributors: Number of distinct authors inside the window.
        maintainer_concentration: Share of the window's commits made by the
            top contributor, as a percentage (0-100).
        top_contributor_email: E-mail of the top contributor, "" if none.
        top_contributor_name: Name of the top contributor, "" if none.
        top_contributor_commits: Commit count of the top contributor.
        last_commit_date: Authored date of the newest analysed commit.
        first_commit_date: Authored date of the oldest analysed commit.
        commits: Commits kept alongside the metrics; never None after
            construction.
    """

    total_commits: int = 0
    commits_last_year: int = 0
    unique_contributors: int = 0
    maintainer_concentration: float = 0.0
    top_contributor_email: str = ""
    top_contributor_name: str = ""
    top_contributor_commits: int = 0
    last_commit_date: Optional[datetime] = None
    first_commit_date: Optional[datetime] = None
    # None is only a construction-time placeholder (see __post_init__), hence Optional.
    commits: Optional[list["CommitData"]] = None

    def __post_init__(self):
        # Give every instance its own list; also normalises an explicit
        # ``commits=None`` so readers can always iterate the attribute.
        if self.commits is None:
            self.commits = []
54
+
55
+
56
class GitCollector(BaseCollector):
    """Collector for git repository data.

    Repositories are cloned into ``repos_path`` and reused on later runs.
    Commit timestamps are converted with ``datetime.fromtimestamp`` and are
    therefore naive datetimes in the machine's local timezone; timezone-aware
    bounds passed to ``extract_commits`` / ``collect`` are converted to that
    same representation before being compared.
    """

    def __init__(self, repos_path: Optional[str] = None):
        """
        Initialize the git collector.

        Args:
            repos_path: Path to store cloned repositories. Falls back to the
                REPOS_PATH environment variable, then to ./repos. The
                directory is created if it does not exist.
        """
        self.repos_path = Path(repos_path or os.getenv("REPOS_PATH", "./repos"))
        self.repos_path.mkdir(parents=True, exist_ok=True)

    def is_available(self) -> bool:
        """Git collector is always available."""
        return True

    @staticmethod
    def _to_naive_local(moment: Optional[datetime]) -> Optional[datetime]:
        """Return ``moment`` as a naive local datetime (None and naive values pass through)."""
        if moment is None or moment.tzinfo is None:
            return moment
        # astimezone() without arguments converts to the system local zone,
        # which is the zone datetime.fromtimestamp() uses for commit dates.
        return moment.astimezone().replace(tzinfo=None)

    def _get_repo_path(self, repo_url: str) -> Path:
        """
        Get local path for a repository.

        Args:
            repo_url: Git repository URL

        Returns:
            ``<repos_path>/<repo name>_<12 hex chars of the URL's MD5>``
        """
        # The digest only keeps directory names unique; it is not a security
        # measure, so say so explicitly (lets it run on FIPS-restricted builds).
        url_hash = hashlib.md5(repo_url.encode(), usedforsecurity=False).hexdigest()[:12]
        # Repo name is for readability only. Strip a trailing ".git" rather
        # than every occurrence, so "foo.github.io" keeps its name.
        repo_name = repo_url.rstrip("/").split("/")[-1].removesuffix(".git")
        return self.repos_path / f"{repo_name}_{url_hash}"

    def clone_or_update(self, repo_url: str) -> Path:
        """
        Clone a repository or update if it already exists.

        An existing clone is refreshed with a fetch of ``origin``. If it
        cannot be opened or fetched it is deleted and cloned again.

        Args:
            repo_url: Git repository URL

        Returns:
            Path to the local repository

        Raises:
            GitCommandError: If the clone fails.
        """
        repo_path = self._get_repo_path(repo_url)

        if repo_path.exists():
            try:
                logger.info(f"Updating existing repository: {repo_path}")
                # Context manager closes the repo handle before we return or
                # before the directory is removed below.
                with Repo(repo_path) as repo:
                    repo.remotes.origin.fetch()
                return repo_path
            except (InvalidGitRepositoryError, GitCommandError) as e:
                logger.warning(f"Failed to update repository, re-cloning: {e}")
                shutil.rmtree(repo_path)

        logger.info(f"Cloning repository: {repo_url}")
        try:
            # Full clone for history; only the path is needed, so release the
            # returned Repo immediately.
            Repo.clone_from(repo_url, repo_path, depth=None).close()
            return repo_path
        except GitCommandError as e:
            logger.error(f"Failed to clone repository: {e}")
            raise

    def extract_commits(
        self,
        repo_path: Path,
        since: Optional[datetime] = None,
        until: Optional[datetime] = None,
    ) -> list[CommitData]:
        """
        Extract commit data from a repository.

        Commits reachable from any ref are visited. Filtering uses the
        authored date and both bounds are inclusive.

        Args:
            repo_path: Path to the local repository
            since: Only include commits authored at or after this date
            until: Only include commits authored at or before this date

        Returns:
            List of CommitData objects with naive local-time dates
        """
        # Commit dates below are naive local time; bring aware bounds into the
        # same representation so the comparisons cannot raise TypeError.
        since = self._to_naive_local(since)
        until = self._to_naive_local(until)

        commits = []
        with Repo(repo_path) as repo:
            for commit in repo.iter_commits("--all"):
                authored_date = datetime.fromtimestamp(commit.authored_date)

                # Filter by date range
                if since and authored_date < since:
                    continue
                if until and authored_date > until:
                    continue

                commits.append(
                    CommitData(
                        sha=commit.hexsha,
                        author_name=commit.author.name or "",
                        author_email=commit.author.email or "",
                        authored_date=authored_date,
                        committer_name=commit.committer.name or "",
                        committer_email=commit.committer.email or "",
                        committed_date=datetime.fromtimestamp(commit.committed_date),
                        message=commit.message,
                    )
                )

        return commits

    def calculate_metrics(
        self,
        commits: list[CommitData],
        cutoff_date: Optional[datetime] = None,
    ) -> GitMetrics:
        """
        Calculate metrics from commit data.

        Args:
            commits: List of commits to analyze
            cutoff_date: Date to use as "now" for calculations (for T-1
                analysis). When given, commits authored after it are ignored
                entirely so later history cannot leak into the result.

        Returns:
            GitMetrics with calculated values. Default (all-zero) metrics are
            returned when there is nothing to analyze.
        """
        if not commits:
            return GitMetrics()

        if cutoff_date is not None:
            cutoff = cutoff_date
            # From the cutoff's point of view later commits do not exist yet:
            # keep them out of the totals and the first/last dates as well.
            commits = [c for c in commits if c.authored_date <= cutoff]
            if not commits:
                return GitMetrics()
        else:
            cutoff = datetime.now()

        one_year_ago = cutoff - timedelta(days=365)

        # Filter commits for last year
        recent_commits = [c for c in commits if one_year_ago <= c.authored_date <= cutoff]

        # Count commits by author email (case-insensitive)
        author_counts: dict[str, int] = defaultdict(int)
        author_names: dict[str, str] = {}

        for commit in recent_commits:
            email = commit.author_email.lower()
            author_counts[email] += 1
            author_names[email] = commit.author_name

        # Find top contributor
        total_recent = len(recent_commits)

        if author_counts:
            top_email = max(author_counts, key=author_counts.get)
            top_commits = author_counts[top_email]
            # author_counts is non-empty, so total_recent is at least 1
            concentration = top_commits / total_recent * 100
        else:
            top_email = ""
            top_commits = 0
            concentration = 100.0  # No commits = maximum concentration (abandoned)

        # Only the extremes are needed, so avoid sorting the whole history
        authored_dates = [c.authored_date for c in commits]

        return GitMetrics(
            total_commits=len(commits),
            commits_last_year=total_recent,
            unique_contributors=len(author_counts),
            maintainer_concentration=concentration,
            top_contributor_email=top_email,
            top_contributor_name=author_names.get(top_email, ""),
            top_contributor_commits=top_commits,
            last_commit_date=max(authored_dates),
            first_commit_date=min(authored_dates),
            commits=recent_commits,  # Store only recent commits for sentiment analysis
        )

    async def collect(self, repo_url: str, cutoff_date: Optional[datetime] = None) -> GitMetrics:
        """
        Collect git data for a repository.

        NOTE: the clone/fetch and the commit walk are ordinary synchronous
        calls, so this coroutine does not yield to the event loop while they
        run.

        Args:
            repo_url: Git repository URL
            cutoff_date: Date to use as "now" for T-1 analysis

        Returns:
            GitMetrics with all calculated values

        Raises:
            GitCommandError: If the repository cannot be cloned.
        """
        # Extracted commit dates are naive local time; match that up front so
        # the cutoff can be compared against them in both steps below.
        cutoff_date = self._to_naive_local(cutoff_date)
        repo_path = self.clone_or_update(repo_url)
        # Skip post-cutoff commits during extraction instead of building
        # CommitData objects that would be discarded anyway.
        commits = self.extract_commits(repo_path, until=cutoff_date)
        return self.calculate_metrics(commits, cutoff_date)