greenmining-0.1.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,282 @@
+ """Commit extractor for green microservices mining."""
+
+ import json
+ import sys
+ from datetime import datetime, timedelta
+ from pathlib import Path
+ from typing import Any, Optional
+
+ import click
+ from pydriller import Repository
+ from tqdm import tqdm
+
+ from greenmining.config import get_config
+ from greenmining.utils import (
+     colored_print,
+     format_timestamp,
+     load_json_file,
+     print_banner,
+     retry_on_exception,
+     save_json_file,
+ )
+
+
+ class CommitExtractor:
+     """Extracts commit data from repositories."""
+
+     def __init__(self, max_commits: int = 50, skip_merges: bool = True, days_back: int = 730):
+         """Initialize the commit extractor.
+
+         Args:
+             max_commits: Maximum number of commits per repository
+             skip_merges: Skip merge commits
+             days_back: Only analyze commits from the last N days
+         """
+         self.max_commits = max_commits
+         self.skip_merges = skip_merges
+         self.days_back = days_back
+         self.cutoff_date = datetime.now() - timedelta(days=days_back)
+
+     def extract_from_repositories(self, repositories: list[dict[str, Any]]) -> list[dict[str, Any]]:
+         """Extract commits from a list of repositories.
+
+         Args:
+             repositories: List of repository metadata dictionaries
+
+         Returns:
+             List of commit data dictionaries
+         """
+         all_commits = []
+         failed_repos = []
+
+         colored_print(f"\nExtracting commits from {len(repositories)} repositories...", "cyan")
+         colored_print(
+             f"Settings: max_commits={self.max_commits}, skip_merges={self.skip_merges}, days_back={self.days_back}",
+             "cyan",
+         )
+
+         with tqdm(total=len(repositories), desc="Processing repositories", unit="repo") as pbar:
+             for repo in repositories:
+                 try:
+                     commits = self._extract_repo_commits(repo)
+                     all_commits.extend(commits)
+                     pbar.set_postfix({"commits": len(all_commits), "failed": len(failed_repos)})
+                     pbar.update(1)
+                 except Exception as e:
+                     colored_print(f"\nError processing {repo['full_name']}: {e}", "yellow")
+                     failed_repos.append(repo["full_name"])
+                     pbar.update(1)
+
+         if failed_repos:
+             colored_print(f"\nFailed to process {len(failed_repos)} repositories:", "yellow")
+             for repo_name in failed_repos[:5]:
+                 colored_print(f"  - {repo_name}", "yellow")
+             if len(failed_repos) > 5:
+                 colored_print(f"  ... and {len(failed_repos) - 5} more", "yellow")
+
+         return all_commits
+
+     @retry_on_exception(max_retries=2, delay=5.0, exceptions=(Exception,))
+     def _extract_repo_commits(self, repo: dict[str, Any]) -> list[dict[str, Any]]:
+         """Extract commits from a single repository.
+
+         Args:
+             repo: Repository metadata dictionary
+
+         Returns:
+             List of commit dictionaries
+         """
+         commits = []
+         repo_url = repo["clone_url"]
+         repo_name = repo["full_name"]
+
+         try:
+             # Use PyDriller to traverse commits
+             commit_count = 0
+
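+             # When given a remote URL, PyDriller clones the repository into a
+             # temporary directory before traversal, so no local checkout is needed.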
+             for commit in Repository(
+                 repo_url, only_no_merge=self.skip_merges, since=self.cutoff_date
+             ).traverse_commits():
+
+                 # Stop once the per-repository commit limit is reached
+                 if commit_count >= self.max_commits:
+                     break
+
+                 # Skip trivial commits (empty or very short messages)
+                 if not commit.msg or len(commit.msg.strip()) < 10:
+                     continue
+
+                 # Extract commit data
+                 commit_data = self._extract_commit_metadata(commit, repo_name)
+                 commits.append(commit_data)
+                 commit_count += 1
+
+         except Exception as e:
+             colored_print(f"Error extracting commits from {repo_name}: {e}", "yellow")
+             raise
+
+         return commits
+
+     def _extract_commit_metadata(self, commit, repo_name: str) -> dict[str, Any]:
+         """Extract metadata from a PyDriller commit object.
+
+         Args:
+             commit: PyDriller commit object
+             repo_name: Repository name
+
+         Returns:
+             Dictionary with commit metadata
+         """
+         # Get modified files
+         files_changed = []
+         lines_added = 0
+         lines_deleted = 0
+
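+         # Diff stats may be unavailable for some files (e.g. binaries);
+         # keep whatever was collected before the failure.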
+         try:
+             for modified_file in commit.modified_files:
+                 files_changed.append(modified_file.filename)
+                 lines_added += modified_file.added_lines
+                 lines_deleted += modified_file.deleted_lines
+         except Exception:
+             pass
+
+         return {
+             "commit_id": commit.hash,
+             "repo_name": repo_name,
+             "date": commit.committer_date.isoformat(),
+             "author": commit.author.name,
+             "author_email": commit.author.email,
+             "message": commit.msg.strip(),
+             "files_changed": files_changed[:20],  # Limit to 20 files
+             "lines_added": lines_added,
+             "lines_deleted": lines_deleted,
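+             # "insertions" and "deletions" duplicate the two counts above as aliases.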
+ "insertions": lines_added,
153
+ "deletions": lines_deleted,
154
+ "is_merge": commit.merge,
155
+ "branches": (
156
+ list(commit.branches) if hasattr(commit, "branches") and commit.branches else []
157
+ ),
158
+ "in_main_branch": commit.in_main_branch if hasattr(commit, "in_main_branch") else True,
159
+ }
160
+
161
+ def save_results(self, commits: list[dict[str, Any]], output_file: Path, repos_count: int):
162
+ """Save extracted commits to JSON file.
163
+
164
+ Args:
165
+ commits: List of commit data
166
+ output_file: Output file path
167
+ repos_count: Number of repositories processed
168
+ """
169
+ data = {
170
+ "metadata": {
171
+ "extracted_at": format_timestamp(),
172
+ "total_commits": len(commits),
173
+ "total_repos": repos_count,
174
+ "max_commits_per_repo": self.max_commits,
175
+ "skip_merges": self.skip_merges,
176
+ "days_back": self.days_back,
177
+ "cutoff_date": self.cutoff_date.isoformat(),
178
+ },
179
+ "commits": commits,
180
+ }
181
+
182
+ save_json_file(data, output_file)
183
+ colored_print(f"Saved {len(commits)} commits to {output_file}", "green")
184
+
185
+
186
+ @click.command()
187
+ @click.option("--max-commits", default=50, help="Maximum commits per repository")
188
+ @click.option("--skip-merges/--include-merges", default=True, help="Skip merge commits")
189
+ @click.option("--days-back", default=730, help="Only analyze commits from last N days")
190
+ @click.option(
191
+ "--repos-file", default=None, help="Input repositories file (default: data/repositories.json)"
192
+ )
193
+ @click.option("--output", default=None, help="Output file path (default: data/commits.json)")
194
+ @click.option("--config-file", default=".env", help="Path to .env configuration file")
195
+ def extract(
196
+ max_commits: int,
197
+ skip_merges: bool,
198
+ days_back: int,
199
+ repos_file: Optional[str],
200
+ output: Optional[str],
201
+ config_file: str,
202
+ ):
203
+ """Extract commits from fetched repositories."""
204
+ print_banner("Commit Data Extractor")
205
+
206
+ try:
207
+ # Load configuration
208
+ config = get_config(config_file)
209
+
210
+ # Determine input/output files
211
+ input_file = Path(repos_file) if repos_file else config.REPOS_FILE
212
+ output_file = Path(output) if output else config.COMMITS_FILE
213
+
214
+ # Check if input file exists
215
+ if not input_file.exists():
216
+ colored_print(f"Input file not found: {input_file}", "red")
217
+ colored_print("Please run 'fetch' command first to fetch repositories", "yellow")
218
+ exit(1)
219
+
220
+ # Load repositories
221
+ colored_print(f"Loading repositories from {input_file}...", "blue")
222
+ data = load_json_file(input_file)
223
+ repositories = data.get("repositories", [])
224
+
225
+ if not repositories:
226
+ colored_print("No repositories found in input file", "yellow")
227
+ exit(1)
228
+
229
+ colored_print(f"Loaded {len(repositories)} repositories", "green")
230
+
231
+ # Initialize extractor
232
+ extractor = CommitExtractor(
233
+ max_commits=max_commits, skip_merges=skip_merges, days_back=days_back
234
+ )
235
+
236
+ # Extract commits
237
+ commits = extractor.extract_from_repositories(repositories)
238
+
239
+ if not commits:
240
+ colored_print("No commits extracted", "yellow")
241
+ exit(1)
242
+
243
+ # Save results
244
+ extractor.save_results(commits, output_file, len(repositories))
245
+
246
+ # Display summary
247
+ colored_print(f"\n✓ Successfully extracted {len(commits)} commits", "green")
248
+ colored_print(f"Output saved to: {output_file}", "green")
249
+
250
+ # Calculate statistics
251
+ avg_commits = len(commits) / len(repositories)
252
+ colored_print("\nStatistics:", "cyan")
253
+ colored_print(f" Total repositories: {len(repositories)}", "white")
254
+ colored_print(f" Total commits: {len(commits)}", "white")
255
+ colored_print(f" Average commits per repo: {avg_commits:.1f}", "white")
256
+
257
+ # Show language breakdown
258
+ from collections import Counter
259
+
260
+ repo_languages = [repo["language"] for repo in repositories if repo.get("language")]
261
+ language_counts = Counter(repo_languages)
262
+
263
+ colored_print("\nLanguage breakdown:", "cyan")
264
+ for lang, count in language_counts.most_common(5):
265
+ colored_print(f" {lang}: {count} repos", "white")
266
+
267
+ except FileNotFoundError as e:
268
+ colored_print(f"File not found: {e}", "red")
269
+ exit(1)
270
+ except json.JSONDecodeError:
271
+ colored_print(f"Invalid JSON in input file: {input_file}", "red")
272
+ exit(1)
273
+ except Exception as e:
274
+ colored_print(f"Error: {e}", "red")
275
+ import traceback
276
+
277
+ traceback.print_exc()
278
+ exit(1)
279
+
280
+
281
+ if __name__ == "__main__":
282
+ extract()
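
For orientation, here is a minimal sketch of how the extractor above could be driven programmatically rather than through the CLI. It assumes only what the code itself reads: repository dicts carrying "full_name" and "clone_url" keys, and an importable module path for CommitExtractor (guessed here as greenmining.extractor; the wheel's actual layout may differ).

    from pathlib import Path

    from greenmining.extractor import CommitExtractor  # module path is an assumption

    # The extractor reads only "clone_url" and "full_name" from each repository dict.
    repos = [
        {
            "full_name": "example-org/example-service",
            "clone_url": "https://github.com/example-org/example-service.git",
        }
    ]

    extractor = CommitExtractor(max_commits=10, skip_merges=True, days_back=365)
    commits = extractor.extract_from_repositories(repos)
    extractor.save_results(commits, Path("data/commits.json"), repos_count=len(repos))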