greenmining 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- greenmining/__init__.py +20 -0
- greenmining/__main__.py +6 -0
- greenmining/__version__.py +3 -0
- greenmining/cli.py +370 -0
- greenmining/config.py +120 -0
- greenmining/controllers/__init__.py +11 -0
- greenmining/controllers/repository_controller.py +117 -0
- greenmining/gsf_patterns.py +802 -0
- greenmining/main.py +37 -0
- greenmining/models/__init__.py +12 -0
- greenmining/models/aggregated_stats.py +30 -0
- greenmining/models/analysis_result.py +48 -0
- greenmining/models/commit.py +71 -0
- greenmining/models/repository.py +89 -0
- greenmining/presenters/__init__.py +11 -0
- greenmining/presenters/console_presenter.py +141 -0
- greenmining/services/__init__.py +13 -0
- greenmining/services/commit_extractor.py +282 -0
- greenmining/services/data_aggregator.py +442 -0
- greenmining/services/data_analyzer.py +333 -0
- greenmining/services/github_fetcher.py +266 -0
- greenmining/services/reports.py +531 -0
- greenmining/utils.py +320 -0
- greenmining-0.1.4.dist-info/METADATA +335 -0
- greenmining-0.1.4.dist-info/RECORD +29 -0
- greenmining-0.1.4.dist-info/WHEEL +5 -0
- greenmining-0.1.4.dist-info/entry_points.txt +2 -0
- greenmining-0.1.4.dist-info/licenses/LICENSE +21 -0
- greenmining-0.1.4.dist-info/top_level.txt +1 -0
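The module layout above maps one-to-one onto import paths once the wheel is installed (for example via pip install greenmining==0.1.4, though the install command is an assumption; this page only lists the wheel's contents). A minimal import sketch:

    # Import paths taken from the file list above; nothing is executed here.
    from greenmining.config import get_config
    from greenmining.services.commit_extractor import CommitExtractor

The commit extractor service, the +282-line file in the services package, appears in full below.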
greenmining/services/commit_extractor.py
@@ -0,0 +1,282 @@
+"""Commit extractor for green microservices mining."""
+
+import json
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Any, Optional
+
+import click
+from pydriller import Repository
+from tqdm import tqdm
+
+from greenmining.config import get_config
+from greenmining.utils import (
+    colored_print,
+    format_timestamp,
+    load_json_file,
+    print_banner,
+    retry_on_exception,
+    save_json_file,
+)
+
+
+class CommitExtractor:
+    """Extracts commit data from repositories."""
+
+    def __init__(self, max_commits: int = 50, skip_merges: bool = True, days_back: int = 730):
+        """Initialize commit extractor.
+
+        Args:
+            max_commits: Maximum commits per repository
+            skip_merges: Skip merge commits
+            days_back: Only analyze commits from last N days
+        """
+        self.max_commits = max_commits
+        self.skip_merges = skip_merges
+        self.days_back = days_back
+        self.cutoff_date = datetime.now() - timedelta(days=days_back)
+
+    def extract_from_repositories(self, repositories: list[dict[str, Any]]) -> list[dict[str, Any]]:
+        """Extract commits from list of repositories.
+
+        Args:
+            repositories: List of repository metadata
+
+        Returns:
+            List of commit data dictionaries
+        """
+        all_commits = []
+        failed_repos = []
+
+        colored_print(f"\nExtracting commits from {len(repositories)} repositories...", "cyan")
+        colored_print(
+            f"Settings: max_commits={self.max_commits}, skip_merges={self.skip_merges}, days_back={self.days_back}",
+            "cyan",
+        )
+
+        with tqdm(total=len(repositories), desc="Processing repositories", unit="repo") as pbar:
+            for repo in repositories:
+                try:
+                    commits = self._extract_repo_commits(repo)
+                    all_commits.extend(commits)
+                    pbar.set_postfix({"commits": len(all_commits), "failed": len(failed_repos)})
+                    pbar.update(1)
+                except Exception as e:
+                    colored_print(f"\nError processing {repo['full_name']}: {e}", "yellow")
+                    failed_repos.append(repo["full_name"])
+                    pbar.update(1)
+
+        if failed_repos:
+            colored_print(f"\nFailed to process {len(failed_repos)} repositories:", "yellow")
+            for repo_name in failed_repos[:5]:
+                colored_print(f" - {repo_name}", "yellow")
+            if len(failed_repos) > 5:
+                colored_print(f" ... and {len(failed_repos) - 5} more", "yellow")
+
+        return all_commits
+
+    @retry_on_exception(max_retries=2, delay=5.0, exceptions=(Exception,))
+    def _extract_repo_commits(self, repo: dict[str, Any]) -> list[dict[str, Any]]:
+        """Extract commits from a single repository.
+
+        Args:
+            repo: Repository metadata dictionary
+
+        Returns:
+            List of commit dictionaries
+        """
+        commits = []
+        repo_url = repo["clone_url"]
+        repo_name = repo["full_name"]
+
+        try:
+            # Use PyDriller to traverse commits
+            commit_count = 0
+
+            for commit in Repository(
+                repo_url, only_no_merge=self.skip_merges, since=self.cutoff_date
+            ).traverse_commits():
+
+                # Skip if reached max commits
+                if commit_count >= self.max_commits:
+                    break
+
+                # Skip trivial commits
+                if not commit.msg or len(commit.msg.strip()) < 10:
+                    continue
+
+                # Extract commit data
+                commit_data = self._extract_commit_metadata(commit, repo_name)
+                commits.append(commit_data)
+                commit_count += 1
+
+        except Exception as e:
+            colored_print(f"Error extracting commits from {repo_name}: {e}", "yellow")
+            raise
+
+        return commits
+
+    def _extract_commit_metadata(self, commit, repo_name: str) -> dict[str, Any]:
+        """Extract metadata from commit object.
+
+        Args:
+            commit: PyDriller commit object
+            repo_name: Repository name
+
+        Returns:
+            Dictionary with commit metadata
+        """
+        # Get modified files
+        files_changed = []
+        lines_added = 0
+        lines_deleted = 0
+
+        try:
+            for modified_file in commit.modified_files:
+                files_changed.append(modified_file.filename)
+                lines_added += modified_file.added_lines
+                lines_deleted += modified_file.deleted_lines
+        except Exception:
+            pass
+
+        return {
+            "commit_id": commit.hash,
+            "repo_name": repo_name,
+            "date": commit.committer_date.isoformat(),
+            "author": commit.author.name,
+            "author_email": commit.author.email,
+            "message": commit.msg.strip(),
+            "files_changed": files_changed[:20],  # Limit to 20 files
+            "lines_added": lines_added,
+            "lines_deleted": lines_deleted,
+            "insertions": lines_added,
+            "deletions": lines_deleted,
+            "is_merge": commit.merge,
+            "branches": (
+                list(commit.branches) if hasattr(commit, "branches") and commit.branches else []
+            ),
+            "in_main_branch": commit.in_main_branch if hasattr(commit, "in_main_branch") else True,
+        }
+
+    def save_results(self, commits: list[dict[str, Any]], output_file: Path, repos_count: int):
+        """Save extracted commits to JSON file.
+
+        Args:
+            commits: List of commit data
+            output_file: Output file path
+            repos_count: Number of repositories processed
+        """
+        data = {
+            "metadata": {
+                "extracted_at": format_timestamp(),
+                "total_commits": len(commits),
+                "total_repos": repos_count,
+                "max_commits_per_repo": self.max_commits,
+                "skip_merges": self.skip_merges,
+                "days_back": self.days_back,
+                "cutoff_date": self.cutoff_date.isoformat(),
+            },
+            "commits": commits,
+        }
+
+        save_json_file(data, output_file)
+        colored_print(f"Saved {len(commits)} commits to {output_file}", "green")
+
+
+@click.command()
+@click.option("--max-commits", default=50, help="Maximum commits per repository")
+@click.option("--skip-merges/--include-merges", default=True, help="Skip merge commits")
+@click.option("--days-back", default=730, help="Only analyze commits from last N days")
+@click.option(
+    "--repos-file", default=None, help="Input repositories file (default: data/repositories.json)"
+)
+@click.option("--output", default=None, help="Output file path (default: data/commits.json)")
+@click.option("--config-file", default=".env", help="Path to .env configuration file")
+def extract(
+    max_commits: int,
+    skip_merges: bool,
+    days_back: int,
+    repos_file: Optional[str],
+    output: Optional[str],
+    config_file: str,
+):
+    """Extract commits from fetched repositories."""
+    print_banner("Commit Data Extractor")
+
+    try:
+        # Load configuration
+        config = get_config(config_file)
+
+        # Determine input/output files
+        input_file = Path(repos_file) if repos_file else config.REPOS_FILE
+        output_file = Path(output) if output else config.COMMITS_FILE
+
+        # Check if input file exists
+        if not input_file.exists():
+            colored_print(f"Input file not found: {input_file}", "red")
+            colored_print("Please run 'fetch' command first to fetch repositories", "yellow")
+            exit(1)
+
+        # Load repositories
+        colored_print(f"Loading repositories from {input_file}...", "blue")
+        data = load_json_file(input_file)
+        repositories = data.get("repositories", [])
+
+        if not repositories:
+            colored_print("No repositories found in input file", "yellow")
+            exit(1)
+
+        colored_print(f"Loaded {len(repositories)} repositories", "green")
+
+        # Initialize extractor
+        extractor = CommitExtractor(
+            max_commits=max_commits, skip_merges=skip_merges, days_back=days_back
+        )
+
+        # Extract commits
+        commits = extractor.extract_from_repositories(repositories)
+
+        if not commits:
+            colored_print("No commits extracted", "yellow")
+            exit(1)
+
+        # Save results
+        extractor.save_results(commits, output_file, len(repositories))
+
+        # Display summary
+        colored_print(f"\n✓ Successfully extracted {len(commits)} commits", "green")
+        colored_print(f"Output saved to: {output_file}", "green")
+
+        # Calculate statistics
+        avg_commits = len(commits) / len(repositories)
+        colored_print("\nStatistics:", "cyan")
+        colored_print(f" Total repositories: {len(repositories)}", "white")
+        colored_print(f" Total commits: {len(commits)}", "white")
+        colored_print(f" Average commits per repo: {avg_commits:.1f}", "white")
+
+        # Show language breakdown
+        from collections import Counter
+
+        repo_languages = [repo["language"] for repo in repositories if repo.get("language")]
+        language_counts = Counter(repo_languages)
+
+        colored_print("\nLanguage breakdown:", "cyan")
+        for lang, count in language_counts.most_common(5):
+            colored_print(f" {lang}: {count} repos", "white")
+
+    except FileNotFoundError as e:
+        colored_print(f"File not found: {e}", "red")
+        exit(1)
+    except json.JSONDecodeError:
+        colored_print(f"Invalid JSON in input file: {input_file}", "red")
+        exit(1)
+    except Exception as e:
+        colored_print(f"Error: {e}", "red")
+        import traceback

+        traceback.print_exc()
+        exit(1)
+
+
+if __name__ == "__main__":
+    extract()
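A minimal usage sketch of the class above, driven from Python rather than through the click command. The repository dict mirrors the keys _extract_repo_commits() actually reads (clone_url and full_name); the URL and names below are placeholders, not repositories the package itself references.

    from pathlib import Path

    from greenmining.services.commit_extractor import CommitExtractor

    # Hypothetical input, shaped like the entries the CLI loads from
    # data/repositories.json (the "repositories" list of metadata dicts).
    repos = [
        {
            "clone_url": "https://github.com/example/example-service.git",
            "full_name": "example/example-service",
            "language": "Python",  # optional; only the CLI's language breakdown reads it
        }
    ]

    # PyDriller clones and traverses each repository, so this hits the network.
    extractor = CommitExtractor(max_commits=10, skip_merges=True, days_back=365)
    commits = extractor.extract_from_repositories(repos)

    # save_results() wraps the list as {"metadata": {...}, "commits": [...]}.
    extractor.save_results(commits, Path("data/commits.json"), repos_count=len(repos))

Because the module guards extract() behind __name__ == "__main__", the same flow also runs end to end as python -m greenmining.services.commit_extractor --max-commits 10 --days-back 365.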