greenmining-1.0.3-py3-none-any.whl → greenmining-1.0.4-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registry.
- greenmining/__init__.py +11 -29
- greenmining/__main__.py +9 -3
- greenmining/__version__.py +2 -2
- greenmining/analyzers/__init__.py +3 -7
- greenmining/analyzers/code_diff_analyzer.py +151 -61
- greenmining/analyzers/qualitative_analyzer.py +15 -81
- greenmining/analyzers/statistical_analyzer.py +8 -69
- greenmining/analyzers/temporal_analyzer.py +16 -72
- greenmining/config.py +105 -58
- greenmining/controllers/__init__.py +1 -5
- greenmining/controllers/repository_controller.py +153 -94
- greenmining/energy/__init__.py +13 -0
- greenmining/energy/base.py +165 -0
- greenmining/energy/codecarbon_meter.py +146 -0
- greenmining/energy/rapl.py +157 -0
- greenmining/gsf_patterns.py +4 -26
- greenmining/models/__init__.py +1 -5
- greenmining/models/aggregated_stats.py +4 -4
- greenmining/models/analysis_result.py +4 -4
- greenmining/models/commit.py +5 -5
- greenmining/models/repository.py +5 -5
- greenmining/presenters/__init__.py +1 -5
- greenmining/presenters/console_presenter.py +24 -24
- greenmining/services/__init__.py +10 -6
- greenmining/services/commit_extractor.py +8 -152
- greenmining/services/data_aggregator.py +45 -175
- greenmining/services/data_analyzer.py +9 -202
- greenmining/services/github_fetcher.py +212 -323
- greenmining/services/github_graphql_fetcher.py +371 -0
- greenmining/services/local_repo_analyzer.py +387 -0
- greenmining/services/reports.py +33 -137
- greenmining/utils.py +21 -149
- {greenmining-1.0.3.dist-info → greenmining-1.0.4.dist-info}/METADATA +61 -151
- greenmining-1.0.4.dist-info/RECORD +37 -0
- {greenmining-1.0.3.dist-info → greenmining-1.0.4.dist-info}/WHEEL +1 -1
- greenmining/analyzers/ml_feature_extractor.py +0 -512
- greenmining/analyzers/nlp_analyzer.py +0 -365
- greenmining/cli.py +0 -471
- greenmining/main.py +0 -37
- greenmining-1.0.3.dist-info/RECORD +0 -36
- greenmining-1.0.3.dist-info/entry_points.txt +0 -2
- {greenmining-1.0.3.dist-info → greenmining-1.0.4.dist-info}/licenses/LICENSE +0 -0
- {greenmining-1.0.3.dist-info → greenmining-1.0.4.dist-info}/top_level.txt +0 -0
greenmining/services/local_repo_analyzer.py (new file)
@@ -0,0 +1,387 @@
+# Local repository analyzer for direct GitHub URL analysis using PyDriller.
+
+from __future__ import annotations
+
+import os
+import re
+import shutil
+import tempfile
+from dataclasses import dataclass, field
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Generator
+
+from pydriller import Repository
+from pydriller.metrics.process.change_set import ChangeSet
+from pydriller.metrics.process.code_churn import CodeChurn
+from pydriller.metrics.process.commits_count import CommitsCount
+from pydriller.metrics.process.contributors_count import ContributorsCount
+from pydriller.metrics.process.contributors_experience import ContributorsExperience
+from pydriller.metrics.process.history_complexity import HistoryComplexity
+from pydriller.metrics.process.hunks_count import HunksCount
+from pydriller.metrics.process.lines_count import LinesCount
+
+from greenmining.gsf_patterns import get_pattern_by_keywords, is_green_aware, GSF_PATTERNS
+from greenmining.utils import colored_print
+
+
+@dataclass
+class CommitAnalysis:
+    # Analysis result for a single commit.
+
+    hash: str
+    message: str
+    author: str
+    author_email: str
+    date: datetime
+    green_aware: bool
+    gsf_patterns_matched: List[str]
+    pattern_count: int
+    pattern_details: List[Dict[str, Any]]
+    confidence: str
+    files_modified: List[str]
+    insertions: int
+    deletions: int
+
+    # PyDriller DMM metrics
+    dmm_unit_size: Optional[float] = None
+    dmm_unit_complexity: Optional[float] = None
+    dmm_unit_interfacing: Optional[float] = None
+
+    # Structural metrics (Lizard)
+    total_nloc: int = 0
+    total_complexity: int = 0
+    max_complexity: int = 0
+    methods_count: int = 0
+
+    def to_dict(self) -> Dict[str, Any]:
+        # Convert to dictionary.
+        return {
+            "commit_hash": self.hash,
+            "message": self.message,
+            "author": self.author,
+            "author_email": self.author_email,
+            "date": self.date.isoformat() if self.date else None,
+            "green_aware": self.green_aware,
+            "gsf_patterns_matched": self.gsf_patterns_matched,
+            "pattern_count": self.pattern_count,
+            "pattern_details": self.pattern_details,
+            "confidence": self.confidence,
+            "files_modified": self.files_modified,
+            "insertions": self.insertions,
+            "deletions": self.deletions,
+            "dmm_unit_size": self.dmm_unit_size,
+            "dmm_unit_complexity": self.dmm_unit_complexity,
+            "dmm_unit_interfacing": self.dmm_unit_interfacing,
+            "total_nloc": self.total_nloc,
+            "total_complexity": self.total_complexity,
+            "max_complexity": self.max_complexity,
+            "methods_count": self.methods_count,
+        }
+
+
+@dataclass
+class RepositoryAnalysis:
+    # Complete analysis result for a repository.
+
+    url: str
+    name: str
+    total_commits: int
+    green_commits: int
+    green_commit_rate: float
+    commits: List[CommitAnalysis] = field(default_factory=list)
+    process_metrics: Dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        # Convert to dictionary.
+        return {
+            "url": self.url,
+            "name": self.name,
+            "total_commits": self.total_commits,
+            "green_commits": self.green_commits,
+            "green_commit_rate": self.green_commit_rate,
+            "commits": [c.to_dict() for c in self.commits],
+            "process_metrics": self.process_metrics,
+        }
+
+
+class LocalRepoAnalyzer:
+    # Analyze repositories directly from GitHub URLs using PyDriller.
+
+    def __init__(
+        self,
+        clone_path: Optional[Path] = None,
+        max_commits: int = 500,
+        days_back: int = 730,
+        skip_merges: bool = True,
+        compute_process_metrics: bool = True,
+        cleanup_after: bool = True,
+    ):
+        # Initialize the local repository analyzer.
+        self.clone_path = clone_path or Path(tempfile.gettempdir()) / "greenmining_repos"
+        self.clone_path.mkdir(parents=True, exist_ok=True)
+        self.max_commits = max_commits
+        self.days_back = days_back
+        self.skip_merges = skip_merges
+        self.compute_process_metrics = compute_process_metrics
+        self.cleanup_after = cleanup_after
+        self.gsf_patterns = GSF_PATTERNS
+
+    def _parse_repo_url(self, url: str) -> tuple[str, str]:
+        # Parse repository URL to extract owner and name.
+        # Handle HTTPS URLs
+        https_pattern = r"github\.com[/:]([^/]+)/([^/\.]+)"
+        match = re.search(https_pattern, url)
+        if match:
+            return match.group(1), match.group(2).replace(".git", "")
+
+        # Handle SSH URLs
+        ssh_pattern = r"git@github\.com:([^/]+)/([^/\.]+)"
+        match = re.search(ssh_pattern, url)
+        if match:
+            return match.group(1), match.group(2).replace(".git", "")
+
+        raise ValueError(f"Could not parse GitHub URL: {url}")
+
+    def _get_pattern_details(self, matched_patterns: List[str]) -> List[Dict[str, Any]]:
+        # Get detailed pattern information.
+        details = []
+        for pattern_id, pattern in self.gsf_patterns.items():
+            if pattern["name"] in matched_patterns:
+                details.append({
+                    "name": pattern["name"],
+                    "category": pattern["category"],
+                    "description": pattern["description"],
+                    "sci_impact": pattern["sci_impact"],
+                })
+        return details
+
+    def analyze_commit(self, commit) -> CommitAnalysis:
+        # Analyze a single PyDriller commit object.
+        message = commit.msg or ""
+
+        # Green awareness check
+        green_aware = is_green_aware(message)
+
+        # GSF pattern matching
+        matched_patterns = get_pattern_by_keywords(message)
+        pattern_details = self._get_pattern_details(matched_patterns)
+
+        # Confidence calculation
+        pattern_count = len(matched_patterns)
+        confidence = "high" if pattern_count >= 2 else "medium" if pattern_count == 1 else "low"
+
+        # File modifications
+        files_modified = [mod.filename for mod in commit.modified_files]
+        insertions = sum(mod.added_lines for mod in commit.modified_files)
+        deletions = sum(mod.deleted_lines for mod in commit.modified_files)
+
+        # Delta Maintainability Model (if available)
+        dmm_unit_size = None
+        dmm_unit_complexity = None
+        dmm_unit_interfacing = None
+
+        try:
+            dmm_unit_size = commit.dmm_unit_size
+            dmm_unit_complexity = commit.dmm_unit_complexity
+            dmm_unit_interfacing = commit.dmm_unit_interfacing
+        except Exception:
+            pass  # DMM may not be available for all commits
+
+        # Structural metrics from Lizard (via PyDriller)
+        total_nloc = 0
+        total_complexity = 0
+        max_complexity = 0
+        methods_count = 0
+
+        try:
+            for mod in commit.modified_files:
+                if mod.nloc:
+                    total_nloc += mod.nloc
+                if mod.complexity:
+                    total_complexity += mod.complexity
+                    if mod.complexity > max_complexity:
+                        max_complexity = mod.complexity
+                if mod.methods:
+                    methods_count += len(mod.methods)
+        except Exception:
+            pass  # Structural metrics may fail for some files
+
+        return CommitAnalysis(
+            hash=commit.hash,
+            message=message,
+            author=commit.author.name,
+            author_email=commit.author.email,
+            date=commit.author_date,
+            green_aware=green_aware,
+            gsf_patterns_matched=matched_patterns,
+            pattern_count=pattern_count,
+            pattern_details=pattern_details,
+            confidence=confidence,
+            files_modified=files_modified,
+            insertions=insertions,
+            deletions=deletions,
+            dmm_unit_size=dmm_unit_size,
+            dmm_unit_complexity=dmm_unit_complexity,
+            dmm_unit_interfacing=dmm_unit_interfacing,
+            total_nloc=total_nloc,
+            total_complexity=total_complexity,
+            max_complexity=max_complexity,
+            methods_count=methods_count,
+        )
+
+    def analyze_repository(self, url: str) -> RepositoryAnalysis:
+        # Analyze a repository from its URL.
+        owner, repo_name = self._parse_repo_url(url)
+        full_name = f"{owner}/{repo_name}"
+
+        colored_print(f"\n Analyzing repository: {full_name}", "cyan")
+
+        # Calculate date range
+        since_date = datetime.now() - timedelta(days=self.days_back)
+
+        # Configure PyDriller Repository
+        repo_config = {
+            "path_to_repo": url,
+            "since": since_date,
+            "only_no_merge": self.skip_merges,
+        }
+
+        # Clone to specific path if needed
+        local_path = self.clone_path / repo_name
+        if local_path.exists():
+            shutil.rmtree(local_path)
+
+        repo_config["clone_repo_to"] = str(self.clone_path)
+
+        colored_print(f" Cloning to: {local_path}", "cyan")
+
+        commits_analyzed = []
+        commit_count = 0
+
+        try:
+            for commit in Repository(**repo_config).traverse_commits():
+                if commit_count >= self.max_commits:
+                    break
+
+                try:
+                    analysis = self.analyze_commit(commit)
+                    commits_analyzed.append(analysis)
+                    commit_count += 1
+
+                    if commit_count % 50 == 0:
+                        colored_print(f" Processed {commit_count} commits...", "cyan")
+
+                except Exception as e:
+                    colored_print(f" Warning: Error analyzing commit {commit.hash[:8]}: {e}", "yellow")
+                    continue
+
+            colored_print(f" Analyzed {len(commits_analyzed)} commits", "green")
+
+            # Compute process metrics if enabled
+            process_metrics = {}
+            if self.compute_process_metrics and local_path.exists():
+                colored_print(" Computing process metrics...", "cyan")
+                process_metrics = self._compute_process_metrics(str(local_path))
+
+            # Calculate summary
+            green_commits = sum(1 for c in commits_analyzed if c.green_aware)
+            green_rate = green_commits / len(commits_analyzed) if commits_analyzed else 0
+
+            result = RepositoryAnalysis(
+                url=url,
+                name=full_name,
+                total_commits=len(commits_analyzed),
+                green_commits=green_commits,
+                green_commit_rate=green_rate,
+                commits=commits_analyzed,
+                process_metrics=process_metrics,
+            )
+
+            return result
+
+        finally:
+            # Cleanup if requested
+            if self.cleanup_after and local_path.exists():
+                colored_print(f" Cleaning up: {local_path}", "cyan")
+                shutil.rmtree(local_path, ignore_errors=True)
+
+    def _compute_process_metrics(self, repo_path: str) -> Dict[str, Any]:
+        # Compute PyDriller process metrics for the repository.
+        metrics = {}
+        since_date = datetime.now() - timedelta(days=self.days_back)
+        to_date = datetime.now()
+
+        try:
+            # ChangeSet metrics
+            cs = ChangeSet(repo_path, since=since_date, to=to_date)
+            metrics["change_set_max"] = cs.max()
+            metrics["change_set_avg"] = cs.avg()
+        except Exception as e:
+            colored_print(f" Warning: ChangeSet metrics failed: {e}", "yellow")
+
+        try:
+            # CodeChurn metrics
+            churn = CodeChurn(repo_path, since=since_date, to=to_date)
+            metrics["code_churn"] = churn.count()
+        except Exception as e:
+            colored_print(f" Warning: CodeChurn metrics failed: {e}", "yellow")
+
+        try:
+            # CommitsCount metrics
+            cc = CommitsCount(repo_path, since=since_date, to=to_date)
+            metrics["commits_per_file"] = cc.count()
+        except Exception as e:
+            colored_print(f" Warning: CommitsCount metrics failed: {e}", "yellow")
+
+        try:
+            # ContributorsCount metrics
+            contrib = ContributorsCount(repo_path, since=since_date, to=to_date)
+            metrics["contributors_per_file"] = contrib.count()
+        except Exception as e:
+            colored_print(f" Warning: ContributorsCount metrics failed: {e}", "yellow")
+
+        try:
+            # ContributorsExperience metrics
+            exp = ContributorsExperience(repo_path, since=since_date, to=to_date)
+            metrics["contributors_experience"] = exp.count()
+        except Exception as e:
+            colored_print(f" Warning: ContributorsExperience metrics failed: {e}", "yellow")
+
+        try:
+            # HistoryComplexity metrics
+            hc = HistoryComplexity(repo_path, since=since_date, to=to_date)
+            metrics["history_complexity"] = hc.count()
+        except Exception as e:
+            colored_print(f" Warning: HistoryComplexity metrics failed: {e}", "yellow")
+
+        try:
+            # HunksCount metrics
+            hunks = HunksCount(repo_path, since=since_date, to=to_date)
+            metrics["hunks_count"] = hunks.count()
+        except Exception as e:
+            colored_print(f" Warning: HunksCount metrics failed: {e}", "yellow")
+
+        try:
+            # LinesCount metrics
+            lines = LinesCount(repo_path, since=since_date, to=to_date)
+            metrics["lines_count"] = lines.count()
+        except Exception as e:
+            colored_print(f" Warning: LinesCount metrics failed: {e}", "yellow")
+
+        return metrics
+
+    def analyze_repositories(self, urls: List[str]) -> List[RepositoryAnalysis]:
+        # Analyze multiple repositories from URLs.
+        results = []
+
+        for i, url in enumerate(urls, 1):
+            colored_print(f"\n[{i}/{len(urls)}] Processing repository...", "cyan")
+            try:
+                result = self.analyze_repository(url)
+                results.append(result)
+            except Exception as e:
+                colored_print(f" Error analyzing {url}: {e}", "red")
+                continue
+
+        return results
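
For orientation, a minimal usage sketch of the module added above, based only on the diffed source (the repository URL is hypothetical and used purely for illustration; greenmining 1.0.4 and its PyDriller dependency are assumed to be installed):

from greenmining.services.local_repo_analyzer import LocalRepoAnalyzer

# Example parameter values; the defaults are max_commits=500, days_back=730.
analyzer = LocalRepoAnalyzer(max_commits=200, days_back=365)

# Hypothetical repository URL; analyze_repository clones, traverses commits,
# and computes process metrics before cleaning up the clone.
result = analyzer.analyze_repository("https://github.com/example/repo")

print(f"{result.name}: {result.green_commits}/{result.total_commits} commits "
      f"matched GSF patterns ({result.green_commit_rate:.1%})")

# Batch analysis and JSON-friendly serialization are also provided.
batch = analyzer.analyze_repositories(["https://github.com/example/other-repo"])
data = result.to_dict()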