greenmining 1.0.2-py3-none-any.whl → 1.0.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. greenmining/__init__.py +11 -29
  2. greenmining/__main__.py +9 -3
  3. greenmining/__version__.py +2 -2
  4. greenmining/analyzers/__init__.py +3 -7
  5. greenmining/analyzers/code_diff_analyzer.py +151 -61
  6. greenmining/analyzers/qualitative_analyzer.py +15 -81
  7. greenmining/analyzers/statistical_analyzer.py +8 -69
  8. greenmining/analyzers/temporal_analyzer.py +16 -72
  9. greenmining/config.py +105 -58
  10. greenmining/controllers/__init__.py +1 -5
  11. greenmining/controllers/repository_controller.py +153 -94
  12. greenmining/energy/__init__.py +13 -0
  13. greenmining/energy/base.py +165 -0
  14. greenmining/energy/codecarbon_meter.py +146 -0
  15. greenmining/energy/rapl.py +157 -0
  16. greenmining/gsf_patterns.py +4 -26
  17. greenmining/models/__init__.py +1 -5
  18. greenmining/models/aggregated_stats.py +4 -4
  19. greenmining/models/analysis_result.py +4 -4
  20. greenmining/models/commit.py +5 -5
  21. greenmining/models/repository.py +5 -5
  22. greenmining/presenters/__init__.py +1 -5
  23. greenmining/presenters/console_presenter.py +24 -24
  24. greenmining/services/__init__.py +10 -6
  25. greenmining/services/commit_extractor.py +8 -152
  26. greenmining/services/data_aggregator.py +45 -175
  27. greenmining/services/data_analyzer.py +9 -202
  28. greenmining/services/github_fetcher.py +212 -323
  29. greenmining/services/github_graphql_fetcher.py +371 -0
  30. greenmining/services/local_repo_analyzer.py +387 -0
  31. greenmining/services/reports.py +33 -137
  32. greenmining/utils.py +21 -149
  33. {greenmining-1.0.2.dist-info → greenmining-1.0.4.dist-info}/METADATA +169 -146
  34. greenmining-1.0.4.dist-info/RECORD +37 -0
  35. {greenmining-1.0.2.dist-info → greenmining-1.0.4.dist-info}/WHEEL +1 -1
  36. greenmining/analyzers/ml_feature_extractor.py +0 -512
  37. greenmining/analyzers/nlp_analyzer.py +0 -365
  38. greenmining/cli.py +0 -471
  39. greenmining/main.py +0 -37
  40. greenmining-1.0.2.dist-info/RECORD +0 -36
  41. greenmining-1.0.2.dist-info/entry_points.txt +0 -2
  42. {greenmining-1.0.2.dist-info → greenmining-1.0.4.dist-info}/licenses/LICENSE +0 -0
  43. {greenmining-1.0.2.dist-info → greenmining-1.0.4.dist-info}/top_level.txt +0 -0
greenmining/services/local_repo_analyzer.py (new file)
@@ -0,0 +1,387 @@
+# Local repository analyzer for direct GitHub URL analysis using PyDriller.
+
+from __future__ import annotations
+
+import os
+import re
+import shutil
+import tempfile
+from dataclasses import dataclass, field
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Generator
+
+from pydriller import Repository
+from pydriller.metrics.process.change_set import ChangeSet
+from pydriller.metrics.process.code_churn import CodeChurn
+from pydriller.metrics.process.commits_count import CommitsCount
+from pydriller.metrics.process.contributors_count import ContributorsCount
+from pydriller.metrics.process.contributors_experience import ContributorsExperience
+from pydriller.metrics.process.history_complexity import HistoryComplexity
+from pydriller.metrics.process.hunks_count import HunksCount
+from pydriller.metrics.process.lines_count import LinesCount
+
+from greenmining.gsf_patterns import get_pattern_by_keywords, is_green_aware, GSF_PATTERNS
+from greenmining.utils import colored_print
+
+
+@dataclass
+class CommitAnalysis:
+    # Analysis result for a single commit.
+
+    hash: str
+    message: str
+    author: str
+    author_email: str
+    date: datetime
+    green_aware: bool
+    gsf_patterns_matched: List[str]
+    pattern_count: int
+    pattern_details: List[Dict[str, Any]]
+    confidence: str
+    files_modified: List[str]
+    insertions: int
+    deletions: int
+
+    # PyDriller DMM metrics
+    dmm_unit_size: Optional[float] = None
+    dmm_unit_complexity: Optional[float] = None
+    dmm_unit_interfacing: Optional[float] = None
+
+    # Structural metrics (Lizard)
+    total_nloc: int = 0
+    total_complexity: int = 0
+    max_complexity: int = 0
+    methods_count: int = 0
+
+    def to_dict(self) -> Dict[str, Any]:
+        # Convert to dictionary.
+        return {
+            "commit_hash": self.hash,
+            "message": self.message,
+            "author": self.author,
+            "author_email": self.author_email,
+            "date": self.date.isoformat() if self.date else None,
+            "green_aware": self.green_aware,
+            "gsf_patterns_matched": self.gsf_patterns_matched,
+            "pattern_count": self.pattern_count,
+            "pattern_details": self.pattern_details,
+            "confidence": self.confidence,
+            "files_modified": self.files_modified,
+            "insertions": self.insertions,
+            "deletions": self.deletions,
+            "dmm_unit_size": self.dmm_unit_size,
+            "dmm_unit_complexity": self.dmm_unit_complexity,
+            "dmm_unit_interfacing": self.dmm_unit_interfacing,
+            "total_nloc": self.total_nloc,
+            "total_complexity": self.total_complexity,
+            "max_complexity": self.max_complexity,
+            "methods_count": self.methods_count,
+        }
+
+
+@dataclass
+class RepositoryAnalysis:
+    # Complete analysis result for a repository.
+
+    url: str
+    name: str
+    total_commits: int
+    green_commits: int
+    green_commit_rate: float
+    commits: List[CommitAnalysis] = field(default_factory=list)
+    process_metrics: Dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        # Convert to dictionary.
+        return {
+            "url": self.url,
+            "name": self.name,
+            "total_commits": self.total_commits,
+            "green_commits": self.green_commits,
+            "green_commit_rate": self.green_commit_rate,
+            "commits": [c.to_dict() for c in self.commits],
+            "process_metrics": self.process_metrics,
+        }
+
+
+class LocalRepoAnalyzer:
+    # Analyze repositories directly from GitHub URLs using PyDriller.
+
+    def __init__(
+        self,
+        clone_path: Optional[Path] = None,
+        max_commits: int = 500,
+        days_back: int = 730,
+        skip_merges: bool = True,
+        compute_process_metrics: bool = True,
+        cleanup_after: bool = True,
+    ):
+        # Initialize the local repository analyzer.
+        self.clone_path = clone_path or Path(tempfile.gettempdir()) / "greenmining_repos"
+        self.clone_path.mkdir(parents=True, exist_ok=True)
+        self.max_commits = max_commits
+        self.days_back = days_back
+        self.skip_merges = skip_merges
+        self.compute_process_metrics = compute_process_metrics
+        self.cleanup_after = cleanup_after
+        self.gsf_patterns = GSF_PATTERNS
+
+    def _parse_repo_url(self, url: str) -> tuple[str, str]:
+        # Parse repository URL to extract owner and name.
+        # Handle HTTPS URLs
+        https_pattern = r"github\.com[/:]([^/]+)/([^/\.]+)"
+        match = re.search(https_pattern, url)
+        if match:
+            return match.group(1), match.group(2).replace(".git", "")
+
+        # Handle SSH URLs
+        ssh_pattern = r"git@github\.com:([^/]+)/([^/\.]+)"
+        match = re.search(ssh_pattern, url)
+        if match:
+            return match.group(1), match.group(2).replace(".git", "")
+
+        raise ValueError(f"Could not parse GitHub URL: {url}")
+
+    def _get_pattern_details(self, matched_patterns: List[str]) -> List[Dict[str, Any]]:
+        # Get detailed pattern information.
+        details = []
+        for pattern_id, pattern in self.gsf_patterns.items():
+            if pattern["name"] in matched_patterns:
+                details.append({
+                    "name": pattern["name"],
+                    "category": pattern["category"],
+                    "description": pattern["description"],
+                    "sci_impact": pattern["sci_impact"],
+                })
+        return details
+
+    def analyze_commit(self, commit) -> CommitAnalysis:
+        # Analyze a single PyDriller commit object.
+        message = commit.msg or ""
+
+        # Green awareness check
+        green_aware = is_green_aware(message)
+
+        # GSF pattern matching
+        matched_patterns = get_pattern_by_keywords(message)
+        pattern_details = self._get_pattern_details(matched_patterns)
+
+        # Confidence calculation
+        pattern_count = len(matched_patterns)
+        confidence = "high" if pattern_count >= 2 else "medium" if pattern_count == 1 else "low"
+
+        # File modifications
+        files_modified = [mod.filename for mod in commit.modified_files]
+        insertions = sum(mod.added_lines for mod in commit.modified_files)
+        deletions = sum(mod.deleted_lines for mod in commit.modified_files)
+
+        # Delta Maintainability Model (if available)
+        dmm_unit_size = None
+        dmm_unit_complexity = None
+        dmm_unit_interfacing = None
+
+        try:
+            dmm_unit_size = commit.dmm_unit_size
+            dmm_unit_complexity = commit.dmm_unit_complexity
+            dmm_unit_interfacing = commit.dmm_unit_interfacing
+        except Exception:
+            pass  # DMM may not be available for all commits
+
+        # Structural metrics from Lizard (via PyDriller)
+        total_nloc = 0
+        total_complexity = 0
+        max_complexity = 0
+        methods_count = 0
+
+        try:
+            for mod in commit.modified_files:
+                if mod.nloc:
+                    total_nloc += mod.nloc
+                if mod.complexity:
+                    total_complexity += mod.complexity
+                    if mod.complexity > max_complexity:
+                        max_complexity = mod.complexity
+                if mod.methods:
+                    methods_count += len(mod.methods)
+        except Exception:
+            pass  # Structural metrics may fail for some files
+
+        return CommitAnalysis(
+            hash=commit.hash,
+            message=message,
+            author=commit.author.name,
+            author_email=commit.author.email,
+            date=commit.author_date,
+            green_aware=green_aware,
+            gsf_patterns_matched=matched_patterns,
+            pattern_count=pattern_count,
+            pattern_details=pattern_details,
+            confidence=confidence,
+            files_modified=files_modified,
+            insertions=insertions,
+            deletions=deletions,
+            dmm_unit_size=dmm_unit_size,
+            dmm_unit_complexity=dmm_unit_complexity,
+            dmm_unit_interfacing=dmm_unit_interfacing,
+            total_nloc=total_nloc,
+            total_complexity=total_complexity,
+            max_complexity=max_complexity,
+            methods_count=methods_count,
+        )
+
+    def analyze_repository(self, url: str) -> RepositoryAnalysis:
+        # Analyze a repository from its URL.
+        owner, repo_name = self._parse_repo_url(url)
+        full_name = f"{owner}/{repo_name}"
+
+        colored_print(f"\n Analyzing repository: {full_name}", "cyan")
+
+        # Calculate date range
+        since_date = datetime.now() - timedelta(days=self.days_back)
+
+        # Configure PyDriller Repository
+        repo_config = {
+            "path_to_repo": url,
+            "since": since_date,
+            "only_no_merge": self.skip_merges,
+        }
+
+        # Clone to specific path if needed
+        local_path = self.clone_path / repo_name
+        if local_path.exists():
+            shutil.rmtree(local_path)
+
+        repo_config["clone_repo_to"] = str(self.clone_path)
+
+        colored_print(f" Cloning to: {local_path}", "cyan")
+
+        commits_analyzed = []
+        commit_count = 0
+
+        try:
+            for commit in Repository(**repo_config).traverse_commits():
+                if commit_count >= self.max_commits:
+                    break
+
+                try:
+                    analysis = self.analyze_commit(commit)
+                    commits_analyzed.append(analysis)
+                    commit_count += 1
+
+                    if commit_count % 50 == 0:
+                        colored_print(f" Processed {commit_count} commits...", "cyan")
+
+                except Exception as e:
+                    colored_print(f" Warning: Error analyzing commit {commit.hash[:8]}: {e}", "yellow")
+                    continue
+
+            colored_print(f" Analyzed {len(commits_analyzed)} commits", "green")
+
+            # Compute process metrics if enabled
+            process_metrics = {}
+            if self.compute_process_metrics and local_path.exists():
+                colored_print(" Computing process metrics...", "cyan")
+                process_metrics = self._compute_process_metrics(str(local_path))
+
+            # Calculate summary
+            green_commits = sum(1 for c in commits_analyzed if c.green_aware)
+            green_rate = green_commits / len(commits_analyzed) if commits_analyzed else 0
+
+            result = RepositoryAnalysis(
+                url=url,
+                name=full_name,
+                total_commits=len(commits_analyzed),
+                green_commits=green_commits,
+                green_commit_rate=green_rate,
+                commits=commits_analyzed,
+                process_metrics=process_metrics,
+            )
+
+            return result
+
+        finally:
+            # Cleanup if requested
+            if self.cleanup_after and local_path.exists():
+                colored_print(f" Cleaning up: {local_path}", "cyan")
+                shutil.rmtree(local_path, ignore_errors=True)
+
+    def _compute_process_metrics(self, repo_path: str) -> Dict[str, Any]:
+        # Compute PyDriller process metrics for the repository.
+        metrics = {}
+        since_date = datetime.now() - timedelta(days=self.days_back)
+        to_date = datetime.now()
+
+        try:
+            # ChangeSet metrics
+            cs = ChangeSet(repo_path, since=since_date, to=to_date)
+            metrics["change_set_max"] = cs.max()
+            metrics["change_set_avg"] = cs.avg()
+        except Exception as e:
+            colored_print(f" Warning: ChangeSet metrics failed: {e}", "yellow")
+
+        try:
+            # CodeChurn metrics
+            churn = CodeChurn(repo_path, since=since_date, to=to_date)
+            metrics["code_churn"] = churn.count()
+        except Exception as e:
+            colored_print(f" Warning: CodeChurn metrics failed: {e}", "yellow")
+
+        try:
+            # CommitsCount metrics
+            cc = CommitsCount(repo_path, since=since_date, to=to_date)
+            metrics["commits_per_file"] = cc.count()
+        except Exception as e:
+            colored_print(f" Warning: CommitsCount metrics failed: {e}", "yellow")
+
+        try:
+            # ContributorsCount metrics
+            contrib = ContributorsCount(repo_path, since=since_date, to=to_date)
+            metrics["contributors_per_file"] = contrib.count()
+        except Exception as e:
+            colored_print(f" Warning: ContributorsCount metrics failed: {e}", "yellow")
+
+        try:
+            # ContributorsExperience metrics
+            exp = ContributorsExperience(repo_path, since=since_date, to=to_date)
+            metrics["contributors_experience"] = exp.count()
+        except Exception as e:
+            colored_print(f" Warning: ContributorsExperience metrics failed: {e}", "yellow")
+
+        try:
+            # HistoryComplexity metrics
+            hc = HistoryComplexity(repo_path, since=since_date, to=to_date)
+            metrics["history_complexity"] = hc.count()
+        except Exception as e:
+            colored_print(f" Warning: HistoryComplexity metrics failed: {e}", "yellow")
+
+        try:
+            # HunksCount metrics
+            hunks = HunksCount(repo_path, since=since_date, to=to_date)
+            metrics["hunks_count"] = hunks.count()
+        except Exception as e:
+            colored_print(f" Warning: HunksCount metrics failed: {e}", "yellow")
+
+        try:
+            # LinesCount metrics
+            lines = LinesCount(repo_path, since=since_date, to=to_date)
+            metrics["lines_count"] = lines.count()
+        except Exception as e:
+            colored_print(f" Warning: LinesCount metrics failed: {e}", "yellow")
+
+        return metrics
+
+    def analyze_repositories(self, urls: List[str]) -> List[RepositoryAnalysis]:
+        # Analyze multiple repositories from URLs.
+        results = []
+
+        for i, url in enumerate(urls, 1):
+            colored_print(f"\n[{i}/{len(urls)}] Processing repository...", "cyan")
+            try:
+                result = self.analyze_repository(url)
+                results.append(result)
+            except Exception as e:
+                colored_print(f" Error analyzing {url}: {e}", "red")
+                continue
+
+        return results
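
For orientation, below is a minimal usage sketch of the new local_repo_analyzer module added in this release. It assumes only the public surface visible in the diff above (LocalRepoAnalyzer, analyze_repository, analyze_repositories, and the dataclass to_dict methods); the repository URL and parameter values are illustrative placeholders, not anything shipped in the package.

# Minimal usage sketch (illustrative only; not part of the packaged code).
# Constructor arguments mirror the defaults shown in the diff; the URL is a placeholder.
from greenmining.services.local_repo_analyzer import LocalRepoAnalyzer

analyzer = LocalRepoAnalyzer(
    max_commits=200,              # cap traversal below the 500-commit default
    days_back=365,                # one year of history instead of the default two
    skip_merges=True,             # exclude merge commits, as in the default
    compute_process_metrics=True, # also run the PyDriller process metrics
    cleanup_after=True,           # delete the temporary clone when done
)

result = analyzer.analyze_repository("https://github.com/OWNER/REPO")
print(
    f"{result.name}: {result.green_commits}/{result.total_commits} "
    f"green-aware commits ({result.green_commit_rate:.1%})"
)

# Both result dataclasses serialize to plain dicts for JSON export or aggregation.
records = [commit.to_dict() for commit in result.commits]

Since analyze_repository clones via PyDriller's clone_repo_to and cleans up in a finally block when cleanup_after is set, repeated runs against the same URL start from a fresh clone rather than reusing a stale working copy.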