greenmining 1.0.3__py3-none-any.whl → 1.0.4__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- greenmining/__init__.py +11 -29
- greenmining/__main__.py +9 -3
- greenmining/__version__.py +2 -2
- greenmining/analyzers/__init__.py +3 -7
- greenmining/analyzers/code_diff_analyzer.py +151 -61
- greenmining/analyzers/qualitative_analyzer.py +15 -81
- greenmining/analyzers/statistical_analyzer.py +8 -69
- greenmining/analyzers/temporal_analyzer.py +16 -72
- greenmining/config.py +105 -58
- greenmining/controllers/__init__.py +1 -5
- greenmining/controllers/repository_controller.py +153 -94
- greenmining/energy/__init__.py +13 -0
- greenmining/energy/base.py +165 -0
- greenmining/energy/codecarbon_meter.py +146 -0
- greenmining/energy/rapl.py +157 -0
- greenmining/gsf_patterns.py +4 -26
- greenmining/models/__init__.py +1 -5
- greenmining/models/aggregated_stats.py +4 -4
- greenmining/models/analysis_result.py +4 -4
- greenmining/models/commit.py +5 -5
- greenmining/models/repository.py +5 -5
- greenmining/presenters/__init__.py +1 -5
- greenmining/presenters/console_presenter.py +24 -24
- greenmining/services/__init__.py +10 -6
- greenmining/services/commit_extractor.py +8 -152
- greenmining/services/data_aggregator.py +45 -175
- greenmining/services/data_analyzer.py +9 -202
- greenmining/services/github_fetcher.py +212 -323
- greenmining/services/github_graphql_fetcher.py +371 -0
- greenmining/services/local_repo_analyzer.py +387 -0
- greenmining/services/reports.py +33 -137
- greenmining/utils.py +21 -149
- {greenmining-1.0.3.dist-info → greenmining-1.0.4.dist-info}/METADATA +61 -151
- greenmining-1.0.4.dist-info/RECORD +37 -0
- {greenmining-1.0.3.dist-info → greenmining-1.0.4.dist-info}/WHEEL +1 -1
- greenmining/analyzers/ml_feature_extractor.py +0 -512
- greenmining/analyzers/nlp_analyzer.py +0 -365
- greenmining/cli.py +0 -471
- greenmining/main.py +0 -37
- greenmining-1.0.3.dist-info/RECORD +0 -36
- greenmining-1.0.3.dist-info/entry_points.txt +0 -2
- {greenmining-1.0.3.dist-info → greenmining-1.0.4.dist-info}/licenses/LICENSE +0 -0
- {greenmining-1.0.3.dist-info → greenmining-1.0.4.dist-info}/top_level.txt +0 -0
greenmining/services/commit_extractor.py

@@ -1,4 +1,4 @@
-
+# Commit extractor for green microservices mining.
 
 from __future__ import annotations
 
@@ -7,7 +7,6 @@ from datetime import datetime, timedelta
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 
-import click
 from github import Github
 from tqdm import tqdm
 
@@ -24,7 +23,7 @@ from greenmining.utils import (
 
 
 class CommitExtractor:
-
+    # Extracts commit data from repositories using GitHub API.
 
     def __init__(
        self,
@@ -34,15 +33,7 @@ class CommitExtractor:
        github_token: str | None = None,
        timeout: int = 60,
    ):
-
-
-        Args:
-            max_commits: Maximum commits per repository
-            skip_merges: Skip merge commits
-            days_back: Only analyze commits from last N days
-            github_token: GitHub API token (optional)
-            timeout: Timeout in seconds per repository (default: 60)
-        """
+        # Initialize commit extractor.
        self.max_commits = max_commits
        self.skip_merges = skip_merges
        self.days_back = days_back
@@ -51,14 +42,7 @@ class CommitExtractor:
        self.timeout = timeout
 
    def extract_from_repositories(self, repositories: list[dict[str, Any] | Repository]) -> list[dict[str, Any]]:
-
-
-        Args:
-            repositories: List of repository metadata (dicts or Repository objects)
-
-        Returns:
-            List of commit data dictionaries
-        """
+        # Extract commits from list of repositories.
        all_commits = []
        failed_repos = []
 
@@ -114,14 +98,7 @@ class CommitExtractor:
 
    @retry_on_exception(max_retries=2, delay=5.0, exceptions=(Exception,))
    def _extract_repo_commits(self, repo: dict[str, Any]) -> list[dict[str, Any]]:
-
-
-        Args:
-            repo: Repository metadata (dict or Repository object)
-
-        Returns:
-            List of commit dictionaries
-        """
+        # Extract commits from a single repository using GitHub API.
        commits = []
        # Handle both Repository objects and dicts
        repo_name = repo.full_name if isinstance(repo, Repository) else repo["full_name"]
@@ -163,15 +140,7 @@ class CommitExtractor:
        return commits
 
    def _extract_commit_metadata(self, commit, repo_name: str) -> dict[str, Any]:
-
-
-        Args:
-            commit: PyDriller commit object
-            repo_name: Repository name
-
-        Returns:
-            Dictionary with commit metadata
-        """
+        # Extract metadata from commit object.
        # Get modified files
        files_changed = []
        lines_added = 0
@@ -205,15 +174,7 @@ class CommitExtractor:
        }
 
    def _extract_commit_metadata_from_github(self, commit, repo_name: str) -> dict[str, Any]:
-
-
-        Args:
-            commit: GitHub API commit object
-            repo_name: Repository name
-
-        Returns:
-            Dictionary with commit metadata
-        """
+        # Extract metadata from GitHub API commit object.
        # Get modified files and stats
        files_changed = []
        lines_added = 0
@@ -245,13 +206,7 @@ class CommitExtractor:
        }
 
    def save_results(self, commits: list[dict[str, Any]], output_file: Path, repos_count: int):
-
-
-        Args:
-            commits: List of commit data
-            output_file: Output file path
-            repos_count: Number of repositories processed
-        """
+        # Save extracted commits to JSON file.
        data = {
            "metadata": {
                "extracted_at": format_timestamp(),
@@ -267,102 +222,3 @@ class CommitExtractor:
 
        save_json_file(data, output_file)
        colored_print(f"Saved {len(commits)} commits to {output_file}", "green")
-
-
-@click.command()
-@click.option("--max-commits", default=50, help="Maximum commits per repository")
-@click.option("--skip-merges/--include-merges", default=True, help="Skip merge commits")
-@click.option("--days-back", default=730, help="Only analyze commits from last N days")
-@click.option(
-    "--repos-file", default=None, help="Input repositories file (default: data/repositories.json)"
-)
-@click.option("--output", default=None, help="Output file path (default: data/commits.json)")
-@click.option("--config-file", default=".env", help="Path to .env configuration file")
-def extract(
-    max_commits: int,
-    skip_merges: bool,
-    days_back: int,
-    repos_file: Optional[str],
-    output: Optional[str],
-    config_file: str,
-):
-    """Extract commits from fetched repositories."""
-    print_banner("Commit Data Extractor")
-
-    try:
-        # Load configuration
-        config = get_config(config_file)
-
-        # Determine input/output files
-        input_file = Path(repos_file) if repos_file else config.REPOS_FILE
-        output_file = Path(output) if output else config.COMMITS_FILE
-
-        # Check if input file exists
-        if not input_file.exists():
-            colored_print(f"Input file not found: {input_file}", "red")
-            colored_print("Please run 'fetch' command first to fetch repositories", "yellow")
-            exit(1)
-
-        # Load repositories
-        colored_print(f"Loading repositories from {input_file}...", "blue")
-        data = load_json_file(input_file)
-        repositories = data.get("repositories", [])
-
-        if not repositories:
-            colored_print("No repositories found in input file", "yellow")
-            exit(1)
-
-        colored_print(f"Loaded {len(repositories)} repositories", "green")
-
-        # Initialize extractor
-        extractor = CommitExtractor(
-            max_commits=max_commits, skip_merges=skip_merges, days_back=days_back
-        )
-
-        # Extract commits
-        commits = extractor.extract_from_repositories(repositories)
-
-        if not commits:
-            colored_print("No commits extracted", "yellow")
-            exit(1)
-
-        # Save results
-        extractor.save_results(commits, output_file, len(repositories))
-
-        # Display summary
-        colored_print(f"\n✓ Successfully extracted {len(commits)} commits", "green")
-        colored_print(f"Output saved to: {output_file}", "green")
-
-        # Calculate statistics
-        avg_commits = len(commits) / len(repositories)
-        colored_print("\nStatistics:", "cyan")
-        colored_print(f" Total repositories: {len(repositories)}", "white")
-        colored_print(f" Total commits: {len(commits)}", "white")
-        colored_print(f" Average commits per repo: {avg_commits:.1f}", "white")
-
-        # Show language breakdown
-        from collections import Counter
-
-        repo_languages = [repo["language"] for repo in repositories if repo.get("language")]
-        language_counts = Counter(repo_languages)
-
-        colored_print("\nLanguage breakdown:", "cyan")
-        for lang, count in language_counts.most_common(5):
-            colored_print(f" {lang}: {count} repos", "white")
-
-    except FileNotFoundError as e:
-        colored_print(f"File not found: {e}", "red")
-        exit(1)
-    except json.JSONDecodeError:
-        colored_print(f"Invalid JSON in input file: {input_file}", "red")
-        exit(1)
-    except Exception as e:
-        colored_print(f"Error: {e}", "red")
-        import traceback
-
-        traceback.print_exc()
-        exit(1)
-
-
-if __name__ == "__main__":
-    extract()
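With the module-level `extract` click command removed in 1.0.4, commit extraction can be driven through the `CommitExtractor` class directly. A minimal sketch, assuming only the constructor keywords and method signatures visible in the diff above; the import path, the repository entry, and the output path are illustrative:

from pathlib import Path

from greenmining.services.commit_extractor import CommitExtractor

# Repository metadata as produced by the fetch step; this entry is hypothetical.
repositories = [{"full_name": "example-org/example-service", "language": "Python"}]

# Defaults mirror the options of the removed CLI command
# (--max-commits 50, skip merges, 730 days back, 60 s timeout).
extractor = CommitExtractor(
    max_commits=50,
    skip_merges=True,
    days_back=730,
    github_token=None,  # optional GitHub API token
    timeout=60,         # seconds per repository
)

commits = extractor.extract_from_repositories(repositories)
extractor.save_results(commits, Path("data/commits.json"), repos_count=len(repositories))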
greenmining/services/data_aggregator.py

@@ -1,4 +1,4 @@
-
+# Data aggregator for green microservices analysis results.
 
 from __future__ import annotations
 
@@ -7,11 +7,10 @@ from collections import defaultdict
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 
-import click
 import pandas as pd
 
 from greenmining.analyzers import (
-
+    StatisticalAnalyzer,
    TemporalAnalyzer,
    QualitativeAnalyzer,
 )
@@ -29,27 +28,21 @@ from greenmining.utils import (
 
 
 class DataAggregator:
-
+    # Aggregates analysis results and generates statistics.
 
    def __init__(
        self,
-
+        enable_stats: bool = False,
        enable_temporal: bool = False,
        temporal_granularity: str = "quarter",
    ):
-
-
-        Args:
-            enable_enhanced_stats: Enable enhanced statistical analysis
-            enable_temporal: Enable temporal trend analysis
-            temporal_granularity: Granularity for temporal analysis (day/week/month/quarter/year)
-        """
-        self.enable_enhanced_stats = enable_enhanced_stats
+        # Initialize aggregator.
+        self.enable_stats = enable_stats
        self.enable_temporal = enable_temporal
 
-        if self.
-            self.statistical_analyzer =
-            colored_print("
+        if self.enable_stats:
+            self.statistical_analyzer = StatisticalAnalyzer()
+            colored_print("Statistical analysis enabled", "cyan")
        else:
            self.statistical_analyzer = None
 
@@ -64,15 +57,7 @@ class DataAggregator:
    def aggregate(
        self, analysis_results: list[dict[str, Any]], repositories: list[dict[str, Any]]
    ) -> dict[str, Any]:
-
-
-        Args:
-            analysis_results: List of commit analysis results
-            repositories: List of repository metadata
-
-        Returns:
-            Aggregated statistics dictionary
-        """
+        # Aggregate analysis results into summary statistics.
        colored_print("\nAggregating analysis results...", "cyan")
 
        # Summary statistics
@@ -90,15 +75,15 @@ class DataAggregator:
        # Per-language statistics
        per_language_stats = self._generate_language_stats(analysis_results, repositories)
 
-        #
-
-        if self.
+        # Statistical analysis (if enabled)
+        stats_analysis = None
+        if self.enable_stats and len(analysis_results) > 0:
            try:
-
-                colored_print("
+                stats_analysis = self._generate_statistics(analysis_results)
+                colored_print(" Statistical analysis complete", "green")
            except Exception as e:
-                colored_print(f"
-
+                colored_print(f" Statistics failed: {e}", "yellow")
+                stats_analysis = {"error": str(e)}
 
        # Temporal trend analysis (if enabled)
        temporal_analysis = None
@@ -116,9 +101,9 @@ class DataAggregator:
                ]
 
                temporal_analysis = self.temporal_analyzer.analyze_trends(commits, analysis_results)
-                colored_print("
+                colored_print(" Temporal trend analysis complete", "green")
            except Exception as e:
-                colored_print(f"
+                colored_print(f" Temporal analysis failed: {e}", "yellow")
                temporal_analysis = {"error": str(e)}
 
        result = {
@@ -129,8 +114,8 @@ class DataAggregator:
            "per_language_stats": per_language_stats,
        }
 
-        if
-            result["
+        if stats_analysis:
+            result["statistics"] = stats_analysis
 
        if temporal_analysis:
            result["temporal_analysis"] = temporal_analysis
@@ -140,7 +125,7 @@ class DataAggregator:
    def _generate_summary(
        self, results: list[dict[str, Any]], repos: list[dict[str, Any]]
    ) -> dict[str, Any]:
-
+        # Generate overall summary statistics.
        total_commits = len(results)
        green_aware_count = sum(1 for r in results if r.get("green_aware", False))
 
@@ -158,7 +143,7 @@ class DataAggregator:
        }
 
    def _analyze_known_patterns(self, results: list[dict[str, Any]]) -> list[dict[str, Any]]:
-
+        # Analyze known green software patterns.
        pattern_data = defaultdict(
            lambda: {"count": 0, "HIGH": 0, "MEDIUM": 0, "LOW": 0, "example_commits": []}
        )
@@ -209,7 +194,7 @@ class DataAggregator:
        return patterns_list
 
    def _analyze_emergent_patterns(self, results: list[dict[str, Any]]) -> list[dict[str, Any]]:
-
+        # Analyze emergent patterns (placeholder for manual review).
        emergent = []
 
        for result in results:
@@ -228,7 +213,7 @@ class DataAggregator:
    def _generate_repo_stats(
        self, results: list[dict[str, Any]], repos: list[dict[str, Any]]
    ) -> list[dict[str, Any]]:
-
+        # Generate per-repository statistics.
        repo_commits = defaultdict(list)
 
        # Group commits by repository
@@ -270,7 +255,7 @@ class DataAggregator:
    def _generate_language_stats(
        self, results: list[dict[str, Any]], repos: list[dict[str, Any]]
    ) -> list[dict[str, Any]]:
-
+        # Generate per-language statistics.
        # Create repo name to language mapping (handle both Repository objects and dicts)
        repo_language_map = {}
        for repo in repos:
@@ -306,23 +291,16 @@ class DataAggregator:
 
        return language_stats
 
-    def
-
-
-        Args:
-            results: List of commit analysis results
-
-        Returns:
-            Dictionary with enhanced statistical analysis
-        """
+    def _generate_statistics(self, results: list[dict[str, Any]]) -> dict[str, Any]:
+        # Generate statistical analysis.
        # Prepare DataFrame
        df = pd.DataFrame(results)
 
        # Ensure required columns exist
        if "date" not in df.columns or "green_aware" not in df.columns:
-            return {"error": "Missing required columns for
+            return {"error": "Missing required columns for statistics"}
 
-
+        stats_result = {}
 
        # 1. Temporal Trend Analysis
        if len(df) >= 8:  # Need at least 8 data points
@@ -330,7 +308,7 @@ class DataAggregator:
                df_copy = df.copy()
                df_copy["commit_hash"] = df_copy.get("commit_hash", df_copy.index)
                trends = self.statistical_analyzer.temporal_trend_analysis(df_copy)
-
+                stats_result["temporal_trends"] = {
                    "trend_direction": trends["trend"]["direction"],
                    "correlation": float(trends["trend"]["correlation"]),
                    "p_value": float(trends["trend"]["p_value"]),
@@ -338,20 +316,20 @@ class DataAggregator:
                    "monthly_data_points": len(trends.get("monthly_data", {})),
                }
            except Exception as e:
-
+                stats_result["temporal_trends"] = {"error": str(e)}
 
        # 2. Pattern Correlation Analysis (if pattern columns exist)
        pattern_cols = [col for col in df.columns if col.startswith("pattern_")]
        if pattern_cols and len(pattern_cols) >= 2:
            try:
                correlations = self.statistical_analyzer.analyze_pattern_correlations(df)
-
+                stats_result["pattern_correlations"] = {
                    "significant_pairs_count": len(correlations["significant_pairs"]),
                    "significant_pairs": correlations["significant_pairs"][:5],  # Top 5
                    "interpretation": correlations["interpretation"],
                }
            except Exception as e:
-
+                stats_result["pattern_correlations"] = {"error": str(e)}
 
        # 3. Effect Size Analysis by Repository
        if "repository" in df.columns:
@@ -369,7 +347,7 @@ class DataAggregator:
                effect = self.statistical_analyzer.effect_size_analysis(
                    list(group1), list(group2)
                )
-
+                stats_result["effect_size"] = {
                    "cohens_d": float(effect["cohens_d"]),
                    "magnitude": effect["magnitude"],
                    "mean_difference": float(effect["mean_difference"]),
@@ -377,10 +355,10 @@ class DataAggregator:
                    "comparison": "high_green_vs_low_green_repos",
                }
            except Exception as e:
-
+                stats_result["effect_size"] = {"error": str(e)}
 
        # 4. Basic descriptive statistics
-
+        stats_result["descriptive"] = {
            "total_commits": len(df),
            "green_commits": int(df["green_aware"].sum()),
            "green_rate_mean": float(df["green_aware"].mean()),
@@ -390,7 +368,7 @@ class DataAggregator:
            ),
        }
 
-        return
+        return stats_result
 
    def save_results(
        self,
@@ -399,14 +377,7 @@ class DataAggregator:
        csv_file: Path,
        analysis_results: list[dict[str, Any]],
    ):
-
-
-        Args:
-            aggregated_data: Aggregated statistics
-            json_file: JSON output file path
-            csv_file: CSV output file path
-            analysis_results: Original analysis results for CSV
-        """
+        # Save aggregated results to JSON and CSV files.
        # Save JSON
        save_json_file(aggregated_data, json_file)
        colored_print(f"Saved aggregated statistics to {json_file}", "green")
@@ -434,17 +405,17 @@ class DataAggregator:
        colored_print(f"Saved detailed results to {csv_file}", "green")
 
    def print_summary(self, aggregated_data: dict[str, Any]):
-
+        # Print summary to console.
        from tabulate import tabulate
 
        summary = aggregated_data["summary"]
 
        colored_print("\n" + "=" * 60, "cyan")
-        colored_print("
+        colored_print(" AGGREGATED STATISTICS SUMMARY", "cyan")
        colored_print("=" * 60, "cyan")
 
        # Overall summary
-        colored_print("\n
+        colored_print("\n Overall Statistics:", "blue")
        summary_table = [
            ["Total Commits Analyzed", format_number(summary["total_commits"])],
            [
@@ -458,7 +429,7 @@ class DataAggregator:
 
        # Top patterns
        if aggregated_data["known_patterns"]:
-            colored_print("\n
+            colored_print("\n Top Green Patterns Detected:", "blue")
            pattern_table = []
            for pattern in aggregated_data["known_patterns"][:10]:
                pattern_table.append(
@@ -479,7 +450,7 @@ class DataAggregator:
 
        # Top repositories
        if aggregated_data["per_repo_stats"]:
-            colored_print("\n
+            colored_print("\n Top 10 Greenest Repositories:", "blue")
            repo_table = []
            for repo in aggregated_data["per_repo_stats"][:10]:
                repo_table.append(
@@ -498,7 +469,7 @@ class DataAggregator:
 
        # Language breakdown
        if aggregated_data["per_language_stats"]:
-            colored_print("\n
+            colored_print("\n Language Breakdown:", "blue")
            lang_table = []
            for lang in aggregated_data["per_language_stats"]:
                lang_table.append(
@@ -512,104 +483,3 @@ class DataAggregator:
        print(
            tabulate(lang_table, headers=["Language", "Total", "Green", "%"], tablefmt="simple")
        )
-
-
-@click.command()
-@click.option(
-    "--analysis-file",
-    default=None,
-    help="Input analysis file (default: data/analysis_results.json)",
-)
-@click.option(
-    "--repos-file", default=None, help="Input repositories file (default: data/repositories.json)"
-)
-@click.option(
-    "--output-json",
-    default=None,
-    help="Output JSON file (default: data/aggregated_statistics.json)",
-)
-@click.option(
-    "--output-csv", default=None, help="Output CSV file (default: data/green_analysis_results.csv)"
-)
-@click.option("--config-file", default=".env", help="Path to .env configuration file")
-def aggregate(
-    analysis_file: Optional[str],
-    repos_file: Optional[str],
-    output_json: Optional[str],
-    output_csv: Optional[str],
-    config_file: str,
-):
-    """Aggregate analysis results and generate statistics."""
-    print_banner("Data Aggregator")
-
-    try:
-        # Load configuration
-        config = get_config(config_file)
-
-        # Determine input/output files
-        analysis_input = Path(analysis_file) if analysis_file else config.ANALYSIS_FILE
-        repos_input = Path(repos_file) if repos_file else config.REPOS_FILE
-        json_output = Path(output_json) if output_json else config.AGGREGATED_FILE
-        csv_output = Path(output_csv) if output_csv else config.CSV_FILE
-
-        # Check if input files exist
-        if not analysis_input.exists():
-            colored_print(f"Analysis file not found: {analysis_input}", "red")
-            colored_print("Please run 'analyze' command first", "yellow")
-            exit(1)
-
-        if not repos_input.exists():
-            colored_print(f"Repositories file not found: {repos_input}", "red")
-            colored_print("Please run 'fetch' command first", "yellow")
-            exit(1)
-
-        # Load data
-        colored_print(f"Loading analysis results from {analysis_input}...", "blue")
-        analysis_data = load_json_file(analysis_input)
-        analysis_results = analysis_data.get("results", [])
-
-        colored_print(f"Loading repositories from {repos_input}...", "blue")
-        repos_data = load_json_file(repos_input)
-        repositories = repos_data.get("repositories", [])
-
-        if not analysis_results:
-            colored_print("No analysis results found", "yellow")
-            exit(1)
-
-        colored_print(
-            f"Loaded {len(analysis_results)} analysis results and {len(repositories)} repositories",
-            "green",
-        )
-
-        # Initialize aggregator
-        aggregator = DataAggregator()
-
-        # Aggregate data
-        aggregated_data = aggregator.aggregate(analysis_results, repositories)
-
-        # Save results
-        aggregator.save_results(aggregated_data, json_output, csv_output, analysis_results)
-
-        # Print summary
-        aggregator.print_summary(aggregated_data)
-
-        colored_print("\n✓ Aggregation complete!", "green")
-        colored_print(f"JSON output: {json_output}", "green")
-        colored_print(f"CSV output: {csv_output}", "green")
-
-    except FileNotFoundError as e:
-        colored_print(f"File not found: {e}", "red")
-        exit(1)
-    except json.JSONDecodeError as e:
-        colored_print(f"Invalid JSON: {e}", "red")
-        exit(1)
-    except Exception as e:
-        colored_print(f"Error: {e}", "red")
-        import traceback
-
-        traceback.print_exc()
-        exit(1)
-
-
-if __name__ == "__main__":
-    aggregate()
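Similarly, the standalone `aggregate` command is gone in 1.0.4, and `enable_enhanced_stats` is renamed to `enable_stats`. A minimal sketch of driving `DataAggregator` programmatically, assuming only the constructor flags and method signatures shown in the diff above; the JSON loading and file paths stand in for the old `load_json_file`/config defaults and are illustrative:

import json
from pathlib import Path

from greenmining.services.data_aggregator import DataAggregator

# Inputs produced by earlier pipeline steps (paths are the old CLI defaults).
analysis_results = json.loads(Path("data/analysis_results.json").read_text()).get("results", [])
repositories = json.loads(Path("data/repositories.json").read_text()).get("repositories", [])

aggregator = DataAggregator(
    enable_stats=True,               # renamed from enable_enhanced_stats in 1.0.3
    enable_temporal=True,
    temporal_granularity="quarter",  # default granularity
)

aggregated = aggregator.aggregate(analysis_results, repositories)
aggregator.save_results(
    aggregated,
    Path("data/aggregated_statistics.json"),
    Path("data/green_analysis_results.csv"),
    analysis_results,
)
aggregator.print_summary(aggregated)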