greenmining-0.1.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,442 @@
+ """Data aggregator for green microservices analysis results."""
+
+ import json
+ import sys
+ from collections import defaultdict
+ from pathlib import Path
+ from typing import Any, Optional
+
+ import click
+ import pandas as pd
+
+ from greenmining.config import get_config
+ from greenmining.utils import (
+     colored_print,
+     format_number,
+     format_percentage,
+     load_json_file,
+     print_banner,
+     save_csv_file,
+     save_json_file,
+ )
+
+
+ class DataAggregator:
+     """Aggregates analysis results and generates statistics."""
+
+     def __init__(self):
+         """Initialize aggregator."""
+
+     def aggregate(
+         self, analysis_results: list[dict[str, Any]], repositories: list[dict[str, Any]]
+     ) -> dict[str, Any]:
+         """Aggregate analysis results into summary statistics.
+
+         Args:
+             analysis_results: List of commit analysis results
+             repositories: List of repository metadata
+
+         Returns:
+             Aggregated statistics dictionary
+         """
+         colored_print("\nAggregating analysis results...", "cyan")
+
+         # Summary statistics
+         summary = self._generate_summary(analysis_results, repositories)
+
+         # Known patterns analysis
+         known_patterns = self._analyze_known_patterns(analysis_results)
+
+         # Emergent patterns (placeholder)
+         emergent_patterns = self._analyze_emergent_patterns(analysis_results)
+
+         # Per-repository statistics
+         per_repo_stats = self._generate_repo_stats(analysis_results, repositories)
+
+         # Per-language statistics
+         per_language_stats = self._generate_language_stats(analysis_results, repositories)
+
+         return {
+             "summary": summary,
+             "known_patterns": known_patterns,
+             "emergent_patterns": emergent_patterns,
+             "per_repo_stats": per_repo_stats,
+             "per_language_stats": per_language_stats,
+         }
+
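For orientation, here is a minimal sketch of driving the aggregator directly. The two records and the import path `greenmining.aggregator` are hypothetical; the field names, however, match the keys the methods in this class read:

    from greenmining.aggregator import DataAggregator  # module path assumed

    results = [
        {"commit_id": "abc123", "repo_name": "org/service-a", "green_aware": True,
         "known_pattern": "Caching", "pattern_confidence": "HIGH"},
        {"commit_id": "def456", "repo_name": "org/service-a", "green_aware": False,
         "known_pattern": "NONE DETECTED", "pattern_confidence": "NONE"},
    ]
    repos = [{"full_name": "org/service-a", "language": "Python"}]

    stats = DataAggregator().aggregate(results, repos)
    assert stats["summary"]["green_aware_percentage"] == 50.0  # 1 of 2 commits
    assert stats["summary"]["repos_with_green_commits"] == 1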
+     def _generate_summary(
+         self, results: list[dict[str, Any]], repos: list[dict[str, Any]]
+     ) -> dict[str, Any]:
+         """Generate overall summary statistics."""
+         total_commits = len(results)
+         green_aware_count = sum(1 for r in results if r.get("green_aware", False))
+
+         # Count repos with at least one green commit
+         repos_with_green = len({r["repo_name"] for r in results if r.get("green_aware", False)})
+
+         return {
+             "total_commits": total_commits,
+             "green_aware_count": green_aware_count,
+             "green_aware_percentage": (
+                 round(green_aware_count / total_commits * 100, 2) if total_commits > 0 else 0
+             ),
+             "repos_with_green_commits": repos_with_green,
+             "total_repos": len(repos),
+         }
+
+     def _analyze_known_patterns(self, results: list[dict[str, Any]]) -> list[dict[str, Any]]:
+         """Analyze known green software patterns."""
+         pattern_data = defaultdict(
+             lambda: {"count": 0, "HIGH": 0, "MEDIUM": 0, "LOW": 0, "example_commits": []}
+         )
+
+         for result in results:
+             pattern = result.get("known_pattern")
+             confidence = result.get("pattern_confidence", "NONE")
+
+             if pattern and pattern != "NONE DETECTED":
+                 pattern_data[pattern]["count"] += 1
+                 if confidence in ["HIGH", "MEDIUM", "LOW"]:
+                     pattern_data[pattern][confidence] += 1
+
+                 # Store example commits (max 3)
+                 if len(pattern_data[pattern]["example_commits"]) < 3:
+                     pattern_data[pattern]["example_commits"].append(result["commit_id"])
+
+         # Convert to list format
+         patterns_list = []
+         total_patterns = sum(p["count"] for p in pattern_data.values())
+
+         for pattern_name, data in sorted(
+             pattern_data.items(), key=lambda x: x[1]["count"], reverse=True
+         ):
+             patterns_list.append(
+                 {
+                     "pattern_name": pattern_name,
+                     "count": data["count"],
+                     "percentage": (
+                         round(data["count"] / total_patterns * 100, 1) if total_patterns > 0 else 0
+                     ),
+                     "confidence_breakdown": {
+                         "HIGH": data["HIGH"],
+                         "MEDIUM": data["MEDIUM"],
+                         "LOW": data["LOW"],
+                     },
+                     "example_commits": data["example_commits"],
+                 }
+             )
+
+         return patterns_list
+
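With the two hypothetical records from the sketch above, `_analyze_known_patterns` would return a single entry; the shape follows directly from the loop:

    [
        {
            "pattern_name": "Caching",
            "count": 1,
            "percentage": 100.0,
            "confidence_breakdown": {"HIGH": 1, "MEDIUM": 0, "LOW": 0},
            "example_commits": ["abc123"],
        }
    ]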
+     def _analyze_emergent_patterns(self, results: list[dict[str, Any]]) -> list[dict[str, Any]]:
+         """Analyze emergent patterns (placeholder for manual review)."""
+         emergent = []
+
+         for result in results:
+             if result.get("emergent_pattern") and result["emergent_pattern"] != "NONE":
+                 emergent.append(
+                     {
+                         "pattern_name": "Novel pattern detected",
+                         "count": 1,
+                         "description": result["emergent_pattern"],
+                         "example_commits": [result["commit_id"]],
+                     }
+                 )
+
+         return emergent
+
+     def _generate_repo_stats(
+         self, results: list[dict[str, Any]], repos: list[dict[str, Any]]
+     ) -> list[dict[str, Any]]:
+         """Generate per-repository statistics."""
+         repo_commits = defaultdict(list)
+
+         # Group commits by repository
+         for result in results:
+             repo_commits[result["repo_name"]].append(result)
+
+         # Calculate stats for each repo
+         repo_stats = []
+         for repo_name, commits in repo_commits.items():
+             green_commits = [c for c in commits if c.get("green_aware", False)]
+             # Skip commits with no pattern key as well as explicit "NONE DETECTED",
+             # so None never ends up in the pattern list
+             patterns = [
+                 c["known_pattern"]
+                 for c in commits
+                 if c.get("known_pattern") and c["known_pattern"] != "NONE DETECTED"
+             ]
+             unique_patterns = list(set(patterns))
+
+             repo_stats.append(
+                 {
+                     "repo_name": repo_name,
+                     "total_commits": len(commits),
+                     "green_commits": len(green_commits),
+                     "percentage": (
+                         round(len(green_commits) / len(commits) * 100, 1) if commits else 0
+                     ),
+                     "patterns": unique_patterns,
+                 }
+             )
+
+         # Sort by percentage descending
+         repo_stats.sort(key=lambda x: x["percentage"], reverse=True)
+
+         return repo_stats
+
+     def _generate_language_stats(
+         self, results: list[dict[str, Any]], repos: list[dict[str, Any]]
+     ) -> list[dict[str, Any]]:
+         """Generate per-language statistics."""
+         # Map repo name to primary language; "or" also covers an explicit null language
+         repo_language_map = {repo["full_name"]: repo.get("language") or "Unknown" for repo in repos}
+
+         # Group commits by language
+         language_commits = defaultdict(list)
+         for result in results:
+             language = repo_language_map.get(result["repo_name"], "Unknown")
+             language_commits[language].append(result)
+
+         # Calculate stats for each language
+         language_stats = []
+         for language, commits in language_commits.items():
+             green_commits = [c for c in commits if c.get("green_aware", False)]
+
+             language_stats.append(
+                 {
+                     "language": language,
+                     "total_commits": len(commits),
+                     "green_commits": len(green_commits),
+                     "percentage": (
+                         round(len(green_commits) / len(commits) * 100, 1) if commits else 0
+                     ),
+                 }
+             )
+
+         # Sort by total commits descending
+         language_stats.sort(key=lambda x: x["total_commits"], reverse=True)
+
+         return language_stats
+
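Continuing the sketch: `_generate_language_stats` joins commits to repository metadata through `full_name`, so both hypothetical commits land under Python:

    [{"language": "Python", "total_commits": 2, "green_commits": 1, "percentage": 50.0}]

A commit whose `repo_name` has no matching repository entry is grouped under "Unknown" rather than dropped.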
+     def save_results(
+         self,
+         aggregated_data: dict[str, Any],
+         json_file: Path,
+         csv_file: Path,
+         analysis_results: list[dict[str, Any]],
+     ):
+         """Save aggregated results to JSON and CSV files.
+
+         Args:
+             aggregated_data: Aggregated statistics
+             json_file: JSON output file path
+             csv_file: CSV output file path
+             analysis_results: Original analysis results for CSV
+         """
+         # Save JSON
+         save_json_file(aggregated_data, json_file)
+         colored_print(f"Saved aggregated statistics to {json_file}", "green")
+
+         # Create CSV with one row per commit
+         csv_data = []
+         for result in analysis_results:
+             csv_data.append(
+                 {
+                     "commit_id": result["commit_id"],
+                     "repo_name": result["repo_name"],
+                     "date": result.get("date", ""),
+                     "commit_message": result.get("commit_message", "")[:200],  # Truncate
+                     "green_aware": result.get("green_aware", False),
+                     "green_evidence": (
+                         result.get("green_evidence", "")[:200]
+                         if result.get("green_evidence")
+                         else ""
+                     ),
+                     "known_pattern": result.get("known_pattern", ""),
+                     "pattern_confidence": result.get("pattern_confidence", ""),
+                     "lines_added": result.get("lines_added", 0),
+                     "lines_deleted": result.get("lines_deleted", 0),
+                 }
+             )
+
+         df = pd.DataFrame(csv_data)
+         save_csv_file(df, csv_file)
+         colored_print(f"Saved detailed results to {csv_file}", "green")
+
+     def print_summary(self, aggregated_data: dict[str, Any]):
+         """Print summary to console."""
+         from tabulate import tabulate
+
+         summary = aggregated_data["summary"]
+
+         colored_print("\n" + "=" * 60, "cyan")
+         colored_print("📊 AGGREGATED STATISTICS SUMMARY", "cyan")
+         colored_print("=" * 60, "cyan")
+
+         # Overall summary
+         colored_print("\n📈 Overall Statistics:", "blue")
+         summary_table = [
+             ["Total Commits Analyzed", format_number(summary["total_commits"])],
+             [
+                 "Green-Aware Commits",
+                 f"{format_number(summary['green_aware_count'])} ({format_percentage(summary['green_aware_percentage'])})",
+             ],
+             ["Total Repositories", format_number(summary["total_repos"])],
+             ["Repos with Green Commits", format_number(summary["repos_with_green_commits"])],
+         ]
+         print(tabulate(summary_table, tablefmt="simple"))
+
+         # Top patterns
+         if aggregated_data["known_patterns"]:
+             colored_print("\n🎯 Top Green Patterns Detected:", "blue")
+             pattern_table = []
+             for pattern in aggregated_data["known_patterns"][:10]:
+                 pattern_table.append(
+                     [
+                         pattern["pattern_name"],
+                         format_number(pattern["count"]),
+                         format_percentage(pattern["percentage"]),
+                         f"H:{pattern['confidence_breakdown']['HIGH']} M:{pattern['confidence_breakdown']['MEDIUM']} L:{pattern['confidence_breakdown']['LOW']}",
+                     ]
+                 )
+             print(
+                 tabulate(
+                     pattern_table,
+                     headers=["Pattern", "Count", "%", "Confidence"],
+                     tablefmt="simple",
+                 )
+             )
+
+         # Top repositories
+         if aggregated_data["per_repo_stats"]:
+             colored_print("\n🏆 Top 10 Greenest Repositories:", "blue")
+             repo_table = []
+             for repo in aggregated_data["per_repo_stats"][:10]:
+                 repo_table.append(
+                     [
+                         repo["repo_name"][:50],
+                         format_number(repo["total_commits"]),
+                         format_number(repo["green_commits"]),
+                         format_percentage(repo["percentage"]),
+                     ]
+                 )
+             print(
+                 tabulate(
+                     repo_table, headers=["Repository", "Total", "Green", "%"], tablefmt="simple"
+                 )
+             )
+
+         # Language breakdown
+         if aggregated_data["per_language_stats"]:
+             colored_print("\n💻 Language Breakdown:", "blue")
+             lang_table = []
+             for lang in aggregated_data["per_language_stats"]:
+                 lang_table.append(
+                     [
+                         lang["language"],
+                         format_number(lang["total_commits"]),
+                         format_number(lang["green_commits"]),
+                         format_percentage(lang["percentage"]),
+                     ]
+                 )
+             print(
+                 tabulate(lang_table, headers=["Language", "Total", "Green", "%"], tablefmt="simple")
+             )
+
+
+ @click.command()
+ @click.option(
+     "--analysis-file",
+     default=None,
+     help="Input analysis file (default: data/analysis_results.json)",
+ )
+ @click.option(
+     "--repos-file", default=None, help="Input repositories file (default: data/repositories.json)"
+ )
+ @click.option(
+     "--output-json",
+     default=None,
+     help="Output JSON file (default: data/aggregated_statistics.json)",
+ )
+ @click.option(
+     "--output-csv", default=None, help="Output CSV file (default: data/green_analysis_results.csv)"
+ )
+ @click.option("--config-file", default=".env", help="Path to .env configuration file")
+ def aggregate(
+     analysis_file: Optional[str],
+     repos_file: Optional[str],
+     output_json: Optional[str],
+     output_csv: Optional[str],
+     config_file: str,
+ ):
+     """Aggregate analysis results and generate statistics."""
+     print_banner("Data Aggregator")
+
+     try:
+         # Load configuration
+         config = get_config(config_file)
+
+         # Determine input/output files
+         analysis_input = Path(analysis_file) if analysis_file else config.ANALYSIS_FILE
+         repos_input = Path(repos_file) if repos_file else config.REPOS_FILE
+         json_output = Path(output_json) if output_json else config.AGGREGATED_FILE
+         csv_output = Path(output_csv) if output_csv else config.CSV_FILE
+
+         # Check if input files exist
+         if not analysis_input.exists():
+             colored_print(f"Analysis file not found: {analysis_input}", "red")
+             colored_print("Please run 'analyze' command first", "yellow")
+             sys.exit(1)
+
+         if not repos_input.exists():
+             colored_print(f"Repositories file not found: {repos_input}", "red")
+             colored_print("Please run 'fetch' command first", "yellow")
+             sys.exit(1)
+
+         # Load data
+         colored_print(f"Loading analysis results from {analysis_input}...", "blue")
+         analysis_data = load_json_file(analysis_input)
+         analysis_results = analysis_data.get("results", [])
+
+         colored_print(f"Loading repositories from {repos_input}...", "blue")
+         repos_data = load_json_file(repos_input)
+         repositories = repos_data.get("repositories", [])
+
+         if not analysis_results:
+             colored_print("No analysis results found", "yellow")
+             sys.exit(1)
+
+         colored_print(
+             f"Loaded {len(analysis_results)} analysis results and {len(repositories)} repositories",
+             "green",
+         )
+
+         # Initialize aggregator
+         aggregator = DataAggregator()
+
+         # Aggregate data
+         aggregated_data = aggregator.aggregate(analysis_results, repositories)
+
+         # Save results
+         aggregator.save_results(aggregated_data, json_output, csv_output, analysis_results)
+
+         # Print summary
+         aggregator.print_summary(aggregated_data)
+
+         colored_print("\n✓ Aggregation complete!", "green")
+         colored_print(f"JSON output: {json_output}", "green")
+         colored_print(f"CSV output: {csv_output}", "green")
+
+     except FileNotFoundError as e:
+         colored_print(f"File not found: {e}", "red")
+         sys.exit(1)
+     except json.JSONDecodeError as e:
+         colored_print(f"Invalid JSON: {e}", "red")
+         sys.exit(1)
+     except Exception as e:
+         colored_print(f"Error: {e}", "red")
+         import traceback
+
+         traceback.print_exc()
+         sys.exit(1)
+
+
+ if __name__ == "__main__":
+     aggregate()
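The command can also be exercised without installing a console script, via click's test runner; the file paths below are hypothetical stand-ins for the defaults named in the option help strings:

    from click.testing import CliRunner

    runner = CliRunner()
    result = runner.invoke(
        aggregate,
        [
            "--analysis-file", "data/analysis_results.json",
            "--repos-file", "data/repositories.json",
            "--output-json", "data/aggregated_statistics.json",
            "--output-csv", "data/green_analysis_results.csv",
        ],
    )
    print(result.exit_code)  # 0 on success, 1 if inputs are missing or invalid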