greenmining 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,531 @@
1
+ """Report generator for green microservices analysis."""
2
+
3
+ import json
4
+ from datetime import datetime
5
+ from pathlib import Path
6
+ from typing import Any, Optional
7
+
8
+ import click
9
+
10
+ from greenmining.config import get_config
11
+ from greenmining.utils import (
12
+ colored_print,
13
+ format_number,
14
+ format_percentage,
15
+ load_json_file,
16
+ print_banner,
17
+ )
18
+
19
+
20
class ReportGenerator:
    """Generates markdown report from aggregated statistics.

    The report is assembled from fixed markdown templates that are filled in
    with values read from three dictionaries (aggregated statistics, analysis
    results and repository metadata). Each ``_generate_*`` method renders one
    section and returns it as a string; ``generate_report`` joins them.
    """

    def __init__(self):
        """Initialize report generator (the generator keeps no state)."""
        pass

    def generate_report(
        self,
        aggregated_data: dict[str, Any],
        analysis_data: dict[str, Any],
        repos_data: dict[str, Any],
    ) -> str:
        """Generate comprehensive markdown report.

        Args:
            aggregated_data: Aggregated statistics; the section renderers read
                the keys ``summary``, ``known_patterns``, ``emergent_patterns``,
                ``per_language_stats`` and ``per_repo_stats`` from it.
            analysis_data: Original analysis results (only ``metadata`` is read).
            repos_data: Repository metadata (only ``metadata`` is read).

        Returns:
            Markdown report content, sections separated by a blank line.

        Raises:
            KeyError: If a required key is missing from ``aggregated_data``.
        """
        report_sections = [
            # Title and metadata
            self._generate_header(),
            # Executive Summary
            self._generate_executive_summary(aggregated_data),
            # 1. Methodology
            self._generate_methodology(repos_data, analysis_data),
            # 2. Results
            self._generate_results(aggregated_data),
            # 3. Discussion
            self._generate_discussion(aggregated_data),
            # 4. Limitations
            self._generate_limitations(),
            # 5. Conclusion
            self._generate_conclusion(aggregated_data),
        ]
        return "\n\n".join(report_sections)

    def _generate_header(self) -> str:
        """Generate report header, stamped with the current local time."""
        return f"""# Mining Software Repositories for Green Microservices
## Comprehensive Analysis Report

**Report Generated:** {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
**Analysis Type:** Keyword and Heuristic-Based Pattern Detection

---"""

    def _generate_executive_summary(self, data: dict[str, Any]) -> str:
        """Generate executive summary.

        Args:
            data: Aggregated statistics; reads ``summary`` and ``known_patterns``.

        Returns:
            Markdown for the executive summary section.
        """
        summary = data["summary"]
        # A falsy ``known_patterns`` (empty list or None) means "no patterns".
        top_patterns = (data["known_patterns"] or [])[:3]

        pattern_text = ""
        if top_patterns:
            pattern_list = ", ".join(
                f"{p['pattern_name']} ({p['count']} occurrences)" for p in top_patterns
            )
            pattern_text = f"The most common patterns detected include: {pattern_list}."

        return f"""### Executive Summary

This report presents findings from analyzing **{format_number(summary['total_commits'])} commits** across **{format_number(summary['total_repos'])} microservice-based repositories** to identify green software engineering practices.

**Key Findings:**

- **{format_percentage(summary['green_aware_percentage'])}** of commits ({format_number(summary['green_aware_count'])}) explicitly mention energy efficiency, performance optimization, or sustainability concerns
- **{format_number(summary['repos_with_green_commits'])}** out of {format_number(summary['total_repos'])} repositories contain at least one green-aware commit
- {pattern_text if pattern_text else "Various green software patterns were detected across the analyzed commits."}

**Implications:**

These findings suggest that while green software practices are present in microservices development, there is significant room for increased awareness and adoption of energy-efficient patterns. The relatively low percentage of green-aware commits indicates an opportunity for the software engineering community to emphasize sustainability in distributed systems."""

    def _generate_methodology(
        self, repos_data: dict[str, Any], analysis_data: dict[str, Any]
    ) -> str:
        """Generate methodology section.

        Args:
            repos_data: Repository metadata; its optional ``metadata`` mapping
                supplies ``languages``, ``search_keywords``, ``min_stars``
                (default 100) and ``total_repos`` (default 0).
            analysis_data: Analysis results; its optional ``metadata`` mapping
                supplies ``total_commits_analyzed`` (default 0).

        Returns:
            Markdown for section 1.
        """
        metadata = repos_data.get("metadata", {})
        analysis_metadata = analysis_data.get("metadata", {})

        languages = ", ".join(metadata.get("languages", []))
        search_keywords = ", ".join(metadata.get("search_keywords", []))

        return f"""### 1. Methodology

#### 1.1 Repository Selection Criteria

Repositories were selected from GitHub based on the following criteria:

- **Keywords:** {search_keywords}
- **Programming Languages:** {languages}
- **Minimum Stars:** {metadata.get('min_stars', 100)} (to ensure established projects)
- **Sort Order:** Stars (descending)
- **Total Repositories:** {metadata.get('total_repos', 0)}

#### 1.2 Data Extraction Approach

Commit data was extracted using PyDriller library:

- **Commits Analyzed:** {analysis_metadata.get('total_commits_analyzed', 0)}
- **Time Window:** Last 2 years (730 days)
- **Merge Commits:** Excluded
- **Minimum Commit Message Length:** 10 characters

#### 1.3 Analysis Methodology

Commits were analyzed using a keyword and heuristic-based classification framework:

**Q1) Green Awareness Detection:**
- Searched for explicit mentions of energy, performance, sustainability, caching, optimization, and related keywords
- Analyzed file names for patterns (cache, performance, optimization)

**Q2) Known Pattern Detection:**
- Matched against predefined green software tactics:
- Resource pooling (connection pools, thread pools)
- Caching strategies (Redis, in-memory caches)
- Lazy initialization
- Database query optimization
- Asynchronous processing
- Code optimization
- Event-driven architecture
- Resource limits
- Service decommissioning
- Auto-scaling

**Q3) Emergent Pattern Detection:**
- Placeholder for manual review of novel microservice-specific patterns

#### 1.4 Limitations and Scope

- Analysis based on commit messages and file names only (no code inspection)
- Keyword matching may miss implicit green practices
- Limited to English language commit messages
- Focused on microservices architecture
- 2-year time window may not capture all historical practices"""

    def _generate_results(self, data: dict[str, Any]) -> str:
        """Generate results section by concatenating subsections 2.1-2.4."""
        sections = [
            # 2.1 Green Awareness
            self._generate_green_awareness_section(data),
            # 2.2 Known Patterns
            self._generate_known_patterns_section(data),
            # 2.3 Emerging Practices
            self._generate_emergent_patterns_section(data),
            # 2.4 Per-Repository Analysis
            self._generate_repo_analysis_section(data),
        ]
        return "### 2. Results\n\n" + "\n\n".join(sections)

    def _generate_green_awareness_section(self, data: dict[str, Any]) -> str:
        """Generate green awareness subsection.

        Args:
            data: Aggregated statistics; reads ``summary``,
                ``per_language_stats`` and ``per_repo_stats``.

        Returns:
            Markdown for subsection 2.1, including two tables.
        """
        summary = data["summary"]
        per_lang = data["per_language_stats"]
        per_repo = data["per_repo_stats"]

        # Top 10 repos table. Only the first ten entries are rendered; the
        # input order is used as-is (presumably pre-sorted by the aggregator).
        top_repos_table = (
            "| Repository | Total Commits | Green Commits | Percentage |\n|------------|---------------|---------------|------------|\n"
            + "".join(
                f"| {repo['repo_name'][:50]} | {repo['total_commits']} | {repo['green_commits']} | {format_percentage(repo['percentage'])} |\n"
                for repo in per_repo[:10]
            )
        )

        # Language table
        lang_table = (
            "| Language | Total Commits | Green Commits | Percentage |\n|----------|---------------|---------------|------------|\n"
            + "".join(
                f"| {lang['language']} | {format_number(lang['total_commits'])} | {format_number(lang['green_commits'])} | {format_percentage(lang['percentage'])} |\n"
                for lang in per_lang
            )
        )

        return f"""#### 2.1 Green Awareness in Commits

**Total commits analyzed:** {format_number(summary['total_commits'])}
**Commits with explicit green mention:** {format_number(summary['green_aware_count'])} ({format_percentage(summary['green_aware_percentage'])})

**Table: Top 10 Repositories with Highest Green Awareness**

{top_repos_table}

**Table: Green Awareness by Programming Language**

{lang_table}"""

    def _generate_known_patterns_section(self, data: dict[str, Any]) -> str:
        """Generate known patterns subsection.

        Args:
            data: Aggregated statistics; reads ``known_patterns``, a list of
                dicts with ``pattern_name``, ``count``, ``percentage``,
                ``confidence_breakdown`` (keys ``HIGH``/``MEDIUM``/``LOW``)
                and ``example_commits``.

        Returns:
            Markdown for subsection 2.2: a table of all patterns followed by
            details for the first ten, or a short notice when there are none.
        """
        patterns = data["known_patterns"]

        if not patterns:
            return "#### 2.2 Known Green Patterns & Tactics Applied\n\nNo known patterns were detected in the analyzed commits."

        # Patterns table
        table_rows = []
        for pattern in patterns:
            conf = pattern["confidence_breakdown"]
            table_rows.append(
                f"| {pattern['pattern_name']} | {format_number(pattern['count'])} | {format_percentage(pattern['percentage'])} | {conf['HIGH']} | {conf['MEDIUM']} | {conf['LOW']} |\n"
            )
        patterns_table = (
            "| Pattern | Count | Percentage | High Conf. | Medium Conf. | Low Conf. |\n"
            "|---------|-------|------------|------------|--------------|----------|\n"
        ) + "".join(table_rows)

        # Pattern descriptions
        pattern_details = []
        for i, pattern in enumerate(patterns[:10], 1):
            # Read the breakdown of *this* pattern. Reusing the variable left
            # over from the table loop would print the last pattern's
            # confidence numbers under every entry.
            conf = pattern["confidence_breakdown"]
            example_commits = ", ".join(c[:8] for c in pattern["example_commits"][:3])
            pattern_details.append(
                f"""**{i}. {pattern['pattern_name']}**
- Frequency: {format_number(pattern['count'])} commits ({format_percentage(pattern['percentage'])})
- Confidence Distribution: HIGH={conf['HIGH']}, MEDIUM={conf['MEDIUM']}, LOW={conf['LOW']}
- Example Commits: {example_commits}"""
            )
        details_text = "\n".join(pattern_details)

        return f"""#### 2.2 Known Green Patterns & Tactics Applied

The following table summarizes the known green software patterns detected in the dataset:

**Table: Known Patterns Ranked by Frequency**

{patterns_table}

**Detailed Pattern Analysis:**

{details_text}"""

    def _generate_emergent_patterns_section(self, data: dict[str, Any]) -> str:
        """Generate emergent patterns subsection.

        Args:
            data: Aggregated statistics; reads ``emergent_patterns``, a list of
                dicts with ``pattern_name``, ``count``, ``description`` and
                ``example_commits``.

        Returns:
            Markdown for subsection 2.3, or a fixed notice when the list is empty.
        """
        emergent = data["emergent_patterns"]

        if not emergent:
            return """#### 2.3 Emerging Practices Discovered

No novel microservice-specific green practices were automatically detected. Manual review of high-impact commits may reveal emerging patterns not captured by keyword matching."""

        pattern_list = []
        for pattern in emergent:
            # Commit identifiers are shortened to their first 8 characters.
            example_commits = ", ".join(c[:8] for c in pattern["example_commits"][:3])
            pattern_list.append(
                f"""**Pattern:** {pattern['pattern_name']}
- Occurrences: {pattern['count']}
- Description: {pattern['description']}
- Example Commits: {example_commits}"""
            )
        patterns_text = "\n".join(pattern_list)

        return f"""#### 2.3 Emerging Practices Discovered

{patterns_text}"""

    def _generate_repo_analysis_section(self, data: dict[str, Any]) -> str:
        """Generate per-repository analysis subsection.

        Args:
            data: Aggregated statistics; reads ``per_repo_stats``, a list of
                dicts with ``repo_name``, ``total_commits``, ``green_commits``,
                ``percentage`` and ``patterns``.

        Returns:
            Markdown for subsection 2.4.
        """
        per_repo = data["per_repo_stats"]

        # Top 10 greenest: the first ten entries in input order, with at most
        # three pattern names each.
        table_rows = []
        for repo in per_repo[:10]:
            patterns_str = ", ".join(repo["patterns"][:3]) if repo["patterns"] else "None"
            table_rows.append(
                f"| {repo['repo_name'][:50]} | {repo['total_commits']} | {repo['green_commits']} | {format_percentage(repo['percentage'])} | {patterns_str} |\n"
            )
        top_10_table = (
            "| Repository | Total Commits | Green Commits | Percentage | Patterns Detected |\n"
            "|------------|---------------|---------------|------------|-------------------|\n"
        ) + "".join(table_rows)

        # Repos with no green mentions
        no_green_count = sum(1 for r in per_repo if r["green_commits"] == 0)

        return f"""#### 2.4 Per-Repository Analysis

**Top 10 Greenest Repositories (by % green-aware commits):**

{top_10_table}

**Repositories with No Green Mentions:** {no_green_count} out of {len(per_repo)} repositories had zero green-aware commits."""

    def _generate_discussion(self, data: dict[str, Any]) -> str:
        """Generate discussion section.

        Args:
            data: Aggregated statistics; reads
                ``summary["green_aware_percentage"]``.

        Returns:
            Markdown for section 3. The percentage is described as
            "relatively low" below 10, "moderate" below 20, otherwise "high".
        """
        summary = data["summary"]
        green_pct = summary["green_aware_percentage"]

        if green_pct < 10:
            interpretation = "relatively low"
        elif green_pct < 20:
            interpretation = "moderate"
        else:
            interpretation = "high"

        return f"""### 3. Discussion

#### 3.1 Interpretation of Findings

The analysis reveals that {format_percentage(green_pct)} of microservice commits explicitly address energy efficiency or sustainability concerns. This {interpretation} percentage suggests that:

1. **Green software practices exist but are not mainstream:** While developers are applying some energy-efficient patterns, sustainability is not yet a primary concern in microservices development.

2. **Implicit vs. Explicit practices:** Many optimizations (e.g., caching, async processing) may improve energy efficiency without explicitly mentioning it in commit messages.

3. **Domain-specific awareness:** Some repositories show significantly higher green awareness, suggesting that certain domains (e.g., cloud-native, high-scale systems) are more conscious of resource efficiency.

#### 3.2 How Microservice Developers Approach Energy Efficiency

Based on the detected patterns, microservice developers primarily focus on:

- **Performance optimization** as a proxy for energy efficiency
- **Caching strategies** to reduce redundant computations
- **Resource pooling** to minimize connection overhead
- **Asynchronous processing** to improve resource utilization

#### 3.3 Gap Analysis: Literature vs. Practice

**Literature Emphasis:**
- Formal green software engineering methodologies
- Energy measurement and profiling
- Carbon-aware computing

**Practice Emphasis:**
- Performance optimization (implicitly green)
- Cost reduction (aligned with energy efficiency)
- Scalability patterns (may or may not be green)

**Gap:** Explicit sustainability terminology is rare in commit messages, even when applying green patterns.

#### 3.4 Implications for Green Software Engineering in Distributed Systems

1. **Need for awareness:** Developers would benefit from education on how common optimizations contribute to sustainability
2. **Tooling opportunity:** IDE plugins or CI/CD checks could highlight energy implications of code changes
3. **Metrics integration:** Including energy/carbon metrics alongside performance metrics in monitoring dashboards
4. **Best practices dissemination:** Green microservices patterns should be documented and promoted in the community"""

    def _generate_limitations(self) -> str:
        """Generate limitations section (static text, no input data)."""
        return """### 4. Limitations

#### 4.1 Sample Size and Selection Bias

- Analysis limited to top-starred repositories, which may not represent typical microservices projects
- GitHub-centric sample excludes private enterprise repositories
- Selection based on keywords may miss relevant projects with different terminology

#### 4.2 Commit Message Analysis Limitations

- Commit messages may not fully describe code changes
- Keyword matching cannot detect implicit green practices in code
- English-only analysis excludes international projects
- Developers may not document energy implications in commit messages

#### 4.3 Scope Limitations

- 2-year time window may not capture long-term trends
- Focus on microservices excludes monolithic and other architectures
- No code-level analysis (only commit metadata)
- Heuristic classification may have false positives/negatives

#### 4.4 Future Work Suggestions

1. **AI-powered analysis:** Use Claude Sonnet or similar LLMs for deeper semantic understanding
2. **Code-level inspection:** Analyze actual code changes, not just commit messages
3. **Longitudinal study:** Track green practices evolution over time
4. **Developer surveys:** Complement automated analysis with developer perspectives
5. **Energy measurement:** Correlate detected patterns with actual energy consumption data"""

    def _generate_conclusion(self, data: dict[str, Any]) -> str:
        """Generate conclusion section.

        Args:
            data: Aggregated statistics; reads ``summary`` and ``known_patterns``.

        Returns:
            Markdown for section 5, naming up to three known patterns (or a
            generic phrase when none were detected).
        """
        summary = data["summary"]
        # Only the first three pattern names are ever shown.
        top_patterns = [p["pattern_name"] for p in (data["known_patterns"] or [])[:3]]

        patterns_text = ", ".join(top_patterns) if top_patterns else "various optimization patterns"

        return f"""### 5. Conclusion

#### 5.1 Summary of Key Findings

This study analyzed {format_number(summary['total_commits'])} commits from {format_number(summary['total_repos'])} microservice repositories and found:

1. **{format_percentage(summary['green_aware_percentage'])}** of commits explicitly address energy/sustainability concerns
2. **{format_number(summary['repos_with_green_commits'])}** repositories demonstrate some level of green awareness
3. Common green patterns include: {patterns_text}

#### 5.2 Answers to Research Questions

**RQ1: What percentage of microservice commits explicitly mention energy efficiency?**
Answer: {format_percentage(summary['green_aware_percentage'])} of analyzed commits contain explicit mentions.

**RQ2: Which green software tactics are developers applying in practice?**
Answer: Developers primarily apply caching strategies, resource pooling, database optimization, and asynchronous processing patterns.

**RQ3: Are there novel microservice-specific green practices not yet documented?**
Answer: Automated keyword analysis found limited evidence of novel patterns. Manual review and AI-powered analysis may reveal more nuanced practices.

#### 5.3 Recommendations for Practitioners

1. **Adopt explicit green terminology:** Document energy implications in commit messages and PR descriptions
2. **Measure and monitor:** Integrate energy/carbon metrics into observability platforms
3. **Apply known patterns:** Systematically apply caching, pooling, and optimization patterns with sustainability in mind
4. **Education and training:** Incorporate green software engineering principles into team training

#### 5.4 Recommendations for Researchers

1. **Develop better detection tools:** Create AI-powered tools for identifying green practices in code
2. **Build pattern catalogs:** Document microservice-specific green patterns with examples
3. **Conduct empirical studies:** Measure actual energy savings from detected patterns
4. **Create benchmarks:** Establish baseline metrics for green microservices

---

**Report End**

*For questions or additional analysis, please refer to the accompanying data files: `green_analysis_results.csv` and `aggregated_statistics.json`*"""

    def save_report(self, report_content: str, output_file: Path):
        """Save report to markdown file.

        Args:
            report_content: Markdown text to write.
            output_file: Destination path; missing parent directories are
                created first.
        """
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(report_content)
        colored_print(f"Saved report to {output_file}", "green")
446
+
447
+
448
@click.command()
@click.option("--aggregated-file", default=None, help="Input aggregated statistics file")
@click.option("--analysis-file", default=None, help="Input analysis results file")
@click.option("--repos-file", default=None, help="Input repositories file")
@click.option(
    "--output-file",
    default=None,
    help="Output markdown file (default: data/green_microservices_analysis.md)",
)
@click.option("--config-file", default=".env", help="Path to .env configuration file")
def report(
    aggregated_file: Optional[str],
    analysis_file: Optional[str],
    repos_file: Optional[str],
    output_file: Optional[str],
    config_file: str,
):
    """Generate comprehensive markdown report."""
    # NOTE: click shows the docstring above as this command's --help text, so
    # it is kept to one line; implementation notes go in comments instead.
    #
    # Flow: resolve input/output paths (CLI option wins over the configured
    # default), verify the three inputs exist, load them as JSON, render the
    # report and write it out. Every failure path ends the process with exit
    # status 1 via SystemExit; the `exit()` helper injected by the `site`
    # module is avoided because it is not guaranteed to exist (e.g. `python -S`).
    print_banner("Report Generator")

    try:
        # Load configuration
        config = get_config(config_file)

        # Determine input/output files
        agg_input = Path(aggregated_file) if aggregated_file else config.AGGREGATED_FILE
        analysis_input = Path(analysis_file) if analysis_file else config.ANALYSIS_FILE
        repos_input = Path(repos_file) if repos_file else config.REPOS_FILE
        output = Path(output_file) if output_file else config.REPORT_FILE

        # Check if input files exist (reported in a fixed order)
        missing_files = [
            str(path)
            for path in (agg_input, analysis_input, repos_input)
            if not path.exists()
        ]

        if missing_files:
            colored_print("Missing required input files:", "red")
            for f in missing_files:
                colored_print(f" - {f}", "red")
            colored_print(
                "\nPlease run the full pipeline first: fetch → extract → analyze → aggregate",
                "yellow",
            )
            # SystemExit is not an Exception subclass, so the handlers below
            # do not intercept it.
            raise SystemExit(1)

        # Load data
        colored_print("Loading data files...", "blue")
        aggregated_data = load_json_file(agg_input)
        analysis_data = load_json_file(analysis_input)
        repos_data = load_json_file(repos_input)
        colored_print("✓ Data loaded successfully", "green")

        # Generate report
        colored_print("\nGenerating report...", "blue")
        generator = ReportGenerator()
        report_content = generator.generate_report(aggregated_data, analysis_data, repos_data)

        # Save report
        generator.save_report(report_content, output)

        colored_print("\n✓ Report generated successfully!", "green")
        colored_print(f"Output: {output}", "green")
        colored_print(f"Report size: {len(report_content):,} characters", "white")

    except FileNotFoundError as e:
        colored_print(f"File not found: {e}", "red")
        raise SystemExit(1) from e
    except json.JSONDecodeError as e:
        colored_print(f"Invalid JSON: {e}", "red")
        raise SystemExit(1) from e
    except Exception as e:
        # Last-resort boundary for the CLI: show the message and the full
        # traceback, then fail with a non-zero status.
        colored_print(f"Error: {e}", "red")
        import traceback

        traceback.print_exc()
        raise SystemExit(1) from e
528
+
529
+
530
# Script entry point: when this module is executed directly (not imported),
# invoke the `report` click command defined above.
if __name__ == "__main__":
    report()