greenmining 0.1.12__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- greenmining/__version__.py +1 -1
- greenmining/analyzers/__init__.py +17 -0
- greenmining/analyzers/code_diff_analyzer.py +238 -0
- greenmining/analyzers/ml_feature_extractor.py +512 -0
- greenmining/analyzers/nlp_analyzer.py +365 -0
- greenmining/analyzers/qualitative_analyzer.py +460 -0
- greenmining/analyzers/statistical_analyzer.py +245 -0
- greenmining/analyzers/temporal_analyzer.py +434 -0
- greenmining/cli.py +119 -24
- greenmining/config.py +21 -0
- greenmining/controllers/repository_controller.py +50 -2
- greenmining/gsf_patterns.py +10 -5
- greenmining/models/aggregated_stats.py +3 -1
- greenmining/models/commit.py +3 -0
- greenmining/models/repository.py +3 -1
- greenmining/presenters/console_presenter.py +3 -1
- greenmining/services/commit_extractor.py +37 -7
- greenmining/services/data_aggregator.py +171 -7
- greenmining/services/data_analyzer.py +111 -8
- greenmining/services/github_fetcher.py +62 -5
- greenmining/services/reports.py +123 -2
- {greenmining-0.1.12.dist-info → greenmining-1.0.2.dist-info}/METADATA +250 -22
- greenmining-1.0.2.dist-info/RECORD +36 -0
- greenmining-0.1.12.dist-info/RECORD +0 -29
- {greenmining-0.1.12.dist-info → greenmining-1.0.2.dist-info}/WHEEL +0 -0
- {greenmining-0.1.12.dist-info → greenmining-1.0.2.dist-info}/entry_points.txt +0 -0
- {greenmining-0.1.12.dist-info → greenmining-1.0.2.dist-info}/licenses/LICENSE +0 -0
- {greenmining-0.1.12.dist-info → greenmining-1.0.2.dist-info}/top_level.txt +0 -0
greenmining/services/data_aggregator.py
CHANGED

@@ -1,14 +1,22 @@
  """Data aggregator for green microservices analysis results."""

+ from __future__ import annotations
+
  import json
  from collections import defaultdict
  from pathlib import Path
- from typing import Any, Optional
+ from typing import Any, Dict, List, Optional

  import click
  import pandas as pd

+ from greenmining.analyzers import (
+     EnhancedStatisticalAnalyzer,
+     TemporalAnalyzer,
+     QualitativeAnalyzer,
+ )
  from greenmining.config import get_config
+ from greenmining.models.repository import Repository
  from greenmining.utils import (
      colored_print,
      format_number,

@@ -23,9 +31,35 @@ from greenmining.utils import (
  class DataAggregator:
      """Aggregates analysis results and generates statistics."""

-     def __init__(
-
-
+     def __init__(
+         self,
+         enable_enhanced_stats: bool = False,
+         enable_temporal: bool = False,
+         temporal_granularity: str = "quarter",
+     ):
+         """Initialize aggregator.
+
+         Args:
+             enable_enhanced_stats: Enable enhanced statistical analysis
+             enable_temporal: Enable temporal trend analysis
+             temporal_granularity: Granularity for temporal analysis (day/week/month/quarter/year)
+         """
+         self.enable_enhanced_stats = enable_enhanced_stats
+         self.enable_temporal = enable_temporal
+
+         if self.enable_enhanced_stats:
+             self.statistical_analyzer = EnhancedStatisticalAnalyzer()
+             colored_print("Enhanced statistical analysis enabled", "cyan")
+         else:
+             self.statistical_analyzer = None
+
+         if self.enable_temporal:
+             self.temporal_analyzer = TemporalAnalyzer(granularity=temporal_granularity)
+             colored_print(
+                 f"Temporal analysis enabled (granularity: {temporal_granularity})", "cyan"
+             )
+         else:
+             self.temporal_analyzer = None

      def aggregate(
          self, analysis_results: list[dict[str, Any]], repositories: list[dict[str, Any]]

@@ -56,7 +90,38 @@
          # Per-language statistics
          per_language_stats = self._generate_language_stats(analysis_results, repositories)

-
+         # Enhanced statistical analysis (if enabled)
+         enhanced_stats = None
+         if self.enable_enhanced_stats and len(analysis_results) > 0:
+             try:
+                 enhanced_stats = self._generate_enhanced_statistics(analysis_results)
+                 colored_print("✅ Enhanced statistical analysis complete", "green")
+             except Exception as e:
+                 colored_print(f"⚠️ Enhanced statistics failed: {e}", "yellow")
+                 enhanced_stats = {"error": str(e)}
+
+         # Temporal trend analysis (if enabled)
+         temporal_analysis = None
+         if self.enable_temporal and len(analysis_results) > 0:
+             try:
+                 # Convert analysis results to commits format for temporal analyzer
+                 commits = [
+                     {
+                         "hash": r.get("commit_hash", "unknown"),
+                         "date": r.get("date"),
+                         "message": r.get("message", ""),
+                         "repository": r.get("repository", "unknown"),
+                     }
+                     for r in analysis_results
+                 ]
+
+                 temporal_analysis = self.temporal_analyzer.analyze_trends(commits, analysis_results)
+                 colored_print("✅ Temporal trend analysis complete", "green")
+             except Exception as e:
+                 colored_print(f"⚠️ Temporal analysis failed: {e}", "yellow")
+                 temporal_analysis = {"error": str(e)}
+
+         result = {
              "summary": summary,
              "known_patterns": known_patterns,
              "emergent_patterns": emergent_patterns,

@@ -64,6 +129,14 @@
              "per_language_stats": per_language_stats,
          }

+         if enhanced_stats:
+             result["enhanced_statistics"] = enhanced_stats
+
+         if temporal_analysis:
+             result["temporal_analysis"] = temporal_analysis
+
+         return result
+
      def _generate_summary(
          self, results: list[dict[str, Any]], repos: list[dict[str, Any]]
      ) -> dict[str, Any]:

@@ -198,8 +271,13 @@
          self, results: list[dict[str, Any]], repos: list[dict[str, Any]]
      ) -> list[dict[str, Any]]:
          """Generate per-language statistics."""
-         # Create repo name to language mapping
-         repo_language_map = {
+         # Create repo name to language mapping (handle both Repository objects and dicts)
+         repo_language_map = {}
+         for repo in repos:
+             if isinstance(repo, Repository):
+                 repo_language_map[repo.full_name] = repo.language or "Unknown"
+             else:
+                 repo_language_map[repo["full_name"]] = repo.get("language", "Unknown")

          # Group commits by language
          language_commits = defaultdict(list)

@@ -228,6 +306,92 @@

          return language_stats

+     def _generate_enhanced_statistics(self, results: list[dict[str, Any]]) -> dict[str, Any]:
+         """Generate enhanced statistical analysis.
+
+         Args:
+             results: List of commit analysis results
+
+         Returns:
+             Dictionary with enhanced statistical analysis
+         """
+         # Prepare DataFrame
+         df = pd.DataFrame(results)
+
+         # Ensure required columns exist
+         if "date" not in df.columns or "green_aware" not in df.columns:
+             return {"error": "Missing required columns for enhanced statistics"}
+
+         enhanced_stats = {}
+
+         # 1. Temporal Trend Analysis
+         if len(df) >= 8:  # Need at least 8 data points
+             try:
+                 df_copy = df.copy()
+                 df_copy["commit_hash"] = df_copy.get("commit_hash", df_copy.index)
+                 trends = self.statistical_analyzer.temporal_trend_analysis(df_copy)
+                 enhanced_stats["temporal_trends"] = {
+                     "trend_direction": trends["trend"]["direction"],
+                     "correlation": float(trends["trend"]["correlation"]),
+                     "p_value": float(trends["trend"]["p_value"]),
+                     "significant": trends["trend"]["significant"],
+                     "monthly_data_points": len(trends.get("monthly_data", {})),
+                 }
+             except Exception as e:
+                 enhanced_stats["temporal_trends"] = {"error": str(e)}
+
+         # 2. Pattern Correlation Analysis (if pattern columns exist)
+         pattern_cols = [col for col in df.columns if col.startswith("pattern_")]
+         if pattern_cols and len(pattern_cols) >= 2:
+             try:
+                 correlations = self.statistical_analyzer.analyze_pattern_correlations(df)
+                 enhanced_stats["pattern_correlations"] = {
+                     "significant_pairs_count": len(correlations["significant_pairs"]),
+                     "significant_pairs": correlations["significant_pairs"][:5],  # Top 5
+                     "interpretation": correlations["interpretation"],
+                 }
+             except Exception as e:
+                 enhanced_stats["pattern_correlations"] = {"error": str(e)}
+
+         # 3. Effect Size Analysis by Repository
+         if "repository" in df.columns:
+             try:
+                 # Group by repository
+                 green_rates_by_repo = df.groupby("repository")["green_aware"].mean()
+                 if len(green_rates_by_repo) >= 2:
+                     # Compare top vs bottom half
+                     sorted_rates = sorted(green_rates_by_repo.values)
+                     mid_point = len(sorted_rates) // 2
+                     group1 = sorted_rates[:mid_point]
+                     group2 = sorted_rates[mid_point:]
+
+                     if len(group1) > 0 and len(group2) > 0:
+                         effect = self.statistical_analyzer.effect_size_analysis(
+                             list(group1), list(group2)
+                         )
+                         enhanced_stats["effect_size"] = {
+                             "cohens_d": float(effect["cohens_d"]),
+                             "magnitude": effect["magnitude"],
+                             "mean_difference": float(effect["mean_difference"]),
+                             "significant": effect["significant"],
+                             "comparison": "high_green_vs_low_green_repos",
+                         }
+             except Exception as e:
+                 enhanced_stats["effect_size"] = {"error": str(e)}
+
+         # 4. Basic descriptive statistics
+         enhanced_stats["descriptive"] = {
+             "total_commits": len(df),
+             "green_commits": int(df["green_aware"].sum()),
+             "green_rate_mean": float(df["green_aware"].mean()),
+             "green_rate_std": float(df["green_aware"].std()) if len(df) > 1 else 0.0,
+             "unique_repositories": (
+                 int(df["repository"].nunique()) if "repository" in df.columns else 0
+             ),
+         }
+
+         return enhanced_stats
+
      def save_results(
          self,
          aggregated_data: dict[str, Any],
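For orientation, a minimal usage sketch of the new aggregator options (not taken from the package docs; the import path follows the file list above, and the sample records only carry fields visible in this diff, so real inputs may need extra keys expected by helpers not shown here):

from greenmining.services.data_aggregator import DataAggregator

# Illustrative records; field names mirror those read above
# (commit_hash, date, message, repository, green_aware).
analysis_results = [
    {"commit_hash": "abc123", "repository": "org/service-a", "date": "2024-01-15",
     "message": "reduce polling interval to cut energy use", "green_aware": True},
    {"commit_hash": "def456", "repository": "org/service-a", "date": "2024-02-02",
     "message": "fix typo in readme", "green_aware": False},
]
repositories = [{"full_name": "org/service-a", "language": "Go"}]

aggregator = DataAggregator(
    enable_enhanced_stats=True,      # adds the "enhanced_statistics" block to the output
    enable_temporal=True,            # adds the "temporal_analysis" block to the output
    temporal_granularity="quarter",  # day/week/month/quarter/year
)
aggregated = aggregator.aggregate(analysis_results, repositories)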
greenmining/services/data_analyzer.py
CHANGED

@@ -1,14 +1,21 @@
  """Data analyzer for green microservices commits using GSF patterns."""

+ from __future__ import annotations
+
  import json
  import re
  from collections import Counter
  from pathlib import Path
- from typing import Any, Optional
+ from typing import Any, Dict, List, Optional, Tuple

  import click
  from tqdm import tqdm

+ from greenmining.analyzers import (
+     CodeDiffAnalyzer,
+     NLPAnalyzer,
+     MLFeatureExtractor,
+ )
  from greenmining.config import get_config
  from greenmining.gsf_patterns import (
      GREEN_KEYWORDS,

@@ -30,16 +37,49 @@ from greenmining.utils import (
  class DataAnalyzer:
      """Analyzes commits for green software patterns using GSF (Green Software Foundation) patterns."""

-     def __init__(
+     def __init__(
+         self,
+         batch_size: int = 10,
+         enable_diff_analysis: bool = False,
+         enable_nlp: bool = False,
+         enable_ml_features: bool = False,
+     ):
          """Initialize analyzer with GSF patterns.

          Args:
              batch_size: Number of commits to process in each batch
+             enable_diff_analysis: Enable code diff analysis (slower but more accurate)
+             enable_nlp: Enable NLP-enhanced pattern detection
+             enable_ml_features: Enable ML feature extraction
          """
          # Use GSF patterns from gsf_patterns.py
          self.gsf_patterns = GSF_PATTERNS
          self.green_keywords = GREEN_KEYWORDS
          self.batch_size = batch_size
+         self.enable_diff_analysis = enable_diff_analysis
+         self.enable_nlp = enable_nlp
+         self.enable_ml_features = enable_ml_features
+
+         # Initialize code diff analyzer if enabled
+         if self.enable_diff_analysis:
+             self.diff_analyzer = CodeDiffAnalyzer()
+             colored_print("Code diff analysis enabled (may increase processing time)", "cyan")
+         else:
+             self.diff_analyzer = None
+
+         # Initialize NLP analyzer if enabled
+         if self.enable_nlp:
+             self.nlp_analyzer = NLPAnalyzer(enable_stemming=True, enable_synonyms=True)
+             colored_print("NLP analysis enabled (morphological variants + synonyms)", "cyan")
+         else:
+             self.nlp_analyzer = None
+
+         # Initialize ML feature extractor if enabled
+         if self.enable_ml_features:
+             self.ml_extractor = MLFeatureExtractor(green_keywords=list(GREEN_KEYWORDS))
+             colored_print("ML feature extraction enabled", "cyan")
+         else:
+             self.ml_extractor = None

      def analyze_commits(
          self, commits: list[dict[str, Any]], resume_from: int = 0

@@ -91,6 +131,42 @@
          # Q2: KNOWN GSF PATTERNS - Match against Green Software Foundation patterns
          matched_patterns = get_pattern_by_keywords(message)

+         # Enhanced NLP analysis (if enabled)
+         nlp_results = None
+         if self.nlp_analyzer:
+             nlp_results = self.nlp_analyzer.analyze_text(message, list(self.green_keywords))
+
+             # Check if NLP found additional matches not caught by keyword matching
+             has_nlp_matches, additional_terms = self.nlp_analyzer.enhance_pattern_detection(
+                 message, matched_patterns
+             )
+
+             if has_nlp_matches:
+                 # NLP enhancement found additional evidence
+                 green_aware = True
+
+         # Q3: CODE DIFF ANALYSIS (if enabled and diff data available)
+         diff_analysis = None
+         if self.diff_analyzer and commit.get("diff_data"):
+             try:
+                 # Note: This requires commit object from PyDriller
+                 # For now, we'll store a placeholder for future integration
+                 diff_analysis = {
+                     "enabled": True,
+                     "status": "requires_pydriller_commit_object",
+                     "patterns_detected": [],
+                     "confidence": "none",
+                     "evidence": {},
+                     "metrics": {},
+                 }
+             except Exception as e:
+                 diff_analysis = {
+                     "enabled": True,
+                     "status": f"error: {str(e)}",
+                     "patterns_detected": [],
+                     "confidence": "none",
+                 }
+
          # Get detailed pattern info
          pattern_details = []
          for _pattern_id, pattern in self.gsf_patterns.items():

@@ -105,13 +181,14 @@
          )

          # Calculate confidence based on number of patterns matched
-         confidence
-
-
-
-
+         # Boost confidence if diff analysis also detected patterns
+         pattern_count = len(matched_patterns)
+         if diff_analysis and diff_analysis.get("patterns_detected"):
+             pattern_count += len(diff_analysis["patterns_detected"])
+
+         confidence = "high" if pattern_count >= 2 else "medium" if pattern_count == 1 else "low"

-
+         result = {
              "commit_hash": commit.get("hash", commit.get("commit_id", "unknown")),
              "repository": commit.get("repository", commit.get("repo_name", "unknown")),
              "author": commit.get("author", commit.get("author_name", "unknown")),

@@ -130,6 +207,32 @@
              "deletions": commit.get("lines_deleted", commit.get("deletions", 0)),
          }

+         # Add diff analysis results if available
+         if diff_analysis:
+             result["diff_analysis"] = diff_analysis
+
+         # Add NLP analysis results if available
+         if nlp_results:
+             result["nlp_analysis"] = {
+                 "total_matches": nlp_results["total_nlp_matches"],
+                 "match_density": nlp_results["match_density"],
+                 "morphological_count": len(nlp_results["morphological_matches"]),
+                 "semantic_count": len(nlp_results["semantic_matches"]),
+                 "phrase_count": len(nlp_results["phrase_matches"]),
+             }
+
+         # Add ML features if enabled
+         if self.enable_ml_features and self.ml_extractor:
+             # Note: Full feature extraction requires repository context
+             # For now, extract basic text features
+             text_features = self.ml_extractor.extract_text_features(message)
+             result["ml_features"] = {
+                 "text": text_features,
+                 "note": "Full ML features require repository and historical context",
+             }
+
+         return result
+
      def _check_green_awareness(self, message: str, files: list[str]) -> tuple[bool, Optional[str]]:
          """Check if commit explicitly mentions green/energy concerns.

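A similarly hedged sketch of the analyzer's new opt-in flags (import path assumed from the file list; commit keys not visible in this diff, such as "message" and "lines_added", are assumptions):

from greenmining.services.data_analyzer import DataAnalyzer

commits = [
    {
        "hash": "abc123",                # read via commit.get("hash", ...) above
        "repository": "org/service-a",
        "author": "dev@example.com",
        "message": "cache results to avoid recomputation",  # assumed key name
        "lines_added": 12,               # assumed counterpart of lines_deleted
        "lines_deleted": 40,
    }
]

analyzer = DataAnalyzer(
    batch_size=10,
    enable_diff_analysis=False,  # currently a placeholder until PyDriller integration
    enable_nlp=True,             # adds an "nlp_analysis" block to each result
    enable_ml_features=True,     # adds basic text "ml_features" to each result
)
results = analyzer.analyze_commits(commits, resume_from=0)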
greenmining/services/github_fetcher.py
CHANGED

@@ -1,8 +1,10 @@
  """GitHub repository fetcher for green microservices mining."""

+ from __future__ import annotations
+
  from datetime import datetime
  from pathlib import Path
- from typing import Any, Optional
+ from typing import Any, Dict, List, Optional

  import click
  from github import Github, GithubException, RateLimitExceededException

@@ -26,6 +28,10 @@ class GitHubFetcher:
          max_repos: int = 100,
          min_stars: int = 100,
          languages: Optional[list[str]] = None,
+         created_after: Optional[str] = None,
+         created_before: Optional[str] = None,
+         pushed_after: Optional[str] = None,
+         pushed_before: Optional[str] = None,
      ):
          """Initialize GitHub fetcher.

@@ -34,6 +40,10 @@
              max_repos: Maximum number of repositories to fetch
              min_stars: Minimum number of stars required
              languages: List of programming languages to filter
+             created_after: Repository created after date (YYYY-MM-DD)
+             created_before: Repository created before date (YYYY-MM-DD)
+             pushed_after: Repository pushed after date (YYYY-MM-DD)
+             pushed_before: Repository pushed before date (YYYY-MM-DD)
          """
          self.github = Github(token)
          self.max_repos = max_repos

@@ -47,6 +57,10 @@
              "C#",
              "Rust",
          ]
+         self.created_after = created_after
+         self.created_before = created_before
+         self.pushed_after = pushed_after
+         self.pushed_before = pushed_before

      def search_repositories(self) -> list[dict[str, Any]]:
          """Search for microservice repositories.

@@ -62,10 +76,8 @@
              f"Filters: min_stars={self.min_stars}, languages={', '.join(self.languages)}", "cyan"
          )

-         # Build search query
-
-         language_query = " OR ".join([f"language:{lang}" for lang in self.languages])
-         query = f"({keyword_query}) ({language_query}) stars:>={self.min_stars}"
+         # Build search query with temporal filters
+         query = self._build_temporal_query(keywords)

          try:
              # Execute search

@@ -139,6 +151,51 @@
              "license": repo.license.name if repo.license else None,
          }

+     def _build_temporal_query(self, keywords: list[str]) -> str:
+         """
+         Build GitHub search query with temporal constraints.
+
+         Args:
+             keywords: List of search keywords
+
+         Returns:
+             Complete search query string
+         """
+         query_parts = []
+
+         # Keywords
+         keyword_query = " OR ".join(keywords)
+         query_parts.append(f"({keyword_query})")
+
+         # Languages
+         language_query = " OR ".join([f"language:{lang}" for lang in self.languages])
+         query_parts.append(f"({language_query})")
+
+         # Stars
+         query_parts.append(f"stars:>={self.min_stars}")
+
+         # Archived filter
+         query_parts.append("archived:false")
+
+         # Temporal filters
+         if self.created_after and self.created_before:
+             query_parts.append(f"created:{self.created_after}..{self.created_before}")
+         elif self.created_after:
+             query_parts.append(f"created:>={self.created_after}")
+         elif self.created_before:
+             query_parts.append(f"created:<={self.created_before}")
+
+         if self.pushed_after and self.pushed_before:
+             query_parts.append(f"pushed:{self.pushed_after}..{self.pushed_before}")
+         elif self.pushed_after:
+             query_parts.append(f"pushed:>={self.pushed_after}")
+         elif self.pushed_before:
+             query_parts.append(f"pushed:<={self.pushed_before}")
+
+         query = " ".join(query_parts)
+         colored_print(f"Query: {query}", "cyan")
+         return query
+
      def _handle_rate_limit(self):
          """Handle GitHub API rate limiting."""
          rate_limit = self.github.get_rate_limit()
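A sketch of the new temporal search filters (the "token" parameter name is inferred from self.github = Github(token) above; the search keywords themselves are configured elsewhere in the package):

import os
from greenmining.services.github_fetcher import GitHubFetcher

fetcher = GitHubFetcher(
    token=os.environ["GITHUB_TOKEN"],
    max_repos=50,
    min_stars=200,
    languages=["Python", "Go"],
    created_after="2020-01-01",   # contributes created:2020-01-01..2022-12-31
    created_before="2022-12-31",
    pushed_after="2024-01-01",    # contributes pushed:>=2024-01-01
)
repos = fetcher.search_repositories()  # query also includes stars:>=200 and archived:false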
greenmining/services/reports.py
CHANGED

@@ -1,9 +1,11 @@
- """Report
+ """Report generation for green mining analysis."""
+
+ from __future__ import annotations

  import json
  from datetime import datetime
  from pathlib import Path
- from typing import Any, Optional
+ from typing import Any, Dict, Optional

  import click


@@ -180,6 +182,11 @@ Commits were analyzed using a keyword and heuristic-based classification framewo
          # 2.4 Per-Repository Analysis
          sections.append(self._generate_repo_analysis_section(data))

+         # 2.5 Enhanced Statistics (if available)
+         enhanced_section = self._generate_enhanced_statistics_section(data)
+         if enhanced_section:
+             sections.append(enhanced_section)
+
          return "### 2. Results\n\n" + "\n\n".join(sections)

      def _generate_green_awareness_section(self, data: dict[str, Any]) -> str:

@@ -300,6 +307,120 @@ No novel microservice-specific green practices were automatically detected. Manu

  **Repositories with No Green Mentions:** {no_green_count} out of {len(per_repo)} repositories had zero green-aware commits."""

+     def _generate_enhanced_statistics_section(self, data: dict[str, Any]) -> str:
+         """Generate enhanced statistical analysis subsection.
+
+         Args:
+             data: Aggregated data containing enhanced_statistics field
+
+         Returns:
+             Markdown section with enhanced statistics
+         """
+         enhanced_stats = data.get("enhanced_statistics")
+
+         if not enhanced_stats:
+             return ""
+
+         # Handle error case
+         if "error" in enhanced_stats:
+             return f"""#### 2.5 Enhanced Statistical Analysis
+
+ **Note:** Enhanced statistical analysis encountered an error: {enhanced_stats['error']}
+ """
+
+         sections = []
+         sections.append("#### 2.5 Enhanced Statistical Analysis")
+         sections.append("")
+         sections.append(
+             "This section presents advanced statistical analyses of green software engineering patterns."
+         )
+         sections.append("")
+
+         # Temporal trends
+         temporal = enhanced_stats.get("temporal_trends", {})
+         if temporal and "error" not in temporal:
+             sections.append("##### Temporal Trends")
+             sections.append("")
+
+             if "overall_trend" in temporal:
+                 trend_dir = temporal["overall_trend"].get("direction", "unknown")
+                 trend_sig = temporal["overall_trend"].get("significant", False)
+                 sections.append(f"**Overall Trend:** {trend_dir.capitalize()}")
+                 if trend_sig:
+                     sections.append(" (statistically significant)")
+                 sections.append("")
+
+             if "monthly_stats" in temporal and temporal["monthly_stats"]:
+                 sections.append("**Monthly Pattern Statistics:**")
+                 sections.append("")
+                 monthly = temporal["monthly_stats"]
+                 sections.append(f"- Mean commits/month: {format_number(monthly.get('mean', 0))}")
+                 sections.append(
+                     f"- Median commits/month: {format_number(monthly.get('median', 0))}"
+                 )
+                 sections.append(f"- Std deviation: {format_number(monthly.get('std', 0))}")
+                 sections.append("")
+
+         # Pattern correlations
+         correlations = enhanced_stats.get("pattern_correlations", {})
+         if correlations and "error" not in correlations:
+             sections.append("##### Pattern Correlations")
+             sections.append("")
+
+             top_corr = correlations.get("top_positive_correlations", [])
+             if top_corr:
+                 sections.append("**Top Positive Correlations (|r| > 0.5):**")
+                 sections.append("")
+                 sections.append("| Pattern 1 | Pattern 2 | Correlation (r) |")
+                 sections.append("|-----------|-----------|-----------------|")
+                 for corr in top_corr[:5]:
+                     sections.append(
+                         f"| {corr['pattern1']} | {corr['pattern2']} | {corr['correlation']:.3f} |"
+                     )
+                 sections.append("")
+             else:
+                 sections.append("No strong pattern correlations detected (|r| > 0.5).")
+                 sections.append("")
+
+         # Effect sizes
+         effect_sizes = enhanced_stats.get("effect_size", {})
+         if effect_sizes and "error" not in effect_sizes:
+             sections.append("##### Effect Size Analysis")
+             sections.append("")
+
+             green_vs_nongreen = effect_sizes.get("green_vs_nongreen_patterns")
+             if green_vs_nongreen:
+                 cohens_d = green_vs_nongreen.get("cohens_d", 0)
+                 magnitude = green_vs_nongreen.get("magnitude", "negligible")
+                 sections.append(f"**Green vs Non-Green Pattern Usage:**")
+                 sections.append(f"- Cohen's d: {cohens_d:.3f}")
+                 sections.append(f"- Effect magnitude: {magnitude.capitalize()}")
+                 sections.append("")
+
+         # Descriptive statistics
+         descriptive = enhanced_stats.get("descriptive", {})
+         if descriptive and "error" not in descriptive:
+             sections.append("##### Descriptive Statistics")
+             sections.append("")
+
+             patterns = descriptive.get("patterns_per_commit", {})
+             if patterns:
+                 sections.append("**Patterns per Commit:**")
+                 sections.append(f"- Mean: {patterns.get('mean', 0):.2f}")
+                 sections.append(f"- Median: {patterns.get('median', 0):.2f}")
+                 sections.append(f"- Standard deviation: {patterns.get('std', 0):.2f}")
+                 sections.append("")
+
+             repos = descriptive.get("green_commits_per_repo", {})
+             if repos:
+                 sections.append("**Green Commits per Repository:**")
+                 sections.append(f"- Mean: {repos.get('mean', 0):.2f}")
+                 sections.append(f"- Median: {repos.get('median', 0):.2f}")
+                 sections.append(f"- Standard deviation: {repos.get('std', 0):.2f}")
+                 sections.append("")
+
+         return "\n".join(sections)
+
      def _generate_discussion(self, data: dict[str, Any]) -> str:
          """Generate discussion section."""
          summary = data["summary"]