greenmining 0.1.12__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
@@ -1,14 +1,22 @@
  """Data aggregator for green microservices analysis results."""

+ from __future__ import annotations
+
  import json
  from collections import defaultdict
  from pathlib import Path
- from typing import Any, Optional
+ from typing import Any, Dict, List, Optional

  import click
  import pandas as pd

+ from greenmining.analyzers import (
+     EnhancedStatisticalAnalyzer,
+     TemporalAnalyzer,
+     QualitativeAnalyzer,
+ )
  from greenmining.config import get_config
+ from greenmining.models.repository import Repository
  from greenmining.utils import (
      colored_print,
      format_number,
@@ -23,9 +31,35 @@ from greenmining.utils import (
  class DataAggregator:
      """Aggregates analysis results and generates statistics."""

-     def __init__(self):
-         """Initialize aggregator."""
-         pass
+     def __init__(
+         self,
+         enable_enhanced_stats: bool = False,
+         enable_temporal: bool = False,
+         temporal_granularity: str = "quarter",
+     ):
+         """Initialize aggregator.
+
+         Args:
+             enable_enhanced_stats: Enable enhanced statistical analysis
+             enable_temporal: Enable temporal trend analysis
+             temporal_granularity: Granularity for temporal analysis (day/week/month/quarter/year)
+         """
+         self.enable_enhanced_stats = enable_enhanced_stats
+         self.enable_temporal = enable_temporal
+
+         if self.enable_enhanced_stats:
+             self.statistical_analyzer = EnhancedStatisticalAnalyzer()
+             colored_print("Enhanced statistical analysis enabled", "cyan")
+         else:
+             self.statistical_analyzer = None
+
+         if self.enable_temporal:
+             self.temporal_analyzer = TemporalAnalyzer(granularity=temporal_granularity)
+             colored_print(
+                 f"Temporal analysis enabled (granularity: {temporal_granularity})", "cyan"
+             )
+         else:
+             self.temporal_analyzer = None

      def aggregate(
          self, analysis_results: list[dict[str, Any]], repositories: list[dict[str, Any]]
@@ -56,7 +90,38 @@ class DataAggregator:
          # Per-language statistics
          per_language_stats = self._generate_language_stats(analysis_results, repositories)

-         return {
+         # Enhanced statistical analysis (if enabled)
+         enhanced_stats = None
+         if self.enable_enhanced_stats and len(analysis_results) > 0:
+             try:
+                 enhanced_stats = self._generate_enhanced_statistics(analysis_results)
+                 colored_print("✅ Enhanced statistical analysis complete", "green")
+             except Exception as e:
+                 colored_print(f"⚠️ Enhanced statistics failed: {e}", "yellow")
+                 enhanced_stats = {"error": str(e)}
+
+         # Temporal trend analysis (if enabled)
+         temporal_analysis = None
+         if self.enable_temporal and len(analysis_results) > 0:
+             try:
+                 # Convert analysis results to commits format for temporal analyzer
+                 commits = [
+                     {
+                         "hash": r.get("commit_hash", "unknown"),
+                         "date": r.get("date"),
+                         "message": r.get("message", ""),
+                         "repository": r.get("repository", "unknown"),
+                     }
+                     for r in analysis_results
+                 ]
+
+                 temporal_analysis = self.temporal_analyzer.analyze_trends(commits, analysis_results)
+                 colored_print("✅ Temporal trend analysis complete", "green")
+             except Exception as e:
+                 colored_print(f"⚠️ Temporal analysis failed: {e}", "yellow")
+                 temporal_analysis = {"error": str(e)}
+
+         result = {
              "summary": summary,
              "known_patterns": known_patterns,
              "emergent_patterns": emergent_patterns,
@@ -64,6 +129,14 @@ class DataAggregator:
              "per_language_stats": per_language_stats,
          }

+         if enhanced_stats:
+             result["enhanced_statistics"] = enhanced_stats
+
+         if temporal_analysis:
+             result["temporal_analysis"] = temporal_analysis
+
+         return result
+
      def _generate_summary(
          self, results: list[dict[str, Any]], repos: list[dict[str, Any]]
      ) -> dict[str, Any]:
@@ -198,8 +271,13 @@ class DataAggregator:
          self, results: list[dict[str, Any]], repos: list[dict[str, Any]]
      ) -> list[dict[str, Any]]:
          """Generate per-language statistics."""
-         # Create repo name to language mapping
-         repo_language_map = {repo["full_name"]: repo.get("language", "Unknown") for repo in repos}
+         # Create repo name to language mapping (handle both Repository objects and dicts)
+         repo_language_map = {}
+         for repo in repos:
+             if isinstance(repo, Repository):
+                 repo_language_map[repo.full_name] = repo.language or "Unknown"
+             else:
+                 repo_language_map[repo["full_name"]] = repo.get("language", "Unknown")

          # Group commits by language
          language_commits = defaultdict(list)
@@ -228,6 +306,92 @@ class DataAggregator:

          return language_stats

+     def _generate_enhanced_statistics(self, results: list[dict[str, Any]]) -> dict[str, Any]:
+         """Generate enhanced statistical analysis.
+
+         Args:
+             results: List of commit analysis results
+
+         Returns:
+             Dictionary with enhanced statistical analysis
+         """
+         # Prepare DataFrame
+         df = pd.DataFrame(results)
+
+         # Ensure required columns exist
+         if "date" not in df.columns or "green_aware" not in df.columns:
+             return {"error": "Missing required columns for enhanced statistics"}
+
+         enhanced_stats = {}
+
+         # 1. Temporal Trend Analysis
+         if len(df) >= 8:  # Need at least 8 data points
+             try:
+                 df_copy = df.copy()
+                 df_copy["commit_hash"] = df_copy.get("commit_hash", df_copy.index)
+                 trends = self.statistical_analyzer.temporal_trend_analysis(df_copy)
+                 enhanced_stats["temporal_trends"] = {
+                     "trend_direction": trends["trend"]["direction"],
+                     "correlation": float(trends["trend"]["correlation"]),
+                     "p_value": float(trends["trend"]["p_value"]),
+                     "significant": trends["trend"]["significant"],
+                     "monthly_data_points": len(trends.get("monthly_data", {})),
+                 }
+             except Exception as e:
+                 enhanced_stats["temporal_trends"] = {"error": str(e)}
+
+         # 2. Pattern Correlation Analysis (if pattern columns exist)
+         pattern_cols = [col for col in df.columns if col.startswith("pattern_")]
+         if pattern_cols and len(pattern_cols) >= 2:
+             try:
+                 correlations = self.statistical_analyzer.analyze_pattern_correlations(df)
+                 enhanced_stats["pattern_correlations"] = {
+                     "significant_pairs_count": len(correlations["significant_pairs"]),
+                     "significant_pairs": correlations["significant_pairs"][:5],  # Top 5
+                     "interpretation": correlations["interpretation"],
+                 }
+             except Exception as e:
+                 enhanced_stats["pattern_correlations"] = {"error": str(e)}
+
+         # 3. Effect Size Analysis by Repository
+         if "repository" in df.columns:
+             try:
+                 # Group by repository
+                 green_rates_by_repo = df.groupby("repository")["green_aware"].mean()
+                 if len(green_rates_by_repo) >= 2:
+                     # Compare top vs bottom half
+                     sorted_rates = sorted(green_rates_by_repo.values)
+                     mid_point = len(sorted_rates) // 2
+                     group1 = sorted_rates[:mid_point]
+                     group2 = sorted_rates[mid_point:]
+
+                     if len(group1) > 0 and len(group2) > 0:
+                         effect = self.statistical_analyzer.effect_size_analysis(
+                             list(group1), list(group2)
+                         )
+                         enhanced_stats["effect_size"] = {
+                             "cohens_d": float(effect["cohens_d"]),
+                             "magnitude": effect["magnitude"],
+                             "mean_difference": float(effect["mean_difference"]),
+                             "significant": effect["significant"],
+                             "comparison": "high_green_vs_low_green_repos",
+                         }
+             except Exception as e:
+                 enhanced_stats["effect_size"] = {"error": str(e)}
+
+         # 4. Basic descriptive statistics
+         enhanced_stats["descriptive"] = {
+             "total_commits": len(df),
+             "green_commits": int(df["green_aware"].sum()),
+             "green_rate_mean": float(df["green_aware"].mean()),
+             "green_rate_std": float(df["green_aware"].std()) if len(df) > 1 else 0.0,
+             "unique_repositories": (
+                 int(df["repository"].nunique()) if "repository" in df.columns else 0
+             ),
+         }
+
+         return enhanced_stats
+
      def save_results(
          self,
          aggregated_data: dict[str, Any],
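The new constructor flags above are opt-in and default to off; `aggregate()` attaches the `enhanced_statistics` and `temporal_analysis` keys only when the corresponding analysis is enabled and returns output. A minimal usage sketch follows (illustrative only; the import path and all sample values are assumptions, since the diff viewer omits file headers):

# Illustrative usage of the new DataAggregator flags, not part of the diff.
# The module path is an assumption; field names follow the keys aggregate() reads.
from greenmining.aggregator import DataAggregator

analysis_results = [
    {
        "commit_hash": "abc123",          # sample values throughout
        "date": "2024-05-01",
        "message": "reduce polling interval to save energy",
        "repository": "org/service-a",
        "green_aware": True,
    },
]
repositories = [{"full_name": "org/service-a", "language": "Go"}]

aggregator = DataAggregator(
    enable_enhanced_stats=True,   # adds "enhanced_statistics" to the result when it succeeds
    enable_temporal=True,         # adds "temporal_analysis" to the result when it succeeds
    temporal_granularity="month",
)
data = aggregator.aggregate(analysis_results, repositories)
print(sorted(data.keys()))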
@@ -1,14 +1,21 @@
  """Data analyzer for green microservices commits using GSF patterns."""

+ from __future__ import annotations
+
  import json
  import re
  from collections import Counter
  from pathlib import Path
- from typing import Any, Optional
+ from typing import Any, Dict, List, Optional, Tuple

  import click
  from tqdm import tqdm

+ from greenmining.analyzers import (
+     CodeDiffAnalyzer,
+     NLPAnalyzer,
+     MLFeatureExtractor,
+ )
  from greenmining.config import get_config
  from greenmining.gsf_patterns import (
      GREEN_KEYWORDS,
@@ -30,16 +37,49 @@ from greenmining.utils import (
  class DataAnalyzer:
      """Analyzes commits for green software patterns using GSF (Green Software Foundation) patterns."""

-     def __init__(self, batch_size: int = 10):
+     def __init__(
+         self,
+         batch_size: int = 10,
+         enable_diff_analysis: bool = False,
+         enable_nlp: bool = False,
+         enable_ml_features: bool = False,
+     ):
          """Initialize analyzer with GSF patterns.

          Args:
              batch_size: Number of commits to process in each batch
+             enable_diff_analysis: Enable code diff analysis (slower but more accurate)
+             enable_nlp: Enable NLP-enhanced pattern detection
+             enable_ml_features: Enable ML feature extraction
          """
          # Use GSF patterns from gsf_patterns.py
          self.gsf_patterns = GSF_PATTERNS
          self.green_keywords = GREEN_KEYWORDS
          self.batch_size = batch_size
+         self.enable_diff_analysis = enable_diff_analysis
+         self.enable_nlp = enable_nlp
+         self.enable_ml_features = enable_ml_features
+
+         # Initialize code diff analyzer if enabled
+         if self.enable_diff_analysis:
+             self.diff_analyzer = CodeDiffAnalyzer()
+             colored_print("Code diff analysis enabled (may increase processing time)", "cyan")
+         else:
+             self.diff_analyzer = None
+
+         # Initialize NLP analyzer if enabled
+         if self.enable_nlp:
+             self.nlp_analyzer = NLPAnalyzer(enable_stemming=True, enable_synonyms=True)
+             colored_print("NLP analysis enabled (morphological variants + synonyms)", "cyan")
+         else:
+             self.nlp_analyzer = None
+
+         # Initialize ML feature extractor if enabled
+         if self.enable_ml_features:
+             self.ml_extractor = MLFeatureExtractor(green_keywords=list(GREEN_KEYWORDS))
+             colored_print("ML feature extraction enabled", "cyan")
+         else:
+             self.ml_extractor = None

      def analyze_commits(
          self, commits: list[dict[str, Any]], resume_from: int = 0
@@ -91,6 +131,42 @@ class DataAnalyzer:
          # Q2: KNOWN GSF PATTERNS - Match against Green Software Foundation patterns
          matched_patterns = get_pattern_by_keywords(message)

+         # Enhanced NLP analysis (if enabled)
+         nlp_results = None
+         if self.nlp_analyzer:
+             nlp_results = self.nlp_analyzer.analyze_text(message, list(self.green_keywords))
+
+             # Check if NLP found additional matches not caught by keyword matching
+             has_nlp_matches, additional_terms = self.nlp_analyzer.enhance_pattern_detection(
+                 message, matched_patterns
+             )
+
+             if has_nlp_matches:
+                 # NLP enhancement found additional evidence
+                 green_aware = True
+
+         # Q3: CODE DIFF ANALYSIS (if enabled and diff data available)
+         diff_analysis = None
+         if self.diff_analyzer and commit.get("diff_data"):
+             try:
+                 # Note: This requires commit object from PyDriller
+                 # For now, we'll store a placeholder for future integration
+                 diff_analysis = {
+                     "enabled": True,
+                     "status": "requires_pydriller_commit_object",
+                     "patterns_detected": [],
+                     "confidence": "none",
+                     "evidence": {},
+                     "metrics": {},
+                 }
+             except Exception as e:
+                 diff_analysis = {
+                     "enabled": True,
+                     "status": f"error: {str(e)}",
+                     "patterns_detected": [],
+                     "confidence": "none",
+                 }
+
          # Get detailed pattern info
          pattern_details = []
          for _pattern_id, pattern in self.gsf_patterns.items():
@@ -105,13 +181,14 @@ class DataAnalyzer:
                  )

          # Calculate confidence based on number of patterns matched
-         confidence = (
-             "high"
-             if len(matched_patterns) >= 2
-             else "medium" if len(matched_patterns) == 1 else "low"
-         )
+         # Boost confidence if diff analysis also detected patterns
+         pattern_count = len(matched_patterns)
+         if diff_analysis and diff_analysis.get("patterns_detected"):
+             pattern_count += len(diff_analysis["patterns_detected"])
+
+         confidence = "high" if pattern_count >= 2 else "medium" if pattern_count == 1 else "low"

-         return {
+         result = {
              "commit_hash": commit.get("hash", commit.get("commit_id", "unknown")),
              "repository": commit.get("repository", commit.get("repo_name", "unknown")),
              "author": commit.get("author", commit.get("author_name", "unknown")),
@@ -130,6 +207,32 @@ class DataAnalyzer:
              "deletions": commit.get("lines_deleted", commit.get("deletions", 0)),
          }

+         # Add diff analysis results if available
+         if diff_analysis:
+             result["diff_analysis"] = diff_analysis
+
+         # Add NLP analysis results if available
+         if nlp_results:
+             result["nlp_analysis"] = {
+                 "total_matches": nlp_results["total_nlp_matches"],
+                 "match_density": nlp_results["match_density"],
+                 "morphological_count": len(nlp_results["morphological_matches"]),
+                 "semantic_count": len(nlp_results["semantic_matches"]),
+                 "phrase_count": len(nlp_results["phrase_matches"]),
+             }
+
+         # Add ML features if enabled
+         if self.enable_ml_features and self.ml_extractor:
+             # Note: Full feature extraction requires repository context
+             # For now, extract basic text features
+             text_features = self.ml_extractor.extract_text_features(message)
+             result["ml_features"] = {
+                 "text": text_features,
+                 "note": "Full ML features require repository and historical context",
+             }
+
+         return result
+
      def _check_green_awareness(self, message: str, files: list[str]) -> tuple[bool, Optional[str]]:
          """Check if commit explicitly mentions green/energy concerns.

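As with the aggregator, the new analyzer flags default to the 0.1.12 behaviour; NLP and ML-feature results are attached to each per-commit result dict, while diff analysis currently records a placeholder (see the Q3 block above). A minimal sketch, illustrative only; the import path and the sample commit fields are assumptions:

# Illustrative usage of the new DataAnalyzer flags, not part of the diff.
from greenmining.analyzer import DataAnalyzer

analyzer = DataAnalyzer(
    batch_size=10,
    enable_diff_analysis=False,  # diff analysis currently stores a placeholder result
    enable_nlp=True,             # attaches an "nlp_analysis" block to each result
    enable_ml_features=True,     # attaches an "ml_features" block to each result
)

commits = [
    {
        "hash": "abc123",
        "repository": "org/service-a",
        "author": "dev@example.com",
        "message": "cache responses to reduce CPU usage",
    },
]
results = analyzer.analyze_commits(commits)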
@@ -1,8 +1,10 @@
  """GitHub repository fetcher for green microservices mining."""

+ from __future__ import annotations
+
  from datetime import datetime
  from pathlib import Path
- from typing import Any, Optional
+ from typing import Any, Dict, List, Optional

  import click
  from github import Github, GithubException, RateLimitExceededException
@@ -26,6 +28,10 @@ class GitHubFetcher:
          max_repos: int = 100,
          min_stars: int = 100,
          languages: Optional[list[str]] = None,
+         created_after: Optional[str] = None,
+         created_before: Optional[str] = None,
+         pushed_after: Optional[str] = None,
+         pushed_before: Optional[str] = None,
      ):
          """Initialize GitHub fetcher.

@@ -34,6 +40,10 @@ class GitHubFetcher:
              max_repos: Maximum number of repositories to fetch
              min_stars: Minimum number of stars required
              languages: List of programming languages to filter
+             created_after: Repository created after date (YYYY-MM-DD)
+             created_before: Repository created before date (YYYY-MM-DD)
+             pushed_after: Repository pushed after date (YYYY-MM-DD)
+             pushed_before: Repository pushed before date (YYYY-MM-DD)
          """
          self.github = Github(token)
          self.max_repos = max_repos
@@ -47,6 +57,10 @@ class GitHubFetcher:
              "C#",
              "Rust",
          ]
+         self.created_after = created_after
+         self.created_before = created_before
+         self.pushed_after = pushed_after
+         self.pushed_before = pushed_before

      def search_repositories(self) -> list[dict[str, Any]]:
          """Search for microservice repositories.
@@ -62,10 +76,8 @@ class GitHubFetcher:
              f"Filters: min_stars={self.min_stars}, languages={', '.join(self.languages)}", "cyan"
          )

-         # Build search query
-         keyword_query = " OR ".join(keywords)
-         language_query = " OR ".join([f"language:{lang}" for lang in self.languages])
-         query = f"({keyword_query}) ({language_query}) stars:>={self.min_stars}"
+         # Build search query with temporal filters
+         query = self._build_temporal_query(keywords)

          try:
              # Execute search

@@ -139,6 +151,51 @@ class GitHubFetcher:
              "license": repo.license.name if repo.license else None,
          }

+     def _build_temporal_query(self, keywords: list[str]) -> str:
+         """
+         Build GitHub search query with temporal constraints.
+
+         Args:
+             keywords: List of search keywords
+
+         Returns:
+             Complete search query string
+         """
+         query_parts = []
+
+         # Keywords
+         keyword_query = " OR ".join(keywords)
+         query_parts.append(f"({keyword_query})")
+
+         # Languages
+         language_query = " OR ".join([f"language:{lang}" for lang in self.languages])
+         query_parts.append(f"({language_query})")
+
+         # Stars
+         query_parts.append(f"stars:>={self.min_stars}")
+
+         # Archived filter
+         query_parts.append("archived:false")
+
+         # Temporal filters
+         if self.created_after and self.created_before:
+             query_parts.append(f"created:{self.created_after}..{self.created_before}")
+         elif self.created_after:
+             query_parts.append(f"created:>={self.created_after}")
+         elif self.created_before:
+             query_parts.append(f"created:<={self.created_before}")
+
+         if self.pushed_after and self.pushed_before:
+             query_parts.append(f"pushed:{self.pushed_after}..{self.pushed_before}")
+         elif self.pushed_after:
+             query_parts.append(f"pushed:>={self.pushed_after}")
+         elif self.pushed_before:
+             query_parts.append(f"pushed:<={self.pushed_before}")
+
+         query = " ".join(query_parts)
+         colored_print(f"Query: {query}", "cyan")
+         return query
+
      def _handle_rate_limit(self):
          """Handle GitHub API rate limiting."""
          rate_limit = self.github.get_rate_limit()
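The new `_build_temporal_query()` replaces the inline query construction, always appends `archived:false`, and adds each date filter only when the corresponding bound is set. An illustrative call (the import path, token, and dates are assumptions) and the query string it would produce:

# Illustrative temporal filtering, not part of the diff.
from greenmining.fetcher import GitHubFetcher

fetcher = GitHubFetcher(
    "ghp_exampletoken",          # placeholder token
    min_stars=100,
    languages=["Python", "Go"],
    created_after="2020-01-01",
    created_before="2023-12-31",
    pushed_after="2024-01-01",
)
# With these settings, _build_temporal_query(["microservices"]) yields (single line):
# (microservices) (language:Python OR language:Go) stars:>=100 archived:false created:2020-01-01..2023-12-31 pushed:>=2024-01-01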
@@ -1,9 +1,11 @@
- """Report generator for green microservices analysis."""
+ """Report generation for green mining analysis."""
+
+ from __future__ import annotations

  import json
  from datetime import datetime
  from pathlib import Path
- from typing import Any, Optional
+ from typing import Any, Dict, Optional

  import click

@@ -180,6 +182,11 @@ Commits were analyzed using a keyword and heuristic-based classification framewo
          # 2.4 Per-Repository Analysis
          sections.append(self._generate_repo_analysis_section(data))

+         # 2.5 Enhanced Statistics (if available)
+         enhanced_section = self._generate_enhanced_statistics_section(data)
+         if enhanced_section:
+             sections.append(enhanced_section)
+
          return "### 2. Results\n\n" + "\n\n".join(sections)

      def _generate_green_awareness_section(self, data: dict[str, Any]) -> str:
@@ -300,6 +307,120 @@ No novel microservice-specific green practices were automatically detected. Manu

  **Repositories with No Green Mentions:** {no_green_count} out of {len(per_repo)} repositories had zero green-aware commits."""

+     def _generate_enhanced_statistics_section(self, data: dict[str, Any]) -> str:
+         """Generate enhanced statistical analysis subsection.
+
+         Args:
+             data: Aggregated data containing enhanced_statistics field
+
+         Returns:
+             Markdown section with enhanced statistics
+         """
+         enhanced_stats = data.get("enhanced_statistics")
+
+         if not enhanced_stats:
+             return ""
+
+         # Handle error case
+         if "error" in enhanced_stats:
+             return f"""#### 2.5 Enhanced Statistical Analysis
+
+ **Note:** Enhanced statistical analysis encountered an error: {enhanced_stats['error']}
+ """
+
+         sections = []
+         sections.append("#### 2.5 Enhanced Statistical Analysis")
+         sections.append("")
+         sections.append(
+             "This section presents advanced statistical analyses of green software engineering patterns."
+         )
+         sections.append("")
+
+         # Temporal trends
+         temporal = enhanced_stats.get("temporal_trends", {})
+         if temporal and "error" not in temporal:
+             sections.append("##### Temporal Trends")
+             sections.append("")
+
+             if "overall_trend" in temporal:
+                 trend_dir = temporal["overall_trend"].get("direction", "unknown")
+                 trend_sig = temporal["overall_trend"].get("significant", False)
+                 sections.append(f"**Overall Trend:** {trend_dir.capitalize()}")
+                 if trend_sig:
+                     sections.append(" (statistically significant)")
+                 sections.append("")
+
+             if "monthly_stats" in temporal and temporal["monthly_stats"]:
+                 sections.append("**Monthly Pattern Statistics:**")
+                 sections.append("")
+                 monthly = temporal["monthly_stats"]
+                 sections.append(f"- Mean commits/month: {format_number(monthly.get('mean', 0))}")
+                 sections.append(
+                     f"- Median commits/month: {format_number(monthly.get('median', 0))}"
+                 )
+                 sections.append(f"- Std deviation: {format_number(monthly.get('std', 0))}")
+                 sections.append("")
+
+         # Pattern correlations
+         correlations = enhanced_stats.get("pattern_correlations", {})
+         if correlations and "error" not in correlations:
+             sections.append("##### Pattern Correlations")
+             sections.append("")
+
+             top_corr = correlations.get("top_positive_correlations", [])
+             if top_corr:
+                 sections.append("**Top Positive Correlations (|r| > 0.5):**")
+                 sections.append("")
+                 sections.append("| Pattern 1 | Pattern 2 | Correlation (r) |")
+                 sections.append("|-----------|-----------|-----------------|")
+                 for corr in top_corr[:5]:
+                     sections.append(
+                         f"| {corr['pattern1']} | {corr['pattern2']} | {corr['correlation']:.3f} |"
+                     )
+                 sections.append("")
+             else:
+                 sections.append("No strong pattern correlations detected (|r| > 0.5).")
+                 sections.append("")
+
+         # Effect sizes
+         effect_sizes = enhanced_stats.get("effect_size", {})
+         if effect_sizes and "error" not in effect_sizes:
+             sections.append("##### Effect Size Analysis")
+             sections.append("")
+
+             green_vs_nongreen = effect_sizes.get("green_vs_nongreen_patterns")
+             if green_vs_nongreen:
+                 cohens_d = green_vs_nongreen.get("cohens_d", 0)
+                 magnitude = green_vs_nongreen.get("magnitude", "negligible")
+                 sections.append(f"**Green vs Non-Green Pattern Usage:**")
+                 sections.append(f"- Cohen's d: {cohens_d:.3f}")
+                 sections.append(f"- Effect magnitude: {magnitude.capitalize()}")
+                 sections.append("")
+
+         # Descriptive statistics
+         descriptive = enhanced_stats.get("descriptive", {})
+         if descriptive and "error" not in descriptive:
+             sections.append("##### Descriptive Statistics")
+             sections.append("")
+
+             patterns = descriptive.get("patterns_per_commit", {})
+             if patterns:
+                 sections.append("**Patterns per Commit:**")
+                 sections.append(f"- Mean: {patterns.get('mean', 0):.2f}")
+                 sections.append(f"- Median: {patterns.get('median', 0):.2f}")
+                 sections.append(f"- Standard deviation: {patterns.get('std', 0):.2f}")
+                 sections.append("")
+
+             repos = descriptive.get("green_commits_per_repo", {})
+             if repos:
+                 sections.append("**Green Commits per Repository:**")
+                 sections.append(f"- Mean: {repos.get('mean', 0):.2f}")
+                 sections.append(f"- Median: {repos.get('median', 0):.2f}")
+                 sections.append(f"- Standard deviation: {repos.get('std', 0):.2f}")
+                 sections.append("")
+
+         return "\n".join(sections)
+
      def _generate_discussion(self, data: dict[str, Any]) -> str:
          """Generate discussion section."""
          summary = data["summary"]
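`_generate_enhanced_statistics_section()` renders each subsection only when it finds the corresponding sub-keys, so missing blocks are silently skipped. An illustrative input shaped after the keys the method reads above (all values are made up; the class that owns the method is not shown in this diff):

# Illustrative shape of data["enhanced_statistics"] as consumed by the report section.
data = {
    "enhanced_statistics": {
        "temporal_trends": {
            "overall_trend": {"direction": "increasing", "significant": True},
            "monthly_stats": {"mean": 4.2, "median": 3.0, "std": 2.1},
        },
        "pattern_correlations": {
            "top_positive_correlations": [
                {"pattern1": "caching", "pattern2": "demand shaping", "correlation": 0.62},
            ],
        },
        "effect_size": {
            "green_vs_nongreen_patterns": {"cohens_d": 0.45, "magnitude": "small"},
        },
        "descriptive": {
            "patterns_per_commit": {"mean": 1.3, "median": 1.0, "std": 0.8},
            "green_commits_per_repo": {"mean": 6.5, "median": 5.0, "std": 4.2},
        },
    }
}
# markdown = report._generate_enhanced_statistics_section(data)
# where "report" is an instance of the (unshown) report generator class.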