greenmining 0.1.12__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,434 @@
+ """
+ Temporal and Historical Analysis for Green Software Practices
+
+ Implements time-series analysis from Soliman et al. (2017):
+ - Adoption trend analysis (when practices emerged)
+ - Velocity analysis (commit frequency over time)
+ - Pattern evolution tracking (which practices dominated when)
+ - Temporal correlations (do practices cluster in time?)
+
+ Addresses research questions:
+ 1. When did green practices emerge in software development?
+ 2. Are green practices increasing or decreasing over time?
+ 3. Which practices were early vs. late adopters?
+ 4. Do green practices correlate with project maturity?
+ """
+
+ from __future__ import annotations
+
+ from datetime import datetime, timedelta
+ from typing import Dict, List, Optional, Tuple
+ from dataclasses import dataclass
+ from collections import defaultdict
+ import statistics
+
+
+ @dataclass
+ class TemporalMetrics:
+     """Metrics for a specific time period"""
+
+     period: str
+     start_date: datetime
+     end_date: datetime
+     commit_count: int
+     green_commit_count: int
+     green_awareness_rate: float
+     unique_patterns: int
+     dominant_pattern: Optional[str]
+     velocity: float  # commits per day
+
+
+ @dataclass
+ class TrendAnalysis:
+     """Trend analysis results"""
+
+     trend_direction: str  # 'increasing', 'decreasing', 'stable'
+     slope: float
+     r_squared: float
+     start_rate: float
+     end_rate: float
+     change_percentage: float
+
+
+ class TemporalAnalyzer:
+     """
+     Analyze temporal patterns in green software adoption.
+
+     Based on Soliman et al.: "Time-range filtering is standard practice"
+     Extends with: trend detection, velocity analysis, evolution tracking
+     """
+
+     def __init__(self, granularity: str = "quarter"):
+         """
+         Initialize temporal analyzer.
+
+         Args:
+             granularity: Time period granularity ('day', 'week', 'month', 'quarter', 'year')
+         """
+         self.granularity = granularity
+
+     def group_commits_by_period(
+         self, commits: List[Dict], date_field: str = "date"
+     ) -> Dict[str, List[Dict]]:
+         """
+         Group commits into time periods.
+
+         Args:
+             commits: List of commit dictionaries
+             date_field: Field containing commit date
+
+         Returns:
+             Dictionary mapping period strings to commit lists
+         """
+         periods = defaultdict(list)
+
+         for commit in commits:
+             date_str = commit.get(date_field)
+             if not date_str:
+                 continue
+
+             # Parse date
+             try:
+                 if isinstance(date_str, datetime):
+                     date = date_str
+                 else:
+                     date = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
+             except (ValueError, AttributeError):
+                 continue
+
+             # Determine period
+             period_key = self._get_period_key(date)
+             periods[period_key].append(commit)
+
+         return dict(periods)
+
+     def _get_period_key(self, date: datetime) -> str:
+         """Get period key for a date based on granularity."""
+         if self.granularity == "day":
+             return date.strftime("%Y-%m-%d")
+         elif self.granularity == "week":
+             # ISO year and week number; the ISO year can differ from the
+             # calendar year at year boundaries (e.g. 2024-12-30 is 2025-W01)
+             iso_year, iso_week, _ = date.isocalendar()
+             return f"{iso_year}-W{iso_week:02d}"
+         elif self.granularity == "month":
+             return date.strftime("%Y-%m")
+         elif self.granularity == "quarter":
+             quarter = (date.month - 1) // 3 + 1
+             return f"{date.year}-Q{quarter}"
+         elif self.granularity == "year":
+             return str(date.year)
+         else:
+             return date.strftime("%Y-%m")
+
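For reference, a minimal sketch of the period keys this method produces for a single date; the module path `greenmining.temporal` is an assumption, since the diff does not show the file name.

from datetime import datetime
from greenmining.temporal import TemporalAnalyzer  # assumed module path

date = datetime(2024, 5, 17)
for granularity in ("day", "week", "month", "quarter", "year"):
    print(granularity, TemporalAnalyzer(granularity)._get_period_key(date))
# day 2024-05-17, week 2024-W20, month 2024-05, quarter 2024-Q2, year 2024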
+     def _parse_period_key(self, period_key: str) -> Tuple[datetime, datetime]:
+         """Parse period key back to start and end dates."""
+         if "W" in period_key:
+             # Week format: 2024-W15 (ISO week, matching the key built in _get_period_key)
+             year, week = period_key.split("-W")
+             start = datetime.strptime(f"{year}-W{week}-1", "%G-W%V-%u")
+             end = start + timedelta(days=7) - timedelta(seconds=1)
+         elif "Q" in period_key:
+             # Quarter format: 2024-Q1
+             year, quarter = period_key.split("-Q")
+             quarter_num = int(quarter)
+             start_month = (quarter_num - 1) * 3 + 1
+             start = datetime(int(year), start_month, 1)
+             # End of quarter: first day of the next quarter minus one second
+             if quarter_num == 4:
+                 next_quarter_start = datetime(int(year) + 1, 1, 1)
+             else:
+                 next_quarter_start = datetime(int(year), start_month + 3, 1)
+             end = next_quarter_start - timedelta(seconds=1)
+         elif len(period_key) == 4:
+             # Year format: 2024
+             start = datetime(int(period_key), 1, 1)
+             end = datetime(int(period_key), 12, 31, 23, 59, 59)
+         elif len(period_key) == 7:
+             # Month format: 2024-01
+             start = datetime.strptime(period_key, "%Y-%m")
+             next_month = start.month + 1 if start.month < 12 else 1
+             next_year = start.year if start.month < 12 else start.year + 1
+             end = datetime(next_year, next_month, 1) - timedelta(seconds=1)
+         elif len(period_key) == 10:
+             # Day format: 2024-01-15
+             start = datetime.strptime(period_key, "%Y-%m-%d")
+             end = start + timedelta(days=1) - timedelta(seconds=1)
+         else:
+             # Fallback: treat as a month and approximate a 30-day span
+             start = datetime.strptime(period_key[:7], "%Y-%m")
+             end = start + timedelta(days=30)
+
+         return start, end
+
+     def calculate_period_metrics(
+         self, period_key: str, commits: List[Dict], analysis_results: List[Dict]
+     ) -> TemporalMetrics:
+         """
+         Calculate metrics for a time period.
+
+         Args:
+             period_key: Period identifier
+             commits: Commits in this period
+             analysis_results: Pattern analysis results for commits
+
+         Returns:
+             TemporalMetrics object
+         """
+         start_date, end_date = self._parse_period_key(period_key)
+
+         # Count green commits
+         commit_hashes = {c.get("hash", c.get("sha")) for c in commits}
+         green_results = [
+             r
+             for r in analysis_results
+             if r.get("commit_sha") in commit_hashes and r.get("is_green_aware", False)
+         ]
+
+         green_count = len(green_results)
+         total_count = len(commits)
+         green_rate = (green_count / total_count * 100) if total_count > 0 else 0
+
+         # Count unique patterns
+         all_patterns = set()
+         for result in green_results:
+             patterns = result.get("patterns_detected", [])
+             all_patterns.update(patterns)
+
+         # Find dominant pattern
+         pattern_counts = defaultdict(int)
+         for result in green_results:
+             for pattern in result.get("patterns_detected", []):
+                 pattern_counts[pattern] += 1
+
+         dominant = max(pattern_counts.items(), key=lambda x: x[1])[0] if pattern_counts else None
+
+         # Calculate velocity (commits per day)
+         days = (end_date - start_date).days + 1
+         velocity = total_count / days if days > 0 else 0
+
+         return TemporalMetrics(
+             period=period_key,
+             start_date=start_date,
+             end_date=end_date,
+             commit_count=total_count,
+             green_commit_count=green_count,
+             green_awareness_rate=round(green_rate, 2),
+             unique_patterns=len(all_patterns),
+             dominant_pattern=dominant,
+             velocity=round(velocity, 2),
+         )
+
+     def analyze_trends(self, commits: List[Dict], analysis_results: List[Dict]) -> Dict:
+         """
+         Comprehensive temporal trend analysis.
+
+         Args:
+             commits: All commits to analyze
+             analysis_results: Pattern analysis results
+
+         Returns:
+             Dictionary with:
+             - periods: List of TemporalMetrics
+             - trend: TrendAnalysis
+             - adoption_curve: List of (period, cumulative_rate)
+             - velocity_trend: Velocity change over time
+         """
+         # Group by periods
+         grouped = self.group_commits_by_period(commits)
+
+         # Calculate metrics per period
+         periods = []
+         for period_key in sorted(grouped.keys()):
+             metrics = self.calculate_period_metrics(
+                 period_key, grouped[period_key], analysis_results
+             )
+             periods.append(metrics)
+
+         # Analyze trend
+         trend = self._calculate_trend(periods)
+
+         # Calculate adoption curve (cumulative)
+         adoption_curve = self._calculate_adoption_curve(periods)
+
+         # Velocity trend
+         velocity_trend = self._calculate_velocity_trend(periods)
+
+         # Pattern evolution (which patterns emerged when)
+         pattern_evolution = self._analyze_pattern_evolution(periods, analysis_results)
+
+         return {
+             "periods": [self._metrics_to_dict(m) for m in periods],
+             "trend": self._trend_to_dict(trend),
+             "adoption_curve": adoption_curve,
+             "velocity_trend": velocity_trend,
+             "pattern_evolution": pattern_evolution,
+             "summary": {
+                 "total_periods": len(periods),
+                 "first_period": periods[0].period if periods else None,
+                 "last_period": periods[-1].period if periods else None,
+                 "overall_trend": trend.trend_direction if trend else "unknown",
+                 "average_green_rate": round(
+                     statistics.mean([p.green_awareness_rate for p in periods]) if periods else 0, 2
+                 ),
+             },
+         }
+
+     def _calculate_trend(self, periods: List[TemporalMetrics]) -> Optional[TrendAnalysis]:
+         """Calculate linear trend using least squares regression."""
+         if len(periods) < 2:
+             return None
+
+         # Simple linear regression
+         n = len(periods)
+         x = list(range(n))  # Period index
+         y = [p.green_awareness_rate for p in periods]
+
+         # Calculate slope and intercept
+         x_mean = statistics.mean(x)
+         y_mean = statistics.mean(y)
+
+         numerator = sum((x[i] - x_mean) * (y[i] - y_mean) for i in range(n))
+         denominator = sum((x[i] - x_mean) ** 2 for i in range(n))
+
+         slope = numerator / denominator if denominator != 0 else 0
+         intercept = y_mean - slope * x_mean
+
+         # Calculate R²
+         y_pred = [slope * xi + intercept for xi in x]
+         ss_res = sum((y[i] - y_pred[i]) ** 2 for i in range(n))
+         ss_tot = sum((yi - y_mean) ** 2 for yi in y)
+         r_squared = 1 - (ss_res / ss_tot) if ss_tot != 0 else 0
+
+         # Determine trend direction
+         if abs(slope) < 0.1:
+             direction = "stable"
+         elif slope > 0:
+             direction = "increasing"
+         else:
+             direction = "decreasing"
+
+         # Calculate change
+         start_rate = y[0]
+         end_rate = y[-1]
+         change = ((end_rate - start_rate) / start_rate * 100) if start_rate != 0 else 0
+
+         return TrendAnalysis(
+             trend_direction=direction,
+             slope=round(slope, 4),
+             r_squared=round(r_squared, 4),
+             start_rate=round(start_rate, 2),
+             end_rate=round(end_rate, 2),
+             change_percentage=round(change, 2),
+         )
+
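As an illustration of the regression above: three periods with green-awareness rates of 10%, 20% and 30% give a perfectly linear fit. A minimal sketch with synthetic TemporalMetrics values (same assumed module path; the pattern name "caching" is hypothetical):

from datetime import datetime
from greenmining.temporal import TemporalAnalyzer, TemporalMetrics  # assumed module path

periods = [
    TemporalMetrics(
        period=f"2024-Q{i + 1}",
        start_date=datetime(2024, 3 * i + 1, 1),
        end_date=datetime(2024, 3 * i + 3, 28, 23, 59, 59),
        commit_count=100,
        green_commit_count=(i + 1) * 10,
        green_awareness_rate=(i + 1) * 10.0,  # 10.0, 20.0, 30.0
        unique_patterns=1,
        dominant_pattern="caching",
        velocity=1.0,
    )
    for i in range(3)
]
trend = TemporalAnalyzer()._calculate_trend(periods)
# slope == 10.0, r_squared == 1.0, change_percentage == 200.0, direction "increasing"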
+     def _calculate_adoption_curve(self, periods: List[TemporalMetrics]) -> List[Tuple[str, float]]:
+         """Calculate cumulative adoption over time."""
+         cumulative_green = 0
+         cumulative_total = 0
+         curve = []
+
+         for period in periods:
+             cumulative_green += period.green_commit_count
+             cumulative_total += period.commit_count
+             cumulative_rate = (
+                 (cumulative_green / cumulative_total * 100) if cumulative_total > 0 else 0
+             )
+             curve.append((period.period, round(cumulative_rate, 2)))
+
+         return curve
+
+     def _calculate_velocity_trend(self, periods: List[TemporalMetrics]) -> Dict:
+         """Analyze velocity changes over time."""
+         if not periods:
+             return {}
+
+         velocities = [p.velocity for p in periods]
+
+         return {
+             "average_velocity": round(statistics.mean(velocities), 2),
+             "velocity_std": round(statistics.stdev(velocities), 2) if len(velocities) > 1 else 0,
+             "min_velocity": round(min(velocities), 2),
+             "max_velocity": round(max(velocities), 2),
+             "velocity_by_period": [(p.period, p.velocity) for p in periods],
+         }
+
+     def _analyze_pattern_evolution(
+         self, periods: List[TemporalMetrics], analysis_results: List[Dict]
+     ) -> Dict:
+         """Track when different patterns emerged and dominated."""
+         pattern_timeline = defaultdict(lambda: {"first_seen": None, "occurrences_by_period": {}})
+
+         for period in periods:
+             # Count pattern occurrences for commits falling in this period
+             period_patterns = defaultdict(int)
+
+             for result in analysis_results:
+                 commit_date_str = result.get("commit_date")
+                 if not commit_date_str:
+                     continue
+
+                 try:
+                     if isinstance(commit_date_str, datetime):
+                         commit_date = commit_date_str
+                     else:
+                         commit_date = datetime.fromisoformat(commit_date_str.replace("Z", "+00:00"))
+
+                     # Drop timezone info so the comparison against the naive
+                     # period boundaries cannot raise TypeError
+                     if commit_date.tzinfo is not None:
+                         commit_date = commit_date.replace(tzinfo=None)
+
+                     if period.start_date <= commit_date <= period.end_date:
+                         for pattern in result.get("patterns_detected", []):
+                             period_patterns[pattern] += 1
+                 except (ValueError, AttributeError):
+                     continue
+
+             # Record occurrences
+             for pattern, count in period_patterns.items():
+                 if pattern_timeline[pattern]["first_seen"] is None:
+                     pattern_timeline[pattern]["first_seen"] = period.period
+                 pattern_timeline[pattern]["occurrences_by_period"][period.period] = count
+
+         return {
+             pattern: {
+                 "first_seen": data["first_seen"],
+                 "total_occurrences": sum(data["occurrences_by_period"].values()),
+                 "periods_active": len(data["occurrences_by_period"]),
+                 "timeline": data["occurrences_by_period"],
+             }
+             for pattern, data in pattern_timeline.items()
+         }
+
+     def _metrics_to_dict(self, metrics: TemporalMetrics) -> Dict:
+         """Convert TemporalMetrics to dictionary."""
+         return {
+             "period": metrics.period,
+             "start_date": metrics.start_date.isoformat(),
+             "end_date": metrics.end_date.isoformat(),
+             "commit_count": metrics.commit_count,
+             "green_commit_count": metrics.green_commit_count,
+             "green_awareness_rate": metrics.green_awareness_rate,
+             "unique_patterns": metrics.unique_patterns,
+             "dominant_pattern": metrics.dominant_pattern,
+             "velocity": metrics.velocity,
+         }
+
+     def _trend_to_dict(self, trend: Optional[TrendAnalysis]) -> Dict:
+         """Convert TrendAnalysis to dictionary."""
+         if not trend:
+             return {}
+
+         return {
+             "trend_direction": trend.trend_direction,
+             "slope": trend.slope,
+             "r_squared": trend.r_squared,
+             "start_rate": trend.start_rate,
+             "end_rate": trend.end_rate,
+             "change_percentage": trend.change_percentage,
+         }
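A minimal end-to-end sketch of how the class is meant to be driven, assuming the module path greenmining.temporal and using the field names the methods above read ("hash" and "date" on commits; "commit_sha", "commit_date", "is_green_aware", "patterns_detected" on analysis results). The commit hashes and pattern names are hypothetical.

from greenmining.temporal import TemporalAnalyzer  # assumed module path

commits = [
    {"hash": "a1", "date": "2024-01-10T12:00:00Z"},
    {"hash": "b2", "date": "2024-04-02T09:30:00Z"},
    {"hash": "c3", "date": "2024-04-20T17:45:00Z"},
]
analysis_results = [
    {"commit_sha": "b2", "commit_date": "2024-04-02T09:30:00Z",
     "is_green_aware": True, "patterns_detected": ["caching"]},
    {"commit_sha": "c3", "commit_date": "2024-04-20T17:45:00Z",
     "is_green_aware": True, "patterns_detected": ["batching", "caching"]},
]

report = TemporalAnalyzer(granularity="quarter").analyze_trends(commits, analysis_results)
print(report["summary"])         # overall trend and average green-awareness rate
print(report["adoption_curve"])  # [("2024-Q1", 0.0), ("2024-Q2", 66.67)]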