greenmining 0.1.11__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,460 @@
1
+ """
2
+ Qualitative Analysis Framework for Pattern Validation
3
+
4
+ Implements qualitative validation from Soliman et al. (2017):
5
+ - Stratified random sampling for manual validation
6
+ - Precision/recall calculation framework
7
+ - Inter-rater reliability support
8
+ - False positive/negative tracking
9
+
10
+ Based on Soliman et al.: 42/151 studies used qualitative analysis
11
+ Critical for: validating IR-based approaches, calculating accuracy metrics
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import random
17
+ from typing import Dict, List, Optional, Set, Tuple
18
+ from dataclasses import dataclass
19
+ from collections import defaultdict
20
+ import json
21
+
22
+
23
@dataclass
class ValidationSample:
    """Represents a single validation sample drawn for manual review."""

    commit_sha: str                # SHA of the sampled commit
    commit_message: str            # full commit message shown to the reviewer
    code_diff: Optional[str]       # raw diff text, if it was collected
    repository: str                # repository the commit belongs to
    detected_patterns: List[str]   # pattern names flagged by the detector
    detection_method: str  # 'keyword', 'nlp', 'code_diff'
    validation_status: Optional[str] = None  # 'pending', 'validated', 'rejected'
    true_label: Optional[bool] = None  # Ground truth after manual review
    reviewer: Optional[str] = None     # name of the human reviewer
    review_notes: Optional[str] = None # free-form notes from the review
37
+
38
+
39
@dataclass
class ValidationMetrics:
    """Precision/recall metrics computed from manually validated samples."""

    true_positives: int    # detected as green AND truly green
    false_positives: int   # detected as green but not green
    true_negatives: int    # not detected AND truly not green
    false_negatives: int   # not detected but actually green
    precision: float       # TP / (TP + FP)
    recall: float          # TP / (TP + FN)
    f1_score: float        # harmonic mean of precision and recall
    accuracy: float        # (TP + TN) / total validated
51
+
52
+
53
+ class QualitativeAnalyzer:
54
+ """
55
+ Framework for manual validation and qualitative analysis.
56
+
57
+ Implements:
58
+ 1. Stratified sampling (ensure representation across categories)
59
+ 2. Validation workflow (export → review → import → calculate metrics)
60
+ 3. Precision/recall calculation
61
+ 4. Inter-rater reliability (if multiple reviewers)
62
+
63
+ Based on Soliman et al.: "42 studies used qualitative analysis for validation"
64
+ """
65
+
66
+ def __init__(self, sample_size: int = 30, stratify_by: str = "pattern"):
67
+ """
68
+ Initialize qualitative analyzer.
69
+
70
+ Args:
71
+ sample_size: Number of commits to sample for validation
72
+ stratify_by: Stratification method ('pattern', 'repository', 'time', 'random')
73
+ """
74
+ self.sample_size = sample_size
75
+ self.stratify_by = stratify_by
76
+ self.samples: List[ValidationSample] = []
77
+
78
+ def generate_validation_samples(
79
+ self, commits: List[Dict], analysis_results: List[Dict], include_negatives: bool = True
80
+ ) -> List[ValidationSample]:
81
+ """
82
+ Generate stratified validation samples.
83
+
84
+ Args:
85
+ commits: All commits
86
+ analysis_results: Pattern detection results
87
+ include_negatives: Include non-green commits for false negative detection
88
+
89
+ Returns:
90
+ List of ValidationSample objects
91
+ """
92
+ # Build commit lookup
93
+ commit_lookup = {c.get("hash", c.get("sha")): c for c in commits}
94
+
95
+ # Separate positives (detected as green) and negatives
96
+ positives = [r for r in analysis_results if r.get("is_green_aware", False)]
97
+ negatives = [r for r in analysis_results if not r.get("is_green_aware", False)]
98
+
99
+ samples = []
100
+
101
+ # Calculate sample distribution
102
+ if include_negatives:
103
+ # 80% positives, 20% negatives (to check false negatives)
104
+ pos_sample_size = int(self.sample_size * 0.8)
105
+ neg_sample_size = self.sample_size - pos_sample_size
106
+ else:
107
+ pos_sample_size = self.sample_size
108
+ neg_sample_size = 0
109
+
110
+ # Sample positives (stratified by pattern or repository)
111
+ if self.stratify_by == "pattern":
112
+ pos_samples = self._stratified_sample_by_pattern(positives, pos_sample_size)
113
+ elif self.stratify_by == "repository":
114
+ pos_samples = self._stratified_sample_by_repo(positives, commit_lookup, pos_sample_size)
115
+ else:
116
+ pos_samples = random.sample(positives, min(pos_sample_size, len(positives)))
117
+
118
+ # Sample negatives (random)
119
+ if include_negatives and negatives:
120
+ neg_samples = random.sample(negatives, min(neg_sample_size, len(negatives)))
121
+ else:
122
+ neg_samples = []
123
+
124
+ # Create ValidationSample objects
125
+ for result in pos_samples + neg_samples:
126
+ commit_sha = result.get("commit_sha")
127
+ commit = commit_lookup.get(commit_sha, {})
128
+
129
+ sample = ValidationSample(
130
+ commit_sha=commit_sha,
131
+ commit_message=commit.get("message", result.get("commit_message", "")),
132
+ code_diff=result.get("code_diff"),
133
+ repository=commit.get("repository", result.get("repository", "")),
134
+ detected_patterns=result.get("patterns_detected", []),
135
+ detection_method=result.get("detection_method", "keyword"),
136
+ validation_status="pending",
137
+ )
138
+ samples.append(sample)
139
+
140
+ self.samples = samples
141
+ return samples
142
+
143
+ def _stratified_sample_by_pattern(self, results: List[Dict], sample_size: int) -> List[Dict]:
144
+ """Stratified sampling ensuring each pattern category is represented."""
145
+ # Group by dominant pattern
146
+ pattern_groups = defaultdict(list)
147
+ for result in results:
148
+ patterns = result.get("patterns_detected", [])
149
+ if patterns:
150
+ # Use first pattern as primary
151
+ primary_pattern = patterns[0]
152
+ pattern_groups[primary_pattern].append(result)
153
+
154
+ # Calculate samples per pattern (proportional)
155
+ total = len(results)
156
+ samples = []
157
+
158
+ for pattern, group in pattern_groups.items():
159
+ proportion = len(group) / total
160
+ pattern_sample_size = max(1, int(sample_size * proportion))
161
+ pattern_samples = random.sample(group, min(pattern_sample_size, len(group)))
162
+ samples.extend(pattern_samples)
163
+
164
+ # If we have fewer than sample_size, add random extras
165
+ if len(samples) < sample_size and len(samples) < len(results):
166
+ remaining = [r for r in results if r not in samples]
167
+ extra_needed = min(sample_size - len(samples), len(remaining))
168
+ samples.extend(random.sample(remaining, extra_needed))
169
+
170
+ return samples[:sample_size]
171
+
172
+ def _stratified_sample_by_repo(
173
+ self, results: List[Dict], commit_lookup: Dict, sample_size: int
174
+ ) -> List[Dict]:
175
+ """Stratified sampling ensuring each repository is represented."""
176
+ # Group by repository
177
+ repo_groups = defaultdict(list)
178
+ for result in results:
179
+ commit_sha = result.get("commit_sha")
180
+ commit = commit_lookup.get(commit_sha, {})
181
+ repo = commit.get("repository", result.get("repository", "unknown"))
182
+ repo_groups[repo].append(result)
183
+
184
+ # Sample proportionally from each repo
185
+ samples = []
186
+ total = len(results)
187
+
188
+ for repo, group in repo_groups.items():
189
+ proportion = len(group) / total
190
+ repo_sample_size = max(1, int(sample_size * proportion))
191
+ repo_samples = random.sample(group, min(repo_sample_size, len(group)))
192
+ samples.extend(repo_samples)
193
+
194
+ return samples[:sample_size]
195
+
196
+ def export_samples_for_review(self, output_path: str) -> None:
197
+ """
198
+ Export validation samples to JSON for manual review.
199
+
200
+ Args:
201
+ output_path: Path to output JSON file
202
+ """
203
+ samples_data = []
204
+ for i, sample in enumerate(self.samples, 1):
205
+ samples_data.append(
206
+ {
207
+ "sample_id": i,
208
+ "commit_sha": sample.commit_sha,
209
+ "repository": sample.repository,
210
+ "commit_message": sample.commit_message,
211
+ "detected_patterns": sample.detected_patterns,
212
+ "detection_method": sample.detection_method,
213
+ "code_diff_preview": sample.code_diff[:500] if sample.code_diff else None,
214
+ "validation_status": sample.validation_status,
215
+ "true_label": sample.true_label,
216
+ "reviewer": sample.reviewer,
217
+ "review_notes": sample.review_notes,
218
+ "___INSTRUCTIONS___": "Set true_label to true/false, add reviewer name, add review_notes",
219
+ }
220
+ )
221
+
222
+ with open(output_path, "w") as f:
223
+ json.dump(samples_data, f, indent=2)
224
+
225
+ def import_validated_samples(self, input_path: str) -> None:
226
+ """
227
+ Import manually validated samples from JSON.
228
+
229
+ Args:
230
+ input_path: Path to JSON file with validated samples
231
+ """
232
+ with open(input_path, "r") as f:
233
+ samples_data = json.load(f)
234
+
235
+ # Update samples with validation results
236
+ for data in samples_data:
237
+ commit_sha = data["commit_sha"]
238
+
239
+ # Find matching sample
240
+ for sample in self.samples:
241
+ if sample.commit_sha == commit_sha:
242
+ sample.true_label = data.get("true_label")
243
+ sample.reviewer = data.get("reviewer")
244
+ sample.review_notes = data.get("review_notes")
245
+ sample.validation_status = (
246
+ "validated" if sample.true_label is not None else "pending"
247
+ )
248
+ break
249
+
250
+ def calculate_metrics(self) -> ValidationMetrics:
251
+ """
252
+ Calculate precision, recall, F1, and accuracy.
253
+
254
+ Returns:
255
+ ValidationMetrics object
256
+ """
257
+ # Count outcomes
258
+ tp = 0 # True positive: detected as green, truly green
259
+ fp = 0 # False positive: detected as green, not green
260
+ tn = 0 # True negative: not detected, truly not green
261
+ fn = 0 # False negative: not detected, but is green
262
+
263
+ for sample in self.samples:
264
+ if sample.true_label is None:
265
+ continue # Skip unvalidated samples
266
+
267
+ detected_as_green = len(sample.detected_patterns) > 0
268
+ truly_green = sample.true_label
269
+
270
+ if detected_as_green and truly_green:
271
+ tp += 1
272
+ elif detected_as_green and not truly_green:
273
+ fp += 1
274
+ elif not detected_as_green and not truly_green:
275
+ tn += 1
276
+ elif not detected_as_green and truly_green:
277
+ fn += 1
278
+
279
+ # Calculate metrics
280
+ total = tp + fp + tn + fn
281
+ precision = tp / (tp + fp) if (tp + fp) > 0 else 0
282
+ recall = tp / (tp + fn) if (tp + fn) > 0 else 0
283
+ f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
284
+ accuracy = (tp + tn) / total if total > 0 else 0
285
+
286
+ return ValidationMetrics(
287
+ true_positives=tp,
288
+ false_positives=fp,
289
+ true_negatives=tn,
290
+ false_negatives=fn,
291
+ precision=round(precision, 4),
292
+ recall=round(recall, 4),
293
+ f1_score=round(f1, 4),
294
+ accuracy=round(accuracy, 4),
295
+ )
296
+
297
+ def get_validation_report(self) -> Dict:
298
+ """
299
+ Generate comprehensive validation report.
300
+
301
+ Returns:
302
+ Dictionary with validation statistics and metrics
303
+ """
304
+ validated_count = sum(1 for s in self.samples if s.validation_status == "validated")
305
+ pending_count = sum(1 for s in self.samples if s.validation_status == "pending")
306
+
307
+ metrics = self.calculate_metrics() if validated_count > 0 else None
308
+
309
+ # Analyze false positives and false negatives
310
+ false_positives = [
311
+ {
312
+ "commit_sha": s.commit_sha,
313
+ "detected_patterns": s.detected_patterns,
314
+ "review_notes": s.review_notes,
315
+ }
316
+ for s in self.samples
317
+ if s.true_label is not None and len(s.detected_patterns) > 0 and not s.true_label
318
+ ]
319
+
320
+ false_negatives = [
321
+ {
322
+ "commit_sha": s.commit_sha,
323
+ "commit_message": s.commit_message[:100],
324
+ "review_notes": s.review_notes,
325
+ }
326
+ for s in self.samples
327
+ if s.true_label is not None and len(s.detected_patterns) == 0 and s.true_label
328
+ ]
329
+
330
+ # Pattern accuracy breakdown
331
+ pattern_accuracy = self._analyze_pattern_accuracy()
332
+
333
+ return {
334
+ "sampling": {
335
+ "total_samples": len(self.samples),
336
+ "validated_samples": validated_count,
337
+ "pending_samples": pending_count,
338
+ "validation_progress": (
339
+ round(validated_count / len(self.samples) * 100, 1) if self.samples else 0
340
+ ),
341
+ "stratification_method": self.stratify_by,
342
+ },
343
+ "metrics": {
344
+ "precision": metrics.precision if metrics else None,
345
+ "recall": metrics.recall if metrics else None,
346
+ "f1_score": metrics.f1_score if metrics else None,
347
+ "accuracy": metrics.accuracy if metrics else None,
348
+ "true_positives": metrics.true_positives if metrics else None,
349
+ "false_positives": metrics.false_positives if metrics else None,
350
+ "true_negatives": metrics.true_negatives if metrics else None,
351
+ "false_negatives": metrics.false_negatives if metrics else None,
352
+ },
353
+ "error_analysis": {
354
+ "false_positive_count": len(false_positives),
355
+ "false_negative_count": len(false_negatives),
356
+ "false_positives": false_positives[:5], # Top 5
357
+ "false_negatives": false_negatives[:5], # Top 5
358
+ },
359
+ "pattern_accuracy": pattern_accuracy,
360
+ }
361
+
362
+ def _analyze_pattern_accuracy(self) -> Dict:
363
+ """Analyze accuracy per pattern category."""
364
+ pattern_stats = defaultdict(lambda: {"tp": 0, "fp": 0})
365
+
366
+ for sample in self.samples:
367
+ if sample.true_label is None:
368
+ continue
369
+
370
+ for pattern in sample.detected_patterns:
371
+ if sample.true_label:
372
+ pattern_stats[pattern]["tp"] += 1
373
+ else:
374
+ pattern_stats[pattern]["fp"] += 1
375
+
376
+ # Calculate precision per pattern
377
+ pattern_accuracy = {}
378
+ for pattern, stats in pattern_stats.items():
379
+ total = stats["tp"] + stats["fp"]
380
+ precision = stats["tp"] / total if total > 0 else 0
381
+ pattern_accuracy[pattern] = {
382
+ "true_positives": stats["tp"],
383
+ "false_positives": stats["fp"],
384
+ "precision": round(precision, 4),
385
+ }
386
+
387
+ return pattern_accuracy
388
+
389
+ def get_inter_rater_reliability(
390
+ self,
391
+ samples_from_reviewer_a: List[ValidationSample],
392
+ samples_from_reviewer_b: List[ValidationSample],
393
+ ) -> Dict:
394
+ """
395
+ Calculate inter-rater reliability (Cohen's Kappa).
396
+
397
+ Args:
398
+ samples_from_reviewer_a: Samples validated by reviewer A
399
+ samples_from_reviewer_b: Samples validated by reviewer B (same commits)
400
+
401
+ Returns:
402
+ Dictionary with Cohen's Kappa and agreement statistics
403
+ """
404
+ # Match samples by commit_sha
405
+ matched_samples = []
406
+ for sample_a in samples_from_reviewer_a:
407
+ for sample_b in samples_from_reviewer_b:
408
+ if sample_a.commit_sha == sample_b.commit_sha:
409
+ matched_samples.append((sample_a, sample_b))
410
+ break
411
+
412
+ if not matched_samples:
413
+ return {"error": "No matching samples between reviewers"}
414
+
415
+ # Calculate agreement
416
+ agreements = 0
417
+ for sample_a, sample_b in matched_samples:
418
+ if sample_a.true_label == sample_b.true_label:
419
+ agreements += 1
420
+
421
+ observed_agreement = agreements / len(matched_samples)
422
+
423
+ # Calculate expected agreement (by chance)
424
+ a_positive = sum(1 for s, _ in matched_samples if s.true_label)
425
+ b_positive = sum(1 for _, s in matched_samples if s.true_label)
426
+ n = len(matched_samples)
427
+
428
+ p_a_yes = a_positive / n
429
+ p_b_yes = b_positive / n
430
+ expected_agreement = (p_a_yes * p_b_yes) + ((1 - p_a_yes) * (1 - p_b_yes))
431
+
432
+ # Cohen's Kappa
433
+ kappa = (
434
+ (observed_agreement - expected_agreement) / (1 - expected_agreement)
435
+ if expected_agreement < 1
436
+ else 1
437
+ )
438
+
439
+ return {
440
+ "cohens_kappa": round(kappa, 4),
441
+ "observed_agreement": round(observed_agreement, 4),
442
+ "expected_agreement": round(expected_agreement, 4),
443
+ "sample_count": n,
444
+ "interpretation": self._interpret_kappa(kappa),
445
+ }
446
+
447
+ def _interpret_kappa(self, kappa: float) -> str:
448
+ """Interpret Cohen's Kappa value."""
449
+ if kappa < 0:
450
+ return "Poor (less than chance)"
451
+ elif kappa < 0.20:
452
+ return "Slight"
453
+ elif kappa < 0.40:
454
+ return "Fair"
455
+ elif kappa < 0.60:
456
+ return "Moderate"
457
+ elif kappa < 0.80:
458
+ return "Substantial"
459
+ else:
460
+ return "Almost perfect"
@@ -0,0 +1,245 @@
1
+ """Enhanced statistical analyzer for green software patterns."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict, List
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ from scipy import stats
10
+
11
+
12
+ class EnhancedStatisticalAnalyzer:
13
+ """
14
+ Advanced statistical analyses for green software patterns.
15
+ Based on Soliman et al. quantitative validation techniques.
16
+ """
17
+
18
+ def analyze_pattern_correlations(self, commit_data: pd.DataFrame) -> Dict[str, Any]:
19
+ """
20
+ Analyze correlations between patterns.
21
+
22
+ Question: Do repositories that adopt caching also adopt resource limits?
23
+
24
+ Args:
25
+ commit_data: DataFrame with pattern columns
26
+
27
+ Returns:
28
+ Dictionary containing correlation matrix and significant pairs
29
+ """
30
+ # Create pattern co-occurrence matrix
31
+ pattern_columns = [col for col in commit_data.columns if col.startswith("pattern_")]
32
+
33
+ if not pattern_columns:
34
+ return {
35
+ "correlation_matrix": {},
36
+ "significant_pairs": [],
37
+ "interpretation": "No pattern columns found",
38
+ }
39
+
40
+ correlation_matrix = commit_data[pattern_columns].corr(method="pearson")
41
+
42
+ # Identify significant correlations
43
+ significant_pairs = []
44
+ for i, pattern1 in enumerate(pattern_columns):
45
+ for j, pattern2 in enumerate(pattern_columns[i + 1 :], start=i + 1):
46
+ corr_value = correlation_matrix.iloc[i, j]
47
+ if abs(corr_value) > 0.5: # Strong correlation threshold
48
+ significant_pairs.append(
49
+ {
50
+ "pattern1": pattern1,
51
+ "pattern2": pattern2,
52
+ "correlation": corr_value,
53
+ "strength": "strong" if abs(corr_value) > 0.7 else "moderate",
54
+ }
55
+ )
56
+
57
+ return {
58
+ "correlation_matrix": correlation_matrix.to_dict(),
59
+ "significant_pairs": significant_pairs,
60
+ "interpretation": self._interpret_correlations(significant_pairs),
61
+ }
62
+
63
+ def temporal_trend_analysis(self, commits_df: pd.DataFrame) -> Dict[str, Any]:
64
+ """
65
+ Analyze temporal trends in green awareness.
66
+
67
+ Techniques:
68
+ - Mann-Kendall trend test (monotonic trend detection)
69
+ - Seasonal decomposition (identify cyclical patterns)
70
+ - Change point detection (identify sudden shifts)
71
+
72
+ Args:
73
+ commits_df: DataFrame with date and green_aware columns
74
+
75
+ Returns:
76
+ Dictionary containing trend analysis results
77
+ """
78
+ # Prepare time series data
79
+ commits_df["date"] = pd.to_datetime(commits_df["date"])
80
+ commits_df = commits_df.sort_values("date")
81
+
82
+ # Monthly aggregation
83
+ monthly = (
84
+ commits_df.set_index("date")
85
+ .resample("ME")
86
+ .agg({"green_aware": "sum", "commit_hash": "count"})
87
+ )
88
+ monthly.columns = ["green_aware", "total_commits"]
89
+ monthly["green_rate"] = monthly["green_aware"] / monthly["total_commits"]
90
+
91
+ # Mann-Kendall trend test
92
+ mk_result = stats.kendalltau(range(len(monthly)), monthly["green_rate"])
93
+ trend_direction = "increasing" if mk_result.correlation > 0 else "decreasing"
94
+ trend_significant = bool(mk_result.pvalue < 0.05)
95
+
96
+ # Seasonal decomposition (requires at least 2 years of data)
97
+ seasonal_pattern = None
98
+ if len(monthly) >= 24:
99
+ try:
100
+ from statsmodels.tsa.seasonal import seasonal_decompose
101
+
102
+ decomposition = seasonal_decompose(
103
+ monthly["green_rate"], model="additive", period=12
104
+ )
105
+ seasonal_pattern = decomposition.seasonal.to_dict()
106
+ except Exception:
107
+ seasonal_pattern = None
108
+
109
+ # Change point detection (simple method: rolling window variance)
110
+ window_size = 3
111
+ monthly["rolling_var"] = monthly["green_rate"].rolling(window=window_size).var()
112
+ change_points = monthly[
113
+ monthly["rolling_var"]
114
+ > monthly["rolling_var"].mean() + 2 * monthly["rolling_var"].std()
115
+ ]
116
+
117
+ return {
118
+ "trend": {
119
+ "direction": trend_direction,
120
+ "significant": trend_significant,
121
+ "correlation": mk_result.correlation,
122
+ "p_value": mk_result.pvalue,
123
+ },
124
+ "seasonal_pattern": seasonal_pattern,
125
+ "change_points": change_points.index.tolist() if not change_points.empty else [],
126
+ "monthly_data": monthly.to_dict(),
127
+ }
128
+
129
+ def effect_size_analysis(self, group1: List[float], group2: List[float]) -> Dict[str, Any]:
130
+ """
131
+ Calculate effect size between two groups.
132
+
133
+ Use case: Compare green awareness between:
134
+ - Different programming languages
135
+ - Different time periods
136
+ - Different repository sizes
137
+
138
+ Args:
139
+ group1: First group values
140
+ group2: Second group values
141
+
142
+ Returns:
143
+ Dictionary containing effect size metrics
144
+ """
145
+ # Cohen's d (effect size)
146
+ mean1, mean2 = np.mean(group1), np.mean(group2)
147
+ std1, std2 = np.std(group1, ddof=1), np.std(group2, ddof=1)
148
+ pooled_std = np.sqrt((std1**2 + std2**2) / 2)
149
+
150
+ if pooled_std == 0:
151
+ cohens_d = 0
152
+ else:
153
+ cohens_d = (mean1 - mean2) / pooled_std
154
+
155
+ # Interpretation
156
+ if abs(cohens_d) < 0.2:
157
+ magnitude = "negligible"
158
+ elif abs(cohens_d) < 0.5:
159
+ magnitude = "small"
160
+ elif abs(cohens_d) < 0.8:
161
+ magnitude = "medium"
162
+ else:
163
+ magnitude = "large"
164
+
165
+ # Statistical significance
166
+ t_stat, p_value = stats.ttest_ind(group1, group2)
167
+
168
+ return {
169
+ "cohens_d": cohens_d,
170
+ "magnitude": magnitude,
171
+ "mean_difference": mean1 - mean2,
172
+ "t_statistic": t_stat,
173
+ "p_value": p_value,
174
+ "significant": bool(p_value < 0.05),
175
+ }
176
+
177
+ def pattern_adoption_rate_analysis(self, commits_df: pd.DataFrame) -> Dict[str, Any]:
178
+ """
179
+ Analyze pattern adoption rates over repository lifetime.
180
+
181
+ Metrics:
182
+ - Time to first adoption (TTFA)
183
+ - Adoption acceleration
184
+ - Pattern stickiness (continued use after adoption)
185
+
186
+ Args:
187
+ commits_df: DataFrame with pattern and date columns
188
+
189
+ Returns:
190
+ Dictionary mapping patterns to adoption metrics
191
+ """
192
+ results = {}
193
+
194
+ for pattern in commits_df["pattern"].unique():
195
+ pattern_commits = commits_df[commits_df["pattern"] == pattern].sort_values("date")
196
+
197
+ if len(pattern_commits) == 0:
198
+ continue
199
+
200
+ # Time to first adoption
201
+ first_adoption = pattern_commits.iloc[0]["date"]
202
+ repo_start = commits_df["date"].min()
203
+ ttfa_days = (first_adoption - repo_start).days
204
+
205
+ # Adoption frequency over time
206
+ monthly_adoption = pattern_commits.set_index("date").resample("ME").size()
207
+
208
+ # Pattern stickiness (months with at least one adoption)
209
+ total_months = len(commits_df.set_index("date").resample("ME").size())
210
+ active_months = len(monthly_adoption[monthly_adoption > 0])
211
+ stickiness = active_months / total_months if total_months > 0 else 0
212
+
213
+ results[pattern] = {
214
+ "ttfa_days": ttfa_days,
215
+ "total_adoptions": len(pattern_commits),
216
+ "stickiness": stickiness,
217
+ "monthly_adoption_rate": monthly_adoption.mean(),
218
+ }
219
+
220
+ return results
221
+
222
+ def _interpret_correlations(self, significant_pairs: List[Dict[str, Any]]) -> str:
223
+ """
224
+ Generate interpretation of correlation results.
225
+
226
+ Args:
227
+ significant_pairs: List of significant correlation pairs
228
+
229
+ Returns:
230
+ Interpretation string
231
+ """
232
+ if not significant_pairs:
233
+ return "No significant correlations found between patterns."
234
+
235
+ interpretations = []
236
+ for pair in significant_pairs[:5]: # Top 5
237
+ p1 = pair["pattern1"].replace("pattern_", "")
238
+ p2 = pair["pattern2"].replace("pattern_", "")
239
+ corr = pair["correlation"]
240
+ if corr > 0:
241
+ interpretations.append(f"{p1} and {p2} tend to be adopted together (r={corr:.2f})")
242
+ else:
243
+ interpretations.append(f"{p1} and {p2} rarely co-occur (r={corr:.2f})")
244
+
245
+ return "; ".join(interpretations)