greenmining 0.1.12__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- greenmining/__version__.py +1 -1
- greenmining/analyzers/__init__.py +17 -0
- greenmining/analyzers/code_diff_analyzer.py +238 -0
- greenmining/analyzers/ml_feature_extractor.py +512 -0
- greenmining/analyzers/nlp_analyzer.py +365 -0
- greenmining/analyzers/qualitative_analyzer.py +460 -0
- greenmining/analyzers/statistical_analyzer.py +245 -0
- greenmining/analyzers/temporal_analyzer.py +434 -0
- greenmining/cli.py +119 -24
- greenmining/config.py +21 -0
- greenmining/controllers/repository_controller.py +50 -2
- greenmining/gsf_patterns.py +10 -5
- greenmining/models/aggregated_stats.py +3 -1
- greenmining/models/commit.py +3 -0
- greenmining/models/repository.py +3 -1
- greenmining/presenters/console_presenter.py +3 -1
- greenmining/services/commit_extractor.py +37 -7
- greenmining/services/data_aggregator.py +171 -7
- greenmining/services/data_analyzer.py +111 -8
- greenmining/services/github_fetcher.py +62 -5
- greenmining/services/reports.py +123 -2
- {greenmining-0.1.12.dist-info → greenmining-1.0.2.dist-info}/METADATA +250 -22
- greenmining-1.0.2.dist-info/RECORD +36 -0
- greenmining-0.1.12.dist-info/RECORD +0 -29
- {greenmining-0.1.12.dist-info → greenmining-1.0.2.dist-info}/WHEEL +0 -0
- {greenmining-0.1.12.dist-info → greenmining-1.0.2.dist-info}/entry_points.txt +0 -0
- {greenmining-0.1.12.dist-info → greenmining-1.0.2.dist-info}/licenses/LICENSE +0 -0
- {greenmining-0.1.12.dist-info → greenmining-1.0.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,460 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Qualitative Analysis Framework for Pattern Validation
|
|
3
|
+
|
|
4
|
+
Implements qualitative validation from Soliman et al. (2017):
|
|
5
|
+
- Stratified random sampling for manual validation
|
|
6
|
+
- Precision/recall calculation framework
|
|
7
|
+
- Inter-rater reliability support
|
|
8
|
+
- False positive/negative tracking
|
|
9
|
+
|
|
10
|
+
Based on Soliman et al.: 42/151 studies used qualitative analysis
|
|
11
|
+
Critical for: validating IR-based approaches, calculating accuracy metrics
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import random
|
|
17
|
+
from typing import Dict, List, Optional, Set, Tuple
|
|
18
|
+
from dataclasses import dataclass
|
|
19
|
+
from collections import defaultdict
|
|
20
|
+
import json
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class ValidationSample:
    """Represents a single validation sample"""

    # Identity of the commit under manual review
    commit_sha: str
    commit_message: str
    # Diff text when captured by the detector; None otherwise
    code_diff: Optional[str]
    repository: str
    # Pattern names assigned by the automated detector (empty list = negative sample)
    detected_patterns: List[str]
    detection_method: str  # 'keyword', 'nlp', 'code_diff'
    validation_status: Optional[str] = None  # 'pending', 'validated', 'rejected'
    true_label: Optional[bool] = None  # Ground truth after manual review
    reviewer: Optional[str] = None  # Name of the human reviewer, once assigned
    review_notes: Optional[str] = None  # Free-form notes from the manual review
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class ValidationMetrics:
    """Precision/recall metrics for validation"""

    # Confusion-matrix counts over manually validated samples
    true_positives: int
    false_positives: int
    true_negatives: int
    false_negatives: int
    # Derived scores; the producer rounds these to 4 decimal places
    precision: float
    recall: float
    f1_score: float
    accuracy: float
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class QualitativeAnalyzer:
|
|
54
|
+
"""
|
|
55
|
+
Framework for manual validation and qualitative analysis.
|
|
56
|
+
|
|
57
|
+
Implements:
|
|
58
|
+
1. Stratified sampling (ensure representation across categories)
|
|
59
|
+
2. Validation workflow (export → review → import → calculate metrics)
|
|
60
|
+
3. Precision/recall calculation
|
|
61
|
+
4. Inter-rater reliability (if multiple reviewers)
|
|
62
|
+
|
|
63
|
+
Based on Soliman et al.: "42 studies used qualitative analysis for validation"
|
|
64
|
+
"""
|
|
65
|
+
|
|
66
|
+
def __init__(self, sample_size: int = 30, stratify_by: str = "pattern"):
|
|
67
|
+
"""
|
|
68
|
+
Initialize qualitative analyzer.
|
|
69
|
+
|
|
70
|
+
Args:
|
|
71
|
+
sample_size: Number of commits to sample for validation
|
|
72
|
+
stratify_by: Stratification method ('pattern', 'repository', 'time', 'random')
|
|
73
|
+
"""
|
|
74
|
+
self.sample_size = sample_size
|
|
75
|
+
self.stratify_by = stratify_by
|
|
76
|
+
self.samples: List[ValidationSample] = []
|
|
77
|
+
|
|
78
|
+
def generate_validation_samples(
|
|
79
|
+
self, commits: List[Dict], analysis_results: List[Dict], include_negatives: bool = True
|
|
80
|
+
) -> List[ValidationSample]:
|
|
81
|
+
"""
|
|
82
|
+
Generate stratified validation samples.
|
|
83
|
+
|
|
84
|
+
Args:
|
|
85
|
+
commits: All commits
|
|
86
|
+
analysis_results: Pattern detection results
|
|
87
|
+
include_negatives: Include non-green commits for false negative detection
|
|
88
|
+
|
|
89
|
+
Returns:
|
|
90
|
+
List of ValidationSample objects
|
|
91
|
+
"""
|
|
92
|
+
# Build commit lookup
|
|
93
|
+
commit_lookup = {c.get("hash", c.get("sha")): c for c in commits}
|
|
94
|
+
|
|
95
|
+
# Separate positives (detected as green) and negatives
|
|
96
|
+
positives = [r for r in analysis_results if r.get("is_green_aware", False)]
|
|
97
|
+
negatives = [r for r in analysis_results if not r.get("is_green_aware", False)]
|
|
98
|
+
|
|
99
|
+
samples = []
|
|
100
|
+
|
|
101
|
+
# Calculate sample distribution
|
|
102
|
+
if include_negatives:
|
|
103
|
+
# 80% positives, 20% negatives (to check false negatives)
|
|
104
|
+
pos_sample_size = int(self.sample_size * 0.8)
|
|
105
|
+
neg_sample_size = self.sample_size - pos_sample_size
|
|
106
|
+
else:
|
|
107
|
+
pos_sample_size = self.sample_size
|
|
108
|
+
neg_sample_size = 0
|
|
109
|
+
|
|
110
|
+
# Sample positives (stratified by pattern or repository)
|
|
111
|
+
if self.stratify_by == "pattern":
|
|
112
|
+
pos_samples = self._stratified_sample_by_pattern(positives, pos_sample_size)
|
|
113
|
+
elif self.stratify_by == "repository":
|
|
114
|
+
pos_samples = self._stratified_sample_by_repo(positives, commit_lookup, pos_sample_size)
|
|
115
|
+
else:
|
|
116
|
+
pos_samples = random.sample(positives, min(pos_sample_size, len(positives)))
|
|
117
|
+
|
|
118
|
+
# Sample negatives (random)
|
|
119
|
+
if include_negatives and negatives:
|
|
120
|
+
neg_samples = random.sample(negatives, min(neg_sample_size, len(negatives)))
|
|
121
|
+
else:
|
|
122
|
+
neg_samples = []
|
|
123
|
+
|
|
124
|
+
# Create ValidationSample objects
|
|
125
|
+
for result in pos_samples + neg_samples:
|
|
126
|
+
commit_sha = result.get("commit_sha")
|
|
127
|
+
commit = commit_lookup.get(commit_sha, {})
|
|
128
|
+
|
|
129
|
+
sample = ValidationSample(
|
|
130
|
+
commit_sha=commit_sha,
|
|
131
|
+
commit_message=commit.get("message", result.get("commit_message", "")),
|
|
132
|
+
code_diff=result.get("code_diff"),
|
|
133
|
+
repository=commit.get("repository", result.get("repository", "")),
|
|
134
|
+
detected_patterns=result.get("patterns_detected", []),
|
|
135
|
+
detection_method=result.get("detection_method", "keyword"),
|
|
136
|
+
validation_status="pending",
|
|
137
|
+
)
|
|
138
|
+
samples.append(sample)
|
|
139
|
+
|
|
140
|
+
self.samples = samples
|
|
141
|
+
return samples
|
|
142
|
+
|
|
143
|
+
def _stratified_sample_by_pattern(self, results: List[Dict], sample_size: int) -> List[Dict]:
|
|
144
|
+
"""Stratified sampling ensuring each pattern category is represented."""
|
|
145
|
+
# Group by dominant pattern
|
|
146
|
+
pattern_groups = defaultdict(list)
|
|
147
|
+
for result in results:
|
|
148
|
+
patterns = result.get("patterns_detected", [])
|
|
149
|
+
if patterns:
|
|
150
|
+
# Use first pattern as primary
|
|
151
|
+
primary_pattern = patterns[0]
|
|
152
|
+
pattern_groups[primary_pattern].append(result)
|
|
153
|
+
|
|
154
|
+
# Calculate samples per pattern (proportional)
|
|
155
|
+
total = len(results)
|
|
156
|
+
samples = []
|
|
157
|
+
|
|
158
|
+
for pattern, group in pattern_groups.items():
|
|
159
|
+
proportion = len(group) / total
|
|
160
|
+
pattern_sample_size = max(1, int(sample_size * proportion))
|
|
161
|
+
pattern_samples = random.sample(group, min(pattern_sample_size, len(group)))
|
|
162
|
+
samples.extend(pattern_samples)
|
|
163
|
+
|
|
164
|
+
# If we have fewer than sample_size, add random extras
|
|
165
|
+
if len(samples) < sample_size and len(samples) < len(results):
|
|
166
|
+
remaining = [r for r in results if r not in samples]
|
|
167
|
+
extra_needed = min(sample_size - len(samples), len(remaining))
|
|
168
|
+
samples.extend(random.sample(remaining, extra_needed))
|
|
169
|
+
|
|
170
|
+
return samples[:sample_size]
|
|
171
|
+
|
|
172
|
+
def _stratified_sample_by_repo(
|
|
173
|
+
self, results: List[Dict], commit_lookup: Dict, sample_size: int
|
|
174
|
+
) -> List[Dict]:
|
|
175
|
+
"""Stratified sampling ensuring each repository is represented."""
|
|
176
|
+
# Group by repository
|
|
177
|
+
repo_groups = defaultdict(list)
|
|
178
|
+
for result in results:
|
|
179
|
+
commit_sha = result.get("commit_sha")
|
|
180
|
+
commit = commit_lookup.get(commit_sha, {})
|
|
181
|
+
repo = commit.get("repository", result.get("repository", "unknown"))
|
|
182
|
+
repo_groups[repo].append(result)
|
|
183
|
+
|
|
184
|
+
# Sample proportionally from each repo
|
|
185
|
+
samples = []
|
|
186
|
+
total = len(results)
|
|
187
|
+
|
|
188
|
+
for repo, group in repo_groups.items():
|
|
189
|
+
proportion = len(group) / total
|
|
190
|
+
repo_sample_size = max(1, int(sample_size * proportion))
|
|
191
|
+
repo_samples = random.sample(group, min(repo_sample_size, len(group)))
|
|
192
|
+
samples.extend(repo_samples)
|
|
193
|
+
|
|
194
|
+
return samples[:sample_size]
|
|
195
|
+
|
|
196
|
+
def export_samples_for_review(self, output_path: str) -> None:
|
|
197
|
+
"""
|
|
198
|
+
Export validation samples to JSON for manual review.
|
|
199
|
+
|
|
200
|
+
Args:
|
|
201
|
+
output_path: Path to output JSON file
|
|
202
|
+
"""
|
|
203
|
+
samples_data = []
|
|
204
|
+
for i, sample in enumerate(self.samples, 1):
|
|
205
|
+
samples_data.append(
|
|
206
|
+
{
|
|
207
|
+
"sample_id": i,
|
|
208
|
+
"commit_sha": sample.commit_sha,
|
|
209
|
+
"repository": sample.repository,
|
|
210
|
+
"commit_message": sample.commit_message,
|
|
211
|
+
"detected_patterns": sample.detected_patterns,
|
|
212
|
+
"detection_method": sample.detection_method,
|
|
213
|
+
"code_diff_preview": sample.code_diff[:500] if sample.code_diff else None,
|
|
214
|
+
"validation_status": sample.validation_status,
|
|
215
|
+
"true_label": sample.true_label,
|
|
216
|
+
"reviewer": sample.reviewer,
|
|
217
|
+
"review_notes": sample.review_notes,
|
|
218
|
+
"___INSTRUCTIONS___": "Set true_label to true/false, add reviewer name, add review_notes",
|
|
219
|
+
}
|
|
220
|
+
)
|
|
221
|
+
|
|
222
|
+
with open(output_path, "w") as f:
|
|
223
|
+
json.dump(samples_data, f, indent=2)
|
|
224
|
+
|
|
225
|
+
def import_validated_samples(self, input_path: str) -> None:
|
|
226
|
+
"""
|
|
227
|
+
Import manually validated samples from JSON.
|
|
228
|
+
|
|
229
|
+
Args:
|
|
230
|
+
input_path: Path to JSON file with validated samples
|
|
231
|
+
"""
|
|
232
|
+
with open(input_path, "r") as f:
|
|
233
|
+
samples_data = json.load(f)
|
|
234
|
+
|
|
235
|
+
# Update samples with validation results
|
|
236
|
+
for data in samples_data:
|
|
237
|
+
commit_sha = data["commit_sha"]
|
|
238
|
+
|
|
239
|
+
# Find matching sample
|
|
240
|
+
for sample in self.samples:
|
|
241
|
+
if sample.commit_sha == commit_sha:
|
|
242
|
+
sample.true_label = data.get("true_label")
|
|
243
|
+
sample.reviewer = data.get("reviewer")
|
|
244
|
+
sample.review_notes = data.get("review_notes")
|
|
245
|
+
sample.validation_status = (
|
|
246
|
+
"validated" if sample.true_label is not None else "pending"
|
|
247
|
+
)
|
|
248
|
+
break
|
|
249
|
+
|
|
250
|
+
def calculate_metrics(self) -> ValidationMetrics:
|
|
251
|
+
"""
|
|
252
|
+
Calculate precision, recall, F1, and accuracy.
|
|
253
|
+
|
|
254
|
+
Returns:
|
|
255
|
+
ValidationMetrics object
|
|
256
|
+
"""
|
|
257
|
+
# Count outcomes
|
|
258
|
+
tp = 0 # True positive: detected as green, truly green
|
|
259
|
+
fp = 0 # False positive: detected as green, not green
|
|
260
|
+
tn = 0 # True negative: not detected, truly not green
|
|
261
|
+
fn = 0 # False negative: not detected, but is green
|
|
262
|
+
|
|
263
|
+
for sample in self.samples:
|
|
264
|
+
if sample.true_label is None:
|
|
265
|
+
continue # Skip unvalidated samples
|
|
266
|
+
|
|
267
|
+
detected_as_green = len(sample.detected_patterns) > 0
|
|
268
|
+
truly_green = sample.true_label
|
|
269
|
+
|
|
270
|
+
if detected_as_green and truly_green:
|
|
271
|
+
tp += 1
|
|
272
|
+
elif detected_as_green and not truly_green:
|
|
273
|
+
fp += 1
|
|
274
|
+
elif not detected_as_green and not truly_green:
|
|
275
|
+
tn += 1
|
|
276
|
+
elif not detected_as_green and truly_green:
|
|
277
|
+
fn += 1
|
|
278
|
+
|
|
279
|
+
# Calculate metrics
|
|
280
|
+
total = tp + fp + tn + fn
|
|
281
|
+
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
|
|
282
|
+
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
|
|
283
|
+
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
|
|
284
|
+
accuracy = (tp + tn) / total if total > 0 else 0
|
|
285
|
+
|
|
286
|
+
return ValidationMetrics(
|
|
287
|
+
true_positives=tp,
|
|
288
|
+
false_positives=fp,
|
|
289
|
+
true_negatives=tn,
|
|
290
|
+
false_negatives=fn,
|
|
291
|
+
precision=round(precision, 4),
|
|
292
|
+
recall=round(recall, 4),
|
|
293
|
+
f1_score=round(f1, 4),
|
|
294
|
+
accuracy=round(accuracy, 4),
|
|
295
|
+
)
|
|
296
|
+
|
|
297
|
+
def get_validation_report(self) -> Dict:
|
|
298
|
+
"""
|
|
299
|
+
Generate comprehensive validation report.
|
|
300
|
+
|
|
301
|
+
Returns:
|
|
302
|
+
Dictionary with validation statistics and metrics
|
|
303
|
+
"""
|
|
304
|
+
validated_count = sum(1 for s in self.samples if s.validation_status == "validated")
|
|
305
|
+
pending_count = sum(1 for s in self.samples if s.validation_status == "pending")
|
|
306
|
+
|
|
307
|
+
metrics = self.calculate_metrics() if validated_count > 0 else None
|
|
308
|
+
|
|
309
|
+
# Analyze false positives and false negatives
|
|
310
|
+
false_positives = [
|
|
311
|
+
{
|
|
312
|
+
"commit_sha": s.commit_sha,
|
|
313
|
+
"detected_patterns": s.detected_patterns,
|
|
314
|
+
"review_notes": s.review_notes,
|
|
315
|
+
}
|
|
316
|
+
for s in self.samples
|
|
317
|
+
if s.true_label is not None and len(s.detected_patterns) > 0 and not s.true_label
|
|
318
|
+
]
|
|
319
|
+
|
|
320
|
+
false_negatives = [
|
|
321
|
+
{
|
|
322
|
+
"commit_sha": s.commit_sha,
|
|
323
|
+
"commit_message": s.commit_message[:100],
|
|
324
|
+
"review_notes": s.review_notes,
|
|
325
|
+
}
|
|
326
|
+
for s in self.samples
|
|
327
|
+
if s.true_label is not None and len(s.detected_patterns) == 0 and s.true_label
|
|
328
|
+
]
|
|
329
|
+
|
|
330
|
+
# Pattern accuracy breakdown
|
|
331
|
+
pattern_accuracy = self._analyze_pattern_accuracy()
|
|
332
|
+
|
|
333
|
+
return {
|
|
334
|
+
"sampling": {
|
|
335
|
+
"total_samples": len(self.samples),
|
|
336
|
+
"validated_samples": validated_count,
|
|
337
|
+
"pending_samples": pending_count,
|
|
338
|
+
"validation_progress": (
|
|
339
|
+
round(validated_count / len(self.samples) * 100, 1) if self.samples else 0
|
|
340
|
+
),
|
|
341
|
+
"stratification_method": self.stratify_by,
|
|
342
|
+
},
|
|
343
|
+
"metrics": {
|
|
344
|
+
"precision": metrics.precision if metrics else None,
|
|
345
|
+
"recall": metrics.recall if metrics else None,
|
|
346
|
+
"f1_score": metrics.f1_score if metrics else None,
|
|
347
|
+
"accuracy": metrics.accuracy if metrics else None,
|
|
348
|
+
"true_positives": metrics.true_positives if metrics else None,
|
|
349
|
+
"false_positives": metrics.false_positives if metrics else None,
|
|
350
|
+
"true_negatives": metrics.true_negatives if metrics else None,
|
|
351
|
+
"false_negatives": metrics.false_negatives if metrics else None,
|
|
352
|
+
},
|
|
353
|
+
"error_analysis": {
|
|
354
|
+
"false_positive_count": len(false_positives),
|
|
355
|
+
"false_negative_count": len(false_negatives),
|
|
356
|
+
"false_positives": false_positives[:5], # Top 5
|
|
357
|
+
"false_negatives": false_negatives[:5], # Top 5
|
|
358
|
+
},
|
|
359
|
+
"pattern_accuracy": pattern_accuracy,
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
def _analyze_pattern_accuracy(self) -> Dict:
|
|
363
|
+
"""Analyze accuracy per pattern category."""
|
|
364
|
+
pattern_stats = defaultdict(lambda: {"tp": 0, "fp": 0})
|
|
365
|
+
|
|
366
|
+
for sample in self.samples:
|
|
367
|
+
if sample.true_label is None:
|
|
368
|
+
continue
|
|
369
|
+
|
|
370
|
+
for pattern in sample.detected_patterns:
|
|
371
|
+
if sample.true_label:
|
|
372
|
+
pattern_stats[pattern]["tp"] += 1
|
|
373
|
+
else:
|
|
374
|
+
pattern_stats[pattern]["fp"] += 1
|
|
375
|
+
|
|
376
|
+
# Calculate precision per pattern
|
|
377
|
+
pattern_accuracy = {}
|
|
378
|
+
for pattern, stats in pattern_stats.items():
|
|
379
|
+
total = stats["tp"] + stats["fp"]
|
|
380
|
+
precision = stats["tp"] / total if total > 0 else 0
|
|
381
|
+
pattern_accuracy[pattern] = {
|
|
382
|
+
"true_positives": stats["tp"],
|
|
383
|
+
"false_positives": stats["fp"],
|
|
384
|
+
"precision": round(precision, 4),
|
|
385
|
+
}
|
|
386
|
+
|
|
387
|
+
return pattern_accuracy
|
|
388
|
+
|
|
389
|
+
def get_inter_rater_reliability(
|
|
390
|
+
self,
|
|
391
|
+
samples_from_reviewer_a: List[ValidationSample],
|
|
392
|
+
samples_from_reviewer_b: List[ValidationSample],
|
|
393
|
+
) -> Dict:
|
|
394
|
+
"""
|
|
395
|
+
Calculate inter-rater reliability (Cohen's Kappa).
|
|
396
|
+
|
|
397
|
+
Args:
|
|
398
|
+
samples_from_reviewer_a: Samples validated by reviewer A
|
|
399
|
+
samples_from_reviewer_b: Samples validated by reviewer B (same commits)
|
|
400
|
+
|
|
401
|
+
Returns:
|
|
402
|
+
Dictionary with Cohen's Kappa and agreement statistics
|
|
403
|
+
"""
|
|
404
|
+
# Match samples by commit_sha
|
|
405
|
+
matched_samples = []
|
|
406
|
+
for sample_a in samples_from_reviewer_a:
|
|
407
|
+
for sample_b in samples_from_reviewer_b:
|
|
408
|
+
if sample_a.commit_sha == sample_b.commit_sha:
|
|
409
|
+
matched_samples.append((sample_a, sample_b))
|
|
410
|
+
break
|
|
411
|
+
|
|
412
|
+
if not matched_samples:
|
|
413
|
+
return {"error": "No matching samples between reviewers"}
|
|
414
|
+
|
|
415
|
+
# Calculate agreement
|
|
416
|
+
agreements = 0
|
|
417
|
+
for sample_a, sample_b in matched_samples:
|
|
418
|
+
if sample_a.true_label == sample_b.true_label:
|
|
419
|
+
agreements += 1
|
|
420
|
+
|
|
421
|
+
observed_agreement = agreements / len(matched_samples)
|
|
422
|
+
|
|
423
|
+
# Calculate expected agreement (by chance)
|
|
424
|
+
a_positive = sum(1 for s, _ in matched_samples if s.true_label)
|
|
425
|
+
b_positive = sum(1 for _, s in matched_samples if s.true_label)
|
|
426
|
+
n = len(matched_samples)
|
|
427
|
+
|
|
428
|
+
p_a_yes = a_positive / n
|
|
429
|
+
p_b_yes = b_positive / n
|
|
430
|
+
expected_agreement = (p_a_yes * p_b_yes) + ((1 - p_a_yes) * (1 - p_b_yes))
|
|
431
|
+
|
|
432
|
+
# Cohen's Kappa
|
|
433
|
+
kappa = (
|
|
434
|
+
(observed_agreement - expected_agreement) / (1 - expected_agreement)
|
|
435
|
+
if expected_agreement < 1
|
|
436
|
+
else 1
|
|
437
|
+
)
|
|
438
|
+
|
|
439
|
+
return {
|
|
440
|
+
"cohens_kappa": round(kappa, 4),
|
|
441
|
+
"observed_agreement": round(observed_agreement, 4),
|
|
442
|
+
"expected_agreement": round(expected_agreement, 4),
|
|
443
|
+
"sample_count": n,
|
|
444
|
+
"interpretation": self._interpret_kappa(kappa),
|
|
445
|
+
}
|
|
446
|
+
|
|
447
|
+
def _interpret_kappa(self, kappa: float) -> str:
|
|
448
|
+
"""Interpret Cohen's Kappa value."""
|
|
449
|
+
if kappa < 0:
|
|
450
|
+
return "Poor (less than chance)"
|
|
451
|
+
elif kappa < 0.20:
|
|
452
|
+
return "Slight"
|
|
453
|
+
elif kappa < 0.40:
|
|
454
|
+
return "Fair"
|
|
455
|
+
elif kappa < 0.60:
|
|
456
|
+
return "Moderate"
|
|
457
|
+
elif kappa < 0.80:
|
|
458
|
+
return "Substantial"
|
|
459
|
+
else:
|
|
460
|
+
return "Almost perfect"
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
"""Enhanced statistical analyzer for green software patterns."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Dict, List
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
from scipy import stats
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class EnhancedStatisticalAnalyzer:
|
|
13
|
+
"""
|
|
14
|
+
Advanced statistical analyses for green software patterns.
|
|
15
|
+
Based on Soliman et al. quantitative validation techniques.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
def analyze_pattern_correlations(self, commit_data: pd.DataFrame) -> Dict[str, Any]:
|
|
19
|
+
"""
|
|
20
|
+
Analyze correlations between patterns.
|
|
21
|
+
|
|
22
|
+
Question: Do repositories that adopt caching also adopt resource limits?
|
|
23
|
+
|
|
24
|
+
Args:
|
|
25
|
+
commit_data: DataFrame with pattern columns
|
|
26
|
+
|
|
27
|
+
Returns:
|
|
28
|
+
Dictionary containing correlation matrix and significant pairs
|
|
29
|
+
"""
|
|
30
|
+
# Create pattern co-occurrence matrix
|
|
31
|
+
pattern_columns = [col for col in commit_data.columns if col.startswith("pattern_")]
|
|
32
|
+
|
|
33
|
+
if not pattern_columns:
|
|
34
|
+
return {
|
|
35
|
+
"correlation_matrix": {},
|
|
36
|
+
"significant_pairs": [],
|
|
37
|
+
"interpretation": "No pattern columns found",
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
correlation_matrix = commit_data[pattern_columns].corr(method="pearson")
|
|
41
|
+
|
|
42
|
+
# Identify significant correlations
|
|
43
|
+
significant_pairs = []
|
|
44
|
+
for i, pattern1 in enumerate(pattern_columns):
|
|
45
|
+
for j, pattern2 in enumerate(pattern_columns[i + 1 :], start=i + 1):
|
|
46
|
+
corr_value = correlation_matrix.iloc[i, j]
|
|
47
|
+
if abs(corr_value) > 0.5: # Strong correlation threshold
|
|
48
|
+
significant_pairs.append(
|
|
49
|
+
{
|
|
50
|
+
"pattern1": pattern1,
|
|
51
|
+
"pattern2": pattern2,
|
|
52
|
+
"correlation": corr_value,
|
|
53
|
+
"strength": "strong" if abs(corr_value) > 0.7 else "moderate",
|
|
54
|
+
}
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
return {
|
|
58
|
+
"correlation_matrix": correlation_matrix.to_dict(),
|
|
59
|
+
"significant_pairs": significant_pairs,
|
|
60
|
+
"interpretation": self._interpret_correlations(significant_pairs),
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
def temporal_trend_analysis(self, commits_df: pd.DataFrame) -> Dict[str, Any]:
|
|
64
|
+
"""
|
|
65
|
+
Analyze temporal trends in green awareness.
|
|
66
|
+
|
|
67
|
+
Techniques:
|
|
68
|
+
- Mann-Kendall trend test (monotonic trend detection)
|
|
69
|
+
- Seasonal decomposition (identify cyclical patterns)
|
|
70
|
+
- Change point detection (identify sudden shifts)
|
|
71
|
+
|
|
72
|
+
Args:
|
|
73
|
+
commits_df: DataFrame with date and green_aware columns
|
|
74
|
+
|
|
75
|
+
Returns:
|
|
76
|
+
Dictionary containing trend analysis results
|
|
77
|
+
"""
|
|
78
|
+
# Prepare time series data
|
|
79
|
+
commits_df["date"] = pd.to_datetime(commits_df["date"])
|
|
80
|
+
commits_df = commits_df.sort_values("date")
|
|
81
|
+
|
|
82
|
+
# Monthly aggregation
|
|
83
|
+
monthly = (
|
|
84
|
+
commits_df.set_index("date")
|
|
85
|
+
.resample("ME")
|
|
86
|
+
.agg({"green_aware": "sum", "commit_hash": "count"})
|
|
87
|
+
)
|
|
88
|
+
monthly.columns = ["green_aware", "total_commits"]
|
|
89
|
+
monthly["green_rate"] = monthly["green_aware"] / monthly["total_commits"]
|
|
90
|
+
|
|
91
|
+
# Mann-Kendall trend test
|
|
92
|
+
mk_result = stats.kendalltau(range(len(monthly)), monthly["green_rate"])
|
|
93
|
+
trend_direction = "increasing" if mk_result.correlation > 0 else "decreasing"
|
|
94
|
+
trend_significant = bool(mk_result.pvalue < 0.05)
|
|
95
|
+
|
|
96
|
+
# Seasonal decomposition (requires at least 2 years of data)
|
|
97
|
+
seasonal_pattern = None
|
|
98
|
+
if len(monthly) >= 24:
|
|
99
|
+
try:
|
|
100
|
+
from statsmodels.tsa.seasonal import seasonal_decompose
|
|
101
|
+
|
|
102
|
+
decomposition = seasonal_decompose(
|
|
103
|
+
monthly["green_rate"], model="additive", period=12
|
|
104
|
+
)
|
|
105
|
+
seasonal_pattern = decomposition.seasonal.to_dict()
|
|
106
|
+
except Exception:
|
|
107
|
+
seasonal_pattern = None
|
|
108
|
+
|
|
109
|
+
# Change point detection (simple method: rolling window variance)
|
|
110
|
+
window_size = 3
|
|
111
|
+
monthly["rolling_var"] = monthly["green_rate"].rolling(window=window_size).var()
|
|
112
|
+
change_points = monthly[
|
|
113
|
+
monthly["rolling_var"]
|
|
114
|
+
> monthly["rolling_var"].mean() + 2 * monthly["rolling_var"].std()
|
|
115
|
+
]
|
|
116
|
+
|
|
117
|
+
return {
|
|
118
|
+
"trend": {
|
|
119
|
+
"direction": trend_direction,
|
|
120
|
+
"significant": trend_significant,
|
|
121
|
+
"correlation": mk_result.correlation,
|
|
122
|
+
"p_value": mk_result.pvalue,
|
|
123
|
+
},
|
|
124
|
+
"seasonal_pattern": seasonal_pattern,
|
|
125
|
+
"change_points": change_points.index.tolist() if not change_points.empty else [],
|
|
126
|
+
"monthly_data": monthly.to_dict(),
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
def effect_size_analysis(self, group1: List[float], group2: List[float]) -> Dict[str, Any]:
|
|
130
|
+
"""
|
|
131
|
+
Calculate effect size between two groups.
|
|
132
|
+
|
|
133
|
+
Use case: Compare green awareness between:
|
|
134
|
+
- Different programming languages
|
|
135
|
+
- Different time periods
|
|
136
|
+
- Different repository sizes
|
|
137
|
+
|
|
138
|
+
Args:
|
|
139
|
+
group1: First group values
|
|
140
|
+
group2: Second group values
|
|
141
|
+
|
|
142
|
+
Returns:
|
|
143
|
+
Dictionary containing effect size metrics
|
|
144
|
+
"""
|
|
145
|
+
# Cohen's d (effect size)
|
|
146
|
+
mean1, mean2 = np.mean(group1), np.mean(group2)
|
|
147
|
+
std1, std2 = np.std(group1, ddof=1), np.std(group2, ddof=1)
|
|
148
|
+
pooled_std = np.sqrt((std1**2 + std2**2) / 2)
|
|
149
|
+
|
|
150
|
+
if pooled_std == 0:
|
|
151
|
+
cohens_d = 0
|
|
152
|
+
else:
|
|
153
|
+
cohens_d = (mean1 - mean2) / pooled_std
|
|
154
|
+
|
|
155
|
+
# Interpretation
|
|
156
|
+
if abs(cohens_d) < 0.2:
|
|
157
|
+
magnitude = "negligible"
|
|
158
|
+
elif abs(cohens_d) < 0.5:
|
|
159
|
+
magnitude = "small"
|
|
160
|
+
elif abs(cohens_d) < 0.8:
|
|
161
|
+
magnitude = "medium"
|
|
162
|
+
else:
|
|
163
|
+
magnitude = "large"
|
|
164
|
+
|
|
165
|
+
# Statistical significance
|
|
166
|
+
t_stat, p_value = stats.ttest_ind(group1, group2)
|
|
167
|
+
|
|
168
|
+
return {
|
|
169
|
+
"cohens_d": cohens_d,
|
|
170
|
+
"magnitude": magnitude,
|
|
171
|
+
"mean_difference": mean1 - mean2,
|
|
172
|
+
"t_statistic": t_stat,
|
|
173
|
+
"p_value": p_value,
|
|
174
|
+
"significant": bool(p_value < 0.05),
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
def pattern_adoption_rate_analysis(self, commits_df: pd.DataFrame) -> Dict[str, Any]:
|
|
178
|
+
"""
|
|
179
|
+
Analyze pattern adoption rates over repository lifetime.
|
|
180
|
+
|
|
181
|
+
Metrics:
|
|
182
|
+
- Time to first adoption (TTFA)
|
|
183
|
+
- Adoption acceleration
|
|
184
|
+
- Pattern stickiness (continued use after adoption)
|
|
185
|
+
|
|
186
|
+
Args:
|
|
187
|
+
commits_df: DataFrame with pattern and date columns
|
|
188
|
+
|
|
189
|
+
Returns:
|
|
190
|
+
Dictionary mapping patterns to adoption metrics
|
|
191
|
+
"""
|
|
192
|
+
results = {}
|
|
193
|
+
|
|
194
|
+
for pattern in commits_df["pattern"].unique():
|
|
195
|
+
pattern_commits = commits_df[commits_df["pattern"] == pattern].sort_values("date")
|
|
196
|
+
|
|
197
|
+
if len(pattern_commits) == 0:
|
|
198
|
+
continue
|
|
199
|
+
|
|
200
|
+
# Time to first adoption
|
|
201
|
+
first_adoption = pattern_commits.iloc[0]["date"]
|
|
202
|
+
repo_start = commits_df["date"].min()
|
|
203
|
+
ttfa_days = (first_adoption - repo_start).days
|
|
204
|
+
|
|
205
|
+
# Adoption frequency over time
|
|
206
|
+
monthly_adoption = pattern_commits.set_index("date").resample("ME").size()
|
|
207
|
+
|
|
208
|
+
# Pattern stickiness (months with at least one adoption)
|
|
209
|
+
total_months = len(commits_df.set_index("date").resample("ME").size())
|
|
210
|
+
active_months = len(monthly_adoption[monthly_adoption > 0])
|
|
211
|
+
stickiness = active_months / total_months if total_months > 0 else 0
|
|
212
|
+
|
|
213
|
+
results[pattern] = {
|
|
214
|
+
"ttfa_days": ttfa_days,
|
|
215
|
+
"total_adoptions": len(pattern_commits),
|
|
216
|
+
"stickiness": stickiness,
|
|
217
|
+
"monthly_adoption_rate": monthly_adoption.mean(),
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
return results
|
|
221
|
+
|
|
222
|
+
def _interpret_correlations(self, significant_pairs: List[Dict[str, Any]]) -> str:
|
|
223
|
+
"""
|
|
224
|
+
Generate interpretation of correlation results.
|
|
225
|
+
|
|
226
|
+
Args:
|
|
227
|
+
significant_pairs: List of significant correlation pairs
|
|
228
|
+
|
|
229
|
+
Returns:
|
|
230
|
+
Interpretation string
|
|
231
|
+
"""
|
|
232
|
+
if not significant_pairs:
|
|
233
|
+
return "No significant correlations found between patterns."
|
|
234
|
+
|
|
235
|
+
interpretations = []
|
|
236
|
+
for pair in significant_pairs[:5]: # Top 5
|
|
237
|
+
p1 = pair["pattern1"].replace("pattern_", "")
|
|
238
|
+
p2 = pair["pattern2"].replace("pattern_", "")
|
|
239
|
+
corr = pair["correlation"]
|
|
240
|
+
if corr > 0:
|
|
241
|
+
interpretations.append(f"{p1} and {p2} tend to be adopted together (r={corr:.2f})")
|
|
242
|
+
else:
|
|
243
|
+
interpretations.append(f"{p1} and {p2} rarely co-occur (r={corr:.2f})")
|
|
244
|
+
|
|
245
|
+
return "; ".join(interpretations)
|