greenmining 1.1.9-py3-none-any.whl → 1.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- greenmining-1.1.9.dist-info/RECORD
+++ greenmining-1.2.1.dist-info/RECORD
@@ -1,18 +1,14 @@
-greenmining/__init__.py,sha256=tQoU8k_I_ymm67bNlYo0laMgS_eP9ersn5NAkdRjRvc,3390
+greenmining/__init__.py,sha256=W-CMUfo3U7pWBGAKtwT5XwjcMxbKjF5c42ViFFsOBS4,4096
 greenmining/__main__.py,sha256=NYOVS7D4w2XDLn6SyXHXPKE5GrNGOeoWSTb_KazgK5c,590
-greenmining/config.py,sha256=MQ5aPaa_Y9MZke774dmibz2-XSqRVsQiiNaLDr8f7S0,2771
 greenmining/gsf_patterns.py,sha256=UvNJPY3HlAx1SicwUqci40TlLg8lCL0tszSOH4haxQs,55921
 greenmining/utils.py,sha256=-dnLUw9taCzvQ2dk6uc66GAohOFiXJFKs9TLSEPk5kM,2893
-greenmining/analyzers/__init__.py,sha256=rTgpDfFE6za4QAHW59ncnS6TW02omn-TZMnYNVUIZp0,753
+greenmining/analyzers/__init__.py,sha256=wnBrn8EyAHG_qnesOPAYkZyc-XigXWy2pI3bMeIoLH4,416
 greenmining/analyzers/code_diff_analyzer.py,sha256=1dk68R3O0RZG8gx1cm9B_UlZ1Uwyb_Q3oScRbCVx4tM,10950
 greenmining/analyzers/metrics_power_correlator.py,sha256=MgKXAIYjNihzzyilCd88_AMjZP9sdC6NkCAVbrvvOus,5957
-greenmining/analyzers/power_regression.py,sha256=j_SL8BHQi89zkjjKPPcjsPrvfDAeGpLeZujQiNw_RKI,7375
-greenmining/analyzers/qualitative_analyzer.py,sha256=5LiqP2It3q6_RLiLGkyGRZaRxg00dcyTPvlN5l-wq_k,15379
 greenmining/analyzers/statistical_analyzer.py,sha256=PA0w0sytRmMO6N1a2iH7VdA6Icg4DcyBLFXOGq7PepY,5942
 greenmining/analyzers/temporal_analyzer.py,sha256=JfTcAoI20oCFMehGrSRnDqhJTXI-RUbdCTMwDOTW9-g,14259
-greenmining/analyzers/version_power_analyzer.py,sha256=2P6zOqBg-ButtIhF-4cutiwD2Q1geMY49VFUghHXXoI,8119
 greenmining/controllers/__init__.py,sha256=UiAT6zBvC1z_9cJWfzq1cLA0I4r9b2vURHipj8oDczI,180
-greenmining/controllers/repository_controller.py,sha256=ZRMU6oUWUELW91qcZ_iBiRcDNT8ruTIy2aRjQbOb0O0,6571
+greenmining/controllers/repository_controller.py,sha256=sjfbDhyRY59MsKLw0dkxzpe1QZKtm9ScO4E8VFYZy9A,6041
 greenmining/energy/__init__.py,sha256=GoCYh7hitWBoPMtan1HF1yezCHi7o4sa_YUJgGkeJc8,558
 greenmining/energy/base.py,sha256=3hIPgc4B0Nz9V7DTh2Xd6trDRtmozUBBpa5UWRuWzcw,5918
 greenmining/energy/carbon_reporter.py,sha256=bKIFlLhHfYzI4DBu_ff4GW1Psz4oSCAF4NmzQb-EShA,8298
@@ -24,17 +20,15 @@ greenmining/models/aggregated_stats.py,sha256=CZxjwXswvtmYPwpcbodLUsZpsbsNKBDIqv
 greenmining/models/analysis_result.py,sha256=YICTCEcrJxZ1R8Xaio3AZOjCGwMzC_62BMAL0J_XY1w,1509
 greenmining/models/commit.py,sha256=LCwDcRu4-BeCJQdk590oQNZZZM9t8W9FlaHlo9DCVmc,2415
 greenmining/models/repository.py,sha256=MUeCOtVMOsU4Oa_BBoB163Ij5BKytTKwbzoGORJx4rU,2850
-greenmining/presenters/__init__.py,sha256=d1CMtqtUAHYHYNzigPyjtGOUtnH1drtUwf7-bFQq2B8,138
-greenmining/presenters/console_presenter.py,sha256=qagn2c2aOym0WNKV8n175MQ-BTheLjrXzW8c1OafzAQ,4904
 greenmining/services/__init__.py,sha256=ZEMOVut0KRdume_vz58beSNps3YgeoGBXmUjEqNgIhc,690
 greenmining/services/commit_extractor.py,sha256=qBM9QpGzPZRmGMFufJ6gP8eWIuufTowLX8mQxqZwyEU,6996
 greenmining/services/data_aggregator.py,sha256=BU_HUb-8c0n0sa_7VZRB8jIVnaVhRLf-E6KA4ASh-08,19427
 greenmining/services/data_analyzer.py,sha256=0XqW-slrnt7RotrHDweOqKtoN8XIA7y6p7s2Jau6cMg,7431
 greenmining/services/github_graphql_fetcher.py,sha256=ZklXdEAc60KeFL83zRYMwW_-2OwMKpfPY7Wrifl0D50,11539
-greenmining/services/local_repo_analyzer.py,sha256=PYHj-zz0cePWbQq9HtGvd2OcZUYM8rRGe8eKIAp1_fI,24874
+greenmining/services/local_repo_analyzer.py,sha256=kdqN9O7GlmOs1_H1iNni1HEcCzF9bBCPeP3wMStGe5Q,25306
 greenmining/services/reports.py,sha256=nhJuYiA5tPD_9AjtgSLEnrpW3x15sZXrwIxpxQEBbh0,23219
-greenmining-1.1.9.dist-info/licenses/LICENSE,sha256=M7ma3JHGeiIZIs3ea0HTcFl_wLFPX2NZElUliYs4bCA,1083
-greenmining-1.1.9.dist-info/METADATA,sha256=QVrwbhh-huQmP8Uv0YQDgraGOFoTmFRNxq7T27bZ4wk,30175
-greenmining-1.1.9.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-greenmining-1.1.9.dist-info/top_level.txt,sha256=nreXgXxZIWI-42yQknQ0HXtUrFnzZ8N1ra4Mdy2KcsI,12
-greenmining-1.1.9.dist-info/RECORD,,
+greenmining-1.2.1.dist-info/licenses/LICENSE,sha256=M7ma3JHGeiIZIs3ea0HTcFl_wLFPX2NZElUliYs4bCA,1083
+greenmining-1.2.1.dist-info/METADATA,sha256=ZmLnx5P5jN8XpRGgfx_gVzykWvUZdsHrTEpg3Plh9-A,10522
+greenmining-1.2.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+greenmining-1.2.1.dist-info/top_level.txt,sha256=nreXgXxZIWI-42yQknQ0HXtUrFnzZ8N1ra4Mdy2KcsI,12
+greenmining-1.2.1.dist-info/RECORD,,
--- greenmining/analyzers/power_regression.py
+++ /dev/null
@@ -1,211 +0,0 @@
-# Power regression detection for identifying commits that increased power consumption.
-# Compares energy measurements between baseline and target commits.
-
-from __future__ import annotations
-
-import subprocess
-from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
-
-from pydriller import Repository
-
-from greenmining.utils import colored_print
-
-
-@dataclass
-class PowerRegression:
-    # A detected power regression from a commit.
-
-    sha: str
-    message: str
-    author: str
-    date: str
-    power_before: float  # watts
-    power_after: float  # watts
-    power_increase: float  # percentage
-    energy_before: float  # joules
-    energy_after: float  # joules
-    is_regression: bool = True
-
-    def to_dict(self) -> Dict[str, Any]:
-        return {
-            "sha": self.sha,
-            "message": self.message,
-            "author": self.author,
-            "date": self.date,
-            "power_before": round(self.power_before, 4),
-            "power_after": round(self.power_after, 4),
-            "power_increase": round(self.power_increase, 2),
-            "energy_before": round(self.energy_before, 4),
-            "energy_after": round(self.energy_after, 4),
-            "is_regression": self.is_regression,
-        }
-
-
-class PowerRegressionDetector:
-    # Detect commits that caused power consumption regressions.
-    # Runs a test command at each commit and measures energy usage.
-
-    def __init__(
-        self,
-        test_command: str = "pytest tests/ -x",
-        energy_backend: str = "rapl",
-        threshold_percent: float = 5.0,
-        iterations: int = 5,
-        warmup_iterations: int = 1,
-    ):
-        # Initialize power regression detector.
-        # Args:
-        #   test_command: Shell command to run for energy measurement
-        #   energy_backend: Energy measurement backend (rapl, codecarbon, cpu_meter)
-        #   threshold_percent: Minimum percentage increase to flag as regression
-        #   iterations: Number of measurement iterations per commit (for accuracy)
-        #   warmup_iterations: Number of warmup runs before measurement
-        self.test_command = test_command
-        self.energy_backend = energy_backend
-        self.threshold_percent = threshold_percent
-        self.iterations = iterations
-        self.warmup_iterations = warmup_iterations
-        self._meter = None
-
-    def _get_energy_meter(self):
-        # Get energy meter instance.
-        if self._meter is None:
-            from greenmining.energy.base import get_energy_meter
-
-            self._meter = get_energy_meter(self.energy_backend)
-        return self._meter
-
-    def _run_test_command(self, cwd: str) -> float:
-        # Run test command and return energy consumed in joules.
-        meter = self._get_energy_meter()
-
-        # Warmup
-        for _ in range(self.warmup_iterations):
-            subprocess.run(
-                self.test_command,
-                shell=True,
-                cwd=cwd,
-                capture_output=True,
-                text=True,
-                timeout=300,
-            )
-
-        # Measure
-        total_joules = 0.0
-        for _ in range(self.iterations):
-            meter.start()
-            subprocess.run(
-                self.test_command,
-                shell=True,
-                cwd=cwd,
-                capture_output=True,
-                text=True,
-                timeout=300,
-            )
-            metrics = meter.stop()
-            total_joules += metrics.joules
-
-        return total_joules / self.iterations
-
-    def detect(
-        self,
-        repo_path: str,
-        baseline_commit: str = "HEAD~10",
-        target_commit: str = "HEAD",
-        max_commits: int = 50,
-    ) -> List[PowerRegression]:
-        # Detect power regressions between baseline and target commits.
-        # Args:
-        #   repo_path: Path to local git repository
-        #   baseline_commit: Baseline commit SHA or reference
-        #   target_commit: Target commit SHA or reference
-        #   max_commits: Maximum commits to analyze
-        regressions = []
-
-        colored_print(f"Detecting power regressions in {repo_path}", "cyan")
-        colored_print(f" Range: {baseline_commit}..{target_commit}", "cyan")
-        colored_print(f" Test: {self.test_command}", "cyan")
-        colored_print(f" Threshold: {self.threshold_percent}%", "cyan")
-
-        # Get commits in range
-        commits = list(
-            Repository(
-                path_to_repo=repo_path,
-                from_commit=baseline_commit,
-                to_commit=target_commit,
-            ).traverse_commits()
-        )
-
-        if not commits:
-            colored_print("No commits found in range", "yellow")
-            return regressions
-
-        # Measure baseline
-        colored_print(f" Measuring baseline ({commits[0].hash[:8]})...", "cyan")
-        self._checkout(repo_path, commits[0].hash)
-        baseline_energy = self._run_test_command(repo_path)
-        colored_print(f" Baseline: {baseline_energy:.4f} joules", "green")
-
-        previous_energy = baseline_energy
-        commit_count = 0
-
-        for commit in commits[1:]:
-            if commit_count >= max_commits:
-                break
-
-            try:
-                self._checkout(repo_path, commit.hash)
-                current_energy = self._run_test_command(repo_path)
-
-                # Calculate change
-                if previous_energy > 0:
-                    change_percent = ((current_energy - previous_energy) / previous_energy) * 100
-                else:
-                    change_percent = 0.0
-
-                # Check for regression
-                if change_percent > self.threshold_percent:
-                    regression = PowerRegression(
-                        sha=commit.hash,
-                        message=commit.msg[:200],
-                        author=commit.author.name,
-                        date=commit.author_date.isoformat() if commit.author_date else "",
-                        power_before=previous_energy / max(1, self.iterations),
-                        power_after=current_energy / max(1, self.iterations),
-                        power_increase=change_percent,
-                        energy_before=previous_energy,
-                        energy_after=current_energy,
-                    )
-                    regressions.append(regression)
-                    colored_print(f" REGRESSION: {commit.hash[:8]} +{change_percent:.1f}%", "red")
-                else:
-                    colored_print(f" OK: {commit.hash[:8]} {change_percent:+.1f}%", "green")
-
-                previous_energy = current_energy
-                commit_count += 1
-
-            except Exception as e:
-                colored_print(f" Warning: Failed on {commit.hash[:8]}: {e}", "yellow")
-                continue
-
-        # Restore to target
-        self._checkout(repo_path, target_commit)
-
-        colored_print(
-            f"\nFound {len(regressions)} power regressions "
-            f"(>{self.threshold_percent}% increase)",
-            "cyan" if not regressions else "red",
-        )
-
-        return regressions
-
-    @staticmethod
-    def _checkout(repo_path: str, ref: str):
-        # Checkout a specific commit.
-        subprocess.run(
-            ["git", "checkout", ref, "--quiet"],
-            cwd=repo_path,
-            capture_output=True,
-            text=True,
-        )
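
The power_regression module removed above was self-contained: it checked out each commit in a range, ran a test command under an energy meter, and flagged commits whose mean energy use rose past a threshold. For readers tracking this removal, here is a minimal usage sketch of the deleted 1.1.9 API; the class, arguments, and import path are taken from the source and RECORD entries above, while the repository path and printing are illustrative placeholders.

# Hypothetical usage of the PowerRegressionDetector deleted above (1.1.9 only).
from greenmining.analyzers.power_regression import PowerRegressionDetector

detector = PowerRegressionDetector(
    test_command="pytest tests/ -x",  # shell command whose energy use is measured
    energy_backend="rapl",            # or "codecarbon" / "cpu_meter"
    threshold_percent=5.0,            # flag commits raising energy use by more than 5%
    iterations=5,                     # measurement runs averaged per commit
)
# detect() runs `git checkout` in the working tree, so use a scratch clone.
regressions = detector.detect(
    "/path/to/scratch-clone",         # placeholder path
    baseline_commit="HEAD~10",
    target_commit="HEAD",
)
for reg in regressions:
    print(reg.to_dict())
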
--- greenmining/analyzers/qualitative_analyzer.py
+++ /dev/null
@@ -1,394 +0,0 @@
-# Qualitative Analysis Framework for Pattern Validation
-
-from __future__ import annotations
-
-import random
-from typing import Dict, List, Optional
-from dataclasses import dataclass
-from collections import defaultdict
-import json
-
-
-@dataclass
-class ValidationSample:
-    # Represents a single validation sample
-
-    commit_sha: str
-    commit_message: str
-    code_diff: Optional[str]
-    repository: str
-    detected_patterns: List[str]
-    detection_method: str  # 'keyword', 'nlp', 'code_diff'
-    validation_status: Optional[str] = None  # 'pending', 'validated', 'rejected'
-    true_label: Optional[bool] = None  # Ground truth after manual review
-    reviewer: Optional[str] = None
-    review_notes: Optional[str] = None
-
-
-@dataclass
-class ValidationMetrics:
-    # Precision/recall metrics for validation
-
-    true_positives: int
-    false_positives: int
-    true_negatives: int
-    false_negatives: int
-    precision: float
-    recall: float
-    f1_score: float
-    accuracy: float
-
-
-class QualitativeAnalyzer:
-    # Framework for manual validation and qualitative analysis.
-
-    def __init__(self, sample_size: int = 30, stratify_by: str = "pattern"):
-        # Initialize qualitative analyzer.
-        self.sample_size = sample_size
-        self.stratify_by = stratify_by
-        self.samples: List[ValidationSample] = []
-
-    def generate_validation_samples(
-        self, commits: List[Dict], analysis_results: List[Dict], include_negatives: bool = True
-    ) -> List[ValidationSample]:
-        # Generate stratified validation samples.
-        # Build commit lookup
-        commit_lookup = {c.get("hash", c.get("sha")): c for c in commits}
-
-        # Separate positives (detected as green) and negatives
-        positives = [r for r in analysis_results if r.get("is_green_aware", False)]
-        negatives = [r for r in analysis_results if not r.get("is_green_aware", False)]
-
-        samples = []
-
-        # Calculate sample distribution
-        if include_negatives:
-            # 80% positives, 20% negatives (to check false negatives)
-            pos_sample_size = int(self.sample_size * 0.8)
-            neg_sample_size = self.sample_size - pos_sample_size
-        else:
-            pos_sample_size = self.sample_size
-            neg_sample_size = 0
-
-        # Sample positives (stratified by pattern or repository)
-        if self.stratify_by == "pattern":
-            pos_samples = self._stratified_sample_by_pattern(positives, pos_sample_size)
-        elif self.stratify_by == "repository":
-            pos_samples = self._stratified_sample_by_repo(positives, commit_lookup, pos_sample_size)
-        else:
-            pos_samples = random.sample(positives, min(pos_sample_size, len(positives)))
-
-        # Sample negatives (random)
-        if include_negatives and negatives:
-            neg_samples = random.sample(negatives, min(neg_sample_size, len(negatives)))
-        else:
-            neg_samples = []
-
-        # Create ValidationSample objects
-        for result in pos_samples + neg_samples:
-            commit_sha = result.get("commit_sha")
-            commit = commit_lookup.get(commit_sha, {})
-
-            sample = ValidationSample(
-                commit_sha=commit_sha,
-                commit_message=commit.get("message", result.get("commit_message", "")),
-                code_diff=result.get("code_diff"),
-                repository=commit.get("repository", result.get("repository", "")),
-                detected_patterns=result.get("patterns_detected", []),
-                detection_method=result.get("detection_method", "keyword"),
-                validation_status="pending",
-            )
-            samples.append(sample)
-
-        self.samples = samples
-        return samples
-
-    def _stratified_sample_by_pattern(self, results: List[Dict], sample_size: int) -> List[Dict]:
-        # Stratified sampling ensuring each pattern category is represented.
-        # Group by dominant pattern
-        pattern_groups = defaultdict(list)
-        for result in results:
-            patterns = result.get("patterns_detected", [])
-            if patterns:
-                # Use first pattern as primary
-                primary_pattern = patterns[0]
-                pattern_groups[primary_pattern].append(result)
-
-        # Calculate samples per pattern (proportional)
-        total = len(results)
-        samples = []
-
-        for pattern, group in pattern_groups.items():
-            proportion = len(group) / total
-            pattern_sample_size = max(1, int(sample_size * proportion))
-            pattern_samples = random.sample(group, min(pattern_sample_size, len(group)))
-            samples.extend(pattern_samples)
-
-        # If we have fewer than sample_size, add random extras
-        if len(samples) < sample_size and len(samples) < len(results):
-            remaining = [r for r in results if r not in samples]
-            extra_needed = min(sample_size - len(samples), len(remaining))
-            samples.extend(random.sample(remaining, extra_needed))
-
-        return samples[:sample_size]
-
-    def _stratified_sample_by_repo(
-        self, results: List[Dict], commit_lookup: Dict, sample_size: int
-    ) -> List[Dict]:
-        # Stratified sampling ensuring each repository is represented.
-        # Group by repository
-        repo_groups = defaultdict(list)
-        for result in results:
-            commit_sha = result.get("commit_sha")
-            commit = commit_lookup.get(commit_sha, {})
-            repo = commit.get("repository", result.get("repository", "unknown"))
-            repo_groups[repo].append(result)
-
-        # Sample proportionally from each repo
-        samples = []
-        total = len(results)
-
-        for repo, group in repo_groups.items():
-            proportion = len(group) / total
-            repo_sample_size = max(1, int(sample_size * proportion))
-            repo_samples = random.sample(group, min(repo_sample_size, len(group)))
-            samples.extend(repo_samples)
-
-        return samples[:sample_size]
-
-    def export_samples_for_review(self, output_path: str) -> None:
-        # Export validation samples to JSON for manual review.
-        samples_data = []
-        for i, sample in enumerate(self.samples, 1):
-            samples_data.append(
-                {
-                    "sample_id": i,
-                    "commit_sha": sample.commit_sha,
-                    "repository": sample.repository,
-                    "commit_message": sample.commit_message,
-                    "detected_patterns": sample.detected_patterns,
-                    "detection_method": sample.detection_method,
-                    "code_diff_preview": sample.code_diff[:500] if sample.code_diff else None,
-                    "validation_status": sample.validation_status,
-                    "true_label": sample.true_label,
-                    "reviewer": sample.reviewer,
-                    "review_notes": sample.review_notes,
-                    "___INSTRUCTIONS___": "Set true_label to true/false, add reviewer name, add review_notes",
-                }
-            )
-
-        with open(output_path, "w") as f:
-            json.dump(samples_data, f, indent=2)
-
-    def import_validated_samples(self, input_path: str) -> None:
-        # Import manually validated samples from JSON.
-        with open(input_path, "r") as f:
-            samples_data = json.load(f)
-
-        # Update samples with validation results
-        for data in samples_data:
-            commit_sha = data["commit_sha"]
-
-            # Find matching sample
-            for sample in self.samples:
-                if sample.commit_sha == commit_sha:
-                    sample.true_label = data.get("true_label")
-                    sample.reviewer = data.get("reviewer")
-                    sample.review_notes = data.get("review_notes")
-                    sample.validation_status = (
-                        "validated" if sample.true_label is not None else "pending"
-                    )
-                    break
-
-    def calculate_metrics(self) -> ValidationMetrics:
-        # Calculate precision, recall, F1, and accuracy.
-        # Count outcomes
-        tp = 0  # True positive: detected as green, truly green
-        fp = 0  # False positive: detected as green, not green
-        tn = 0  # True negative: not detected, truly not green
-        fn = 0  # False negative: not detected, but is green
-
-        for sample in self.samples:
-            if sample.true_label is None:
-                continue  # Skip unvalidated samples
-
-            detected_as_green = len(sample.detected_patterns) > 0
-            truly_green = sample.true_label
-
-            if detected_as_green and truly_green:
-                tp += 1
-            elif detected_as_green and not truly_green:
-                fp += 1
-            elif not detected_as_green and not truly_green:
-                tn += 1
-            elif not detected_as_green and truly_green:
-                fn += 1
-
-        # Calculate metrics
-        total = tp + fp + tn + fn
-        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
-        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
-        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
-        accuracy = (tp + tn) / total if total > 0 else 0
-
-        return ValidationMetrics(
-            true_positives=tp,
-            false_positives=fp,
-            true_negatives=tn,
-            false_negatives=fn,
-            precision=round(precision, 4),
-            recall=round(recall, 4),
-            f1_score=round(f1, 4),
-            accuracy=round(accuracy, 4),
-        )
-
-    def get_validation_report(self) -> Dict:
-        # Generate comprehensive validation report.
-        validated_count = sum(1 for s in self.samples if s.validation_status == "validated")
-        pending_count = sum(1 for s in self.samples if s.validation_status == "pending")
-
-        metrics = self.calculate_metrics() if validated_count > 0 else None
-
-        # Analyze false positives and false negatives
-        false_positives = [
-            {
-                "commit_sha": s.commit_sha,
-                "detected_patterns": s.detected_patterns,
-                "review_notes": s.review_notes,
-            }
-            for s in self.samples
-            if s.true_label is not None and len(s.detected_patterns) > 0 and not s.true_label
-        ]
-
-        false_negatives = [
-            {
-                "commit_sha": s.commit_sha,
-                "commit_message": s.commit_message[:100],
-                "review_notes": s.review_notes,
-            }
-            for s in self.samples
-            if s.true_label is not None and len(s.detected_patterns) == 0 and s.true_label
-        ]
-
-        # Pattern accuracy breakdown
-        pattern_accuracy = self._analyze_pattern_accuracy()
-
-        return {
-            "sampling": {
-                "total_samples": len(self.samples),
-                "validated_samples": validated_count,
-                "pending_samples": pending_count,
-                "validation_progress": (
-                    round(validated_count / len(self.samples) * 100, 1) if self.samples else 0
-                ),
-                "stratification_method": self.stratify_by,
-            },
-            "metrics": {
-                "precision": metrics.precision if metrics else None,
-                "recall": metrics.recall if metrics else None,
-                "f1_score": metrics.f1_score if metrics else None,
-                "accuracy": metrics.accuracy if metrics else None,
-                "true_positives": metrics.true_positives if metrics else None,
-                "false_positives": metrics.false_positives if metrics else None,
-                "true_negatives": metrics.true_negatives if metrics else None,
-                "false_negatives": metrics.false_negatives if metrics else None,
-            },
-            "error_analysis": {
-                "false_positive_count": len(false_positives),
-                "false_negative_count": len(false_negatives),
-                "false_positives": false_positives[:5],  # Top 5
-                "false_negatives": false_negatives[:5],  # Top 5
-            },
-            "pattern_accuracy": pattern_accuracy,
-        }
-
-    def _analyze_pattern_accuracy(self) -> Dict:
-        # Analyze accuracy per pattern category.
-        pattern_stats = defaultdict(lambda: {"tp": 0, "fp": 0})
-
-        for sample in self.samples:
-            if sample.true_label is None:
-                continue
-
-            for pattern in sample.detected_patterns:
-                if sample.true_label:
-                    pattern_stats[pattern]["tp"] += 1
-                else:
-                    pattern_stats[pattern]["fp"] += 1
-
-        # Calculate precision per pattern
-        pattern_accuracy = {}
-        for pattern, stats in pattern_stats.items():
-            total = stats["tp"] + stats["fp"]
-            precision = stats["tp"] / total if total > 0 else 0
-            pattern_accuracy[pattern] = {
-                "true_positives": stats["tp"],
-                "false_positives": stats["fp"],
-                "precision": round(precision, 4),
-            }
-
-        return pattern_accuracy
-
-    def get_inter_rater_reliability(
-        self,
-        samples_from_reviewer_a: List[ValidationSample],
-        samples_from_reviewer_b: List[ValidationSample],
-    ) -> Dict:
-        # Calculate inter-rater reliability (Cohen's Kappa).
-        # Match samples by commit_sha
-        matched_samples = []
-        for sample_a in samples_from_reviewer_a:
-            for sample_b in samples_from_reviewer_b:
-                if sample_a.commit_sha == sample_b.commit_sha:
-                    matched_samples.append((sample_a, sample_b))
-                    break
-
-        if not matched_samples:
-            return {"error": "No matching samples between reviewers"}
-
-        # Calculate agreement
-        agreements = 0
-        for sample_a, sample_b in matched_samples:
-            if sample_a.true_label == sample_b.true_label:
-                agreements += 1
-
-        observed_agreement = agreements / len(matched_samples)
-
-        # Calculate expected agreement (by chance)
-        a_positive = sum(1 for s, _ in matched_samples if s.true_label)
-        b_positive = sum(1 for _, s in matched_samples if s.true_label)
-        n = len(matched_samples)
-
-        p_a_yes = a_positive / n
-        p_b_yes = b_positive / n
-        expected_agreement = (p_a_yes * p_b_yes) + ((1 - p_a_yes) * (1 - p_b_yes))
-
-        # Cohen's Kappa
-        kappa = (
-            (observed_agreement - expected_agreement) / (1 - expected_agreement)
-            if expected_agreement < 1
-            else 1
-        )
-
-        return {
-            "cohens_kappa": round(kappa, 4),
-            "observed_agreement": round(observed_agreement, 4),
-            "expected_agreement": round(expected_agreement, 4),
-            "sample_count": n,
-            "interpretation": self._interpret_kappa(kappa),
-        }
-
-    def _interpret_kappa(self, kappa: float) -> str:
-        # Interpret Cohen's Kappa value.
-        if kappa < 0:
-            return "Poor (less than chance)"
-        elif kappa < 0.20:
-            return "Slight"
-        elif kappa < 0.40:
-            return "Fair"
-        elif kappa < 0.60:
-            return "Moderate"
-        elif kappa < 0.80:
-            return "Substantial"
-        else:
-            return "Almost perfect"
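
The qualitative_analyzer module removed above implemented a manual-validation loop: stratified sampling of detected commits, a JSON export/import round trip for human labeling, precision/recall/F1 over the labeled samples, and two-rater agreement via Cohen's Kappa, kappa = (p_o - p_e) / (1 - p_e), where p_o is the observed agreement and p_e the agreement expected by chance (as computed in get_inter_rater_reliability above). The following sketch shows that workflow against the deleted 1.1.9 API; the commit and result dicts are invented minimal placeholders, while the method names and dict field names follow the source above.

# Hypothetical validation workflow for the QualitativeAnalyzer deleted above (1.1.9 only).
from greenmining.analyzers.qualitative_analyzer import QualitativeAnalyzer

# Invented placeholder inputs shaped like the dicts the module expects.
commits = [{"hash": "abc1234", "message": "cache results to cut recomputation",
            "repository": "org/app"}]
analysis_results = [{"commit_sha": "abc1234", "is_green_aware": True,
                     "patterns_detected": ["caching"], "detection_method": "keyword"}]

analyzer = QualitativeAnalyzer(sample_size=30, stratify_by="pattern")
analyzer.generate_validation_samples(commits, analysis_results)  # 80/20 positive/negative split
analyzer.export_samples_for_review("samples_to_review.json")     # fill in true_label by hand
# ... manual review edits the JSON file here ...
analyzer.import_validated_samples("samples_to_review.json")
print(analyzer.calculate_metrics())      # precision / recall / F1 / accuracy
print(analyzer.get_validation_report())  # adds error analysis and per-pattern precision
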