greenmining 1.0.3__py3-none-any.whl → 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- greenmining/__init__.py +11 -29
- greenmining/__main__.py +9 -3
- greenmining/__version__.py +2 -2
- greenmining/analyzers/__init__.py +3 -7
- greenmining/analyzers/code_diff_analyzer.py +151 -61
- greenmining/analyzers/qualitative_analyzer.py +15 -81
- greenmining/analyzers/statistical_analyzer.py +8 -69
- greenmining/analyzers/temporal_analyzer.py +16 -72
- greenmining/config.py +105 -58
- greenmining/controllers/__init__.py +1 -5
- greenmining/controllers/repository_controller.py +153 -94
- greenmining/energy/__init__.py +13 -0
- greenmining/energy/base.py +165 -0
- greenmining/energy/codecarbon_meter.py +146 -0
- greenmining/energy/rapl.py +157 -0
- greenmining/gsf_patterns.py +4 -26
- greenmining/models/__init__.py +1 -5
- greenmining/models/aggregated_stats.py +4 -4
- greenmining/models/analysis_result.py +4 -4
- greenmining/models/commit.py +5 -5
- greenmining/models/repository.py +5 -5
- greenmining/presenters/__init__.py +1 -5
- greenmining/presenters/console_presenter.py +24 -24
- greenmining/services/__init__.py +10 -6
- greenmining/services/commit_extractor.py +8 -152
- greenmining/services/data_aggregator.py +45 -175
- greenmining/services/data_analyzer.py +9 -202
- greenmining/services/github_fetcher.py +212 -323
- greenmining/services/github_graphql_fetcher.py +371 -0
- greenmining/services/local_repo_analyzer.py +387 -0
- greenmining/services/reports.py +33 -137
- greenmining/utils.py +21 -149
- {greenmining-1.0.3.dist-info → greenmining-1.0.4.dist-info}/METADATA +61 -151
- greenmining-1.0.4.dist-info/RECORD +37 -0
- {greenmining-1.0.3.dist-info → greenmining-1.0.4.dist-info}/WHEEL +1 -1
- greenmining/analyzers/ml_feature_extractor.py +0 -512
- greenmining/analyzers/nlp_analyzer.py +0 -365
- greenmining/cli.py +0 -471
- greenmining/main.py +0 -37
- greenmining-1.0.3.dist-info/RECORD +0 -36
- greenmining-1.0.3.dist-info/entry_points.txt +0 -2
- {greenmining-1.0.3.dist-info → greenmining-1.0.4.dist-info}/licenses/LICENSE +0 -0
- {greenmining-1.0.3.dist-info → greenmining-1.0.4.dist-info}/top_level.txt +0 -0
greenmining/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-
+# Green Microservices Mining - GSF Pattern Analysis Tool.

 from greenmining.config import Config
 from greenmining.controllers.repository_controller import RepositoryController
@@ -9,44 +9,26 @@ from greenmining.gsf_patterns import (
     is_green_aware,
 )

-__version__ = "1.0.
+__version__ = "1.0.4"


 def fetch_repositories(
     github_token: str,
-    max_repos: int =
-    min_stars: int =
+    max_repos: int = None,
+    min_stars: int = None,
     languages: list = None,
-    keywords: str =
+    keywords: str = None,
 ):
-
-
-    Args:
-        github_token: GitHub personal access token
-        max_repos: Maximum number of repositories to fetch (default: 100)
-        min_stars: Minimum GitHub stars required (default: 100)
-        languages: List of programming languages to filter (default: ["Python", "Java", "Go", "JavaScript", "TypeScript"])
-        keywords: Search keywords (default: "microservices")
-
-    Returns:
-        List of Repository model instances
-
-    Example:
-        >>> from greenmining import fetch_repositories
-        >>> repos = fetch_repositories(
-        ... github_token="your_token",
-        ... max_repos=50,
-        ... keywords="kubernetes cloud-native",
-        ... min_stars=500
-        ... )
-        >>> print(f"Found {len(repos)} repositories")
-    """
+    # Fetch repositories from GitHub with custom search keywords.
     config = Config()
     config.GITHUB_TOKEN = github_token
     controller = RepositoryController(config)

     return controller.fetch_repositories(
-        max_repos=max_repos,
+        max_repos=max_repos,
+        min_stars=min_stars,
+        languages=languages,
+        keywords=keywords,
     )


@@ -58,4 +40,4 @@ __all__ = [
     "get_pattern_by_keywords",
     "fetch_repositories",
     "__version__",
-]
+]
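The public helper now defaults every filter to None instead of hard-coded values. A minimal usage sketch of the updated call, mirroring the example that was removed from the docstring (the token and filter values below are illustrative and any keyword argument may be omitted):

    from greenmining import fetch_repositories

    # Illustrative values; omit any filter to fall back to the library defaults.
    repos = fetch_repositories(
        github_token="your_token",
        max_repos=50,
        min_stars=500,
        keywords="kubernetes cloud-native",
    )
    print(f"Found {len(repos)} repositories")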
greenmining/__main__.py
CHANGED
@@ -1,6 +1,12 @@
-
+# Allow running greenmining as a module: python -m greenmining
+# This is a library - use Python API for programmatic access.

-from greenmining
+from greenmining import __version__

 if __name__ == "__main__":
-
+    print(f"greenmining v{__version__}")
+    print("This is a Python library for analyzing green software patterns.")
+    print("\nUsage:")
+    print(" from greenmining import GSF_PATTERNS, is_green_aware, get_pattern_by_keywords")
+    print(" from greenmining.services import GitHubFetcher, CommitExtractor, DataAnalyzer")
+    print("\nDocumentation: https://github.com/adam-bouafia/greenmining")
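With the CLI module and console entry point removed in 1.0.4 (cli.py and entry_points.txt above), running python -m greenmining now only prints the version and usage hints. A hedged sketch of the import-based usage those hints point to (the commit message is illustrative and the helpers' exact return types are not shown in this diff):

    from greenmining import GSF_PATTERNS, is_green_aware

    # Illustrative text; is_green_aware is assumed here to accept a commit message string.
    print(len(GSF_PATTERNS))
    print(is_green_aware("Enable gzip compression to reduce response payload size"))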
greenmining/__version__.py
CHANGED
@@ -1,3 +1,3 @@
-
+# Version information for greenmining.

-__version__ = "1.0.3"
+__version__ = "1.0.4"

greenmining/analyzers/__init__.py
CHANGED

@@ -1,17 +1,13 @@
-
+# Analyzers for GreenMining framework.

 from .code_diff_analyzer import CodeDiffAnalyzer
-from .statistical_analyzer import
-from .nlp_analyzer import NLPAnalyzer
+from .statistical_analyzer import StatisticalAnalyzer
 from .temporal_analyzer import TemporalAnalyzer
 from .qualitative_analyzer import QualitativeAnalyzer
-from .ml_feature_extractor import MLFeatureExtractor

 __all__ = [
     "CodeDiffAnalyzer",
-    "
-    "NLPAnalyzer",
+    "StatisticalAnalyzer",
     "TemporalAnalyzer",
     "QualitativeAnalyzer",
-    "MLFeatureExtractor",
 ]
greenmining/analyzers/code_diff_analyzer.py
CHANGED

@@ -1,4 +1,4 @@
-
+# Code diff analyzer for detecting green software patterns in code changes.

 import re
 from typing import Any, Dict, List
@@ -7,10 +7,7 @@ from pydriller import Commit, ModifiedFile


 class CodeDiffAnalyzer:
-
-    Analyze code diffs to detect green software patterns
-    beyond commit message keywords.
-    """
+    # Analyze code diffs to detect green software patterns

     # Pattern indicators in code changes
     PATTERN_SIGNATURES = {
@@ -64,22 +61,154 @@ class CodeDiffAnalyzer:
             "keywords": [r"lazy", r"defer", r"\.only\(", r"select_related"],
             "patterns": [r"@lazy", r"LazyLoader", r"dynamic.*import"],
         },
+        # NEW: Serverless computing patterns
+        "serverless_computing": {
+            "providers": [
+                r"aws.*lambda",
+                r"@app\.route",
+                r"functions\.https",
+                r"azure.*function",
+            ],
+            "frameworks": [r"serverless", r"chalice", r"zappa", r"claudia"],
+            "keywords": [r"lambda_handler", r"cloud.*function", r"function.*app"],
+        },
+        # NEW: CDN and edge computing
+        "cdn_edge": {
+            "providers": [
+                r"cloudflare",
+                r"cloudfront",
+                r"fastly",
+                r"akamai",
+                r"cdn\.js",
+            ],
+            "keywords": [
+                r"edge.*cache",
+                r"cdn",
+                r"\.distribute\(",
+                r"edge.*function",
+            ],
+        },
+        # NEW: Compression patterns
+        "compression": {
+            "algorithms": [r"gzip", r"brotli", r"deflate", r"zstd", r"lz4"],
+            "keywords": [
+                r"compress",
+                r"decompress",
+                r"\.gz\b",
+                r"Content-Encoding",
+            ],
+            "libraries": [r"import gzip", r"import zlib", r"import brotli"],
+        },
+        # NEW: ML model optimization
+        "model_optimization": {
+            "techniques": [
+                r"quantize",
+                r"quantization",
+                r"prune",
+                r"pruning",
+                r"distill",
+            ],
+            "formats": [r"onnx", r"tensorrt", r"tflite", r"coreml"],
+            "keywords": [
+                r"int8",
+                r"fp16",
+                r"mixed.*precision",
+                r"model\.optimize",
+            ],
+        },
+        # NEW: Efficient protocols (HTTP/2, gRPC)
+        "efficient_protocols": {
+            "http2": [r"http2", r"http/2", r"h2", r"alpn"],
+            "grpc": [r"grpc", r"protobuf", r"\.proto\b"],
+            "keywords": [
+                r"stream",
+                r"multiplexing",
+                r"server.*push",
+                r"binary.*protocol",
+            ],
+        },
+        # NEW: Container optimization
+        "container_optimization": {
+            "base_images": [
+                r"FROM.*alpine",
+                r"FROM.*scratch",
+                r"FROM.*distroless",
+            ],
+            "techniques": [
+                r"multi-stage",
+                r"--no-install-recommends",
+                r"&&.*rm.*-rf",
+                r"\.dockerignore",
+            ],
+            "keywords": [r"layer.*cache", r"build.*cache", r"image.*size"],
+        },
+        # NEW: Green cloud regions
+        "green_regions": {
+            "regions": [
+                r"eu-west",
+                r"eu-north",
+                r"sweden",
+                r"norway",
+                r"canada",
+            ],
+            "keywords": [
+                r"renewable",
+                r"green.*region",
+                r"sustainable.*region",
+                r"carbon.*neutral",
+            ],
+        },
+        # NEW: Auto-scaling patterns
+        "auto_scaling": {
+            "kubernetes": [
+                r"HorizontalPodAutoscaler",
+                r"autoscaling/v",
+                r"hpa",
+                r"minReplicas",
+                r"maxReplicas",
+            ],
+            "cloud": [
+                r"auto.*scal",
+                r"scale.*to.*zero",
+                r"ScalingPolicy",
+                r"TargetTracking",
+            ],
+            "keywords": [
+                r"scale.*up",
+                r"scale.*down",
+                r"metrics.*server",
+                r"cpu.*utilization",
+            ],
+        },
+        # NEW: Code splitting and lazy loading (web)
+        "code_splitting": {
+            "webpack": [
+                r"dynamic.*import",
+                r"lazy.*load",
+                r"code.*split",
+                r"chunk",
+            ],
+            "react": [r"React\.lazy", r"Suspense", r"loadable"],
+            "keywords": [r"bundle", r"split.*chunk", r"async.*component"],
+        },
+        # NEW: Green ML training
+        "green_ml_training": {
+            "keywords": [
+                r"early.*stopping",
+                r"learning.*rate.*scheduler",
+                r"gradient.*checkpointing",
+                r"mixed.*precision",
+            ],
+            "frameworks": [
+                r"apex",
+                r"torch\.cuda\.amp",
+                r"tf\.keras\.mixed_precision",
+            ],
+        },
     }

     def analyze_commit_diff(self, commit: Commit) -> Dict[str, Any]:
-
-        Analyze code changes in a commit to detect green patterns.
-
-        Args:
-            commit: PyDriller Commit object
-
-        Returns:
-            Dictionary containing:
-            - patterns_detected: List of detected pattern names
-            - confidence: Confidence level (high/medium/low/none)
-            - evidence: Dictionary mapping patterns to evidence lines
-            - metrics: Code change metrics
-        """
+        # Analyze code changes in a commit to detect green patterns.
         patterns_detected = []
         evidence = {}
         metrics = self._calculate_metrics(commit)
@@ -116,15 +245,7 @@ class CodeDiffAnalyzer:
         }

     def _detect_patterns_in_line(self, code_line: str) -> List[str]:
-
-        Detect patterns in a single line of code.
-
-        Args:
-            code_line: Line of code to analyze
-
-        Returns:
-            List of detected pattern names
-        """
+        # Detect patterns in a single line of code.
         detected = []

         for pattern_name, signatures in self.PATTERN_SIGNATURES.items():
@@ -137,15 +258,7 @@ class CodeDiffAnalyzer:
         return detected

     def _calculate_metrics(self, commit: Commit) -> Dict[str, int]:
-
-        Calculate code change metrics.
-
-        Args:
-            commit: PyDriller Commit object
-
-        Returns:
-            Dictionary of metrics
-        """
+        # Calculate code change metrics.
         lines_added = sum(f.added_lines for f in commit.modified_files)
         lines_removed = sum(f.deleted_lines for f in commit.modified_files)
         files_changed = len(commit.modified_files)
@@ -165,22 +278,7 @@ class CodeDiffAnalyzer:
     def _calculate_diff_confidence(
         self, patterns: List[str], evidence: Dict[str, List[str]], metrics: Dict[str, int]
     ) -> str:
-
-        Calculate confidence level for diff-based detection.
-
-        Factors:
-        - Number of patterns detected
-        - Amount of evidence per pattern
-        - Code change magnitude
-
-        Args:
-            patterns: List of detected patterns
-            evidence: Dictionary mapping patterns to evidence
-            metrics: Code change metrics
-
-        Returns:
-            Confidence level: high/medium/low/none
-        """
+        # Calculate confidence level for diff-based detection.
         if not patterns:
             return "none"

@@ -194,15 +292,7 @@ class CodeDiffAnalyzer:
         return "low"

     def _is_code_file(self, modified_file: ModifiedFile) -> bool:
-
-        Check if file is a code file (not config, docs, etc.).
-
-        Args:
-            modified_file: PyDriller ModifiedFile object
-
-        Returns:
-            True if file is a code file
-        """
+        # Check if file is a code file (not config, docs, etc.).
         code_extensions = [
             ".py",
             ".java",
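The new signature groups feed the same per-line regex matching that _detect_patterns_in_line already performs. A minimal standalone sketch (not the package's code) of how one added diff line could be checked against a couple of the new groups:

    import re

    # Hypothetical, flattened signature table for illustration only.
    SIGNATURES = {
        "compression": [r"gzip", r"brotli", r"compress", r"Content-Encoding"],
        "auto_scaling": [r"HorizontalPodAutoscaler", r"scale.*to.*zero", r"minReplicas"],
    }

    def detect_patterns_in_line(code_line):
        # Return every signature group with at least one regex hit in the line.
        detected = []
        for pattern_name, regexes in SIGNATURES.items():
            if any(re.search(rx, code_line, re.IGNORECASE) for rx in regexes):
                detected.append(pattern_name)
        return detected

    print(detect_patterns_in_line("response.headers['Content-Encoding'] = 'gzip'"))
    # -> ['compression']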
greenmining/analyzers/qualitative_analyzer.py
CHANGED

@@ -1,15 +1,4 @@
-
-Qualitative Analysis Framework for Pattern Validation
-
-Implements qualitative validation from Soliman et al. (2017):
-- Stratified random sampling for manual validation
-- Precision/recall calculation framework
-- Inter-rater reliability support
-- False positive/negative tracking
-
-Based on Soliman et al.: 42/151 studies used qualitative analysis
-Critical for: validating IR-based approaches, calculating accuracy metrics
-"""
+# Qualitative Analysis Framework for Pattern Validation

 from __future__ import annotations

@@ -22,7 +11,7 @@ import json

 @dataclass
 class ValidationSample:
-
+    # Represents a single validation sample

     commit_sha: str
     commit_message: str
@@ -38,7 +27,7 @@ class ValidationSample:

 @dataclass
 class ValidationMetrics:
-
+    # Precision/recall metrics for validation

     true_positives: int
     false_positives: int
@@ -51,26 +40,10 @@ class ValidationMetrics:


 class QualitativeAnalyzer:
-
-    Framework for manual validation and qualitative analysis.
-
-    Implements:
-    1. Stratified sampling (ensure representation across categories)
-    2. Validation workflow (export → review → import → calculate metrics)
-    3. Precision/recall calculation
-    4. Inter-rater reliability (if multiple reviewers)
-
-    Based on Soliman et al.: "42 studies used qualitative analysis for validation"
-    """
+    # Framework for manual validation and qualitative analysis.

     def __init__(self, sample_size: int = 30, stratify_by: str = "pattern"):
-
-        Initialize qualitative analyzer.
-
-        Args:
-            sample_size: Number of commits to sample for validation
-            stratify_by: Stratification method ('pattern', 'repository', 'time', 'random')
-        """
+        # Initialize qualitative analyzer.
         self.sample_size = sample_size
         self.stratify_by = stratify_by
         self.samples: List[ValidationSample] = []
@@ -78,17 +51,7 @@ class QualitativeAnalyzer:
     def generate_validation_samples(
         self, commits: List[Dict], analysis_results: List[Dict], include_negatives: bool = True
     ) -> List[ValidationSample]:
-
-        Generate stratified validation samples.
-
-        Args:
-            commits: All commits
-            analysis_results: Pattern detection results
-            include_negatives: Include non-green commits for false negative detection
-
-        Returns:
-            List of ValidationSample objects
-        """
+        # Generate stratified validation samples.
         # Build commit lookup
         commit_lookup = {c.get("hash", c.get("sha")): c for c in commits}

@@ -141,7 +104,7 @@
         return samples

     def _stratified_sample_by_pattern(self, results: List[Dict], sample_size: int) -> List[Dict]:
-
+        # Stratified sampling ensuring each pattern category is represented.
         # Group by dominant pattern
         pattern_groups = defaultdict(list)
         for result in results:
@@ -172,7 +135,7 @@
     def _stratified_sample_by_repo(
         self, results: List[Dict], commit_lookup: Dict, sample_size: int
     ) -> List[Dict]:
-
+        # Stratified sampling ensuring each repository is represented.
         # Group by repository
         repo_groups = defaultdict(list)
         for result in results:
@@ -194,12 +157,7 @@
         return samples[:sample_size]

     def export_samples_for_review(self, output_path: str) -> None:
-
-        Export validation samples to JSON for manual review.
-
-        Args:
-            output_path: Path to output JSON file
-        """
+        # Export validation samples to JSON for manual review.
         samples_data = []
         for i, sample in enumerate(self.samples, 1):
             samples_data.append(
@@ -223,12 +181,7 @@
         json.dump(samples_data, f, indent=2)

     def import_validated_samples(self, input_path: str) -> None:
-
-        Import manually validated samples from JSON.
-
-        Args:
-            input_path: Path to JSON file with validated samples
-        """
+        # Import manually validated samples from JSON.
         with open(input_path, "r") as f:
             samples_data = json.load(f)

@@ -248,12 +201,7 @@
                     break

     def calculate_metrics(self) -> ValidationMetrics:
-
-        Calculate precision, recall, F1, and accuracy.
-
-        Returns:
-            ValidationMetrics object
-        """
+        # Calculate precision, recall, F1, and accuracy.
         # Count outcomes
         tp = 0  # True positive: detected as green, truly green
         fp = 0  # False positive: detected as green, not green
@@ -295,12 +243,7 @@
         )

     def get_validation_report(self) -> Dict:
-
-        Generate comprehensive validation report.
-
-        Returns:
-            Dictionary with validation statistics and metrics
-        """
+        # Generate comprehensive validation report.
         validated_count = sum(1 for s in self.samples if s.validation_status == "validated")
         pending_count = sum(1 for s in self.samples if s.validation_status == "pending")

@@ -360,7 +303,7 @@
         }

     def _analyze_pattern_accuracy(self) -> Dict:
-
+        # Analyze accuracy per pattern category.
         pattern_stats = defaultdict(lambda: {"tp": 0, "fp": 0})

         for sample in self.samples:
@@ -391,16 +334,7 @@
         samples_from_reviewer_a: List[ValidationSample],
         samples_from_reviewer_b: List[ValidationSample],
     ) -> Dict:
-
-        Calculate inter-rater reliability (Cohen's Kappa).
-
-        Args:
-            samples_from_reviewer_a: Samples validated by reviewer A
-            samples_from_reviewer_b: Samples validated by reviewer B (same commits)
-
-        Returns:
-            Dictionary with Cohen's Kappa and agreement statistics
-        """
+        # Calculate inter-rater reliability (Cohen's Kappa).
         # Match samples by commit_sha
         matched_samples = []
         for sample_a in samples_from_reviewer_a:
@@ -445,7 +379,7 @@
         }

     def _interpret_kappa(self, kappa: float) -> str:
-
+        # Interpret Cohen's Kappa value.
         if kappa < 0:
             return "Poor (less than chance)"
         elif kappa < 0.20:
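For orientation, calculate_metrics reduces to the standard precision/recall/F1/accuracy arithmetic over the manually validated samples. A small worked sketch with illustrative counts (not taken from the package):

    # tp/fp/fn/tn: detected-vs-truly-green outcomes from manual validation (made-up numbers).
    tp, fp, fn, tn = 24, 3, 2, 11

    precision = tp / (tp + fp)                    # 24/27 ≈ 0.889
    recall = tp / (tp + fn)                       # 24/26 ≈ 0.923
    f1 = 2 * precision * recall / (precision + recall)
    accuracy = (tp + tn) / (tp + fp + fn + tn)
    print(f"precision={precision:.3f} recall={recall:.3f} f1={f1:.3f} accuracy={accuracy:.3f}")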
greenmining/analyzers/statistical_analyzer.py
CHANGED

@@ -1,4 +1,4 @@
-
+# Statistical analyzer for green software patterns.

 from __future__ import annotations

@@ -9,24 +9,11 @@ import pandas as pd
 from scipy import stats


-class EnhancedStatisticalAnalyzer:
-
-    Advanced statistical analyses for green software patterns.
-    Based on Soliman et al. quantitative validation techniques.
-    """
+class StatisticalAnalyzer:
+    # Advanced statistical analyses for green software patterns.

     def analyze_pattern_correlations(self, commit_data: pd.DataFrame) -> Dict[str, Any]:
-
-        Analyze correlations between patterns.
-
-        Question: Do repositories that adopt caching also adopt resource limits?
-
-        Args:
-            commit_data: DataFrame with pattern columns
-
-        Returns:
-            Dictionary containing correlation matrix and significant pairs
-        """
+        # Analyze correlations between patterns.
         # Create pattern co-occurrence matrix
         pattern_columns = [col for col in commit_data.columns if col.startswith("pattern_")]

@@ -61,20 +48,7 @@ class EnhancedStatisticalAnalyzer:
         }

     def temporal_trend_analysis(self, commits_df: pd.DataFrame) -> Dict[str, Any]:
-
-        Analyze temporal trends in green awareness.
-
-        Techniques:
-        - Mann-Kendall trend test (monotonic trend detection)
-        - Seasonal decomposition (identify cyclical patterns)
-        - Change point detection (identify sudden shifts)
-
-        Args:
-            commits_df: DataFrame with date and green_aware columns
-
-        Returns:
-            Dictionary containing trend analysis results
-        """
+        # Analyze temporal trends in green awareness.
         # Prepare time series data
         commits_df["date"] = pd.to_datetime(commits_df["date"])
         commits_df = commits_df.sort_values("date")
@@ -127,21 +101,7 @@ class EnhancedStatisticalAnalyzer:
         }

     def effect_size_analysis(self, group1: List[float], group2: List[float]) -> Dict[str, Any]:
-
-        Calculate effect size between two groups.
-
-        Use case: Compare green awareness between:
-        - Different programming languages
-        - Different time periods
-        - Different repository sizes
-
-        Args:
-            group1: First group values
-            group2: Second group values
-
-        Returns:
-            Dictionary containing effect size metrics
-        """
+        # Calculate effect size between two groups.
         # Cohen's d (effect size)
         mean1, mean2 = np.mean(group1), np.mean(group2)
         std1, std2 = np.std(group1, ddof=1), np.std(group2, ddof=1)
@@ -175,20 +135,7 @@
         }

     def pattern_adoption_rate_analysis(self, commits_df: pd.DataFrame) -> Dict[str, Any]:
-
-        Analyze pattern adoption rates over repository lifetime.
-
-        Metrics:
-        - Time to first adoption (TTFA)
-        - Adoption acceleration
-        - Pattern stickiness (continued use after adoption)
-
-        Args:
-            commits_df: DataFrame with pattern and date columns
-
-        Returns:
-            Dictionary mapping patterns to adoption metrics
-        """
+        # Analyze pattern adoption rates over repository lifetime.
         results = {}

         for pattern in commits_df["pattern"].unique():
@@ -220,15 +167,7 @@
         return results

     def _interpret_correlations(self, significant_pairs: List[Dict[str, Any]]) -> str:
-
-        Generate interpretation of correlation results.
-
-        Args:
-            significant_pairs: List of significant correlation pairs
-
-        Returns:
-            Interpretation string
-        """
+        # Generate interpretation of correlation results.
         if not significant_pairs:
             return "No significant correlations found between patterns."

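effect_size_analysis computes Cohen's d from the group means and sample standard deviations shown above. A short sketch of that calculation using a pooled standard deviation (the group values are illustrative, not from the package):

    import numpy as np

    # e.g. green-aware commit ratios for two illustrative repository groups.
    group1 = [0.12, 0.18, 0.15, 0.22, 0.19]
    group2 = [0.08, 0.10, 0.09, 0.12, 0.11]

    mean1, mean2 = np.mean(group1), np.mean(group2)
    std1, std2 = np.std(group1, ddof=1), np.std(group2, ddof=1)
    n1, n2 = len(group1), len(group2)

    # Pooled standard deviation, then Cohen's d.
    pooled_std = np.sqrt(((n1 - 1) * std1**2 + (n2 - 1) * std2**2) / (n1 + n2 - 2))
    cohens_d = (mean1 - mean2) / pooled_std
    print(f"Cohen's d = {cohens_d:.2f}")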