greenmining 0.1.12__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- greenmining/__version__.py +1 -1
- greenmining/analyzers/__init__.py +17 -0
- greenmining/analyzers/code_diff_analyzer.py +238 -0
- greenmining/analyzers/ml_feature_extractor.py +512 -0
- greenmining/analyzers/nlp_analyzer.py +365 -0
- greenmining/analyzers/qualitative_analyzer.py +460 -0
- greenmining/analyzers/statistical_analyzer.py +245 -0
- greenmining/analyzers/temporal_analyzer.py +434 -0
- greenmining/cli.py +119 -24
- greenmining/config.py +21 -0
- greenmining/controllers/repository_controller.py +50 -2
- greenmining/gsf_patterns.py +10 -5
- greenmining/models/aggregated_stats.py +3 -1
- greenmining/models/commit.py +3 -0
- greenmining/models/repository.py +3 -1
- greenmining/presenters/console_presenter.py +3 -1
- greenmining/services/commit_extractor.py +37 -7
- greenmining/services/data_aggregator.py +171 -7
- greenmining/services/data_analyzer.py +111 -8
- greenmining/services/github_fetcher.py +62 -5
- greenmining/services/reports.py +123 -2
- {greenmining-0.1.12.dist-info → greenmining-1.0.2.dist-info}/METADATA +250 -22
- greenmining-1.0.2.dist-info/RECORD +36 -0
- greenmining-0.1.12.dist-info/RECORD +0 -29
- {greenmining-0.1.12.dist-info → greenmining-1.0.2.dist-info}/WHEEL +0 -0
- {greenmining-0.1.12.dist-info → greenmining-1.0.2.dist-info}/entry_points.txt +0 -0
- {greenmining-0.1.12.dist-info → greenmining-1.0.2.dist-info}/licenses/LICENSE +0 -0
- {greenmining-0.1.12.dist-info → greenmining-1.0.2.dist-info}/top_level.txt +0 -0
greenmining/__version__.py
CHANGED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Analyzers for GreenMining framework."""
|
|
2
|
+
|
|
3
|
+
from .code_diff_analyzer import CodeDiffAnalyzer
|
|
4
|
+
from .statistical_analyzer import EnhancedStatisticalAnalyzer
|
|
5
|
+
from .nlp_analyzer import NLPAnalyzer
|
|
6
|
+
from .temporal_analyzer import TemporalAnalyzer
|
|
7
|
+
from .qualitative_analyzer import QualitativeAnalyzer
|
|
8
|
+
from .ml_feature_extractor import MLFeatureExtractor
|
|
9
|
+
|
|
10
|
+
# Public API of the analyzers package: the six analyzer classes imported
# above, re-exported so that a star-import exposes exactly these names.
__all__ = [
    "CodeDiffAnalyzer",
    "EnhancedStatisticalAnalyzer",
    "NLPAnalyzer",
    "TemporalAnalyzer",
    "QualitativeAnalyzer",
    "MLFeatureExtractor",
]
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
"""Code diff analyzer for detecting green software patterns in code changes."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import Any, Dict, List
|
|
5
|
+
|
|
6
|
+
from pydriller import Commit, ModifiedFile
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class CodeDiffAnalyzer:
    """
    Analyze code diffs to detect green software patterns
    beyond commit message keywords.

    Detection is regex based: every added line of a relevant file is matched
    case-insensitively against ``PATTERN_SIGNATURES`` and the matching lines
    are collected as evidence.
    """

    # Pattern indicators in code changes.
    # Layout: pattern name -> signature group -> list of regular expressions.
    # A pattern counts as present on a line when any regex of any of its
    # groups matches that line.
    # NOTE(review): lines are matched one at a time, so the two
    # ``resources:\s*...`` regexes can only match when both keys are on the
    # same line; the bare ``cache`` regex also matches ``--no-cache``.
    PATTERN_SIGNATURES = {
        "caching": {
            "imports": [
                r"import.*cache",
                r"from.*cache.*import",
                r"import redis",
                r"import memcached",
            ],
            "annotations": [r"@cache", r"@cached", r"@lru_cache", r"@memoize"],
            "function_calls": [r"\.cache\(", r"\.get_cache\(", r"\.set_cache\("],
            "variable_names": [r"cache", r"cached_", r"_cache"],
        },
        "resource_optimization": {
            "kubernetes": [
                r"resources:\s*limits:",
                r"resources:\s*requests:",
                r"memory:\s*[0-9]+Mi",
                r"cpu:\s*[0-9]+m",
            ],
            "docker": [
                r"FROM.*alpine",
                r"FROM.*scratch",
                r"--no-cache",
                r"apt-get.*--no-install-recommends",
            ],
        },
        "database_optimization": {
            "indexes": [r"CREATE.*INDEX", r"@Index", r"add_index"],
            "query_optimization": [
                r"\.select_related\(",
                r"\.prefetch_related\(",
                r"EXPLAIN",
            ],
            "connection_pooling": [
                r"pool_size",
                r"max_connections",
                r"connection_pool",
            ],
        },
        "async_processing": {
            "keywords": [r"\basync\s+def\b", r"\bawait\b", r"asyncio", r"aiohttp"],
            "patterns": [
                r"ThreadPoolExecutor",
                r"ProcessPoolExecutor",
                r"@celery\.task",
            ],
        },
        "lazy_loading": {
            "keywords": [r"lazy", r"defer", r"\.only\(", r"select_related"],
            "patterns": [r"@lazy", r"LazyLoader", r"dynamic.*import"],
        },
    }

    # File name suffixes treated as source code by ``_is_code_file``.
    _CODE_EXTENSIONS = (
        ".py",
        ".java",
        ".go",
        ".js",
        ".ts",
        ".cpp",
        ".c",
        ".cs",
        ".rb",
        ".php",
        ".scala",
        ".kt",
        ".rs",
        ".swift",
    )

    # Substrings whose presence marks a YAML file as a Kubernetes manifest.
    _K8S_MANIFEST_MARKERS = ("kind:", "apiVersion:", "metadata:")

    def analyze_commit_diff(self, commit: Commit) -> Dict[str, Any]:
        """
        Analyze code changes in a commit to detect green patterns.

        Only lines *added* by the commit are inspected, and only in files
        accepted by ``_is_code_file``.

        Args:
            commit: PyDriller Commit object

        Returns:
            Dictionary containing:
            - patterns_detected: List of detected pattern names, without
              duplicates, in the order they were first seen
            - confidence: Confidence level (high/medium/low/none)
            - evidence: Dictionary mapping patterns to evidence lines of the
              form ``"<filename>:<line number> - <first 80 chars of line>"``
            - metrics: Code change metrics
        """
        evidence: Dict[str, List[str]] = {}
        metrics = self._calculate_metrics(commit)

        for modified_file in commit.modified_files:
            # Skip non-code files
            if not self._is_code_file(modified_file):
                continue

            # Read diff_parsed once per file instead of on every use.
            diff_parsed = modified_file.diff_parsed
            added_lines = diff_parsed.get("added") if diff_parsed else None
            if not added_lines:
                continue

            # Each entry is a (line number, line content) pair.
            for line_number, content in added_lines:
                for pattern in self._detect_patterns_in_line(content):
                    evidence.setdefault(pattern, []).append(
                        f"{modified_file.filename}:{line_number} - {content[:80]}"
                    )

        # Every detected pattern has an evidence entry, so the dict keys are
        # the de-duplicated pattern names in deterministic first-seen order.
        patterns_detected = list(evidence)

        # Confidence scoring
        confidence = self._calculate_diff_confidence(patterns_detected, evidence, metrics)

        return {
            "patterns_detected": patterns_detected,
            "confidence": confidence,
            "evidence": evidence,
            "metrics": metrics,
        }

    def _detect_patterns_in_line(self, code_line: str) -> List[str]:
        """
        Detect patterns in a single line of code.

        Matching is case-insensitive. Each pattern name is reported at most
        once per line, even when several of its signature groups match;
        otherwise one line would be counted as multiple pieces of evidence.

        Args:
            code_line: Line of code to analyze

        Returns:
            List of detected pattern names, in ``PATTERN_SIGNATURES`` order
        """
        return [
            pattern_name
            for pattern_name, signatures in self.PATTERN_SIGNATURES.items()
            if any(
                re.search(pattern_regex, code_line, re.IGNORECASE)
                for patterns in signatures.values()
                for pattern_regex in patterns
            )
        ]

    def _calculate_metrics(self, commit: Commit) -> Dict[str, int]:
        """
        Calculate code change metrics.

        Args:
            commit: PyDriller Commit object

        Returns:
            Dictionary with ``lines_added``, ``lines_removed``,
            ``files_changed``, ``net_lines`` and ``complexity_change``
        """
        modified_files = commit.modified_files
        lines_added = sum(f.added_lines for f in modified_files)
        lines_removed = sum(f.deleted_lines for f in modified_files)

        return {
            "lines_added": lines_added,
            "lines_removed": lines_removed,
            "files_changed": len(modified_files),
            "net_lines": lines_added - lines_removed,
            # Placeholder: a real before/after comparison requires static
            # analysis of both file versions. The earlier code summed
            # ``f.complexity`` and subtracted the sum from itself, which is
            # always 0, so the attribute is no longer read here.
            # NOTE(review): reading ``complexity`` is presumably costly in
            # PyDriller - confirm before reinstating a real computation.
            "complexity_change": 0,
        }

    def _calculate_diff_confidence(
        self, patterns: List[str], evidence: Dict[str, List[str]], metrics: Dict[str, int]
    ) -> str:
        """
        Calculate confidence level for diff-based detection.

        Factors:
        - Number of patterns detected
        - Amount of evidence per pattern

        Thresholds: "high" needs at least 3 patterns and 5 evidence lines,
        "medium" at least 2 patterns and 3 evidence lines; anything else
        with at least one pattern is "low".

        Args:
            patterns: List of detected patterns
            evidence: Dictionary mapping patterns to evidence
            metrics: Code change metrics (accepted for interface
                compatibility; not used by the current scoring)

        Returns:
            Confidence level: high/medium/low/none
        """
        if not patterns:
            return "none"

        pattern_count = len(patterns)
        evidence_count = sum(len(lines) for lines in evidence.values())

        if pattern_count >= 3 and evidence_count >= 5:
            return "high"
        if pattern_count >= 2 and evidence_count >= 3:
            return "medium"
        return "low"

    def _is_code_file(self, modified_file: ModifiedFile) -> bool:
        """
        Check if file is a code file (not config, docs, etc.).

        Besides files with a known source extension, Dockerfiles and YAML
        files that look like Kubernetes manifests are accepted.

        Args:
            modified_file: PyDriller ModifiedFile object

        Returns:
            True if file is a code file
        """
        filename = modified_file.filename

        # str.endswith accepts a tuple, so one call covers every extension.
        if filename.endswith(self._CODE_EXTENSIONS):
            return True

        # Also analyze Dockerfiles and Kubernetes manifests
        if "Dockerfile" in filename:
            return True

        if filename.endswith((".yaml", ".yml")):
            # source_code may be empty or None; such files are treated as
            # non-manifests.
            source_code = modified_file.source_code
            return bool(source_code) and any(
                marker in source_code for marker in self._K8S_MANIFEST_MARKERS
            )

        return False
|