greenmining 0.1.12__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,3 @@
1
1
  """Version information for greenmining."""
2
2
 
3
- __version__ = "0.1.7"
3
+ __version__ = "1.0.1"
@@ -0,0 +1,17 @@
1
+ """Analyzers for GreenMining framework."""
2
+
3
+ from .code_diff_analyzer import CodeDiffAnalyzer
4
+ from .statistical_analyzer import EnhancedStatisticalAnalyzer
5
+ from .nlp_analyzer import NLPAnalyzer
6
+ from .temporal_analyzer import TemporalAnalyzer
7
+ from .qualitative_analyzer import QualitativeAnalyzer
8
+ from .ml_feature_extractor import MLFeatureExtractor
9
+
10
# Public names of the analyzers package: exactly the six analyzer classes
# imported above, which is what `from <package> import *` will expose.
__all__ = [
    "CodeDiffAnalyzer",
    "EnhancedStatisticalAnalyzer",
    "NLPAnalyzer",
    "TemporalAnalyzer",
    "QualitativeAnalyzer",
    "MLFeatureExtractor",
]
@@ -0,0 +1,238 @@
1
+ """Code diff analyzer for detecting green software patterns in code changes."""
2
+
3
+ import re
4
+ from typing import Any, Dict, List
5
+
6
+ from pydriller import Commit, ModifiedFile
7
+
8
+
9
class CodeDiffAnalyzer:
    """
    Analyze code diffs to detect green software patterns
    beyond commit message keywords.

    Detection is purely regex based: every added line of a commit is matched
    (case-insensitively) against ``PATTERN_SIGNATURES`` and each matching
    category is reported together with the lines that triggered it.
    """

    # Pattern indicators in code changes.
    # Layout: {pattern category: {signature group: [regex, ...]}}.
    # The signature group names are descriptive only; a category counts as
    # detected as soon as any regex in any of its groups matches.
    PATTERN_SIGNATURES = {
        "caching": {
            "imports": [
                r"import.*cache",
                r"from.*cache.*import",
                r"import redis",
                r"import memcached",
            ],
            "annotations": [r"@cache", r"@cached", r"@lru_cache", r"@memoize"],
            "function_calls": [r"\.cache\(", r"\.get_cache\(", r"\.set_cache\("],
            "variable_names": [r"cache", r"cached_", r"_cache"],
        },
        "resource_optimization": {
            "kubernetes": [
                r"resources:\s*limits:",
                r"resources:\s*requests:",
                r"memory:\s*[0-9]+Mi",
                r"cpu:\s*[0-9]+m",
            ],
            "docker": [
                r"FROM.*alpine",
                r"FROM.*scratch",
                r"--no-cache",
                r"apt-get.*--no-install-recommends",
            ],
        },
        "database_optimization": {
            "indexes": [r"CREATE.*INDEX", r"@Index", r"add_index"],
            "query_optimization": [
                r"\.select_related\(",
                r"\.prefetch_related\(",
                r"EXPLAIN",
            ],
            "connection_pooling": [
                r"pool_size",
                r"max_connections",
                r"connection_pool",
            ],
        },
        "async_processing": {
            "keywords": [r"\basync\s+def\b", r"\bawait\b", r"asyncio", r"aiohttp"],
            "patterns": [
                r"ThreadPoolExecutor",
                r"ProcessPoolExecutor",
                r"@celery\.task",
            ],
        },
        "lazy_loading": {
            "keywords": [r"lazy", r"defer", r"\.only\(", r"select_related"],
            "patterns": [r"@lazy", r"LazyLoader", r"dynamic.*import"],
        },
    }

    # File name suffixes treated as source code by ``_is_code_file``.
    # Kept as a tuple so it can be handed straight to ``str.endswith``.
    _CODE_EXTENSIONS = (
        ".py",
        ".java",
        ".go",
        ".js",
        ".ts",
        ".cpp",
        ".c",
        ".cs",
        ".rb",
        ".php",
        ".scala",
        ".kt",
        ".rs",
        ".swift",
    )

    def analyze_commit_diff(self, commit: Commit) -> Dict[str, Any]:
        """
        Analyze code changes in a commit to detect green patterns.

        Only lines *added* by the commit are inspected, and only in files
        accepted by ``_is_code_file``. The metrics, however, are computed
        over every modified file of the commit.

        Args:
            commit: PyDriller Commit object

        Returns:
            Dictionary containing:
            - patterns_detected: Unique pattern names, in first-seen order
            - confidence: Confidence level (high/medium/low/none)
            - evidence: Dictionary mapping each pattern to a list of
              "<filename>:<line number> - <first 80 chars of the line>"
            - metrics: Code change metrics (see ``_calculate_metrics``)
        """
        evidence: Dict[str, List[str]] = {}
        metrics = self._calculate_metrics(commit)

        for modified_file in commit.modified_files:
            # Skip non-code files
            if not self._is_code_file(modified_file):
                continue

            # Read diff_parsed once per file instead of on every use.
            # NOTE(review): in PyDriller this looks like a computed property,
            # so repeated access would re-parse the diff - confirm.
            parsed_diff = modified_file.diff_parsed
            if not parsed_diff:
                continue

            # Each entry is a (line number, line content) pair.
            for line_number, content in parsed_diff.get("added") or ():
                for pattern in self._detect_patterns_in_line(content):
                    evidence.setdefault(pattern, []).append(
                        f"{modified_file.filename}:{line_number} - {content[:80]}"
                    )

        # Every detected pattern has at least one evidence entry, so the
        # evidence keys are the de-duplicated patterns; dict insertion order
        # makes the result deterministic (first-seen order).
        patterns_detected = list(evidence)

        # Confidence scoring
        confidence = self._calculate_diff_confidence(patterns_detected, evidence, metrics)

        return {
            "patterns_detected": patterns_detected,
            "confidence": confidence,
            "evidence": evidence,
            "metrics": metrics,
        }

    def _detect_patterns_in_line(self, code_line: str) -> List[str]:
        """
        Detect patterns in a single line of code.

        A pattern category is reported at most once per line, no matter how
        many of its signature groups match. Reporting it once per matching
        group would make the caller record the same evidence line several
        times and inflate the confidence score.

        Args:
            code_line: Line of code to analyze

        Returns:
            List of detected pattern names, without duplicates, in the
            order the categories appear in ``PATTERN_SIGNATURES``
        """
        detected = []

        for pattern_name, signatures in self.PATTERN_SIGNATURES.items():
            # any() stops at the first matching regex across all groups.
            if any(
                re.search(pattern_regex, code_line, re.IGNORECASE)
                for patterns in signatures.values()
                for pattern_regex in patterns
            ):
                detected.append(pattern_name)

        return detected

    def _calculate_metrics(self, commit: Commit) -> Dict[str, int]:
        """
        Calculate code change metrics.

        Args:
            commit: PyDriller Commit object

        Returns:
            Dictionary with the keys ``lines_added``, ``lines_removed``,
            ``files_changed``, ``net_lines`` (added minus removed) and
            ``complexity_change``. The last one is currently always 0
            because no "after" complexity is computed yet.
        """
        modified_files = commit.modified_files

        lines_added = sum(f.added_lines for f in modified_files)
        lines_removed = sum(f.deleted_lines for f in modified_files)
        files_changed = len(modified_files)

        # Complexity change (requires static analysis - simplified for now).
        # A missing (None) per-file complexity counts as 0.
        complexity_before = sum(f.complexity or 0 for f in modified_files)
        complexity_after = complexity_before  # Simplified

        return {
            "lines_added": lines_added,
            "lines_removed": lines_removed,
            "files_changed": files_changed,
            "net_lines": lines_added - lines_removed,
            "complexity_change": complexity_after - complexity_before,
        }

    def _calculate_diff_confidence(
        self, patterns: List[str], evidence: Dict[str, List[str]], metrics: Dict[str, int]
    ) -> str:
        """
        Calculate confidence level for diff-based detection.

        Factors:
        - Number of patterns detected
        - Total number of evidence lines across all patterns

        Thresholds: "high" needs at least 3 patterns and 5 evidence lines,
        "medium" at least 2 patterns and 3 evidence lines; anything else
        with at least one pattern is "low".

        Args:
            patterns: List of detected patterns
            evidence: Dictionary mapping patterns to evidence
            metrics: Code change metrics (accepted for interface
                compatibility; not used by the current scoring)

        Returns:
            Confidence level: high/medium/low/none
        """
        if not patterns:
            return "none"

        evidence_count = sum(len(v) for v in evidence.values())

        if len(patterns) >= 3 and evidence_count >= 5:
            return "high"
        if len(patterns) >= 2 and evidence_count >= 3:
            return "medium"
        return "low"

    def _is_code_file(self, modified_file: ModifiedFile) -> bool:
        """
        Check if file should be analyzed for code patterns.

        Accepted are files with a known source code extension, files whose
        name contains "Dockerfile", and YAML files whose content contains
        one of the markers "kind:", "apiVersion:" or "metadata:".

        Args:
            modified_file: PyDriller ModifiedFile object

        Returns:
            True if file is a code file
        """
        filename = modified_file.filename

        # Check file extension
        if filename.endswith(self._CODE_EXTENSIONS):
            return True

        # Also analyze Dockerfiles and Kubernetes manifests
        if "Dockerfile" in filename:
            return True
        if filename.endswith((".yaml", ".yml")):
            # Check if it's a Kubernetes manifest. source_code may be
            # empty/None, in which case the file is not analyzed.
            source = modified_file.source_code
            if source and any(k in source for k in ["kind:", "apiVersion:", "metadata:"]):
                return True

        return False