greenmining 0.1.12__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,365 @@
"""
NLP Analyzer for Enhanced Green Software Pattern Detection

Implements Natural Language Processing techniques from Soliman et al. (2017):
- Morphological analysis (stemming, lemmatization)
- Semantic matching (word embeddings, synonyms)
- Multi-term phrase matching

Addresses a limitation of plain keyword matching, which misses variants such as:
- optimize → optimizing, optimized, optimization
- cache → caching, cached, caches
- efficient → efficiency, efficiently
"""

from __future__ import annotations

import re
from dataclasses import dataclass
from typing import Dict, List, Set, Tuple


@dataclass
class NLPMatch:
    """Represents an NLP-enhanced pattern match"""

    original_term: str
    matched_variant: str
    position: int
    context: str
    match_type: str  # 'exact', 'stemmed', 'semantic', or 'phrase'


class NLPAnalyzer:
    """
    Enhanced pattern detection using NLP techniques.

    Implements:
    1. Stemming/Lemmatization (catch morphological variants)
    2. Synonym expansion (semantic matching)
    3. Phrase pattern matching (multi-word expressions)

    Based on Soliman et al. findings: 26/151 studies used NLP techniques
    """

    def __init__(self, enable_stemming: bool = True, enable_synonyms: bool = True):
        """
        Initialize NLP analyzer.

        Args:
            enable_stemming: Enable morphological analysis
            enable_synonyms: Enable semantic synonym matching
        """
        self.enable_stemming = enable_stemming
        self.enable_synonyms = enable_synonyms

        # Simple stemming rules (Porter-like)
        self._stemming_rules = {
            "optimization": "optim",
            "optimizing": "optim",
            "optimized": "optim",
            "optimize": "optim",
            "caching": "cache",
            "cached": "cache",
            "caches": "cache",
            "efficient": "effic",
            "efficiency": "effic",
            "efficiently": "effic",
            "compressed": "compress",
            "compressing": "compress",
            "compression": "compress",
            "scaling": "scale",
            "scaled": "scale",
            "scalable": "scale",
            "scalability": "scale",
            "monitoring": "monitor",
            "monitored": "monitor",
            "profiling": "profil",
            "profiled": "profil",
            "recycling": "recycl",
            "recycled": "recycl",
            "reducing": "reduc",
            "reduced": "reduc",
            "reduction": "reduc",
            "minimizing": "minim",
            "minimized": "minim",
            "minimization": "minim",
            "containerized": "container",
            "containerization": "container",
            "containerizing": "container",
        }

        # Semantic synonyms for green concepts
        self._synonyms = {
            "cache": ["buffer", "memoize", "store", "retain"],
            "optimize": ["improve", "enhance", "tune", "refine", "streamline"],
            "efficient": ["performant", "fast", "quick", "lean", "lightweight"],
            "reduce": ["decrease", "minimize", "lower", "cut", "shrink", "lessen"],
            "compress": ["compact", "shrink", "minify", "pack"],
            "monitor": ["track", "measure", "observe", "watch", "profile"],
            "scale": ["grow", "expand", "adapt", "resize"],
            "green": ["sustainable", "eco-friendly", "carbon-aware", "energy-efficient"],
            "power": ["energy", "electricity", "consumption", "usage"],
            "resource": ["memory", "cpu", "compute", "capacity"],
        }

        # Multi-word phrases (higher precision)
        self._phrase_patterns = [
            r"reduce\s+(memory|cpu|power|energy|resource)",
            r"optimize\s+(performance|efficiency|resource|memory)",
            r"improve\s+(efficiency|performance|throughput)",
            r"lazy\s+load(ing)?",
            r"connection\s+pool(ing)?",
            r"batch\s+process(ing)?",
            r"data\s+compress(ion)?",
            r"auto\s*scal(ing|e)",
            r"load\s+balanc(ing|er)",
            r"circuit\s+breaker",
            r"rate\s+limit(ing|er)",
            r"(horizontal|vertical)\s+scal(ing|e)",
            r"serverless\s+(function|architecture)",
            r"container\s+orchestration",
            r"micro\s*service",
            r"event\s+driven",
            r"reactive\s+(programming|stream)",
            r"asynchronous\s+process(ing)?",
            r"parallel\s+process(ing)?",
            r"distributed\s+(cache|caching)",
            r"in-memory\s+(cache|database)",
            r"edge\s+computing",
            r"cdn\s+(cache|integration)",
            r"database\s+(index|indexing)",
            r"query\s+(optimization|cache)",
            r"api\s+(rate|throttl)",
            r"graceful\s+degrad(ation|e)",
            r"back\s*pressure",
            r"bulkhead\s+pattern",
        ]

    def stem_word(self, word: str) -> str:
        """
        Apply simple stemming to word.

        Args:
            word: Input word (lowercase)

        Returns:
            Stemmed form of word
        """
        word_lower = word.lower()

        # Use predefined stems
        if word_lower in self._stemming_rules:
            return self._stemming_rules[word_lower]

        # Simple suffix removal
        for suffix in ["ing", "ed", "es", "s", "tion", "ation", "ment", "ity", "ly", "er"]:
            if word_lower.endswith(suffix) and len(word_lower) > len(suffix) + 2:
                return word_lower[: -len(suffix)]

        return word_lower

    def get_synonyms(self, word: str) -> Set[str]:
        """
        Get semantic synonyms for word.

        Args:
            word: Input word (lowercase)

        Returns:
            Set of synonyms including original word
        """
        word_lower = word.lower()
        synonyms = {word_lower}

        if word_lower in self._synonyms:
            synonyms.update(self._synonyms[word_lower])

        return synonyms

    def find_morphological_matches(self, text: str, base_keywords: List[str]) -> List[NLPMatch]:
        """
        Find keyword matches including morphological variants.

        Args:
            text: Text to search (commit message or code)
            base_keywords: List of base keywords to match

        Returns:
            List of NLPMatch objects with stemmed matches
        """
        if not self.enable_stemming:
            return []

        matches = []
        text_lower = text.lower()

        # Build stem lookup
        stemmed_keywords = {self.stem_word(kw): kw for kw in base_keywords}

        for word_match in re.finditer(r"\b\w+\b", text_lower):
            word = word_match.group(0)
            stemmed_word = self.stem_word(word)
            if stemmed_word in stemmed_keywords:
                original_kw = stemmed_keywords[stemmed_word]

                # Position of this occurrence (lower-casing preserves offsets)
                position = word_match.start()

                # Extract context (10 chars before and after)
                context_start = max(0, position - 10)
                context_end = min(len(text), position + len(word) + 10)
                context = text[context_start:context_end]

                matches.append(
                    NLPMatch(
                        original_term=original_kw,
                        matched_variant=word,
                        position=position,
                        context=context,
                        match_type="stemmed",
                    )
                )

        return matches

    def find_semantic_matches(self, text: str, base_keywords: List[str]) -> List[NLPMatch]:
        """
        Find keyword matches including semantic synonyms.

        Args:
            text: Text to search
            base_keywords: List of base keywords

        Returns:
            List of NLPMatch objects with semantic matches
        """
        if not self.enable_synonyms:
            return []

        matches = []
        text_lower = text.lower()

        for keyword in base_keywords:
            synonyms = self.get_synonyms(keyword)

            for synonym in synonyms:
                if synonym == keyword:  # Skip exact match (already covered)
                    continue

                # Find all occurrences
                pattern = r"\b" + re.escape(synonym) + r"\b"
                for match in re.finditer(pattern, text_lower, re.IGNORECASE):
                    position = match.start()
                    context_start = max(0, position - 10)
                    context_end = min(len(text), position + len(synonym) + 10)
                    context = text[context_start:context_end]

                    matches.append(
                        NLPMatch(
                            original_term=keyword,
                            matched_variant=synonym,
                            position=position,
                            context=context,
                            match_type="semantic",
                        )
                    )

        return matches

    def find_phrase_patterns(self, text: str) -> List[NLPMatch]:
        """
        Find multi-word phrase patterns indicating green practices.

        Args:
            text: Text to search

        Returns:
            List of NLPMatch objects with phrase matches
        """
        matches = []
        text_lower = text.lower()

        for pattern in self._phrase_patterns:
            for match in re.finditer(pattern, text_lower, re.IGNORECASE):
                matched_phrase = match.group(0)
                position = match.start()
                context_start = max(0, position - 10)
                context_end = min(len(text), position + len(matched_phrase) + 10)
                context = text[context_start:context_end]

                matches.append(
                    NLPMatch(
                        original_term="phrase_pattern",
                        matched_variant=matched_phrase,
                        position=position,
                        context=context,
                        match_type="phrase",
                    )
                )

        return matches

    def analyze_text(self, text: str, base_keywords: List[str]) -> Dict:
        """
        Comprehensive NLP analysis of text.

        Args:
            text: Text to analyze (commit message or code)
            base_keywords: Base keywords from GSF patterns

        Returns:
            Dictionary with:
            - morphological_matches: List of stemmed matches
            - semantic_matches: List of synonym matches
            - phrase_matches: List of phrase pattern matches
            - total_nlp_matches: Total number of NLP matches
            - match_density: Matches per 100 words
            - word_count: Number of words in the text
        """
        morphological = self.find_morphological_matches(text, base_keywords)
        semantic = self.find_semantic_matches(text, base_keywords)
        phrases = self.find_phrase_patterns(text)

        # Calculate metrics
        word_count = len(re.findall(r"\b\w+\b", text))
        total_matches = len(morphological) + len(semantic) + len(phrases)
        match_density = (total_matches / word_count * 100) if word_count > 0 else 0

        return {
            "morphological_matches": morphological,
            "semantic_matches": semantic,
            "phrase_matches": phrases,
            "total_nlp_matches": total_matches,
            "match_density": round(match_density, 2),
            "word_count": word_count,
        }

    def enhance_pattern_detection(
        self, text: str, original_keywords: List[str]
    ) -> Tuple[bool, List[str]]:
        """
        Enhance original keyword detection with NLP techniques.

        Args:
            text: Text to analyze
            original_keywords: Keywords that were already detected

        Returns:
            Tuple of (has_additional_matches, additional_matched_terms)
        """
        analysis = self.analyze_text(text, original_keywords)

        additional_terms = []

        # Collect additional matched terms
        for match in analysis["morphological_matches"]:
            if match.matched_variant not in original_keywords:
                additional_terms.append(f"{match.matched_variant} (stem: {match.original_term})")

        for match in analysis["semantic_matches"]:
            additional_terms.append(f"{match.matched_variant} (synonym: {match.original_term})")

        for match in analysis["phrase_matches"]:
            additional_terms.append(f"'{match.matched_variant}' (phrase)")

        return len(additional_terms) > 0, additional_terms
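
# Typical usage sketch (illustrative; "commit_message" stands for any text to scan):
#
#     analyzer = NLPAnalyzer()
#     report = analyzer.analyze_text(commit_message, ["cache", "optimize", "reduce"])
#     found_more, extra_terms = analyzer.enhance_pattern_detection(
#         commit_message, ["cache", "optimize", "reduce"]
#     )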