greenmining 0.1.12__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,512 @@
1
+ """
2
+ Machine Learning Feature Extraction for Green Pattern Classification
3
+
4
+ Prepares data for ML-based pattern detection (Soliman et al.: 26/151 studies used ML).
5
+
6
+ Features extracted:
7
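+ 1. Text features: message length, word statistics, green-keyword count and density (TF-IDF, word embeddings and n-grams are not computed in this module)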
8
+ 2. Code metrics: file counts, churn (lines added/deleted), change entropy (no complexity metric is computed in this module)
9
+ 3. Temporal features: time of day, day of week, commit velocity
10
+ 4. Repository features: stars, age in days, language (contributor count is not extracted in this module)
11
+ 5. Historical features: past green awareness, pattern history
12
+
13
+ Use case: Train ML classifier as complement to keyword matching
14
+ Goal: Higher recall while maintaining precision (De Martino 2025: 97.91% accuracy)
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import re
20
+ from typing import Dict, List, Optional, Tuple
21
+ from dataclasses import dataclass
22
+ from datetime import datetime
23
+ from collections import Counter
24
+ import math
25
+
26
+
27
@dataclass
class MLFeatures:
    """
    Flat feature vector describing one commit for ML classification.

    Fields are grouped (in declaration order) as text, code-change, temporal,
    repository and historical features, followed by the optional training
    label. The declaration order is also the positional constructor order.
    """

    # --- Text features (derived from the commit message) ---
    message_length: int        # number of characters in the message
    word_count: int            # number of word tokens
    unique_word_ratio: float   # distinct words / total words
    avg_word_length: float     # mean characters per word
    has_green_keywords: bool   # True when at least one keyword matched
    keyword_count: int         # number of keyword matches
    keyword_density: float     # keyword matches / total words

    # --- Code-change metrics ---
    files_changed: int
    lines_added: int
    lines_deleted: int
    total_changes: int         # lines_added + lines_deleted
    change_entropy: float      # how evenly changes are spread across files

    # --- Temporal features ---
    hour_of_day: int
    day_of_week: int
    is_weekend: bool
    commit_velocity: float     # recent commit frequency

    # --- Repository features ---
    repo_stars: int
    repo_age_days: int
    primary_language: str

    # --- Historical features ---
    author_green_rate: float   # author's historical green awareness
    repo_green_rate: float     # repository's green awareness trend

    # --- Target label; stays None when no ground truth is supplied ---
    is_green_aware: Optional[bool] = None
64
+
65
+
66
class MLFeatureExtractor:
    """
    Extract features from commits for ML classification.

    Implements feature engineering based on:
    - Soliman et al. (2017): 26/151 studies used ML
    - De Martino et al. (2025): 97.91% accuracy with ML classifier

    Features are designed to be:
    1. Informative (capture green patterns)
    2. Discriminative (separate green from non-green)
    3. Robust (work across repositories/languages)

    Commits, repositories and analysis results are plain dictionaries; each
    method documents the keys it reads. Missing or ``None`` values fall back
    to neutral defaults instead of raising.
    """

    # Column order written by export_to_csv (same order as the MLFeatures fields).
    _CSV_FIELDS = (
        "message_length",
        "word_count",
        "unique_word_ratio",
        "avg_word_length",
        "has_green_keywords",
        "keyword_count",
        "keyword_density",
        "files_changed",
        "lines_added",
        "lines_deleted",
        "total_changes",
        "change_entropy",
        "hour_of_day",
        "day_of_week",
        "is_weekend",
        "commit_velocity",
        "repo_stars",
        "repo_age_days",
        "primary_language",
        "author_green_rate",
        "repo_green_rate",
        "is_green_aware",
    )

    # Length of the commit-velocity window.
    _VELOCITY_WINDOW_DAYS = 7
    _SECONDS_PER_DAY = 24 * 60 * 60

    # Age reported when a repository has no usable creation date.
    _DEFAULT_REPO_AGE_DAYS = 365

    def __init__(self, green_keywords: Optional[List[str]] = None):
        """
        Initialize feature extractor.

        Args:
            green_keywords: List of green-related keywords for text features.
                When omitted (or empty) the built-in default list is used.
        """
        self.green_keywords = green_keywords or self._default_keywords()
        self._author_history = {}
        self._repo_history = {}

    def _default_keywords(self) -> List[str]:
        """Default green keywords for feature extraction."""
        return [
            "cache",
            "optimize",
            "performance",
            "efficient",
            "reduce",
            "compress",
            "lazy",
            "async",
            "parallel",
            "batch",
            "pool",
            "scale",
            "memory",
            "cpu",
            "resource",
            "green",
            "sustainable",
            "energy",
            "power",
        ]

    def extract_text_features(self, text: str) -> Dict:
        """
        Extract text-based features from commit message.

        Words are the ``\\w+`` tokens of the lower-cased message. Keywords are
        lower-cased as well before matching, so matching is case-insensitive
        on both sides. Matching is per whole token (no stemming).

        Args:
            text: Commit message text (``None`` is treated as an empty message)

        Returns:
            Dictionary with text features
        """
        text = text or ""
        words = re.findall(r"\b\w+\b", text.lower())
        total = len(words)

        # Build the lookup set once per call: O(1) membership per word, and
        # keyword case no longer prevents a match against lower-cased words.
        keywords = {str(k).lower() for k in self.green_keywords}
        match_count = sum(1 for w in words if w in keywords)

        return {
            "message_length": len(text),
            "word_count": total,
            "unique_word_ratio": len(set(words)) / total if total else 0.0,
            "avg_word_length": sum(len(w) for w in words) / total if total else 0.0,
            "has_green_keywords": match_count > 0,
            "keyword_count": match_count,
            "keyword_density": match_count / total if total else 0.0,
        }

    def extract_code_metrics(self, commit: Dict) -> Dict:
        """
        Extract code change metrics.

        Reads ``commit["files"]``, a list of dictionaries with optional
        ``additions`` and ``deletions`` counts. A missing or ``None`` list or
        count is treated as empty / zero.

        Args:
            commit: Commit dictionary with file changes

        Returns:
            Dictionary with code metrics; ``change_entropy`` is the Shannon
            entropy (base 2) of the per-file share of changed lines, rounded
            to 4 decimals, and 0 when fewer than two files changed.
        """
        files = commit.get("files") or []

        added_per_file = [f.get("additions") or 0 for f in files]
        deleted_per_file = [f.get("deletions") or 0 for f in files]

        lines_added = sum(added_per_file)
        lines_deleted = sum(deleted_per_file)
        total_changes = lines_added + lines_deleted

        entropy = 0.0
        if len(files) > 1 and total_changes > 0:
            per_file = [a + d for a, d in zip(added_per_file, deleted_per_file)]
            # Files with no changes contribute nothing (p * log2(p) -> 0).
            # Summing the negated terms avoids a "-0.0" result when a single
            # file holds every change.
            entropy = sum(
                -(c / total_changes) * math.log2(c / total_changes)
                for c in per_file
                if c > 0
            )

        return {
            "files_changed": len(files),
            "lines_added": lines_added,
            "lines_deleted": lines_deleted,
            "total_changes": total_changes,
            "change_entropy": round(entropy, 4),
        }

    def extract_temporal_features(self, commit: Dict, repo_commits: List[Dict]) -> Dict:
        """
        Extract temporal features.

        Commit velocity is the number of commits dated within the 7 days up
        to and including the current commit's date, divided by 7. Commits
        dated after the current commit, and commits without a parseable date,
        are not counted.

        Args:
            commit: Current commit (reads ``commit["date"]``: ISO-8601 string
                or ``datetime``)
            repo_commits: All commits in repository (for velocity calculation)

        Returns:
            Dictionary with temporal features. When the current commit has no
            parseable date, neutral defaults are returned (hour 12, weekday 3,
            not weekend, velocity 0).
        """
        date = self._try_parse_date(commit.get("date"))
        if date is None:
            return {
                "hour_of_day": 12,
                "day_of_week": 3,
                "is_weekend": False,
                "commit_velocity": 0,
            }

        window_end = date.timestamp()
        window_start = window_end - self._VELOCITY_WINDOW_DAYS * self._SECONDS_PER_DAY

        recent = 0
        for other in repo_commits or []:
            other_date = self._try_parse_date(other.get("date"))
            if other_date is None:
                # Undated commits cannot be placed in the window.
                continue
            if window_start < other_date.timestamp() <= window_end:
                recent += 1

        velocity = recent / self._VELOCITY_WINDOW_DAYS  # commits per day
        weekday = date.weekday()

        return {
            "hour_of_day": date.hour,
            "day_of_week": weekday,
            "is_weekend": weekday >= 5,
            "commit_velocity": round(velocity, 2),
        }

    def extract_repository_features(self, repository: Dict) -> Dict:
        """
        Extract repository-level features.

        Reads ``created_at`` (ISO-8601 string or ``datetime``), ``stars`` and
        ``language``. Missing, ``None`` or unparseable values fall back to
        365 days, 0 stars and ``"Unknown"`` respectively.

        Args:
            repository: Repository metadata

        Returns:
            Dictionary with repository features
        """
        created_date = self._try_parse_date(repository.get("created_at"))

        if created_date is None:
            age_days = self._DEFAULT_REPO_AGE_DAYS
        else:
            # Use the creation date's own tzinfo so aware and naive datetimes
            # are never mixed in the subtraction.
            now = datetime.now(created_date.tzinfo)
            age_days = (now - created_date).days

        return {
            "repo_stars": repository.get("stars") or 0,
            "repo_age_days": age_days,
            "primary_language": repository.get("language") or "Unknown",
        }

    def extract_historical_features(
        self,
        commit: Dict,
        author_commits: List[Dict],
        repo_commits: List[Dict],
        analysis_results: List[Dict],
    ) -> Dict:
        """
        Extract historical features (past green awareness).

        Each rate is the number of green-aware analysis results for the
        commit's author (or repository) divided by the number of commits in
        the supplied list with the same ``author_email`` (or ``repository``).
        The current commit is not excluded from either count.

        Args:
            commit: Current commit
            author_commits: All commits by this author
            repo_commits: All commits in repository
            analysis_results: Pattern detection results

        Returns:
            Dictionary with historical features (rates rounded to 4 decimals,
            0 when there are no matching commits)
        """

        def green_rate(key: str, commits: List[Dict]) -> float:
            # Green-aware results sharing the commit's value for `key`,
            # relative to the commits sharing that value.
            value = commit.get(key)
            green = sum(
                1
                for r in analysis_results
                if r.get(key) == value and r.get("is_green_aware", False)
            )
            total = sum(1 for c in commits if c.get(key) == value)
            return green / total if total > 0 else 0

        return {
            "author_green_rate": round(green_rate("author_email", author_commits), 4),
            "repo_green_rate": round(green_rate("repository", repo_commits), 4),
        }

    def extract_features(
        self,
        commit: Dict,
        repository: Dict,
        all_commits: List[Dict],
        analysis_results: List[Dict],
        ground_truth: Optional[bool] = None,
    ) -> MLFeatures:
        """
        Extract complete feature vector for a commit.

        Args:
            commit: Commit to extract features from
            repository: Repository metadata
            all_commits: All commits (for temporal/historical features)
            analysis_results: Pattern detection results (for historical features)
            ground_truth: Optional true label for supervised learning

        Returns:
            MLFeatures object
        """
        message = commit.get("message") or ""

        # Each group returns a dict keyed by the matching MLFeatures field
        # names, so the groups can be merged straight into the constructor.
        combined = {}
        combined.update(self.extract_text_features(message))
        combined.update(self.extract_code_metrics(commit))
        combined.update(self.extract_temporal_features(commit, all_commits))
        combined.update(self.extract_repository_features(repository))
        # The historical extractor filters by author/repository itself, so the
        # full commit list serves as both author and repository history.
        combined.update(
            self.extract_historical_features(commit, all_commits, all_commits, analysis_results)
        )

        return MLFeatures(is_green_aware=ground_truth, **combined)

    def extract_features_batch(
        self,
        commits: List[Dict],
        repositories: List[Dict],
        analysis_results: List[Dict],
        ground_truth: Optional[List[bool]] = None,
    ) -> List[MLFeatures]:
        """
        Extract features for multiple commits.

        Every commit is evaluated against the full ``commits`` list, so the
        cost grows quadratically with the number of commits.

        Args:
            commits: List of commits
            repositories: List of repository metadata (matched to commits by
                ``repository["name"] == commit["repository"]``; entries
                without a name are ignored)
            analysis_results: Pattern detection results
            ground_truth: Optional list of true labels, aligned with
                ``commits`` by index; commits beyond its length get no label

        Returns:
            List of MLFeatures objects
        """
        # Build repository lookup
        repo_lookup = {
            r.get("name"): r for r in repositories if r.get("name") is not None
        }
        labels = ground_truth or []

        features = []
        for i, commit in enumerate(commits):
            repository = repo_lookup.get(commit.get("repository"), {})
            label = labels[i] if i < len(labels) else None
            features.append(
                self.extract_features(commit, repository, commits, analysis_results, label)
            )

        return features

    def export_to_csv(self, features: List[MLFeatures], output_path: str) -> None:
        """
        Export features to CSV for ML training.

        Boolean features are written as 0/1; a missing label is written as an
        empty cell. The file is written as UTF-8.

        Args:
            features: List of MLFeatures
            output_path: Path to output CSV file
        """
        import csv

        with open(output_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=list(self._CSV_FIELDS))
            writer.writeheader()

            for feature in features:
                row = {name: getattr(feature, name) for name in self._CSV_FIELDS}
                row["has_green_keywords"] = int(feature.has_green_keywords)
                row["is_weekend"] = int(feature.is_weekend)
                row["is_green_aware"] = (
                    int(feature.is_green_aware) if feature.is_green_aware is not None else ""
                )
                writer.writerow(row)

    def get_feature_importance_guide(self) -> Dict:
        """
        Guide for interpreting feature importance in ML models.

        Returns:
            Dictionary describing each feature and expected importance
        """
        return {
            "text_features": {
                "keyword_density": "HIGH - Direct indicator of green awareness",
                "keyword_count": "HIGH - Number of green terms",
                "has_green_keywords": "MEDIUM - Binary presence indicator",
                "message_length": "LOW - General text characteristic",
                "unique_word_ratio": "LOW - Vocabulary diversity",
            },
            "code_features": {
                "total_changes": "MEDIUM - Refactoring indicator",
                "change_entropy": "LOW - Change distribution",
                "files_changed": "LOW - Scope of change",
            },
            "temporal_features": {
                "commit_velocity": "LOW - Development pace",
                "hour_of_day": "VERY_LOW - Time of commit",
                "day_of_week": "VERY_LOW - Day of commit",
            },
            "repository_features": {
                "repo_stars": "LOW - Project popularity",
                "repo_age_days": "LOW - Project maturity",
                "primary_language": "MEDIUM - Language-specific patterns",
            },
            "historical_features": {
                "author_green_rate": "HIGH - Author green awareness history",
                "repo_green_rate": "HIGH - Repository green culture",
            },
        }

    @staticmethod
    def _try_parse_date(value) -> Optional[datetime]:
        """
        Parse an ISO-8601 string (or pass through a datetime).

        Returns:
            The parsed datetime, or None when the value is empty, of an
            unsupported type, or not a valid ISO-8601 date.
        """
        if not value:
            return None
        if isinstance(value, datetime):
            return value
        try:
            # A trailing "Z" is rewritten as an explicit UTC offset because
            # fromisoformat does not accept it on older Python versions.
            return datetime.fromisoformat(value.replace("Z", "+00:00"))
        except (ValueError, AttributeError, TypeError):
            return None

    def _parse_date(self, date_str: Optional[str]) -> datetime:
        """Parse date string to datetime, falling back to the current time."""
        parsed = self._try_parse_date(date_str)
        if parsed is None:
            return datetime.now()
        return parsed