greenmining-1.0.3-py3-none-any.whl → greenmining-1.0.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- greenmining/__init__.py +11 -29
- greenmining/__main__.py +9 -3
- greenmining/__version__.py +2 -2
- greenmining/analyzers/__init__.py +3 -7
- greenmining/analyzers/code_diff_analyzer.py +151 -61
- greenmining/analyzers/qualitative_analyzer.py +15 -81
- greenmining/analyzers/statistical_analyzer.py +8 -69
- greenmining/analyzers/temporal_analyzer.py +16 -72
- greenmining/config.py +105 -58
- greenmining/controllers/__init__.py +1 -5
- greenmining/controllers/repository_controller.py +153 -94
- greenmining/energy/__init__.py +13 -0
- greenmining/energy/base.py +165 -0
- greenmining/energy/codecarbon_meter.py +146 -0
- greenmining/energy/rapl.py +157 -0
- greenmining/gsf_patterns.py +4 -26
- greenmining/models/__init__.py +1 -5
- greenmining/models/aggregated_stats.py +4 -4
- greenmining/models/analysis_result.py +4 -4
- greenmining/models/commit.py +5 -5
- greenmining/models/repository.py +5 -5
- greenmining/presenters/__init__.py +1 -5
- greenmining/presenters/console_presenter.py +24 -24
- greenmining/services/__init__.py +10 -6
- greenmining/services/commit_extractor.py +8 -152
- greenmining/services/data_aggregator.py +45 -175
- greenmining/services/data_analyzer.py +9 -202
- greenmining/services/github_fetcher.py +210 -323
- greenmining/services/github_graphql_fetcher.py +361 -0
- greenmining/services/local_repo_analyzer.py +387 -0
- greenmining/services/reports.py +33 -137
- greenmining/utils.py +21 -149
- {greenmining-1.0.3.dist-info → greenmining-1.0.5.dist-info}/METADATA +69 -173
- greenmining-1.0.5.dist-info/RECORD +37 -0
- {greenmining-1.0.3.dist-info → greenmining-1.0.5.dist-info}/WHEEL +1 -1
- greenmining/analyzers/ml_feature_extractor.py +0 -512
- greenmining/analyzers/nlp_analyzer.py +0 -365
- greenmining/cli.py +0 -471
- greenmining/main.py +0 -37
- greenmining-1.0.3.dist-info/RECORD +0 -36
- greenmining-1.0.3.dist-info/entry_points.txt +0 -2
- {greenmining-1.0.3.dist-info → greenmining-1.0.5.dist-info}/licenses/LICENSE +0 -0
- {greenmining-1.0.3.dist-info → greenmining-1.0.5.dist-info}/top_level.txt +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
|
|
1
|
+
# Data analyzer for green microservices commits using GSF patterns.
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
@@ -8,13 +8,10 @@ from collections import Counter
|
|
|
8
8
|
from pathlib import Path
|
|
9
9
|
from typing import Any, Dict, List, Optional, Tuple
|
|
10
10
|
|
|
11
|
-
import click
|
|
12
11
|
from tqdm import tqdm
|
|
13
12
|
|
|
14
13
|
from greenmining.analyzers import (
|
|
15
14
|
CodeDiffAnalyzer,
|
|
16
|
-
NLPAnalyzer,
|
|
17
|
-
MLFeatureExtractor,
|
|
18
15
|
)
|
|
19
16
|
from greenmining.config import get_config
|
|
20
17
|
from greenmining.gsf_patterns import (
|
|
@@ -35,30 +32,19 @@ from greenmining.utils import (
|
|
|
35
32
|
|
|
36
33
|
|
|
37
34
|
class DataAnalyzer:
|
|
38
|
-
|
|
35
|
+
# Analyzes commits for green software patterns using GSF (Green Software Foundation) patterns.
|
|
39
36
|
|
|
40
37
|
def __init__(
|
|
41
38
|
self,
|
|
42
39
|
batch_size: int = 10,
|
|
43
40
|
enable_diff_analysis: bool = False,
|
|
44
|
-
enable_nlp: bool = False,
|
|
45
|
-
enable_ml_features: bool = False,
|
|
46
41
|
):
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
Args:
|
|
50
|
-
batch_size: Number of commits to process in each batch
|
|
51
|
-
enable_diff_analysis: Enable code diff analysis (slower but more accurate)
|
|
52
|
-
enable_nlp: Enable NLP-enhanced pattern detection
|
|
53
|
-
enable_ml_features: Enable ML feature extraction
|
|
54
|
-
"""
|
|
42
|
+
# Initialize analyzer with GSF patterns.
|
|
55
43
|
# Use GSF patterns from gsf_patterns.py
|
|
56
44
|
self.gsf_patterns = GSF_PATTERNS
|
|
57
45
|
self.green_keywords = GREEN_KEYWORDS
|
|
58
46
|
self.batch_size = batch_size
|
|
59
47
|
self.enable_diff_analysis = enable_diff_analysis
|
|
60
|
-
self.enable_nlp = enable_nlp
|
|
61
|
-
self.enable_ml_features = enable_ml_features
|
|
62
48
|
|
|
63
49
|
# Initialize code diff analyzer if enabled
|
|
64
50
|
if self.enable_diff_analysis:
|
|
@@ -67,32 +53,10 @@ class DataAnalyzer:
|
|
|
67
53
|
else:
|
|
68
54
|
self.diff_analyzer = None
|
|
69
55
|
|
|
70
|
-
# Initialize NLP analyzer if enabled
|
|
71
|
-
if self.enable_nlp:
|
|
72
|
-
self.nlp_analyzer = NLPAnalyzer(enable_stemming=True, enable_synonyms=True)
|
|
73
|
-
colored_print("NLP analysis enabled (morphological variants + synonyms)", "cyan")
|
|
74
|
-
else:
|
|
75
|
-
self.nlp_analyzer = None
|
|
76
|
-
|
|
77
|
-
# Initialize ML feature extractor if enabled
|
|
78
|
-
if self.enable_ml_features:
|
|
79
|
-
self.ml_extractor = MLFeatureExtractor(green_keywords=list(GREEN_KEYWORDS))
|
|
80
|
-
colored_print("ML feature extraction enabled", "cyan")
|
|
81
|
-
else:
|
|
82
|
-
self.ml_extractor = None
|
|
83
|
-
|
|
84
56
|
def analyze_commits(
|
|
85
57
|
self, commits: list[dict[str, Any]], resume_from: int = 0
|
|
86
58
|
) -> list[dict[str, Any]]:
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
Args:
|
|
90
|
-
commits: List of commit dictionaries
|
|
91
|
-
resume_from: Index to resume from
|
|
92
|
-
|
|
93
|
-
Returns:
|
|
94
|
-
List of analysis results
|
|
95
|
-
"""
|
|
59
|
+
# Analyze commits for green software practices.
|
|
96
60
|
results = []
|
|
97
61
|
|
|
98
62
|
colored_print(f"\nAnalyzing {len(commits)} commits for green practices...", "cyan")
|
|
@@ -115,14 +79,7 @@ class DataAnalyzer:
|
|
|
115
79
|
return results
|
|
116
80
|
|
|
117
81
|
def _analyze_commit(self, commit: dict[str, Any]) -> dict[str, Any]:
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
Args:
|
|
121
|
-
commit: Commit dictionary
|
|
122
|
-
|
|
123
|
-
Returns:
|
|
124
|
-
Analysis result with GSF pattern matching
|
|
125
|
-
"""
|
|
82
|
+
# Analyze a single commit using GSF patterns.
|
|
126
83
|
message = commit.get("message", "")
|
|
127
84
|
|
|
128
85
|
# Q1: GREEN AWARENESS - Check using GSF keywords
|
|
@@ -131,20 +88,6 @@ class DataAnalyzer:
|
|
|
131
88
|
# Q2: KNOWN GSF PATTERNS - Match against Green Software Foundation patterns
|
|
132
89
|
matched_patterns = get_pattern_by_keywords(message)
|
|
133
90
|
|
|
134
|
-
# Enhanced NLP analysis (if enabled)
|
|
135
|
-
nlp_results = None
|
|
136
|
-
if self.nlp_analyzer:
|
|
137
|
-
nlp_results = self.nlp_analyzer.analyze_text(message, list(self.green_keywords))
|
|
138
|
-
|
|
139
|
-
# Check if NLP found additional matches not caught by keyword matching
|
|
140
|
-
has_nlp_matches, additional_terms = self.nlp_analyzer.enhance_pattern_detection(
|
|
141
|
-
message, matched_patterns
|
|
142
|
-
)
|
|
143
|
-
|
|
144
|
-
if has_nlp_matches:
|
|
145
|
-
# NLP enhancement found additional evidence
|
|
146
|
-
green_aware = True
|
|
147
|
-
|
|
148
91
|
# Q3: CODE DIFF ANALYSIS (if enabled and diff data available)
|
|
149
92
|
diff_analysis = None
|
|
150
93
|
if self.diff_analyzer and commit.get("diff_data"):
|
|
@@ -211,38 +154,10 @@ class DataAnalyzer:
|
|
|
211
154
|
if diff_analysis:
|
|
212
155
|
result["diff_analysis"] = diff_analysis
|
|
213
156
|
|
|
214
|
-
# Add NLP analysis results if available
|
|
215
|
-
if nlp_results:
|
|
216
|
-
result["nlp_analysis"] = {
|
|
217
|
-
"total_matches": nlp_results["total_nlp_matches"],
|
|
218
|
-
"match_density": nlp_results["match_density"],
|
|
219
|
-
"morphological_count": len(nlp_results["morphological_matches"]),
|
|
220
|
-
"semantic_count": len(nlp_results["semantic_matches"]),
|
|
221
|
-
"phrase_count": len(nlp_results["phrase_matches"]),
|
|
222
|
-
}
|
|
223
|
-
|
|
224
|
-
# Add ML features if enabled
|
|
225
|
-
if self.enable_ml_features and self.ml_extractor:
|
|
226
|
-
# Note: Full feature extraction requires repository context
|
|
227
|
-
# For now, extract basic text features
|
|
228
|
-
text_features = self.ml_extractor.extract_text_features(message)
|
|
229
|
-
result["ml_features"] = {
|
|
230
|
-
"text": text_features,
|
|
231
|
-
"note": "Full ML features require repository and historical context",
|
|
232
|
-
}
|
|
233
|
-
|
|
234
157
|
return result
|
|
235
158
|
|
|
236
159
|
def _check_green_awareness(self, message: str, files: list[str]) -> tuple[bool, Optional[str]]:
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
Args:
|
|
240
|
-
message: Commit message (lowercase)
|
|
241
|
-
files: List of changed files (lowercase)
|
|
242
|
-
|
|
243
|
-
Returns:
|
|
244
|
-
Tuple of (is_green_aware, evidence_text)
|
|
245
|
-
"""
|
|
160
|
+
# Check if commit explicitly mentions green/energy concerns.
|
|
246
161
|
# Check message for green keywords
|
|
247
162
|
for keyword in self.GREEN_KEYWORDS:
|
|
248
163
|
if keyword in message:
|
|
@@ -265,15 +180,7 @@ class DataAnalyzer:
|
|
|
265
180
|
return False, None
|
|
266
181
|
|
|
267
182
|
def _detect_known_pattern(self, message: str, files: list[str]) -> tuple[Optional[str], str]:
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
Args:
|
|
271
|
-
message: Commit message (lowercase)
|
|
272
|
-
files: List of changed files (lowercase)
|
|
273
|
-
|
|
274
|
-
Returns:
|
|
275
|
-
Tuple of (pattern_name, confidence_level)
|
|
276
|
-
"""
|
|
183
|
+
# Detect known green software pattern.
|
|
277
184
|
matches = []
|
|
278
185
|
|
|
279
186
|
# Check each pattern
|
|
@@ -299,12 +206,7 @@ class DataAnalyzer:
|
|
|
299
206
|
return matches[0][0], matches[0][1]
|
|
300
207
|
|
|
301
208
|
def save_results(self, results: list[dict[str, Any]], output_file: Path):
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
Args:
|
|
305
|
-
results: List of analysis results
|
|
306
|
-
output_file: Output file path
|
|
307
|
-
"""
|
|
209
|
+
# Save analysis results to JSON file.
|
|
308
210
|
# Calculate summary statistics
|
|
309
211
|
green_aware_count = sum(1 for r in results if r["green_aware"])
|
|
310
212
|
|
|
@@ -335,7 +237,7 @@ class DataAnalyzer:
|
|
|
335
237
|
colored_print(f"Saved analysis for {len(results)} commits to {output_file}", "green")
|
|
336
238
|
|
|
337
239
|
# Display summary
|
|
338
|
-
colored_print("\n
|
|
240
|
+
colored_print("\n Analysis Summary:", "cyan")
|
|
339
241
|
colored_print(
|
|
340
242
|
f" Green-aware commits: {green_aware_count} ({data['metadata']['green_aware_percentage']}%)",
|
|
341
243
|
"white",
|
|
@@ -344,98 +246,3 @@ class DataAnalyzer:
|
|
|
344
246
|
colored_print("\n Top patterns detected:", "cyan")
|
|
345
247
|
for pattern, count in pattern_counts.most_common(5):
|
|
346
248
|
colored_print(f" - {pattern}: {count}", "white")
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
@click.command()
|
|
350
|
-
@click.option("--batch-size", default=10, help="Batch size for processing")
|
|
351
|
-
@click.option("--resume", is_flag=True, help="Resume from checkpoint")
|
|
352
|
-
@click.option(
|
|
353
|
-
"--commits-file", default=None, help="Input commits file (default: data/commits.json)"
|
|
354
|
-
)
|
|
355
|
-
@click.option(
|
|
356
|
-
"--output", default=None, help="Output file path (default: data/analysis_results.json)"
|
|
357
|
-
)
|
|
358
|
-
@click.option("--config-file", default=".env", help="Path to .env configuration file")
|
|
359
|
-
def analyze(
|
|
360
|
-
batch_size: int,
|
|
361
|
-
resume: bool,
|
|
362
|
-
commits_file: Optional[str],
|
|
363
|
-
output: Optional[str],
|
|
364
|
-
config_file: str,
|
|
365
|
-
):
|
|
366
|
-
"""Analyze commits for green software practices."""
|
|
367
|
-
print_banner("Data Analyzer")
|
|
368
|
-
|
|
369
|
-
try:
|
|
370
|
-
# Load configuration
|
|
371
|
-
config = get_config(config_file)
|
|
372
|
-
|
|
373
|
-
# Determine input/output files
|
|
374
|
-
input_file = Path(commits_file) if commits_file else config.COMMITS_FILE
|
|
375
|
-
output_file = Path(output) if output else config.ANALYSIS_FILE
|
|
376
|
-
|
|
377
|
-
# Check if input file exists
|
|
378
|
-
if not input_file.exists():
|
|
379
|
-
colored_print(f"Input file not found: {input_file}", "red")
|
|
380
|
-
colored_print("Please run 'extract' command first to extract commits", "yellow")
|
|
381
|
-
exit(1)
|
|
382
|
-
|
|
383
|
-
# Load commits
|
|
384
|
-
colored_print(f"Loading commits from {input_file}...", "blue")
|
|
385
|
-
data = load_json_file(input_file)
|
|
386
|
-
commits = data.get("commits", [])
|
|
387
|
-
|
|
388
|
-
if not commits:
|
|
389
|
-
colored_print("No commits found in input file", "yellow")
|
|
390
|
-
exit(1)
|
|
391
|
-
|
|
392
|
-
colored_print(f"Loaded {len(commits)} commits", "green")
|
|
393
|
-
|
|
394
|
-
# Check for resume
|
|
395
|
-
resume_from = 0
|
|
396
|
-
if resume:
|
|
397
|
-
checkpoint_data = load_checkpoint(config.CHECKPOINT_FILE)
|
|
398
|
-
if checkpoint_data:
|
|
399
|
-
resume_from = checkpoint_data.get("processed_count", 0)
|
|
400
|
-
colored_print(
|
|
401
|
-
f"Resuming from checkpoint: {resume_from} commits processed", "yellow"
|
|
402
|
-
)
|
|
403
|
-
|
|
404
|
-
# Initialize analyzer
|
|
405
|
-
analyzer = DataAnalyzer(batch_size=batch_size)
|
|
406
|
-
|
|
407
|
-
# Analyze commits
|
|
408
|
-
results = analyzer.analyze_commits(commits, resume_from=resume_from)
|
|
409
|
-
|
|
410
|
-
if not results:
|
|
411
|
-
colored_print("No analysis results generated", "yellow")
|
|
412
|
-
exit(1)
|
|
413
|
-
|
|
414
|
-
# Save results
|
|
415
|
-
analyzer.save_results(results, output_file)
|
|
416
|
-
|
|
417
|
-
# Save checkpoint
|
|
418
|
-
create_checkpoint(
|
|
419
|
-
config.CHECKPOINT_FILE,
|
|
420
|
-
{"processed_count": len(results), "timestamp": format_timestamp()},
|
|
421
|
-
)
|
|
422
|
-
|
|
423
|
-
colored_print(f"\n✓ Successfully analyzed {len(results)} commits", "green")
|
|
424
|
-
colored_print(f"Output saved to: {output_file}", "green")
|
|
425
|
-
|
|
426
|
-
except FileNotFoundError as e:
|
|
427
|
-
colored_print(f"File not found: {e}", "red")
|
|
428
|
-
exit(1)
|
|
429
|
-
except json.JSONDecodeError:
|
|
430
|
-
colored_print(f"Invalid JSON in input file: {input_file}", "red")
|
|
431
|
-
exit(1)
|
|
432
|
-
except Exception as e:
|
|
433
|
-
colored_print(f"Error: {e}", "red")
|
|
434
|
-
import traceback
|
|
435
|
-
|
|
436
|
-
traceback.print_exc()
|
|
437
|
-
exit(1)
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
if __name__ == "__main__":
|
|
441
|
-
analyze()
|