greenmining 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. greenmining/__init__.py +11 -29
  2. greenmining/__main__.py +9 -3
  3. greenmining/__version__.py +2 -2
  4. greenmining/analyzers/__init__.py +3 -7
  5. greenmining/analyzers/code_diff_analyzer.py +151 -61
  6. greenmining/analyzers/qualitative_analyzer.py +15 -81
  7. greenmining/analyzers/statistical_analyzer.py +8 -69
  8. greenmining/analyzers/temporal_analyzer.py +16 -72
  9. greenmining/config.py +105 -58
  10. greenmining/controllers/__init__.py +1 -5
  11. greenmining/controllers/repository_controller.py +153 -94
  12. greenmining/energy/__init__.py +13 -0
  13. greenmining/energy/base.py +165 -0
  14. greenmining/energy/codecarbon_meter.py +146 -0
  15. greenmining/energy/rapl.py +157 -0
  16. greenmining/gsf_patterns.py +4 -26
  17. greenmining/models/__init__.py +1 -5
  18. greenmining/models/aggregated_stats.py +4 -4
  19. greenmining/models/analysis_result.py +4 -4
  20. greenmining/models/commit.py +5 -5
  21. greenmining/models/repository.py +5 -5
  22. greenmining/presenters/__init__.py +1 -5
  23. greenmining/presenters/console_presenter.py +24 -24
  24. greenmining/services/__init__.py +10 -6
  25. greenmining/services/commit_extractor.py +8 -152
  26. greenmining/services/data_aggregator.py +45 -175
  27. greenmining/services/data_analyzer.py +9 -202
  28. greenmining/services/github_fetcher.py +212 -323
  29. greenmining/services/github_graphql_fetcher.py +371 -0
  30. greenmining/services/local_repo_analyzer.py +387 -0
  31. greenmining/services/reports.py +33 -137
  32. greenmining/utils.py +21 -149
  33. {greenmining-1.0.2.dist-info → greenmining-1.0.4.dist-info}/METADATA +169 -146
  34. greenmining-1.0.4.dist-info/RECORD +37 -0
  35. {greenmining-1.0.2.dist-info → greenmining-1.0.4.dist-info}/WHEEL +1 -1
  36. greenmining/analyzers/ml_feature_extractor.py +0 -512
  37. greenmining/analyzers/nlp_analyzer.py +0 -365
  38. greenmining/cli.py +0 -471
  39. greenmining/main.py +0 -37
  40. greenmining-1.0.2.dist-info/RECORD +0 -36
  41. greenmining-1.0.2.dist-info/entry_points.txt +0 -2
  42. {greenmining-1.0.2.dist-info → greenmining-1.0.4.dist-info}/licenses/LICENSE +0 -0
  43. {greenmining-1.0.2.dist-info → greenmining-1.0.4.dist-info}/top_level.txt +0 -0
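
Of note in the list above: items 12-15 add a new energy metering subsystem (greenmining/energy/) alongside the GraphQL fetcher and local repository analyzer, while items 36-39 drop the NLP/ML analyzers and the standalone CLI modules (cli.py, main.py). The energy wrappers' own interfaces are not part of this diff; as orientation only, here is a minimal sketch of the kind of measurement codecarbon_meter.py presumably wraps, using the public codecarbon API directly (run_workload is a hypothetical stand-in for the code under measurement):

    from codecarbon import EmissionsTracker

    tracker = EmissionsTracker()
    tracker.start()
    run_workload()                 # hypothetical: the workload being measured
    emissions_kg = tracker.stop()  # codecarbon returns estimated kg CO2-equivalent
    print(f"Estimated emissions: {emissions_kg:.6f} kg CO2eq")

rapl.py presumably reads Intel RAPL energy counters instead (exposed on Linux under /sys/class/powercap), trading codecarbon's portability for dependency-free, platform-specific readings.

The diff below is for greenmining/services/data_analyzer.py (item 27, +9 -202).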
@@ -1,4 +1,4 @@
-"""Data analyzer for green microservices commits using GSF patterns."""
+# Data analyzer for green microservices commits using GSF patterns.
 
 from __future__ import annotations
 
@@ -8,13 +8,10 @@ from collections import Counter
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 
-import click
 from tqdm import tqdm
 
 from greenmining.analyzers import (
     CodeDiffAnalyzer,
-    NLPAnalyzer,
-    MLFeatureExtractor,
 )
 from greenmining.config import get_config
 from greenmining.gsf_patterns import (
@@ -35,30 +32,19 @@ from greenmining.utils import (
 
 
 class DataAnalyzer:
-    """Analyzes commits for green software patterns using GSF (Green Software Foundation) patterns."""
+    # Analyzes commits for green software patterns using GSF (Green Software Foundation) patterns.
 
     def __init__(
         self,
         batch_size: int = 10,
         enable_diff_analysis: bool = False,
-        enable_nlp: bool = False,
-        enable_ml_features: bool = False,
     ):
-        """Initialize analyzer with GSF patterns.
-
-        Args:
-            batch_size: Number of commits to process in each batch
-            enable_diff_analysis: Enable code diff analysis (slower but more accurate)
-            enable_nlp: Enable NLP-enhanced pattern detection
-            enable_ml_features: Enable ML feature extraction
-        """
+        # Initialize analyzer with GSF patterns.
         # Use GSF patterns from gsf_patterns.py
         self.gsf_patterns = GSF_PATTERNS
         self.green_keywords = GREEN_KEYWORDS
         self.batch_size = batch_size
         self.enable_diff_analysis = enable_diff_analysis
-        self.enable_nlp = enable_nlp
-        self.enable_ml_features = enable_ml_features
 
         # Initialize code diff analyzer if enabled
         if self.enable_diff_analysis:
@@ -67,32 +53,10 @@ class DataAnalyzer:
         else:
             self.diff_analyzer = None
 
-        # Initialize NLP analyzer if enabled
-        if self.enable_nlp:
-            self.nlp_analyzer = NLPAnalyzer(enable_stemming=True, enable_synonyms=True)
-            colored_print("NLP analysis enabled (morphological variants + synonyms)", "cyan")
-        else:
-            self.nlp_analyzer = None
-
-        # Initialize ML feature extractor if enabled
-        if self.enable_ml_features:
-            self.ml_extractor = MLFeatureExtractor(green_keywords=list(GREEN_KEYWORDS))
-            colored_print("ML feature extraction enabled", "cyan")
-        else:
-            self.ml_extractor = None
-
     def analyze_commits(
         self, commits: list[dict[str, Any]], resume_from: int = 0
     ) -> list[dict[str, Any]]:
-        """Analyze commits for green software practices.
-
-        Args:
-            commits: List of commit dictionaries
-            resume_from: Index to resume from
-
-        Returns:
-            List of analysis results
-        """
+        # Analyze commits for green software practices.
        results = []
 
        colored_print(f"\nAnalyzing {len(commits)} commits for green practices...", "cyan")
@@ -115,14 +79,7 @@ class DataAnalyzer:
         return results
 
     def _analyze_commit(self, commit: dict[str, Any]) -> dict[str, Any]:
-        """Analyze a single commit using GSF patterns.
-
-        Args:
-            commit: Commit dictionary
-
-        Returns:
-            Analysis result with GSF pattern matching
-        """
+        # Analyze a single commit using GSF patterns.
         message = commit.get("message", "")
 
         # Q1: GREEN AWARENESS - Check using GSF keywords
@@ -131,20 +88,6 @@ class DataAnalyzer:
         # Q2: KNOWN GSF PATTERNS - Match against Green Software Foundation patterns
         matched_patterns = get_pattern_by_keywords(message)
 
-        # Enhanced NLP analysis (if enabled)
-        nlp_results = None
-        if self.nlp_analyzer:
-            nlp_results = self.nlp_analyzer.analyze_text(message, list(self.green_keywords))
-
-            # Check if NLP found additional matches not caught by keyword matching
-            has_nlp_matches, additional_terms = self.nlp_analyzer.enhance_pattern_detection(
-                message, matched_patterns
-            )
-
-            if has_nlp_matches:
-                # NLP enhancement found additional evidence
-                green_aware = True
-
         # Q3: CODE DIFF ANALYSIS (if enabled and diff data available)
         diff_analysis = None
         if self.diff_analyzer and commit.get("diff_data"):
@@ -211,38 +154,10 @@ class DataAnalyzer:
         if diff_analysis:
             result["diff_analysis"] = diff_analysis
 
-        # Add NLP analysis results if available
-        if nlp_results:
-            result["nlp_analysis"] = {
-                "total_matches": nlp_results["total_nlp_matches"],
-                "match_density": nlp_results["match_density"],
-                "morphological_count": len(nlp_results["morphological_matches"]),
-                "semantic_count": len(nlp_results["semantic_matches"]),
-                "phrase_count": len(nlp_results["phrase_matches"]),
-            }
-
-        # Add ML features if enabled
-        if self.enable_ml_features and self.ml_extractor:
-            # Note: Full feature extraction requires repository context
-            # For now, extract basic text features
-            text_features = self.ml_extractor.extract_text_features(message)
-            result["ml_features"] = {
-                "text": text_features,
-                "note": "Full ML features require repository and historical context",
-            }
-
         return result
 
     def _check_green_awareness(self, message: str, files: list[str]) -> tuple[bool, Optional[str]]:
-        """Check if commit explicitly mentions green/energy concerns.
-
-        Args:
-            message: Commit message (lowercase)
-            files: List of changed files (lowercase)
-
-        Returns:
-            Tuple of (is_green_aware, evidence_text)
-        """
+        # Check if commit explicitly mentions green/energy concerns.
         # Check message for green keywords
         for keyword in self.GREEN_KEYWORDS:
             if keyword in message:
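
Because the nlp_analysis and ml_features keys are no longer written, code that consumes the per-commit result dicts (or a saved analysis_results.json) from either version should treat them as optional. A defensive read, sketched with the key names from the removed block above, where result is one element of the analyzer's output list:

    nlp = result.get("nlp_analysis")  # written by 1.0.2 only; absent under 1.0.4
    if nlp is not None:
        print(nlp["total_matches"], nlp["match_density"])
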
@@ -265,15 +180,7 @@ class DataAnalyzer:
         return False, None
 
     def _detect_known_pattern(self, message: str, files: list[str]) -> tuple[Optional[str], str]:
-        """Detect known green software pattern.
-
-        Args:
-            message: Commit message (lowercase)
-            files: List of changed files (lowercase)
-
-        Returns:
-            Tuple of (pattern_name, confidence_level)
-        """
+        # Detect known green software pattern.
         matches = []
 
         # Check each pattern
@@ -299,12 +206,7 @@ class DataAnalyzer:
         return matches[0][0], matches[0][1]
 
     def save_results(self, results: list[dict[str, Any]], output_file: Path):
-        """Save analysis results to JSON file.
-
-        Args:
-            results: List of analysis results
-            output_file: Output file path
-        """
+        # Save analysis results to JSON file.
         # Calculate summary statistics
         green_aware_count = sum(1 for r in results if r["green_aware"])
 
@@ -335,7 +237,7 @@ class DataAnalyzer:
         colored_print(f"Saved analysis for {len(results)} commits to {output_file}", "green")
 
         # Display summary
-        colored_print("\n📊 Analysis Summary:", "cyan")
+        colored_print("\n Analysis Summary:", "cyan")
         colored_print(
             f" Green-aware commits: {green_aware_count} ({data['metadata']['green_aware_percentage']}%)",
             "white",
@@ -344,98 +246,3 @@ class DataAnalyzer:
         colored_print("\n Top patterns detected:", "cyan")
         for pattern, count in pattern_counts.most_common(5):
             colored_print(f" - {pattern}: {count}", "white")
-
-
-@click.command()
-@click.option("--batch-size", default=10, help="Batch size for processing")
-@click.option("--resume", is_flag=True, help="Resume from checkpoint")
-@click.option(
-    "--commits-file", default=None, help="Input commits file (default: data/commits.json)"
-)
-@click.option(
-    "--output", default=None, help="Output file path (default: data/analysis_results.json)"
-)
-@click.option("--config-file", default=".env", help="Path to .env configuration file")
-def analyze(
-    batch_size: int,
-    resume: bool,
-    commits_file: Optional[str],
-    output: Optional[str],
-    config_file: str,
-):
-    """Analyze commits for green software practices."""
-    print_banner("Data Analyzer")
-
-    try:
-        # Load configuration
-        config = get_config(config_file)
-
-        # Determine input/output files
-        input_file = Path(commits_file) if commits_file else config.COMMITS_FILE
-        output_file = Path(output) if output else config.ANALYSIS_FILE
-
-        # Check if input file exists
-        if not input_file.exists():
-            colored_print(f"Input file not found: {input_file}", "red")
-            colored_print("Please run 'extract' command first to extract commits", "yellow")
-            exit(1)
-
-        # Load commits
-        colored_print(f"Loading commits from {input_file}...", "blue")
-        data = load_json_file(input_file)
-        commits = data.get("commits", [])
-
-        if not commits:
-            colored_print("No commits found in input file", "yellow")
-            exit(1)
-
-        colored_print(f"Loaded {len(commits)} commits", "green")
-
-        # Check for resume
-        resume_from = 0
-        if resume:
-            checkpoint_data = load_checkpoint(config.CHECKPOINT_FILE)
-            if checkpoint_data:
-                resume_from = checkpoint_data.get("processed_count", 0)
-                colored_print(
-                    f"Resuming from checkpoint: {resume_from} commits processed", "yellow"
-                )
-
-        # Initialize analyzer
-        analyzer = DataAnalyzer(batch_size=batch_size)
-
-        # Analyze commits
-        results = analyzer.analyze_commits(commits, resume_from=resume_from)
-
-        if not results:
-            colored_print("No analysis results generated", "yellow")
-            exit(1)
-
-        # Save results
-        analyzer.save_results(results, output_file)
-
-        # Save checkpoint
-        create_checkpoint(
-            config.CHECKPOINT_FILE,
-            {"processed_count": len(results), "timestamp": format_timestamp()},
-        )
-
-        colored_print(f"\n✓ Successfully analyzed {len(results)} commits", "green")
-        colored_print(f"Output saved to: {output_file}", "green")
-
-    except FileNotFoundError as e:
-        colored_print(f"File not found: {e}", "red")
-        exit(1)
-    except json.JSONDecodeError:
-        colored_print(f"Invalid JSON in input file: {input_file}", "red")
-        exit(1)
-    except Exception as e:
-        colored_print(f"Error: {e}", "red")
-        import traceback
-
-        traceback.print_exc()
-        exit(1)
-
-
-if __name__ == "__main__":
-    analyze()
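
With the module-level click command deleted (and entry_points.txt dropped from the dist-info, so invocation presumably moves to python -m greenmining via the reworked __main__.py), the removed flow survives as plain library calls. A minimal sketch reconstructed from the deleted code above, using only names that appear in this diff; whether __main__.py exposes the same flags is not shown here:

    from pathlib import Path

    from greenmining.config import get_config
    from greenmining.services.data_analyzer import DataAnalyzer
    from greenmining.utils import load_json_file

    config = get_config(".env")
    data = load_json_file(Path(config.COMMITS_FILE))

    analyzer = DataAnalyzer(batch_size=10)
    results = analyzer.analyze_commits(data.get("commits", []), resume_from=0)
    analyzer.save_results(results, Path(config.ANALYSIS_FILE))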