greenmining 1.0.3__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. greenmining/__init__.py +11 -29
  2. greenmining/__main__.py +9 -3
  3. greenmining/__version__.py +2 -2
  4. greenmining/analyzers/__init__.py +3 -7
  5. greenmining/analyzers/code_diff_analyzer.py +151 -61
  6. greenmining/analyzers/qualitative_analyzer.py +15 -81
  7. greenmining/analyzers/statistical_analyzer.py +8 -69
  8. greenmining/analyzers/temporal_analyzer.py +16 -72
  9. greenmining/config.py +105 -58
  10. greenmining/controllers/__init__.py +1 -5
  11. greenmining/controllers/repository_controller.py +153 -94
  12. greenmining/energy/__init__.py +13 -0
  13. greenmining/energy/base.py +165 -0
  14. greenmining/energy/codecarbon_meter.py +146 -0
  15. greenmining/energy/rapl.py +157 -0
  16. greenmining/gsf_patterns.py +4 -26
  17. greenmining/models/__init__.py +1 -5
  18. greenmining/models/aggregated_stats.py +4 -4
  19. greenmining/models/analysis_result.py +4 -4
  20. greenmining/models/commit.py +5 -5
  21. greenmining/models/repository.py +5 -5
  22. greenmining/presenters/__init__.py +1 -5
  23. greenmining/presenters/console_presenter.py +24 -24
  24. greenmining/services/__init__.py +10 -6
  25. greenmining/services/commit_extractor.py +8 -152
  26. greenmining/services/data_aggregator.py +45 -175
  27. greenmining/services/data_analyzer.py +9 -202
  28. greenmining/services/github_fetcher.py +212 -323
  29. greenmining/services/github_graphql_fetcher.py +371 -0
  30. greenmining/services/local_repo_analyzer.py +387 -0
  31. greenmining/services/reports.py +33 -137
  32. greenmining/utils.py +21 -149
  33. {greenmining-1.0.3.dist-info → greenmining-1.0.4.dist-info}/METADATA +61 -151
  34. greenmining-1.0.4.dist-info/RECORD +37 -0
  35. {greenmining-1.0.3.dist-info → greenmining-1.0.4.dist-info}/WHEEL +1 -1
  36. greenmining/analyzers/ml_feature_extractor.py +0 -512
  37. greenmining/analyzers/nlp_analyzer.py +0 -365
  38. greenmining/cli.py +0 -471
  39. greenmining/main.py +0 -37
  40. greenmining-1.0.3.dist-info/RECORD +0 -36
  41. greenmining-1.0.3.dist-info/entry_points.txt +0 -2
  42. {greenmining-1.0.3.dist-info → greenmining-1.0.4.dist-info}/licenses/LICENSE +0 -0
  43. {greenmining-1.0.3.dist-info → greenmining-1.0.4.dist-info}/top_level.txt +0 -0
greenmining/analyzers/temporal_analyzer.py CHANGED
@@ -1,18 +1,4 @@
1
- """
2
- Temporal and Historical Analysis for Green Software Practices
3
-
4
- Implements time-series analysis from Soliman et al. (2017):
5
- - Adoption trend analysis (when practices emerged)
6
- - Velocity analysis (commit frequency over time)
7
- - Pattern evolution tracking (which practices dominated when)
8
- - Temporal correlations (do practices cluster in time?)
9
-
10
- Addresses research questions:
11
- 1. When did green practices emerge in software development?
12
- 2. Are green practices increasing or decreasing over time?
13
- 3. Which practices were early vs. late adopters?
14
- 4. Do green practices correlate with project maturity?
15
- """
1
+ # Temporal and Historical Analysis for Green Software Practices
16
2
 
17
3
  from __future__ import annotations
18
4
 
@@ -25,7 +11,7 @@ import statistics
25
11
 
26
12
  @dataclass
27
13
  class TemporalMetrics:
28
- """Metrics for a specific time period"""
14
+ # Metrics for a specific time period
29
15
 
30
16
  period: str
31
17
  start_date: datetime
@@ -40,7 +26,7 @@ class TemporalMetrics:
40
26
 
41
27
  @dataclass
42
28
  class TrendAnalysis:
43
- """Trend analysis results"""
29
+ # Trend analysis results
44
30
 
45
31
  trend_direction: str # 'increasing', 'decreasing', 'stable'
46
32
  slope: float
@@ -51,35 +37,16 @@ class TrendAnalysis:
51
37
 
52
38
 
53
39
  class TemporalAnalyzer:
54
- """
55
- Analyze temporal patterns in green software adoption.
56
-
57
- Based on Soliman et al.: "Time-range filtering is standard practice"
58
- Extends with: trend detection, velocity analysis, evolution tracking
59
- """
40
+ # Analyze temporal patterns in green software adoption.
60
41
 
61
42
  def __init__(self, granularity: str = "quarter"):
62
- """
63
- Initialize temporal analyzer.
64
-
65
- Args:
66
- granularity: Time period granularity ('day', 'week', 'month', 'quarter', 'year')
67
- """
43
+ # Initialize temporal analyzer.
68
44
  self.granularity = granularity
69
45
 
70
46
  def group_commits_by_period(
71
47
  self, commits: List[Dict], date_field: str = "date"
72
48
  ) -> Dict[str, List[Dict]]:
73
- """
74
- Group commits into time periods.
75
-
76
- Args:
77
- commits: List of commit dictionaries
78
- date_field: Field containing commit date
79
-
80
- Returns:
81
- Dictionary mapping period strings to commit lists
82
- """
49
+ # Group commits into time periods.
83
50
  periods = defaultdict(list)
84
51
 
85
52
  for commit in commits:
@@ -103,7 +70,7 @@ class TemporalAnalyzer:
103
70
  return dict(periods)
104
71
 
105
72
  def _get_period_key(self, date: datetime) -> str:
106
- """Get period key for a date based on granularity."""
73
+ # Get period key for a date based on granularity.
107
74
  if self.granularity == "day":
108
75
  return date.strftime("%Y-%m-%d")
109
76
  elif self.granularity == "week":
@@ -120,7 +87,7 @@ class TemporalAnalyzer:
120
87
  return date.strftime("%Y-%m")
121
88
 
122
89
  def _parse_period_key(self, period_key: str) -> Tuple[datetime, datetime]:
123
- """Parse period key back to start and end dates."""
90
+ # Parse period key back to start and end dates.
124
91
  if "W" in period_key:
125
92
  # Week format: 2024-W15
126
93
  year, week = period_key.split("-W")
@@ -173,17 +140,7 @@ class TemporalAnalyzer:
173
140
  def calculate_period_metrics(
174
141
  self, period_key: str, commits: List[Dict], analysis_results: List[Dict]
175
142
  ) -> TemporalMetrics:
176
- """
177
- Calculate metrics for a time period.
178
-
179
- Args:
180
- period_key: Period identifier
181
- commits: Commits in this period
182
- analysis_results: Pattern analysis results for commits
183
-
184
- Returns:
185
- TemporalMetrics object
186
- """
143
+ # Calculate metrics for a time period.
187
144
  start_date, end_date = self._parse_period_key(period_key)
188
145
 
189
146
  # Count green commits
@@ -229,20 +186,7 @@ class TemporalAnalyzer:
229
186
  )
230
187
 
231
188
  def analyze_trends(self, commits: List[Dict], analysis_results: List[Dict]) -> Dict:
232
- """
233
- Comprehensive temporal trend analysis.
234
-
235
- Args:
236
- commits: All commits to analyze
237
- analysis_results: Pattern analysis results
238
-
239
- Returns:
240
- Dictionary with:
241
- - periods: List of TemporalMetrics
242
- - trend: TrendAnalysis
243
- - adoption_curve: List of (period, cumulative_rate)
244
- - velocity_trend: Velocity change over time
245
- """
189
+ # Comprehensive temporal trend analysis.
246
190
  # Group by periods
247
191
  grouped = self.group_commits_by_period(commits)
248
192
 
@@ -284,7 +228,7 @@ class TemporalAnalyzer:
284
228
  }
285
229
 
286
230
  def _calculate_trend(self, periods: List[TemporalMetrics]) -> Optional[TrendAnalysis]:
287
- """Calculate linear trend using least squares regression."""
231
+ # Calculate linear trend using least squares regression.
288
232
  if len(periods) < 2:
289
233
  return None
290
234
 
@@ -332,7 +276,7 @@ class TemporalAnalyzer:
332
276
  )
333
277
 
334
278
  def _calculate_adoption_curve(self, periods: List[TemporalMetrics]) -> List[Tuple[str, float]]:
335
- """Calculate cumulative adoption over time."""
279
+ # Calculate cumulative adoption over time.
336
280
  cumulative_green = 0
337
281
  cumulative_total = 0
338
282
  curve = []
@@ -348,7 +292,7 @@ class TemporalAnalyzer:
348
292
  return curve
349
293
 
350
294
  def _calculate_velocity_trend(self, periods: List[TemporalMetrics]) -> Dict:
351
- """Analyze velocity changes over time."""
295
+ # Analyze velocity changes over time.
352
296
  if not periods:
353
297
  return {}
354
298
 
@@ -365,7 +309,7 @@ class TemporalAnalyzer:
365
309
  def _analyze_pattern_evolution(
366
310
  self, periods: List[TemporalMetrics], analysis_results: List[Dict]
367
311
  ) -> Dict:
368
- """Track when different patterns emerged and dominated."""
312
+ # Track when different patterns emerged and dominated.
369
313
  pattern_timeline = defaultdict(lambda: {"first_seen": None, "occurrences_by_period": {}})
370
314
 
371
315
  for period in periods:
@@ -406,7 +350,7 @@ class TemporalAnalyzer:
406
350
  }
407
351
 
408
352
  def _metrics_to_dict(self, metrics: TemporalMetrics) -> Dict:
409
- """Convert TemporalMetrics to dictionary."""
353
+ # Convert TemporalMetrics to dictionary.
410
354
  return {
411
355
  "period": metrics.period,
412
356
  "start_date": metrics.start_date.isoformat(),
@@ -420,7 +364,7 @@ class TemporalAnalyzer:
420
364
  }
421
365
 
422
366
  def _trend_to_dict(self, trend: Optional[TrendAnalysis]) -> Dict:
423
- """Convert TrendAnalysis to dictionary."""
367
+ # Convert TrendAnalysis to dictionary.
424
368
  if not trend:
425
369
  return {}
426
370
 
greenmining/config.py CHANGED
@@ -1,72 +1,93 @@
1
- """Configuration management for green microservices mining CLI."""
2
-
3
1
  import os
4
2
  from pathlib import Path
3
+ from typing import Any, Dict, List, Optional
5
4
 
6
5
  from dotenv import load_dotenv
7
6
 
8
7
 
9
- class Config:
10
- """Configuration class for loading and validating environment variables."""
8
+ def _load_yaml_config(yaml_path: Path) -> Dict[str, Any]:
9
+ # Load configuration from YAML file if it exists.
10
+ if not yaml_path.exists():
11
+ return {}
12
+ try:
13
+ import yaml
14
+ with open(yaml_path, 'r') as f:
15
+ return yaml.safe_load(f) or {}
16
+ except ImportError:
17
+ return {}
18
+ except Exception:
19
+ return {}
11
20
 
12
- def __init__(self, env_file: str = ".env"):
13
- """Initialize configuration from environment file.
14
21
 
15
- Args:
16
- env_file: Path to .env file
17
- """
22
+ class Config:
23
+ # Configuration class for loading from env vars and YAML.
24
+
25
+ def __init__(self, env_file: str = ".env", yaml_file: str = "greenmining.yaml"):
26
+ # Initialize configuration from environment and YAML file.
18
27
  # Load environment variables
19
28
  env_path = Path(env_file)
20
29
  if env_path.exists():
21
30
  load_dotenv(env_path)
22
31
  else:
23
32
  load_dotenv() # Load from system environment
33
+
34
+ # Load YAML config (takes precedence for certain options)
35
+ yaml_path = Path(yaml_file)
36
+ self._yaml_config = _load_yaml_config(yaml_path)
24
37
 
25
38
  # GitHub API Configuration
26
39
  self.GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
27
40
  if not self.GITHUB_TOKEN or self.GITHUB_TOKEN == "your_github_pat_here":
28
41
  raise ValueError("GITHUB_TOKEN not set. Please set it in .env file or environment.")
29
42
 
30
- # Analysis Type - Using GitHub Copilot for AI-powered analysis
43
+ # Analysis Type
31
44
  self.ANALYSIS_TYPE = "keyword_heuristic"
32
45
 
33
- # Search and Processing Configuration
34
- self.GITHUB_SEARCH_KEYWORDS = ["microservices", "microservice-architecture", "cloud-native"]
35
-
36
- self.SUPPORTED_LANGUAGES = [
37
- "Java",
38
- "Python",
39
- "Go",
40
- "JavaScript",
41
- "TypeScript",
42
- "C#",
43
- "Rust",
44
- ]
45
-
46
- # Repository and Commit Limits
47
- self.MIN_STARS = int(os.getenv("MIN_STARS", "100"))
48
- self.MAX_REPOS = int(os.getenv("MAX_REPOS", "100"))
49
- self.COMMITS_PER_REPO = int(os.getenv("COMMITS_PER_REPO", "50"))
50
- self.DAYS_BACK = int(os.getenv("DAYS_BACK", "730")) # 2 years
46
+ # Search and Processing Configuration (YAML: sources.search.keywords)
47
+ yaml_search = self._yaml_config.get("sources", {}).get("search", {})
48
+ self.GITHUB_SEARCH_KEYWORDS = yaml_search.get("keywords",
49
+ ["microservices", "microservice-architecture", "cloud-native"])
50
+
51
+ # Supported Languages (YAML: sources.search.languages)
52
+ self.SUPPORTED_LANGUAGES = yaml_search.get("languages", [
53
+ "Java", "Python", "Go", "JavaScript", "TypeScript", "C#", "Rust",
54
+ ])
51
55
 
52
- # Advanced Analyzer Configuration
56
+ # Repository and Commit Limits (YAML: extraction.*)
57
+ yaml_extraction = self._yaml_config.get("extraction", {})
58
+ self.MIN_STARS = yaml_search.get("min_stars", int(os.getenv("MIN_STARS", "100")))
59
+ self.MAX_REPOS = int(os.getenv("MAX_REPOS", "100"))
60
+ self.COMMITS_PER_REPO = yaml_extraction.get("max_commits",
61
+ int(os.getenv("COMMITS_PER_REPO", "50")))
62
+ self.DAYS_BACK = yaml_extraction.get("days_back",
63
+ int(os.getenv("DAYS_BACK", "730")))
64
+ self.SKIP_MERGES = yaml_extraction.get("skip_merges", True)
65
+
66
+ # Analysis Configuration (YAML: analysis.*)
67
+ yaml_analysis = self._yaml_config.get("analysis", {})
53
68
  self.ENABLE_NLP_ANALYSIS = os.getenv("ENABLE_NLP_ANALYSIS", "false").lower() == "true"
54
69
  self.ENABLE_TEMPORAL_ANALYSIS = (
55
70
  os.getenv("ENABLE_TEMPORAL_ANALYSIS", "false").lower() == "true"
56
71
  )
57
- self.TEMPORAL_GRANULARITY = os.getenv(
58
- "TEMPORAL_GRANULARITY", "quarter"
59
- ) # day, week, month, quarter, year
72
+ self.TEMPORAL_GRANULARITY = os.getenv("TEMPORAL_GRANULARITY", "quarter")
60
73
  self.ENABLE_ML_FEATURES = os.getenv("ENABLE_ML_FEATURES", "false").lower() == "true"
61
74
  self.VALIDATION_SAMPLE_SIZE = int(os.getenv("VALIDATION_SAMPLE_SIZE", "30"))
62
-
63
- # Temporal Filtering (NEW)
64
- self.CREATED_AFTER = os.getenv("CREATED_AFTER") # YYYY-MM-DD
65
- self.CREATED_BEFORE = os.getenv("CREATED_BEFORE") # YYYY-MM-DD
66
- self.PUSHED_AFTER = os.getenv("PUSHED_AFTER") # YYYY-MM-DD
67
- self.PUSHED_BEFORE = os.getenv("PUSHED_BEFORE") # YYYY-MM-DD
68
- self.COMMIT_DATE_FROM = os.getenv("COMMIT_DATE_FROM") # YYYY-MM-DD
69
- self.COMMIT_DATE_TO = os.getenv("COMMIT_DATE_TO") # YYYY-MM-DD
75
+
76
+ # PyDriller options (YAML: analysis.process_metrics, etc.)
77
+ self.PROCESS_METRICS_ENABLED = yaml_analysis.get("process_metrics",
78
+ os.getenv("PROCESS_METRICS_ENABLED", "true").lower() == "true")
79
+ self.STRUCTURAL_METRICS_ENABLED = yaml_analysis.get("structural_metrics",
80
+ os.getenv("STRUCTURAL_METRICS_ENABLED", "true").lower() == "true")
81
+ self.DMM_ENABLED = yaml_analysis.get("delta_maintainability",
82
+ os.getenv("DMM_ENABLED", "true").lower() == "true")
83
+
84
+ # Temporal Filtering
85
+ self.CREATED_AFTER = os.getenv("CREATED_AFTER")
86
+ self.CREATED_BEFORE = os.getenv("CREATED_BEFORE")
87
+ self.PUSHED_AFTER = os.getenv("PUSHED_AFTER")
88
+ self.PUSHED_BEFORE = os.getenv("PUSHED_BEFORE")
89
+ self.COMMIT_DATE_FROM = os.getenv("COMMIT_DATE_FROM")
90
+ self.COMMIT_DATE_TO = os.getenv("COMMIT_DATE_TO")
70
91
  self.MIN_COMMITS = int(os.getenv("MIN_COMMITS", "0"))
71
92
  self.ACTIVITY_WINDOW_DAYS = int(os.getenv("ACTIVITY_WINDOW_DAYS", "730"))
72
93
 
@@ -76,11 +97,13 @@ class Config:
76
97
  # Processing Configuration
77
98
  self.TIMEOUT_SECONDS = int(os.getenv("TIMEOUT_SECONDS", "30"))
78
99
  self.MAX_RETRIES = int(os.getenv("MAX_RETRIES", "3"))
79
- self.RETRY_DELAY = 2 # seconds
100
+ self.RETRY_DELAY = 2
80
101
  self.EXPONENTIAL_BACKOFF = True
81
102
 
82
- # Output Configuration
83
- self.OUTPUT_DIR = Path(os.getenv("OUTPUT_DIR", "./data"))
103
+ # Output Configuration (YAML: output.directory)
104
+ yaml_output = self._yaml_config.get("output", {})
105
+ self.OUTPUT_DIR = Path(yaml_output.get("directory",
106
+ os.getenv("OUTPUT_DIR", "./data")))
84
107
  self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
85
108
 
86
109
  # File Paths
@@ -92,17 +115,48 @@ class Config:
92
115
  self.REPORT_FILE = self.OUTPUT_DIR / "green_microservices_analysis.md"
93
116
  self.CHECKPOINT_FILE = self.OUTPUT_DIR / "checkpoint.json"
94
117
 
118
+ # Direct Repository URL Support (YAML: sources.urls)
119
+ yaml_urls = self._yaml_config.get("sources", {}).get("urls", [])
120
+ env_urls = self._parse_repository_urls(os.getenv("REPOSITORY_URLS", ""))
121
+ self.REPOSITORY_URLS: List[str] = yaml_urls if yaml_urls else env_urls
122
+
123
+ # Clone path (YAML: extraction.clone_path)
124
+ self.CLONE_PATH = Path(yaml_extraction.get("clone_path",
125
+ os.getenv("CLONE_PATH", "/tmp/greenmining_repos")))
126
+ self.CLEANUP_AFTER_ANALYSIS = (
127
+ os.getenv("CLEANUP_AFTER_ANALYSIS", "true").lower() == "true"
128
+ )
129
+
130
+ # Energy Measurement (YAML: energy.*)
131
+ yaml_energy = self._yaml_config.get("energy", {})
132
+ self.ENERGY_ENABLED = yaml_energy.get("enabled",
133
+ os.getenv("ENERGY_ENABLED", "false").lower() == "true")
134
+ self.ENERGY_BACKEND = yaml_energy.get("backend",
135
+ os.getenv("ENERGY_BACKEND", "rapl"))
136
+ self.CARBON_TRACKING = yaml_energy.get("carbon_tracking",
137
+ os.getenv("CARBON_TRACKING", "false").lower() == "true")
138
+ self.COUNTRY_ISO = yaml_energy.get("country_iso",
139
+ os.getenv("COUNTRY_ISO", "USA"))
140
+
141
+ # Power profiling (YAML: energy.power_profiling.*)
142
+ yaml_power = yaml_energy.get("power_profiling", {})
143
+ self.POWER_PROFILING_ENABLED = yaml_power.get("enabled", False)
144
+ self.POWER_TEST_COMMAND = yaml_power.get("test_command", None)
145
+ self.POWER_REGRESSION_THRESHOLD = yaml_power.get("regression_threshold", 5.0)
146
+
95
147
  # Logging
96
148
  self.VERBOSE = os.getenv("VERBOSE", "false").lower() == "true"
97
149
  self.LOG_FILE = self.OUTPUT_DIR / "mining.log"
98
150
 
99
- def validate(self) -> bool:
100
- """Validate that all required configuration is present.
151
+ def _parse_repository_urls(self, urls_str: str) -> List[str]:
152
+ # Parse comma-separated repository URLs from environment variable.
153
+ if not urls_str:
154
+ return []
155
+ return [url.strip() for url in urls_str.split(",") if url.strip()]
101
156
 
102
- Returns:
103
- True if configuration is valid
104
- """
105
- required_attrs = ["GITHUB_TOKEN", "CLAUDE_API_KEY", "MAX_REPOS", "COMMITS_PER_REPO"]
157
+ def validate(self) -> bool:
158
+ # Validate that all required configuration is present.
159
+ required_attrs = ["GITHUB_TOKEN", "MAX_REPOS", "COMMITS_PER_REPO"]
106
160
 
107
161
  for attr in required_attrs:
108
162
  if not getattr(self, attr, None):
@@ -111,7 +165,7 @@ class Config:
111
165
  return True
112
166
 
113
167
  def __repr__(self) -> str:
114
- """String representation of configuration (hiding sensitive data)."""
168
+ # String representation of configuration (hiding sensitive data).
115
169
  return (
116
170
  f"Config("
117
171
  f"MAX_REPOS={self.MAX_REPOS}, "
@@ -127,14 +181,7 @@ _config_instance = None
127
181
 
128
182
 
129
183
  def get_config(env_file: str = ".env") -> Config:
130
- """Get or create global configuration instance.
131
-
132
- Args:
133
- env_file: Path to .env file
134
-
135
- Returns:
136
- Config instance
137
- """
184
+ # Get or create global configuration instance.
138
185
  global _config_instance
139
186
  if _config_instance is None:
140
187
  _config_instance = Config(env_file)
greenmining/controllers/__init__.py CHANGED
@@ -1,8 +1,4 @@
1
- """
2
- Controllers Package - Business logic and orchestration for mining operations.
3
-
4
- Controllers coordinate between models, services, and presenters following MCP architecture.
5
- """
1
+ # Controllers Package - Business logic and orchestration for mining operations.
6
2
 
7
3
  from .repository_controller import RepositoryController
8
4