greenmining 1.1.7.tar.gz → 1.1.9.tar.gz
This diff shows the content changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
- {greenmining-1.1.7 → greenmining-1.1.9}/CHANGELOG.md +1 -1
- {greenmining-1.1.7/greenmining.egg-info → greenmining-1.1.9}/PKG-INFO +10 -10
- {greenmining-1.1.7 → greenmining-1.1.9}/README.md +9 -9
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/__init__.py +1 -1
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/analyzers/metrics_power_correlator.py +1 -1
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/analyzers/power_regression.py +0 -1
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/analyzers/qualitative_analyzer.py +1 -1
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/analyzers/statistical_analyzer.py +0 -32
- greenmining-1.1.9/greenmining/config.py +91 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/controllers/repository_controller.py +77 -16
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/energy/codecarbon_meter.py +0 -21
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/energy/cpu_meter.py +1 -1
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/gsf_patterns.py +41 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/models/aggregated_stats.py +1 -1
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/models/commit.py +0 -1
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/models/repository.py +1 -1
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/services/commit_extractor.py +2 -41
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/services/data_aggregator.py +1 -6
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/services/data_analyzer.py +1 -57
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/services/local_repo_analyzer.py +1 -2
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/services/reports.py +1 -6
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/utils.py +0 -87
- {greenmining-1.1.7 → greenmining-1.1.9/greenmining.egg-info}/PKG-INFO +10 -10
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining.egg-info/SOURCES.txt +0 -4
- {greenmining-1.1.7 → greenmining-1.1.9}/pyproject.toml +1 -1
- greenmining-1.1.7/greenmining/__version__.py +0 -3
- greenmining-1.1.7/greenmining/config.py +0 -200
- greenmining-1.1.7/greenmining/services/github_fetcher.py +0 -2
- {greenmining-1.1.7 → greenmining-1.1.9}/LICENSE +0 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/MANIFEST.in +0 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/__main__.py +0 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/analyzers/__init__.py +0 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/analyzers/code_diff_analyzer.py +0 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/analyzers/temporal_analyzer.py +0 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/analyzers/version_power_analyzer.py +0 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/controllers/__init__.py +0 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/energy/__init__.py +0 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/energy/base.py +0 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/energy/carbon_reporter.py +0 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/energy/rapl.py +0 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/models/__init__.py +0 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/models/analysis_result.py +0 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/presenters/__init__.py +0 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/presenters/console_presenter.py +0 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/services/__init__.py +0 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining/services/github_graphql_fetcher.py +0 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining.egg-info/dependency_links.txt +0 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining.egg-info/requires.txt +0 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/greenmining.egg-info/top_level.txt +0 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/setup.cfg +0 -0
- {greenmining-1.1.7 → greenmining-1.1.9}/setup.py +0 -0
--- greenmining-1.1.7/greenmining.egg-info/PKG-INFO
+++ greenmining-1.1.9/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: greenmining
-Version: 1.1.7
+Version: 1.1.9
 Summary: An empirical Python library for Mining Software Repositories (MSR) in Green IT research
 Author-email: Adam Bouafia <a.bouafia@student.vu.nl>
 License: MIT
@@ -68,9 +68,9 @@ An empirical Python library for Mining Software Repositories (MSR) in Green IT research
 
 `greenmining` is a research-grade Python library designed for **empirical Mining Software Repositories (MSR)** studies in **Green IT**. It enables researchers and practitioners to:
 
-- **Mine repositories at scale** - Fetch and analyze GitHub repositories via GraphQL API with configurable filters
-
-- **Classify green commits** - Detect
+- **Mine repositories at scale** - Search, Fetch and analyze GitHub repositories via GraphQL API with configurable filters
+
+- **Classify green commits** - Detect 124 sustainability patterns from the Green Software Foundation (GSF) catalog
 - **Analyze any repository by URL** - Direct Git-based analysis with support for private repositories
 - **Measure energy consumption** - RAPL, CodeCarbon, and CPU Energy Meter backends for power profiling
 - **Carbon footprint reporting** - CO2 emissions calculation with 20+ country profiles and cloud region support
@@ -113,7 +113,7 @@ docker pull adambouafia/greenmining:latest
 from greenmining import GSF_PATTERNS, is_green_aware, get_pattern_by_keywords
 
 # Check available patterns
-print(f"Total patterns: {len(GSF_PATTERNS)}") #
+print(f"Total patterns: {len(GSF_PATTERNS)}") # 124 patterns across 15 categories
 
 # Detect green awareness in commit messages
 commit_msg = "Optimize Redis caching to reduce energy consumption"
@@ -670,8 +670,8 @@ config = Config(
 
 ### Core Capabilities
 
-- **Pattern Detection**:
-- **Keyword Analysis**:
+- **Pattern Detection**: 124 sustainability patterns across 15 categories from the GSF catalog
+- **Keyword Analysis**: 332 green software detection keywords
 - **Repository Fetching**: GraphQL API with date, star, and language filters
 - **URL-Based Analysis**: Direct Git-based analysis from GitHub URLs (HTTPS and SSH)
 - **Batch Processing**: Parallel analysis of multiple repositories with configurable workers
@@ -739,7 +739,7 @@ print(f"Equivalent: {report.tree_months:.2f} tree-months to offset")
 
 ### Pattern Database
 
-**
+**124 green software patterns based on:**
 - Green Software Foundation (GSF) Patterns Catalog
 - VU Amsterdam 2024 research on ML system sustainability
 - ICSE 2024 conference papers on sustainable software
@@ -749,11 +749,11 @@ print(f"Equivalent: {report.tree_months:.2f} tree-months to offset")
 - **Coverage**: 67% of patterns actively detect in real-world commits
 - **Accuracy**: 100% true positive rate for green-aware commits
 - **Categories**: 15 distinct sustainability domains covered
-- **Keywords**:
+- **Keywords**: 332 detection terms across all patterns
 
 ## GSF Pattern Categories
 
-**
+**124 patterns across 15 categories:**
 
 ### 1. Cloud (40 patterns)
 Auto-scaling, serverless computing, right-sizing instances, region selection for renewable energy, spot instances, idle resource detection, cloud-native architectures
--- greenmining-1.1.7/README.md
+++ greenmining-1.1.9/README.md
@@ -11,9 +11,9 @@ An empirical Python library for Mining Software Repositories (MSR) in Green IT research
 
 `greenmining` is a research-grade Python library designed for **empirical Mining Software Repositories (MSR)** studies in **Green IT**. It enables researchers and practitioners to:
 
-- **Mine repositories at scale** - Fetch and analyze GitHub repositories via GraphQL API with configurable filters
-
-- **Classify green commits** - Detect
+- **Mine repositories at scale** - Search, Fetch and analyze GitHub repositories via GraphQL API with configurable filters
+
+- **Classify green commits** - Detect 124 sustainability patterns from the Green Software Foundation (GSF) catalog
 - **Analyze any repository by URL** - Direct Git-based analysis with support for private repositories
 - **Measure energy consumption** - RAPL, CodeCarbon, and CPU Energy Meter backends for power profiling
 - **Carbon footprint reporting** - CO2 emissions calculation with 20+ country profiles and cloud region support
@@ -56,7 +56,7 @@ docker pull adambouafia/greenmining:latest
 from greenmining import GSF_PATTERNS, is_green_aware, get_pattern_by_keywords
 
 # Check available patterns
-print(f"Total patterns: {len(GSF_PATTERNS)}") #
+print(f"Total patterns: {len(GSF_PATTERNS)}") # 124 patterns across 15 categories
 
 # Detect green awareness in commit messages
 commit_msg = "Optimize Redis caching to reduce energy consumption"
@@ -613,8 +613,8 @@ config = Config(
 
 ### Core Capabilities
 
-- **Pattern Detection**:
-- **Keyword Analysis**:
+- **Pattern Detection**: 124 sustainability patterns across 15 categories from the GSF catalog
+- **Keyword Analysis**: 332 green software detection keywords
 - **Repository Fetching**: GraphQL API with date, star, and language filters
 - **URL-Based Analysis**: Direct Git-based analysis from GitHub URLs (HTTPS and SSH)
 - **Batch Processing**: Parallel analysis of multiple repositories with configurable workers
@@ -682,7 +682,7 @@ print(f"Equivalent: {report.tree_months:.2f} tree-months to offset")
 
 ### Pattern Database
 
-**
+**124 green software patterns based on:**
 - Green Software Foundation (GSF) Patterns Catalog
 - VU Amsterdam 2024 research on ML system sustainability
 - ICSE 2024 conference papers on sustainable software
@@ -692,11 +692,11 @@ print(f"Equivalent: {report.tree_months:.2f} tree-months to offset")
 - **Coverage**: 67% of patterns actively detect in real-world commits
 - **Accuracy**: 100% true positive rate for green-aware commits
 - **Categories**: 15 distinct sustainability domains covered
-- **Keywords**:
+- **Keywords**: 332 detection terms across all patterns
 
 ## GSF Pattern Categories
 
-**
+**124 patterns across 15 categories:**
 
 ### 1. Cloud (40 patterns)
 Auto-scaling, serverless computing, right-sizing instances, region selection for renewable energy, spot instances, idle resource detection, cloud-native architectures
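The README quick-start excerpt above breaks off right after `commit_msg` is defined. A minimal sketch of how the imported helpers might be called; the import line, the pattern count, and `commit_msg` come from the diff, while the helpers' exact return values are assumptions:

```python
# Hedged sketch: continues the README quick-start shown in the hunk above.
# is_green_aware() and get_pattern_by_keywords() signatures are assumed;
# only the import line is confirmed by this diff.
from greenmining import GSF_PATTERNS, is_green_aware, get_pattern_by_keywords

print(f"Total patterns: {len(GSF_PATTERNS)}")  # 124 patterns across 15 categories

commit_msg = "Optimize Redis caching to reduce energy consumption"
print(is_green_aware(commit_msg))           # expected True: caching + energy keywords
print(get_pattern_by_keywords(commit_msg))  # expected: the matching GSF pattern(s)
```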
--- greenmining-1.1.7/greenmining/analyzers/statistical_analyzer.py
+++ greenmining-1.1.9/greenmining/analyzers/statistical_analyzer.py
@@ -135,38 +135,6 @@ class StatisticalAnalyzer:
             "significant": bool(p_value < 0.05),
         }
 
-    def pattern_adoption_rate_analysis(self, commits_df: pd.DataFrame) -> Dict[str, Any]:
-        # Analyze pattern adoption rates over repository lifetime.
-        results = {}
-
-        for pattern in commits_df["pattern"].unique():
-            pattern_commits = commits_df[commits_df["pattern"] == pattern].sort_values("date")
-
-            if len(pattern_commits) == 0:
-                continue
-
-            # Time to first adoption
-            first_adoption = pattern_commits.iloc[0]["date"]
-            repo_start = commits_df["date"].min()
-            ttfa_days = (first_adoption - repo_start).days
-
-            # Adoption frequency over time
-            monthly_adoption = pattern_commits.set_index("date").resample("ME").size()
-
-            # Pattern stickiness (months with at least one adoption)
-            total_months = len(commits_df.set_index("date").resample("ME").size())
-            active_months = len(monthly_adoption[monthly_adoption > 0])
-            stickiness = active_months / total_months if total_months > 0 else 0
-
-            results[pattern] = {
-                "ttfa_days": ttfa_days,
-                "total_adoptions": len(pattern_commits),
-                "stickiness": stickiness,
-                "monthly_adoption_rate": monthly_adoption.mean(),
-            }
-
-        return results
-
     def _interpret_correlations(self, significant_pairs: List[Dict[str, Any]]) -> str:
         # Generate interpretation of correlation results.
         if not significant_pairs:
--- /dev/null
+++ greenmining-1.1.9/greenmining/config.py
@@ -0,0 +1,91 @@
+import os
+from pathlib import Path
+from typing import Any, Dict, List
+
+from dotenv import load_dotenv
+
+
+def _load_yaml_config(yaml_path: Path) -> Dict[str, Any]:
+    # Load configuration from YAML file if it exists.
+    if not yaml_path.exists():
+        return {}
+    try:
+        import yaml
+
+        with open(yaml_path, "r") as f:
+            return yaml.safe_load(f) or {}
+    except ImportError:
+        return {}
+    except Exception:
+        return {}
+
+
+class Config:
+    # Configuration class for loading from env vars and YAML.
+
+    def __init__(self, env_file: str = ".env", yaml_file: str = "greenmining.yaml"):
+        # Initialize configuration from environment and YAML file.
+        env_path = Path(env_file)
+        if env_path.exists():
+            load_dotenv(env_path)
+        else:
+            load_dotenv()
+
+        # Load YAML config
+        yaml_path = Path(yaml_file)
+        self._yaml_config = _load_yaml_config(yaml_path)
+
+        # GitHub API Configuration
+        self.GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
+        if not self.GITHUB_TOKEN or self.GITHUB_TOKEN == "your_github_pat_here":
+            raise ValueError("GITHUB_TOKEN not set. Please set it in .env file or environment.")
+
+        # Search Configuration (YAML: sources.search.*)
+        yaml_search = self._yaml_config.get("sources", {}).get("search", {})
+
+        self.SUPPORTED_LANGUAGES: List[str] = yaml_search.get(
+            "languages",
+            [
+                "Python",
+                "JavaScript",
+                "TypeScript",
+                "Java",
+                "C++",
+                "C#",
+                "Go",
+                "Rust",
+                "PHP",
+                "Ruby",
+                "Swift",
+                "Kotlin",
+                "Scala",
+                "R",
+                "MATLAB",
+                "Dart",
+                "Lua",
+                "Perl",
+                "Haskell",
+                "Elixir",
+            ],
+        )
+
+        # Repository Limits
+        self.MIN_STARS = yaml_search.get("min_stars", int(os.getenv("MIN_STARS", "100")))
+        self.MAX_REPOS = int(os.getenv("MAX_REPOS", "100"))
+
+        # Output Configuration (YAML: output.directory)
+        yaml_output = self._yaml_config.get("output", {})
+        self.OUTPUT_DIR = Path(yaml_output.get("directory", os.getenv("OUTPUT_DIR", "./data")))
+        self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+        # File Paths
+        self.REPOS_FILE = self.OUTPUT_DIR / "repositories.json"
+
+    def __repr__(self) -> str:
+        # String representation of configuration (hiding sensitive data).
+        return (
+            f"Config("
+            f"MAX_REPOS={self.MAX_REPOS}, "
+            f"OUTPUT_DIR={self.OUTPUT_DIR}"
+            f")"
+        )
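Since 1.1.9 drops the module-level `get_config()` singleton (the deleted 1.1.7 `config.py` appears at the end of this diff), callers construct `Config` directly. A minimal usage sketch based only on the attributes visible in the hunk above:

```python
# Hedged sketch: driving the slimmed-down 1.1.9 Config shown above.
# Raises ValueError unless GITHUB_TOKEN is set in .env or the environment.
from greenmining.config import Config

config = Config(env_file=".env", yaml_file="greenmining.yaml")

print(config.MIN_STARS)            # 100 unless YAML sources.search.min_stars or MIN_STARS overrides it
print(config.MAX_REPOS)            # 100 unless the MAX_REPOS env var is set
print(config.SUPPORTED_LANGUAGES)  # the 20-language default list from the hunk above
print(config.OUTPUT_DIR)           # ./data by default; created during __init__
print(config.REPOS_FILE)           # <OUTPUT_DIR>/repositories.json
print(config)                      # repr deliberately omits the token
```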
--- greenmining-1.1.7/greenmining/controllers/repository_controller.py
+++ greenmining-1.1.9/greenmining/controllers/repository_controller.py
@@ -1,6 +1,9 @@
-# Repository Controller - Handles repository fetching operations.
-
-
+# Repository Controller - Handles repository fetching + cloning operations.
+import os
+import re
+import shutil
+from pathlib import Path
+from typing import List, Dict
 
 from greenmining.config import Config
 from greenmining.models.repository import Repository
@@ -15,23 +18,81 @@ class RepositoryController:
         # Initialize controller with configuration.
         self.config = config
         self.graphql_fetcher = GitHubGraphQLFetcher(config.GITHUB_TOKEN)
-
-
-
-
-
-
-
-
-
-
-
-
+        self.repos_dir = Path.cwd() / "greenmining_repos"
+
+    def _sanitize_repo_name(self, repo: Repository, index: int = 0) -> str:
+        """Safe unique dir name: owner_repo[_index]. Handles case collisions."""
+        base = re.sub(r'[^a-z0-9-]', '_', repo.full_name.replace('/', '_').lower())
+        name = f"{base}_{index}" if index else base
+        path = self.repos_dir / name
+        counter = 1
+        while path.exists():
+            name = f"{base}_{counter}"
+            path = self.repos_dir / name
+            counter += 1
+        return name
+
+    def clone_repositories(
+        self,
+        repositories: List[Repository],
+        github_token: str = None,
+        cleanup: bool = True,
+        depth: int = 1  # Shallow clone
+    ) -> List[Dict]:
+        """Clone repos to ./greenmining_repos/ with unique sanitized names."""
+        self.repos_dir.mkdir(exist_ok=True)
+        if cleanup:
+            shutil.rmtree(self.repos_dir, ignore_errors=True)
+            self.repos_dir.mkdir(exist_ok=True)
+            colored_print(f"Cleaned {self.repos_dir}", "yellow")
+
+        results = []
+        for i, repo in enumerate(repositories, 1):
+            safe_name = self._sanitize_repo_name(repo, i)
+            clone_path = self.repos_dir / safe_name
+
+            colored_print(f"[{i}/{len(repositories)}] Cloning {repo.full_name} → {safe_name}", "cyan")
+
+            url = f"https://{github_token}@github.com/{repo.full_name}.git" if github_token else repo.url
+            cmd = ["git", "clone", f"--depth={depth}", "-v", url, str(clone_path)]
+
+            import subprocess
+            try:
+                subprocess.check_call(cmd, cwd=self.repos_dir.parent)
+                colored_print(f"{safe_name}", "green")
+                results.append({
+                    "full_name": repo.full_name,
+                    "local_path": str(clone_path),
+                    "success": True
+                })
+            except subprocess.CalledProcessError as e:
+                colored_print(f"{safe_name}: {e}", "red")
+                results.append({
+                    "full_name": repo.full_name,
+                    "local_path": str(clone_path),
+                    "success": False,
+                    "error": str(e)
+                })
+
+        # Save map for analyze_repositories
+        save_json_file(results, self.repos_dir / "clone_results.json")
+        success_rate = sum(1 for r in results if r["success"]) / len(results) * 100
+        colored_print(f"Cloned: {success_rate:.1f}% ({self.repos_dir}/clone_results.json)", "green")
+        return results
+
+
+
+
+
+    def fetch_repositories(self, max_repos: int = None, min_stars: int = None,
+                           languages: list[str] = None, keywords: str = None,
+                           created_after: str = None, created_before: str = None,
+                           pushed_after: str = None, pushed_before: str = None) -> list[Repository]:
         # Fetch repositories from GitHub using GraphQL API.
         max_repos = max_repos or self.config.MAX_REPOS
         min_stars = min_stars or self.config.MIN_STARS
         languages = languages or self.config.SUPPORTED_LANGUAGES
-        keywords = keywords
+        keywords = keywords
 
         colored_print(f"Fetching up to {max_repos} repositories...", "cyan")
         colored_print(f"  Keywords: {keywords}", "cyan")
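A short sketch of the resulting fetch-then-clone flow, combining the two method signatures from the hunks above; the filter values and the cloned-repository count are illustrative:

```python
# Hedged sketch: 1.1.9 fetch + shallow-clone flow per the hunks above.
from greenmining.config import Config
from greenmining.controllers.repository_controller import RepositoryController

config = Config()
controller = RepositoryController(config)

repos = controller.fetch_repositories(max_repos=5, min_stars=500, keywords="microservices")

# depth=1 shallow clones land in ./greenmining_repos/<owner_repo_N>/ and a
# per-repo success map is written to greenmining_repos/clone_results.json.
results = controller.clone_repositories(repos, github_token=config.GITHUB_TOKEN)
for r in results:
    print(r["full_name"], "->", r["local_path"], "ok" if r["success"] else r["error"])
```

Note that `cleanup=True` (the default) wipes `greenmining_repos/` before cloning; pass `cleanup=False` to keep clones from earlier runs.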
--- greenmining-1.1.7/greenmining/energy/codecarbon_meter.py
+++ greenmining-1.1.9/greenmining/energy/codecarbon_meter.py
@@ -124,24 +124,3 @@ class CodeCarbonMeter(EnergyMeter):
             end_time=datetime.fromtimestamp(end_time),
         )
 
-    def get_carbon_intensity(self) -> Optional[float]:
-        # Get current carbon intensity for the configured region.
-        if not self._codecarbon_available:
-            return None
-
-        try:
-            from codecarbon import EmissionsTracker
-
-            # Create temporary tracker to get carbon intensity
-            tracker = EmissionsTracker(
-                project_name="carbon_check",
-                country_iso_code=self.country_iso_code,
-                save_to_file=False,
-                log_level="error",
-            )
-            tracker.start()
-            tracker.stop()
-
-            return getattr(tracker, "_carbon_intensity", None)
-        except Exception:
-            return None
--- greenmining-1.1.7/greenmining/gsf_patterns.py
+++ greenmining-1.1.9/greenmining/gsf_patterns.py
@@ -254,6 +254,35 @@ GSF_PATTERNS = {
         "description": "Choose hardware optimized for energy efficiency",
         "sci_impact": "Direct reduction in energy consumption",
     },
+    "match_preconfigured_server": {
+        "name": "Match Utilization Requirements with Pre-configured Servers",
+        "category": "cloud",
+        "keywords": [
+            "pre-configured server",
+            "energy proportionality",
+            "server utilization",
+            "oversized server",
+            "underutilized server",
+            "server consolidation",
+        ],
+        "description": "Select pre-configured servers that match utilization needs; one highly utilized server is more energy-efficient than two underutilized ones",
+        "sci_impact": "Higher utilization improves energy proportionality; fewer servers reduces embodied carbon",
+    },
+    "optimize_customer_device_impact": {
+        "name": "Optimize Impact on Customer Devices and Equipment",
+        "category": "cloud",
+        "keywords": [
+            "customer device",
+            "backward compatible",
+            "backwards compatible",
+            "older hardware",
+            "device lifetime",
+            "older browser",
+            "end-of-life hardware",
+        ],
+        "description": "Design software to extend customer hardware lifetimes through backward compatibility with older devices, browsers, and operating systems",
+        "sci_impact": "Extending device lifetimes reduces embodied carbon; optimizing for older hardware may also reduce energy intensity",
+    },
     # ==================== WEB PATTERNS (15+) ====================
     "avoid_chaining_requests": {
         "name": "Avoid Chaining Critical Requests",
@@ -1555,6 +1584,18 @@ GREEN_KEYWORDS = [
     "workload",
     "overhead",
     "footprint",
+    # Server utilization & customer device patterns
+    "pre-configured server",
+    "energy proportionality",
+    "server consolidation",
+    "underutilized server",
+    "oversized server",
+    "backward compatible",
+    "backwards compatible",
+    "customer device",
+    "device lifetime",
+    "older browser",
+    "end-of-life hardware",
 ]
 
 
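Both new entries follow the existing `GSF_PATTERNS` dict shape (name, category, keywords, description, sci_impact), so they are reachable with plain dictionary access; a small sketch grounded in the hunk above:

```python
# Sketch: inspecting the two GSF patterns added in 1.1.9.
# The dict structure is taken directly from the hunk above.
from greenmining import GSF_PATTERNS

for key in ("match_preconfigured_server", "optimize_customer_device_impact"):
    pattern = GSF_PATTERNS[key]
    print(f"{pattern['name']} [{pattern['category']}]")
    print("  keywords:", ", ".join(pattern["keywords"]))
    print("  SCI impact:", pattern["sci_impact"])
```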
--- greenmining-1.1.7/greenmining/services/commit_extractor.py
+++ greenmining-1.1.9/greenmining/services/commit_extractor.py
@@ -2,21 +2,17 @@
 
 from __future__ import annotations
 
-import json
 from datetime import datetime, timedelta
 from pathlib import Path
-from typing import Any
+from typing import Any
 
 from github import Github
 from tqdm import tqdm
 
-from greenmining.config import get_config
 from greenmining.models.repository import Repository
 from greenmining.utils import (
     colored_print,
     format_timestamp,
-    load_json_file,
-    print_banner,
     retry_on_exception,
     save_json_file,
 )
@@ -110,8 +106,7 @@ class CommitExtractor:
         try:
             # Get repository from GitHub API
             if not self.github:
-
-                self.github = Github(config.GITHUB_TOKEN)
+                raise ValueError("github_token is required for commit extraction")
 
             gh_repo = self.github.get_repo(repo_name)
 
@@ -143,40 +138,6 @@ class CommitExtractor:
 
         return commits
 
-    def _extract_commit_metadata(self, commit, repo_name: str) -> dict[str, Any]:
-        # Extract metadata from commit object.
-        # Get modified files
-        files_changed = []
-        lines_added = 0
-        lines_deleted = 0
-
-        try:
-            for modified_file in commit.modified_files:
-                files_changed.append(modified_file.filename)
-                lines_added += modified_file.added_lines
-                lines_deleted += modified_file.deleted_lines
-        except Exception:
-            pass
-
-        return {
-            "commit_id": commit.hash,
-            "repo_name": repo_name,
-            "date": commit.committer_date.isoformat(),
-            "author": commit.author.name,
-            "author_email": commit.author.email,
-            "message": commit.msg.strip(),
-            "files_changed": files_changed[:20],  # Limit to 20 files
-            "lines_added": lines_added,
-            "lines_deleted": lines_deleted,
-            "insertions": lines_added,
-            "deletions": lines_deleted,
-            "is_merge": commit.merge,
-            "branches": (
-                list(commit.branches) if hasattr(commit, "branches") and commit.branches else []
-            ),
-            "in_main_branch": commit.in_main_branch if hasattr(commit, "in_main_branch") else True,
-        }
-
     def _extract_commit_metadata_from_github(self, commit, repo_name: str) -> dict[str, Any]:
         # Extract metadata from GitHub API commit object.
         # Get modified files and stats
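The middle hunk above is a behavior change: 1.1.7 silently built a client from the global config when `self.github` was unset, while 1.1.9 raises instead, so the token must be supplied when the extractor is constructed. A hedged sketch of the underlying PyGithub calls the class relies on (the repository name is a hypothetical example; `CommitExtractor`'s constructor is not shown in this diff):

```python
# Hedged sketch: 1.1.9 requires an authenticated client up front; there is
# no fallback to config.GITHUB_TOKEN during extraction anymore.
import os
from github import Github  # PyGithub, as imported in the hunk above

gh = Github(os.environ["GITHUB_TOKEN"])       # authenticate explicitly
gh_repo = gh.get_repo("octocat/Hello-World")  # hypothetical repo_name
print(gh_repo.full_name, gh_repo.pushed_at)
```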
--- greenmining-1.1.7/greenmining/services/data_aggregator.py
+++ greenmining-1.1.9/greenmining/services/data_aggregator.py
@@ -2,26 +2,21 @@
 
 from __future__ import annotations
 
-import json
 from collections import defaultdict
 from pathlib import Path
-from typing import Any
+from typing import Any
 
 import pandas as pd
 
 from greenmining.analyzers import (
     StatisticalAnalyzer,
     TemporalAnalyzer,
-    QualitativeAnalyzer,
 )
-from greenmining.config import get_config
 from greenmining.models.repository import Repository
 from greenmining.utils import (
     colored_print,
     format_number,
     format_percentage,
-    load_json_file,
-    print_banner,
     save_csv_file,
     save_json_file,
 )
--- greenmining-1.1.7/greenmining/services/data_analyzer.py
+++ greenmining-1.1.9/greenmining/services/data_analyzer.py
@@ -2,18 +2,15 @@
 
 from __future__ import annotations
 
-import json
-import re
 from collections import Counter
 from pathlib import Path
-from typing import Any
+from typing import Any
 
 from tqdm import tqdm
 
 from greenmining.analyzers import (
     CodeDiffAnalyzer,
 )
-from greenmining.config import get_config
 from greenmining.gsf_patterns import (
     GREEN_KEYWORDS,
     GSF_PATTERNS,
@@ -22,11 +19,7 @@ from greenmining.gsf_patterns import (
 )
 from greenmining.utils import (
     colored_print,
-    create_checkpoint,
     format_timestamp,
-    load_checkpoint,
-    load_json_file,
-    print_banner,
     save_json_file,
 )
 
@@ -156,55 +149,6 @@ class DataAnalyzer:
 
         return result
 
-    def _check_green_awareness(self, message: str, files: list[str]) -> tuple[bool, Optional[str]]:
-        # Check if commit explicitly mentions green/energy concerns.
-        # Check message for green keywords
-        for keyword in self.GREEN_KEYWORDS:
-            if keyword in message:
-                # Extract context around keyword
-                pattern = rf".{{0,30}}{re.escape(keyword)}.{{0,30}}"
-                match = re.search(pattern, message, re.IGNORECASE)
-                if match:
-                    evidence = match.group(0).strip()
-                    return True, f"Keyword '{keyword}': {evidence}"
-
-        # Check file names for patterns
-        cache_files = [f for f in files if "cache" in f or "redis" in f]
-        if cache_files:
-            return True, f"Modified cache-related file: {cache_files[0]}"
-
-        perf_files = [f for f in files if "performance" in f or "optimization" in f]
-        if perf_files:
-            return True, f"Modified performance file: {perf_files[0]}"
-
-        return False, None
-
-    def _detect_known_pattern(self, message: str, files: list[str]) -> tuple[Optional[str], str]:
-        # Detect known green software pattern.
-        matches = []
-
-        # Check each pattern
-        for pattern_name, keywords in self.GREEN_PATTERNS.items():
-            for keyword in keywords:
-                if keyword in message:
-                    # Calculate confidence based on specificity
-                    confidence = "HIGH" if len(keyword) > 10 else "MEDIUM"
-                    matches.append((pattern_name, confidence, len(keyword)))
-
-        # Check file names for pattern hints
-        all_files = " ".join(files)
-        for pattern_name, keywords in self.GREEN_PATTERNS.items():
-            for keyword in keywords:
-                if keyword in all_files:
-                    matches.append((pattern_name, "MEDIUM", len(keyword)))
-
-        if not matches:
-            return "NONE DETECTED", "NONE"
-
-        # Return most specific match (longest keyword)
-        matches.sort(key=lambda x: x[2], reverse=True)
-        return matches[0][0], matches[0][1]
-
     def save_results(self, results: list[dict[str, Any]], output_file: Path):
         # Save analysis results to JSON file.
         # Calculate summary statistics
--- greenmining-1.1.7/greenmining/services/local_repo_analyzer.py
+++ greenmining-1.1.9/greenmining/services/local_repo_analyzer.py
@@ -5,13 +5,12 @@ from __future__ import annotations
 import os
 import re
 import shutil
-import subprocess
 import tempfile
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass, field
 from datetime import datetime, timedelta
 from pathlib import Path
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional
 
 from pydriller import Repository
 from pydriller.metrics.process.change_set import ChangeSet
--- greenmining-1.1.7/greenmining/services/reports.py
+++ greenmining-1.1.9/greenmining/services/reports.py
@@ -1,20 +1,15 @@
 # Report generation for green mining analysis.
-"""Report generation module for GreenMining analysis results."""
 
 from __future__ import annotations
 
-import json
 from datetime import datetime
 from pathlib import Path
-from typing import Any
+from typing import Any
 
-from greenmining.config import get_config
 from greenmining.utils import (
     colored_print,
     format_number,
     format_percentage,
-    load_json_file,
-    print_banner,
 )
 
 
--- greenmining-1.1.7/greenmining/utils.py
+++ greenmining-1.1.9/greenmining/utils.py
@@ -38,41 +38,12 @@ def save_json_file(data: dict[str, Any], path: Path, indent: int = 2) -> None:
         json.dump(data, f, indent=indent, ensure_ascii=False)
 
 
-def load_csv_file(path: Path) -> pd.DataFrame:
-    # Load CSV file as pandas DataFrame.
-    if not path.exists():
-        raise FileNotFoundError(f"File not found: {path}")
-
-    return pd.read_csv(path)
-
-
 def save_csv_file(df: pd.DataFrame, path: Path) -> None:
     # Save DataFrame to CSV file.
     path.parent.mkdir(parents=True, exist_ok=True)
     df.to_csv(path, index=False, encoding="utf-8")
 
 
-def estimate_tokens(text: str) -> int:
-    # Estimate number of tokens in text.
-    return len(text) // 4
-
-
-def estimate_cost(tokens: int, model: str = "claude-sonnet-4-20250514") -> float:
-    # Estimate API cost based on token usage.
-    # Claude Sonnet 4 pricing (as of Dec 2024)
-    # Input: $3 per million tokens
-    # Output: $15 per million tokens
-    # Average estimate: assume 50% input, 50% output
-
-    if "sonnet" in model.lower():
-        input_cost = 3.0 / 1_000_000  # per token
-        output_cost = 15.0 / 1_000_000  # per token
-        avg_cost = (input_cost + output_cost) / 2
-        return tokens * avg_cost
-
-    return 0.0
-
-
 def retry_on_exception(
     max_retries: int = 3,
     delay: float = 2.0,
@@ -124,14 +95,6 @@ def colored_print(text: str, color: str = "white") -> None:
     print(f"{color_code}{text}{Style.RESET_ALL}")
 
 
-def handle_github_rate_limit(response) -> None:
-    # Handle GitHub API rate limiting.
-    if hasattr(response, "status") and response.status == 403:
-        colored_print("GitHub API rate limit exceeded!", "red")
-        colored_print("Please wait or use an authenticated token.", "yellow")
-        raise Exception("GitHub API rate limit exceeded")
-
-
 def format_number(num: int) -> str:
     # Format large numbers with thousand separators.
     return f"{num:,}"
@@ -140,53 +103,3 @@ def format_number(num: int) -> str:
 def format_percentage(value: float, decimals: int = 1) -> str:
     # Format percentage value.
     return f"{value:.{decimals}f}%"
-
-
-def format_duration(seconds: float) -> str:
-    # Format duration in human-readable format.
-    if seconds < 60:
-        return f"{int(seconds)}s"
-    elif seconds < 3600:
-        minutes = int(seconds / 60)
-        secs = int(seconds % 60)
-        return f"{minutes}m {secs}s"
-    else:
-        hours = int(seconds / 3600)
-        minutes = int((seconds % 3600) / 60)
-        return f"{hours}h {minutes}m"
-
-
-def truncate_text(text: str, max_length: int = 100) -> str:
-    # Truncate text to maximum length.
-    if len(text) <= max_length:
-        return text
-    return text[: max_length - 3] + "..."
-
-
-def create_checkpoint(checkpoint_file: Path, data: dict[str, Any]) -> None:
-    # Create checkpoint file for resuming operations.
-    save_json_file(data, checkpoint_file)
-    colored_print(f"Checkpoint saved: {checkpoint_file}", "green")
-
-
-def load_checkpoint(checkpoint_file: Path) -> Optional[dict[str, Any]]:
-    # Load checkpoint data if exists.
-    if checkpoint_file.exists():
-        try:
-            return load_json_file(checkpoint_file)
-        except Exception as e:
-            colored_print(f"Failed to load checkpoint: {e}", "yellow")
-    return None
-
-
-def print_banner(title: str) -> None:
-    # Print formatted banner.
-    colored_print("\n" + "=" * 60, "cyan")
-    colored_print(f"  {title}", "cyan")
-    colored_print("=" * 60 + "\n", "cyan")
-
-
-def print_section(title: str) -> None:
-    # Print section header.
-    colored_print(f"\n  {title}", "blue")
-    colored_print("-" * 60, "blue")
--- greenmining-1.1.7/PKG-INFO
+++ greenmining-1.1.9/greenmining.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: greenmining
-Version: 1.1.7
+Version: 1.1.9
 Summary: An empirical Python library for Mining Software Repositories (MSR) in Green IT research
 Author-email: Adam Bouafia <a.bouafia@student.vu.nl>
 License: MIT
@@ -68,9 +68,9 @@ An empirical Python library for Mining Software Repositories (MSR) in Green IT research
 
 `greenmining` is a research-grade Python library designed for **empirical Mining Software Repositories (MSR)** studies in **Green IT**. It enables researchers and practitioners to:
 
-- **Mine repositories at scale** - Fetch and analyze GitHub repositories via GraphQL API with configurable filters
-
-- **Classify green commits** - Detect
+- **Mine repositories at scale** - Search, Fetch and analyze GitHub repositories via GraphQL API with configurable filters
+
+- **Classify green commits** - Detect 124 sustainability patterns from the Green Software Foundation (GSF) catalog
 - **Analyze any repository by URL** - Direct Git-based analysis with support for private repositories
 - **Measure energy consumption** - RAPL, CodeCarbon, and CPU Energy Meter backends for power profiling
 - **Carbon footprint reporting** - CO2 emissions calculation with 20+ country profiles and cloud region support
@@ -113,7 +113,7 @@ docker pull adambouafia/greenmining:latest
 from greenmining import GSF_PATTERNS, is_green_aware, get_pattern_by_keywords
 
 # Check available patterns
-print(f"Total patterns: {len(GSF_PATTERNS)}") #
+print(f"Total patterns: {len(GSF_PATTERNS)}") # 124 patterns across 15 categories
 
 # Detect green awareness in commit messages
 commit_msg = "Optimize Redis caching to reduce energy consumption"
@@ -670,8 +670,8 @@ config = Config(
 
 ### Core Capabilities
 
-- **Pattern Detection**:
-- **Keyword Analysis**:
+- **Pattern Detection**: 124 sustainability patterns across 15 categories from the GSF catalog
+- **Keyword Analysis**: 332 green software detection keywords
 - **Repository Fetching**: GraphQL API with date, star, and language filters
 - **URL-Based Analysis**: Direct Git-based analysis from GitHub URLs (HTTPS and SSH)
 - **Batch Processing**: Parallel analysis of multiple repositories with configurable workers
@@ -739,7 +739,7 @@ print(f"Equivalent: {report.tree_months:.2f} tree-months to offset")
 
 ### Pattern Database
 
-**
+**124 green software patterns based on:**
 - Green Software Foundation (GSF) Patterns Catalog
 - VU Amsterdam 2024 research on ML system sustainability
 - ICSE 2024 conference papers on sustainable software
@@ -749,11 +749,11 @@ print(f"Equivalent: {report.tree_months:.2f} tree-months to offset")
 - **Coverage**: 67% of patterns actively detect in real-world commits
 - **Accuracy**: 100% true positive rate for green-aware commits
 - **Categories**: 15 distinct sustainability domains covered
-- **Keywords**:
+- **Keywords**: 332 detection terms across all patterns
 
 ## GSF Pattern Categories
 
-**
+**124 patterns across 15 categories:**
 
 ### 1. Cloud (40 patterns)
 Auto-scaling, serverless computing, right-sizing instances, region selection for renewable energy, spot instances, idle resource detection, cloud-native architectures
--- greenmining-1.1.7/greenmining.egg-info/SOURCES.txt
+++ greenmining-1.1.9/greenmining.egg-info/SOURCES.txt
@@ -6,7 +6,6 @@ pyproject.toml
 setup.py
 ./greenmining/__init__.py
 ./greenmining/__main__.py
-./greenmining/__version__.py
 ./greenmining/config.py
 ./greenmining/gsf_patterns.py
 ./greenmining/utils.py
@@ -37,13 +36,11 @@ setup.py
 ./greenmining/services/commit_extractor.py
 ./greenmining/services/data_aggregator.py
 ./greenmining/services/data_analyzer.py
-./greenmining/services/github_fetcher.py
 ./greenmining/services/github_graphql_fetcher.py
 ./greenmining/services/local_repo_analyzer.py
 ./greenmining/services/reports.py
 greenmining/__init__.py
 greenmining/__main__.py
-greenmining/__version__.py
 greenmining/config.py
 greenmining/gsf_patterns.py
 greenmining/utils.py
@@ -79,7 +76,6 @@ greenmining/services/__init__.py
 greenmining/services/commit_extractor.py
 greenmining/services/data_aggregator.py
 greenmining/services/data_analyzer.py
-greenmining/services/github_fetcher.py
 greenmining/services/github_graphql_fetcher.py
 greenmining/services/local_repo_analyzer.py
 greenmining/services/reports.py
--- greenmining-1.1.7/pyproject.toml
+++ greenmining-1.1.9/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "greenmining"
-version = "1.1.7"
+version = "1.1.9"
 description = "An empirical Python library for Mining Software Repositories (MSR) in Green IT research"
 readme = "README.md"
 requires-python = ">=3.9"
--- greenmining-1.1.7/greenmining/config.py
+++ /dev/null
@@ -1,200 +0,0 @@
-import os
-from pathlib import Path
-from typing import Any, Dict, List, Optional
-
-from dotenv import load_dotenv
-
-
-def _load_yaml_config(yaml_path: Path) -> Dict[str, Any]:
-    # Load configuration from YAML file if it exists.
-    if not yaml_path.exists():
-        return {}
-    try:
-        import yaml
-
-        with open(yaml_path, "r") as f:
-            return yaml.safe_load(f) or {}
-    except ImportError:
-        return {}
-    except Exception:
-        return {}
-
-
-class Config:
-    # Configuration class for loading from env vars and YAML.
-
-    def __init__(self, env_file: str = ".env", yaml_file: str = "greenmining.yaml"):
-        # Initialize configuration from environment and YAML file.
-        # Load environment variables
-        env_path = Path(env_file)
-        if env_path.exists():
-            load_dotenv(env_path)
-        else:
-            load_dotenv()  # Load from system environment
-
-        # Load YAML config (takes precedence for certain options)
-        yaml_path = Path(yaml_file)
-        self._yaml_config = _load_yaml_config(yaml_path)
-
-        # GitHub API Configuration
-        self.GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
-        if not self.GITHUB_TOKEN or self.GITHUB_TOKEN == "your_github_pat_here":
-            raise ValueError("GITHUB_TOKEN not set. Please set it in .env file or environment.")
-
-        # Analysis Type
-        self.ANALYSIS_TYPE = "keyword_heuristic"
-
-        # Search and Processing Configuration (YAML: sources.search.keywords)
-        yaml_search = self._yaml_config.get("sources", {}).get("search", {})
-        self.GITHUB_SEARCH_KEYWORDS = yaml_search.get(
-            "keywords", ["microservices", "microservice-architecture", "cloud-native"]
-        )
-
-        # Supported Languages (YAML: sources.search.languages)
-        self.SUPPORTED_LANGUAGES = yaml_search.get(
-            "languages",
-            [
-                "Java",
-                "Python",
-                "Go",
-                "JavaScript",
-                "TypeScript",
-                "C#",
-                "Rust",
-            ],
-        )
-
-        # Repository and Commit Limits (YAML: extraction.*)
-        yaml_extraction = self._yaml_config.get("extraction", {})
-        self.MIN_STARS = yaml_search.get("min_stars", int(os.getenv("MIN_STARS", "100")))
-        self.MAX_REPOS = int(os.getenv("MAX_REPOS", "100"))
-        self.COMMITS_PER_REPO = yaml_extraction.get(
-            "max_commits", int(os.getenv("COMMITS_PER_REPO", "50"))
-        )
-        self.DAYS_BACK = yaml_extraction.get("days_back", int(os.getenv("DAYS_BACK", "730")))
-        self.SKIP_MERGES = yaml_extraction.get("skip_merges", True)
-
-        # Analysis Configuration (YAML: analysis.*)
-        yaml_analysis = self._yaml_config.get("analysis", {})
-        self.ENABLE_NLP_ANALYSIS = os.getenv("ENABLE_NLP_ANALYSIS", "false").lower() == "true"
-        self.ENABLE_TEMPORAL_ANALYSIS = (
-            os.getenv("ENABLE_TEMPORAL_ANALYSIS", "false").lower() == "true"
-        )
-        self.TEMPORAL_GRANULARITY = os.getenv("TEMPORAL_GRANULARITY", "quarter")
-        self.ENABLE_ML_FEATURES = os.getenv("ENABLE_ML_FEATURES", "false").lower() == "true"
-        self.VALIDATION_SAMPLE_SIZE = int(os.getenv("VALIDATION_SAMPLE_SIZE", "30"))
-
-        # PyDriller options (YAML: analysis.process_metrics, etc.)
-        self.PROCESS_METRICS_ENABLED = yaml_analysis.get(
-            "process_metrics", os.getenv("PROCESS_METRICS_ENABLED", "true").lower() == "true"
-        )
-        self.STRUCTURAL_METRICS_ENABLED = yaml_analysis.get(
-            "structural_metrics", os.getenv("STRUCTURAL_METRICS_ENABLED", "true").lower() == "true"
-        )
-        self.DMM_ENABLED = yaml_analysis.get(
-            "delta_maintainability", os.getenv("DMM_ENABLED", "true").lower() == "true"
-        )
-
-        # Temporal Filtering
-        self.CREATED_AFTER = os.getenv("CREATED_AFTER")
-        self.CREATED_BEFORE = os.getenv("CREATED_BEFORE")
-        self.PUSHED_AFTER = os.getenv("PUSHED_AFTER")
-        self.PUSHED_BEFORE = os.getenv("PUSHED_BEFORE")
-        self.COMMIT_DATE_FROM = os.getenv("COMMIT_DATE_FROM")
-        self.COMMIT_DATE_TO = os.getenv("COMMIT_DATE_TO")
-        self.MIN_COMMITS = int(os.getenv("MIN_COMMITS", "0"))
-        self.ACTIVITY_WINDOW_DAYS = int(os.getenv("ACTIVITY_WINDOW_DAYS", "730"))
-
-        # Analysis Configuration
-        self.BATCH_SIZE = int(os.getenv("BATCH_SIZE", "10"))
-
-        # Processing Configuration
-        self.TIMEOUT_SECONDS = int(os.getenv("TIMEOUT_SECONDS", "30"))
-        self.MAX_RETRIES = int(os.getenv("MAX_RETRIES", "3"))
-        self.RETRY_DELAY = 2
-        self.EXPONENTIAL_BACKOFF = True
-
-        # Output Configuration (YAML: output.directory)
-        yaml_output = self._yaml_config.get("output", {})
-        self.OUTPUT_DIR = Path(yaml_output.get("directory", os.getenv("OUTPUT_DIR", "./data")))
-        self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
-
-        # File Paths
-        self.REPOS_FILE = self.OUTPUT_DIR / "repositories.json"
-        self.COMMITS_FILE = self.OUTPUT_DIR / "commits.json"
-        self.ANALYSIS_FILE = self.OUTPUT_DIR / "analysis_results.json"
-        self.AGGREGATED_FILE = self.OUTPUT_DIR / "aggregated_statistics.json"
-        self.CSV_FILE = self.OUTPUT_DIR / "green_analysis_results.csv"
-        self.REPORT_FILE = self.OUTPUT_DIR / "green_microservices_analysis.md"
-        self.CHECKPOINT_FILE = self.OUTPUT_DIR / "checkpoint.json"
-
-        # Direct Repository URL Support (YAML: sources.urls)
-        yaml_urls = self._yaml_config.get("sources", {}).get("urls", [])
-        env_urls = self._parse_repository_urls(os.getenv("REPOSITORY_URLS", ""))
-        self.REPOSITORY_URLS: List[str] = yaml_urls if yaml_urls else env_urls
-
-        # Clone path (YAML: extraction.clone_path)
-        self.CLONE_PATH = Path(
-            yaml_extraction.get("clone_path", os.getenv("CLONE_PATH", "/tmp/greenmining_repos"))
-        )
-        self.CLEANUP_AFTER_ANALYSIS = os.getenv("CLEANUP_AFTER_ANALYSIS", "true").lower() == "true"
-
-        # Energy Measurement (YAML: energy.*)
-        yaml_energy = self._yaml_config.get("energy", {})
-        self.ENERGY_ENABLED = yaml_energy.get(
-            "enabled", os.getenv("ENERGY_ENABLED", "false").lower() == "true"
-        )
-        self.ENERGY_BACKEND = yaml_energy.get("backend", os.getenv("ENERGY_BACKEND", "rapl"))
-        self.CARBON_TRACKING = yaml_energy.get(
-            "carbon_tracking", os.getenv("CARBON_TRACKING", "false").lower() == "true"
-        )
-        self.COUNTRY_ISO = yaml_energy.get("country_iso", os.getenv("COUNTRY_ISO", "USA"))
-
-        # Power profiling (YAML: energy.power_profiling.*)
-        yaml_power = yaml_energy.get("power_profiling", {})
-        self.POWER_PROFILING_ENABLED = yaml_power.get("enabled", False)
-        self.POWER_TEST_COMMAND = yaml_power.get("test_command", None)
-        self.POWER_REGRESSION_THRESHOLD = yaml_power.get("regression_threshold", 5.0)
-
-        # Logging
-        self.VERBOSE = os.getenv("VERBOSE", "false").lower() == "true"
-        self.LOG_FILE = self.OUTPUT_DIR / "mining.log"
-
-    def _parse_repository_urls(self, urls_str: str) -> List[str]:
-        # Parse comma-separated repository URLs from environment variable.
-        if not urls_str:
-            return []
-        return [url.strip() for url in urls_str.split(",") if url.strip()]
-
-    def validate(self) -> bool:
-        # Validate that all required configuration is present.
-        required_attrs = ["GITHUB_TOKEN", "MAX_REPOS", "COMMITS_PER_REPO"]
-
-        for attr in required_attrs:
-            if not getattr(self, attr, None):
-                raise ValueError(f"Missing required configuration: {attr}")
-
-        return True
-
-    def __repr__(self) -> str:
-        # String representation of configuration (hiding sensitive data).
-        return (
-            f"Config("
-            f"MAX_REPOS={self.MAX_REPOS}, "
-            f"COMMITS_PER_REPO={self.COMMITS_PER_REPO}, "
-            f"BATCH_SIZE={self.BATCH_SIZE}, "
-            f"OUTPUT_DIR={self.OUTPUT_DIR}"
-            f")"
-        )
-
-
-# Global config instance
-_config_instance = None
-
-
-def get_config(env_file: str = ".env") -> Config:
-    # Get or create global configuration instance.
-    global _config_instance
-    if _config_instance is None:
-        _config_instance = Config(env_file)
-    return _config_instance