greenmining 1.1.9-py3-none-any.whl → 1.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- greenmining/__init__.py +29 -10
- greenmining/analyzers/__init__.py +0 -8
- greenmining/controllers/repository_controller.py +83 -88
- greenmining/services/local_repo_analyzer.py +1 -1
- greenmining-1.2.0.dist-info/METADATA +311 -0
- {greenmining-1.1.9.dist-info → greenmining-1.2.0.dist-info}/RECORD +9 -15
- greenmining/analyzers/power_regression.py +0 -211
- greenmining/analyzers/qualitative_analyzer.py +0 -394
- greenmining/analyzers/version_power_analyzer.py +0 -246
- greenmining/config.py +0 -91
- greenmining/presenters/__init__.py +0 -7
- greenmining/presenters/console_presenter.py +0 -143
- greenmining-1.1.9.dist-info/METADATA +0 -865
- {greenmining-1.1.9.dist-info → greenmining-1.2.0.dist-info}/WHEEL +0 -0
- {greenmining-1.1.9.dist-info → greenmining-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {greenmining-1.1.9.dist-info → greenmining-1.2.0.dist-info}/top_level.txt +0 -0
greenmining/__init__.py
CHANGED

@@ -1,6 +1,5 @@
-#
+# GreenMining - MSR library for Green IT research.
 
-from greenmining.config import Config
 from greenmining.controllers.repository_controller import RepositoryController
 from greenmining.gsf_patterns import (
     GREEN_KEYWORDS,
@@ -9,24 +8,23 @@ from greenmining.gsf_patterns import (
     is_green_aware,
 )
 
-__version__ = "1.
+__version__ = "1.2.0"
 
 
 def fetch_repositories(
     github_token: str,
-    max_repos: int =
-    min_stars: int =
+    max_repos: int = 100,
+    min_stars: int = 100,
     languages: list = None,
     keywords: str = None,
     created_after: str = None,
     created_before: str = None,
     pushed_after: str = None,
     pushed_before: str = None,
+    output_dir: str = "./data",
 ):
-    # Fetch repositories from GitHub
-
-    config.GITHUB_TOKEN = github_token
-    controller = RepositoryController(config)
+    # Fetch repositories from GitHub via GraphQL search.
+    controller = RepositoryController(github_token, output_dir=output_dir)
 
     return controller.fetch_repositories(
         max_repos=max_repos,
@@ -40,6 +38,27 @@ def fetch_repositories(
     )
 
 
+def clone_repositories(
+    repositories: list,
+    github_token: str = None,
+    output_dir: str = "./data",
+    cleanup_existing: bool = False,
+):
+    # Clone repositories into ./greenmining_repos with sanitized directory names.
+    # Args:
+    #     repositories: List of Repository objects (from fetch_repositories)
+    #     github_token: GitHub token (required for controller init)
+    #     output_dir: Output directory for metadata files
+    #     cleanup_existing: Remove existing greenmining_repos/ before cloning
+    token = github_token or "unused"
+    controller = RepositoryController(token, output_dir=output_dir)
+
+    return controller.clone_repositories(
+        repositories=repositories,
+        cleanup_existing=cleanup_existing,
+    )
+
+
 def analyze_repositories(
     urls: list,
     max_commits: int = 500,
@@ -99,12 +118,12 @@ def analyze_repositories(
 
 
 __all__ = [
-    "Config",
     "GSF_PATTERNS",
     "GREEN_KEYWORDS",
     "is_green_aware",
     "get_pattern_by_keywords",
     "fetch_repositories",
+    "clone_repositories",
     "analyze_repositories",
     "__version__",
 ]
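Taken together, the `__init__.py` changes drop the `Config`-based entry point in favor of plain arguments and add `clone_repositories` to the public API. A minimal sketch of the resulting 1.2.0 workflow, based only on the signatures in the diff above ("YOUR_GITHUB_TOKEN" is a placeholder):

```python
# Sketch of the 1.2.0 top-level API shown in the diff above; the defaults
# (max_repos=100, min_stars=100, output_dir="./data") are the new ones.
from greenmining import fetch_repositories, clone_repositories

repos = fetch_repositories(github_token="YOUR_GITHUB_TOKEN", max_repos=10)
paths = clone_repositories(repos)  # clones into ./greenmining_repos/
print(f"Cloned {len(paths)} repositories")
```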
greenmining/analyzers/__init__.py
CHANGED

@@ -3,20 +3,12 @@
 from .code_diff_analyzer import CodeDiffAnalyzer
 from .statistical_analyzer import StatisticalAnalyzer
 from .temporal_analyzer import TemporalAnalyzer
-from .qualitative_analyzer import QualitativeAnalyzer
-from .power_regression import PowerRegressionDetector, PowerRegression
 from .metrics_power_correlator import MetricsPowerCorrelator, CorrelationResult
-from .version_power_analyzer import VersionPowerAnalyzer, VersionPowerReport
 
 __all__ = [
     "CodeDiffAnalyzer",
     "StatisticalAnalyzer",
     "TemporalAnalyzer",
-    "QualitativeAnalyzer",
-    "PowerRegressionDetector",
-    "PowerRegression",
     "MetricsPowerCorrelator",
     "CorrelationResult",
-    "VersionPowerAnalyzer",
-    "VersionPowerReport",
 ]
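For downstream code, these removals mean `QualitativeAnalyzer`, `PowerRegressionDetector`, `PowerRegression`, `VersionPowerAnalyzer`, and `VersionPowerReport` can no longer be imported from `greenmining.analyzers` in 1.2.0. A guarded-import sketch for code that has to run against both versions (names taken from the diff above):

```python
# Analyzers kept in 1.2.0 import unconditionally; the removed ones are
# guarded so the same module loads under both 1.1.9 and 1.2.0.
from greenmining.analyzers import StatisticalAnalyzer, TemporalAnalyzer

try:
    from greenmining.analyzers import QualitativeAnalyzer  # removed in 1.2.0
except ImportError:
    QualitativeAnalyzer = None  # feature not available on 1.2.0
```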
greenmining/controllers/repository_controller.py
CHANGED

@@ -1,11 +1,9 @@
 # Repository Controller - Handles repository fetching + cloning operations.
-
+
 import re
 import shutil
 from pathlib import Path
-from typing import List, Dict
 
-from greenmining.config import Config
 from greenmining.models.repository import Repository
 from greenmining.services.github_graphql_fetcher import GitHubGraphQLFetcher
 from greenmining.utils import colored_print, load_json_file, save_json_file
@@ -14,86 +12,26 @@ from greenmining.utils import colored_print, load_json_file, save_json_file
 class RepositoryController:
     # Controller for GitHub repository operations using GraphQL API.
 
-    def __init__(self,
-        # Initialize controller with
-        self.
-        self.
+    def __init__(self, github_token: str, output_dir: str = "./data"):
+        # Initialize controller with GitHub token and output directory.
+        self.graphql_fetcher = GitHubGraphQLFetcher(github_token)
+        self.output_dir = Path(output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        self.repos_file = self.output_dir / "repositories.json"
         self.repos_dir = Path.cwd() / "greenmining_repos"
 
-    def
-
-
-
-
-
-
-
-
-
-
-
-    def clone_repositories(
-        self,
-        repositories: List[Repository],
-        github_token: str = None,
-        cleanup: bool = True,
-        depth: int = 1  # Shallow clone
-    ) -> List[Dict]:
-        """Clone repos to ./greenmining_repos/ with unique sanitized names."""
-        self.repos_dir.mkdir(exist_ok=True)
-        if cleanup:
-            shutil.rmtree(self.repos_dir, ignore_errors=True)
-            self.repos_dir.mkdir(exist_ok=True)
-            colored_print(f"Cleaned {self.repos_dir}", "yellow")
-
-        results = []
-        for i, repo in enumerate(repositories, 1):
-            safe_name = self._sanitize_repo_name(repo, i)
-            clone_path = self.repos_dir / safe_name
-
-            colored_print(f"[{i}/{len(repositories)}] Cloning {repo.full_name} → {safe_name}", "cyan")
-
-            url = f"https://{github_token}@github.com/{repo.full_name}.git" if github_token else repo.url
-            cmd = ["git", "clone", f"--depth={depth}", "-v", url, str(clone_path)]
-
-            import subprocess
-            try:
-                subprocess.check_call(cmd, cwd=self.repos_dir.parent)
-                colored_print(f"{safe_name}", "green")
-                results.append({
-                    "full_name": repo.full_name,
-                    "local_path": str(clone_path),
-                    "success": True
-                })
-            except subprocess.CalledProcessError as e:
-                colored_print(f"{safe_name}: {e}", "red")
-                results.append({
-                    "full_name": repo.full_name,
-                    "local_path": str(clone_path),
-                    "success": False,
-                    "error": str(e)
-                })
-
-        # Save map for analyze_repositories
-        save_json_file(results, self.repos_dir / "clone_results.json")
-        success_rate = sum(1 for r in results if r["success"]) / len(results) * 100
-        colored_print(f"Cloned: {success_rate:.1f}% ({self.repos_dir}/clone_results.json)", "green")
-        return results
-
-
-
-
-
-    def fetch_repositories(self, max_repos: int = None, min_stars: int = None,
-                           languages: list[str] = None, keywords: str = None,
-                           created_after: str = None, created_before: str = None,
-                           pushed_after: str = None, pushed_before: str = None) -> list[Repository]:
+    def fetch_repositories(
+        self,
+        max_repos: int = 100,
+        min_stars: int = 100,
+        languages: list[str] = None,
+        keywords: str = None,
+        created_after: str = None,
+        created_before: str = None,
+        pushed_after: str = None,
+        pushed_before: str = None,
+    ) -> list[Repository]:
         # Fetch repositories from GitHub using GraphQL API.
-        max_repos = max_repos or self.config.MAX_REPOS
-        min_stars = min_stars or self.config.MIN_STARS
-        languages = languages or self.config.SUPPORTED_LANGUAGES
-        keywords = keywords
-
         colored_print(f"Fetching up to {max_repos} repositories...", "cyan")
         colored_print(f"  Keywords: {keywords}", "cyan")
         colored_print(f"  Filters: min_stars={min_stars}", "cyan")
@@ -103,10 +41,11 @@ class RepositoryController:
                 f"  Created: {created_after or 'any'} to {created_before or 'any'}", "cyan"
             )
         if pushed_after or pushed_before:
-            colored_print(
+            colored_print(
+                f"  Pushed: {pushed_after or 'any'} to {pushed_before or 'any'}", "cyan"
+            )
 
         try:
-            # Execute GraphQL search
             repositories = self.graphql_fetcher.search_repositories(
                 keywords=keywords,
                 max_repos=max_repos,
@@ -118,12 +57,11 @@
                 pushed_before=pushed_before,
             )
 
-            # Save to file
            repo_dicts = [r.to_dict() for r in repositories]
-            save_json_file(repo_dicts, self.
+            save_json_file(repo_dicts, self.repos_file)
 
             colored_print(f"Fetched {len(repositories)} repositories", "green")
-            colored_print(f"  Saved to: {self.
+            colored_print(f"  Saved to: {self.repos_file}", "cyan")
 
             return repositories
 
@@ -131,12 +69,69 @@
             colored_print(f"Error fetching repositories: {e}", "red")
             raise
 
+    def clone_repositories(
+        self,
+        repositories: list[Repository],
+        cleanup_existing: bool = False,
+    ) -> list[Path]:
+        # Clone repositories into ./greenmining_repos with sanitized directory names.
+        self.repos_dir.mkdir(parents=True, exist_ok=True)
+
+        if cleanup_existing and self.repos_dir.exists():
+            shutil.rmtree(self.repos_dir)
+            self.repos_dir.mkdir(parents=True, exist_ok=True)
+
+        cloned_paths = []
+        colored_print(f"\nCloning {len(repositories)} repositories into {self.repos_dir}", "cyan")
+
+        for repo in repositories:
+            safe_name = self._sanitize_repo_name(repo)
+            local_path = self.repos_dir / safe_name
+
+            if local_path.exists():
+                colored_print(f"  Already exists: {safe_name}", "yellow")
+                cloned_paths.append(local_path)
+                continue
+
+            try:
+                url = repo.url if hasattr(repo, "url") else f"https://github.com/{repo.full_name}"
+                colored_print(f"  Cloning {repo.full_name} -> {safe_name}", "cyan")
+
+                import subprocess
+
+                subprocess.run(
+                    ["git", "clone", "--depth", "1", url, str(local_path)],
+                    capture_output=True,
+                    text=True,
+                    check=True,
+                    timeout=120,
+                )
+                cloned_paths.append(local_path)
+                colored_print(f"  Cloned: {safe_name}", "green")
+
+            except Exception as e:
+                colored_print(f"  Failed to clone {repo.full_name}: {e}", "yellow")
+
+        colored_print(f"Cloned {len(cloned_paths)}/{len(repositories)} repositories", "green")
+        return cloned_paths
+
+    def _sanitize_repo_name(self, repo: Repository) -> str:
+        # Safe unique directory name: owner_repo. Handles case collisions.
+        base = re.sub(r"[^a-z0-9-]", "_", repo.full_name.replace("/", "_").lower())
+        path = self.repos_dir / base
+        if not path.exists():
+            return base
+        counter = 1
+        while (self.repos_dir / f"{base}_{counter}").exists():
+            counter += 1
+        return f"{base}_{counter}"
+
     def load_repositories(self) -> list[Repository]:
         # Load repositories from file.
-        if not self.
-            raise FileNotFoundError(f"No repositories file found at {self.
+        if not self.repos_file.exists():
+            raise FileNotFoundError(f"No repositories file found at {self.repos_file}")
 
-        repo_dicts = load_json_file(self.
+        repo_dicts = load_json_file(self.repos_file)
         return [Repository.from_dict(r) for r in repo_dicts]
 
     def get_repository_stats(self, repositories: list[Repository]) -> dict:
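The new `_sanitize_repo_name` lowercases `owner/name`, maps every character outside `[a-z0-9-]` to `_`, and appends a numeric suffix when two repositories collide on the same directory name. A standalone sketch of just the mapping step (the inputs are illustrative, not from the source):

```python
import re

# Reproduces the name-mapping step of _sanitize_repo_name from the diff
# above; the collision counter is omitted since it depends on disk state.
def sanitize(full_name: str) -> str:
    return re.sub(r"[^a-z0-9-]", "_", full_name.replace("/", "_").lower())

print(sanitize("apache/kafka"))  # apache_kafka
print(sanitize("Apache/KAFKA"))  # apache_kafka (collision -> stored as apache_kafka_1)
```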
greenmining/services/local_repo_analyzer.py
CHANGED

@@ -221,7 +221,7 @@ class LocalRepoAnalyzer:
         # method_level_analysis: Extract per-method metrics via Lizard
         # include_source_code: Include source code before/after in results
         # process_metrics: "standard" or "full" PyDriller process metrics
-        self.clone_path = clone_path or Path
+        self.clone_path = clone_path or Path.cwd() / "greenmining_repos"
         self.clone_path.mkdir(parents=True, exist_ok=True)
         self.max_commits = max_commits
         self.days_back = days_back
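This one-line change points the analyzer's default clone path at `Path.cwd() / "greenmining_repos"`, the same directory the controller's `clone_repositories` writes to. A sketch of the effect, assuming `clone_path` and `max_commits` are constructor keywords as the hunk suggests:

```python
from pathlib import Path
from greenmining.services.local_repo_analyzer import LocalRepoAnalyzer

# Passing clone_path=None now falls back to ./greenmining_repos, so analysis
# can pick up repositories cloned by clone_repositories() without extra wiring.
analyzer = LocalRepoAnalyzer(clone_path=None, max_commits=100)
assert analyzer.clone_path == Path.cwd() / "greenmining_repos"
```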
greenmining-1.2.0.dist-info/METADATA
ADDED

@@ -0,0 +1,311 @@
+Metadata-Version: 2.4
+Name: greenmining
+Version: 1.2.0
+Summary: An empirical Python library for Mining Software Repositories (MSR) in Green IT research
+Author-email: Adam Bouafia <a.bouafia@student.vu.nl>
+License: MIT
+Project-URL: Homepage, https://github.com/adam-bouafia/greenmining
+Project-URL: Documentation, https://github.com/adam-bouafia/greenmining#readme
+Project-URL: Linkedin, https://www.linkedin.com/in/adam-bouafia/
+Project-URL: Repository, https://github.com/adam-bouafia/greenmining
+Project-URL: Issues, https://github.com/adam-bouafia/greenmining/issues
+Project-URL: Changelog, https://github.com/adam-bouafia/greenmining/blob/main/CHANGELOG.md
+Keywords: green-software,gsf,msr,mining-software-repositories,green-it,sustainability,carbon-footprint,energy-efficiency,repository-analysis,github-analysis,pydriller,empirical-software-engineering
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: Topic :: Software Development :: Quality Assurance
+Classifier: Topic :: Scientific/Engineering :: Information Analysis
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: PyGithub
+Requires-Dist: PyDriller
+Requires-Dist: pandas
+Requires-Dist: colorama
+Requires-Dist: tabulate
+Requires-Dist: tqdm
+Requires-Dist: matplotlib
+Requires-Dist: plotly
+Requires-Dist: python-dotenv
+Requires-Dist: requests
+Provides-Extra: dev
+Requires-Dist: pytest; extra == "dev"
+Requires-Dist: pytest-cov; extra == "dev"
+Requires-Dist: pytest-mock; extra == "dev"
+Requires-Dist: black; extra == "dev"
+Requires-Dist: ruff; extra == "dev"
+Requires-Dist: mypy; extra == "dev"
+Requires-Dist: build; extra == "dev"
+Requires-Dist: twine; extra == "dev"
+Provides-Extra: energy
+Requires-Dist: psutil; extra == "energy"
+Requires-Dist: codecarbon; extra == "energy"
+Provides-Extra: docs
+Requires-Dist: sphinx; extra == "docs"
+Requires-Dist: sphinx-rtd-theme; extra == "docs"
+Requires-Dist: myst-parser; extra == "docs"
+Dynamic: license-file
+
+# greenmining
+
+An empirical Python library for Mining Software Repositories (MSR) in Green IT research.
+
+[](https://pypi.org/project/greenmining/)
+[](https://pypi.org/project/greenmining/)
+[](LICENSE)
+[](https://greenmining.readthedocs.io/)
+
+## Overview
+
+`greenmining` is a research-grade Python library designed for **empirical Mining Software Repositories (MSR)** studies in **Green IT**. It enables researchers and practitioners to:
+
+- **Mine repositories at scale** - Search, fetch, and clone GitHub repositories via GraphQL API with configurable filters
+- **Classify green commits** - Detect 124 sustainability patterns from the Green Software Foundation (GSF) catalog using 332 keywords
+- **Analyze any repository by URL** - Direct Git-based analysis with support for private repositories
+- **Measure energy consumption** - RAPL, CodeCarbon, and CPU Energy Meter backends for power profiling
+- **Carbon footprint reporting** - CO2 emissions calculation with 20+ country profiles and cloud region support
+- **Method-level analysis** - Per-method complexity and metrics via Lizard integration
+- **Generate research datasets** - Statistical analysis, temporal trends, and publication-ready reports
+
+## Installation
+
+### Via pip
+
+```bash
+pip install greenmining
+```
+
+### With energy measurement
+
+```bash
+pip install greenmining[energy]
+```
+
+### From source
+
+```bash
+git clone https://github.com/adam-bouafia/greenmining.git
+cd greenmining
+pip install -e .
+```
+
+## Quick Start
+
+### Pattern Detection
+
+```python
+from greenmining import GSF_PATTERNS, is_green_aware, get_pattern_by_keywords
+
+print(f"Total patterns: {len(GSF_PATTERNS)}")  # 124 patterns across 15 categories
+
+commit_msg = "Optimize Redis caching to reduce energy consumption"
+if is_green_aware(commit_msg):
+    patterns = get_pattern_by_keywords(commit_msg)
+    print(f"Matched patterns: {patterns}")
+```
+
+### Fetch Repositories
+
+```python
+from greenmining import fetch_repositories
+
+repos = fetch_repositories(
+    github_token="your_token",
+    max_repos=50,
+    min_stars=500,
+    keywords="kubernetes cloud-native",
+    languages=["Python", "Go"],
+    created_after="2020-01-01",
+    pushed_after="2023-01-01",
+)
+
+for repo in repos[:5]:
+    print(f"- {repo.full_name} ({repo.stars} stars)")
+```
+
+### Clone Repositories
+
+```python
+from greenmining import fetch_repositories, clone_repositories
+
+repos = fetch_repositories(github_token="your_token", max_repos=10, keywords="android")
+
+# Clone into ./greenmining_repos/ with sanitized directory names
+paths = clone_repositories(repos)
+print(f"Cloned {len(paths)} repositories")
+```
+
+### Analyze Repositories by URL
+
+```python
+from greenmining import analyze_repositories
+
+results = analyze_repositories(
+    urls=[
+        "https://github.com/kubernetes/kubernetes",
+        "https://github.com/istio/istio",
+    ],
+    max_commits=100,
+    parallel_workers=2,
+    energy_tracking=True,
+    energy_backend="auto",
+    method_level_analysis=True,
+    include_source_code=True,
+    github_token="your_token",
+    since_date="2020-01-01",
+    to_date="2025-12-31",
+)
+
+for result in results:
+    print(f"{result.name}: {result.green_commit_rate:.1%} green")
+```
+
+### Access Pattern Data
+
+```python
+from greenmining import GSF_PATTERNS
+
+# Get patterns by category
+cloud = {k: v for k, v in GSF_PATTERNS.items() if v['category'] == 'cloud'}
+print(f"Cloud patterns: {len(cloud)}")
+
+# All categories
+categories = set(p['category'] for p in GSF_PATTERNS.values())
+print(f"Categories: {sorted(categories)}")
+```
+
+### Energy Measurement
+
+```python
+from greenmining.energy import get_energy_meter, CPUEnergyMeter
+
+# Auto-detect best backend
+meter = get_energy_meter("auto")
+meter.start()
+# ... your workload ...
+result = meter.stop()
+print(f"Energy: {result.joules:.2f} J, Power: {result.watts_avg:.2f} W")
+```
+
+### Statistical Analysis
+
+```python
+from greenmining.analyzers import StatisticalAnalyzer, TemporalAnalyzer
+
+stat = StatisticalAnalyzer()
+temporal = TemporalAnalyzer(granularity="quarter")
+
+# Pattern correlations, effect sizes, temporal trends
+# See experiment notebook for full usage
+```
+
+### Metrics-to-Power Correlation
+
+```python
+from greenmining.analyzers import MetricsPowerCorrelator
+
+correlator = MetricsPowerCorrelator()
+correlator.fit(
+    metrics=["complexity", "nloc", "code_churn"],
+    metrics_values={
+        "complexity": [10, 20, 30, 40],
+        "nloc": [100, 200, 300, 400],
+        "code_churn": [50, 100, 150, 200],
+    },
+    power_measurements=[5.0, 8.0, 12.0, 15.0],
+)
+print(f"Feature importance: {correlator.feature_importance}")
+```
+
+## Features
+
+### Core Capabilities
+
+- **Pattern Detection**: 124 sustainability patterns across 15 categories from the GSF catalog
+- **Keyword Analysis**: 332 green software detection keywords
+- **Repository Fetching**: GraphQL API with date, star, and language filters
+- **Repository Cloning**: Sanitized directory names in `./greenmining_repos/`
+- **URL-Based Analysis**: Direct Git-based analysis from GitHub URLs (HTTPS and SSH)
+- **Batch Processing**: Parallel analysis of multiple repositories
+- **Private Repository Support**: Authentication via SSH keys or GitHub tokens
+
+### Analysis & Measurement
+
+- **Energy Measurement**: RAPL, CodeCarbon, and CPU Energy Meter backends
+- **Carbon Footprint Reporting**: CO2 emissions with 20+ country profiles (AWS, GCP, Azure)
+- **Metrics-to-Power Correlation**: Pearson and Spearman analysis between code metrics and power
+- **Method-Level Analysis**: Per-method complexity metrics via Lizard integration
+- **Source Code Access**: Before/after source code for refactoring detection
+- **Process Metrics**: DMM size, complexity, interfacing via PyDriller
+- **Statistical Analysis**: Correlations, effect sizes, and temporal trends
+- **Multi-format Output**: JSON, CSV, pandas DataFrame
+
+### Energy Backends
+
+| Backend | Platform | Metrics | Requirements |
+|---------|----------|---------|--------------|
+| **RAPL** | Linux (Intel/AMD) | CPU/RAM energy (Joules) | `/sys/class/powercap/` access |
+| **CodeCarbon** | Cross-platform | Energy + Carbon emissions (gCO2) | `pip install codecarbon` |
+| **CPU Meter** | All platforms | Estimated CPU energy (Joules) | Optional: `pip install psutil` |
+| **Auto** | All platforms | Best available backend | Automatic detection |
+
+### GSF Pattern Categories
+
+**124 patterns across 15 categories:**
+
+| Category | Patterns | Examples |
+|----------|----------|----------|
+| Cloud | 42 | Auto-scaling, serverless, right-sizing, region selection |
+| Web | 17 | CDN, caching, lazy loading, compression |
+| AI/ML | 19 | Model pruning, quantization, edge inference |
+| Database | 5 | Indexing, query optimization, connection pooling |
+| Networking | 8 | Protocol optimization, HTTP/2, gRPC |
+| Network | 6 | Request batching, GraphQL, circuit breakers |
+| Microservices | 4 | Service decomposition, graceful shutdown |
+| Infrastructure | 4 | Alpine containers, IaC, renewable regions |
+| General | 8 | Feature flags, precomputation, background jobs |
+| Others | 11 | Caching, resource, data, async, code, monitoring |
+
+## Development
+
+```bash
+git clone https://github.com/adam-bouafia/greenmining.git
+cd greenmining
+pip install -e ".[dev]"
+
+pytest tests/
+black greenmining/ tests/
+ruff check greenmining/ tests/
+```
+
+## Requirements
+
+- Python 3.9+
+- PyGithub, PyDriller, pandas, colorama, tqdm
+
+**Optional:**
+
+```bash
+pip install greenmining[energy]  # psutil, codecarbon
+pip install greenmining[dev]     # pytest, black, ruff, mypy
+```
+
+## License
+
+MIT License - See [LICENSE](LICENSE) for details.
+
+## Links
+
+- **GitHub**: https://github.com/adam-bouafia/greenmining
+- **PyPI**: https://pypi.org/project/greenmining/
+- **Documentation**: https://greenmining.readthedocs.io/
+- **Docker Hub**: https://hub.docker.com/r/adambouafia/greenmining