greenmining 1.0.6__tar.gz → 1.0.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. {greenmining-1.0.6/greenmining.egg-info → greenmining-1.0.8}/PKG-INFO +158 -14
  2. {greenmining-1.0.6 → greenmining-1.0.8}/README.md +157 -13
  3. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/__init__.py +9 -1
  4. greenmining-1.0.8/greenmining/controllers/repository_controller.py +100 -0
  5. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/presenters/console_presenter.py +1 -1
  6. greenmining-1.0.8/greenmining/services/github_fetcher.py +2 -0
  7. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/services/github_graphql_fetcher.py +5 -17
  8. {greenmining-1.0.6 → greenmining-1.0.8/greenmining.egg-info}/PKG-INFO +158 -14
  9. {greenmining-1.0.6 → greenmining-1.0.8}/pyproject.toml +1 -1
  10. greenmining-1.0.6/greenmining/controllers/repository_controller.py +0 -231
  11. greenmining-1.0.6/greenmining/services/github_fetcher.py +0 -210
  12. {greenmining-1.0.6 → greenmining-1.0.8}/CHANGELOG.md +0 -0
  13. {greenmining-1.0.6 → greenmining-1.0.8}/LICENSE +0 -0
  14. {greenmining-1.0.6 → greenmining-1.0.8}/MANIFEST.in +0 -0
  15. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/__main__.py +0 -0
  16. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/__version__.py +0 -0
  17. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/analyzers/__init__.py +0 -0
  18. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/analyzers/code_diff_analyzer.py +0 -0
  19. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/analyzers/metrics_power_correlator.py +0 -0
  20. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/analyzers/power_regression.py +0 -0
  21. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/analyzers/qualitative_analyzer.py +0 -0
  22. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/analyzers/statistical_analyzer.py +0 -0
  23. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/analyzers/temporal_analyzer.py +0 -0
  24. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/analyzers/version_power_analyzer.py +0 -0
  25. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/config.py +0 -0
  26. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/controllers/__init__.py +0 -0
  27. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/dashboard/__init__.py +0 -0
  28. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/dashboard/app.py +0 -0
  29. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/energy/__init__.py +0 -0
  30. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/energy/base.py +0 -0
  31. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/energy/carbon_reporter.py +0 -0
  32. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/energy/codecarbon_meter.py +0 -0
  33. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/energy/cpu_meter.py +0 -0
  34. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/energy/rapl.py +0 -0
  35. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/gsf_patterns.py +0 -0
  36. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/models/__init__.py +0 -0
  37. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/models/aggregated_stats.py +0 -0
  38. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/models/analysis_result.py +0 -0
  39. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/models/commit.py +0 -0
  40. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/models/repository.py +0 -0
  41. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/presenters/__init__.py +0 -0
  42. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/services/__init__.py +0 -0
  43. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/services/commit_extractor.py +0 -0
  44. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/services/data_aggregator.py +0 -0
  45. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/services/data_analyzer.py +0 -0
  46. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/services/local_repo_analyzer.py +0 -0
  47. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/services/reports.py +0 -0
  48. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining/utils.py +0 -0
  49. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining.egg-info/SOURCES.txt +0 -0
  50. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining.egg-info/dependency_links.txt +0 -0
  51. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining.egg-info/requires.txt +0 -0
  52. {greenmining-1.0.6 → greenmining-1.0.8}/greenmining.egg-info/top_level.txt +0 -0
  53. {greenmining-1.0.6 → greenmining-1.0.8}/setup.cfg +0 -0
  54. {greenmining-1.0.6 → greenmining-1.0.8}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: greenmining
3
- Version: 1.0.6
3
+ Version: 1.0.8
4
4
  Summary: An empirical Python library for Mining Software Repositories (MSR) in Green IT research
5
5
  Author-email: Adam Bouafia <a.bouafia@student.vu.nl>
6
6
  License: MIT
@@ -330,7 +330,137 @@ print(f"Top patterns: {stats['top_patterns'][:5]}")
330
330
  aggregator.export_to_csv(results, "output.csv")
331
331
  ```
332
332
 
333
- #### Batch Analysis
333
+ #### URL-Based Repository Analysis
334
+
335
+ ```python
336
+ from greenmining.services.local_repo_analyzer import LocalRepoAnalyzer
337
+
338
+ analyzer = LocalRepoAnalyzer(
339
+ max_commits=200,
340
+ cleanup_after=True,
341
+ )
342
+
343
+ result = analyzer.analyze_repository("https://github.com/pallets/flask")
344
+
345
+ print(f"Repository: {result.name}")
346
+ print(f"Commits analyzed: {result.total_commits}")
347
+ print(f"Green-aware: {result.green_commits} ({result.green_commit_rate:.1%})")
348
+
349
+ for commit in result.commits[:5]:
350
+ if commit.green_aware:
351
+ print(f" {commit.message[:60]}...")
352
+ ```
353
+
354
+ #### Batch Analysis with Parallelism
355
+
356
+ ```python
357
+ from greenmining import analyze_repositories
358
+
359
+ results = analyze_repositories(
360
+ urls=[
361
+ "https://github.com/kubernetes/kubernetes",
362
+ "https://github.com/istio/istio",
363
+ "https://github.com/envoyproxy/envoy",
364
+ ],
365
+ max_commits=100,
366
+ parallel_workers=3,
367
+ energy_tracking=True,
368
+ energy_backend="auto",
369
+ )
370
+
371
+ for result in results:
372
+ print(f"{result.name}: {result.green_commit_rate:.1%} green")
373
+ ```
374
+
375
+ #### Private Repository Analysis
376
+
377
+ ```python
378
+ from greenmining.services.local_repo_analyzer import LocalRepoAnalyzer
379
+
380
+ # HTTPS with token
381
+ analyzer = LocalRepoAnalyzer(github_token="ghp_xxxx")
382
+ result = analyzer.analyze_repository("https://github.com/company/private-repo")
383
+
384
+ # SSH with key
385
+ analyzer = LocalRepoAnalyzer(ssh_key_path="~/.ssh/id_rsa")
386
+ result = analyzer.analyze_repository("git@github.com:company/private-repo.git")
387
+ ```
388
+
389
+ #### Power Regression Detection
390
+
391
+ ```python
392
+ from greenmining.analyzers import PowerRegressionDetector
393
+
394
+ detector = PowerRegressionDetector(
395
+ test_command="pytest tests/ -x",
396
+ energy_backend="rapl",
397
+ threshold_percent=5.0,
398
+ iterations=5,
399
+ )
400
+
401
+ regressions = detector.detect(
402
+ repo_path="/path/to/repo",
403
+ baseline_commit="v1.0.0",
404
+ target_commit="HEAD",
405
+ )
406
+
407
+ for regression in regressions:
408
+ print(f"Commit {regression.sha[:8]}: +{regression.power_increase:.1f}%")
409
+ ```
410
+
411
+ #### Version Power Comparison
412
+
413
+ ```python
414
+ from greenmining.analyzers import VersionPowerAnalyzer
415
+
416
+ analyzer = VersionPowerAnalyzer(
417
+ test_command="pytest tests/",
418
+ energy_backend="rapl",
419
+ iterations=10,
420
+ warmup_iterations=2,
421
+ )
422
+
423
+ report = analyzer.analyze_versions(
424
+ repo_path="/path/to/repo",
425
+ versions=["v1.0", "v1.1", "v1.2", "v2.0"],
426
+ )
427
+
428
+ print(report.summary())
429
+ print(f"Trend: {report.trend}")
430
+ print(f"Most efficient: {report.most_efficient}")
431
+ ```
432
+
433
+ #### Metrics-to-Power Correlation
434
+
435
+ ```python
436
+ from greenmining.analyzers import MetricsPowerCorrelator
437
+
438
+ correlator = MetricsPowerCorrelator()
439
+ correlator.fit(
440
+ metrics=["complexity", "nloc", "code_churn"],
441
+ metrics_values={
442
+ "complexity": [10, 20, 30, 40],
443
+ "nloc": [100, 200, 300, 400],
444
+ "code_churn": [50, 100, 150, 200],
445
+ },
446
+ power_measurements=[5.0, 8.0, 12.0, 15.0],
447
+ )
448
+
449
+ print(f"Pearson: {correlator.pearson}")
450
+ print(f"Spearman: {correlator.spearman}")
451
+ print(f"Feature importance: {correlator.feature_importance}")
452
+ ```
453
+
454
+ #### Web Dashboard
455
+
456
+ ```python
457
+ from greenmining.dashboard import run_dashboard
458
+
459
+ # Launch interactive dashboard (requires pip install greenmining[dashboard])
460
+ run_dashboard(data_dir="./data", host="127.0.0.1", port=5000)
461
+ ```
462
+
463
+ #### Pipeline Batch Analysis
334
464
 
335
465
  ```python
336
466
  from greenmining.controllers.repository_controller import RepositoryController
@@ -551,17 +681,24 @@ config = Config(
551
681
 
552
682
  ### Core Capabilities
553
683
 
554
- - **Pattern Detection**: Automatically identifies 122 sustainability patterns across 15 categories
555
- - **Keyword Analysis**: Scans commit messages using 321 green software keywords
556
- - **Custom Repository Fetching**: Fetch repositories with custom search keywords (not limited to microservices)
557
- - **Repository Analysis**: Analyzes repositories from GitHub with flexible filtering
558
- - **Batch Processing**: Analyze hundreds of repositories and thousands of commits
559
- - **Multi-format Output**: Generates Markdown reports, CSV exports, and JSON data
560
- - **Statistical Analysis**: Calculates green-awareness metrics, pattern distribution, and trends
684
+ - **Pattern Detection**: 122 sustainability patterns across 15 categories from the GSF catalog
685
+ - **Keyword Analysis**: 321 green software detection keywords
686
+ - **Repository Fetching**: GraphQL API with date, star, and language filters
687
+ - **URL-Based Analysis**: Direct PyDriller analysis from GitHub URLs (HTTPS and SSH)
688
+ - **Batch Processing**: Parallel analysis of multiple repositories with configurable workers
689
+ - **Private Repository Support**: Authentication via SSH keys or GitHub tokens
690
+ - **Energy Measurement**: RAPL, CodeCarbon, and CPU Energy Meter backends
691
+ - **Carbon Footprint Reporting**: CO2 emissions with 20+ country profiles and cloud region support (AWS, GCP, Azure)
692
+ - **Power Regression Detection**: Identify commits that increased energy consumption
693
+ - **Metrics-to-Power Correlation**: Pearson and Spearman analysis between code metrics and power
694
+ - **Version Power Comparison**: Compare power consumption across software versions with trend detection
695
+ - **Method-Level Analysis**: Per-method complexity metrics via Lizard integration
696
+ - **Source Code Access**: Before/after source code for refactoring detection
697
+ - **Full Process Metrics**: All 8 PyDriller process metrics (ChangeSet, CodeChurn, CommitsCount, ContributorsCount, ContributorsExperience, HistoryComplexity, HunksCount, LinesCount)
698
+ - **Statistical Analysis**: Correlations, effect sizes, and temporal trends
699
+ - **Multi-format Output**: Markdown reports, CSV exports, JSON data
700
+ - **Web Dashboard**: Flask-based interactive visualization (`pip install greenmining[dashboard]`)
561
701
  - **Docker Support**: Pre-built images for containerized analysis
562
- - **Programmatic API**: Full Python API for custom workflows and integrations
563
- - **Clean Architecture**: Modular design with services layer (Fetcher, Extractor, Analyzer, Aggregator, Reports)
564
- - **Energy Measurement**: Real-time energy consumption tracking via RAPL (Linux) or CodeCarbon (cross-platform)
565
702
 
566
703
  ### Energy Measurement
567
704
 
@@ -712,8 +849,15 @@ ruff check greenmining/ tests/
712
849
  - Python 3.9+
713
850
  - PyGithub >= 2.1.1
714
851
  - PyDriller >= 2.5
715
- - pandas >= 2.2.0
716
- - codecarbon >= 2.0.0 (optional, for cross-platform energy measurement)
852
+ - pandas >= 2.2.0
853
+
854
+ **Optional dependencies:**
855
+
856
+ ```bash
857
+ pip install greenmining[energy] # psutil, codecarbon (energy measurement)
858
+ pip install greenmining[dashboard] # flask (web dashboard)
859
+ pip install greenmining[dev] # pytest, black, ruff, mypy (development)
860
+ ```
717
861
 
718
862
  ## License
719
863
 
@@ -272,7 +272,137 @@ print(f"Top patterns: {stats['top_patterns'][:5]}")
272
272
  aggregator.export_to_csv(results, "output.csv")
273
273
  ```
274
274
 
275
- #### Batch Analysis
275
+ #### URL-Based Repository Analysis
276
+
277
+ ```python
278
+ from greenmining.services.local_repo_analyzer import LocalRepoAnalyzer
279
+
280
+ analyzer = LocalRepoAnalyzer(
281
+ max_commits=200,
282
+ cleanup_after=True,
283
+ )
284
+
285
+ result = analyzer.analyze_repository("https://github.com/pallets/flask")
286
+
287
+ print(f"Repository: {result.name}")
288
+ print(f"Commits analyzed: {result.total_commits}")
289
+ print(f"Green-aware: {result.green_commits} ({result.green_commit_rate:.1%})")
290
+
291
+ for commit in result.commits[:5]:
292
+ if commit.green_aware:
293
+ print(f" {commit.message[:60]}...")
294
+ ```
295
+
296
+ #### Batch Analysis with Parallelism
297
+
298
+ ```python
299
+ from greenmining import analyze_repositories
300
+
301
+ results = analyze_repositories(
302
+ urls=[
303
+ "https://github.com/kubernetes/kubernetes",
304
+ "https://github.com/istio/istio",
305
+ "https://github.com/envoyproxy/envoy",
306
+ ],
307
+ max_commits=100,
308
+ parallel_workers=3,
309
+ energy_tracking=True,
310
+ energy_backend="auto",
311
+ )
312
+
313
+ for result in results:
314
+ print(f"{result.name}: {result.green_commit_rate:.1%} green")
315
+ ```
316
+
317
+ #### Private Repository Analysis
318
+
319
+ ```python
320
+ from greenmining.services.local_repo_analyzer import LocalRepoAnalyzer
321
+
322
+ # HTTPS with token
323
+ analyzer = LocalRepoAnalyzer(github_token="ghp_xxxx")
324
+ result = analyzer.analyze_repository("https://github.com/company/private-repo")
325
+
326
+ # SSH with key
327
+ analyzer = LocalRepoAnalyzer(ssh_key_path="~/.ssh/id_rsa")
328
+ result = analyzer.analyze_repository("git@github.com:company/private-repo.git")
329
+ ```
330
+
331
+ #### Power Regression Detection
332
+
333
+ ```python
334
+ from greenmining.analyzers import PowerRegressionDetector
335
+
336
+ detector = PowerRegressionDetector(
337
+ test_command="pytest tests/ -x",
338
+ energy_backend="rapl",
339
+ threshold_percent=5.0,
340
+ iterations=5,
341
+ )
342
+
343
+ regressions = detector.detect(
344
+ repo_path="/path/to/repo",
345
+ baseline_commit="v1.0.0",
346
+ target_commit="HEAD",
347
+ )
348
+
349
+ for regression in regressions:
350
+ print(f"Commit {regression.sha[:8]}: +{regression.power_increase:.1f}%")
351
+ ```
352
+
353
+ #### Version Power Comparison
354
+
355
+ ```python
356
+ from greenmining.analyzers import VersionPowerAnalyzer
357
+
358
+ analyzer = VersionPowerAnalyzer(
359
+ test_command="pytest tests/",
360
+ energy_backend="rapl",
361
+ iterations=10,
362
+ warmup_iterations=2,
363
+ )
364
+
365
+ report = analyzer.analyze_versions(
366
+ repo_path="/path/to/repo",
367
+ versions=["v1.0", "v1.1", "v1.2", "v2.0"],
368
+ )
369
+
370
+ print(report.summary())
371
+ print(f"Trend: {report.trend}")
372
+ print(f"Most efficient: {report.most_efficient}")
373
+ ```
374
+
375
+ #### Metrics-to-Power Correlation
376
+
377
+ ```python
378
+ from greenmining.analyzers import MetricsPowerCorrelator
379
+
380
+ correlator = MetricsPowerCorrelator()
381
+ correlator.fit(
382
+ metrics=["complexity", "nloc", "code_churn"],
383
+ metrics_values={
384
+ "complexity": [10, 20, 30, 40],
385
+ "nloc": [100, 200, 300, 400],
386
+ "code_churn": [50, 100, 150, 200],
387
+ },
388
+ power_measurements=[5.0, 8.0, 12.0, 15.0],
389
+ )
390
+
391
+ print(f"Pearson: {correlator.pearson}")
392
+ print(f"Spearman: {correlator.spearman}")
393
+ print(f"Feature importance: {correlator.feature_importance}")
394
+ ```
395
+
396
+ #### Web Dashboard
397
+
398
+ ```python
399
+ from greenmining.dashboard import run_dashboard
400
+
401
+ # Launch interactive dashboard (requires pip install greenmining[dashboard])
402
+ run_dashboard(data_dir="./data", host="127.0.0.1", port=5000)
403
+ ```
404
+
405
+ #### Pipeline Batch Analysis
276
406
 
277
407
  ```python
278
408
  from greenmining.controllers.repository_controller import RepositoryController
@@ -493,17 +623,24 @@ config = Config(
493
623
 
494
624
  ### Core Capabilities
495
625
 
496
- - **Pattern Detection**: Automatically identifies 122 sustainability patterns across 15 categories
497
- - **Keyword Analysis**: Scans commit messages using 321 green software keywords
498
- - **Custom Repository Fetching**: Fetch repositories with custom search keywords (not limited to microservices)
499
- - **Repository Analysis**: Analyzes repositories from GitHub with flexible filtering
500
- - **Batch Processing**: Analyze hundreds of repositories and thousands of commits
501
- - **Multi-format Output**: Generates Markdown reports, CSV exports, and JSON data
502
- - **Statistical Analysis**: Calculates green-awareness metrics, pattern distribution, and trends
626
+ - **Pattern Detection**: 122 sustainability patterns across 15 categories from the GSF catalog
627
+ - **Keyword Analysis**: 321 green software detection keywords
628
+ - **Repository Fetching**: GraphQL API with date, star, and language filters
629
+ - **URL-Based Analysis**: Direct PyDriller analysis from GitHub URLs (HTTPS and SSH)
630
+ - **Batch Processing**: Parallel analysis of multiple repositories with configurable workers
631
+ - **Private Repository Support**: Authentication via SSH keys or GitHub tokens
632
+ - **Energy Measurement**: RAPL, CodeCarbon, and CPU Energy Meter backends
633
+ - **Carbon Footprint Reporting**: CO2 emissions with 20+ country profiles and cloud region support (AWS, GCP, Azure)
634
+ - **Power Regression Detection**: Identify commits that increased energy consumption
635
+ - **Metrics-to-Power Correlation**: Pearson and Spearman analysis between code metrics and power
636
+ - **Version Power Comparison**: Compare power consumption across software versions with trend detection
637
+ - **Method-Level Analysis**: Per-method complexity metrics via Lizard integration
638
+ - **Source Code Access**: Before/after source code for refactoring detection
639
+ - **Full Process Metrics**: All 8 PyDriller process metrics (ChangeSet, CodeChurn, CommitsCount, ContributorsCount, ContributorsExperience, HistoryComplexity, HunksCount, LinesCount)
640
+ - **Statistical Analysis**: Correlations, effect sizes, and temporal trends
641
+ - **Multi-format Output**: Markdown reports, CSV exports, JSON data
642
+ - **Web Dashboard**: Flask-based interactive visualization (`pip install greenmining[dashboard]`)
503
643
  - **Docker Support**: Pre-built images for containerized analysis
504
- - **Programmatic API**: Full Python API for custom workflows and integrations
505
- - **Clean Architecture**: Modular design with services layer (Fetcher, Extractor, Analyzer, Aggregator, Reports)
506
- - **Energy Measurement**: Real-time energy consumption tracking via RAPL (Linux) or CodeCarbon (cross-platform)
507
644
 
508
645
  ### Energy Measurement
509
646
 
@@ -654,8 +791,15 @@ ruff check greenmining/ tests/
654
791
  - Python 3.9+
655
792
  - PyGithub >= 2.1.1
656
793
  - PyDriller >= 2.5
657
- - pandas >= 2.2.0
658
- - codecarbon >= 2.0.0 (optional, for cross-platform energy measurement)
794
+ - pandas >= 2.2.0
795
+
796
+ **Optional dependencies:**
797
+
798
+ ```bash
799
+ pip install greenmining[energy] # psutil, codecarbon (energy measurement)
800
+ pip install greenmining[dashboard] # flask (web dashboard)
801
+ pip install greenmining[dev] # pytest, black, ruff, mypy (development)
802
+ ```
659
803
 
660
804
  ## License
661
805
 
@@ -9,7 +9,7 @@ from greenmining.gsf_patterns import (
9
9
  is_green_aware,
10
10
  )
11
11
 
12
- __version__ = "1.0.6"
12
+ __version__ = "1.0.8"
13
13
 
14
14
 
15
15
  def fetch_repositories(
@@ -18,6 +18,10 @@ def fetch_repositories(
18
18
  min_stars: int = None,
19
19
  languages: list = None,
20
20
  keywords: str = None,
21
+ created_after: str = None,
22
+ created_before: str = None,
23
+ pushed_after: str = None,
24
+ pushed_before: str = None,
21
25
  ):
22
26
  # Fetch repositories from GitHub with custom search keywords.
23
27
  config = Config()
@@ -29,6 +33,10 @@ def fetch_repositories(
29
33
  min_stars=min_stars,
30
34
  languages=languages,
31
35
  keywords=keywords,
36
+ created_after=created_after,
37
+ created_before=created_before,
38
+ pushed_after=pushed_after,
39
+ pushed_before=pushed_before,
32
40
  )
33
41
 
34
42
 
@@ -0,0 +1,100 @@
1
+ # Repository Controller - Handles repository fetching operations.
2
+
3
+ from tqdm import tqdm
4
+
5
+ from greenmining.config import Config
6
+ from greenmining.models.repository import Repository
7
+ from greenmining.services.github_graphql_fetcher import GitHubGraphQLFetcher
8
+ from greenmining.utils import colored_print, load_json_file, save_json_file
9
+
10
+
11
+ class RepositoryController:
12
+ # Controller for GitHub repository operations using GraphQL API.
13
+
14
+ def __init__(self, config: Config):
15
+ # Initialize controller with configuration.
16
+ self.config = config
17
+ self.graphql_fetcher = GitHubGraphQLFetcher(config.GITHUB_TOKEN)
18
+
19
+ def fetch_repositories(
20
+ self,
21
+ max_repos: int = None,
22
+ min_stars: int = None,
23
+ languages: list[str] = None,
24
+ keywords: str = None,
25
+ created_after: str = None,
26
+ created_before: str = None,
27
+ pushed_after: str = None,
28
+ pushed_before: str = None,
29
+ ) -> list[Repository]:
30
+ # Fetch repositories from GitHub using GraphQL API.
31
+ max_repos = max_repos or self.config.MAX_REPOS
32
+ min_stars = min_stars or self.config.MIN_STARS
33
+ languages = languages or self.config.SUPPORTED_LANGUAGES
34
+ keywords = keywords or "microservices"
35
+
36
+ colored_print(f"Fetching up to {max_repos} repositories...", "cyan")
37
+ colored_print(f" Keywords: {keywords}", "cyan")
38
+ colored_print(f" Filters: min_stars={min_stars}", "cyan")
39
+
40
+ if created_after or created_before:
41
+ colored_print(
42
+ f" Created: {created_after or 'any'} to {created_before or 'any'}", "cyan"
43
+ )
44
+ if pushed_after or pushed_before:
45
+ colored_print(f" Pushed: {pushed_after or 'any'} to {pushed_before or 'any'}", "cyan")
46
+
47
+ try:
48
+ # Execute GraphQL search
49
+ repositories = self.graphql_fetcher.search_repositories(
50
+ keywords=keywords,
51
+ max_repos=max_repos,
52
+ min_stars=min_stars,
53
+ languages=languages,
54
+ created_after=created_after,
55
+ created_before=created_before,
56
+ pushed_after=pushed_after,
57
+ pushed_before=pushed_before,
58
+ )
59
+
60
+ # Save to file
61
+ repo_dicts = [r.to_dict() for r in repositories]
62
+ save_json_file(repo_dicts, self.config.REPOS_FILE)
63
+
64
+ colored_print(f"Fetched {len(repositories)} repositories", "green")
65
+ colored_print(f" Saved to: {self.config.REPOS_FILE}", "cyan")
66
+
67
+ return repositories
68
+
69
+ except Exception as e:
70
+ colored_print(f"Error fetching repositories: {e}", "red")
71
+ raise
72
+
73
+ def load_repositories(self) -> list[Repository]:
74
+ # Load repositories from file.
75
+ if not self.config.REPOS_FILE.exists():
76
+ raise FileNotFoundError(f"No repositories file found at {self.config.REPOS_FILE}")
77
+
78
+ repo_dicts = load_json_file(self.config.REPOS_FILE)
79
+ return [Repository.from_dict(r) for r in repo_dicts]
80
+
81
+ def get_repository_stats(self, repositories: list[Repository]) -> dict:
82
+ # Get statistics about fetched repositories.
83
+ if not repositories:
84
+ return {}
85
+
86
+ return {
87
+ "total": len(repositories),
88
+ "by_language": self._count_by_language(repositories),
89
+ "total_stars": sum(r.stars for r in repositories),
90
+ "avg_stars": sum(r.stars for r in repositories) / len(repositories),
91
+ "top_repo": max(repositories, key=lambda r: r.stars).full_name,
92
+ }
93
+
94
+ def _count_by_language(self, repositories: list[Repository]) -> dict:
95
+ # Count repositories by language.
96
+ counts = {}
97
+ for repo in repositories:
98
+ lang = repo.language or "Unknown"
99
+ counts[lang] = counts.get(lang, 0) + 1
100
+ return counts
@@ -113,7 +113,7 @@ class ConsolePresenter:
113
113
 
114
114
  table_data = []
115
115
  for phase, info in status.items():
116
- status_icon = "" if info.get("completed") else ""
116
+ status_icon = "done" if info.get("completed") else "pending"
117
117
  table_data.append(
118
118
  [status_icon, phase, info.get("file", "N/A"), info.get("size", "N/A")]
119
119
  )
@@ -0,0 +1,2 @@
1
+ # Legacy GitHub REST API fetcher (deprecated).
2
+ # Use github_graphql_fetcher.GitHubGraphQLFetcher instead.
@@ -1,7 +1,4 @@
1
- # GitHub GraphQL API fetcher for faster and more efficient repository fetching.
2
- #
3
- # GraphQL allows fetching exactly the data you need in a single request,
4
- # reducing API calls and improving rate limit efficiency.
1
+ # GitHub GraphQL API fetcher for repository search and data retrieval.
5
2
 
6
3
  import json
7
4
  import time
@@ -14,12 +11,6 @@ from greenmining.models.repository import Repository
14
11
 
15
12
  class GitHubGraphQLFetcher:
16
13
  # Fetch GitHub repositories using GraphQL API v4.
17
- #
18
- # Benefits over REST API:
19
- # - Fetch repos + commits in 1 request instead of 100+ REST calls
20
- # - Get exactly the fields you need (no over-fetching)
21
- # - Better rate limit efficiency (5000 points/hour vs 5000 requests/hour)
22
- # - More powerful search capabilities
23
14
 
24
15
  GRAPHQL_ENDPOINT = "https://api.github.com/graphql"
25
16
 
@@ -193,10 +184,10 @@ class GitHubGraphQLFetcher:
193
184
  # Star count
194
185
  query_parts.append(f"stars:>={min_stars}")
195
186
 
196
- # Languages
197
- if languages:
198
- lang_query = " OR ".join([f"language:{lang}" for lang in languages])
199
- query_parts.append(f"({lang_query})")
187
+ # Languages - skip filter if more than 5 to avoid exceeding GitHub query limits
188
+ if languages and len(languages) <= 5:
189
+ lang_query = " ".join([f"language:{lang}" for lang in languages])
190
+ query_parts.append(lang_query)
200
191
 
201
192
  # Date filters
202
193
  if created_after:
@@ -259,9 +250,6 @@ class GitHubGraphQLFetcher:
259
250
  ) -> List[Dict[str, Any]]:
260
251
  # Fetch commits for a specific repository using GraphQL.
261
252
  #
262
- # This is much faster than REST API as it gets all commits in 1-2 requests
263
- # instead of paginating through 100 individual REST calls.
264
- #
265
253
  # Args:
266
254
  # owner: Repository owner
267
255
  # name: Repository name