greenmining 0.1.4__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,333 @@
+"""Data analyzer for green microservices commits using GSF patterns."""
+
+import json
+import re
+from collections import Counter
+from pathlib import Path
+from typing import Any, Optional
+
+import click
+from tqdm import tqdm
+
+from greenmining.config import get_config
+from greenmining.gsf_patterns import (
+    GREEN_KEYWORDS,
+    GSF_PATTERNS,
+    get_pattern_by_keywords,
+    is_green_aware,
+)
+from greenmining.utils import (
+    colored_print,
+    create_checkpoint,
+    format_timestamp,
+    load_checkpoint,
+    load_json_file,
+    print_banner,
+    save_json_file,
+)
+
+
+class DataAnalyzer:
+    """Analyzes commits for green software patterns using GSF (Green Software Foundation) patterns."""
+
+    def __init__(self, batch_size: int = 10):
+        """Initialize analyzer with GSF patterns.
+
+        Args:
+            batch_size: Number of commits to process in each batch
+        """
+        # Use GSF patterns from gsf_patterns.py
+        self.gsf_patterns = GSF_PATTERNS
+        self.green_keywords = GREEN_KEYWORDS
+        self.batch_size = batch_size
+
+    def analyze_commits(
+        self, commits: list[dict[str, Any]], resume_from: int = 0
+    ) -> list[dict[str, Any]]:
+        """Analyze commits for green software practices.
+
+        Args:
+            commits: List of commit dictionaries
+            resume_from: Index to resume from
+
+        Returns:
+            List of analysis results
+        """
+        results = []
+
+        colored_print(f"\nAnalyzing {len(commits)} commits for green practices...", "cyan")
+
+        with tqdm(
+            total=len(commits), initial=resume_from, desc="Analyzing commits", unit="commit"
+        ) as pbar:
+            for _idx, commit in enumerate(commits[resume_from:], start=resume_from):
+                try:
+                    analysis = self._analyze_commit(commit)
+                    results.append(analysis)
+                    pbar.update(1)
+                except Exception as e:
+                    colored_print(
+                        f"\nError analyzing commit {commit.get('commit_id', 'unknown')}: {e}",
+                        "yellow",
+                    )
+                    pbar.update(1)
+
+        return results
+
+    def _analyze_commit(self, commit: dict[str, Any]) -> dict[str, Any]:
+        """Analyze a single commit using GSF patterns.
+
+        Args:
+            commit: Commit dictionary
+
+        Returns:
+            Analysis result with GSF pattern matching
+        """
+        message = commit.get("message", "")
+
+        # Q1: GREEN AWARENESS - Check using GSF keywords
+        green_aware = is_green_aware(message)
+
+        # Q2: KNOWN GSF PATTERNS - Match against Green Software Foundation patterns
+        matched_patterns = get_pattern_by_keywords(message)
+
+        # Get detailed pattern info
+        pattern_details = []
+        for _pattern_id, pattern in self.gsf_patterns.items():
+            if pattern["name"] in matched_patterns:
+                pattern_details.append(
+                    {
+                        "name": pattern["name"],
+                        "category": pattern["category"],
+                        "description": pattern["description"],
+                        "sci_impact": pattern["sci_impact"],
+                    }
+                )
+
+        # Confidence scales with the number of patterns matched:
+        # two or more -> high, exactly one -> medium, none -> low
+        if len(matched_patterns) >= 2:
+            confidence = "high"
+        elif len(matched_patterns) == 1:
+            confidence = "medium"
+        else:
+            confidence = "low"
+
+        return {
+            "commit_hash": commit.get("hash", commit.get("commit_id", "unknown")),
+            "repository": commit.get("repository", commit.get("repo_name", "unknown")),
+            "author": commit.get("author_name", "unknown"),
+            "date": commit.get("author_date", commit.get("date", "unknown")),
+            "message": message,
+            # Research Question 1: Green awareness
+            "green_aware": green_aware,
+            # Research Question 2: Known GSF patterns
+            "gsf_patterns_matched": matched_patterns,
+            "pattern_count": len(matched_patterns),
+            "pattern_details": pattern_details,
+            "confidence": confidence,
+            # Additional metadata
+            "files_modified": commit.get("modified_files", commit.get("files_changed", [])),
+            "insertions": commit.get("insertions", commit.get("lines_added", 0)),
+            "deletions": commit.get("deletions", commit.get("lines_deleted", 0)),
+            "lines_deleted": commit.get("lines_deleted", 0),
+        }
+
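+    # NOTE: The two helpers below retain the original keyword-only heuristic.
+    # They are not called by _analyze_commit, which delegates to the
+    # gsf_patterns helpers (is_green_aware, get_pattern_by_keywords) instead.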
+    def _check_green_awareness(self, message: str, files: list[str]) -> tuple[bool, Optional[str]]:
+        """Check if commit explicitly mentions green/energy concerns.
+
+        Args:
+            message: Commit message (lowercase)
+            files: List of changed files (lowercase)
+
+        Returns:
+            Tuple of (is_green_aware, evidence_text)
+        """
+        # Check message for green keywords
+        for keyword in self.green_keywords:
+            if keyword in message:
+                # Extract context around keyword
+                pattern = rf".{{0,30}}{re.escape(keyword)}.{{0,30}}"
+                match = re.search(pattern, message, re.IGNORECASE)
+                if match:
+                    evidence = match.group(0).strip()
+                    return True, f"Keyword '{keyword}': {evidence}"
+
+        # Check file names for patterns
+        cache_files = [f for f in files if "cache" in f or "redis" in f]
+        if cache_files:
+            return True, f"Modified cache-related file: {cache_files[0]}"
+
+        perf_files = [f for f in files if "performance" in f or "optimization" in f]
+        if perf_files:
+            return True, f"Modified performance file: {perf_files[0]}"
+
+        return False, None
+
+    def _detect_known_pattern(self, message: str, files: list[str]) -> tuple[Optional[str], str]:
+        """Detect known green software pattern.
+
+        Args:
+            message: Commit message (lowercase)
+            files: List of changed files (lowercase)
+
+        Returns:
+            Tuple of (pattern_name, confidence_level)
+        """
+        matches = []
+
+        # Check each pattern (each GSF pattern entry is assumed to carry a
+        # "keywords" list, mirroring the keyword matching in gsf_patterns)
+        for pattern_name, pattern in self.gsf_patterns.items():
+            for keyword in pattern.get("keywords", []):
+                if keyword in message:
+                    # Calculate confidence based on specificity
+                    confidence = "HIGH" if len(keyword) > 10 else "MEDIUM"
+                    matches.append((pattern_name, confidence, len(keyword)))
+
+        # Check file names for pattern hints
+        all_files = " ".join(files)
+        for pattern_name, pattern in self.gsf_patterns.items():
+            for keyword in pattern.get("keywords", []):
+                if keyword in all_files:
+                    matches.append((pattern_name, "MEDIUM", len(keyword)))
+
+        if not matches:
+            return "NONE DETECTED", "NONE"
+
+        # Return most specific match (longest keyword)
+        matches.sort(key=lambda x: x[2], reverse=True)
+        return matches[0][0], matches[0][1]
+
+    def save_results(self, results: list[dict[str, Any]], output_file: Path):
+        """Save analysis results to JSON file.
+
+        Args:
+            results: List of analysis results
+            output_file: Output file path
+        """
+        # Calculate summary statistics
+        green_aware_count = sum(1 for r in results if r["green_aware"])
+        pattern_counts = Counter(
+            pattern for r in results for pattern in r["gsf_patterns_matched"]
+        )
+
+        data = {
+            "metadata": {
+                "analyzed_at": format_timestamp(),
+                "total_commits_analyzed": len(results),
+                "green_aware_commits": green_aware_count,
+                "green_aware_percentage": (
+                    round(green_aware_count / len(results) * 100, 2) if results else 0
+                ),
+                "analyzer_type": "keyword_heuristic",
+                "note": "This analysis uses keyword and heuristic matching. For AI-powered analysis, use Claude API.",
+            },
+            "results": results,
+        }
+
+        save_json_file(data, output_file)
+        colored_print(f"Saved analysis for {len(results)} commits to {output_file}", "green")
+
+        # Display summary
+        colored_print("\n📊 Analysis Summary:", "cyan")
+        colored_print(
+            f"  Green-aware commits: {green_aware_count} ({data['metadata']['green_aware_percentage']}%)",
+            "white",
+        )
+        if pattern_counts:
+            colored_print("\n  Top patterns detected:", "cyan")
+            for pattern, count in pattern_counts.most_common(5):
+                colored_print(f"    - {pattern}: {count}", "white")
+
+
+@click.command()
+@click.option("--batch-size", default=10, help="Batch size for processing")
+@click.option("--resume", is_flag=True, help="Resume from checkpoint")
+@click.option(
+    "--commits-file", default=None, help="Input commits file (default: data/commits.json)"
+)
+@click.option(
+    "--output", default=None, help="Output file path (default: data/analysis_results.json)"
+)
+@click.option("--config-file", default=".env", help="Path to .env configuration file")
+def analyze(
+    batch_size: int,
+    resume: bool,
+    commits_file: Optional[str],
+    output: Optional[str],
+    config_file: str,
+):
+    """Analyze commits for green software practices."""
+    print_banner("Data Analyzer")
+
+    try:
+        # Load configuration
+        config = get_config(config_file)
+
+        # Determine input/output files
+        input_file = Path(commits_file) if commits_file else config.COMMITS_FILE
+        output_file = Path(output) if output else config.ANALYSIS_FILE
+
+        # Check if input file exists
+        if not input_file.exists():
+            colored_print(f"Input file not found: {input_file}", "red")
+            colored_print("Please run the 'extract' command first to extract commits", "yellow")
+            exit(1)
+
+        # Load commits
+        colored_print(f"Loading commits from {input_file}...", "blue")
+        data = load_json_file(input_file)
+        commits = data.get("commits", [])
+
+        if not commits:
+            colored_print("No commits found in input file", "yellow")
+            exit(1)
+
+        colored_print(f"Loaded {len(commits)} commits", "green")
+
+        # Check for resume
+        resume_from = 0
+        if resume:
+            checkpoint_data = load_checkpoint(config.CHECKPOINT_FILE)
+            if checkpoint_data:
+                resume_from = checkpoint_data.get("processed_count", 0)
+                colored_print(
+                    f"Resuming from checkpoint: {resume_from} commits processed", "yellow"
+                )
+
+        # Initialize analyzer
+        analyzer = DataAnalyzer(batch_size=batch_size)
+
+        # Analyze commits
+        results = analyzer.analyze_commits(commits, resume_from=resume_from)
+
+        if not results:
+            colored_print("No analysis results generated", "yellow")
+            exit(1)
+
+        # Save results
+        analyzer.save_results(results, output_file)
+
+        # Save checkpoint
+        create_checkpoint(
+            config.CHECKPOINT_FILE,
+            {"processed_count": len(results), "timestamp": format_timestamp()},
+        )
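+        # The checkpoint records how many commits were analyzed so a later
+        # run can pick up from that index with --resume.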
+
+        colored_print(f"\n✓ Successfully analyzed {len(results)} commits", "green")
+        colored_print(f"Output saved to: {output_file}", "green")
+
+    except FileNotFoundError as e:
+        colored_print(f"File not found: {e}", "red")
+        exit(1)
+    except json.JSONDecodeError:
+        colored_print(f"Invalid JSON in input file: {input_file}", "red")
+        exit(1)
+    except Exception as e:
+        colored_print(f"Error: {e}", "red")
+        import traceback
+
+        traceback.print_exc()
+        exit(1)
+
+
+if __name__ == "__main__":
+    analyze()
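
For reference, a minimal sketch of driving the analyzer directly from Python rather than through the Click command (the greenmining.analyzer import path is an assumption; the diff does not name the module file):

    from pathlib import Path

    from greenmining.analyzer import DataAnalyzer  # import path is an assumption

    analyzer = DataAnalyzer(batch_size=10)
    commits = [{"commit_id": "abc123", "message": "cache responses to reduce energy usage"}]
    results = analyzer.analyze_commits(commits)
    for r in results:
        print(r["commit_hash"], r["green_aware"], r["gsf_patterns_matched"])
    analyzer.save_results(results, Path("data/analysis_results.json"))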
@@ -0,0 +1,266 @@
+"""GitHub repository fetcher for green microservices mining."""
+
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Optional
+
+import click
+from github import Github, GithubException, RateLimitExceededException
+from tqdm import tqdm
+
+from greenmining.config import get_config
+from greenmining.utils import (
+    colored_print,
+    format_timestamp,
+    print_banner,
+    save_json_file,
+)
+
+
+class GitHubFetcher:
+    """Fetches microservice repositories from GitHub."""
+
+    def __init__(
+        self,
+        token: str,
+        max_repos: int = 100,
+        min_stars: int = 100,
+        languages: Optional[list[str]] = None,
+    ):
+        """Initialize GitHub fetcher.
+
+        Args:
+            token: GitHub personal access token
+            max_repos: Maximum number of repositories to fetch
+            min_stars: Minimum number of stars required
+            languages: List of programming languages to filter
+        """
+        self.github = Github(token)
+        self.max_repos = max_repos
+        self.min_stars = min_stars
+        self.languages = languages or [
+            "Java",
+            "Python",
+            "Go",
+            "JavaScript",
+            "TypeScript",
+            "C#",
+            "Rust",
+        ]
+
+    def search_repositories(self) -> list[dict[str, Any]]:
+        """Search for microservice repositories.
+
+        Returns:
+            List of repository metadata dictionaries
+        """
+        repositories = []
+        keywords = ["microservices", "microservice-architecture", "cloud-native"]
+
+        colored_print(f"Searching for repositories with keywords: {', '.join(keywords)}", "cyan")
+        colored_print(
+            f"Filters: min_stars={self.min_stars}, languages={', '.join(self.languages)}", "cyan"
+        )
+
+        # Build search query
+        keyword_query = " OR ".join(keywords)
+        language_query = " OR ".join([f"language:{lang}" for lang in self.languages])
+        query = f"({keyword_query}) ({language_query}) stars:>={self.min_stars}"
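+        # With the defaults this expands to, e.g.:
+        #   (microservices OR microservice-architecture OR cloud-native)
+        #   (language:Java OR language:Python OR ...) stars:>=100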
+
+        try:
+            # Execute search
+            search_results = self.github.search_repositories(
+                query=query, sort="stars", order="desc"
+            )
+
+            total_found = search_results.totalCount
+            colored_print(f"Found {total_found} repositories matching criteria", "green")
+
+            # Fetch repository details with progress bar
+            with tqdm(
+                total=min(self.max_repos, total_found), desc="Fetching repositories", unit="repo"
+            ) as pbar:
+                for idx, repo in enumerate(search_results):
+                    if idx >= self.max_repos:
+                        break
+
+                    try:
+                        repo_data = self._extract_repo_metadata(repo, idx + 1)
+                        repositories.append(repo_data)
+                        pbar.update(1)
+                    except RateLimitExceededException:
+                        # Handled before GithubException: RateLimitExceededException
+                        # is a subclass and would otherwise never be reached
+                        colored_print("Rate limit exceeded. Waiting...", "red")
+                        self._handle_rate_limit()
+                        continue
+                    except GithubException as e:
+                        colored_print(f"Error fetching {repo.full_name}: {e}", "yellow")
+                        continue
+
+            return repositories
+
+        except GithubException as e:
+            colored_print(f"GitHub API error: {e}", "red")
+            raise
+        except Exception as e:
+            colored_print(f"Unexpected error: {e}", "red")
+            raise
+
+    def _extract_repo_metadata(self, repo, repo_id: int) -> dict[str, Any]:
+        """Extract metadata from repository object.
+
+        Args:
+            repo: GitHub repository object
+            repo_id: Sequential repository ID
+
+        Returns:
+            Dictionary with repository metadata
+        """
+        return {
+            "repo_id": repo_id,
+            "name": repo.name,
+            "owner": repo.owner.login,
+            "full_name": repo.full_name,
+            "url": repo.html_url,
+            "clone_url": repo.clone_url,
+            "language": repo.language,
+            "stars": repo.stargazers_count,
+            "forks": repo.forks_count,
+            "watchers": repo.watchers_count,
+            "open_issues": repo.open_issues_count,
+            "last_updated": repo.updated_at.isoformat() if repo.updated_at else None,
+            "created_at": repo.created_at.isoformat() if repo.created_at else None,
+            "description": repo.description or "",
+            "main_branch": repo.default_branch,
+            "topics": repo.get_topics() if hasattr(repo, "get_topics") else [],
+            "size": repo.size,
+            "has_issues": repo.has_issues,
+            "has_wiki": repo.has_wiki,
+            "archived": repo.archived,
+            "license": repo.license.name if repo.license else None,
+        }
+
+    def _handle_rate_limit(self):
+        """Handle GitHub API rate limiting."""
+        rate_limit = self.github.get_rate_limit()
+        reset_time = rate_limit.core.reset
+        # reset is an aware UTC datetime in recent PyGithub versions, so compare
+        # against an aware "now" to avoid a naive/aware TypeError
+        wait_seconds = (reset_time - datetime.now(timezone.utc)).total_seconds()
+
+        if wait_seconds > 0:
+            colored_print(f"Rate limit will reset in {wait_seconds:.0f} seconds", "yellow")
+            import time
+
+            time.sleep(min(wait_seconds + 10, 60))  # wait for the reset, capped at 60 seconds
+
+    def save_results(self, repositories: list[dict[str, Any]], output_file: Path):
+        """Save fetched repositories to JSON file.
+
+        Args:
+            repositories: List of repository metadata
+            output_file: Output file path
+        """
+        data = {
+            "metadata": {
+                "fetched_at": format_timestamp(),
+                "total_repos": len(repositories),
+                "min_stars": self.min_stars,
+                "languages": self.languages,
+                "search_keywords": ["microservices", "microservice-architecture", "cloud-native"],
+            },
+            "repositories": repositories,
+        }
+
+        save_json_file(data, output_file)
+        colored_print(f"Saved {len(repositories)} repositories to {output_file}", "green")
+
+
+@click.command()
+@click.option("--max-repos", default=100, help="Maximum number of repositories to fetch")
+@click.option("--min-stars", default=100, help="Minimum stars required")
+@click.option(
+    "--languages",
+    default="java,python,go,javascript,typescript,csharp,rust",
+    help="Comma-separated list of languages",
+)
+@click.option("--output", default=None, help="Output file path (default: data/repositories.json)")
+@click.option("--config-file", default=".env", help="Path to .env configuration file")
+def fetch(max_repos: int, min_stars: int, languages: str, output: Optional[str], config_file: str):
+    """Fetch top microservice repositories from GitHub."""
+    print_banner("GitHub Repository Fetcher")
+
+    try:
+        # Load configuration
+        config = get_config(config_file)
+
+        # Parse languages
+        language_list = [lang.strip().title() for lang in languages.split(",")]
+
+        # Map common language names to the casing GitHub uses
+        language_map = {
+            "Nodejs": "JavaScript",
+            "Csharp": "C#",
+            "Typescript": "TypeScript",
+            "Javascript": "JavaScript",
+        }
+        language_list = [language_map.get(lang, lang) for lang in language_list]
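+        # e.g. "csharp".title() == "Csharp", which maps to "C#";
+        # "typescript" -> "Typescript" -> "TypeScript"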
+
+        # Determine output file
+        output_file = Path(output) if output else config.REPOS_FILE
+
+        colored_print(f"Fetching up to {max_repos} repositories...", "blue")
+
+        # Initialize fetcher
+        fetcher = GitHubFetcher(
+            token=config.GITHUB_TOKEN,
+            max_repos=max_repos,
+            min_stars=min_stars,
+            languages=language_list,
+        )
+
+        # Search and fetch repositories
+        repositories = fetcher.search_repositories()
+
+        if not repositories:
+            colored_print("No repositories found matching criteria", "yellow")
+            return
+
+        # Save results
+        fetcher.save_results(repositories, output_file)
+
+        # Display summary
+        colored_print(f"\n✓ Successfully fetched {len(repositories)} repositories", "green")
+        colored_print(f"Output saved to: {output_file}", "green")
+
+        # Show top 5 repos
+        colored_print("\nTop 5 repositories by stars:", "cyan")
+        from tabulate import tabulate
+
+        top_repos = sorted(repositories, key=lambda x: x["stars"], reverse=True)[:5]
+        table_data = [
+            [
+                repo["full_name"],
+                repo["language"],
+                f"{repo['stars']:,}",
+                repo["description"][:50] + "..."
+                if len(repo["description"]) > 50
+                else repo["description"],
+            ]
+            for repo in top_repos
+        ]
+        print(
+            tabulate(
+                table_data,
+                headers=["Repository", "Language", "Stars", "Description"],
+                tablefmt="simple",
+            )
+        )
+
+    except ValueError as e:
+        colored_print(f"Configuration error: {e}", "red")
+        colored_print("Please check your .env file and ensure GITHUB_TOKEN is set", "yellow")
+        exit(1)
+    except GithubException as e:
+        colored_print(f"GitHub API error: {e}", "red")
+        exit(1)
+    except Exception as e:
+        colored_print(f"Error: {e}", "red")
+        import traceback
+
+        traceback.print_exc()
+        exit(1)
+
+
+if __name__ == "__main__":
+    fetch()
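
Similarly, a minimal sketch of the fetch step without the CLI, assuming the token sits in a GITHUB_TOKEN environment variable and assuming the greenmining.fetcher import path (the diff does not name the module file):

    import os
    from pathlib import Path

    from greenmining.fetcher import GitHubFetcher  # import path is an assumption

    fetcher = GitHubFetcher(
        token=os.environ["GITHUB_TOKEN"],  # assumes the token is exported
        max_repos=10,
        min_stars=500,
        languages=["Python", "Go"],
    )
    repos = fetcher.search_repositories()
    fetcher.save_results(repos, Path("data/repositories.json"))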