greenmining-1.0.3-py3-none-any.whl → greenmining-1.0.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. greenmining/__init__.py +11 -29
  2. greenmining/__main__.py +9 -3
  3. greenmining/__version__.py +2 -2
  4. greenmining/analyzers/__init__.py +3 -7
  5. greenmining/analyzers/code_diff_analyzer.py +151 -61
  6. greenmining/analyzers/qualitative_analyzer.py +15 -81
  7. greenmining/analyzers/statistical_analyzer.py +8 -69
  8. greenmining/analyzers/temporal_analyzer.py +16 -72
  9. greenmining/config.py +105 -58
  10. greenmining/controllers/__init__.py +1 -5
  11. greenmining/controllers/repository_controller.py +153 -94
  12. greenmining/energy/__init__.py +13 -0
  13. greenmining/energy/base.py +165 -0
  14. greenmining/energy/codecarbon_meter.py +146 -0
  15. greenmining/energy/rapl.py +157 -0
  16. greenmining/gsf_patterns.py +4 -26
  17. greenmining/models/__init__.py +1 -5
  18. greenmining/models/aggregated_stats.py +4 -4
  19. greenmining/models/analysis_result.py +4 -4
  20. greenmining/models/commit.py +5 -5
  21. greenmining/models/repository.py +5 -5
  22. greenmining/presenters/__init__.py +1 -5
  23. greenmining/presenters/console_presenter.py +24 -24
  24. greenmining/services/__init__.py +10 -6
  25. greenmining/services/commit_extractor.py +8 -152
  26. greenmining/services/data_aggregator.py +45 -175
  27. greenmining/services/data_analyzer.py +9 -202
  28. greenmining/services/github_fetcher.py +210 -323
  29. greenmining/services/github_graphql_fetcher.py +361 -0
  30. greenmining/services/local_repo_analyzer.py +387 -0
  31. greenmining/services/reports.py +33 -137
  32. greenmining/utils.py +21 -149
  33. {greenmining-1.0.3.dist-info → greenmining-1.0.5.dist-info}/METADATA +69 -173
  34. greenmining-1.0.5.dist-info/RECORD +37 -0
  35. {greenmining-1.0.3.dist-info → greenmining-1.0.5.dist-info}/WHEEL +1 -1
  36. greenmining/analyzers/ml_feature_extractor.py +0 -512
  37. greenmining/analyzers/nlp_analyzer.py +0 -365
  38. greenmining/cli.py +0 -471
  39. greenmining/main.py +0 -37
  40. greenmining-1.0.3.dist-info/RECORD +0 -36
  41. greenmining-1.0.3.dist-info/entry_points.txt +0 -2
  42. {greenmining-1.0.3.dist-info → greenmining-1.0.5.dist-info}/licenses/LICENSE +0 -0
  43. {greenmining-1.0.3.dist-info → greenmining-1.0.5.dist-info}/top_level.txt +0 -0
greenmining/services/github_fetcher.py
@@ -1,323 +1,210 @@
- """GitHub repository fetcher for green microservices mining."""
-
- from __future__ import annotations
-
- from datetime import datetime
- from pathlib import Path
- from typing import Any, Dict, List, Optional
-
- import click
- from github import Github, GithubException, RateLimitExceededException
- from tqdm import tqdm
-
- from greenmining.config import get_config
- from greenmining.utils import (
-     colored_print,
-     format_timestamp,
-     print_banner,
-     save_json_file,
- )
-
-
- class GitHubFetcher:
-     """Fetches microservice repositories from GitHub."""
-
-     def __init__(
-         self,
-         token: str,
-         max_repos: int = 100,
-         min_stars: int = 100,
-         languages: Optional[list[str]] = None,
-         created_after: Optional[str] = None,
-         created_before: Optional[str] = None,
-         pushed_after: Optional[str] = None,
-         pushed_before: Optional[str] = None,
-     ):
-         """Initialize GitHub fetcher.
-
-         Args:
-             token: GitHub personal access token
-             max_repos: Maximum number of repositories to fetch
-             min_stars: Minimum number of stars required
-             languages: List of programming languages to filter
-             created_after: Repository created after date (YYYY-MM-DD)
-             created_before: Repository created before date (YYYY-MM-DD)
-             pushed_after: Repository pushed after date (YYYY-MM-DD)
-             pushed_before: Repository pushed before date (YYYY-MM-DD)
-         """
-         self.github = Github(token)
-         self.max_repos = max_repos
-         self.min_stars = min_stars
-         self.languages = languages or [
-             "Java",
-             "Python",
-             "Go",
-             "JavaScript",
-             "TypeScript",
-             "C#",
-             "Rust",
-         ]
-         self.created_after = created_after
-         self.created_before = created_before
-         self.pushed_after = pushed_after
-         self.pushed_before = pushed_before
-
-     def search_repositories(self) -> list[dict[str, Any]]:
-         """Search for microservice repositories.
-
-         Returns:
-             List of repository metadata dictionaries
-         """
-         repositories = []
-         keywords = ["microservices", "microservice-architecture", "cloud-native"]
-
-         colored_print(f"Searching for repositories with keywords: {', '.join(keywords)}", "cyan")
-         colored_print(
-             f"Filters: min_stars={self.min_stars}, languages={', '.join(self.languages)}", "cyan"
-         )
-
-         # Build search query with temporal filters
-         query = self._build_temporal_query(keywords)
-
-         try:
-             # Execute search
-             search_results = self.github.search_repositories(
-                 query=query, sort="stars", order="desc"
-             )
-
-             total_found = search_results.totalCount
-             colored_print(f"Found {total_found} repositories matching criteria", "green")
-
-             # Fetch repository details with progress bar
-             with tqdm(
-                 total=min(self.max_repos, total_found), desc="Fetching repositories", unit="repo"
-             ) as pbar:
-                 for idx, repo in enumerate(search_results):
-                     if idx >= self.max_repos:
-                         break
-
-                     try:
-                         repo_data = self._extract_repo_metadata(repo, idx + 1)
-                         repositories.append(repo_data)
-                         pbar.update(1)
-                     except GithubException as e:
-                         colored_print(f"Error fetching {repo.full_name}: {e}", "yellow")
-                         continue
-                     except RateLimitExceededException:
-                         colored_print("Rate limit exceeded. Waiting...", "red")
-                         self._handle_rate_limit()
-                         continue
-
-             return repositories
-
-         except GithubException as e:
-             colored_print(f"GitHub API error: {e}", "red")
-             raise
-         except Exception as e:
-             colored_print(f"Unexpected error: {e}", "red")
-             raise
-
-     def _extract_repo_metadata(self, repo, repo_id: int) -> dict[str, Any]:
-         """Extract metadata from repository object.
-
-         Args:
-             repo: GitHub repository object
-             repo_id: Sequential repository ID
-
-         Returns:
-             Dictionary with repository metadata
-         """
-         return {
-             "repo_id": repo_id,
-             "name": repo.name,
-             "owner": repo.owner.login,
-             "full_name": repo.full_name,
-             "url": repo.html_url,
-             "clone_url": repo.clone_url,
-             "language": repo.language,
-             "stars": repo.stargazers_count,
-             "forks": repo.forks_count,
-             "watchers": repo.watchers_count,
-             "open_issues": repo.open_issues_count,
-             "last_updated": repo.updated_at.isoformat() if repo.updated_at else None,
-             "created_at": repo.created_at.isoformat() if repo.created_at else None,
-             "description": repo.description or "",
-             "main_branch": repo.default_branch,
-             "topics": repo.get_topics() if hasattr(repo, "get_topics") else [],
-             "size": repo.size,
-             "has_issues": repo.has_issues,
-             "has_wiki": repo.has_wiki,
-             "archived": repo.archived,
-             "license": repo.license.name if repo.license else None,
-         }
-
-     def _build_temporal_query(self, keywords: list[str]) -> str:
-         """
-         Build GitHub search query with temporal constraints.
-
-         Args:
-             keywords: List of search keywords
-
-         Returns:
-             Complete search query string
-         """
-         query_parts = []
-
-         # Keywords
-         keyword_query = " OR ".join(keywords)
-         query_parts.append(f"({keyword_query})")
-
-         # Languages
-         language_query = " OR ".join([f"language:{lang}" for lang in self.languages])
-         query_parts.append(f"({language_query})")
-
-         # Stars
-         query_parts.append(f"stars:>={self.min_stars}")
-
-         # Archived filter
-         query_parts.append("archived:false")
-
-         # Temporal filters
-         if self.created_after and self.created_before:
-             query_parts.append(f"created:{self.created_after}..{self.created_before}")
-         elif self.created_after:
-             query_parts.append(f"created:>={self.created_after}")
-         elif self.created_before:
-             query_parts.append(f"created:<={self.created_before}")
-
-         if self.pushed_after and self.pushed_before:
-             query_parts.append(f"pushed:{self.pushed_after}..{self.pushed_before}")
-         elif self.pushed_after:
-             query_parts.append(f"pushed:>={self.pushed_after}")
-         elif self.pushed_before:
-             query_parts.append(f"pushed:<={self.pushed_before}")
-
-         query = " ".join(query_parts)
-         colored_print(f"Query: {query}", "cyan")
-         return query
-
-     def _handle_rate_limit(self):
-         """Handle GitHub API rate limiting."""
-         rate_limit = self.github.get_rate_limit()
-         reset_time = rate_limit.core.reset
-         wait_seconds = (reset_time - datetime.now()).total_seconds()
-
-         if wait_seconds > 0:
-             colored_print(f"Rate limit will reset in {wait_seconds:.0f} seconds", "yellow")
-             import time
-
-             time.sleep(min(wait_seconds + 10, 60))  # Wait with max 60 seconds
-
-     def save_results(self, repositories: list[dict[str, Any]], output_file: Path):
-         """Save fetched repositories to JSON file.
-
-         Args:
-             repositories: List of repository metadata
-             output_file: Output file path
-         """
-         data = {
-             "metadata": {
-                 "fetched_at": format_timestamp(),
-                 "total_repos": len(repositories),
-                 "min_stars": self.min_stars,
-                 "languages": self.languages,
-                 "search_keywords": ["microservices", "microservice-architecture", "cloud-native"],
-             },
-             "repositories": repositories,
-         }
-
-         save_json_file(data, output_file)
-         colored_print(f"Saved {len(repositories)} repositories to {output_file}", "green")
-
-
- @click.command()
- @click.option("--max-repos", default=100, help="Maximum number of repositories to fetch")
- @click.option("--min-stars", default=100, help="Minimum stars required")
- @click.option(
-     "--languages",
-     default="java,python,go,javascript,typescript,csharp,rust",
-     help="Comma-separated list of languages",
- )
- @click.option("--output", default=None, help="Output file path (default: data/repositories.json)")
- @click.option("--config-file", default=".env", help="Path to .env configuration file")
- def fetch(max_repos: int, min_stars: int, languages: str, output: Optional[str], config_file: str):
-     """Fetch top microservice repositories from GitHub."""
-     print_banner("GitHub Repository Fetcher")
-
-     try:
-         # Load configuration
-         config = get_config(config_file)
-
-         # Parse languages
-         language_list = [lang.strip().title() for lang in languages.split(",")]
-
-         # Map common language names
-         language_map = {"Nodejs": "JavaScript", "Csharp": "C#", "Typescript": "TypeScript"}
-         language_list = [language_map.get(lang, lang) for lang in language_list]
-
-         # Determine output file
-         output_file = Path(output) if output else config.REPOS_FILE
-
-         colored_print(f"Fetching up to {max_repos} repositories...", "blue")
-
-         # Initialize fetcher
-         fetcher = GitHubFetcher(
-             token=config.GITHUB_TOKEN,
-             max_repos=max_repos,
-             min_stars=min_stars,
-             languages=language_list,
-         )
-
-         # Search and fetch repositories
-         repositories = fetcher.search_repositories()
-
-         if not repositories:
-             colored_print("No repositories found matching criteria", "yellow")
-             return
-
-         # Save results
-         fetcher.save_results(repositories, output_file)
-
-         # Display summary
-         colored_print(f"\n✓ Successfully fetched {len(repositories)} repositories", "green")
-         colored_print(f"Output saved to: {output_file}", "green")
-
-         # Show top 5 repos
-         colored_print("\nTop 5 repositories by stars:", "cyan")
-         from tabulate import tabulate
-
-         top_repos = sorted(repositories, key=lambda x: x["stars"], reverse=True)[:5]
-         table_data = [
-             [
-                 repo["full_name"],
-                 repo["language"],
-                 f"{repo['stars']:,}",
-                 repo["description"][:50] + "...",
-             ]
-             for repo in top_repos
-         ]
-         print(
-             tabulate(
-                 table_data,
-                 headers=["Repository", "Language", "Stars", "Description"],
-                 tablefmt="simple",
-             )
-         )
-
-     except ValueError as e:
-         colored_print(f"Configuration error: {e}", "red")
-         colored_print("Please check your .env file and ensure GITHUB_TOKEN is set", "yellow")
-         exit(1)
-     except GithubException as e:
-         colored_print(f"GitHub API error: {e}", "red")
-         exit(1)
-     except Exception as e:
-         colored_print(f"Error: {e}", "red")
-         import traceback
-
-         traceback.print_exc()
-         exit(1)
-
-
- if __name__ == "__main__":
-     fetch()
+ # ================================================================================
+ # DEADCODE - OLD REST API IMPLEMENTATION
+ # ================================================================================
+ #
+ # This file contains the OLD GitHub REST API implementation.
+ # It has been REPLACED by GitHubGraphQLFetcher for better performance.
+ #
+ # Performance comparison:
+ #   REST API: 10+ requests for 100 repos, ~2 minutes
+ #   GraphQL API: 1-2 requests for 100 repos, ~15 seconds (10x faster!)
+ #
+ # USE INSTEAD: greenmining.services.github_graphql_fetcher.GitHubGraphQLFetcher
+ #
+ # This file is kept for reference only. Do not use in production.
+ #
+ # ================================================================================
+
+ # GitHub repository fetcher for green microservices mining.
+
+ # from __future__ import annotations
+ #
+ # from datetime import datetime
+ # from pathlib import Path
+ # from typing import Any, Dict, List, Optional
+ #
+ # from github import Github, GithubException, RateLimitExceededException
+ # from tqdm import tqdm
+ #
+ # from greenmining.config import get_config
+ # from greenmining.utils import (
+ #     colored_print,
+ #     format_timestamp,
+ #     print_banner,
+ #     save_json_file,
+ # )
+ #
+ #
+ # class GitHubFetcher:
+ #     # Fetches microservice repositories from GitHub using REST API (SLOW).
+ #
+ #     def __init__(
+ #         self,
+ #         token: str,
+ #         max_repos: int = 100,
+ #         min_stars: int = 100,
+ #         languages: Optional[list[str]] = None,
+ #         created_after: Optional[str] = None,
+ #         created_before: Optional[str] = None,
+ #         pushed_after: Optional[str] = None,
+ #         pushed_before: Optional[str] = None,
+ #     ):
+ #         # Initialize GitHub fetcher.
+ #         self.github = Github(token)
+ #         self.max_repos = max_repos
+ #         self.min_stars = min_stars
+ #         self.languages = languages or [
+ #             "Java",
+ #             "Python",
+ #             "Go",
+ #             "JavaScript",
+ #             "TypeScript",
+ #             "C#",
+ #             "Rust",
+ #         ]
+ #         self.created_after = created_after
+ #         self.created_before = created_before
+ #         self.pushed_after = pushed_after
+ #         self.pushed_before = pushed_before
+ #
+ #     def search_repositories(self) -> list[dict[str, Any]]:
+ #         # Search for microservice repositories (REST API - many requests).
+ #         repositories = []
+ #         keywords = ["microservices", "microservice-architecture", "cloud-native"]
+ #
+ #         colored_print(f"Searching for repositories with keywords: {', '.join(keywords)}", "cyan")
+ #         colored_print(
+ #             f"Filters: min_stars={self.min_stars}, languages={', '.join(self.languages)}", "cyan"
+ #         )
+ #
+ #         # Build search query with temporal filters
+ #         query = self._build_temporal_query(keywords)
+ #
+ #         try:
+ #             # Execute search (1 request)
+ #             search_results = self.github.search_repositories(
+ #                 query=query, sort="stars", order="desc"
+ #             )
+ #
+ #             total_found = search_results.totalCount
+ #             colored_print(f"Found {total_found} repositories matching criteria", "green")
+ #
+ #             # Fetch repository details with progress bar (1 request per repo = SLOW)
+ #             with tqdm(
+ #                 total=min(self.max_repos, total_found), desc="Fetching repositories", unit="repo"
+ #             ) as pbar:
+ #                 for idx, repo in enumerate(search_results):
+ #                     if idx >= self.max_repos:
+ #                         break
+ #
+ #                     try:
+ #                         repo_data = self._extract_repo_metadata(repo, idx + 1)
+ #                         repositories.append(repo_data)
+ #                         pbar.update(1)
+ #                     except GithubException as e:
+ #                         colored_print(f"Error fetching {repo.full_name}: {e}", "yellow")
+ #                         continue
+ #                     except RateLimitExceededException:
+ #                         colored_print("Rate limit exceeded. Waiting...", "red")
+ #                         self._handle_rate_limit()
+ #                         continue
+ #
+ #             return repositories
+ #
+ #         except GithubException as e:
+ #             colored_print(f"GitHub API error: {e}", "red")
+ #             raise
+ #         except Exception as e:
+ #             colored_print(f"Unexpected error: {e}", "red")
+ #             raise
+ #
+ #     def _extract_repo_metadata(self, repo, repo_id: int) -> dict[str, Any]:
+ #         # Extract metadata from repository object.
+ #         return {
+ #             "repo_id": repo_id,
+ #             "name": repo.name,
+ #             "owner": repo.owner.login,
+ #             "full_name": repo.full_name,
+ #             "url": repo.html_url,
+ #             "clone_url": repo.clone_url,
+ #             "language": repo.language,
+ #             "stars": repo.stargazers_count,
+ #             "forks": repo.forks_count,
+ #             "watchers": repo.watchers_count,
+ #             "open_issues": repo.open_issues_count,
+ #             "last_updated": repo.updated_at.isoformat() if repo.updated_at else None,
+ #             "created_at": repo.created_at.isoformat() if repo.created_at else None,
+ #             "description": repo.description or "",
+ #             "main_branch": repo.default_branch,
+ #             "topics": repo.get_topics() if hasattr(repo, "get_topics") else [],
+ #             "size": repo.size,
+ #             "has_issues": repo.has_issues,
+ #             "has_wiki": repo.has_wiki,
+ #             "archived": repo.archived,
+ #             "license": repo.license.name if repo.license else None,
+ #         }
+ #
+ #     def _build_temporal_query(self, keywords: list[str]) -> str:
+ #         # Build GitHub search query with temporal constraints.
+ #         query_parts = []
+ #
+ #         # Keywords
+ #         keyword_query = " OR ".join(keywords)
+ #         query_parts.append(f"({keyword_query})")
+ #
+ #         # Languages
+ #         language_query = " OR ".join([f"language:{lang}" for lang in self.languages])
+ #         query_parts.append(f"({language_query})")
+ #
+ #         # Stars
+ #         query_parts.append(f"stars:>={self.min_stars}")
+ #
+ #         # Archived filter
+ #         query_parts.append("archived:false")
+ #
+ #         # Temporal filters
+ #         if self.created_after and self.created_before:
+ #             query_parts.append(f"created:{self.created_after}..{self.created_before}")
+ #         elif self.created_after:
+ #             query_parts.append(f"created:>={self.created_after}")
+ #         elif self.created_before:
+ #             query_parts.append(f"created:<={self.created_before}")
+ #
+ #         if self.pushed_after and self.pushed_before:
+ #             query_parts.append(f"pushed:{self.pushed_after}..{self.pushed_before}")
+ #         elif self.pushed_after:
+ #             query_parts.append(f"pushed:>={self.pushed_after}")
+ #         elif self.pushed_before:
+ #             query_parts.append(f"pushed:<={self.pushed_before}")
+ #
+ #         query = " ".join(query_parts)
+ #         colored_print(f"Query: {query}", "cyan")
+ #         return query
+ #
+ #     def _handle_rate_limit(self):
+ #         # Handle GitHub API rate limiting.
+ #         rate_limit = self.github.get_rate_limit()
+ #         reset_time = rate_limit.core.reset
+ #         wait_seconds = (reset_time - datetime.now()).total_seconds()
+ #
+ #         if wait_seconds > 0:
+ #             colored_print(f"Rate limit will reset in {wait_seconds:.0f} seconds", "yellow")
+ #             import time
+ #
+ #             time.sleep(min(wait_seconds + 10, 60))  # Wait with max 60 seconds
+ #
+ #     def save_results(self, repositories: list[dict[str, Any]], output_file: Path):
+ #         # Save fetched repositories to JSON file.
+ #         data = {
+ #             "metadata": {
+ #                 "fetched_at": format_timestamp(),
+ #                 "total_repos": len(repositories),
+ #                 "min_stars": self.min_stars,
+ #                 "languages": self.languages,
+ #                 "search_keywords": ["microservices", "microservice-architecture", "cloud-native"],
+ #             },
+ #             "repositories": repositories,
+ #         }
+ #
+ #         save_json_file(data, output_file)
+ #         colored_print(f"Saved {len(repositories)} repositories to {output_file}", "green")