greenmining 1.0.3__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. greenmining/__init__.py +11 -29
  2. greenmining/__main__.py +9 -3
  3. greenmining/__version__.py +2 -2
  4. greenmining/analyzers/__init__.py +3 -7
  5. greenmining/analyzers/code_diff_analyzer.py +151 -61
  6. greenmining/analyzers/qualitative_analyzer.py +15 -81
  7. greenmining/analyzers/statistical_analyzer.py +8 -69
  8. greenmining/analyzers/temporal_analyzer.py +16 -72
  9. greenmining/config.py +105 -58
  10. greenmining/controllers/__init__.py +1 -5
  11. greenmining/controllers/repository_controller.py +153 -94
  12. greenmining/energy/__init__.py +13 -0
  13. greenmining/energy/base.py +165 -0
  14. greenmining/energy/codecarbon_meter.py +146 -0
  15. greenmining/energy/rapl.py +157 -0
  16. greenmining/gsf_patterns.py +4 -26
  17. greenmining/models/__init__.py +1 -5
  18. greenmining/models/aggregated_stats.py +4 -4
  19. greenmining/models/analysis_result.py +4 -4
  20. greenmining/models/commit.py +5 -5
  21. greenmining/models/repository.py +5 -5
  22. greenmining/presenters/__init__.py +1 -5
  23. greenmining/presenters/console_presenter.py +24 -24
  24. greenmining/services/__init__.py +10 -6
  25. greenmining/services/commit_extractor.py +8 -152
  26. greenmining/services/data_aggregator.py +45 -175
  27. greenmining/services/data_analyzer.py +9 -202
  28. greenmining/services/github_fetcher.py +212 -323
  29. greenmining/services/github_graphql_fetcher.py +371 -0
  30. greenmining/services/local_repo_analyzer.py +387 -0
  31. greenmining/services/reports.py +33 -137
  32. greenmining/utils.py +21 -149
  33. {greenmining-1.0.3.dist-info → greenmining-1.0.4.dist-info}/METADATA +61 -151
  34. greenmining-1.0.4.dist-info/RECORD +37 -0
  35. {greenmining-1.0.3.dist-info → greenmining-1.0.4.dist-info}/WHEEL +1 -1
  36. greenmining/analyzers/ml_feature_extractor.py +0 -512
  37. greenmining/analyzers/nlp_analyzer.py +0 -365
  38. greenmining/cli.py +0 -471
  39. greenmining/main.py +0 -37
  40. greenmining-1.0.3.dist-info/RECORD +0 -36
  41. greenmining-1.0.3.dist-info/entry_points.txt +0 -2
  42. {greenmining-1.0.3.dist-info → greenmining-1.0.4.dist-info}/licenses/LICENSE +0 -0
  43. {greenmining-1.0.3.dist-info → greenmining-1.0.4.dist-info}/top_level.txt +0 -0
@@ -1,323 +1,212 @@
1
- """GitHub repository fetcher for green microservices mining."""
2
-
3
- from __future__ import annotations
4
-
5
- from datetime import datetime
6
- from pathlib import Path
7
- from typing import Any, Dict, List, Optional
8
-
9
- import click
10
- from github import Github, GithubException, RateLimitExceededException
11
- from tqdm import tqdm
12
-
13
- from greenmining.config import get_config
14
- from greenmining.utils import (
15
- colored_print,
16
- format_timestamp,
17
- print_banner,
18
- save_json_file,
19
- )
20
-
21
-
22
- class GitHubFetcher:
23
- """Fetches microservice repositories from GitHub."""
24
-
25
- def __init__(
26
- self,
27
- token: str,
28
- max_repos: int = 100,
29
- min_stars: int = 100,
30
- languages: Optional[list[str]] = None,
31
- created_after: Optional[str] = None,
32
- created_before: Optional[str] = None,
33
- pushed_after: Optional[str] = None,
34
- pushed_before: Optional[str] = None,
35
- ):
36
- """Initialize GitHub fetcher.
37
-
38
- Args:
39
- token: GitHub personal access token
40
- max_repos: Maximum number of repositories to fetch
41
- min_stars: Minimum number of stars required
42
- languages: List of programming languages to filter
43
- created_after: Repository created after date (YYYY-MM-DD)
44
- created_before: Repository created before date (YYYY-MM-DD)
45
- pushed_after: Repository pushed after date (YYYY-MM-DD)
46
- pushed_before: Repository pushed before date (YYYY-MM-DD)
47
- """
48
- self.github = Github(token)
49
- self.max_repos = max_repos
50
- self.min_stars = min_stars
51
- self.languages = languages or [
52
- "Java",
53
- "Python",
54
- "Go",
55
- "JavaScript",
56
- "TypeScript",
57
- "C#",
58
- "Rust",
59
- ]
60
- self.created_after = created_after
61
- self.created_before = created_before
62
- self.pushed_after = pushed_after
63
- self.pushed_before = pushed_before
64
-
65
- def search_repositories(self) -> list[dict[str, Any]]:
66
- """Search for microservice repositories.
67
-
68
- Returns:
69
- List of repository metadata dictionaries
70
- """
71
- repositories = []
72
- keywords = ["microservices", "microservice-architecture", "cloud-native"]
73
-
74
- colored_print(f"Searching for repositories with keywords: {', '.join(keywords)}", "cyan")
75
- colored_print(
76
- f"Filters: min_stars={self.min_stars}, languages={', '.join(self.languages)}", "cyan"
77
- )
78
-
79
- # Build search query with temporal filters
80
- query = self._build_temporal_query(keywords)
81
-
82
- try:
83
- # Execute search
84
- search_results = self.github.search_repositories(
85
- query=query, sort="stars", order="desc"
86
- )
87
-
88
- total_found = search_results.totalCount
89
- colored_print(f"Found {total_found} repositories matching criteria", "green")
90
-
91
- # Fetch repository details with progress bar
92
- with tqdm(
93
- total=min(self.max_repos, total_found), desc="Fetching repositories", unit="repo"
94
- ) as pbar:
95
- for idx, repo in enumerate(search_results):
96
- if idx >= self.max_repos:
97
- break
98
-
99
- try:
100
- repo_data = self._extract_repo_metadata(repo, idx + 1)
101
- repositories.append(repo_data)
102
- pbar.update(1)
103
- except GithubException as e:
104
- colored_print(f"Error fetching {repo.full_name}: {e}", "yellow")
105
- continue
106
- except RateLimitExceededException:
107
- colored_print("Rate limit exceeded. Waiting...", "red")
108
- self._handle_rate_limit()
109
- continue
110
-
111
- return repositories
112
-
113
- except GithubException as e:
114
- colored_print(f"GitHub API error: {e}", "red")
115
- raise
116
- except Exception as e:
117
- colored_print(f"Unexpected error: {e}", "red")
118
- raise
119
-
120
- def _extract_repo_metadata(self, repo, repo_id: int) -> dict[str, Any]:
121
- """Extract metadata from repository object.
122
-
123
- Args:
124
- repo: GitHub repository object
125
- repo_id: Sequential repository ID
126
-
127
- Returns:
128
- Dictionary with repository metadata
129
- """
130
- return {
131
- "repo_id": repo_id,
132
- "name": repo.name,
133
- "owner": repo.owner.login,
134
- "full_name": repo.full_name,
135
- "url": repo.html_url,
136
- "clone_url": repo.clone_url,
137
- "language": repo.language,
138
- "stars": repo.stargazers_count,
139
- "forks": repo.forks_count,
140
- "watchers": repo.watchers_count,
141
- "open_issues": repo.open_issues_count,
142
- "last_updated": repo.updated_at.isoformat() if repo.updated_at else None,
143
- "created_at": repo.created_at.isoformat() if repo.created_at else None,
144
- "description": repo.description or "",
145
- "main_branch": repo.default_branch,
146
- "topics": repo.get_topics() if hasattr(repo, "get_topics") else [],
147
- "size": repo.size,
148
- "has_issues": repo.has_issues,
149
- "has_wiki": repo.has_wiki,
150
- "archived": repo.archived,
151
- "license": repo.license.name if repo.license else None,
152
- }
153
-
154
- def _build_temporal_query(self, keywords: list[str]) -> str:
155
- """
156
- Build GitHub search query with temporal constraints.
157
-
158
- Args:
159
- keywords: List of search keywords
160
-
161
- Returns:
162
- Complete search query string
163
- """
164
- query_parts = []
165
-
166
- # Keywords
167
- keyword_query = " OR ".join(keywords)
168
- query_parts.append(f"({keyword_query})")
169
-
170
- # Languages
171
- language_query = " OR ".join([f"language:{lang}" for lang in self.languages])
172
- query_parts.append(f"({language_query})")
173
-
174
- # Stars
175
- query_parts.append(f"stars:>={self.min_stars}")
176
-
177
- # Archived filter
178
- query_parts.append("archived:false")
179
-
180
- # Temporal filters
181
- if self.created_after and self.created_before:
182
- query_parts.append(f"created:{self.created_after}..{self.created_before}")
183
- elif self.created_after:
184
- query_parts.append(f"created:>={self.created_after}")
185
- elif self.created_before:
186
- query_parts.append(f"created:<={self.created_before}")
187
-
188
- if self.pushed_after and self.pushed_before:
189
- query_parts.append(f"pushed:{self.pushed_after}..{self.pushed_before}")
190
- elif self.pushed_after:
191
- query_parts.append(f"pushed:>={self.pushed_after}")
192
- elif self.pushed_before:
193
- query_parts.append(f"pushed:<={self.pushed_before}")
194
-
195
- query = " ".join(query_parts)
196
- colored_print(f"Query: {query}", "cyan")
197
- return query
198
-
199
- def _handle_rate_limit(self):
200
- """Handle GitHub API rate limiting."""
201
- rate_limit = self.github.get_rate_limit()
202
- reset_time = rate_limit.core.reset
203
- wait_seconds = (reset_time - datetime.now()).total_seconds()
204
-
205
- if wait_seconds > 0:
206
- colored_print(f"Rate limit will reset in {wait_seconds:.0f} seconds", "yellow")
207
- import time
208
-
209
- time.sleep(min(wait_seconds + 10, 60)) # Wait with max 60 seconds
210
-
211
- def save_results(self, repositories: list[dict[str, Any]], output_file: Path):
212
- """Save fetched repositories to JSON file.
213
-
214
- Args:
215
- repositories: List of repository metadata
216
- output_file: Output file path
217
- """
218
- data = {
219
- "metadata": {
220
- "fetched_at": format_timestamp(),
221
- "total_repos": len(repositories),
222
- "min_stars": self.min_stars,
223
- "languages": self.languages,
224
- "search_keywords": ["microservices", "microservice-architecture", "cloud-native"],
225
- },
226
- "repositories": repositories,
227
- }
228
-
229
- save_json_file(data, output_file)
230
- colored_print(f"Saved {len(repositories)} repositories to {output_file}", "green")
231
-
232
-
233
- @click.command()
234
- @click.option("--max-repos", default=100, help="Maximum number of repositories to fetch")
235
- @click.option("--min-stars", default=100, help="Minimum stars required")
236
- @click.option(
237
- "--languages",
238
- default="java,python,go,javascript,typescript,csharp,rust",
239
- help="Comma-separated list of languages",
240
- )
241
- @click.option("--output", default=None, help="Output file path (default: data/repositories.json)")
242
- @click.option("--config-file", default=".env", help="Path to .env configuration file")
243
- def fetch(max_repos: int, min_stars: int, languages: str, output: Optional[str], config_file: str):
244
- """Fetch top microservice repositories from GitHub."""
245
- print_banner("GitHub Repository Fetcher")
246
-
247
- try:
248
- # Load configuration
249
- config = get_config(config_file)
250
-
251
- # Parse languages
252
- language_list = [lang.strip().title() for lang in languages.split(",")]
253
-
254
- # Map common language names
255
- language_map = {"Nodejs": "JavaScript", "Csharp": "C#", "Typescript": "TypeScript"}
256
- language_list = [language_map.get(lang, lang) for lang in language_list]
257
-
258
- # Determine output file
259
- output_file = Path(output) if output else config.REPOS_FILE
260
-
261
- colored_print(f"Fetching up to {max_repos} repositories...", "blue")
262
-
263
- # Initialize fetcher
264
- fetcher = GitHubFetcher(
265
- token=config.GITHUB_TOKEN,
266
- max_repos=max_repos,
267
- min_stars=min_stars,
268
- languages=language_list,
269
- )
270
-
271
- # Search and fetch repositories
272
- repositories = fetcher.search_repositories()
273
-
274
- if not repositories:
275
- colored_print("No repositories found matching criteria", "yellow")
276
- return
277
-
278
- # Save results
279
- fetcher.save_results(repositories, output_file)
280
-
281
- # Display summary
282
- colored_print(f"\n✓ Successfully fetched {len(repositories)} repositories", "green")
283
- colored_print(f"Output saved to: {output_file}", "green")
284
-
285
- # Show top 5 repos
286
- colored_print("\nTop 5 repositories by stars:", "cyan")
287
- from tabulate import tabulate
288
-
289
- top_repos = sorted(repositories, key=lambda x: x["stars"], reverse=True)[:5]
290
- table_data = [
291
- [
292
- repo["full_name"],
293
- repo["language"],
294
- f"{repo['stars']:,}",
295
- repo["description"][:50] + "...",
296
- ]
297
- for repo in top_repos
298
- ]
299
- print(
300
- tabulate(
301
- table_data,
302
- headers=["Repository", "Language", "Stars", "Description"],
303
- tablefmt="simple",
304
- )
305
- )
306
-
307
- except ValueError as e:
308
- colored_print(f"Configuration error: {e}", "red")
309
- colored_print("Please check your .env file and ensure GITHUB_TOKEN is set", "yellow")
310
- exit(1)
311
- except GithubException as e:
312
- colored_print(f"GitHub API error: {e}", "red")
313
- exit(1)
314
- except Exception as e:
315
- colored_print(f"Error: {e}", "red")
316
- import traceback
317
-
318
- traceback.print_exc()
319
- exit(1)
320
-
321
-
322
- if __name__ == "__main__":
323
- fetch()
1
+ """
2
+ ================================================================================
3
+ DEADCODE - OLD REST API IMPLEMENTATION
4
+ ================================================================================
5
+
6
+ This file contains the OLD GitHub REST API implementation.
7
+ It has been REPLACED by GitHubGraphQLFetcher for better performance.
8
+
9
+ Performance comparison:
10
+ REST API: 10+ requests for 100 repos, ~2 minutes
11
+ GraphQL API: 1-2 requests for 100 repos, ~15 seconds (10x faster!)
12
+
13
+ USE INSTEAD: greenmining.services.github_graphql_fetcher.GitHubGraphQLFetcher
14
+
15
+ This file is kept for reference only. Do not use in production.
16
+
17
+ ================================================================================
18
+ """
19
+
20
+ # GitHub repository fetcher for green microservices mining.
21
+
22
+ # from __future__ import annotations
23
+ #
24
+ # from datetime import datetime
25
+ # from pathlib import Path
26
+ # from typing import Any, Dict, List, Optional
27
+ #
28
+ # from github import Github, GithubException, RateLimitExceededException
29
+ # from tqdm import tqdm
30
+ #
31
+ # from greenmining.config import get_config
32
+ # from greenmining.utils import (
33
+ # colored_print,
34
+ # format_timestamp,
35
+ # print_banner,
36
+ # save_json_file,
37
+ # )
38
+ #
39
+ #
40
+ # class GitHubFetcher:
41
+ # # Fetches microservice repositories from GitHub using REST API (SLOW).
42
+ #
43
+ # def __init__(
44
+ # self,
45
+ # token: str,
46
+ # max_repos: int = 100,
47
+ # min_stars: int = 100,
48
+ # languages: Optional[list[str]] = None,
49
+ # created_after: Optional[str] = None,
50
+ # created_before: Optional[str] = None,
51
+ # pushed_after: Optional[str] = None,
52
+ # pushed_before: Optional[str] = None,
53
+ # ):
54
+ # # Initialize GitHub fetcher.
55
+ # self.github = Github(token)
56
+ # self.max_repos = max_repos
57
+ # self.min_stars = min_stars
58
+ # self.languages = languages or [
59
+ # "Java",
60
+ # "Python",
61
+ # "Go",
62
+ # "JavaScript",
63
+ # "TypeScript",
64
+ # "C#",
65
+ # "Rust",
66
+ # ]
67
+ # self.created_after = created_after
68
+ # self.created_before = created_before
69
+ # self.pushed_after = pushed_after
70
+ # self.pushed_before = pushed_before
71
+ #
72
+ # def search_repositories(self) -> list[dict[str, Any]]:
73
+ # # Search for microservice repositories (REST API - many requests).
74
+ # repositories = []
75
+ # keywords = ["microservices", "microservice-architecture", "cloud-native"]
76
+ #
77
+ # colored_print(f"Searching for repositories with keywords: {', '.join(keywords)}", "cyan")
78
+ # colored_print(
79
+ # f"Filters: min_stars={self.min_stars}, languages={', '.join(self.languages)}", "cyan"
80
+ # )
81
+ #
82
+ # # Build search query with temporal filters
83
+ # query = self._build_temporal_query(keywords)
84
+ #
85
+ # try:
86
+ # # Execute search (1 request)
87
+ # search_results = self.github.search_repositories(
88
+ # query=query, sort="stars", order="desc"
89
+ # )
90
+ #
91
+ # total_found = search_results.totalCount
92
+ # colored_print(f"Found {total_found} repositories matching criteria", "green")
93
+ #
94
+ # # Fetch repository details with progress bar (1 request per repo = SLOW)
95
+ # with tqdm(
96
+ # total=min(self.max_repos, total_found), desc="Fetching repositories", unit="repo"
97
+ # ) as pbar:
98
+ # for idx, repo in enumerate(search_results):
99
+ # if idx >= self.max_repos:
100
+ # break
101
+ #
102
+ # try:
103
+ # repo_data = self._extract_repo_metadata(repo, idx + 1)
104
+ # repositories.append(repo_data)
105
+ # pbar.update(1)
106
+ # except GithubException as e:
107
+ # colored_print(f"Error fetching {repo.full_name}: {e}", "yellow")
108
+ # continue
109
+ # except RateLimitExceededException:
110
+ # colored_print("Rate limit exceeded. Waiting...", "red")
111
+ # self._handle_rate_limit()
112
+ # continue
113
+ #
114
+ # return repositories
115
+ #
116
+ # except GithubException as e:
117
+ # colored_print(f"GitHub API error: {e}", "red")
118
+ # raise
119
+ # except Exception as e:
120
+ # colored_print(f"Unexpected error: {e}", "red")
121
+ # raise
122
+ #
123
+ # def _extract_repo_metadata(self, repo, repo_id: int) -> dict[str, Any]:
124
+ # # Extract metadata from repository object.
125
+ # return {
126
+ # "repo_id": repo_id,
127
+ # "name": repo.name,
128
+ # "owner": repo.owner.login,
129
+ # "full_name": repo.full_name,
130
+ # "url": repo.html_url,
131
+ # "clone_url": repo.clone_url,
132
+ # "language": repo.language,
133
+ # "stars": repo.stargazers_count,
134
+ # "forks": repo.forks_count,
135
+ # "watchers": repo.watchers_count,
136
+ # "open_issues": repo.open_issues_count,
137
+ # "last_updated": repo.updated_at.isoformat() if repo.updated_at else None,
138
+ # "created_at": repo.created_at.isoformat() if repo.created_at else None,
139
+ # "description": repo.description or "",
140
+ # "main_branch": repo.default_branch,
141
+ # "topics": repo.get_topics() if hasattr(repo, "get_topics") else [],
142
+ # "size": repo.size,
143
+ # "has_issues": repo.has_issues,
144
+ # "has_wiki": repo.has_wiki,
145
+ # "archived": repo.archived,
146
+ # "license": repo.license.name if repo.license else None,
147
+ # }
148
+ #
149
+ # def _build_temporal_query(self, keywords: list[str]) -> str:
150
+ # # Build GitHub search query with temporal constraints.
151
+ # query_parts = []
152
+ #
153
+ # # Keywords
154
+ # keyword_query = " OR ".join(keywords)
155
+ # query_parts.append(f"({keyword_query})")
156
+ #
157
+ # # Languages
158
+ # language_query = " OR ".join([f"language:{lang}" for lang in self.languages])
159
+ # query_parts.append(f"({language_query})")
160
+ #
161
+ # # Stars
162
+ # query_parts.append(f"stars:>={self.min_stars}")
163
+ #
164
+ # # Archived filter
165
+ # query_parts.append("archived:false")
166
+ #
167
+ # # Temporal filters
168
+ # if self.created_after and self.created_before:
169
+ # query_parts.append(f"created:{self.created_after}..{self.created_before}")
170
+ # elif self.created_after:
171
+ # query_parts.append(f"created:>={self.created_after}")
172
+ # elif self.created_before:
173
+ # query_parts.append(f"created:<={self.created_before}")
174
+ #
175
+ # if self.pushed_after and self.pushed_before:
176
+ # query_parts.append(f"pushed:{self.pushed_after}..{self.pushed_before}")
177
+ # elif self.pushed_after:
178
+ # query_parts.append(f"pushed:>={self.pushed_after}")
179
+ # elif self.pushed_before:
180
+ # query_parts.append(f"pushed:<={self.pushed_before}")
181
+ #
182
+ # query = " ".join(query_parts)
183
+ # colored_print(f"Query: {query}", "cyan")
184
+ # return query
185
+ #
186
+ # def _handle_rate_limit(self):
187
+ # # Handle GitHub API rate limiting.
188
+ # rate_limit = self.github.get_rate_limit()
189
+ # reset_time = rate_limit.core.reset
190
+ # wait_seconds = (reset_time - datetime.now()).total_seconds()
191
+ #
192
+ # if wait_seconds > 0:
193
+ # colored_print(f"Rate limit will reset in {wait_seconds:.0f} seconds", "yellow")
194
+ # import time
195
+ #
196
+ # time.sleep(min(wait_seconds + 10, 60)) # Wait with max 60 seconds
197
+ #
198
+ # def save_results(self, repositories: list[dict[str, Any]], output_file: Path):
199
+ # # Save fetched repositories to JSON file.
200
+ # data = {
201
+ # "metadata": {
202
+ # "fetched_at": format_timestamp(),
203
+ # "total_repos": len(repositories),
204
+ # "min_stars": self.min_stars,
205
+ # "languages": self.languages,
206
+ # "search_keywords": ["microservices", "microservice-architecture", "cloud-native"],
207
+ # },
208
+ # "repositories": repositories,
209
+ # }
210
+ #
211
+ # save_json_file(data, output_file)
212
+ # colored_print(f"Saved {len(repositories)} repositories to {output_file}", "green")