greenmining 1.0.3__py3-none-any.whl → 1.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. greenmining/__init__.py +11 -29
  2. greenmining/__main__.py +9 -3
  3. greenmining/__version__.py +2 -2
  4. greenmining/analyzers/__init__.py +3 -7
  5. greenmining/analyzers/code_diff_analyzer.py +151 -61
  6. greenmining/analyzers/qualitative_analyzer.py +15 -81
  7. greenmining/analyzers/statistical_analyzer.py +8 -69
  8. greenmining/analyzers/temporal_analyzer.py +16 -72
  9. greenmining/config.py +105 -58
  10. greenmining/controllers/__init__.py +1 -5
  11. greenmining/controllers/repository_controller.py +153 -94
  12. greenmining/energy/__init__.py +13 -0
  13. greenmining/energy/base.py +165 -0
  14. greenmining/energy/codecarbon_meter.py +146 -0
  15. greenmining/energy/rapl.py +157 -0
  16. greenmining/gsf_patterns.py +4 -26
  17. greenmining/models/__init__.py +1 -5
  18. greenmining/models/aggregated_stats.py +4 -4
  19. greenmining/models/analysis_result.py +4 -4
  20. greenmining/models/commit.py +5 -5
  21. greenmining/models/repository.py +5 -5
  22. greenmining/presenters/__init__.py +1 -5
  23. greenmining/presenters/console_presenter.py +24 -24
  24. greenmining/services/__init__.py +10 -6
  25. greenmining/services/commit_extractor.py +8 -152
  26. greenmining/services/data_aggregator.py +45 -175
  27. greenmining/services/data_analyzer.py +9 -202
  28. greenmining/services/github_fetcher.py +210 -323
  29. greenmining/services/github_graphql_fetcher.py +361 -0
  30. greenmining/services/local_repo_analyzer.py +387 -0
  31. greenmining/services/reports.py +33 -137
  32. greenmining/utils.py +21 -149
  33. {greenmining-1.0.3.dist-info → greenmining-1.0.5.dist-info}/METADATA +69 -173
  34. greenmining-1.0.5.dist-info/RECORD +37 -0
  35. {greenmining-1.0.3.dist-info → greenmining-1.0.5.dist-info}/WHEEL +1 -1
  36. greenmining/analyzers/ml_feature_extractor.py +0 -512
  37. greenmining/analyzers/nlp_analyzer.py +0 -365
  38. greenmining/cli.py +0 -471
  39. greenmining/main.py +0 -37
  40. greenmining-1.0.3.dist-info/RECORD +0 -36
  41. greenmining-1.0.3.dist-info/entry_points.txt +0 -2
  42. {greenmining-1.0.3.dist-info → greenmining-1.0.5.dist-info}/licenses/LICENSE +0 -0
  43. {greenmining-1.0.3.dist-info → greenmining-1.0.5.dist-info}/top_level.txt +0 -0
greenmining/services/github_graphql_fetcher.py (new file)
@@ -0,0 +1,361 @@
+ """GitHub GraphQL API fetcher for faster, more efficient repository fetching.
+ 
+ GraphQL allows fetching exactly the data you need in a single request,
+ reducing API calls and improving rate-limit efficiency.
+ """
+ 
+ import json
+ import time
+ from typing import Any, Dict, List, Optional
+ 
+ import requests
+ 
+ from greenmining.models.repository import Repository
+ 
+ 
+ class GitHubGraphQLFetcher:
+     """Fetch GitHub repositories using the GraphQL API (v4).
+ 
+     Benefits over the REST API:
+     - Fetch repos + commits in one request instead of 100+ REST calls
+     - Get exactly the fields you need (no over-fetching)
+     - Better rate-limit efficiency (5000 points/hour vs 5000 requests/hour)
+     - More powerful search capabilities
+     """
+ 
+     GRAPHQL_ENDPOINT = "https://api.github.com/graphql"
+ 
+     def __init__(self, token: str):
+         """Initialize the GraphQL fetcher.
+ 
+         Args:
+             token: GitHub personal access token.
+         """
+         self.token = token
+         self.headers = {
+             "Authorization": f"Bearer {token}",
+             "Content-Type": "application/json",
+         }
+ 
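The constructor only stores the token and builds bearer-auth headers; no request is made until a query runs. A minimal sketch of verifying a token against the same endpoint before doing any real work (the `viewer { login }` query is standard GitHub GraphQL; the `GITHUB_TOKEN` env-var name is an assumption):

```python
import os
import requests

# Smoke test: confirm the token authenticates before fetching anything.
token = os.environ["GITHUB_TOKEN"]  # assumed env var
resp = requests.post(
    "https://api.github.com/graphql",
    headers={"Authorization": f"Bearer {token}"},
    json={"query": "query { viewer { login } }"},
    timeout=30,
)
resp.raise_for_status()
print(resp.json()["data"]["viewer"]["login"])  # the authenticated user
```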
+     def search_repositories(
+         self,
+         keywords: str = "microservices",
+         max_repos: int = 100,
+         min_stars: int = 100,
+         languages: Optional[List[str]] = None,
+         created_after: Optional[str] = None,
+         created_before: Optional[str] = None,
+         pushed_after: Optional[str] = None,
+         pushed_before: Optional[str] = None,
+     ) -> List[Repository]:
+         """Search GitHub repositories using GraphQL.
+ 
+         Args:
+             keywords: Search keywords.
+             max_repos: Maximum number of repositories to fetch.
+             min_stars: Minimum star count.
+             languages: Programming languages to filter by.
+             created_after: Created-after date (YYYY-MM-DD).
+             created_before: Created-before date (YYYY-MM-DD).
+             pushed_after: Pushed-after date (YYYY-MM-DD).
+             pushed_before: Pushed-before date (YYYY-MM-DD).
+ 
+         Returns:
+             List of Repository objects.
+         """
+         # Build search query
+         search_query = self._build_search_query(
+             keywords,
+             min_stars,
+             languages,
+             created_after,
+             created_before,
+             pushed_after,
+             pushed_before,
+         )
+ 
+         print(f"GraphQL Search Query: {search_query}")
+ 
+         # GraphQL query to fetch repositories; $after carries the pagination
+         # cursor so subsequent iterations advance instead of refetching page 1.
+         query = """
+         query($searchQuery: String!, $first: Int!, $after: String) {
+           search(query: $searchQuery, type: REPOSITORY, first: $first, after: $after) {
+             repositoryCount
+             pageInfo {
+               hasNextPage
+               endCursor
+             }
+             nodes {
+               ... on Repository {
+                 id
+                 name
+                 nameWithOwner
+                 description
+                 url
+                 createdAt
+                 updatedAt
+                 pushedAt
+                 stargazerCount
+                 forkCount
+                 watchers {
+                   totalCount
+                 }
+                 primaryLanguage {
+                   name
+                 }
+                 languages(first: 5) {
+                   nodes {
+                     name
+                   }
+                 }
+                 licenseInfo {
+                   name
+                 }
+                 isArchived
+                 isFork
+                 defaultBranchRef {
+                   name
+                 }
+               }
+             }
+           }
+           rateLimit {
+             limit
+             cost
+             remaining
+             resetAt
+           }
+         }
+         """
+ 
+         variables = {
+             "searchQuery": search_query,
+             "first": min(max_repos, 100),
+             "after": None,  # None fetches the first page
+         }
+ 
+         # Execute query
+         repositories = []
+         page_count = 0
+         max_pages = (max_repos + 99) // 100  # round up
+ 
+         while len(repositories) < max_repos and page_count < max_pages:
+             try:
+                 response = self._execute_query(query, variables)
+ 
+                 if "errors" in response:
+                     print(f"GraphQL Errors: {response['errors']}")
+                     break
+ 
+                 data = response.get("data", {})
+                 search = data.get("search", {})
+                 rate_limit = data.get("rateLimit", {})
+ 
+                 # Print rate limit info
+                 print(
+                     f"Rate Limit: {rate_limit.get('remaining')}/{rate_limit.get('limit')} "
+                     f"(cost: {rate_limit.get('cost')})"
+                 )
+ 
+                 # Parse repositories
+                 nodes = search.get("nodes", [])
+                 for node in nodes:
+                     if node and len(repositories) < max_repos:
+                         repo = self._parse_repository(node)
+                         repositories.append(repo)
+ 
+                 # Check pagination
+                 page_info = search.get("pageInfo", {})
+                 if not page_info.get("hasNextPage"):
+                     break
+ 
+                 # Update cursor for next page
+                 variables["after"] = page_info.get("endCursor")
+                 page_count += 1
+ 
+                 # Respect rate limits
+                 if rate_limit.get("remaining", 0) < 100:
+                     print("Approaching rate limit, sleeping...")
+                     time.sleep(60)
+ 
+             except Exception as e:
+                 print(f"Error fetching repositories: {e}")
+                 break
+ 
+         print(f"Fetched {len(repositories)} repositories using GraphQL")
+         return repositories
+ 
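Putting the search path together, a plausible call site (filter values here are illustrative, not package defaults; the `GITHUB_TOKEN` env var and the `full_name`/`stars` attribute names, which mirror the `Repository` constructor kwargs, are assumptions):

```python
import os

from greenmining.services.github_graphql_fetcher import GitHubGraphQLFetcher

fetcher = GitHubGraphQLFetcher(token=os.environ["GITHUB_TOKEN"])  # assumed env var
repos = fetcher.search_repositories(
    keywords="microservices",
    max_repos=50,
    min_stars=500,
    languages=["Python", "Go"],
    pushed_after="2024-01-01",
)
for repo in repos[:5]:
    print(repo.full_name, repo.stars)
```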
+     def _build_search_query(
+         self,
+         keywords: str,
+         min_stars: int,
+         languages: Optional[List[str]],
+         created_after: Optional[str],
+         created_before: Optional[str],
+         pushed_after: Optional[str],
+         pushed_before: Optional[str],
+     ) -> str:
+         """Build a GitHub search query string."""
+         query_parts = [keywords]
+ 
+         # Star count
+         query_parts.append(f"stars:>={min_stars}")
+ 
+         # Languages
+         if languages:
+             lang_query = " OR ".join([f"language:{lang}" for lang in languages])
+             query_parts.append(f"({lang_query})")
+ 
+         # Date filters
+         if created_after:
+             query_parts.append(f"created:>={created_after}")
+         if created_before:
+             query_parts.append(f"created:<={created_before}")
+         if pushed_after:
+             query_parts.append(f"pushed:>={pushed_after}")
+         if pushed_before:
+             query_parts.append(f"pushed:<={pushed_before}")
+ 
+         return " ".join(query_parts)
+ 
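For concreteness, the qualifiers this helper emits compose like so (values illustrative, using the fetcher constructed earlier; note it is a private method, called here only for demonstration):

```python
fetcher._build_search_query(
    keywords="microservices",
    min_stars=100,
    languages=["Python", "Go"],
    created_after="2020-01-01",
    created_before=None,
    pushed_after="2024-01-01",
    pushed_before=None,
)
# -> 'microservices stars:>=100 (language:Python OR language:Go) created:>=2020-01-01 pushed:>=2024-01-01'
```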
+     def _execute_query(self, query: str, variables: Dict[str, Any]) -> Dict[str, Any]:
+         """Execute a GraphQL query and return the decoded JSON response."""
+         payload = {"query": query, "variables": variables}
+ 
+         response = requests.post(
+             self.GRAPHQL_ENDPOINT, headers=self.headers, json=payload, timeout=30
+         )
+ 
+         response.raise_for_status()
+         return response.json()
+ 
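Since `_execute_query` is a thin wrapper over `requests.post`, it also serves ad-hoc queries; a sketch that checks the remaining point budget (the `rateLimit` object is part of GitHub's GraphQL schema; the printed values are illustrative):

```python
status = fetcher._execute_query(
    "query { rateLimit { limit remaining resetAt } }", {}
)
print(status["data"]["rateLimit"])
# e.g. {'limit': 5000, 'remaining': 4987, 'resetAt': '2025-01-01T00:00:00Z'}
```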
+     def _parse_repository(self, node: Dict[str, Any]) -> Repository:
+         """Parse a GraphQL repository node into a Repository object."""
+         # Extract languages
+         languages = []
+         if node.get("languages") and node["languages"].get("nodes"):
+             languages = [lang["name"] for lang in node["languages"]["nodes"]]
+         elif node.get("primaryLanguage"):
+             languages = [node["primaryLanguage"]["name"]]
+ 
+         # Extract license
+         license_name = None
+         if node.get("licenseInfo"):
+             license_name = node["licenseInfo"].get("name")
+ 
+         # `or {}` guards against explicit JSON nulls, which `.get(key, {})`
+         # does not catch (the key exists but its value is None).
+         return Repository(
+             name=node.get("name", ""),
+             full_name=node.get("nameWithOwner", ""),
+             description=node.get("description", ""),
+             url=node.get("url", ""),
+             stars=node.get("stargazerCount", 0),
+             forks=node.get("forkCount", 0),
+             watchers=(node.get("watchers") or {}).get("totalCount", 0),
+             language=(node.get("primaryLanguage") or {}).get("name", ""),
+             languages=languages,
+             created_at=node.get("createdAt", ""),
+             updated_at=node.get("updatedAt", ""),
+             pushed_at=node.get("pushedAt", ""),
+             license=license_name,
+             is_fork=node.get("isFork", False),
+             is_archived=node.get("isArchived", False),
+             default_branch=(node.get("defaultBranchRef") or {}).get("name", "main"),
+         )
+ 
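The `or {}` guards matter because GraphQL returns explicit nulls for absent objects. A hypothetical node for a repo with no license, no primary language, and no default branch still parses (attribute names are assumed to mirror the constructor kwargs):

```python
node = {  # hypothetical GraphQL node; JSON nulls arrive as None
    "name": "demo",
    "nameWithOwner": "octocat/demo",
    "stargazerCount": 3,
    "primaryLanguage": None,
    "licenseInfo": None,
    "defaultBranchRef": None,
    "watchers": None,
}
repo = fetcher._parse_repository(node)
print(repo.language, repo.default_branch)  # '' and the 'main' fallback
```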
+     def get_repository_commits(
+         self, owner: str, name: str, max_commits: int = 100
+     ) -> List[Dict[str, Any]]:
+         """Fetch commits for a specific repository using GraphQL.
+ 
+         This is much faster than the REST API: up to 100 commits arrive, with
+         per-commit stats, in a single request, whereas REST needs one extra
+         call per commit to get addition/deletion counts.
+ 
+         Args:
+             owner: Repository owner.
+             name: Repository name.
+             max_commits: Maximum commits to fetch (capped at 100 per request).
+ 
+         Returns:
+             List of commit dictionaries.
+         """
+         query = """
+         query($owner: String!, $name: String!, $first: Int!) {
+           repository(owner: $owner, name: $name) {
+             defaultBranchRef {
+               target {
+                 ... on Commit {
+                   history(first: $first) {
+                     totalCount
+                     pageInfo {
+                       hasNextPage
+                       endCursor
+                     }
+                     nodes {
+                       oid
+                       message
+                       committedDate
+                       author {
+                         name
+                         email
+                         user {
+                           login
+                         }
+                       }
+                       additions
+                       deletions
+                       changedFiles
+                     }
+                   }
+                 }
+               }
+             }
+           }
+           rateLimit {
+             remaining
+             cost
+           }
+         }
+         """
+ 
+         variables = {"owner": owner, "name": name, "first": min(max_commits, 100)}
+ 
+         commits = []
+         try:
+             response = self._execute_query(query, variables)
+ 
+             if "errors" in response:
+                 print(f"GraphQL Errors: {response['errors']}")
+                 return commits
+ 
+             # `or {}` guards against JSON nulls (missing repo, empty branch).
+             data = response.get("data", {})
+             repo = data.get("repository") or {}
+             branch = repo.get("defaultBranchRef") or {}
+             target = branch.get("target") or {}
+             history = target.get("history") or {}
+             nodes = history.get("nodes", [])
+ 
+             for node in nodes:
+                 author = node.get("author") or {}
+                 commit = {
+                     "sha": node.get("oid"),
+                     "message": node.get("message"),
+                     "date": node.get("committedDate"),
+                     "author": author.get("name"),
+                     "author_email": author.get("email"),
+                     "additions": node.get("additions", 0),
+                     "deletions": node.get("deletions", 0),
+                     "changed_files": node.get("changedFiles", 0),
+                 }
+                 commits.append(commit)
+ 
+             print(
+                 f"Fetched {len(commits)} commits for {owner}/{name} "
+                 f"(rate limit cost: {data.get('rateLimit', {}).get('cost')})"
+             )
+ 
+         except Exception as e:
+             print(f"Error fetching commits for {owner}/{name}: {e}")
+ 
+         return commits
+ 
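A sketch of the commit path, reusing the fetcher from above against a public repository (the target repo is illustrative):

```python
commits = fetcher.get_repository_commits("octocat", "Hello-World", max_commits=50)
for c in commits[:3]:
    # First line of the message plus churn stats for each commit dict
    print(c["sha"][:7], c["additions"], c["deletions"], c["message"].splitlines()[0])
```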
+     def save_results(self, repositories: List[Repository], output_file: str):
+         """Save repositories to a JSON file."""
+         data = {
+             "total_repositories": len(repositories),
+             "repositories": [repo.to_dict() for repo in repositories],
+         }
+ 
+         with open(output_file, "w", encoding="utf-8") as f:
+             json.dump(data, f, indent=2, ensure_ascii=False)
+ 
+         print(f"Saved {len(repositories)} repositories to {output_file}")