greenmining 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. greenmining/__init__.py +11 -29
  2. greenmining/__main__.py +9 -3
  3. greenmining/__version__.py +2 -2
  4. greenmining/analyzers/__init__.py +3 -7
  5. greenmining/analyzers/code_diff_analyzer.py +151 -61
  6. greenmining/analyzers/qualitative_analyzer.py +15 -81
  7. greenmining/analyzers/statistical_analyzer.py +8 -69
  8. greenmining/analyzers/temporal_analyzer.py +16 -72
  9. greenmining/config.py +105 -58
  10. greenmining/controllers/__init__.py +1 -5
  11. greenmining/controllers/repository_controller.py +153 -94
  12. greenmining/energy/__init__.py +13 -0
  13. greenmining/energy/base.py +165 -0
  14. greenmining/energy/codecarbon_meter.py +146 -0
  15. greenmining/energy/rapl.py +157 -0
  16. greenmining/gsf_patterns.py +4 -26
  17. greenmining/models/__init__.py +1 -5
  18. greenmining/models/aggregated_stats.py +4 -4
  19. greenmining/models/analysis_result.py +4 -4
  20. greenmining/models/commit.py +5 -5
  21. greenmining/models/repository.py +5 -5
  22. greenmining/presenters/__init__.py +1 -5
  23. greenmining/presenters/console_presenter.py +24 -24
  24. greenmining/services/__init__.py +10 -6
  25. greenmining/services/commit_extractor.py +8 -152
  26. greenmining/services/data_aggregator.py +45 -175
  27. greenmining/services/data_analyzer.py +9 -202
  28. greenmining/services/github_fetcher.py +212 -323
  29. greenmining/services/github_graphql_fetcher.py +371 -0
  30. greenmining/services/local_repo_analyzer.py +387 -0
  31. greenmining/services/reports.py +33 -137
  32. greenmining/utils.py +21 -149
  33. {greenmining-1.0.2.dist-info → greenmining-1.0.4.dist-info}/METADATA +169 -146
  34. greenmining-1.0.4.dist-info/RECORD +37 -0
  35. {greenmining-1.0.2.dist-info → greenmining-1.0.4.dist-info}/WHEEL +1 -1
  36. greenmining/analyzers/ml_feature_extractor.py +0 -512
  37. greenmining/analyzers/nlp_analyzer.py +0 -365
  38. greenmining/cli.py +0 -471
  39. greenmining/main.py +0 -37
  40. greenmining-1.0.2.dist-info/RECORD +0 -36
  41. greenmining-1.0.2.dist-info/entry_points.txt +0 -2
  42. {greenmining-1.0.2.dist-info → greenmining-1.0.4.dist-info}/licenses/LICENSE +0 -0
  43. {greenmining-1.0.2.dist-info → greenmining-1.0.4.dist-info}/top_level.txt +0 -0
greenmining/services/github_graphql_fetcher.py
@@ -0,0 +1,371 @@
+"""
+GitHub GraphQL API fetcher for faster and more efficient repository fetching.
+
+GraphQL allows fetching exactly the data you need in a single request,
+reducing API calls and improving rate limit efficiency.
+"""
+
+import json
+import time
+from typing import Any, Dict, List, Optional
+
+import requests
+
+from greenmining.models.repository import Repository
+
+
+class GitHubGraphQLFetcher:
+    """
+    Fetch GitHub repositories using GraphQL API v4.
+
+    Benefits over REST API:
+    - Fetch repos + commits in 1 request instead of 100+ REST calls
+    - Get exactly the fields you need (no over-fetching)
+    - Better rate limit efficiency (5000 points/hour vs 5000 requests/hour)
+    - More powerful search capabilities
+    """
+
+    GRAPHQL_ENDPOINT = "https://api.github.com/graphql"
+
+    def __init__(self, token: str):
+        """
+        Initialize GraphQL fetcher.
+
+        Args:
+            token: GitHub personal access token
+        """
+        self.token = token
+        self.headers = {
+            "Authorization": f"Bearer {token}",
+            "Content-Type": "application/json",
+        }
+
+    def search_repositories(
+        self,
+        keywords: str = "microservices",
+        max_repos: int = 100,
+        min_stars: int = 100,
+        languages: Optional[List[str]] = None,
+        created_after: Optional[str] = None,
+        created_before: Optional[str] = None,
+        pushed_after: Optional[str] = None,
+        pushed_before: Optional[str] = None,
+    ) -> List[Repository]:
+        """
+        Search GitHub repositories using GraphQL.
+
+        Args:
+            keywords: Search keywords
+            max_repos: Maximum number of repositories to fetch
+            min_stars: Minimum star count
+            languages: Programming languages to filter
+            created_after: Created after date (YYYY-MM-DD)
+            created_before: Created before date (YYYY-MM-DD)
+            pushed_after: Pushed after date (YYYY-MM-DD)
+            pushed_before: Pushed before date (YYYY-MM-DD)
+
+        Returns:
+            List of Repository objects
+        """
+        # Build search query
+        search_query = self._build_search_query(
+            keywords,
+            min_stars,
+            languages,
+            created_after,
+            created_before,
+            pushed_after,
+            pushed_before,
+        )
+
+        print(f"GraphQL Search Query: {search_query}")
+
+        # GraphQL query to fetch repositories
+        query = """
+        query($searchQuery: String!, $first: Int!, $after: String) {
+          search(query: $searchQuery, type: REPOSITORY, first: $first, after: $after) {
+            repositoryCount
+            pageInfo {
+              hasNextPage
+              endCursor
+            }
+            nodes {
+              ... on Repository {
+                id
+                name
+                nameWithOwner
+                description
+                url
+                createdAt
+                updatedAt
+                pushedAt
+                stargazerCount
+                forkCount
+                watchers {
+                  totalCount
+                }
+                primaryLanguage {
+                  name
+                }
+                languages(first: 5) {
+                  nodes {
+                    name
+                  }
+                }
+                licenseInfo {
+                  name
+                }
+                isArchived
+                isFork
+                defaultBranchRef {
+                  name
+                }
+              }
+            }
+          }
+          rateLimit {
+            limit
+            cost
+            remaining
+            resetAt
+          }
+        }
+        """
+
+        # $after starts as None so the first request fetches page one; the
+        # pagination loop below advances it to each page's endCursor. Without
+        # declaring $after in the query, the cursor set below would be ignored
+        # and every page would re-fetch page one.
+        variables = {
+            "searchQuery": search_query,
+            "first": min(max_repos, 100),
+            "after": None,
+        }
+
+        # Execute query
+        repositories = []
+        page_count = 0
+        max_pages = (max_repos + 99) // 100  # Round up
+
+        while len(repositories) < max_repos and page_count < max_pages:
+            try:
+                response = self._execute_query(query, variables)
+
+                if "errors" in response:
+                    print(f"GraphQL Errors: {response['errors']}")
+                    break
+
+                data = response.get("data", {})
+                search = data.get("search", {})
+                rate_limit = data.get("rateLimit", {})
+
+                # Print rate limit info
+                print(
+                    f"Rate Limit: {rate_limit.get('remaining')}/{rate_limit.get('limit')} "
+                    f"(cost: {rate_limit.get('cost')})"
+                )
+
+                # Parse repositories
+                nodes = search.get("nodes", [])
+                for node in nodes:
+                    if node and len(repositories) < max_repos:
+                        repo = self._parse_repository(node)
+                        repositories.append(repo)
+
+                # Check pagination
+                page_info = search.get("pageInfo", {})
+                if not page_info.get("hasNextPage"):
+                    break
+
+                # Update cursor for next page
+                variables["after"] = page_info.get("endCursor")
+                page_count += 1
+
+                # Respect rate limits
+                if rate_limit.get("remaining", 0) < 100:
+                    print("Approaching rate limit, sleeping...")
+                    time.sleep(60)
+
+            except Exception as e:
+                print(f"Error fetching repositories: {e}")
+                break
+
+        print(f"Fetched {len(repositories)} repositories using GraphQL")
+        return repositories
+
+    def _build_search_query(
+        self,
+        keywords: str,
+        min_stars: int,
+        languages: Optional[List[str]],
+        created_after: Optional[str],
+        created_before: Optional[str],
+        pushed_after: Optional[str],
+        pushed_before: Optional[str],
+    ) -> str:
+        """Build GitHub search query string."""
+        query_parts = [keywords]
+
+        # Star count
+        query_parts.append(f"stars:>={min_stars}")
+
+        # Languages
+        if languages:
+            lang_query = " OR ".join([f"language:{lang}" for lang in languages])
+            query_parts.append(f"({lang_query})")
+
+        # Date filters
+        if created_after:
+            query_parts.append(f"created:>={created_after}")
+        if created_before:
+            query_parts.append(f"created:<={created_before}")
+        if pushed_after:
+            query_parts.append(f"pushed:>={pushed_after}")
+        if pushed_before:
+            query_parts.append(f"pushed:<={pushed_before}")
+
+        return " ".join(query_parts)
+
+    def _execute_query(self, query: str, variables: Dict[str, Any]) -> Dict[str, Any]:
+        """Execute GraphQL query."""
+        payload = {"query": query, "variables": variables}
+
+        response = requests.post(
+            self.GRAPHQL_ENDPOINT, headers=self.headers, json=payload, timeout=30
+        )
+
+        response.raise_for_status()
+        return response.json()
+
+    def _parse_repository(self, node: Dict[str, Any]) -> Repository:
+        """Parse GraphQL repository node to Repository object."""
+        # Extract languages
+        languages = []
+        if node.get("languages") and node["languages"].get("nodes"):
+            languages = [lang["name"] for lang in node["languages"]["nodes"]]
+        elif node.get("primaryLanguage"):
+            languages = [node["primaryLanguage"]["name"]]
+
+        # Extract license
+        license_name = None
+        if node.get("licenseInfo"):
+            license_name = node["licenseInfo"].get("name")
+
+        # GraphQL returns null (not a missing key) for absent objects, so
+        # nullable fields are guarded with `or {}` before chaining .get().
+        return Repository(
+            name=node.get("name", ""),
+            full_name=node.get("nameWithOwner", ""),
+            description=node.get("description", ""),
+            url=node.get("url", ""),
+            stars=node.get("stargazerCount", 0),
+            forks=node.get("forkCount", 0),
+            watchers=(node.get("watchers") or {}).get("totalCount", 0),
+            language=(node.get("primaryLanguage") or {}).get("name", ""),
+            languages=languages,
+            created_at=node.get("createdAt", ""),
+            updated_at=node.get("updatedAt", ""),
+            pushed_at=node.get("pushedAt", ""),
+            license=license_name,
+            is_fork=node.get("isFork", False),
+            is_archived=node.get("isArchived", False),
+            default_branch=(node.get("defaultBranchRef") or {}).get("name", "main"),
+        )
+
+    def get_repository_commits(
+        self, owner: str, name: str, max_commits: int = 100
+    ) -> List[Dict[str, Any]]:
+        """
+        Fetch commits for a specific repository using GraphQL.
+
+        This is much faster than the REST API: a single GraphQL request
+        returns up to 100 commits with their stats, instead of paginating
+        through individual REST calls.
+
+        Args:
+            owner: Repository owner
+            name: Repository name
+            max_commits: Maximum commits to fetch
+
+        Returns:
+            List of commit dictionaries
+        """
+        query = """
+        query($owner: String!, $name: String!, $first: Int!) {
+          repository(owner: $owner, name: $name) {
+            defaultBranchRef {
+              target {
+                ... on Commit {
+                  history(first: $first) {
+                    totalCount
+                    pageInfo {
+                      hasNextPage
+                      endCursor
+                    }
+                    nodes {
+                      oid
+                      message
+                      committedDate
+                      author {
+                        name
+                        email
+                        user {
+                          login
+                        }
+                      }
+                      additions
+                      deletions
+                      changedFiles
+                    }
+                  }
+                }
+              }
+            }
+          }
+          rateLimit {
+            remaining
+            cost
+          }
+        }
+        """
+
+        variables = {"owner": owner, "name": name, "first": min(max_commits, 100)}
+
+        commits = []
+        try:
+            response = self._execute_query(query, variables)
+
+            if "errors" in response:
+                print(f"GraphQL Errors: {response['errors']}")
+                return commits
+
+            # repository, defaultBranchRef, and target are all nullable (bad
+            # repo name, empty repo), so guard with `or {}` before .get().
+            data = response.get("data", {})
+            repo = data.get("repository") or {}
+            branch = repo.get("defaultBranchRef") or {}
+            target = branch.get("target") or {}
+            history = target.get("history") or {}
+            nodes = history.get("nodes", [])
+
+            for node in nodes:
+                author = node.get("author") or {}
+                commit = {
+                    "sha": node.get("oid"),
+                    "message": node.get("message"),
+                    "date": node.get("committedDate"),
+                    "author": author.get("name"),
+                    "author_email": author.get("email"),
+                    "additions": node.get("additions", 0),
+                    "deletions": node.get("deletions", 0),
+                    "changed_files": node.get("changedFiles", 0),
+                }
+                commits.append(commit)
+
+            print(
+                f"Fetched {len(commits)} commits for {owner}/{name} "
+                f"(rate limit cost: {data.get('rateLimit', {}).get('cost')})"
+            )
+
+        except Exception as e:
+            print(f"Error fetching commits for {owner}/{name}: {e}")
+
+        return commits
+
+    def save_results(self, repositories: List[Repository], output_file: str):
+        """Save repositories to JSON file."""
+        data = {
+            "total_repositories": len(repositories),
+            "repositories": [repo.to_dict() for repo in repositories],
+        }
+
+        with open(output_file, "w", encoding="utf-8") as f:
+            json.dump(data, f, indent=2, ensure_ascii=False)
+
+        print(f"Saved {len(repositories)} repositories to {output_file}")