greenmining 1.0.3__py3-none-any.whl → 1.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- greenmining/__init__.py +11 -29
- greenmining/__main__.py +9 -3
- greenmining/__version__.py +2 -2
- greenmining/analyzers/__init__.py +3 -7
- greenmining/analyzers/code_diff_analyzer.py +151 -61
- greenmining/analyzers/qualitative_analyzer.py +15 -81
- greenmining/analyzers/statistical_analyzer.py +8 -69
- greenmining/analyzers/temporal_analyzer.py +16 -72
- greenmining/config.py +105 -58
- greenmining/controllers/__init__.py +1 -5
- greenmining/controllers/repository_controller.py +153 -94
- greenmining/energy/__init__.py +13 -0
- greenmining/energy/base.py +165 -0
- greenmining/energy/codecarbon_meter.py +146 -0
- greenmining/energy/rapl.py +157 -0
- greenmining/gsf_patterns.py +4 -26
- greenmining/models/__init__.py +1 -5
- greenmining/models/aggregated_stats.py +4 -4
- greenmining/models/analysis_result.py +4 -4
- greenmining/models/commit.py +5 -5
- greenmining/models/repository.py +5 -5
- greenmining/presenters/__init__.py +1 -5
- greenmining/presenters/console_presenter.py +24 -24
- greenmining/services/__init__.py +10 -6
- greenmining/services/commit_extractor.py +8 -152
- greenmining/services/data_aggregator.py +45 -175
- greenmining/services/data_analyzer.py +9 -202
- greenmining/services/github_fetcher.py +210 -323
- greenmining/services/github_graphql_fetcher.py +361 -0
- greenmining/services/local_repo_analyzer.py +387 -0
- greenmining/services/reports.py +33 -137
- greenmining/utils.py +21 -149
- {greenmining-1.0.3.dist-info → greenmining-1.0.5.dist-info}/METADATA +69 -173
- greenmining-1.0.5.dist-info/RECORD +37 -0
- {greenmining-1.0.3.dist-info → greenmining-1.0.5.dist-info}/WHEEL +1 -1
- greenmining/analyzers/ml_feature_extractor.py +0 -512
- greenmining/analyzers/nlp_analyzer.py +0 -365
- greenmining/cli.py +0 -471
- greenmining/main.py +0 -37
- greenmining-1.0.3.dist-info/RECORD +0 -36
- greenmining-1.0.3.dist-info/entry_points.txt +0 -2
- {greenmining-1.0.3.dist-info → greenmining-1.0.5.dist-info}/licenses/LICENSE +0 -0
- {greenmining-1.0.3.dist-info → greenmining-1.0.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,361 @@
# GitHub GraphQL API fetcher for faster and more efficient repository fetching.
#
# GraphQL allows fetching exactly the data you need in a single request,
# reducing API calls and improving rate limit efficiency.

import json
import time
from typing import Any, Dict, List, Optional

import requests

from greenmining.models.repository import Repository

class GitHubGraphQLFetcher:
    # Fetch GitHub repositories using GraphQL API v4.
    #
    # Benefits over REST API:
    # - Fetch repos + commits in 1 request instead of 100+ REST calls
    # - Get exactly the fields you need (no over-fetching)
    # - Better rate limit efficiency (5000 points/hour vs 5000 requests/hour)
    # - More powerful search capabilities

    GRAPHQL_ENDPOINT = "https://api.github.com/graphql"

    def __init__(self, token: str):
        # Initialize GraphQL fetcher.
        #
        # Args:
        #     token: GitHub personal access token
        self.token = token
        self.headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
        }

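    # The GraphQL endpoint rejects unauthenticated requests, so a token is
    # always required and is sent as a Bearer header. Illustrative
    # construction (the environment variable name is an assumption, not
    # part of this package):
    #   fetcher = GitHubGraphQLFetcher(token=os.environ["GITHUB_TOKEN"])
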
    def search_repositories(
        self,
        keywords: str = "microservices",
        max_repos: int = 100,
        min_stars: int = 100,
        languages: Optional[List[str]] = None,
        created_after: Optional[str] = None,
        created_before: Optional[str] = None,
        pushed_after: Optional[str] = None,
        pushed_before: Optional[str] = None,
    ) -> List[Repository]:
        # Search GitHub repositories using GraphQL.
        #
        # Args:
        #     keywords: Search keywords
        #     max_repos: Maximum number of repositories to fetch
        #     min_stars: Minimum star count
        #     languages: Programming languages to filter
        #     created_after: Created after date (YYYY-MM-DD)
        #     created_before: Created before date (YYYY-MM-DD)
        #     pushed_after: Pushed after date (YYYY-MM-DD)
        #     pushed_before: Pushed before date (YYYY-MM-DD)
        #
        # Returns:
        #     List of Repository objects

        # Build search query
        search_query = self._build_search_query(
            keywords,
            min_stars,
            languages,
            created_after,
            created_before,
            pushed_after,
            pushed_before,
        )

        print(f"GraphQL Search Query: {search_query}")

        # GraphQL query to fetch repositories. $after must be declared and
        # passed to search() so that the cursor set below actually advances
        # pagination; otherwise every iteration refetches the first page.
        query = """
        query($searchQuery: String!, $first: Int!, $after: String) {
          search(query: $searchQuery, type: REPOSITORY, first: $first, after: $after) {
            repositoryCount
            pageInfo {
              hasNextPage
              endCursor
            }
            nodes {
              ... on Repository {
                id
                name
                nameWithOwner
                description
                url
                createdAt
                updatedAt
                pushedAt
                stargazerCount
                forkCount
                watchers {
                  totalCount
                }
                primaryLanguage {
                  name
                }
                languages(first: 5) {
                  nodes {
                    name
                  }
                }
                licenseInfo {
                  name
                }
                isArchived
                isFork
                defaultBranchRef {
                  name
                }
              }
            }
          }
          rateLimit {
            limit
            cost
            remaining
            resetAt
          }
        }
        """

        variables = {
            "searchQuery": search_query,
            "first": min(max_repos, 100),
            "after": None,
        }

        # Execute query, paginating until max_repos is reached or results run out
        repositories = []
        page_count = 0
        max_pages = (max_repos + 99) // 100  # Round up

        while len(repositories) < max_repos and page_count < max_pages:
            try:
                response = self._execute_query(query, variables)

                if "errors" in response:
                    print(f"GraphQL Errors: {response['errors']}")
                    break

                data = response.get("data", {})
                search = data.get("search", {})
                rate_limit = data.get("rateLimit", {})

                # Print rate limit info
                print(
                    f"Rate Limit: {rate_limit.get('remaining')}/{rate_limit.get('limit')} "
                    f"(cost: {rate_limit.get('cost')})"
                )

                # Parse repositories
                nodes = search.get("nodes", [])
                for node in nodes:
                    if node and len(repositories) < max_repos:
                        repo = self._parse_repository(node)
                        repositories.append(repo)

                # Check pagination
                page_info = search.get("pageInfo", {})
                if not page_info.get("hasNextPage"):
                    break

                # Update cursor for next page
                variables["after"] = page_info.get("endCursor")
                page_count += 1

                # Respect rate limits
                if rate_limit.get("remaining", 0) < 100:
                    print("Approaching rate limit, sleeping...")
                    time.sleep(60)

            except Exception as e:
                print(f"Error fetching repositories: {e}")
                break

        print(f"Fetched {len(repositories)} repositories using GraphQL")
        return repositories

    def _build_search_query(
        self,
        keywords: str,
        min_stars: int,
        languages: Optional[List[str]],
        created_after: Optional[str],
        created_before: Optional[str],
        pushed_after: Optional[str],
        pushed_before: Optional[str],
    ) -> str:
        # Build GitHub search query string.
        query_parts = [keywords]

        # Star count
        query_parts.append(f"stars:>={min_stars}")

        # Languages
        if languages:
            lang_query = " OR ".join([f"language:{lang}" for lang in languages])
            query_parts.append(f"({lang_query})")

        # Date filters
        if created_after:
            query_parts.append(f"created:>={created_after}")
        if created_before:
            query_parts.append(f"created:<={created_before}")
        if pushed_after:
            query_parts.append(f"pushed:>={pushed_after}")
        if pushed_before:
            query_parts.append(f"pushed:<={pushed_before}")

        return " ".join(query_parts)

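    # Illustrative output of _build_search_query (editorial example, not in
    # the original file): with keywords="microservices", min_stars=100,
    # languages=["Python", "Go"], and created_after="2020-01-01", the method
    # returns:
    #   microservices stars:>=100 (language:Python OR language:Go) created:>=2020-01-01
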
    def _execute_query(self, query: str, variables: Dict[str, Any]) -> Dict[str, Any]:
        # Execute GraphQL query.
        payload = {"query": query, "variables": variables}

        response = requests.post(
            self.GRAPHQL_ENDPOINT, headers=self.headers, json=payload, timeout=30
        )

        response.raise_for_status()
        return response.json()

    def _parse_repository(self, node: Dict[str, Any]) -> Repository:
        # Parse GraphQL repository node to Repository object.
        # Extract languages
        languages = []
        if node.get("languages") and node["languages"].get("nodes"):
            languages = [lang["name"] for lang in node["languages"]["nodes"]]
        elif node.get("primaryLanguage"):
            languages = [node["primaryLanguage"]["name"]]

        # Extract license
        license_name = None
        if node.get("licenseInfo"):
            license_name = node["licenseInfo"].get("name")

        # GraphQL returns null (not a missing key) for absent objects such as
        # primaryLanguage or defaultBranchRef, so guard with "or {}" before
        # chaining .get()
        return Repository(
            name=node.get("name", ""),
            full_name=node.get("nameWithOwner", ""),
            description=node.get("description", ""),
            url=node.get("url", ""),
            stars=node.get("stargazerCount", 0),
            forks=node.get("forkCount", 0),
            watchers=(node.get("watchers") or {}).get("totalCount", 0),
            language=(node.get("primaryLanguage") or {}).get("name", ""),
            languages=languages,
            created_at=node.get("createdAt", ""),
            updated_at=node.get("updatedAt", ""),
            pushed_at=node.get("pushedAt", ""),
            license=license_name,
            is_fork=node.get("isFork", False),
            is_archived=node.get("isArchived", False),
            default_branch=(node.get("defaultBranchRef") or {}).get("name", "main"),
        )

    def get_repository_commits(
        self, owner: str, name: str, max_commits: int = 100
    ) -> List[Dict[str, Any]]:
        # Fetch commits for a specific repository using GraphQL.
        #
        # This is much faster than the REST API, as it fetches the commit
        # history in one or two requests instead of paginating through 100+
        # individual REST calls.
        #
        # Args:
        #     owner: Repository owner
        #     name: Repository name
        #     max_commits: Maximum commits to fetch
        #
        # Returns:
        #     List of commit dictionaries
        query = """
        query($owner: String!, $name: String!, $first: Int!) {
          repository(owner: $owner, name: $name) {
            defaultBranchRef {
              target {
                ... on Commit {
                  history(first: $first) {
                    totalCount
                    pageInfo {
                      hasNextPage
                      endCursor
                    }
                    nodes {
                      oid
                      message
                      committedDate
                      author {
                        name
                        email
                        user {
                          login
                        }
                      }
                      additions
                      deletions
                      changedFiles
                    }
                  }
                }
              }
            }
          }
          rateLimit {
            remaining
            cost
          }
        }
        """

        variables = {"owner": owner, "name": name, "first": min(max_commits, 100)}

        commits = []
        try:
            response = self._execute_query(query, variables)

            if "errors" in response:
                print(f"GraphQL Errors: {response['errors']}")
                return commits

            # repository, defaultBranchRef, and author can all be null in the
            # response, so guard with "or {}" before chaining .get()
            data = response.get("data", {})
            repo = data.get("repository") or {}
            branch = repo.get("defaultBranchRef") or {}
            target = branch.get("target") or {}
            history = target.get("history") or {}
            nodes = history.get("nodes", [])

            for node in nodes:
                author = node.get("author") or {}
                commit = {
                    "sha": node.get("oid"),
                    "message": node.get("message"),
                    "date": node.get("committedDate"),
                    "author": author.get("name"),
                    "author_email": author.get("email"),
                    "additions": node.get("additions", 0),
                    "deletions": node.get("deletions", 0),
                    "changed_files": node.get("changedFiles", 0),
                }
                commits.append(commit)

            print(
                f"Fetched {len(commits)} commits for {owner}/{name} "
                f"(rate limit cost: {data.get('rateLimit', {}).get('cost')})"
            )

        except Exception as e:
            print(f"Error fetching commits for {owner}/{name}: {e}")

        return commits

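    # Shape of each dict returned by get_repository_commits (values are
    # illustrative, not taken from real data):
    #   {
    #       "sha": "a1b2c3...",
    #       "message": "Cache results to cut redundant computation",
    #       "date": "2021-06-01T12:00:00Z",
    #       "author": "Jane Doe",
    #       "author_email": "jane@example.com",
    #       "additions": 10,
    #       "deletions": 4,
    #       "changed_files": 2,
    #   }
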
    def save_results(self, repositories: List[Repository], output_file: str):
        # Save repositories to JSON file.
        data = {
            "total_repositories": len(repositories),
            "repositories": [repo.to_dict() for repo in repositories],
        }

        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        print(f"Saved {len(repositories)} repositories to {output_file}")
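
A minimal end-to-end sketch of how the new fetcher fits together (an editorial example, not part of the package; the environment variable name and output path are assumptions):

    import os

    from greenmining.services.github_graphql_fetcher import GitHubGraphQLFetcher

    fetcher = GitHubGraphQLFetcher(token=os.environ["GITHUB_TOKEN"])

    # One search request returns up to 100 repositories with all needed fields
    repos = fetcher.search_repositories(
        keywords="microservices",
        max_repos=50,
        min_stars=100,
        languages=["Python"],
    )

    # One request per repository fetches up to 100 commits from the default branch
    for repo in repos[:3]:
        owner, name = repo.full_name.split("/", 1)
        commits = fetcher.get_repository_commits(owner, name, max_commits=100)
        print(f"{repo.full_name}: {len(commits)} commits")

    fetcher.save_results(repos, "repositories.json")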