greenmining 0.1.11__py3-none-any.whl → 1.0.1__py3-none-any.whl
- greenmining/__init__.py +42 -1
- greenmining/__version__.py +1 -1
- greenmining/analyzers/__init__.py +17 -0
- greenmining/analyzers/code_diff_analyzer.py +238 -0
- greenmining/analyzers/ml_feature_extractor.py +512 -0
- greenmining/analyzers/nlp_analyzer.py +365 -0
- greenmining/analyzers/qualitative_analyzer.py +460 -0
- greenmining/analyzers/statistical_analyzer.py +245 -0
- greenmining/analyzers/temporal_analyzer.py +434 -0
- greenmining/cli.py +126 -25
- greenmining/config.py +21 -0
- greenmining/controllers/repository_controller.py +58 -3
- greenmining/gsf_patterns.py +10 -5
- greenmining/models/aggregated_stats.py +3 -1
- greenmining/models/commit.py +3 -0
- greenmining/models/repository.py +3 -1
- greenmining/presenters/console_presenter.py +3 -1
- greenmining/services/commit_extractor.py +27 -1
- greenmining/services/data_aggregator.py +163 -5
- greenmining/services/data_analyzer.py +111 -8
- greenmining/services/github_fetcher.py +62 -5
- greenmining/services/reports.py +123 -2
- greenmining-1.0.1.dist-info/METADATA +699 -0
- greenmining-1.0.1.dist-info/RECORD +36 -0
- greenmining-0.1.11.dist-info/METADATA +0 -335
- greenmining-0.1.11.dist-info/RECORD +0 -29
- {greenmining-0.1.11.dist-info → greenmining-1.0.1.dist-info}/WHEEL +0 -0
- {greenmining-0.1.11.dist-info → greenmining-1.0.1.dist-info}/entry_points.txt +0 -0
- {greenmining-0.1.11.dist-info → greenmining-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {greenmining-0.1.11.dist-info → greenmining-1.0.1.dist-info}/top_level.txt +0 -0
greenmining/cli.py
CHANGED

@@ -29,8 +29,27 @@ def cli(config_file, verbose):
 @click.option(
     "--languages", default="Python,Java,Go,JavaScript,TypeScript", help="Comma-separated languages"
 )
-def fetch(max_repos, min_stars, languages):
-    ""
+@click.option(
+    "--keywords",
+    default="microservices",
+    type=str,
+    help="Search keywords (e.g., 'kubernetes', 'docker', 'cloud-native')",
+)
+@click.option("--created-after", type=str, help="Repository created after (YYYY-MM-DD)")
+@click.option("--created-before", type=str, help="Repository created before (YYYY-MM-DD)")
+@click.option("--pushed-after", type=str, help="Repository pushed after (YYYY-MM-DD)")
+@click.option("--pushed-before", type=str, help="Repository pushed before (YYYY-MM-DD)")
+def fetch(
+    max_repos,
+    min_stars,
+    languages,
+    keywords,
+    created_after,
+    created_before,
+    pushed_after,
+    pushed_before,
+):
+    """Fetch repositories from GitHub based on custom search keywords."""
     presenter.show_banner()
     colored_print(f"\n🎯 Target: {max_repos} repositories\n", "cyan")
 

@@ -39,7 +58,14 @@ def fetch(max_repos, min_stars, languages):
 
     try:
         repositories = controller.fetch_repositories(
-            max_repos=max_repos,
+            max_repos=max_repos,
+            min_stars=min_stars,
+            languages=lang_list,
+            keywords=keywords,
+            created_after=created_after,
+            created_before=created_before,
+            pushed_after=pushed_after,
+            pushed_before=pushed_before,
         )
 
         # Show results
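The extended fetch command can be exercised end to end through click's test runner. A minimal sketch, assuming a valid GITHUB_TOKEN is configured in the environment; the flag names mirror the decorators added above, everything else is illustrative:

# Hypothetical invocation of the extended fetch command (needs network access and a GitHub token).
from click.testing import CliRunner

from greenmining.cli import cli

runner = CliRunner()
result = runner.invoke(
    cli,
    [
        "fetch",
        "--keywords", "kubernetes",
        "--created-after", "2022-01-01",
        "--pushed-after", "2024-01-01",
    ],
)
print(result.exit_code)
print(result.output)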
@@ -61,11 +87,12 @@ def fetch(max_repos, min_stars, languages):
 @click.option("--max-commits", default=50, type=int, help="Max commits per repository")
 @click.option("--skip-merges", is_flag=True, default=True, help="Skip merge commits")
 @click.option("--days-back", default=730, type=int, help="Days to look back (default: 2 years)")
-def extract(max_commits, skip_merges, days_back):
+@click.option("--timeout", default=60, type=int, help="Timeout per repo in seconds (default: 60)")
+def extract(max_commits, skip_merges, days_back, timeout):
     """Extract commits from fetched repositories."""
     presenter.show_banner()
 
-    from services.commit_extractor import CommitExtractor
+    from greenmining.services.commit_extractor import CommitExtractor
 
     try:
         # Load repositories

@@ -80,14 +107,14 @@ def extract(max_commits, skip_merges, days_back):
 
         # Extract commits
         extractor = CommitExtractor(
-            max_commits=max_commits, skip_merges=skip_merges, days_back=days_back
+            max_commits=max_commits, skip_merges=skip_merges, days_back=days_back, timeout=timeout
         )
         commits = extractor.extract_from_repositories(
             repositories=[r.to_dict() for r in repositories]
         )
 
         # Save commits
-        from utils import save_json_file
+        from greenmining.utils import save_json_file
 
         save_json_file(commits, config.COMMITS_FILE)
         colored_print(f" Saved to: {config.COMMITS_FILE}", "cyan")
@@ -113,12 +140,15 @@ def extract(max_commits, skip_merges, days_back):
 
 @cli.command()
 @click.option("--batch-size", default=10, type=int, help="Batch size for processing")
-
+@click.option("--enable-diff-analysis", is_flag=True, help="Enable code diff analysis (slower)")
+@click.option("--enable-nlp", is_flag=True, help="Enable NLP-enhanced pattern detection")
+@click.option("--enable-ml-features", is_flag=True, help="Enable ML feature extraction")
+def analyze(batch_size, enable_diff_analysis, enable_nlp, enable_ml_features):
     """Analyze commits for green software patterns."""
     presenter.show_banner()
 
-    from services.data_analyzer import DataAnalyzer
-    from utils import save_json_file
+    from greenmining.services.data_analyzer import DataAnalyzer
+    from greenmining.utils import save_json_file
 
     try:
         # Load commits

@@ -127,12 +157,27 @@ def analyze(batch_size):
 
         commits = load_json_file(config.COMMITS_FILE)
         colored_print(f"\n🔬 Analyzing {len(commits)} commits for green patterns...\n", "cyan")
-
+
+        # Show enabled methods
+        methods = ["Keyword"]
+        if enable_diff_analysis:
+            methods.append("Code Diff")
+        if enable_nlp:
+            methods.append("NLP")
+        if enable_ml_features:
+            methods.append("ML Features")
+
+        colored_print(f" Methods: {' + '.join(methods)}\n", "cyan")
         colored_print(f" Batch size: {batch_size}\n", "cyan")
 
         # Analyze
-        analyzer = DataAnalyzer(
-
+        analyzer = DataAnalyzer(
+            batch_size=batch_size,
+            enable_diff_analysis=enable_diff_analysis,
+            enable_nlp=enable_nlp,
+            enable_ml_features=enable_ml_features,
+        )
+        results = analyzer.analyze_commits(commits)
 
         # Save results
         save_json_file(results, config.ANALYSIS_FILE)
@@ -159,12 +204,20 @@ def analyze(batch_size):
 
 
 @cli.command()
-def aggregate():
+@click.option("--enable-enhanced-stats", is_flag=True, help="Enable enhanced statistical analysis")
+@click.option("--enable-temporal", is_flag=True, help="Enable temporal trend analysis")
+@click.option(
+    "--temporal-granularity",
+    default="quarter",
+    type=click.Choice(["day", "week", "month", "quarter", "year"]),
+    help="Temporal analysis granularity",
+)
+def aggregate(enable_enhanced_stats, enable_temporal, temporal_granularity):
     """Aggregate analysis results and generate statistics."""
     presenter.show_banner()
 
-    from services.data_aggregator import DataAggregator
-    from utils import save_json_file
+    from greenmining.services.data_aggregator import DataAggregator
+    from greenmining.utils import save_json_file
 
     try:
         # Load data

@@ -176,8 +229,20 @@ def aggregate():
 
         colored_print(f"\n📊 Aggregating results from {len(results)} commits...\n", "cyan")
 
+        # Show enabled features
+        if enable_enhanced_stats:
+            colored_print(" Enhanced statistics: Enabled\n", "cyan")
+        if enable_temporal:
+            colored_print(
+                f" Temporal analysis: Enabled (granularity: {temporal_granularity})\n", "cyan"
+            )
+
         # Aggregate
-        aggregator = DataAggregator(
+        aggregator = DataAggregator(
+            enable_enhanced_stats=enable_enhanced_stats,
+            enable_temporal=enable_temporal,
+            temporal_granularity=temporal_granularity,
+        )
         aggregated = aggregator.aggregate(results, repos)
 
         # Save

@@ -187,7 +252,15 @@ def aggregate():
         presenter.show_analysis_results(aggregated)
 
         if aggregated.get("known_patterns"):
-            presenter
+            # Convert list format to dict format expected by presenter
+            patterns_dict = {}
+            for pattern in aggregated["known_patterns"]:
+                patterns_dict[pattern["pattern_name"]] = {
+                    "count": pattern["count"],
+                    "percentage": pattern["percentage"],
+                    "confidence_distribution": pattern.get("confidence_breakdown", {}),
+                }
+            presenter.show_pattern_distribution(patterns_dict, limit=10)
 
         presenter.show_success(f"Aggregation complete! Results saved to {config.AGGREGATED_FILE}")
 
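To make the list-to-dict conversion above concrete, this is the shape transformation it performs on one hypothetical entry (field names follow the code above; the values are invented):

# One entry of aggregated["known_patterns"] as a list (values invented for illustration):
known_patterns = [
    {
        "pattern_name": "caching",
        "count": 12,
        "percentage": 4.0,
        "confidence_breakdown": {"high": 3, "medium": 5, "low": 4},
    }
]

# Equivalent dict keyed by pattern name, as expected by presenter.show_pattern_distribution():
patterns_dict = {
    "caching": {
        "count": 12,
        "percentage": 4.0,
        "confidence_distribution": {"high": 3, "medium": 5, "low": 4},
    }
}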
@@ -202,20 +275,48 @@ def report(output):
     """Generate comprehensive markdown report."""
     presenter.show_banner()
 
-    from services.reports import ReportGenerator
+    from greenmining.services.reports import ReportGenerator
 
     try:
         # Load aggregated data
         if not config.AGGREGATED_FILE.exists():
             raise FileNotFoundError("No aggregated data found. Run 'aggregate' first.")
 
+        # Load analysis results
+        if not config.ANALYSIS_FILE.exists():
+            raise FileNotFoundError("No analysis results found. Run 'analyze' first.")
+
+        # Load repository data
+        if not config.REPOS_FILE.exists():
+            raise FileNotFoundError("No repository data found. Run 'fetch' first.")
+
         aggregated = load_json_file(config.AGGREGATED_FILE)
+        analysis_results = load_json_file(config.ANALYSIS_FILE)
+        repos_data = load_json_file(config.REPOS_FILE)
+
+        # Wrap analysis results if it's a list
+        if isinstance(analysis_results, list):
+            analysis = {"results": analysis_results, "total": len(analysis_results)}
+        else:
+            analysis = analysis_results
+
+        # Wrap repos data if it's a list
+        if isinstance(repos_data, list):
+            repos = {"repositories": repos_data, "total": len(repos_data)}
+        else:
+            repos = repos_data
 
         colored_print("\n📄 Generating comprehensive report...\n", "cyan")
 
         # Generate report
         generator = ReportGenerator()
-
+        report_content = generator.generate_report(aggregated, analysis, repos)
+
+        # Save report
+        from pathlib import Path
+
+        report_path = Path(output)
+        report_path.write_text(report_content)
 
         presenter.show_success(f"Report generated: {report_path}")
         colored_print("\n📖 The report includes:", "cyan")
@@ -317,8 +418,8 @@ def pipeline(max_repos, skip_fetch):
 
     # Phase 2: Extract
     colored_print("\n[2/5] 📝 Extracting commits...", "cyan")
-    from services.commit_extractor import CommitExtractor
-    from utils import save_json_file
+    from greenmining.services.commit_extractor import CommitExtractor
+    from greenmining.utils import save_json_file
 
     controller = RepositoryController(config)
     repos = controller.load_repositories()

@@ -329,7 +430,7 @@ def pipeline(max_repos, skip_fetch):
 
     # Phase 3: Analyze
    colored_print("\n[3/5] 🔬 Analyzing commits...", "cyan")
-    from services.data_analyzer import DataAnalyzer
+    from greenmining.services.data_analyzer import DataAnalyzer
 
     commits = load_json_file(config.COMMITS_FILE)
     analyzer = DataAnalyzer()

@@ -341,7 +442,7 @@ def pipeline(max_repos, skip_fetch):
 
     # Phase 4: Aggregate
     colored_print("\n[4/5] 📊 Aggregating results...", "cyan")
-    from services.data_aggregator import DataAggregator
+    from greenmining.services.data_aggregator import DataAggregator
 
     aggregator = DataAggregator()
     aggregated = aggregator.aggregate(results, [r.to_dict() for r in repos])

@@ -349,7 +450,7 @@ def pipeline(max_repos, skip_fetch):
 
     # Phase 5: Report
     colored_print("\n[5/5] 📄 Generating report...", "cyan")
-    from services.reports import ReportGenerator
+    from greenmining.services.reports import ReportGenerator
 
     generator = ReportGenerator()
     generator.generate_report(aggregated)
greenmining/config.py
CHANGED

@@ -49,6 +49,27 @@ class Config:
         self.COMMITS_PER_REPO = int(os.getenv("COMMITS_PER_REPO", "50"))
         self.DAYS_BACK = int(os.getenv("DAYS_BACK", "730"))  # 2 years
 
+        # Advanced Analyzer Configuration
+        self.ENABLE_NLP_ANALYSIS = os.getenv("ENABLE_NLP_ANALYSIS", "false").lower() == "true"
+        self.ENABLE_TEMPORAL_ANALYSIS = (
+            os.getenv("ENABLE_TEMPORAL_ANALYSIS", "false").lower() == "true"
+        )
+        self.TEMPORAL_GRANULARITY = os.getenv(
+            "TEMPORAL_GRANULARITY", "quarter"
+        )  # day, week, month, quarter, year
+        self.ENABLE_ML_FEATURES = os.getenv("ENABLE_ML_FEATURES", "false").lower() == "true"
+        self.VALIDATION_SAMPLE_SIZE = int(os.getenv("VALIDATION_SAMPLE_SIZE", "30"))
+
+        # Temporal Filtering (NEW)
+        self.CREATED_AFTER = os.getenv("CREATED_AFTER")  # YYYY-MM-DD
+        self.CREATED_BEFORE = os.getenv("CREATED_BEFORE")  # YYYY-MM-DD
+        self.PUSHED_AFTER = os.getenv("PUSHED_AFTER")  # YYYY-MM-DD
+        self.PUSHED_BEFORE = os.getenv("PUSHED_BEFORE")  # YYYY-MM-DD
+        self.COMMIT_DATE_FROM = os.getenv("COMMIT_DATE_FROM")  # YYYY-MM-DD
+        self.COMMIT_DATE_TO = os.getenv("COMMIT_DATE_TO")  # YYYY-MM-DD
+        self.MIN_COMMITS = int(os.getenv("MIN_COMMITS", "0"))
+        self.ACTIVITY_WINDOW_DAYS = int(os.getenv("ACTIVITY_WINDOW_DAYS", "730"))
+
         # Analysis Configuration
         self.BATCH_SIZE = int(os.getenv("BATCH_SIZE", "10"))
 
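Because every new setting is read from the environment, the analyzers can also be enabled without CLI flags. A rough sketch, assuming Config can be constructed with no required arguments (not confirmed by this diff):

# Hypothetical: enable the new analyzers via environment variables before building Config.
import os

os.environ["ENABLE_NLP_ANALYSIS"] = "true"
os.environ["ENABLE_TEMPORAL_ANALYSIS"] = "true"
os.environ["TEMPORAL_GRANULARITY"] = "month"  # day, week, month, quarter, year
os.environ["CREATED_AFTER"] = "2022-01-01"

from greenmining.config import Config

config = Config()  # assumed to need no arguments
print(config.TEMPORAL_GRANULARITY, config.CREATED_AFTER)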
greenmining/controllers/repository_controller.py
CHANGED

@@ -17,7 +17,15 @@ class RepositoryController:
         self.github = Github(config.GITHUB_TOKEN)
 
     def fetch_repositories(
-        self,
+        self,
+        max_repos: int = None,
+        min_stars: int = None,
+        languages: list[str] = None,
+        keywords: str = None,
+        created_after: str = None,
+        created_before: str = None,
+        pushed_after: str = None,
+        pushed_before: str = None,
     ) -> list[Repository]:
         """Fetch repositories from GitHub.
 

@@ -25,6 +33,11 @@ class RepositoryController:
             max_repos: Maximum number of repositories to fetch
             min_stars: Minimum stars filter
             languages: List of programming languages to filter
+            keywords: Custom search keywords (default: "microservices")
+            created_after: Repository created after date (YYYY-MM-DD)
+            created_before: Repository created before date (YYYY-MM-DD)
+            pushed_after: Repository pushed after date (YYYY-MM-DD)
+            pushed_before: Repository pushed before date (YYYY-MM-DD)
 
         Returns:
             List of Repository model instances

@@ -32,12 +45,23 @@ class RepositoryController:
         max_repos = max_repos or self.config.MAX_REPOS
         min_stars = min_stars or self.config.MIN_STARS
         languages = languages or self.config.SUPPORTED_LANGUAGES
+        keywords = keywords or "microservices"
 
         colored_print(f"🔍 Fetching up to {max_repos} repositories...", "cyan")
+        colored_print(f" Keywords: {keywords}", "cyan")
         colored_print(f" Filters: min_stars={min_stars}", "cyan")
 
-
-
+        if created_after or created_before:
+            colored_print(
+                f" Created: {created_after or 'any'} to {created_before or 'any'}", "cyan"
+            )
+        if pushed_after or pushed_before:
+            colored_print(f" Pushed: {pushed_after or 'any'} to {pushed_before or 'any'}", "cyan")
+
+        # Build search query with temporal filters
+        query = self._build_temporal_query(
+            keywords, min_stars, created_after, created_before, pushed_after, pushed_before
+        )
 
         try:
             # Execute search

@@ -76,6 +100,37 @@ class RepositoryController:
             colored_print(f"❌ Error fetching repositories: {e}", "red")
             raise
 
+    def _build_temporal_query(
+        self,
+        keywords: str,
+        min_stars: int,
+        created_after: str = None,
+        created_before: str = None,
+        pushed_after: str = None,
+        pushed_before: str = None,
+    ) -> str:
+        """Build GitHub search query with temporal constraints."""
+        query_parts = [keywords, f"stars:>={min_stars}"]
+
+        # Temporal filters
+        if created_after and created_before:
+            query_parts.append(f"created:{created_after}..{created_before}")
+        elif created_after:
+            query_parts.append(f"created:>={created_after}")
+        elif created_before:
+            query_parts.append(f"created:<={created_before}")
+
+        if pushed_after and pushed_before:
+            query_parts.append(f"pushed:{pushed_after}..{pushed_before}")
+        elif pushed_after:
+            query_parts.append(f"pushed:>={pushed_after}")
+        elif pushed_before:
+            query_parts.append(f"pushed:<={pushed_before}")
+
+        query = " ".join(query_parts)
+        colored_print(f" Query: {query}", "cyan")
+        return query
+
     def load_repositories(self) -> list[Repository]:
         """Load repositories from file.
 
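Tracing _build_temporal_query by hand shows the kind of GitHub search string it produces; stars:, created:, and pushed: qualifiers with ".." ranges are standard GitHub search syntax. A standalone re-statement of the same branches, with invented inputs:

# Invented inputs; mirrors the query-building branches above.
keywords, min_stars = "microservices", 100
created_after, created_before = "2022-01-01", "2023-12-31"
pushed_after, pushed_before = "2024-01-01", None

parts = [keywords, f"stars:>={min_stars}"]
if created_after and created_before:
    parts.append(f"created:{created_after}..{created_before}")
if pushed_after and not pushed_before:
    parts.append(f"pushed:>={pushed_after}")

query = " ".join(parts)
assert query == "microservices stars:>=100 created:2022-01-01..2023-12-31 pushed:>=2024-01-01"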
greenmining/gsf_patterns.py
CHANGED

@@ -356,7 +356,12 @@ GSF_PATTERNS = {
     "compress_ml_models": {
         "name": "Compress ML Models for Inference",
         "category": "ai",
-        "keywords": [
+        "keywords": [
+            "model compression",
+            "quantization",
+            "model pruning",
+            "knowledge distillation",
+        ],
         "description": "Reduce model size through quantization, pruning, distillation",
         "sci_impact": "Dramatically reduces inference energy and memory",
     },

@@ -370,14 +375,14 @@ GSF_PATTERNS = {
     "energy_efficient_ai_edge": {
         "name": "Energy Efficient AI at Edge",
         "category": "ai",
-        "keywords": ["edge", "ai", "
+        "keywords": ["edge inference", "edge ai", "edge ml", "tflite", "onnx runtime"],
         "description": "Run inference on edge devices when possible",
         "sci_impact": "Eliminates network transfer, uses local compute",
     },
     "energy_efficient_framework": {
         "name": "Energy Efficient Framework",
         "category": "ai",
-        "keywords": ["
+        "keywords": ["tensorflow", "pytorch", "onnx", "jax", "huggingface"],
         "description": "Choose ML frameworks optimized for efficiency",
         "sci_impact": "Different frameworks have different energy profiles",
     },

@@ -405,14 +410,14 @@ GSF_PATTERNS = {
     "right_hardware_ai": {
         "name": "Right Hardware Type for AI",
         "category": "ai",
-        "keywords": ["
+        "keywords": ["gpu training", "tpu", "cuda", "nvidia ai", "ml accelerator"],
         "description": "Use appropriate hardware (GPU/TPU) for AI workloads",
         "sci_impact": "Specialized hardware is more energy efficient",
     },
     "serverless_ml": {
         "name": "Serverless Model Development",
         "category": "ai",
-        "keywords": ["
+        "keywords": ["sagemaker", "vertex ai", "azure ml", "lambda inference", "serverless ml"],
         "description": "Use serverless platforms for ML development",
         "sci_impact": "Pay-per-use, no idle resources",
     },
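This diff does not show how these keyword lists are consumed, but the keyword method of the analyze command presumably checks them against commit text. A plausible minimal sketch, assuming case-insensitive substring matching; the match_patterns helper is hypothetical:

# Hypothetical matcher: which GSF patterns' keywords appear in a commit message.
from greenmining.gsf_patterns import GSF_PATTERNS


def match_patterns(message: str) -> list[str]:
    text = message.lower()
    return [
        pattern_id
        for pattern_id, pattern in GSF_PATTERNS.items()
        if any(keyword in text for keyword in pattern["keywords"])
    ]


print(match_patterns("Apply int8 quantization to shrink the model for edge ai deployment"))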
greenmining/models/commit.py
CHANGED
greenmining/models/repository.py
CHANGED
greenmining/services/commit_extractor.py
CHANGED

@@ -1,9 +1,11 @@
 """Commit extractor for green microservices mining."""
 
+from __future__ import annotations
+
 import json
 from datetime import datetime, timedelta
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any, Dict, List, Optional
 
 import click
 from github import Github

@@ -29,6 +31,7 @@ class CommitExtractor:
         skip_merges: bool = True,
         days_back: int = 730,
         github_token: str | None = None,
+        timeout: int = 60,
     ):
         """Initialize commit extractor.
 

@@ -37,12 +40,14 @@ class CommitExtractor:
             skip_merges: Skip merge commits
             days_back: Only analyze commits from last N days
             github_token: GitHub API token (optional)
+            timeout: Timeout in seconds per repository (default: 60)
         """
         self.max_commits = max_commits
         self.skip_merges = skip_merges
         self.days_back = days_back
         self.cutoff_date = datetime.now() - timedelta(days=days_back)
         self.github = Github(github_token) if github_token else None
+        self.timeout = timeout
 
     def extract_from_repositories(self, repositories: list[dict[str, Any]]) -> list[dict[str, Any]]:
         """Extract commits from list of repositories.

@@ -62,14 +67,35 @@ class CommitExtractor:
             "cyan",
         )
 
+        import signal
+
+        def timeout_handler(signum, frame):
+            raise TimeoutError("Repository extraction timeout")
+
         with tqdm(total=len(repositories), desc="Processing repositories", unit="repo") as pbar:
             for repo in repositories:
                 try:
+                    # Set timeout alarm
+                    signal.signal(signal.SIGALRM, timeout_handler)
+                    signal.alarm(self.timeout)
+
                     commits = self._extract_repo_commits(repo)
                     all_commits.extend(commits)
+
+                    # Cancel alarm
+                    signal.alarm(0)
+
                     pbar.set_postfix({"commits": len(all_commits), "failed": len(failed_repos)})
                     pbar.update(1)
+                except TimeoutError:
+                    signal.alarm(0)  # Cancel alarm
+                    colored_print(
+                        f"\nTimeout processing {repo['full_name']} (>{self.timeout}s)", "yellow"
+                    )
+                    failed_repos.append(repo["full_name"])
+                    pbar.update(1)
                 except Exception as e:
+                    signal.alarm(0)  # Cancel alarm
                     colored_print(f"\nError processing {repo['full_name']}: {e}", "yellow")
                     failed_repos.append(repo["full_name"])
                     pbar.update(1)
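One caveat on the timeout mechanism added above: signal.SIGALRM exists only on Unix, alarms are delivered only to the main thread, and signal.alarm() has whole-second granularity, so sub-second per-repository timeouts are not possible with this approach. A minimal standalone sketch of the same pattern:

# Unix-only, main-thread-only sketch of the SIGALRM timeout pattern used above.
import signal
import time


def timeout_handler(signum, frame):
    raise TimeoutError("operation timed out")


signal.signal(signal.SIGALRM, timeout_handler)
signal.alarm(2)  # deliver SIGALRM after ~2 seconds
try:
    time.sleep(10)  # stand-in for a slow repository extraction
except TimeoutError:
    print("timed out")
finally:
    signal.alarm(0)  # always cancel any pending alarm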
|