greenmining 0.1.12__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- greenmining/__version__.py +1 -1
- greenmining/analyzers/__init__.py +17 -0
- greenmining/analyzers/code_diff_analyzer.py +238 -0
- greenmining/analyzers/ml_feature_extractor.py +512 -0
- greenmining/analyzers/nlp_analyzer.py +365 -0
- greenmining/analyzers/qualitative_analyzer.py +460 -0
- greenmining/analyzers/statistical_analyzer.py +245 -0
- greenmining/analyzers/temporal_analyzer.py +434 -0
- greenmining/cli.py +119 -24
- greenmining/config.py +21 -0
- greenmining/controllers/repository_controller.py +50 -2
- greenmining/gsf_patterns.py +10 -5
- greenmining/models/aggregated_stats.py +3 -1
- greenmining/models/commit.py +3 -0
- greenmining/models/repository.py +3 -1
- greenmining/presenters/console_presenter.py +3 -1
- greenmining/services/commit_extractor.py +27 -1
- greenmining/services/data_aggregator.py +163 -5
- greenmining/services/data_analyzer.py +111 -8
- greenmining/services/github_fetcher.py +62 -5
- greenmining/services/reports.py +123 -2
- {greenmining-0.1.12.dist-info → greenmining-1.0.1.dist-info}/METADATA +250 -22
- greenmining-1.0.1.dist-info/RECORD +36 -0
- greenmining-0.1.12.dist-info/RECORD +0 -29
- {greenmining-0.1.12.dist-info → greenmining-1.0.1.dist-info}/WHEEL +0 -0
- {greenmining-0.1.12.dist-info → greenmining-1.0.1.dist-info}/entry_points.txt +0 -0
- {greenmining-0.1.12.dist-info → greenmining-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {greenmining-0.1.12.dist-info → greenmining-1.0.1.dist-info}/top_level.txt +0 -0
greenmining/cli.py
CHANGED
@@ -35,7 +35,20 @@ def cli(config_file, verbose):
     type=str,
     help="Search keywords (e.g., 'kubernetes', 'docker', 'cloud-native')",
 )
-def fetch(max_repos, min_stars, languages, keywords):
+@click.option("--created-after", type=str, help="Repository created after (YYYY-MM-DD)")
+@click.option("--created-before", type=str, help="Repository created before (YYYY-MM-DD)")
+@click.option("--pushed-after", type=str, help="Repository pushed after (YYYY-MM-DD)")
+@click.option("--pushed-before", type=str, help="Repository pushed before (YYYY-MM-DD)")
+def fetch(
+    max_repos,
+    min_stars,
+    languages,
+    keywords,
+    created_after,
+    created_before,
+    pushed_after,
+    pushed_before,
+):
     """Fetch repositories from GitHub based on custom search keywords."""
     presenter.show_banner()
     colored_print(f"\n🎯 Target: {max_repos} repositories\n", "cyan")
@@ -45,7 +58,14 @@ def fetch(max_repos, min_stars, languages, keywords):

     try:
         repositories = controller.fetch_repositories(
-            max_repos=max_repos,
+            max_repos=max_repos,
+            min_stars=min_stars,
+            languages=lang_list,
+            keywords=keywords,
+            created_after=created_after,
+            created_before=created_before,
+            pushed_after=pushed_after,
+            pushed_before=pushed_before,
         )

         # Show results
@@ -67,11 +87,12 @@ def fetch(max_repos, min_stars, languages, keywords):
 @click.option("--max-commits", default=50, type=int, help="Max commits per repository")
 @click.option("--skip-merges", is_flag=True, default=True, help="Skip merge commits")
 @click.option("--days-back", default=730, type=int, help="Days to look back (default: 2 years)")
-def extract(max_commits, skip_merges, days_back):
+@click.option("--timeout", default=60, type=int, help="Timeout per repo in seconds (default: 60)")
+def extract(max_commits, skip_merges, days_back, timeout):
     """Extract commits from fetched repositories."""
     presenter.show_banner()

-    from services.commit_extractor import CommitExtractor
+    from greenmining.services.commit_extractor import CommitExtractor

     try:
         # Load repositories
@@ -86,14 +107,14 @@ def extract(max_commits, skip_merges, days_back):

         # Extract commits
         extractor = CommitExtractor(
-            max_commits=max_commits, skip_merges=skip_merges, days_back=days_back
+            max_commits=max_commits, skip_merges=skip_merges, days_back=days_back, timeout=timeout
         )
         commits = extractor.extract_from_repositories(
             repositories=[r.to_dict() for r in repositories]
         )

         # Save commits
-        from utils import save_json_file
+        from greenmining.utils import save_json_file

         save_json_file(commits, config.COMMITS_FILE)
         colored_print(f" Saved to: {config.COMMITS_FILE}", "cyan")
@@ -119,12 +140,15 @@ def extract(max_commits, skip_merges, days_back):

 @cli.command()
 @click.option("--batch-size", default=10, type=int, help="Batch size for processing")
-def analyze(batch_size):
+@click.option("--enable-diff-analysis", is_flag=True, help="Enable code diff analysis (slower)")
+@click.option("--enable-nlp", is_flag=True, help="Enable NLP-enhanced pattern detection")
+@click.option("--enable-ml-features", is_flag=True, help="Enable ML feature extraction")
+def analyze(batch_size, enable_diff_analysis, enable_nlp, enable_ml_features):
     """Analyze commits for green software patterns."""
     presenter.show_banner()

-    from services.data_analyzer import DataAnalyzer
-    from utils import save_json_file
+    from greenmining.services.data_analyzer import DataAnalyzer
+    from greenmining.utils import save_json_file

     try:
         # Load commits
@@ -133,12 +157,27 @@ def analyze(batch_size):

         commits = load_json_file(config.COMMITS_FILE)
         colored_print(f"\n🔬 Analyzing {len(commits)} commits for green patterns...\n", "cyan")
-
+
+        # Show enabled methods
+        methods = ["Keyword"]
+        if enable_diff_analysis:
+            methods.append("Code Diff")
+        if enable_nlp:
+            methods.append("NLP")
+        if enable_ml_features:
+            methods.append("ML Features")
+
+        colored_print(f" Methods: {' + '.join(methods)}\n", "cyan")
         colored_print(f" Batch size: {batch_size}\n", "cyan")

         # Analyze
-        analyzer = DataAnalyzer(
-
+        analyzer = DataAnalyzer(
+            batch_size=batch_size,
+            enable_diff_analysis=enable_diff_analysis,
+            enable_nlp=enable_nlp,
+            enable_ml_features=enable_ml_features,
+        )
+        results = analyzer.analyze_commits(commits)

         # Save results
         save_json_file(results, config.ANALYSIS_FILE)
@@ -165,12 +204,20 @@ def analyze(batch_size):


 @cli.command()
-def aggregate():
+@click.option("--enable-enhanced-stats", is_flag=True, help="Enable enhanced statistical analysis")
+@click.option("--enable-temporal", is_flag=True, help="Enable temporal trend analysis")
+@click.option(
+    "--temporal-granularity",
+    default="quarter",
+    type=click.Choice(["day", "week", "month", "quarter", "year"]),
+    help="Temporal analysis granularity",
+)
+def aggregate(enable_enhanced_stats, enable_temporal, temporal_granularity):
     """Aggregate analysis results and generate statistics."""
     presenter.show_banner()

-    from services.data_aggregator import DataAggregator
-    from utils import save_json_file
+    from greenmining.services.data_aggregator import DataAggregator
+    from greenmining.utils import save_json_file

     try:
         # Load data
@@ -182,8 +229,20 @@ def aggregate():

         colored_print(f"\n📊 Aggregating results from {len(results)} commits...\n", "cyan")

+        # Show enabled features
+        if enable_enhanced_stats:
+            colored_print(" Enhanced statistics: Enabled\n", "cyan")
+        if enable_temporal:
+            colored_print(
+                f" Temporal analysis: Enabled (granularity: {temporal_granularity})\n", "cyan"
+            )
+
         # Aggregate
-        aggregator = DataAggregator(
+        aggregator = DataAggregator(
+            enable_enhanced_stats=enable_enhanced_stats,
+            enable_temporal=enable_temporal,
+            temporal_granularity=temporal_granularity,
+        )
         aggregated = aggregator.aggregate(results, repos)

         # Save
@@ -193,7 +252,15 @@ def aggregate():
         presenter.show_analysis_results(aggregated)

         if aggregated.get("known_patterns"):
-            presenter
+            # Convert list format to dict format expected by presenter
+            patterns_dict = {}
+            for pattern in aggregated["known_patterns"]:
+                patterns_dict[pattern["pattern_name"]] = {
+                    "count": pattern["count"],
+                    "percentage": pattern["percentage"],
+                    "confidence_distribution": pattern.get("confidence_breakdown", {}),
+                }
+            presenter.show_pattern_distribution(patterns_dict, limit=10)

         presenter.show_success(f"Aggregation complete! Results saved to {config.AGGREGATED_FILE}")

@@ -208,20 +275,48 @@ def report(output):
     """Generate comprehensive markdown report."""
     presenter.show_banner()

-    from services.reports import ReportGenerator
+    from greenmining.services.reports import ReportGenerator

     try:
         # Load aggregated data
         if not config.AGGREGATED_FILE.exists():
             raise FileNotFoundError("No aggregated data found. Run 'aggregate' first.")

+        # Load analysis results
+        if not config.ANALYSIS_FILE.exists():
+            raise FileNotFoundError("No analysis results found. Run 'analyze' first.")
+
+        # Load repository data
+        if not config.REPOS_FILE.exists():
+            raise FileNotFoundError("No repository data found. Run 'fetch' first.")
+
         aggregated = load_json_file(config.AGGREGATED_FILE)
+        analysis_results = load_json_file(config.ANALYSIS_FILE)
+        repos_data = load_json_file(config.REPOS_FILE)
+
+        # Wrap analysis results if it's a list
+        if isinstance(analysis_results, list):
+            analysis = {"results": analysis_results, "total": len(analysis_results)}
+        else:
+            analysis = analysis_results
+
+        # Wrap repos data if it's a list
+        if isinstance(repos_data, list):
+            repos = {"repositories": repos_data, "total": len(repos_data)}
+        else:
+            repos = repos_data

         colored_print("\n📄 Generating comprehensive report...\n", "cyan")

         # Generate report
         generator = ReportGenerator()
-
+        report_content = generator.generate_report(aggregated, analysis, repos)
+
+        # Save report
+        from pathlib import Path
+
+        report_path = Path(output)
+        report_path.write_text(report_content)

         presenter.show_success(f"Report generated: {report_path}")
         colored_print("\n📖 The report includes:", "cyan")
@@ -323,8 +418,8 @@ def pipeline(max_repos, skip_fetch):

     # Phase 2: Extract
     colored_print("\n[2/5] 📝 Extracting commits...", "cyan")
-    from services.commit_extractor import CommitExtractor
-    from utils import save_json_file
+    from greenmining.services.commit_extractor import CommitExtractor
+    from greenmining.utils import save_json_file

     controller = RepositoryController(config)
     repos = controller.load_repositories()
@@ -335,7 +430,7 @@ def pipeline(max_repos, skip_fetch):

     # Phase 3: Analyze
     colored_print("\n[3/5] 🔬 Analyzing commits...", "cyan")
-    from services.data_analyzer import DataAnalyzer
+    from greenmining.services.data_analyzer import DataAnalyzer

     commits = load_json_file(config.COMMITS_FILE)
     analyzer = DataAnalyzer()
@@ -347,7 +442,7 @@ def pipeline(max_repos, skip_fetch):

     # Phase 4: Aggregate
     colored_print("\n[4/5] 📊 Aggregating results...", "cyan")
-    from services.data_aggregator import DataAggregator
+    from greenmining.services.data_aggregator import DataAggregator

     aggregator = DataAggregator()
     aggregated = aggregator.aggregate(results, [r.to_dict() for r in repos])
@@ -355,7 +450,7 @@ def pipeline(max_repos, skip_fetch):

     # Phase 5: Report
     colored_print("\n[5/5] 📄 Generating report...", "cyan")
-    from services.reports import ReportGenerator
+    from greenmining.services.reports import ReportGenerator

     generator = ReportGenerator()
     generator.generate_report(aggregated)
greenmining/config.py
CHANGED
@@ -49,6 +49,27 @@ class Config:
         self.COMMITS_PER_REPO = int(os.getenv("COMMITS_PER_REPO", "50"))
         self.DAYS_BACK = int(os.getenv("DAYS_BACK", "730"))  # 2 years

+        # Advanced Analyzer Configuration
+        self.ENABLE_NLP_ANALYSIS = os.getenv("ENABLE_NLP_ANALYSIS", "false").lower() == "true"
+        self.ENABLE_TEMPORAL_ANALYSIS = (
+            os.getenv("ENABLE_TEMPORAL_ANALYSIS", "false").lower() == "true"
+        )
+        self.TEMPORAL_GRANULARITY = os.getenv(
+            "TEMPORAL_GRANULARITY", "quarter"
+        )  # day, week, month, quarter, year
+        self.ENABLE_ML_FEATURES = os.getenv("ENABLE_ML_FEATURES", "false").lower() == "true"
+        self.VALIDATION_SAMPLE_SIZE = int(os.getenv("VALIDATION_SAMPLE_SIZE", "30"))
+
+        # Temporal Filtering (NEW)
+        self.CREATED_AFTER = os.getenv("CREATED_AFTER")  # YYYY-MM-DD
+        self.CREATED_BEFORE = os.getenv("CREATED_BEFORE")  # YYYY-MM-DD
+        self.PUSHED_AFTER = os.getenv("PUSHED_AFTER")  # YYYY-MM-DD
+        self.PUSHED_BEFORE = os.getenv("PUSHED_BEFORE")  # YYYY-MM-DD
+        self.COMMIT_DATE_FROM = os.getenv("COMMIT_DATE_FROM")  # YYYY-MM-DD
+        self.COMMIT_DATE_TO = os.getenv("COMMIT_DATE_TO")  # YYYY-MM-DD
+        self.MIN_COMMITS = int(os.getenv("MIN_COMMITS", "0"))
+        self.ACTIVITY_WINDOW_DAYS = int(os.getenv("ACTIVITY_WINDOW_DAYS", "730"))
+
         # Analysis Configuration
         self.BATCH_SIZE = int(os.getenv("BATCH_SIZE", "10"))

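Every new setting above is read from an environment variable with a safe default. A minimal sketch of toggling them, assuming Config is importable from greenmining.config and takes no required constructor arguments (neither is shown in this diff):

import os

# Set the new 1.0.1 knobs before the Config object reads them.
os.environ["ENABLE_TEMPORAL_ANALYSIS"] = "true"
os.environ["TEMPORAL_GRANULARITY"] = "month"
os.environ["CREATED_AFTER"] = "2022-01-01"
os.environ["MIN_COMMITS"] = "20"

from greenmining.config import Config  # assumed import path

config = Config()
print(config.ENABLE_TEMPORAL_ANALYSIS)  # True
print(config.TEMPORAL_GRANULARITY)      # "month"
print(config.CREATED_AFTER)             # "2022-01-01"
print(config.MIN_COMMITS)               # 20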
greenmining/controllers/repository_controller.py
CHANGED
@@ -22,6 +22,10 @@ class RepositoryController:
         min_stars: int = None,
         languages: list[str] = None,
         keywords: str = None,
+        created_after: str = None,
+        created_before: str = None,
+        pushed_after: str = None,
+        pushed_before: str = None,
     ) -> list[Repository]:
         """Fetch repositories from GitHub.

@@ -30,6 +34,10 @@ class RepositoryController:
             min_stars: Minimum stars filter
             languages: List of programming languages to filter
             keywords: Custom search keywords (default: "microservices")
+            created_after: Repository created after date (YYYY-MM-DD)
+            created_before: Repository created before date (YYYY-MM-DD)
+            pushed_after: Repository pushed after date (YYYY-MM-DD)
+            pushed_before: Repository pushed before date (YYYY-MM-DD)

         Returns:
             List of Repository model instances
@@ -43,8 +51,17 @@ class RepositoryController:
         colored_print(f" Keywords: {keywords}", "cyan")
         colored_print(f" Filters: min_stars={min_stars}", "cyan")

-
-
+        if created_after or created_before:
+            colored_print(
+                f" Created: {created_after or 'any'} to {created_before or 'any'}", "cyan"
+            )
+        if pushed_after or pushed_before:
+            colored_print(f" Pushed: {pushed_after or 'any'} to {pushed_before or 'any'}", "cyan")
+
+        # Build search query with temporal filters
+        query = self._build_temporal_query(
+            keywords, min_stars, created_after, created_before, pushed_after, pushed_before
+        )

         try:
             # Execute search
@@ -83,6 +100,37 @@ class RepositoryController:
             colored_print(f"❌ Error fetching repositories: {e}", "red")
             raise

+    def _build_temporal_query(
+        self,
+        keywords: str,
+        min_stars: int,
+        created_after: str = None,
+        created_before: str = None,
+        pushed_after: str = None,
+        pushed_before: str = None,
+    ) -> str:
+        """Build GitHub search query with temporal constraints."""
+        query_parts = [keywords, f"stars:>={min_stars}"]
+
+        # Temporal filters
+        if created_after and created_before:
+            query_parts.append(f"created:{created_after}..{created_before}")
+        elif created_after:
+            query_parts.append(f"created:>={created_after}")
+        elif created_before:
+            query_parts.append(f"created:<={created_before}")
+
+        if pushed_after and pushed_before:
+            query_parts.append(f"pushed:{pushed_after}..{pushed_before}")
+        elif pushed_after:
+            query_parts.append(f"pushed:>={pushed_after}")
+        elif pushed_before:
+            query_parts.append(f"pushed:<={pushed_before}")
+
+        query = " ".join(query_parts)
+        colored_print(f" Query: {query}", "cyan")
+        return query
+
     def load_repositories(self) -> list[Repository]:
         """Load repositories from file.

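To make the resulting search string concrete, here is a standalone sketch that mirrors the _build_temporal_query logic above; it is not the packaged method, just the same qualifier composition GitHub's repository search accepts (stars:>=N, created:A..B, pushed:>=A).

def build_temporal_query(keywords, min_stars, created_after=None, created_before=None,
                         pushed_after=None, pushed_before=None):
    # Same composition as RepositoryController._build_temporal_query shown above.
    parts = [keywords, f"stars:>={min_stars}"]
    if created_after and created_before:
        parts.append(f"created:{created_after}..{created_before}")
    elif created_after:
        parts.append(f"created:>={created_after}")
    elif created_before:
        parts.append(f"created:<={created_before}")
    if pushed_after and pushed_before:
        parts.append(f"pushed:{pushed_after}..{pushed_before}")
    elif pushed_after:
        parts.append(f"pushed:>={pushed_after}")
    elif pushed_before:
        parts.append(f"pushed:<={pushed_before}")
    return " ".join(parts)

print(build_temporal_query("microservices", 100,
                           created_after="2022-01-01", created_before="2023-12-31",
                           pushed_after="2024-01-01"))
# microservices stars:>=100 created:2022-01-01..2023-12-31 pushed:>=2024-01-01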
greenmining/gsf_patterns.py
CHANGED
@@ -356,7 +356,12 @@ GSF_PATTERNS = {
     "compress_ml_models": {
         "name": "Compress ML Models for Inference",
         "category": "ai",
-        "keywords": [
+        "keywords": [
+            "model compression",
+            "quantization",
+            "model pruning",
+            "knowledge distillation",
+        ],
         "description": "Reduce model size through quantization, pruning, distillation",
         "sci_impact": "Dramatically reduces inference energy and memory",
     },
@@ -370,14 +375,14 @@ GSF_PATTERNS = {
     "energy_efficient_ai_edge": {
         "name": "Energy Efficient AI at Edge",
         "category": "ai",
-        "keywords": ["edge", "ai", "
+        "keywords": ["edge inference", "edge ai", "edge ml", "tflite", "onnx runtime"],
         "description": "Run inference on edge devices when possible",
         "sci_impact": "Eliminates network transfer, uses local compute",
     },
     "energy_efficient_framework": {
         "name": "Energy Efficient Framework",
         "category": "ai",
-        "keywords": ["
+        "keywords": ["tensorflow", "pytorch", "onnx", "jax", "huggingface"],
         "description": "Choose ML frameworks optimized for efficiency",
         "sci_impact": "Different frameworks have different energy profiles",
     },
@@ -405,14 +410,14 @@ GSF_PATTERNS = {
     "right_hardware_ai": {
         "name": "Right Hardware Type for AI",
         "category": "ai",
-        "keywords": ["
+        "keywords": ["gpu training", "tpu", "cuda", "nvidia ai", "ml accelerator"],
         "description": "Use appropriate hardware (GPU/TPU) for AI workloads",
         "sci_impact": "Specialized hardware is more energy efficient",
     },
     "serverless_ml": {
         "name": "Serverless Model Development",
         "category": "ai",
-        "keywords": ["
+        "keywords": ["sagemaker", "vertex ai", "azure ml", "lambda inference", "serverless ml"],
         "description": "Use serverless platforms for ML development",
         "sci_impact": "Pay-per-use, no idle resources",
     },
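The 1.0.1 change above replaces truncated or overly broad keyword lists with multi-word, tool-specific phrases. The analyzer that consumes GSF_PATTERNS is not part of this diff; the sketch below only illustrates how keyword lists like these are typically matched against a commit message, using a hypothetical matcher.

# Hypothetical matcher for illustration; the real matching lives in greenmining's analyzers.
PATTERNS = {
    "compress_ml_models": {
        "name": "Compress ML Models for Inference",
        "keywords": ["model compression", "quantization", "model pruning", "knowledge distillation"],
    },
    "energy_efficient_ai_edge": {
        "name": "Energy Efficient AI at Edge",
        "keywords": ["edge inference", "edge ai", "edge ml", "tflite", "onnx runtime"],
    },
}


def match_patterns(commit_message: str) -> list[str]:
    # Case-insensitive substring match of every pattern's keyword list.
    text = commit_message.lower()
    return [
        meta["name"]
        for meta in PATTERNS.values()
        if any(keyword in text for keyword in meta["keywords"])
    ]


print(match_patterns("Apply int8 quantization to the tflite edge model"))
# ['Compress ML Models for Inference', 'Energy Efficient AI at Edge']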
greenmining/models/commit.py
CHANGED
greenmining/models/repository.py
CHANGED
greenmining/services/commit_extractor.py
CHANGED
@@ -1,9 +1,11 @@
 """Commit extractor for green microservices mining."""

+from __future__ import annotations
+
 import json
 from datetime import datetime, timedelta
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any, Dict, List, Optional

 import click
 from github import Github
@@ -29,6 +31,7 @@ class CommitExtractor:
         skip_merges: bool = True,
         days_back: int = 730,
         github_token: str | None = None,
+        timeout: int = 60,
     ):
         """Initialize commit extractor.

@@ -37,12 +40,14 @@ class CommitExtractor:
             skip_merges: Skip merge commits
             days_back: Only analyze commits from last N days
             github_token: GitHub API token (optional)
+            timeout: Timeout in seconds per repository (default: 60)
         """
         self.max_commits = max_commits
         self.skip_merges = skip_merges
         self.days_back = days_back
         self.cutoff_date = datetime.now() - timedelta(days=days_back)
         self.github = Github(github_token) if github_token else None
+        self.timeout = timeout

     def extract_from_repositories(self, repositories: list[dict[str, Any]]) -> list[dict[str, Any]]:
         """Extract commits from list of repositories.
@@ -62,14 +67,35 @@ class CommitExtractor:
             "cyan",
         )

+        import signal
+
+        def timeout_handler(signum, frame):
+            raise TimeoutError("Repository extraction timeout")
+
         with tqdm(total=len(repositories), desc="Processing repositories", unit="repo") as pbar:
             for repo in repositories:
                 try:
+                    # Set timeout alarm
+                    signal.signal(signal.SIGALRM, timeout_handler)
+                    signal.alarm(self.timeout)
+
                     commits = self._extract_repo_commits(repo)
                     all_commits.extend(commits)
+
+                    # Cancel alarm
+                    signal.alarm(0)
+
                     pbar.set_postfix({"commits": len(all_commits), "failed": len(failed_repos)})
                     pbar.update(1)
+                except TimeoutError:
+                    signal.alarm(0)  # Cancel alarm
+                    colored_print(
+                        f"\nTimeout processing {repo['full_name']} (>{self.timeout}s)", "yellow"
+                    )
+                    failed_repos.append(repo["full_name"])
+                    pbar.update(1)
                 except Exception as e:
+                    signal.alarm(0)  # Cancel alarm
                     colored_print(f"\nError processing {repo['full_name']}: {e}", "yellow")
                     failed_repos.append(repo["full_name"])
                     pbar.update(1)