osslag 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- osslag/__init__.py +0 -0
- osslag/cli.py +1380 -0
- osslag/distro/__init__.py +0 -0
- osslag/distro/debian.py +382 -0
- osslag/distro/fedora.py +38 -0
- osslag/metrics/__init__.py +0 -0
- osslag/metrics/malta.py +585 -0
- osslag/metrics/pvac.py +166 -0
- osslag/utils/__init__.py +0 -0
- osslag/utils/github_helper.py +240 -0
- osslag/utils/vcs.py +543 -0
- osslag-1.0.0.dist-info/METADATA +46 -0
- osslag-1.0.0.dist-info/RECORD +15 -0
- osslag-1.0.0.dist-info/WHEEL +4 -0
- osslag-1.0.0.dist-info/entry_points.txt +3 -0
osslag/cli.py
ADDED
@@ -0,0 +1,1380 @@
from __future__ import annotations

import logging
import logging.config
import os
import pathlib
import sys
from concurrent.futures import Future, ProcessPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable

import pandas as pd
import typer
from dotenv import load_dotenv
from rich.console import Console, Group
from rich.live import Live
from rich.panel import Panel
from rich.progress import (
    BarColumn,
    Progress,
    SpinnerColumn,
    TextColumn,
)
from rich.status import Status
from rich.table import Table
from rich.text import Text

from osslag.distro import debian as deb
from osslag.utils import github_helper as gh
from osslag.utils import vcs

load_dotenv()
app = typer.Typer()
dataset_app = typer.Typer(
    help="Dataset pipeline commands for building package analysis datasets."
)
app.add_typer(dataset_app, name="dataset")
logger = logging.getLogger(__name__)
console = Console()


# region Tasks
def _fetch_github_repo_metadata_task(args: tuple[str, str, str]) -> TaskResult:
    """Worker function to fetch GitHub repo metadata with proper timeout and rate limit handling."""
    repo_url, source, cache_path = args
    checkpoint_path = Path(cache_path, f"{source}.parquet")
    try:
        metadata_df = gh.fetch_github_repo_metadata(repo_url)
        # Save checkpoint
        metadata_df.to_parquet(checkpoint_path)
        return TaskResult(
            task_id=repo_url,
            success=True,
            data=metadata_df,
        )

    except Exception as e:
        failed_marker = checkpoint_path.parent / f"{checkpoint_path.name}.failed"
        return TaskResult(
            task_id=repo_url,
            success=False,
            error=str(e),
            failed_marker_path=failed_marker,
        )


def _fetch_github_repo_pull_requests_task(args: tuple[str, str, str]) -> TaskResult:
    """Worker function to fetch GitHub repo pull requests with proper timeout and rate limit handling."""
    repo_url, source, cache_path = args
    checkpoint_path = Path(cache_path, f"{source}.parquet")
    try:
        pull_requests_df = gh.fetch_pull_requests(repo_url)
        # Save checkpoint
        pull_requests_df.to_parquet(checkpoint_path)
        return TaskResult(
            task_id=repo_url,
            success=True,
            data=pull_requests_df,
        )

    except Exception as e:
        failed_marker = checkpoint_path.parent / f"{checkpoint_path.name}.failed"
        return TaskResult(
            task_id=repo_url,
            success=False,
            error=str(e),
            failed_marker_path=failed_marker,
        )


def _clone_task(args: tuple[str, str]) -> TaskResult:
    """Worker function for parallel cloning. Returns TaskResult."""
    repo_url, target_dir = args
    try:
        result = vcs.clone_repo(repo_url, target_dir)
        return TaskResult(
            task_id=repo_url,
            success=result.success,
            error=result.error,
        )
    except Exception as e:
        return TaskResult(task_id=repo_url, success=False, error=str(e))


def _load_commits_task(args: tuple[str, str, str, str]) -> TaskResult:
    """Worker function for parallel commit loading. Returns TaskResult with DataFrame."""
    local_repo_path, repo_url, source, cache_path = args
    checkpoint_path = Path(cache_path, f"{source}.parquet")
    try:
        repo_commits_df = vcs.load_commits(local_repo_path, include_files=True)
        repo_commits_df["repo_url"] = repo_url
        repo_commits_df["source"] = source
        # Save checkpoint
        repo_commits_df.to_parquet(checkpoint_path)
        return TaskResult(
            task_id=repo_url,
            success=True,
            data=repo_commits_df,
        )
    except Exception as e:
        error_detail = f"{type(e).__name__}: {str(e)}"
        return TaskResult(task_id=repo_url, success=False, error=error_detail)


# endregion

class SuppressConsoleLogging:
    """Context manager to temporarily suppress console logging output."""

    def __enter__(self):
        # Find and disable all console/stream handlers, saving their original levels
        self._disabled_handlers: list[tuple[logging.Handler, int]] = []
        for name in list(logging.Logger.manager.loggerDict.keys()) + ["", "root"]:
            log = logging.getLogger(name) if name else logging.getLogger()
            for handler in log.handlers[:]:
                if isinstance(handler, logging.StreamHandler) and not isinstance(
                    handler, logging.FileHandler
                ):
                    original_level = handler.level
                    handler.setLevel(logging.CRITICAL + 1)  # Effectively disable
                    self._disabled_handlers.append((handler, original_level))
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore original levels
        for handler, original_level in self._disabled_handlers:
            handler.setLevel(original_level)
        return False


@dataclass
class TaskResult:
    """Result from a parallel task execution."""

    task_id: str
    success: bool
    error: str | None = None
    data: Any = None
    failed_marker_path: pathlib.Path | None = None


@dataclass
class WorkerStatus:
    """Tracks the status of a single worker."""

    worker_id: int
    current_task: str | None = None
    tasks_completed: int = 0
    tasks_failed: int = 0

class ParallelExecutor:
    """Generic parallel task executor with a fancy Rich CLI UI.

    Usage:
        executor = ParallelExecutor(
            task_name="Cloning repositories",
            max_workers=4,
        )
        results = executor.run(
            tasks=[(url, path), ...],
            worker_fn=clone_task,
            task_id_fn=lambda t: t[0],  # Extract task ID for display
        )
    """

    def __init__(
        self,
        task_name: str,
        max_workers: int = 4,
        show_recent_completed: int = 5,
        rate_limit: int | None = None,
        rate_remaining: int | None = None,
    ):
        self.task_name = task_name
        self.max_workers = max_workers
        self.show_recent_completed = show_recent_completed

        # Rate limit info
        self.rate_limit = rate_limit
        self.rate_remaining = rate_remaining

        # Tracking state
        self.workers: dict[int, WorkerStatus] = {}
        self.completed_tasks: list[TaskResult] = []
        self.failed_tasks: list[TaskResult] = []
        self.recent_completed: list[tuple[str, bool]] = []  # (task_id, success)
        self.total_tasks = 0
        self.skipped_tasks = 0

        # Timing
        self.start_time: float | None = None

    def create_display(self, progress: Progress) -> Panel:
        """Create the rich display panel."""
        # Stats section
        completed = len(self.completed_tasks)
        failed = len(self.failed_tasks)
        in_progress = sum(1 for w in self.workers.values() if w.current_task)

        stats_table = Table.grid(padding=(0, 2))
        stats_table.add_column(style="cyan", justify="right")
        stats_table.add_column(style="white")
        stats_table.add_column(style="cyan", justify="right")
        stats_table.add_column(style="white")

        stats_table.add_row(
            "Total:",
            f"{self.total_tasks}",
            "Skipped:",
            f"{self.skipped_tasks}",
        )
        stats_table.add_row(
            "✓ Completed:",
            f"[green]{completed}[/]",
            "✗ Failed:",
            f"[red]{failed}[/]",
        )
        stats_table.add_row(
            "⟳ In Progress:",
            f"[yellow]{in_progress}[/]",
            "Workers:",
            f"{self.max_workers}",
        )

        # Add rate limit info if available
        if self.rate_limit is not None and self.rate_remaining is not None:
            stats_table.add_row(
                "Rate Limit:",
                f"[cyan]{self.rate_remaining}[/]/[white]{self.rate_limit}[/]",
                "",
                "",
            )

        # Workers section
        workers_table = Table(
            title="[bold]Active Workers[/]",
            show_header=True,
            header_style="bold magenta",
            border_style="dim",
            expand=False,
        )
        workers_table.add_column("Worker", style="cyan", width=8)
        workers_table.add_column("Status", style="white", width=12)
        workers_table.add_column(
            "Current Task", style="yellow", overflow="ellipsis", no_wrap=True, width=60
        )
        workers_table.add_column("Done", style="green", justify="right", width=6)
        workers_table.add_column("Fail", style="red", justify="right", width=6)

        for wid in sorted(self.workers.keys()):
            w = self.workers[wid]
            status = "[green]●[/] Working" if w.current_task else "[dim]○ Idle[/]"
            task_display = (
                w.current_task[:58] + "…"
                if w.current_task and len(w.current_task) > 58
                else (w.current_task or "-")
            )
            workers_table.add_row(
                f"#{wid}",
                status,
                task_display,
                str(w.tasks_completed),
                str(w.tasks_failed),
            )

        # Recent completions
        recent_text = Text()
        for task_id, success in self.recent_completed[-self.show_recent_completed :]:
            short_id = task_id[:70] + "…" if len(task_id) > 70 else task_id
            recent_text.append(" ")
            recent_text.append(
                "✓ " if success else "✗ ", style="bold green" if success else "bold red"
            )
            recent_text.append(f"{short_id}\n")

        components = [
            stats_table,
            Text(),
            progress,
            Text(),
            workers_table,
            Text(),
            Panel(
                recent_text
                if recent_text.plain
                else Text(" Waiting for tasks...", style="dim italic"),
                title="[bold]Recent Completions[/]",
                border_style="dim",
            ),
        ]

        group = Group(*components)

        return Panel(
            group,
            title=f"[bold blue]⚡ {self.task_name}[/]",
            border_style="blue",
        )

    def run(
        self,
        tasks: list[Any],
        worker_fn: Callable[[Any], TaskResult],
        task_id_fn: Callable[[Any], str],
        skipped: int = 0,
    ) -> list[TaskResult]:
        """Execute tasks in parallel with a live UI.

        Args:
            tasks: List of task arguments to pass to worker_fn
            worker_fn: Function that processes a single task and returns TaskResult
            task_id_fn: Function to extract a display ID from a task
            skipped: Number of tasks that were skipped before execution

        Returns:
            List of TaskResult objects

        """
        import time as time_module

        self.total_tasks = len(tasks)
        self.skipped_tasks = skipped
        self.workers = {i: WorkerStatus(worker_id=i) for i in range(self.max_workers)}
        self.completed_tasks = []
        self.failed_tasks = []
        self.recent_completed = []
        self.start_time = time_module.time()

        if self.total_tasks == 0:
            console.print(
                Panel(
                    "[dim]No tasks to process[/]",
                    title=f"[bold]{self.task_name}[/]",
                    border_style="dim",
                )
            )
            return []

        results: list[TaskResult] = []

        progress = Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(bar_width=40),
            TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
        )
        progress_task_id = progress.add_task(self.task_name, total=self.total_tasks)

        with (
            SuppressConsoleLogging(),
            Live(
                self.create_display(progress), refresh_per_second=4, console=console
            ) as live,
        ):
            with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
                # Map futures to (task, worker_id)
                future_to_task: dict[Future, tuple[Any, int]] = {}
                available_workers = list(range(self.max_workers))
                pending_tasks = list(tasks)

                # Submit initial batch
                while available_workers and pending_tasks:
                    worker_id = available_workers.pop(0)
                    task = pending_tasks.pop(0)
                    task_id = task_id_fn(task)

                    self.workers[worker_id].current_task = task_id
                    future = executor.submit(worker_fn, task)
                    future_to_task[future] = (task, worker_id)

                live.update(self.create_display(progress))

                # Process as futures complete
                while future_to_task:
                    # Wait for at least one to complete
                    done_futures = []
                    for future in list(future_to_task.keys()):
                        if future.done():
                            done_futures.append(future)

                    if not done_futures:
                        # Small sleep to avoid busy waiting
                        time_module.sleep(0.05)
                        continue

                    for future in done_futures:
                        task, worker_id = future_to_task.pop(future)
                        task_id = task_id_fn(task)
                        try:
                            result = future.result()
                            if result.success:
                                results.append(result)
                                self.completed_tasks.append(result)
                                self.workers[worker_id].tasks_completed += 1
                                self.recent_completed.append((task_id, result.success))
                            else:
                                results.append(result)
                                self.failed_tasks.append(result)
                                self.workers[worker_id].tasks_failed += 1
                                self.recent_completed.append((task_id, result.success))
                                marker = result.failed_marker_path
                                marker.write_text(
                                    f"Task failed: {result.error}\n"
                                ) if marker else None

                        except Exception as e:
                            error_result = TaskResult(
                                task_id=task_id, success=False, error=str(e)
                            )
                            results.append(error_result)
                            self.failed_tasks.append(error_result)
                            self.workers[worker_id].tasks_failed += 1
                            self.recent_completed.append((task_id, False))
                            result = error_result  # Assign for progress check below
                            marker = result.failed_marker_path
                            marker.write_text(
                                f"Task failed: {result.error}\n"
                            ) if marker else None
                        # Update progress
                        progress.advance(progress_task_id)

                        # Mark worker as available
                        self.workers[worker_id].current_task = None
                        available_workers.append(worker_id)

                        # Submit next task if available and not rate limited
                        if pending_tasks and available_workers:
                            next_worker = available_workers.pop(0)
                            next_task = pending_tasks.pop(0)
                            next_task_id = task_id_fn(next_task)

                            self.workers[next_worker].current_task = next_task_id
                            next_future = executor.submit(worker_fn, next_task)
                            future_to_task[next_future] = (next_task, next_worker)

                    if done_futures:
                        live.update(self.create_display(progress))

        # Final summary with elapsed time
        elapsed = time_module.time() - self.start_time
        if elapsed >= 60:
            elapsed_str = f"{int(elapsed // 60)}m {int(elapsed % 60)}s"
        else:
            elapsed_str = f"{elapsed:.1f}s"

        console.print()
        console.print(
            Panel(
                f"[green]✓ Completed:[/] {len(self.completed_tasks)} "
                f"[red]✗ Failed:[/] {len(self.failed_tasks)} "
                f"[dim]Skipped:[/] {self.skipped_tasks} "
                f"[cyan]⏱ Time:[/] {elapsed_str}",
                title=f"[bold]{self.task_name} Complete[/]",
                border_style="green" if not self.failed_tasks else "yellow",
            )
        )

        return results

def setup_logging() -> None:
    log_level = os.getenv("LOG_LEVEL", "INFO")
    log_file = os.getenv("LOG_FILE", "osslag.log")

    logging.config.dictConfig(
        {
            "version": 1,
            "disable_existing_loggers": False,
            "formatters": {
                "standard": {
                    "format": "%(asctime)s | %(name)-20s | %(funcName)-20s:%(lineno)-4d | %(levelname)-8s | %(message)s",
                },
            },
            "handlers": {
                "console": {
                    "class": "logging.StreamHandler",
                    "level": log_level,
                    "formatter": "standard",
                    "stream": "ext://sys.stdout",
                },
                "file": {
                    "class": "logging.handlers.RotatingFileHandler",
                    "level": log_level,
                    "formatter": "standard",
                    "filename": log_file,
                    "maxBytes": 5 * 1024 * 1024,
                    "backupCount": 3,
                    "encoding": "utf-8",
                },
            },
            "loggers": {
                "osslag": {
                    "handlers": ["console", "file"],
                    "level": log_level,
                    "propagate": False,
                },
                "__main__": {
                    "handlers": ["console", "file"],
                    "level": log_level,
                    "propagate": False,
                },
            },
            "root": {
                "handlers": ["console"],
                "level": log_level,
            },
        }
    )

@app.command()
def clone(repo_url: str, dest_dir: str = "./cache/repos"):
    """Clone a single Git repository to the specified destination directory."""
    dest_dir = os.path.abspath(dest_dir)
    print(f"Cloning repository {repo_url} into directory {dest_dir}")
    # vcs.clone_repo returns a result object (see _clone_task above), so check
    # its success flag rather than the truthiness of the returned object.
    result = vcs.clone_repo(repo_url, dest_dir)
    if result.success:
        print(f"Successfully cloned or updated repository {repo_url} into {dest_dir}")
    else:
        print(f"Failed to clone repository {repo_url} into {dest_dir}")


@app.command()
def get_metadata(
    repo_url: str,
    cache: str = typer.Option("./cache", help="Output path for metadata parquet file"),
):
    """Fetch GitHub repository metadata and save to a parquet file."""
    github_token = os.getenv("GITHUB_TOKEN")
    cache_path = os.getenv("CACHE_DIR") or cache
    pathlib.Path(cache_path).mkdir(parents=True, exist_ok=True)
    print(f"Fetching metadata for repository {repo_url}")
    try:
        metadata_df = gh.fetch_github_repo_metadata(repo_url, github_token)
        parquet_path = Path(cache_path, "metadata.parquet")
        metadata_df.to_parquet(parquet_path)
        print(f"Metadata saved to {parquet_path}")
    except Exception as e:
        print(f"Failed to fetch metadata for {repo_url}: {e}")


@app.command()
def rate_limit():
    """Fetch and display GitHub API rate limit information."""
    github_token = os.getenv("GITHUB_TOKEN")
    if github_token is None:
        print("GITHUB_TOKEN is not set in environment variables.")
        return
    print(f"Using token: {github_token}")
    rate_info = gh.gh_get_rate_limit_info(github_token)
    if rate_info is not None:
        print(
            f"GitHub API Rate Limit: {rate_info['limit']}/{rate_info['remaining']} remaining (resets at {rate_info['reset_datetime']})"
        )
    else:
        print("Failed to fetch rate limit info from GitHub.")


@app.command()
def pull_requests(
    repo_url: str = typer.Argument(
        ..., help="The GitHub repository URL to fetch pull requests for"
    ),
    cache: str = typer.Option("./cache", help="Cache directory"),
):
    """Fetch GitHub pull requests for a specified repository and save to a parquet file."""
    github_token = os.getenv("GITHUB_TOKEN")
    cache_path = os.getenv("CACHE_DIR") or cache
    pathlib.Path(cache_path).mkdir(parents=True, exist_ok=True)
    output_path = Path(cache_path, "pull_requests.parquet")
    print(f"Fetching pull requests for repository {repo_url}")
    try:
        pr_df = gh.fetch_pull_requests(repo_url, github_token)
        pr_df.to_parquet(output_path)
        print(f"Pull requests saved to {output_path}")
    except Exception as e:
        print(f"Failed to fetch pull requests for {repo_url}: {e}")

@dataset_app.command(name="run", rich_help_panel="Full Pipeline")
def run_dataset_pipeline(
    distro: str = typer.Option(
        "debian", help="The Linux distribution to process (e.g., 'debian' 'fedora')"
    ),
    releases: list[str] = typer.Option(
        ...,
        "--release",
        help="One or more distro releases to process (e.g., 'trixie', 'bookworm', '40'). Can repeat flag or use comma-separated.",
    ),
    cache: str = typer.Option("./cache", help="Cache directory (EV: CACHE_DIR)"),
    force: bool = typer.Option(
        False, "--force", "-f", help="Force re-processing even if cache exists"
    ),
):
    """Run the full pipeline: fetch packages, filter repos, extract versions,
    merge releases, clone repos, load commits, pull GitHub data.

    Uses cached data when available. Use --force to re-process all steps.
    """
    # Flatten: handle both repeated flags and comma-separated values
    to_process = []
    for r in releases:
        to_process.extend([item.strip() for item in r.split(",")])

    cache_dir = os.getenv("CACHE_DIR") or cache

    console.print(
        Panel(
            f"[bold]Distro:[/] {distro} [bold]Releases:[/] {', '.join(to_process)} [bold]Cache:[/] {cache_dir}"
            + (" [bold yellow]--force[/]" if force else ""),
            title="[bold blue]🚀 Dataset Pipeline[/]",
            border_style="blue",
        )
    )

    # Suppress console logging for steps 1-4 (non-parallel steps)
    with SuppressConsoleLogging():
        # Step 1: Get and cache package data for each release
        with Status("[bold cyan]Step 1/8:[/] Fetching packages...", console=console):
            fetch_packages(distro=distro, releases=to_process, cache=cache_dir)
        console.print("[green]✓[/] Step 1/8: Fetched packages")

        # Step 2: Filter GitHub repos
        with Status(
            "[bold cyan]Step 2/8:[/] Filtering GitHub repos...", console=console
        ):
            filter_debian_github_repos(
                distro=distro, release=to_process, cache=cache_dir, force=force
            )
        console.print("[green]✓[/] Step 2/8: Filtered GitHub repos")

        # Step 3: Extract the version string and add upstream version columns
        with Status(
            "[bold cyan]Step 3/8:[/] Extracting upstream versions...", console=console
        ):
            extract_upstream_versions(
                distro=distro, release=to_process, cache=cache_dir, force=force
            )
        console.print("[green]✓[/] Step 3/8: Extracted upstream versions")

        # Step 4: Merge releases into a single DataFrame with all required columns
        with Status("[bold cyan]Step 4/8:[/] Merging releases...", console=console):
            merge_releases(
                distro=distro, releases=to_process, cache=cache_dir, force=force
            )
        console.print("[green]✓[/] Step 4/8: Merged releases")

    # Step 5: Clone all upstream GitHub repos (has its own UI)
    console.print("\n[bold cyan]Step 5/8:[/] Cloning repositories...")
    clone_upstream_repos(distro=distro, cache=cache_dir)

    # Step 6: Extract all commits into a single DataFrame (has its own UI)
    console.print("\n[bold cyan]Step 6/8:[/] Loading commits...")
    load_commits_into_dataframe(distro=distro, cache=cache_dir, force=force)

    # Step 7: Fetch GitHub metadata for all repos (has its own UI)
    console.print("\n[bold cyan]Step 7/8:[/] Fetching GitHub metadata...")
    all_github_metadata(distro=distro, cache=cache_dir, force=force)

    # Step 8: Fetch GitHub pull requests for all repos (has its own UI)
    console.print("\n[bold cyan]Step 8/8:[/] Fetching GitHub pull requests...")
    all_github_pull_requests(distro=distro, cache=cache_dir, force=force)

    console.print(
        Panel(
            "[bold green]Pipeline completed successfully![/]",
            border_style="green",
        )
    )

@dataset_app.command(rich_help_panel="Step 1: Fetch Data")
|
|
696
|
+
def fetch_packages(
|
|
697
|
+
distro: str = typer.Argument(
|
|
698
|
+
...,
|
|
699
|
+
help="The Linux distribution to fetch packages for (e.g., 'debian' 'fedora')",
|
|
700
|
+
),
|
|
701
|
+
releases: list[str] = typer.Argument(
|
|
702
|
+
...,
|
|
703
|
+
help="The release(s) to fetch packages for (e.g., 'trixie', 'bookworm', '40')",
|
|
704
|
+
),
|
|
705
|
+
cache: str = typer.Option("./cache", help="Cache directory"),
|
|
706
|
+
):
|
|
707
|
+
"""Fetch and cache distribution package data for specified releases."""
|
|
708
|
+
cache_dir = os.getenv("CACHE_DIR") or cache
|
|
709
|
+
|
|
710
|
+
# Ensure cache directory exists
|
|
711
|
+
Path(cache_dir).mkdir(parents=True, exist_ok=True)
|
|
712
|
+
|
|
713
|
+
if distro.lower() == "debian":
|
|
714
|
+
for rel in releases:
|
|
715
|
+
parquet_path = Path(cache_dir, f"{distro}_{rel}_all_packages.parquet")
|
|
716
|
+
if parquet_path.exists():
|
|
717
|
+
logger.info(f"Using cached {rel} packages from {parquet_path}")
|
|
718
|
+
continue
|
|
719
|
+
|
|
720
|
+
# Show status since this can take a while (large file download + parsing)
|
|
721
|
+
with Status(
|
|
722
|
+
f"[bold cyan]Fetching {rel} packages (this may take a minute)...[/]",
|
|
723
|
+
console=console,
|
|
724
|
+
):
|
|
725
|
+
logger.info(f"Fetching and caching {rel} packages to {parquet_path}")
|
|
726
|
+
df: pd.DataFrame | None = deb.fetch_packages(rel)
|
|
727
|
+
if df is None:
|
|
728
|
+
logger.error(f"Failed to fetch {rel} packages.")
|
|
729
|
+
console.print(f"[red]✗ Failed to fetch {rel} packages[/]")
|
|
730
|
+
else:
|
|
731
|
+
df.to_parquet(parquet_path)
|
|
732
|
+
console.print(f"[green]✓ Fetched {len(df):,} {rel} packages[/]")
|
|
733
|
+
else:
|
|
734
|
+
logger.error(f"Distro '{distro}' is not supported for fetching packages.")
|
|
735
|
+
|
|
736
|
+
|
|
737
|
+
@dataset_app.command(rich_help_panel="Step 2: Filter Repos")
|
|
738
|
+
def filter_debian_github_repos(
|
|
739
|
+
distro: str = typer.Argument(
|
|
740
|
+
..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"
|
|
741
|
+
),
|
|
742
|
+
release: list[str] = typer.Argument(
|
|
743
|
+
...,
|
|
744
|
+
help="One or more distro releases to process (e.g., 'trixie', 'bookworm', '40'). Can repeat flag or use comma-separated.",
|
|
745
|
+
),
|
|
746
|
+
cache: str = typer.Option("./cache", help="Cache directory"),
|
|
747
|
+
force: bool = typer.Option(
|
|
748
|
+
False, "--force", "-f", help="Force re-processing even if cache exists"
|
|
749
|
+
),
|
|
750
|
+
):
|
|
751
|
+
"""Filter distro package DataFrames to only include GitHub repositories."""
|
|
752
|
+
cache_dir = os.getenv("CACHE_DIR") or cache
|
|
753
|
+
|
|
754
|
+
if distro.lower() == "debian":
|
|
755
|
+
for rel in release:
|
|
756
|
+
filtered_parquet_path = Path(
|
|
757
|
+
cache_dir, f"{distro}_{rel}_filtered_packages.parquet"
|
|
758
|
+
)
|
|
759
|
+
if filtered_parquet_path.exists() and not force:
|
|
760
|
+
logger.info(
|
|
761
|
+
f"Using cached filtered packages from {filtered_parquet_path}"
|
|
762
|
+
)
|
|
763
|
+
continue
|
|
764
|
+
|
|
765
|
+
parquet_path = Path(cache_dir, f"{distro}_{rel}_all_packages.parquet")
|
|
766
|
+
if not parquet_path.exists():
|
|
767
|
+
logger.error(
|
|
768
|
+
f"Required parquet file {parquet_path} does not exist. Please run the 'fetch-packages' command first."
|
|
769
|
+
)
|
|
770
|
+
continue
|
|
771
|
+
|
|
772
|
+
logger.info(f"Filtering GitHub repositories for Debian release '{rel}'")
|
|
773
|
+
df: pd.DataFrame = pd.read_parquet(parquet_path)
|
|
774
|
+
size_before = df.shape[0]
|
|
775
|
+
filtered_df = deb.filter_github_repos(df)
|
|
776
|
+
size_after = filtered_df.shape[0]
|
|
777
|
+
logger.info(
|
|
778
|
+
f"Dropped {size_before - size_after} packages due to non-GitHub '{rel}'."
|
|
779
|
+
)
|
|
780
|
+
filtered_df = deb.add_local_repo_cache_path_column(
|
|
781
|
+
filtered_df, cache_dir=cache_dir
|
|
782
|
+
)
|
|
783
|
+
filtered_df.reset_index(drop=True, inplace=True)
|
|
784
|
+
filtered_df.to_parquet(filtered_parquet_path)
|
|
785
|
+
logger.info(
|
|
786
|
+
f"Filtered GitHub repositories saved to {filtered_parquet_path}"
|
|
787
|
+
)
|
|
788
|
+
else:
|
|
789
|
+
logger.error(
|
|
790
|
+
f"Distro '{distro}' is not supported for filtering GitHub repositories."
|
|
791
|
+
)
|
|
792
|
+
|
|
793
|
+
|
|
794
|
+
@dataset_app.command(rich_help_panel="Step 3: Extract Versions")
|
|
795
|
+
def extract_upstream_versions(
|
|
796
|
+
distro: str = typer.Argument(
|
|
797
|
+
..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"
|
|
798
|
+
),
|
|
799
|
+
release: list[str] = typer.Argument(
|
|
800
|
+
...,
|
|
801
|
+
help="One or more distro releases to process (e.g., 'trixie', 'bookworm', '40'). Can repeat flag or use comma-separated.",
|
|
802
|
+
),
|
|
803
|
+
cache: str = typer.Option("./cache", help="Cache directory"),
|
|
804
|
+
force: bool = typer.Option(
|
|
805
|
+
False, "--force", "-f", help="Force re-processing even if cache exists"
|
|
806
|
+
),
|
|
807
|
+
):
|
|
808
|
+
"""Extract upstream version strings from Debian package versions and add as a new column."""
|
|
809
|
+
cache_dir = os.getenv("CACHE_DIR") or cache
|
|
810
|
+
|
|
811
|
+
if distro.lower() == "debian":
|
|
812
|
+
for rel in release:
|
|
813
|
+
versions_parquet_path = Path(
|
|
814
|
+
cache_dir, f"{distro}_{rel}_packages_with_upstream_versions.parquet"
|
|
815
|
+
)
|
|
816
|
+
if versions_parquet_path.exists() and not force:
|
|
817
|
+
logger.info(
|
|
818
|
+
f"Using cached upstream versions from {versions_parquet_path}"
|
|
819
|
+
)
|
|
820
|
+
continue
|
|
821
|
+
|
|
822
|
+
filtered_parquet_path = Path(
|
|
823
|
+
cache_dir, f"{distro}_{rel}_filtered_packages.parquet"
|
|
824
|
+
)
|
|
825
|
+
if not filtered_parquet_path.exists():
|
|
826
|
+
logger.error(
|
|
827
|
+
f"Required parquet file {filtered_parquet_path} does not exist. Please run the 'filter-debian-github-repos' command first."
|
|
828
|
+
)
|
|
829
|
+
continue
|
|
830
|
+
|
|
831
|
+
logger.info(f"Extracting upstream versions for Debian release '{rel}'")
|
|
832
|
+
df: pd.DataFrame = pd.read_parquet(filtered_parquet_path)
|
|
833
|
+
version_column = f"{rel}_upstream_version"
|
|
834
|
+
df_with_versions = deb.add_upstream_version_column(
|
|
835
|
+
df, f"{rel}_version", new_column_name=version_column
|
|
836
|
+
)
|
|
837
|
+
drop_before = df_with_versions.shape[0]
|
|
838
|
+
df_with_versions.dropna(subset=[version_column], inplace=True)
|
|
839
|
+
drop_after = df_with_versions.shape[0]
|
|
840
|
+
logger.info(
|
|
841
|
+
f"Dropped {drop_before - drop_after} rows with missing upstream versions for release '{rel}'."
|
|
842
|
+
)
|
|
843
|
+
df_with_versions.reset_index(drop=True, inplace=True)
|
|
844
|
+
df_with_versions.to_parquet(versions_parquet_path)
|
|
845
|
+
logger.info(
|
|
846
|
+
f"Upstream versions extracted and saved to {versions_parquet_path}"
|
|
847
|
+
)
|
|
848
|
+
else:
|
|
849
|
+
logger.error(
|
|
850
|
+
f"Distro '{distro}' is not supported for extracting upstream versions."
|
|
851
|
+
)
|
|
852
|
+
|
|
853
|
+
|
|
854
|
+
@dataset_app.command(rich_help_panel="Step 4: Merge Releases")
|
|
855
|
+
def merge_releases(
|
|
856
|
+
distro: str = typer.Argument(
|
|
857
|
+
..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"
|
|
858
|
+
),
|
|
859
|
+
releases: list[str] = typer.Argument(
|
|
860
|
+
...,
|
|
861
|
+
help="One or more distro releases to merge (e.g., 'trixie', 'bookworm', '40'). Can repeat flag or use comma-separated.",
|
|
862
|
+
),
|
|
863
|
+
cache: str = typer.Option("./cache", help="Cache directory"),
|
|
864
|
+
force: bool = typer.Option(
|
|
865
|
+
False, "--force", "-f", help="Force re-processing even if cache exists"
|
|
866
|
+
),
|
|
867
|
+
):
|
|
868
|
+
"""Merge multiple release DataFrames into a single DataFrame with all required columns."""
|
|
869
|
+
cache_dir = os.getenv("CACHE_DIR") or cache
|
|
870
|
+
|
|
871
|
+
if distro.lower() == "debian":
|
|
872
|
+
merged_parquet_path = Path(
|
|
873
|
+
cache_dir, f"{distro}_merged_releases_packages.parquet"
|
|
874
|
+
)
|
|
875
|
+
if merged_parquet_path.exists() and not force:
|
|
876
|
+
logger.info(f"Using cached merged releases from {merged_parquet_path}")
|
|
877
|
+
return
|
|
878
|
+
|
|
879
|
+
dfs = []
|
|
880
|
+
for rel in releases:
|
|
881
|
+
versions_parquet_path = Path(
|
|
882
|
+
cache_dir, f"{distro}_{rel}_packages_with_upstream_versions.parquet"
|
|
883
|
+
)
|
|
884
|
+
if not versions_parquet_path.exists():
|
|
885
|
+
logger.error(
|
|
886
|
+
f"Required parquet file {versions_parquet_path} does not exist. Please run the 'extract-upstream-versions' command first."
|
|
887
|
+
)
|
|
888
|
+
continue
|
|
889
|
+
|
|
890
|
+
logger.info(
|
|
891
|
+
f"Loading packages with upstream versions for Debian release '{rel}'"
|
|
892
|
+
)
|
|
893
|
+
df: pd.DataFrame = pd.read_parquet(versions_parquet_path)
|
|
894
|
+
dfs.append(df)
|
|
895
|
+
deb_merged_df, deb_dropped_after_merge = deb.merge_release_packages(dfs)
|
|
896
|
+
logger.info(
|
|
897
|
+
f"Merged releases {releases}. Dropped {deb_dropped_after_merge.shape[0]} rows that were not present in all releases."
|
|
898
|
+
)
|
|
899
|
+
deb_merged_df.reset_index(drop=True, inplace=True)
|
|
900
|
+
deb_merged_df.to_parquet(merged_parquet_path)
|
|
901
|
+
logger.info(f"Merged release packages saved to {merged_parquet_path}")
|
|
902
|
+
deb_dropped_after_merge.to_parquet(
|
|
903
|
+
Path(cache_dir, f"{distro}_dropped_after_merge.parquet")
|
|
904
|
+
)
|
|
905
|
+
logger.info(
|
|
906
|
+
f"Dropped rows after merge saved to {Path(cache_dir, f'{distro}_dropped_after_merge.parquet')}"
|
|
907
|
+
)
|
|
908
|
+
|
|
909
|
+
else:
|
|
910
|
+
logger.error(f"Distro '{distro}' is not supported for merging releases.")
|
|
911
|
+
|
|
912
|
+
|
|
913
|
+
@dataset_app.command(rich_help_panel="Step 5: Clone Repos")
|
|
914
|
+
def clone_upstream_repos(
|
|
915
|
+
distro: str = typer.Argument(
|
|
916
|
+
..., help="The distro for (e.g., 'debian' 'fedora', etc.)"
|
|
917
|
+
),
|
|
918
|
+
repos_cache: str = typer.Option(
|
|
919
|
+
"./cache/repos", help="Cache directory for cloned repositories"
|
|
920
|
+
),
|
|
921
|
+
cache: str = typer.Option("./cache", help="Cache directory"),
|
|
922
|
+
max_workers: int = typer.Option(
|
|
923
|
+
4, help="Maximum number of parallel clone processes (env: MAX_WORKERS)"
|
|
924
|
+
),
|
|
925
|
+
):
|
|
926
|
+
"""Clone all upstream GitHub repositories in the filtered package DataFrame."""
|
|
927
|
+
cache_dir = os.getenv("CACHE_DIR") or cache
|
|
928
|
+
repos_cache = os.getenv("REPOS_CACHE_DIR") or repos_cache
|
|
929
|
+
max_workers = int(os.getenv("MAX_WORKERS", str(max_workers)))
|
|
930
|
+
if distro.lower() == "debian":
|
|
931
|
+
parquet_path = Path(cache_dir, f"{distro}_merged_releases_packages.parquet")
|
|
932
|
+
if not parquet_path.exists():
|
|
933
|
+
console.print(
|
|
934
|
+
f"[red]Error:[/] Required parquet file {parquet_path} does not exist. Please run the 'merge-releases' command first."
|
|
935
|
+
)
|
|
936
|
+
return
|
|
937
|
+
|
|
938
|
+
# Suppress logging during setup
|
|
939
|
+
with SuppressConsoleLogging():
|
|
940
|
+
df: pd.DataFrame = pd.read_parquet(parquet_path)
|
|
941
|
+
repos_cache_path = pathlib.Path(repos_cache)
|
|
942
|
+
|
|
943
|
+
vcs.ensure_dir(repos_cache_path)
|
|
944
|
+
|
|
945
|
+
# Build list of repos to clone (skip already cloned)
|
|
946
|
+
clone_tasks: list[tuple[str, str]] = []
|
|
947
|
+
skipped = 0
|
|
948
|
+
invalid = 0
|
|
949
|
+
for _, row in df.iterrows():
|
|
950
|
+
repo_url = str(row["upstream_repo_url"])
|
|
951
|
+
target_dir = vcs.construct_repo_local_path(
|
|
952
|
+
repo_url, cache_dir=repos_cache_path, must_exist=False
|
|
953
|
+
)
|
|
954
|
+
if target_dir is None:
|
|
955
|
+
invalid += 1
|
|
956
|
+
continue
|
|
957
|
+
if target_dir.exists():
|
|
958
|
+
skipped += 1
|
|
959
|
+
continue
|
|
960
|
+
# Skip if there is a .failed marker file
|
|
961
|
+
failed_marker = repos_cache_path / f"{target_dir.name}.failed"
|
|
962
|
+
if failed_marker.exists():
|
|
963
|
+
skipped += 1
|
|
964
|
+
continue
|
|
965
|
+
|
|
966
|
+
clone_tasks.append((repo_url, str(target_dir)))
|
|
967
|
+
|
|
968
|
+
if invalid > 0:
|
|
969
|
+
console.print(f"[yellow]Skipped {invalid} invalid repository URLs[/]")
|
|
970
|
+
|
|
971
|
+
if len(clone_tasks) == 0:
|
|
972
|
+
console.print(f"[green]All {skipped} repositories already cloned.[/]")
|
|
973
|
+
return
|
|
974
|
+
|
|
975
|
+
# Use the parallel executor with fancy UI
|
|
976
|
+
executor = ParallelExecutor(
|
|
977
|
+
task_name=f"Cloning {distro.title()} Repositories",
|
|
978
|
+
max_workers=max_workers,
|
|
979
|
+
)
|
|
980
|
+
executor.run(
|
|
981
|
+
tasks=clone_tasks,
|
|
982
|
+
worker_fn=_clone_task,
|
|
983
|
+
task_id_fn=lambda t: t[0], # repo_url
|
|
984
|
+
skipped=skipped,
|
|
985
|
+
)
|
|
986
|
+
else:
|
|
987
|
+
console.print(
|
|
988
|
+
f"[red]Error:[/] Distro '{distro}' is not supported for cloning repositories."
|
|
989
|
+
)
|
|
990
|
+
|
|
991
|
+
|
|
992
|
+
@dataset_app.command(rich_help_panel="Step 6: Load Commits")
|
|
993
|
+
def load_commits_into_dataframe(
|
|
994
|
+
distro: str = typer.Argument(
|
|
995
|
+
..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"
|
|
996
|
+
),
|
|
997
|
+
cache: str = typer.Option("./cache", help="Cache directory"),
|
|
998
|
+
repo_cache: str = typer.Option(
|
|
999
|
+
"./cache/repos", help="Cache directory for cloned repositories"
|
|
1000
|
+
),
|
|
1001
|
+
max_workers: int = typer.Option(
|
|
1002
|
+
4, help="Maximum number of parallel worker processes (env: MAX_WORKERS)"
|
|
1003
|
+
),
|
|
1004
|
+
force: bool = typer.Option(
|
|
1005
|
+
False, "--force", "-f", help="Force re-processing even if cache exists"
|
|
1006
|
+
),
|
|
1007
|
+
):
|
|
1008
|
+
"""Load all GitHub commits for the upstream repositories into a single DataFrame."""
|
|
1009
|
+
cache_dir = os.getenv("CACHE_DIR") or cache
|
|
1010
|
+
repo_cache_dir = os.getenv("REPOS_CACHE_DIR") or repo_cache
|
|
1011
|
+
checkpoint_dir = Path(cache_dir, "commit_checkpoints")
|
|
1012
|
+
max_workers = int(os.getenv("MAX_WORKERS", str(max_workers)))
|
|
1013
|
+
|
|
1014
|
+
commits_parquet_path = Path(cache_dir, f"{distro}_all_upstream_commits.parquet")
|
|
1015
|
+
if commits_parquet_path.exists() and not force:
|
|
1016
|
+
console.print(f"[green]Using cached commits from {commits_parquet_path}[/]")
|
|
1017
|
+
return
|
|
1018
|
+
|
|
1019
|
+
all_packages_parquet_path = Path(
|
|
1020
|
+
cache_dir, f"{distro}_merged_releases_packages.parquet"
|
|
1021
|
+
)
|
|
1022
|
+
if not all_packages_parquet_path.exists():
|
|
1023
|
+
console.print(
|
|
1024
|
+
f"[red]Error:[/] Required parquet file {all_packages_parquet_path} does not exist. Please run the 'merge-releases' and 'clone-upstream-repos' commands first."
|
|
1025
|
+
)
|
|
1026
|
+
return
|
|
1027
|
+
# Create checkpoint directory
|
|
1028
|
+
vcs.ensure_dir(checkpoint_dir)
|
|
1029
|
+
|
|
1030
|
+
if force and checkpoint_dir.exists():
|
|
1031
|
+
console.print(f"[yellow]Removing existing checkpoint at {checkpoint_dir}[/]")
|
|
1032
|
+
for ck in checkpoint_dir.iterdir():
|
|
1033
|
+
if ck.name.endswith(".parquet"):
|
|
1034
|
+
ck.unlink()
|
|
1035
|
+
|
|
1036
|
+
df: pd.DataFrame = pd.read_parquet(all_packages_parquet_path)
|
|
1037
|
+
|
|
1038
|
+
# Build list of tasks (skip repos without local paths)
|
|
1039
|
+
tasks: list[tuple[str, str, str, str]] = []
|
|
1040
|
+
skipped = 0
|
|
1041
|
+
for _, row in df.iterrows():
|
|
1042
|
+
repo_url = str(row["upstream_repo_url"])
|
|
1043
|
+
local_repo_path = vcs.construct_repo_local_path(
|
|
1044
|
+
repo_url, cache_dir=Path(repo_cache_dir), must_exist=True
|
|
1045
|
+
)
|
|
1046
|
+
if local_repo_path is None or not local_repo_path.exists():
|
|
1047
|
+
skipped += 1
|
|
1048
|
+
continue
|
|
1049
|
+
source = str(row["source"])
|
|
1050
|
+
if Path(checkpoint_dir, f"{source}.parquet").exists():
|
|
1051
|
+
skipped += 1
|
|
1052
|
+
continue
|
|
1053
|
+
tasks.append((str(local_repo_path), repo_url, source, str(checkpoint_dir)))
|
|
1054
|
+
results: list[TaskResult] = []
|
|
1055
|
+
# Run tasks if any
|
|
1056
|
+
if len(tasks) > 0:
|
|
1057
|
+
# Use the parallel executor with fancy UI
|
|
1058
|
+
executor = ParallelExecutor(
|
|
1059
|
+
task_name=f"Loading {distro.title()} Commits",
|
|
1060
|
+
max_workers=max_workers,
|
|
1061
|
+
)
|
|
1062
|
+
results = executor.run(
|
|
1063
|
+
tasks=tasks,
|
|
1064
|
+
worker_fn=_load_commits_task,
|
|
1065
|
+
task_id_fn=lambda t: t[1], # repo_url
|
|
1066
|
+
skipped=skipped,
|
|
1067
|
+
)
|
|
1068
|
+
# Collect all the checkpointed DataFrames
|
|
1069
|
+
if checkpoint_dir.exists():
|
|
1070
|
+
try:
|
|
1071
|
+
console.print(
|
|
1072
|
+
f"[green]Loading checkpointed commits from {checkpoint_dir}[/]"
|
|
1073
|
+
)
|
|
1074
|
+
for ck in checkpoint_dir.iterdir():
|
|
1075
|
+
if not ck.name.endswith(".parquet"):
|
|
1076
|
+
continue
|
|
1077
|
+
|
|
1078
|
+
checkpoint_df: pd.DataFrame = pd.read_parquet(ck)
|
|
1079
|
+
results.append(
|
|
1080
|
+
TaskResult(
|
|
1081
|
+
task_id="checkpoint",
|
|
1082
|
+
success=True,
|
|
1083
|
+
data=checkpoint_df,
|
|
1084
|
+
)
|
|
1085
|
+
)
|
|
1086
|
+
except Exception as e:
|
|
1087
|
+
console.print(
|
|
1088
|
+
f"[yellow]Warning:[/] Failed to load checkpointed commits: {e}[/]"
|
|
1089
|
+
)
|
|
1090
|
+
|
|
1091
|
+
# Collect successful DataFrames
|
|
1092
|
+
all_commits = [r.data for r in results if r.success and r.data is not None]
|
|
1093
|
+
|
|
1094
|
+
if all_commits:
|
|
1095
|
+
console.print(f"[green]Loaded commits from {len(all_commits)} repositories.[/]")
|
|
1096
|
+
combined_commits_df = pd.concat(all_commits, ignore_index=True)
|
|
1097
|
+
commits_parquet_path = Path(cache_dir, f"{distro}_all_upstream_commits.parquet")
|
|
1098
|
+
combined_commits_df.to_parquet(commits_parquet_path)
|
|
1099
|
+
console.print(
|
|
1100
|
+
f"[green]Saved {len(combined_commits_df):,} commits to {commits_parquet_path}[/]"
|
|
1101
|
+
)
|
|
1102
|
+
else:
|
|
1103
|
+
console.print("[yellow]No commits were loaded from any repositories.[/]")
|
|
1104
|
+
|
|
1105
|
+
|
|
1106
|
+
@dataset_app.command(rich_help_panel="Step 7: GitHub Metadata")
|
|
1107
|
+
def all_github_metadata(
|
|
1108
|
+
distro: str = typer.Option(
|
|
1109
|
+
"debian", help="The Linux distribution to process (default: debian)"
|
|
1110
|
+
),
|
|
1111
|
+
cache: str = typer.Option("./cache", help="Cache directory"),
|
|
1112
|
+
max_workers: int = typer.Option(
|
|
1113
|
+
4, help="Maximum number of parallel GitHub API workers (env: MAX_WORKERS)"
|
|
1114
|
+
),
|
|
1115
|
+
force: bool = typer.Option(
|
|
1116
|
+
False, "--force", "-f", help="Force re-processing even if cache exists"
|
|
1117
|
+
),
|
|
1118
|
+
):
|
|
1119
|
+
"""Fetch GitHub repository metadata for all unique repos in the commits parquet file."""
|
|
1120
|
+
cache_dir = os.getenv("CACHE_DIR") or cache
|
|
1121
|
+
max_workers = int(os.getenv("MAX_WORKERS", str(max_workers)))
|
|
1122
|
+
all_packages_parquet_path = Path(
|
|
1123
|
+
cache_dir, f"{distro}_merged_releases_packages.parquet"
|
|
1124
|
+
)
|
|
1125
|
+
output_parquet_path = Path(cache_dir, f"{distro}_github_repo_metadata.parquet")
|
|
1126
|
+
checkpoint_dir = Path(cache_dir, "github_metadata_checkpoints")
|
|
1127
|
+
# Create checkpoint directory
|
|
1128
|
+
vcs.ensure_dir(checkpoint_dir)
|
|
1129
|
+
|
|
1130
|
+
if force and checkpoint_dir.exists():
|
|
1131
|
+
console.print(
|
|
1132
|
+
f"[yellow]Removing existing GitHub metadata checkpoint at {checkpoint_dir}[/]"
|
|
1133
|
+
)
|
|
1134
|
+
for ck in checkpoint_dir.iterdir():
|
|
1135
|
+
if ck.name.endswith(".parquet"):
|
|
1136
|
+
ck.unlink()
|
|
1137
|
+
|
|
1138
|
+
if output_parquet_path.exists() and not force:
|
|
1139
|
+
console.print(
|
|
1140
|
+
f"[green]Using cached GitHub metadata from {output_parquet_path}[/]"
|
|
1141
|
+
)
|
|
1142
|
+
return
|
|
1143
|
+
|
|
1144
|
+
if not all_packages_parquet_path.exists():
|
|
1145
|
+
console.print(
|
|
1146
|
+
f"[red]Error:[/] Required parquet file {all_packages_parquet_path} does not exist. Please run the 'merge-releases' command first."
|
|
1147
|
+
)
|
|
1148
|
+
return
|
|
1149
|
+
|
|
1150
|
+
df: pd.DataFrame = pd.read_parquet(all_packages_parquet_path)
|
|
1151
|
+
|
|
1152
|
+
# Build list of tasks (skip repos without local paths)
|
|
1153
|
+
tasks: list[tuple[str, str, str]] = []
|
|
1154
|
+
skipped = 0
|
|
1155
|
+
for _, row in df.iterrows():
|
|
1156
|
+
repo_url = str(row["upstream_repo_url"])
|
|
1157
|
+
source = str(row["source"])
|
|
1158
|
+
if Path(checkpoint_dir, f"{source}.parquet").exists():
|
|
1159
|
+
skipped += 1
|
|
1160
|
+
continue
|
|
1161
|
+
# Skip if there is a .failed marker file
|
|
1162
|
+
failed_marker = checkpoint_dir / f"{source}.failed"
|
|
1163
|
+
if failed_marker.exists():
|
|
1164
|
+
skipped += 1
|
|
1165
|
+
continue
|
|
1166
|
+
tasks.append((repo_url, source, str(checkpoint_dir)))
|
|
1167
|
+
results: list[TaskResult] = []
|
|
1168
|
+
|
|
1169
|
+
# Display rate limit info
|
|
1170
|
+
github_token = os.getenv("GITHUB_TOKEN")
|
|
1171
|
+
rate_info = gh.gh_get_rate_limit_info(github_token)
|
|
1172
|
+
rate_limit = rate_info["limit"] if rate_info else None
|
|
1173
|
+
rate_remaining = rate_info["remaining"] if rate_info else None
|
|
1174
|
+
if rate_info:
|
|
1175
|
+
console.print(
|
|
1176
|
+
f"[cyan]GitHub API Rate Limit:[/] {rate_info['remaining']}/{rate_info['limit']} remaining (resets at {rate_info['reset_datetime']})"
|
|
1177
|
+
)
|
|
1178
|
+
else:
|
|
1179
|
+
console.print("[yellow]Warning:[/] Could not fetch rate limit info")
|
|
1180
|
+
|
|
1181
|
+
console.print(f"[cyan]Fetching GitHub metadata for {len(tasks)} repositories...[/]")
|
|
1182
|
+
executor = ParallelExecutor(
|
|
1183
|
+
task_name="GitHub Metadata Fetch",
|
|
1184
|
+
max_workers=min(max_workers, len(tasks)),
|
|
1185
|
+
rate_limit=rate_limit,
|
|
1186
|
+
rate_remaining=rate_remaining,
|
|
1187
|
+
)
|
|
1188
|
+
results = executor.run(
|
|
1189
|
+
tasks=tasks,
|
|
1190
|
+
worker_fn=_fetch_github_repo_metadata_task,
|
|
1191
|
+
task_id_fn=lambda t: t[0],
|
|
1192
|
+
skipped=skipped,
|
|
1193
|
+
)
|
|
1194
|
+
# Collect all the checkpointed DataFrames
|
|
1195
|
+
if checkpoint_dir.exists():
|
|
1196
|
+
try:
|
|
1197
|
+
console.print(
|
|
1198
|
+
f"[green]Loading checkpointed commits from {checkpoint_dir}[/]"
|
|
1199
|
+
)
|
|
1200
|
+
for ck in checkpoint_dir.iterdir():
|
|
1201
|
+
if not ck.name.endswith(".parquet"):
|
|
1202
|
+
continue
|
|
1203
|
+
|
|
1204
|
+
checkpoint_df: pd.DataFrame = pd.read_parquet(ck)
|
|
1205
|
+
results.append(
|
|
1206
|
+
TaskResult(
|
|
1207
|
+
task_id="checkpoint",
|
|
1208
|
+
success=True,
|
|
1209
|
+
data=checkpoint_df,
|
|
1210
|
+
)
|
|
1211
|
+
)
|
|
1212
|
+
except Exception as e:
|
|
1213
|
+
console.print(
|
|
1214
|
+
f"[yellow]Warning:[/] Failed to load checkpointed commits: {e}[/]"
|
|
1215
|
+
)
|
|
1216
|
+
# Collect successful DataFrames
|
|
1217
|
+
all_metadata = [r.data for r in results if r.success and r.data is not None]
|
|
1218
|
+
|
|
1219
|
+
if all_metadata:
|
|
1220
|
+
console.print(
|
|
1221
|
+
f"[green]Loaded metadata from {len(all_metadata)} repositories.[/]"
|
|
1222
|
+
)
|
|
1223
|
+
combined_metadata_df = pd.concat(all_metadata, ignore_index=True)
|
|
1224
|
+
metadata_parquet_path = Path(
|
|
1225
|
+
cache_dir, f"{distro}_all_upstream_metadata.parquet"
|
|
1226
|
+
)
|
|
1227
|
+
combined_metadata_df.to_parquet(metadata_parquet_path)
|
|
1228
|
+
console.print(
|
|
1229
|
+
f"[green]Saved {len(combined_metadata_df):,} metadata entries to {metadata_parquet_path}[/]"
|
|
1230
|
+
)
|
|
1231
|
+
else:
|
|
1232
|
+
console.print(
|
|
1233
|
+
"[yellow]No metadata entries were loaded from any repositories.[/]"
|
|
1234
|
+
)
|
|
1235
|
+
|
|
1236
|
+
|
|
1237
|
+
@dataset_app.command(rich_help_panel="Step 8: GitHub Metadata")
|
|
1238
|
+
def all_github_pull_requests(
|
|
1239
|
+
distro: str = typer.Option(
|
|
1240
|
+
"debian", help="The Linux distribution to process (default: debian)"
|
|
1241
|
+
),
|
|
1242
|
+
cache: str = typer.Option("./cache", help="Cache directory"),
|
|
1243
|
+
max_workers: int = typer.Option(
|
|
1244
|
+
4, help="Maximum number of parallel GitHub API workers (env: MAX_WORKERS)"
|
|
1245
|
+
),
|
|
1246
|
+
force: bool = typer.Option(
|
|
1247
|
+
False, "--force", "-f", help="Force re-processing even if cache exists"
|
|
1248
|
+
),
|
|
1249
|
+
):
|
|
1250
|
+
"""Fetch GitHub repository pull requests for all unique repos in the commits parquet file."""
|
|
1251
|
+
cache_dir = os.getenv("CACHE_DIR") or cache
|
|
1252
|
+
max_workers = int(os.getenv("MAX_WORKERS", str(max_workers)))
|
|
1253
|
+
all_packages_parquet_path = Path(
|
|
1254
|
+
cache_dir, f"{distro}_merged_releases_packages.parquet"
|
|
1255
|
+
)
|
|
1256
|
+
output_parquet_path = Path(cache_dir, f"{distro}_github_repo_pull_requests.parquet")
|
|
1257
|
+
checkpoint_dir = Path(cache_dir, "github_pr_checkpoints")
|
|
1258
|
+
# Create checkpoint directory
|
|
1259
|
+
vcs.ensure_dir(checkpoint_dir)
|
|
1260
|
+
|
|
1261
|
+
if output_parquet_path.exists() and not force:
|
|
1262
|
+
console.print(
|
|
1263
|
+
f"[green]Using cached GitHub pull requests from {output_parquet_path}[/]"
|
|
1264
|
+
)
|
|
1265
|
+
return
|
|
1266
|
+
|
|
1267
|
+
if not all_packages_parquet_path.exists():
|
|
1268
|
+
console.print(
|
|
1269
|
+
f"[red]Error:[/] Required parquet file {all_packages_parquet_path} does not exist. Please run the 'merge-releases' command first."
|
|
1270
|
+
)
|
|
1271
|
+
return
|
|
1272
|
+
|
|
1273
|
+
if force and checkpoint_dir.exists():
|
|
1274
|
+
console.print(
|
|
1275
|
+
f"[yellow]Removing existing GitHub pull requests checkpoint at {checkpoint_dir}[/]"
|
|
1276
|
+
)
|
|
1277
|
+
for ck in checkpoint_dir.iterdir():
|
|
1278
|
+
if ck.name.endswith(".parquet"):
|
|
1279
|
+
ck.unlink()
|
|
1280
|
+
|
|
1281
|
+
df: pd.DataFrame = pd.read_parquet(all_packages_parquet_path)
|
|
1282
|
+
|
|
1283
|
+
# Build list of tasks (skip repos without local paths)
|
|
1284
|
+
tasks: list[tuple[str, str, str]] = []
|
|
1285
|
+
skipped = 0
|
|
1286
|
+
for _, row in df.iterrows():
|
|
1287
|
+
repo_url = str(row["upstream_repo_url"])
|
|
1288
|
+
source = str(row["source"])
|
|
1289
|
+
if Path(checkpoint_dir, f"{source}.parquet").exists():
|
|
1290
|
+
skipped += 1
|
|
1291
|
+
continue
|
|
1292
|
+
# Skip if there is a .failed marker file
|
|
1293
|
+
failed_marker = checkpoint_dir / f"{source}.failed"
|
|
1294
|
+
if failed_marker.exists():
|
|
1295
|
+
skipped += 1
|
|
1296
|
+
continue
|
|
1297
|
+
tasks.append((repo_url, source, str(checkpoint_dir)))
|
|
1298
|
+
results: list[TaskResult] = []
|
|
1299
|
+
|
|
1300
|
+
# Display rate limit info
|
|
1301
|
+
github_token = os.getenv("GITHUB_TOKEN")
|
|
1302
|
+
rate_info = gh.gh_get_rate_limit_info(github_token)
|
|
1303
|
+
if rate_info:
|
|
1304
|
+
console.print(
|
|
1305
|
+
f"[cyan]GitHub API Rate Limit:[/] {rate_info['remaining']}/{rate_info['limit']} remaining (resets at {rate_info['reset_datetime']})"
|
|
1306
|
+
)
|
|
1307
|
+
else:
|
|
1308
|
+
console.print("[yellow]Warning:[/] Could not fetch rate limit info")
|
|
1309
|
+
|
|
1310
|
+
console.print(
|
|
1311
|
+
f"[cyan]Fetching GitHub pull requests for {len(tasks)} repositories...[/]"
|
|
1312
|
+
)
|
|
1313
|
+
executor = ParallelExecutor(
|
|
1314
|
+
task_name="GitHub Pull Requests Fetch",
|
|
1315
|
+
max_workers=min(max_workers, len(tasks)),
|
|
1316
|
+
)
|
|
1317
|
+
results = executor.run(
|
|
1318
|
+
tasks=tasks,
|
|
1319
|
+
worker_fn=_fetch_github_repo_pull_requests_task,
|
|
1320
|
+
task_id_fn=lambda t: t[0],
|
|
1321
|
+
skipped=skipped,
|
|
1322
|
+
)
|
|
1323
|
+
# Collect all the checkpointed DataFrames
|
|
1324
|
+
if checkpoint_dir.exists():
|
|
1325
|
+
try:
|
|
1326
|
+
console.print(
|
|
1327
|
+
f"[green]Loading checkpointed commits from {checkpoint_dir}[/]"
|
|
1328
|
+
)
|
|
1329
|
+
for ck in checkpoint_dir.iterdir():
|
|
1330
|
+
if not ck.name.endswith(".parquet"):
|
|
1331
|
+
continue
|
|
1332
|
+
|
|
1333
|
+
checkpoint_df: pd.DataFrame = pd.read_parquet(ck)
|
|
1334
|
+
results.append(
|
|
1335
|
+
TaskResult(
|
|
1336
|
+
task_id="checkpoint",
|
|
1337
|
+
success=True,
|
|
1338
|
+
data=checkpoint_df,
|
|
1339
|
+
)
|
|
1340
|
+
)
|
|
1341
|
+
except Exception as e:
|
|
1342
|
+
console.print(
|
|
1343
|
+
f"[yellow]Warning:[/] Failed to load checkpointed commits: {e}[/]"
|
|
1344
|
+
)
|
|
1345
|
+
# Collect successful DataFrames
|
|
1346
|
+
all_metadata = [r.data for r in results if r.success and r.data is not None]
|
|
1347
|
+
|
|
1348
|
+
if all_metadata:
|
|
1349
|
+
console.print(
|
|
1350
|
+
f"[green]Loaded pull requests from {len(all_metadata)} repositories.[/]"
|
|
1351
|
+
)
|
|
1352
|
+
combined_metadata_df = pd.concat(all_metadata, ignore_index=True)
|
|
1353
|
+
metadata_parquet_path = Path(
|
|
1354
|
+
cache_dir, f"{distro}_all_upstream_pull_requests.parquet"
|
|
1355
|
+
)
|
|
1356
|
+
combined_metadata_df.to_parquet(metadata_parquet_path)
|
|
1357
|
+
console.print(
|
|
1358
|
+
f"[green]Saved {len(combined_metadata_df):,} pull request entries to {metadata_parquet_path}[/]"
|
|
1359
|
+
)
|
|
1360
|
+
else:
|
|
1361
|
+
console.print(
|
|
1362
|
+
"[yellow]No pull request entries were loaded from any repositories.[/]"
|
|
1363
|
+
)
|
|
1364
|
+
|
|
1365
|
+
|
|
1366
|
+
@app.command()
def show_cache():
    """Show the current cache directory."""
    cache_dir = os.getenv("CACHE_DIR", "./cache")
    console.print(f"[blue]Current cache directory:[/] {cache_dir}")


def main():
    """Main entry point for the CLI application."""
    setup_logging()
    # Show help menu if no arguments provided
    if len(sys.argv) == 1:
        app(["--help"])
    else:
        app()
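For reference, the commands defined in cli.py above can also be driven programmatically through Typer's test runner. The snippet below is a minimal, hypothetical sketch and is not part of the osslag 1.0.0 wheel: it assumes the package is installed, that GITHUB_TOKEN and CACHE_DIR point at working values, and it uses placeholder Debian release names ('trixie', 'bookworm'); the console-script name registered in entry_points.txt is not shown in this diff.

# Illustrative only (not part of osslag 1.0.0): drives the Typer app from
# osslag/cli.py in-process. Release names and the cache path are placeholders.
import os

from typer.testing import CliRunner

from osslag.cli import app, setup_logging

setup_logging()
os.environ.setdefault("CACHE_DIR", "./cache")

runner = CliRunner()

# Check remaining GitHub API quota before kicking off a large run.
result = runner.invoke(app, ["rate-limit"])
print(result.output)

# Run the full dataset pipeline for two Debian releases, equivalent to the
# console script's `dataset run --release trixie --release bookworm`.
result = runner.invoke(
    app, ["dataset", "run", "--release", "trixie", "--release", "bookworm"]
)
print(f"pipeline exit code: {result.exit_code}")

The same invocations work from a shell through whatever console script entry_points.txt declares, since every command shown above is registered on the top-level Typer app or its "dataset" sub-app.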