osslag-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
osslag/cli.py ADDED
@@ -0,0 +1,1380 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import logging.config
5
+ import os
6
+ import pathlib
7
+ import sys
8
+ from concurrent.futures import Future, ProcessPoolExecutor
9
+ from dataclasses import dataclass
10
+ from pathlib import Path
11
+ from typing import Any, Callable
12
+
13
+ import pandas as pd
14
+ import typer
15
+ from dotenv import load_dotenv
16
+ from rich.console import Console, Group
17
+ from rich.live import Live
18
+ from rich.panel import Panel
19
+ from rich.progress import (
20
+ BarColumn,
21
+ Progress,
22
+ SpinnerColumn,
23
+ TextColumn,
24
+ )
25
+ from rich.status import Status
26
+ from rich.table import Table
27
+ from rich.text import Text
28
+
29
+ from osslag.distro import debian as deb
30
+ from osslag.utils import github_helper as gh
31
+ from osslag.utils import vcs
32
+
33
+ load_dotenv()
34
+ app = typer.Typer()
35
+ dataset_app = typer.Typer(
36
+ help="Dataset pipeline commands for building package analysis datasets."
37
+ )
38
+ app.add_typer(dataset_app, name="dataset")
39
+ logger = logging.getLogger(__name__)
40
+ console = Console()
41
+
42
+
43
+ # region Tasks
44
+ def _fetch_github_repo_metadata_task(args: tuple[str, str, str]) -> TaskResult:
45
+ """Worker function to fetch GitHub repo metadata with proper timeout and rate limit handling."""
46
+ repo_url, source, cache_path = args
47
+ checkpoint_path = Path(cache_path, f"{source}.parquet")
48
+ try:
49
+ metadata_df = gh.fetch_github_repo_metadata(repo_url)
50
+ # Save checkpoint
51
+ metadata_df.to_parquet(checkpoint_path)
52
+ return TaskResult(
53
+ task_id=repo_url,
54
+ success=True,
55
+ data=metadata_df,
56
+ )
57
+
58
+ except Exception as e:
59
+ failed_marker = checkpoint_path.parent / f"{checkpoint_path.name}.failed"
60
+ return TaskResult(
61
+ task_id=repo_url,
62
+ success=False,
63
+ error=str(e),
64
+ failed_marker_path=failed_marker,
65
+ )
66
+
67
+
68
+ def _fetch_github_repo_pull_requests_task(args: tuple[str, str, str]) -> TaskResult:
69
+ """Worker function to fetch GitHub repo pull requests with proper timeout and rate limit handling."""
70
+ repo_url, source, cache_path = args
71
+ checkpoint_path = Path(cache_path, f"{source}.parquet")
72
+ try:
73
+ pull_requests_df = gh.fetch_pull_requests(repo_url)
74
+ # Save checkpoint
75
+ pull_requests_df.to_parquet(checkpoint_path)
76
+ return TaskResult(
77
+ task_id=repo_url,
78
+ success=True,
79
+ data=pull_requests_df,
80
+ )
81
+
82
+ except Exception as e:
83
+ failed_marker = checkpoint_path.parent / f"{checkpoint_path.name}.failed"
84
+ return TaskResult(
85
+ task_id=repo_url,
86
+ success=False,
87
+ error=str(e),
88
+ failed_marker_path=failed_marker,
89
+ )
90
+
91
+
92
+ def _clone_task(args: tuple[str, str]) -> TaskResult:
93
+ """Worker function for parallel cloning. Returns TaskResult."""
94
+ repo_url, target_dir = args
95
+ try:
96
+ result = vcs.clone_repo(repo_url, target_dir)
97
+ return TaskResult(
98
+ task_id=repo_url,
99
+ success=result.success,
100
+ error=result.error,
101
+ )
102
+ except Exception as e:
103
+ return TaskResult(task_id=repo_url, success=False, error=str(e))
104
+
105
+
106
+ def _load_commits_task(args: tuple[str, str, str, str]) -> TaskResult:
107
+ """Worker function for parallel commit loading. Returns TaskResult with DataFrame."""
108
+ local_repo_path, repo_url, source, cache_path = args
109
+ checkpoint_path = Path(cache_path, f"{source}.parquet")
110
+ try:
111
+ repo_commits_df = vcs.load_commits(local_repo_path, include_files=True)
112
+ repo_commits_df["repo_url"] = repo_url
113
+ repo_commits_df["source"] = source
114
+ # Save checkpoint
115
+ repo_commits_df.to_parquet(checkpoint_path)
116
+ return TaskResult(
117
+ task_id=repo_url,
118
+ success=True,
119
+ data=repo_commits_df,
120
+ )
121
+ except Exception as e:
122
+ error_detail = f"{type(e).__name__}: {str(e)}"
123
+ return TaskResult(task_id=repo_url, success=False, error=error_detail)
124
+
125
+
126
+ # endregion
127
+
128
+
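The metadata, pull-request and commit workers above run in separate processes via ProcessPoolExecutor, so their arguments are plain tuples of strings (picklable). On success they write a {source}.parquet checkpoint into the cache path they were given; when a task fails, the executor drops a matching .failed marker next to it. A minimal sketch of inspecting that convention; summarize_checkpoints is a hypothetical helper, not part of the package:

from pathlib import Path

def summarize_checkpoints(checkpoint_dir: Path) -> tuple[int, int]:
    # Count successful checkpoints and failed markers left behind by the workers.
    done = len(list(checkpoint_dir.glob("*.parquet")))
    failed = len(list(checkpoint_dir.glob("*.failed")))
    return done, failed

# Example: summarize_checkpoints(Path("./cache/github_metadata_checkpoints"))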
129
+ class SuppressConsoleLogging:
130
+ """Context manager to temporarily suppress console logging output."""
131
+
132
+ def __enter__(self):
133
+ # Find and disable all console/stream handlers, saving their original levels
134
+ self._disabled_handlers: list[tuple[logging.Handler, int]] = []
135
+ for name in list(logging.Logger.manager.loggerDict.keys()) + ["", "root"]:
136
+ log = logging.getLogger(name) if name else logging.getLogger()
137
+ for handler in log.handlers[:]:
138
+ if isinstance(handler, logging.StreamHandler) and not isinstance(
139
+ handler, logging.FileHandler
140
+ ):
141
+ original_level = handler.level
142
+ handler.setLevel(logging.CRITICAL + 1) # Effectively disable
143
+ self._disabled_handlers.append((handler, original_level))
144
+ return self
145
+
146
+ def __exit__(self, exc_type, exc_val, exc_tb):
147
+ # Restore original levels in reverse order, so a handler shared by several loggers ends at its true original level
149
+ for handler, original_level in reversed(self._disabled_handlers):
149
+ handler.setLevel(original_level)
150
+ return False
151
+
152
+
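A minimal usage sketch for SuppressConsoleLogging, assuming nothing beyond the standard library: stream handlers are raised above CRITICAL for the duration of the block (so they cannot interleave with a Rich Live display), while file handlers keep writing. The logger name below is purely illustrative.

import logging

logging.basicConfig(level=logging.INFO)  # attaches a StreamHandler to the root logger
log = logging.getLogger("osslag.demo")

with SuppressConsoleLogging():
    log.info("suppressed while the block is active")
log.info("visible again once handler levels are restored")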
153
+ @dataclass
154
+ class TaskResult:
155
+ """Result from a parallel task execution."""
156
+
157
+ task_id: str
158
+ success: bool
159
+ error: str | None = None
160
+ data: Any = None
161
+ failed_marker_path: pathlib.Path | None = None
162
+
163
+
164
+ @dataclass
165
+ class WorkerStatus:
166
+ """Tracks the status of a single worker."""
167
+
168
+ worker_id: int
169
+ current_task: str | None = None
170
+ tasks_completed: int = 0
171
+ tasks_failed: int = 0
172
+
173
+
174
+ class ParallelExecutor:
175
+ """Generic parallel task executor with a fancy Rich CLI UI.
176
+
177
+ Usage:
178
+ executor = ParallelExecutor(
179
+ task_name="Cloning repositories",
180
+ max_workers=4,
181
+ )
182
+ results = executor.run(
183
+ tasks=[(url, path), ...],
184
+ worker_fn=clone_task,
185
+ task_id_fn=lambda t: t[0], # Extract task ID for display
186
+ )
187
+ """
188
+
189
+ def __init__(
190
+ self,
191
+ task_name: str,
192
+ max_workers: int = 4,
193
+ show_recent_completed: int = 5,
194
+ rate_limit: int | None = None,
195
+ rate_remaining: int | None = None,
196
+ ):
197
+ self.task_name = task_name
198
+ self.max_workers = max_workers
199
+ self.show_recent_completed = show_recent_completed
200
+
201
+ # Rate limit info
202
+ self.rate_limit = rate_limit
203
+ self.rate_remaining = rate_remaining
204
+
205
+ # Tracking state
206
+ self.workers: dict[int, WorkerStatus] = {}
207
+ self.completed_tasks: list[TaskResult] = []
208
+ self.failed_tasks: list[TaskResult] = []
209
+ self.recent_completed: list[tuple[str, bool]] = [] # (task_id, success)
210
+ self.total_tasks = 0
211
+ self.skipped_tasks = 0
212
+
213
+ # Timing
214
+ self.start_time: float | None = None
215
+
216
+ def create_display(self, progress: Progress) -> Panel:
217
+ """Create the rich display panel."""
218
+ # Stats section
219
+ completed = len(self.completed_tasks)
220
+ failed = len(self.failed_tasks)
221
+ in_progress = sum(1 for w in self.workers.values() if w.current_task)
222
+
223
+ stats_table = Table.grid(padding=(0, 2))
224
+ stats_table.add_column(style="cyan", justify="right")
225
+ stats_table.add_column(style="white")
226
+ stats_table.add_column(style="cyan", justify="right")
227
+ stats_table.add_column(style="white")
228
+
229
+ stats_table.add_row(
230
+ "Total:",
231
+ f"{self.total_tasks}",
232
+ "Skipped:",
233
+ f"{self.skipped_tasks}",
234
+ )
235
+ stats_table.add_row(
236
+ "✓ Completed:",
237
+ f"[green]{completed}[/]",
238
+ "✗ Failed:",
239
+ f"[red]{failed}[/]",
240
+ )
241
+ stats_table.add_row(
242
+ "⟳ In Progress:",
243
+ f"[yellow]{in_progress}[/]",
244
+ "Workers:",
245
+ f"{self.max_workers}",
246
+ )
247
+
248
+ # Add rate limit info if available
249
+ if self.rate_limit is not None and self.rate_remaining is not None:
250
+ stats_table.add_row(
251
+ "Rate Limit:",
252
+ f"[cyan]{self.rate_remaining}[/]/[white]{self.rate_limit}[/]",
253
+ "",
254
+ "",
255
+ )
256
+
257
+ # Workers section
258
+ workers_table = Table(
259
+ title="[bold]Active Workers[/]",
260
+ show_header=True,
261
+ header_style="bold magenta",
262
+ border_style="dim",
263
+ expand=False,
264
+ )
265
+ workers_table.add_column("Worker", style="cyan", width=8)
266
+ workers_table.add_column("Status", style="white", width=12)
267
+ workers_table.add_column(
268
+ "Current Task", style="yellow", overflow="ellipsis", no_wrap=True, width=60
269
+ )
270
+ workers_table.add_column("Done", style="green", justify="right", width=6)
271
+ workers_table.add_column("Fail", style="red", justify="right", width=6)
272
+
273
+ for wid in sorted(self.workers.keys()):
274
+ w = self.workers[wid]
275
+ status = "[green]●[/] Working" if w.current_task else "[dim]○ Idle[/]"
276
+ task_display = (
277
+ w.current_task[:58] + "…"
278
+ if w.current_task and len(w.current_task) > 58
279
+ else (w.current_task or "-")
280
+ )
281
+ workers_table.add_row(
282
+ f"#{wid}",
283
+ status,
284
+ task_display,
285
+ str(w.tasks_completed),
286
+ str(w.tasks_failed),
287
+ )
288
+
289
+ # Recent completions
290
+ recent_text = Text()
291
+ for task_id, success in self.recent_completed[-self.show_recent_completed :]:
292
+ short_id = task_id[:70] + "…" if len(task_id) > 70 else task_id
293
+ recent_text.append(" ")
294
+ recent_text.append(
295
+ "✓ " if success else "✗ ", style="bold green" if success else "bold red"
296
+ )
297
+ recent_text.append(f"{short_id}\n")
298
+
299
+ components = [
300
+ stats_table,
301
+ Text(),
302
+ progress,
303
+ Text(),
304
+ workers_table,
305
+ Text(),
306
+ Panel(
307
+ recent_text
308
+ if recent_text.plain
309
+ else Text(" Waiting for tasks...", style="dim italic"),
310
+ title="[bold]Recent Completions[/]",
311
+ border_style="dim",
312
+ ),
313
+ ]
314
+
315
+ group = Group(*components)
316
+
317
+ return Panel(
318
+ group,
319
+ title=f"[bold blue]⚡ {self.task_name}[/]",
320
+ border_style="blue",
321
+ )
322
+
323
+ def run(
324
+ self,
325
+ tasks: list[Any],
326
+ worker_fn: Callable[[Any], TaskResult],
327
+ task_id_fn: Callable[[Any], str],
328
+ skipped: int = 0,
329
+ ) -> list[TaskResult]:
330
+ """Execute tasks in parallel with a live UI.
331
+
332
+ Args:
333
+ tasks: List of task arguments to pass to worker_fn
334
+ worker_fn: Function that processes a single task and returns TaskResult
335
+ task_id_fn: Function to extract a display ID from a task
336
+ skipped: Number of tasks that were skipped before execution
337
+
338
+ Returns:
339
+ List of TaskResult objects
340
+
341
+ """
342
+ import time as time_module
343
+
344
+ self.total_tasks = len(tasks)
345
+ self.skipped_tasks = skipped
346
+ self.workers = {i: WorkerStatus(worker_id=i) for i in range(self.max_workers)}
347
+ self.completed_tasks = []
348
+ self.failed_tasks = []
349
+ self.recent_completed = []
350
+ self.start_time = time_module.time()
351
+
352
+ if self.total_tasks == 0:
353
+ console.print(
354
+ Panel(
355
+ "[dim]No tasks to process[/]",
356
+ title=f"[bold]{self.task_name}[/]",
357
+ border_style="dim",
358
+ )
359
+ )
360
+ return []
361
+
362
+ results: list[TaskResult] = []
363
+
364
+ progress = Progress(
365
+ SpinnerColumn(),
366
+ TextColumn("[progress.description]{task.description}"),
367
+ BarColumn(bar_width=40),
368
+ TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
369
+ )
370
+ progress_task_id = progress.add_task(self.task_name, total=self.total_tasks)
371
+
372
+ with (
373
+ SuppressConsoleLogging(),
374
+ Live(
375
+ self.create_display(progress), refresh_per_second=4, console=console
376
+ ) as live,
377
+ ):
378
+ with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
379
+ # Map futures to (task, worker_id)
380
+ future_to_task: dict[Future, tuple[Any, int]] = {}
381
+ available_workers = list(range(self.max_workers))
382
+ pending_tasks = list(tasks)
383
+
384
+ # Submit initial batch
385
+ while available_workers and pending_tasks:
386
+ worker_id = available_workers.pop(0)
387
+ task = pending_tasks.pop(0)
388
+ task_id = task_id_fn(task)
389
+
390
+ self.workers[worker_id].current_task = task_id
391
+ future = executor.submit(worker_fn, task)
392
+ future_to_task[future] = (task, worker_id)
393
+
394
+ live.update(self.create_display(progress))
395
+
396
+ # Process as futures complete
397
+ while future_to_task:
398
+ # Wait for at least one to complete
399
+ done_futures = []
400
+ for future in list(future_to_task.keys()):
401
+ if future.done():
402
+ done_futures.append(future)
403
+
404
+ if not done_futures:
405
+ # Small sleep to avoid busy waiting
406
+ time_module.sleep(0.05)
407
+ continue
408
+
409
+ for future in done_futures:
410
+ task, worker_id = future_to_task.pop(future)
411
+ task_id = task_id_fn(task)
412
+ try:
413
+ result = future.result()
414
+ if result.success:
415
+ results.append(result)
416
+ self.completed_tasks.append(result)
417
+ self.workers[worker_id].tasks_completed += 1
418
+ self.recent_completed.append((task_id, result.success))
419
+ else:
420
+ results.append(result)
421
+ self.failed_tasks.append(result)
422
+ self.workers[worker_id].tasks_failed += 1
423
+ self.recent_completed.append((task_id, result.success))
424
+ marker = result.failed_marker_path
425
+ marker.write_text(
426
+ f"Task failed: {result.error}\n"
427
+ ) if marker else None
428
+
429
+ except Exception as e:
430
+ error_result = TaskResult(
431
+ task_id=task_id, success=False, error=str(e)
432
+ )
433
+ results.append(error_result)
434
+ self.failed_tasks.append(error_result)
435
+ self.workers[worker_id].tasks_failed += 1
436
+ self.recent_completed.append((task_id, False))
437
+ result = error_result # Assign for progress check below
438
+ marker = result.failed_marker_path
439
+ marker.write_text(
440
+ f"Task failed: {result.error}\n"
441
+ ) if marker else None
442
+ # Update progress
443
+ progress.advance(progress_task_id)
444
+
445
+ # Mark worker as available
446
+ self.workers[worker_id].current_task = None
447
+ available_workers.append(worker_id)
448
+
449
+ # Submit next task if available and not rate limited
450
+ if pending_tasks and available_workers:
451
+ next_worker = available_workers.pop(0)
452
+ next_task = pending_tasks.pop(0)
453
+ next_task_id = task_id_fn(next_task)
454
+
455
+ self.workers[next_worker].current_task = next_task_id
456
+ next_future = executor.submit(worker_fn, next_task)
457
+ future_to_task[next_future] = (next_task, next_worker)
458
+
459
+ if done_futures:
460
+ live.update(self.create_display(progress))
461
+
462
+ # Final summary with elapsed time
463
+ elapsed = time_module.time() - self.start_time
464
+ if elapsed >= 60:
465
+ elapsed_str = f"{int(elapsed // 60)}m {int(elapsed % 60)}s"
466
+ else:
467
+ elapsed_str = f"{elapsed:.1f}s"
468
+
469
+ console.print()
470
+ console.print(
471
+ Panel(
472
+ f"[green]✓ Completed:[/] {len(self.completed_tasks)} "
473
+ f"[red]✗ Failed:[/] {len(self.failed_tasks)} "
474
+ f"[dim]Skipped:[/] {self.skipped_tasks} "
475
+ f"[cyan]⏱ Time:[/] {elapsed_str}",
476
+ title=f"[bold]{self.task_name} Complete[/]",
477
+ border_style="green" if not self.failed_tasks else "yellow",
478
+ )
479
+ )
480
+
481
+ return results
482
+
483
+
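A minimal sketch of driving ParallelExecutor directly, following the Usage block in its docstring. The worker must be a module-level function so ProcessPoolExecutor can pickle it; _echo_task and run_echo_demo are hypothetical names, not part of the package.

def _echo_task(args: tuple[str, str]) -> TaskResult:
    task_id, payload = args
    return TaskResult(task_id=task_id, success=True, data=payload.upper())

def run_echo_demo() -> None:
    executor = ParallelExecutor(task_name="Echo demo", max_workers=2)
    results = executor.run(
        tasks=[("task-1", "a"), ("task-2", "b")],
        worker_fn=_echo_task,
        task_id_fn=lambda t: t[0],  # first tuple element is shown in the UI
    )
    print(sum(1 for r in results if r.success), "of", len(results), "succeeded")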
484
+ def setup_logging() -> None:
485
+ log_level = os.getenv("LOG_LEVEL", "INFO")
486
+ log_file = os.getenv("LOG_FILE", "osslag.log")
487
+
488
+ logging.config.dictConfig(
489
+ {
490
+ "version": 1,
491
+ "disable_existing_loggers": False,
492
+ "formatters": {
493
+ "standard": {
494
+ "format": "%(asctime)s | %(name)-20s | %(funcName)-20s:%(lineno)-4d | %(levelname)-8s | %(message)s",
495
+ },
496
+ },
497
+ "handlers": {
498
+ "console": {
499
+ "class": "logging.StreamHandler",
500
+ "level": log_level,
501
+ "formatter": "standard",
502
+ "stream": "ext://sys.stdout",
503
+ },
504
+ "file": {
505
+ "class": "logging.handlers.RotatingFileHandler",
506
+ "level": log_level,
507
+ "formatter": "standard",
508
+ "filename": log_file,
509
+ "maxBytes": 5 * 1024 * 1024,
510
+ "backupCount": 3,
511
+ "encoding": "utf-8",
512
+ },
513
+ },
514
+ "loggers": {
515
+ "osslag": {
516
+ "handlers": ["console", "file"],
517
+ "level": log_level,
518
+ "propagate": False,
519
+ },
520
+ "__main__": {
521
+ "handlers": ["console", "file"],
522
+ "level": log_level,
523
+ "propagate": False,
524
+ },
525
+ },
526
+ "root": {
527
+ "handlers": ["console"],
528
+ "level": log_level,
529
+ },
530
+ }
531
+ )
532
+
533
+
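setup_logging() is configured entirely through the LOG_LEVEL and LOG_FILE environment variables, which load_dotenv() above can also pull from a .env file. A small sketch, assuming a temporary debug setup:

import logging
import os

os.environ.setdefault("LOG_LEVEL", "DEBUG")
os.environ.setdefault("LOG_FILE", "osslag-debug.log")
setup_logging()
logging.getLogger("osslag").debug("goes to stdout and to the rotating osslag-debug.log file")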
534
+ @app.command()
535
+ def clone(repo_url: str, dest_dir: str = "./cache/repos"):
536
+ """Clone a single Git repository to the specified destination directory."""
537
+ dest_dir = os.path.abspath(dest_dir)
538
+ print(f"Cloning repository {repo_url} into directory {dest_dir}")
539
+ result = vcs.clone_repo(repo_url, dest_dir)
540
+ if result.success:
541
+ print(f"Successfully cloned or updated repository {repo_url} into {dest_dir}")
542
+ else:
543
+ print(f"Failed to clone repository {repo_url} into {dest_dir}: {result.error}")
544
+
545
+
546
+ @app.command()
547
+ def get_metadata(
548
+ repo_url: str,
549
+ cache: str = typer.Option("./cache", help="Output path for metadata parquet file"),
550
+ ):
551
+ """Fetch GitHub repository metadata and save to a parquet file."""
552
+ github_token = os.getenv("GITHUB_TOKEN")
553
+ cache_path = os.getenv("CACHE_DIR") or cache
554
+ pathlib.Path(cache_path).mkdir(parents=True, exist_ok=True)
555
+ print(f"Fetching metadata for repository {repo_url}")
556
+ try:
557
+ metadata_df = gh.fetch_github_repo_metadata(repo_url, github_token)
558
+ parquet_path = Path(cache_path, "metadata.parquet")
559
+ metadata_df.to_parquet(parquet_path)
560
+ print(f"Metadata saved to {parquet_path}")
561
+ except Exception as e:
562
+ print(f"Failed to fetch metadata for {repo_url}: {e}")
563
+
564
+
565
+ @app.command()
566
+ def rate_limit():
567
+ """Fetch and display GitHub API rate limit information."""
568
+ github_token = os.getenv("GITHUB_TOKEN")
569
+ if github_token is None:
570
+ print("GITHUB_TOKEN is not set in environment variables.")
571
+ return
572
+ print(f"Using token: {github_token}")
573
+ rate_info = gh.gh_get_rate_limit_info(github_token)
574
+ if rate_info is not None:
575
+ print(
576
+ f"GitHub API Rate Limit: {rate_info['limit']}/{rate_info['remaining']} remaining (resets at {rate_info['reset_datetime']})"
577
+ )
578
+ else:
579
+ print("Failed to fetch rate limit info from GitHub.")
580
+
581
+
582
+ @app.command()
583
+ def pull_requests(
584
+ repo_url: str = typer.Argument(
585
+ ..., help="The GitHub repository URL to fetch pull requests for"
586
+ ),
587
+ cache: str = typer.Option("./cache", help="Cache directory"),
588
+ ):
589
+ """Fetch GitHub pull requests for a specified repository and save to a parquet file."""
590
+ github_token = os.getenv("GITHUB_TOKEN")
591
+ cache_path = os.getenv("CACHE_DIR") or cache
592
+ pathlib.Path(cache_path).mkdir(parents=True, exist_ok=True)
593
+ output_path = Path(cache_path, "pull_requests.parquet")
594
+ print(f"Fetching pull requests for repository {repo_url}")
595
+ try:
596
+ pr_df = gh.fetch_pull_requests(repo_url, github_token)
597
+ pr_df.to_parquet(output_path)
598
+ print(f"Pull requests saved to {output_path}")
599
+ except Exception as e:
600
+ print(f"Failed to fetch pull requests for {repo_url}: {e}")
601
+
602
+
603
+ @dataset_app.command(name="run", rich_help_panel="Full Pipeline")
604
+ def run_dataset_pipeline(
605
+ distro: str = typer.Option(
606
+ "debian", help="The Linux distribution to process (e.g., 'debian' 'fedora')"
607
+ ),
608
+ releases: list[str] = typer.Option(
609
+ ...,
610
+ "--release",
611
+ help="One or more distro releases to process (e.g., 'trixie', 'bookworm', '40'). Can repeat flag or use comma-separated.",
612
+ ),
613
+ cache: str = typer.Option("./cache", help="Cache directory (EV: CACHE_DIR)"),
614
+ force: bool = typer.Option(
615
+ False, "--force", "-f", help="Force re-processing even if cache exists"
616
+ ),
617
+ ):
618
+ """Run the full pipeline: fetch packages, filter repos, extract versions,
619
+ merge releases, clone repos, load commits, and fetch GitHub metadata and pull requests.
620
+
621
+ Uses cached data when available. Use --force to re-process all steps.
622
+ """
623
+ # Flatten: handle both repeated flags and comma-separated values
624
+ to_process = []
625
+ for r in releases:
626
+ to_process.extend([item.strip() for item in r.split(",")])
627
+
628
+ cache_dir = os.getenv("CACHE_DIR") or cache
629
+
630
+ console.print(
631
+ Panel(
632
+ f"[bold]Distro:[/] {distro} [bold]Releases:[/] {', '.join(to_process)} [bold]Cache:[/] {cache_dir}"
633
+ + (" [bold yellow]--force[/]" if force else ""),
634
+ title="[bold blue]🚀 Dataset Pipeline[/]",
635
+ border_style="blue",
636
+ )
637
+ )
638
+
639
+ # Suppress console logging for steps 1-4 (non-parallel steps)
640
+ with SuppressConsoleLogging():
641
+ # Step 1: Get and cache package data for each release
642
+ with Status("[bold cyan]Step 1/6:[/] Fetching packages...", console=console):
643
+ fetch_packages(distro=distro, releases=to_process, cache=cache_dir)
644
+ console.print("[green]✓[/] Step 1/6: Fetched packages")
645
+
646
+ # Step 2: Filter GitHub repos
647
+ with Status(
648
+ "[bold cyan]Step 2/6:[/] Filtering GitHub repos...", console=console
649
+ ):
650
+ filter_debian_github_repos(
651
+ distro=distro, release=to_process, cache=cache_dir, force=force
652
+ )
653
+ console.print("[green]✓[/] Step 2/6: Filtered GitHub repos")
654
+
655
+ # Step 3: Extract the version string and add upstream version columns
656
+ with Status(
657
+ "[bold cyan]Step 3/6:[/] Extracting upstream versions...", console=console
658
+ ):
659
+ extract_upstream_versions(
660
+ distro=distro, release=to_process, cache=cache_dir, force=force
661
+ )
662
+ console.print("[green]✓[/] Step 3/6: Extracted upstream versions")
663
+
664
+ # Step 4: Merge releases into a single DataFrame with all required columns
665
+ with Status("[bold cyan]Step 4/6:[/] Merging releases...", console=console):
666
+ merge_releases(
667
+ distro=distro, releases=to_process, cache=cache_dir, force=force
668
+ )
669
+ console.print("[green]✓[/] Step 4/6: Merged releases")
670
+
671
+ # Step 5: Clone all upstream GitHub repos (has its own UI)
672
+ console.print("\n[bold cyan]Step 5/6:[/] Cloning repositories...")
673
+ clone_upstream_repos(distro=distro, cache=cache_dir)
674
+
675
+ # Step 6: Extract all commits into a single DataFrame (has its own UI)
676
+ console.print("\n[bold cyan]Step 6/6:[/] Loading commits...")
677
+ load_commits_into_dataframe(distro=distro, cache=cache_dir, force=force)
678
+
679
+ # Step 7: Fetch GitHub metadata for all repos (has its own UI)
680
+ console.print("\n[bold cyan]Step 7/7:[/] Fetching GitHub metadata...")
681
+ all_github_metadata(distro=distro, cache=cache_dir, force=force)
682
+
683
+ # Step 8: Fetch GitHub pull requests for all repos (has its own UI)
684
+ console.print("\n[bold cyan]Step 8/8:[/] Fetching GitHub pull requests...")
685
+ all_github_pull_requests(distro=distro, cache=cache_dir, force=force)
686
+
687
+ console.print(
688
+ Panel(
689
+ "[bold green]Pipeline completed successfully![/]",
690
+ border_style="green",
691
+ )
692
+ )
693
+
694
+
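A minimal sketch of invoking the full pipeline programmatically with Typer's test runner. The osslag console-script name in the comment is an assumption about how the wheel is installed, and the release names are examples.

from typer.testing import CliRunner

runner = CliRunner()
# Roughly equivalent to: osslag dataset run --release trixie --release bookworm
result = runner.invoke(app, ["dataset", "run", "--release", "trixie,bookworm"])
print(result.exit_code)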
695
+ @dataset_app.command(rich_help_panel="Step 1: Fetch Data")
696
+ def fetch_packages(
697
+ distro: str = typer.Argument(
698
+ ...,
699
+ help="The Linux distribution to fetch packages for (e.g., 'debian' 'fedora')",
700
+ ),
701
+ releases: list[str] = typer.Argument(
702
+ ...,
703
+ help="The release(s) to fetch packages for (e.g., 'trixie', 'bookworm', '40')",
704
+ ),
705
+ cache: str = typer.Option("./cache", help="Cache directory"),
706
+ ):
707
+ """Fetch and cache distribution package data for specified releases."""
708
+ cache_dir = os.getenv("CACHE_DIR") or cache
709
+
710
+ # Ensure cache directory exists
711
+ Path(cache_dir).mkdir(parents=True, exist_ok=True)
712
+
713
+ if distro.lower() == "debian":
714
+ for rel in releases:
715
+ parquet_path = Path(cache_dir, f"{distro}_{rel}_all_packages.parquet")
716
+ if parquet_path.exists():
717
+ logger.info(f"Using cached {rel} packages from {parquet_path}")
718
+ continue
719
+
720
+ # Show status since this can take a while (large file download + parsing)
721
+ with Status(
722
+ f"[bold cyan]Fetching {rel} packages (this may take a minute)...[/]",
723
+ console=console,
724
+ ):
725
+ logger.info(f"Fetching and caching {rel} packages to {parquet_path}")
726
+ df: pd.DataFrame | None = deb.fetch_packages(rel)
727
+ if df is None:
728
+ logger.error(f"Failed to fetch {rel} packages.")
729
+ console.print(f"[red]✗ Failed to fetch {rel} packages[/]")
730
+ else:
731
+ df.to_parquet(parquet_path)
732
+ console.print(f"[green]✓ Fetched {len(df):,} {rel} packages[/]")
733
+ else:
734
+ logger.error(f"Distro '{distro}' is not supported for fetching packages.")
735
+
736
+
737
+ @dataset_app.command(rich_help_panel="Step 2: Filter Repos")
738
+ def filter_debian_github_repos(
739
+ distro: str = typer.Argument(
740
+ ..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"
741
+ ),
742
+ release: list[str] = typer.Argument(
743
+ ...,
744
+ help="One or more distro releases to process (e.g., 'trixie', 'bookworm', '40'). Can repeat flag or use comma-separated.",
745
+ ),
746
+ cache: str = typer.Option("./cache", help="Cache directory"),
747
+ force: bool = typer.Option(
748
+ False, "--force", "-f", help="Force re-processing even if cache exists"
749
+ ),
750
+ ):
751
+ """Filter distro package DataFrames to only include GitHub repositories."""
752
+ cache_dir = os.getenv("CACHE_DIR") or cache
753
+
754
+ if distro.lower() == "debian":
755
+ for rel in release:
756
+ filtered_parquet_path = Path(
757
+ cache_dir, f"{distro}_{rel}_filtered_packages.parquet"
758
+ )
759
+ if filtered_parquet_path.exists() and not force:
760
+ logger.info(
761
+ f"Using cached filtered packages from {filtered_parquet_path}"
762
+ )
763
+ continue
764
+
765
+ parquet_path = Path(cache_dir, f"{distro}_{rel}_all_packages.parquet")
766
+ if not parquet_path.exists():
767
+ logger.error(
768
+ f"Required parquet file {parquet_path} does not exist. Please run the 'fetch-packages' command first."
769
+ )
770
+ continue
771
+
772
+ logger.info(f"Filtering GitHub repositories for Debian release '{rel}'")
773
+ df: pd.DataFrame = pd.read_parquet(parquet_path)
774
+ size_before = df.shape[0]
775
+ filtered_df = deb.filter_github_repos(df)
776
+ size_after = filtered_df.shape[0]
777
+ logger.info(
778
+ f"Dropped {size_before - size_after} packages due to non-GitHub '{rel}'."
779
+ )
780
+ filtered_df = deb.add_local_repo_cache_path_column(
781
+ filtered_df, cache_dir=cache_dir
782
+ )
783
+ filtered_df.reset_index(drop=True, inplace=True)
784
+ filtered_df.to_parquet(filtered_parquet_path)
785
+ logger.info(
786
+ f"Filtered GitHub repositories saved to {filtered_parquet_path}"
787
+ )
788
+ else:
789
+ logger.error(
790
+ f"Distro '{distro}' is not supported for filtering GitHub repositories."
791
+ )
792
+
793
+
794
+ @dataset_app.command(rich_help_panel="Step 3: Extract Versions")
795
+ def extract_upstream_versions(
796
+ distro: str = typer.Argument(
797
+ ..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"
798
+ ),
799
+ release: list[str] = typer.Argument(
800
+ ...,
801
+ help="One or more distro releases to process (e.g., 'trixie', 'bookworm', '40'). Can repeat flag or use comma-separated.",
802
+ ),
803
+ cache: str = typer.Option("./cache", help="Cache directory"),
804
+ force: bool = typer.Option(
805
+ False, "--force", "-f", help="Force re-processing even if cache exists"
806
+ ),
807
+ ):
808
+ """Extract upstream version strings from Debian package versions and add as a new column."""
809
+ cache_dir = os.getenv("CACHE_DIR") or cache
810
+
811
+ if distro.lower() == "debian":
812
+ for rel in release:
813
+ versions_parquet_path = Path(
814
+ cache_dir, f"{distro}_{rel}_packages_with_upstream_versions.parquet"
815
+ )
816
+ if versions_parquet_path.exists() and not force:
817
+ logger.info(
818
+ f"Using cached upstream versions from {versions_parquet_path}"
819
+ )
820
+ continue
821
+
822
+ filtered_parquet_path = Path(
823
+ cache_dir, f"{distro}_{rel}_filtered_packages.parquet"
824
+ )
825
+ if not filtered_parquet_path.exists():
826
+ logger.error(
827
+ f"Required parquet file {filtered_parquet_path} does not exist. Please run the 'filter-debian-github-repos' command first."
828
+ )
829
+ continue
830
+
831
+ logger.info(f"Extracting upstream versions for Debian release '{rel}'")
832
+ df: pd.DataFrame = pd.read_parquet(filtered_parquet_path)
833
+ version_column = f"{rel}_upstream_version"
834
+ df_with_versions = deb.add_upstream_version_column(
835
+ df, f"{rel}_version", new_column_name=version_column
836
+ )
837
+ drop_before = df_with_versions.shape[0]
838
+ df_with_versions.dropna(subset=[version_column], inplace=True)
839
+ drop_after = df_with_versions.shape[0]
840
+ logger.info(
841
+ f"Dropped {drop_before - drop_after} rows with missing upstream versions for release '{rel}'."
842
+ )
843
+ df_with_versions.reset_index(drop=True, inplace=True)
844
+ df_with_versions.to_parquet(versions_parquet_path)
845
+ logger.info(
846
+ f"Upstream versions extracted and saved to {versions_parquet_path}"
847
+ )
848
+ else:
849
+ logger.error(
850
+ f"Distro '{distro}' is not supported for extracting upstream versions."
851
+ )
852
+
853
+
854
+ @dataset_app.command(rich_help_panel="Step 4: Merge Releases")
855
+ def merge_releases(
856
+ distro: str = typer.Argument(
857
+ ..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"
858
+ ),
859
+ releases: list[str] = typer.Argument(
860
+ ...,
861
+ help="One or more distro releases to merge (e.g., 'trixie', 'bookworm', '40'). Can repeat flag or use comma-separated.",
862
+ ),
863
+ cache: str = typer.Option("./cache", help="Cache directory"),
864
+ force: bool = typer.Option(
865
+ False, "--force", "-f", help="Force re-processing even if cache exists"
866
+ ),
867
+ ):
868
+ """Merge multiple release DataFrames into a single DataFrame with all required columns."""
869
+ cache_dir = os.getenv("CACHE_DIR") or cache
870
+
871
+ if distro.lower() == "debian":
872
+ merged_parquet_path = Path(
873
+ cache_dir, f"{distro}_merged_releases_packages.parquet"
874
+ )
875
+ if merged_parquet_path.exists() and not force:
876
+ logger.info(f"Using cached merged releases from {merged_parquet_path}")
877
+ return
878
+
879
+ dfs = []
880
+ for rel in releases:
881
+ versions_parquet_path = Path(
882
+ cache_dir, f"{distro}_{rel}_packages_with_upstream_versions.parquet"
883
+ )
884
+ if not versions_parquet_path.exists():
885
+ logger.error(
886
+ f"Required parquet file {versions_parquet_path} does not exist. Please run the 'extract-upstream-versions' command first."
887
+ )
888
+ continue
889
+
890
+ logger.info(
891
+ f"Loading packages with upstream versions for Debian release '{rel}'"
892
+ )
893
+ df: pd.DataFrame = pd.read_parquet(versions_parquet_path)
894
+ dfs.append(df)
895
+ deb_merged_df, deb_dropped_after_merge = deb.merge_release_packages(dfs)
896
+ logger.info(
897
+ f"Merged releases {releases}. Dropped {deb_dropped_after_merge.shape[0]} rows that were not present in all releases."
898
+ )
899
+ deb_merged_df.reset_index(drop=True, inplace=True)
900
+ deb_merged_df.to_parquet(merged_parquet_path)
901
+ logger.info(f"Merged release packages saved to {merged_parquet_path}")
902
+ deb_dropped_after_merge.to_parquet(
903
+ Path(cache_dir, f"{distro}_dropped_after_merge.parquet")
904
+ )
905
+ logger.info(
906
+ f"Dropped rows after merge saved to {Path(cache_dir, f'{distro}_dropped_after_merge.parquet')}"
907
+ )
908
+
909
+ else:
910
+ logger.error(f"Distro '{distro}' is not supported for merging releases.")
911
+
912
+
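For reference, steps 1 through 4 hand data to each other purely through parquet files in the cache directory. A sketch of the naming convention used by the commands above; the step_outputs helper itself is hypothetical:

from pathlib import Path

def step_outputs(cache_dir: str, distro: str, rel: str) -> list[Path]:
    # Artifacts written by steps 1-4 for a single release (step 4 spans all releases).
    return [
        Path(cache_dir, f"{distro}_{rel}_all_packages.parquet"),                     # step 1
        Path(cache_dir, f"{distro}_{rel}_filtered_packages.parquet"),                # step 2
        Path(cache_dir, f"{distro}_{rel}_packages_with_upstream_versions.parquet"),  # step 3
        Path(cache_dir, f"{distro}_merged_releases_packages.parquet"),               # step 4
    ]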
913
+ @dataset_app.command(rich_help_panel="Step 5: Clone Repos")
914
+ def clone_upstream_repos(
915
+ distro: str = typer.Argument(
916
+ ..., help="The distro for (e.g., 'debian' 'fedora', etc.)"
917
+ ),
918
+ repos_cache: str = typer.Option(
919
+ "./cache/repos", help="Cache directory for cloned repositories"
920
+ ),
921
+ cache: str = typer.Option("./cache", help="Cache directory"),
922
+ max_workers: int = typer.Option(
923
+ 4, help="Maximum number of parallel clone processes (env: MAX_WORKERS)"
924
+ ),
925
+ ):
926
+ """Clone all upstream GitHub repositories in the filtered package DataFrame."""
927
+ cache_dir = os.getenv("CACHE_DIR") or cache
928
+ repos_cache = os.getenv("REPOS_CACHE_DIR") or repos_cache
929
+ max_workers = int(os.getenv("MAX_WORKERS", str(max_workers)))
930
+ if distro.lower() == "debian":
931
+ parquet_path = Path(cache_dir, f"{distro}_merged_releases_packages.parquet")
932
+ if not parquet_path.exists():
933
+ console.print(
934
+ f"[red]Error:[/] Required parquet file {parquet_path} does not exist. Please run the 'merge-releases' command first."
935
+ )
936
+ return
937
+
938
+ # Suppress logging during setup
939
+ with SuppressConsoleLogging():
940
+ df: pd.DataFrame = pd.read_parquet(parquet_path)
941
+ repos_cache_path = pathlib.Path(repos_cache)
942
+
943
+ vcs.ensure_dir(repos_cache_path)
944
+
945
+ # Build list of repos to clone (skip already cloned)
946
+ clone_tasks: list[tuple[str, str]] = []
947
+ skipped = 0
948
+ invalid = 0
949
+ for _, row in df.iterrows():
950
+ repo_url = str(row["upstream_repo_url"])
951
+ target_dir = vcs.construct_repo_local_path(
952
+ repo_url, cache_dir=repos_cache_path, must_exist=False
953
+ )
954
+ if target_dir is None:
955
+ invalid += 1
956
+ continue
957
+ if target_dir.exists():
958
+ skipped += 1
959
+ continue
960
+ # Skip if there is a .failed marker file
961
+ failed_marker = repos_cache_path / f"{target_dir.name}.failed"
962
+ if failed_marker.exists():
963
+ skipped += 1
964
+ continue
965
+
966
+ clone_tasks.append((repo_url, str(target_dir)))
967
+
968
+ if invalid > 0:
969
+ console.print(f"[yellow]Skipped {invalid} invalid repository URLs[/]")
970
+
971
+ if len(clone_tasks) == 0:
972
+ console.print(f"[green]All {skipped} repositories already cloned.[/]")
973
+ return
974
+
975
+ # Use the parallel executor with fancy UI
976
+ executor = ParallelExecutor(
977
+ task_name=f"Cloning {distro.title()} Repositories",
978
+ max_workers=max_workers,
979
+ )
980
+ executor.run(
981
+ tasks=clone_tasks,
982
+ worker_fn=_clone_task,
983
+ task_id_fn=lambda t: t[0], # repo_url
984
+ skipped=skipped,
985
+ )
986
+ else:
987
+ console.print(
988
+ f"[red]Error:[/] Distro '{distro}' is not supported for cloning repositories."
989
+ )
990
+
991
+
992
+ @dataset_app.command(rich_help_panel="Step 6: Load Commits")
993
+ def load_commits_into_dataframe(
994
+ distro: str = typer.Argument(
995
+ ..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"
996
+ ),
997
+ cache: str = typer.Option("./cache", help="Cache directory"),
998
+ repo_cache: str = typer.Option(
999
+ "./cache/repos", help="Cache directory for cloned repositories"
1000
+ ),
1001
+ max_workers: int = typer.Option(
1002
+ 4, help="Maximum number of parallel worker processes (env: MAX_WORKERS)"
1003
+ ),
1004
+ force: bool = typer.Option(
1005
+ False, "--force", "-f", help="Force re-processing even if cache exists"
1006
+ ),
1007
+ ):
1008
+ """Load all GitHub commits for the upstream repositories into a single DataFrame."""
1009
+ cache_dir = os.getenv("CACHE_DIR") or cache
1010
+ repo_cache_dir = os.getenv("REPOS_CACHE_DIR") or repo_cache
1011
+ checkpoint_dir = Path(cache_dir, "commit_checkpoints")
1012
+ max_workers = int(os.getenv("MAX_WORKERS", str(max_workers)))
1013
+
1014
+ commits_parquet_path = Path(cache_dir, f"{distro}_all_upstream_commits.parquet")
1015
+ if commits_parquet_path.exists() and not force:
1016
+ console.print(f"[green]Using cached commits from {commits_parquet_path}[/]")
1017
+ return
1018
+
1019
+ all_packages_parquet_path = Path(
1020
+ cache_dir, f"{distro}_merged_releases_packages.parquet"
1021
+ )
1022
+ if not all_packages_parquet_path.exists():
1023
+ console.print(
1024
+ f"[red]Error:[/] Required parquet file {all_packages_parquet_path} does not exist. Please run the 'merge-releases' and 'clone-upstream-repos' commands first."
1025
+ )
1026
+ return
1027
+ # Create checkpoint directory
1028
+ vcs.ensure_dir(checkpoint_dir)
1029
+
1030
+ if force and checkpoint_dir.exists():
1031
+ console.print(f"[yellow]Removing existing checkpoint at {checkpoint_dir}[/]")
1032
+ for ck in checkpoint_dir.iterdir():
1033
+ if ck.name.endswith(".parquet"):
1034
+ ck.unlink()
1035
+
1036
+ df: pd.DataFrame = pd.read_parquet(all_packages_parquet_path)
1037
+
1038
+ # Build list of tasks (skip repos without local paths)
1039
+ tasks: list[tuple[str, str, str, str]] = []
1040
+ skipped = 0
1041
+ for _, row in df.iterrows():
1042
+ repo_url = str(row["upstream_repo_url"])
1043
+ local_repo_path = vcs.construct_repo_local_path(
1044
+ repo_url, cache_dir=Path(repo_cache_dir), must_exist=True
1045
+ )
1046
+ if local_repo_path is None or not local_repo_path.exists():
1047
+ skipped += 1
1048
+ continue
1049
+ source = str(row["source"])
1050
+ if Path(checkpoint_dir, f"{source}.parquet").exists():
1051
+ skipped += 1
1052
+ continue
1053
+ tasks.append((str(local_repo_path), repo_url, source, str(checkpoint_dir)))
1054
+ results: list[TaskResult] = []
1055
+ # Run tasks if any
1056
+ if len(tasks) > 0:
1057
+ # Use the parallel executor with fancy UI
1058
+ executor = ParallelExecutor(
1059
+ task_name=f"Loading {distro.title()} Commits",
1060
+ max_workers=max_workers,
1061
+ )
1062
+ results = executor.run(
1063
+ tasks=tasks,
1064
+ worker_fn=_load_commits_task,
1065
+ task_id_fn=lambda t: t[1], # repo_url
1066
+ skipped=skipped,
1067
+ )
1068
+ # Collect all the checkpointed DataFrames
1069
+ if checkpoint_dir.exists():
1070
+ try:
1071
+ console.print(
1072
+ f"[green]Loading checkpointed commits from {checkpoint_dir}[/]"
1073
+ )
1074
+ for ck in checkpoint_dir.iterdir():
1075
+ if not ck.name.endswith(".parquet"):
1076
+ continue
1077
+
1078
+ checkpoint_df: pd.DataFrame = pd.read_parquet(ck)
1079
+ results.append(
1080
+ TaskResult(
1081
+ task_id="checkpoint",
1082
+ success=True,
1083
+ data=checkpoint_df,
1084
+ )
1085
+ )
1086
+ except Exception as e:
1087
+ console.print(
1088
+ f"[yellow]Warning:[/] Failed to load checkpointed commits: {e}[/]"
1089
+ )
1090
+
1091
+ # Collect the checkpointed DataFrames only; each successful task above also wrote a checkpoint, so adding the in-memory task results as well would duplicate rows
1092
+ all_commits = [r.data for r in results if r.success and r.data is not None and r.task_id == "checkpoint"]
1093
+
1094
+ if all_commits:
1095
+ console.print(f"[green]Loaded commits from {len(all_commits)} repositories.[/]")
1096
+ combined_commits_df = pd.concat(all_commits, ignore_index=True)
1097
+ commits_parquet_path = Path(cache_dir, f"{distro}_all_upstream_commits.parquet")
1098
+ combined_commits_df.to_parquet(commits_parquet_path)
1099
+ console.print(
1100
+ f"[green]Saved {len(combined_commits_df):,} commits to {commits_parquet_path}[/]"
1101
+ )
1102
+ else:
1103
+ console.print("[yellow]No commits were loaded from any repositories.[/]")
1104
+
1105
+
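After this step the combined history sits in a single parquet file with repo_url and source columns added by the worker. A small sketch of loading it for ad-hoc analysis, assuming the default ./cache directory and distro='debian':

import pandas as pd

commits = pd.read_parquet("./cache/debian_all_upstream_commits.parquet")
# Commit counts per upstream repository, largest first
print(commits.groupby("repo_url").size().sort_values(ascending=False).head(10))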
1106
+ @dataset_app.command(rich_help_panel="Step 7: GitHub Metadata")
1107
+ def all_github_metadata(
1108
+ distro: str = typer.Option(
1109
+ "debian", help="The Linux distribution to process (default: debian)"
1110
+ ),
1111
+ cache: str = typer.Option("./cache", help="Cache directory"),
1112
+ max_workers: int = typer.Option(
1113
+ 4, help="Maximum number of parallel GitHub API workers (env: MAX_WORKERS)"
1114
+ ),
1115
+ force: bool = typer.Option(
1116
+ False, "--force", "-f", help="Force re-processing even if cache exists"
1117
+ ),
1118
+ ):
1119
+ """Fetch GitHub repository metadata for all unique repos in the commits parquet file."""
1120
+ cache_dir = os.getenv("CACHE_DIR") or cache
1121
+ max_workers = int(os.getenv("MAX_WORKERS", str(max_workers)))
1122
+ all_packages_parquet_path = Path(
1123
+ cache_dir, f"{distro}_merged_releases_packages.parquet"
1124
+ )
1125
+ output_parquet_path = Path(cache_dir, f"{distro}_github_repo_metadata.parquet")
1126
+ checkpoint_dir = Path(cache_dir, "github_metadata_checkpoints")
1127
+ # Create checkpoint directory
1128
+ vcs.ensure_dir(checkpoint_dir)
1129
+
1130
+ if force and checkpoint_dir.exists():
1131
+ console.print(
1132
+ f"[yellow]Removing existing GitHub metadata checkpoint at {checkpoint_dir}[/]"
1133
+ )
1134
+ for ck in checkpoint_dir.iterdir():
1135
+ if ck.name.endswith(".parquet"):
1136
+ ck.unlink()
1137
+
1138
+ if output_parquet_path.exists() and not force:
1139
+ console.print(
1140
+ f"[green]Using cached GitHub metadata from {output_parquet_path}[/]"
1141
+ )
1142
+ return
1143
+
1144
+ if not all_packages_parquet_path.exists():
1145
+ console.print(
1146
+ f"[red]Error:[/] Required parquet file {all_packages_parquet_path} does not exist. Please run the 'merge-releases' command first."
1147
+ )
1148
+ return
1149
+
1150
+ df: pd.DataFrame = pd.read_parquet(all_packages_parquet_path)
1151
+
1152
+ # Build list of tasks (skip repos with existing checkpoints or failed markers)
1153
+ tasks: list[tuple[str, str, str]] = []
1154
+ skipped = 0
1155
+ for _, row in df.iterrows():
1156
+ repo_url = str(row["upstream_repo_url"])
1157
+ source = str(row["source"])
1158
+ if Path(checkpoint_dir, f"{source}.parquet").exists():
1159
+ skipped += 1
1160
+ continue
1161
+ # Skip if there is a .failed marker file
1162
+ failed_marker = checkpoint_dir / f"{source}.failed"
1163
+ if failed_marker.exists():
1164
+ skipped += 1
1165
+ continue
1166
+ tasks.append((repo_url, source, str(checkpoint_dir)))
1167
+ results: list[TaskResult] = []
1168
+
1169
+ # Display rate limit info
1170
+ github_token = os.getenv("GITHUB_TOKEN")
1171
+ rate_info = gh.gh_get_rate_limit_info(github_token)
1172
+ rate_limit = rate_info["limit"] if rate_info else None
1173
+ rate_remaining = rate_info["remaining"] if rate_info else None
1174
+ if rate_info:
1175
+ console.print(
1176
+ f"[cyan]GitHub API Rate Limit:[/] {rate_info['remaining']}/{rate_info['limit']} remaining (resets at {rate_info['reset_datetime']})"
1177
+ )
1178
+ else:
1179
+ console.print("[yellow]Warning:[/] Could not fetch rate limit info")
1180
+
1181
+ console.print(f"[cyan]Fetching GitHub metadata for {len(tasks)} repositories...[/]")
1182
+ executor = ParallelExecutor(
1183
+ task_name="GitHub Metadata Fetch",
1184
+ max_workers=min(max_workers, len(tasks)),
1185
+ rate_limit=rate_limit,
1186
+ rate_remaining=rate_remaining,
1187
+ )
1188
+ results = executor.run(
1189
+ tasks=tasks,
1190
+ worker_fn=_fetch_github_repo_metadata_task,
1191
+ task_id_fn=lambda t: t[0],
1192
+ skipped=skipped,
1193
+ )
1194
+ # Collect all the checkpointed DataFrames
1195
+ if checkpoint_dir.exists():
1196
+ try:
1197
+ console.print(
1198
+ f"[green]Loading checkpointed commits from {checkpoint_dir}[/]"
1199
+ )
1200
+ for ck in checkpoint_dir.iterdir():
1201
+ if not ck.name.endswith(".parquet"):
1202
+ continue
1203
+
1204
+ checkpoint_df: pd.DataFrame = pd.read_parquet(ck)
1205
+ results.append(
1206
+ TaskResult(
1207
+ task_id="checkpoint",
1208
+ success=True,
1209
+ data=checkpoint_df,
1210
+ )
1211
+ )
1212
+ except Exception as e:
1213
+ console.print(
1214
+ f"[yellow]Warning:[/] Failed to load checkpointed commits: {e}[/]"
1215
+ )
1216
+ # Collect the checkpointed DataFrames only; each successful task above also wrote a checkpoint, so adding the in-memory task results as well would duplicate rows
1217
+ all_metadata = [r.data for r in results if r.success and r.data is not None and r.task_id == "checkpoint"]
1218
+
1219
+ if all_metadata:
1220
+ console.print(
1221
+ f"[green]Loaded metadata from {len(all_metadata)} repositories.[/]"
1222
+ )
1223
+ combined_metadata_df = pd.concat(all_metadata, ignore_index=True)
1224
+ metadata_parquet_path = Path(
1225
+ cache_dir, f"{distro}_all_upstream_metadata.parquet"
1226
+ )
1227
+ combined_metadata_df.to_parquet(metadata_parquet_path)
1228
+ console.print(
1229
+ f"[green]Saved {len(combined_metadata_df):,} metadata entries to {metadata_parquet_path}[/]"
1230
+ )
1231
+ else:
1232
+ console.print(
1233
+ "[yellow]No metadata entries were loaded from any repositories.[/]"
1234
+ )
1235
+
1236
+
1237
+ @dataset_app.command(rich_help_panel="Step 8: GitHub Metadata")
1238
+ def all_github_pull_requests(
1239
+ distro: str = typer.Option(
1240
+ "debian", help="The Linux distribution to process (default: debian)"
1241
+ ),
1242
+ cache: str = typer.Option("./cache", help="Cache directory"),
1243
+ max_workers: int = typer.Option(
1244
+ 4, help="Maximum number of parallel GitHub API workers (env: MAX_WORKERS)"
1245
+ ),
1246
+ force: bool = typer.Option(
1247
+ False, "--force", "-f", help="Force re-processing even if cache exists"
1248
+ ),
1249
+ ):
1250
+ """Fetch GitHub repository pull requests for all unique repos in the commits parquet file."""
1251
+ cache_dir = os.getenv("CACHE_DIR") or cache
1252
+ max_workers = int(os.getenv("MAX_WORKERS", str(max_workers)))
1253
+ all_packages_parquet_path = Path(
1254
+ cache_dir, f"{distro}_merged_releases_packages.parquet"
1255
+ )
1256
+ output_parquet_path = Path(cache_dir, f"{distro}_github_repo_pull_requests.parquet")
1257
+ checkpoint_dir = Path(cache_dir, "github_pr_checkpoints")
1258
+ # Create checkpoint directory
1259
+ vcs.ensure_dir(checkpoint_dir)
1260
+
1261
+ if output_parquet_path.exists() and not force:
1262
+ console.print(
1263
+ f"[green]Using cached GitHub pull requests from {output_parquet_path}[/]"
1264
+ )
1265
+ return
1266
+
1267
+ if not all_packages_parquet_path.exists():
1268
+ console.print(
1269
+ f"[red]Error:[/] Required parquet file {all_packages_parquet_path} does not exist. Please run the 'merge-releases' command first."
1270
+ )
1271
+ return
1272
+
1273
+ if force and checkpoint_dir.exists():
1274
+ console.print(
1275
+ f"[yellow]Removing existing GitHub pull requests checkpoint at {checkpoint_dir}[/]"
1276
+ )
1277
+ for ck in checkpoint_dir.iterdir():
1278
+ if ck.name.endswith(".parquet"):
1279
+ ck.unlink()
1280
+
1281
+ df: pd.DataFrame = pd.read_parquet(all_packages_parquet_path)
1282
+
1283
+ # Build list of tasks (skip repos with existing checkpoints or failed markers)
1284
+ tasks: list[tuple[str, str, str]] = []
1285
+ skipped = 0
1286
+ for _, row in df.iterrows():
1287
+ repo_url = str(row["upstream_repo_url"])
1288
+ source = str(row["source"])
1289
+ if Path(checkpoint_dir, f"{source}.parquet").exists():
1290
+ skipped += 1
1291
+ continue
1292
+ # Skip if there is a .failed marker file
1293
+ failed_marker = checkpoint_dir / f"{source}.failed"
1294
+ if failed_marker.exists():
1295
+ skipped += 1
1296
+ continue
1297
+ tasks.append((repo_url, source, str(checkpoint_dir)))
1298
+ results: list[TaskResult] = []
1299
+
1300
+ # Display rate limit info
1301
+ github_token = os.getenv("GITHUB_TOKEN")
1302
+ rate_info = gh.gh_get_rate_limit_info(github_token)
1303
+ if rate_info:
1304
+ console.print(
1305
+ f"[cyan]GitHub API Rate Limit:[/] {rate_info['remaining']}/{rate_info['limit']} remaining (resets at {rate_info['reset_datetime']})"
1306
+ )
1307
+ else:
1308
+ console.print("[yellow]Warning:[/] Could not fetch rate limit info")
1309
+
1310
+ console.print(
1311
+ f"[cyan]Fetching GitHub pull requests for {len(tasks)} repositories...[/]"
1312
+ )
1313
+ executor = ParallelExecutor(
1314
+ task_name="GitHub Pull Requests Fetch",
1315
+ max_workers=min(max_workers, len(tasks)),
1316
+ )
1317
+ results = executor.run(
1318
+ tasks=tasks,
1319
+ worker_fn=_fetch_github_repo_pull_requests_task,
1320
+ task_id_fn=lambda t: t[0],
1321
+ skipped=skipped,
1322
+ )
1323
+ # Collect all the checkpointed DataFrames
1324
+ if checkpoint_dir.exists():
1325
+ try:
1326
+ console.print(
1327
+ f"[green]Loading checkpointed commits from {checkpoint_dir}[/]"
1328
+ )
1329
+ for ck in checkpoint_dir.iterdir():
1330
+ if not ck.name.endswith(".parquet"):
1331
+ continue
1332
+
1333
+ checkpoint_df: pd.DataFrame = pd.read_parquet(ck)
1334
+ results.append(
1335
+ TaskResult(
1336
+ task_id="checkpoint",
1337
+ success=True,
1338
+ data=checkpoint_df,
1339
+ )
1340
+ )
1341
+ except Exception as e:
1342
+ console.print(
1343
+ f"[yellow]Warning:[/] Failed to load checkpointed commits: {e}[/]"
1344
+ )
1345
+ # Collect the checkpointed DataFrames only; each successful task above also wrote a checkpoint, so adding the in-memory task results as well would duplicate rows
1346
+ all_metadata = [r.data for r in results if r.success and r.data is not None and r.task_id == "checkpoint"]
1347
+
1348
+ if all_metadata:
1349
+ console.print(
1350
+ f"[green]Loaded pull requests from {len(all_metadata)} repositories.[/]"
1351
+ )
1352
+ combined_metadata_df = pd.concat(all_metadata, ignore_index=True)
1353
+ metadata_parquet_path = Path(
1354
+ cache_dir, f"{distro}_all_upstream_pull_requests.parquet"
1355
+ )
1356
+ combined_metadata_df.to_parquet(metadata_parquet_path)
1357
+ console.print(
1358
+ f"[green]Saved {len(combined_metadata_df):,} pull request entries to {metadata_parquet_path}[/]"
1359
+ )
1360
+ else:
1361
+ console.print(
1362
+ "[yellow]No pull request entries were loaded from any repositories.[/]"
1363
+ )
1364
+
1365
+
1366
+ @app.command()
1367
+ def show_cache():
1368
+ """Show the current cache directory."""
1369
+ cache_dir = os.getenv("CACHE_DIR", "./cache")
1370
+ console.print(f"[blue]Current cache directory:[/] {cache_dir}")
1371
+
1372
+
1373
+ def main():
1374
+ """Main entry point for the CLI application."""
1375
+ setup_logging()
1376
+ # Show help menu if no arguments provided
1377
+ if len(sys.argv) == 1:
1378
+ app(["--help"])
1379
+ else:
1380
+ app()
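The wheel presumably exposes main() as a console script; for running the module directly, the standard guard would be:

if __name__ == "__main__":
    main()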