osslag 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
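In osslag/cli.py, the behavioral change in 1.0.1 is a new `--version`/`-v` flag wired through an eager Typer callback; the remaining hunks only reflow previously wrapped calls onto single lines. The snippet below is a minimal standalone sketch of that pattern, not the packaged module: the version string is hard-coded in place of the real `from osslag import __version__` import, and the placeholder command exists only so the sketch runs on its own.

import typer

__version__ = "1.0.1"  # stand-in for `from osslag import __version__`

app = typer.Typer()


def version_callback(value: bool):
    # Eager callback: runs before any command, prints the version, then exits.
    if value:
        print(f"osslag {__version__}")
        raise typer.Exit()


@app.callback()
def main_callback(
    version: bool = typer.Option(
        None, "--version", "-v", callback=version_callback, is_eager=True, help="Show version"
    ),
):
    """OSS Lag - Technical Lag tools for Open Source Software Projects."""
    pass


@app.command()
def hello():
    # Hypothetical placeholder command; the real CLI registers its own commands.
    print("hello")


if __name__ == "__main__":
    app()  # `python sketch.py --version` prints "osslag 1.0.1" and exits

Because the option is eager, the callback fires before any subcommand is resolved, which is why `--version` works without naming a command.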
osslag/cli.py CHANGED
@@ -26,15 +26,32 @@ from rich.status import Status
  from rich.table import Table
  from rich.text import Text

+ from osslag import __version__
  from osslag.distro import debian as deb
  from osslag.utils import github_helper as gh
  from osslag.utils import vcs

  load_dotenv()
+
+
+ def version_callback(value: bool):
+ if value:
+ print(f"osslag {__version__}")
+ raise typer.Exit()
+
+
  app = typer.Typer()
- dataset_app = typer.Typer(
- help="Dataset pipeline commands for building package analysis datasets."
- )
+
+
+ @app.callback()
+ def main_callback(
+ version: bool = typer.Option(None, "--version", "-v", callback=version_callback, is_eager=True, help="Show version"),
+ ):
+ """OSS Lag - Technical Lag tools for Open Source Software Projects."""
+ pass
+
+
+ dataset_app = typer.Typer(help="Dataset pipeline commands for building package analysis datasets.")
  app.add_typer(dataset_app, name="dataset")
  logger = logging.getLogger(__name__)
  console = Console()
@@ -135,9 +152,7 @@ class SuppressConsoleLogging:
  for name in list(logging.Logger.manager.loggerDict.keys()) + ["", "root"]:
  log = logging.getLogger(name) if name else logging.getLogger()
  for handler in log.handlers[:]:
- if isinstance(handler, logging.StreamHandler) and not isinstance(
- handler, logging.FileHandler
- ):
+ if isinstance(handler, logging.StreamHandler) and not isinstance(handler, logging.FileHandler):
  original_level = handler.level
  handler.setLevel(logging.CRITICAL + 1) # Effectively disable
  self._disabled_handlers.append((handler, original_level))
@@ -264,9 +279,7 @@ class ParallelExecutor:
  )
  workers_table.add_column("Worker", style="cyan", width=8)
  workers_table.add_column("Status", style="white", width=12)
- workers_table.add_column(
- "Current Task", style="yellow", overflow="ellipsis", no_wrap=True, width=60
- )
+ workers_table.add_column("Current Task", style="yellow", overflow="ellipsis", no_wrap=True, width=60)
  workers_table.add_column("Done", style="green", justify="right", width=6)
  workers_table.add_column("Fail", style="red", justify="right", width=6)

@@ -274,9 +287,7 @@ class ParallelExecutor:
  w = self.workers[wid]
  status = "[green]●[/] Working" if w.current_task else "[dim]○ Idle[/]"
  task_display = (
- w.current_task[:58] + "…"
- if w.current_task and len(w.current_task) > 58
- else (w.current_task or "-")
+ w.current_task[:58] + "…" if w.current_task and len(w.current_task) > 58 else (w.current_task or "-")
  )
  workers_table.add_row(
  f"#{wid}",
@@ -291,9 +302,7 @@ class ParallelExecutor:
  for task_id, success in self.recent_completed[-self.show_recent_completed :]:
  short_id = task_id[:70] + "…" if len(task_id) > 70 else task_id
  recent_text.append(" ")
- recent_text.append(
- "✓ " if success else "✗ ", style="bold green" if success else "bold red"
- )
+ recent_text.append("✓ " if success else "✗ ", style="bold green" if success else "bold red")
  recent_text.append(f"{short_id}\n")

  components = [
@@ -304,9 +313,7 @@ class ParallelExecutor:
  workers_table,
  Text(),
  Panel(
- recent_text
- if recent_text.plain
- else Text(" Waiting for tasks...", style="dim italic"),
+ recent_text if recent_text.plain else Text(" Waiting for tasks...", style="dim italic"),
  title="[bold]Recent Completions[/]",
  border_style="dim",
  ),
@@ -371,9 +378,7 @@ class ParallelExecutor:

  with (
  SuppressConsoleLogging(),
- Live(
- self.create_display(progress), refresh_per_second=4, console=console
- ) as live,
+ Live(self.create_display(progress), refresh_per_second=4, console=console) as live,
  ):
  with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
  # Map futures to (task, worker_id)
@@ -422,23 +427,17 @@ class ParallelExecutor:
  self.workers[worker_id].tasks_failed += 1
  self.recent_completed.append((task_id, result.success))
  marker = result.failed_marker_path
- marker.write_text(
- f"Task failed: {result.error}\n"
- ) if marker else None
+ marker.write_text(f"Task failed: {result.error}\n") if marker else None

  except Exception as e:
- error_result = TaskResult(
- task_id=task_id, success=False, error=str(e)
- )
+ error_result = TaskResult(task_id=task_id, success=False, error=str(e))
  results.append(error_result)
  self.failed_tasks.append(error_result)
  self.workers[worker_id].tasks_failed += 1
  self.recent_completed.append((task_id, False))
  result = error_result # Assign for progress check below
  marker = result.failed_marker_path
- marker.write_text(
- f"Task failed: {result.error}\n"
- ) if marker else None
+ marker.write_text(f"Task failed: {result.error}\n") if marker else None
  # Update progress
  progress.advance(progress_task_id)

@@ -581,9 +580,7 @@ def rate_limit():

  @app.command()
  def pull_requests(
- repo_url: str = typer.Argument(
- ..., help="The GitHub repository URL to fetch pull requests for"
- ),
+ repo_url: str = typer.Argument(..., help="The GitHub repository URL to fetch pull requests for"),
  cache: str = typer.Option("./cache", help="Cache directory"),
  ):
  """Fetch GitHub pull requests for a specified repository and save to a parquet file."""
@@ -602,18 +599,14 @@ def pull_requests(

  @dataset_app.command(name="run", rich_help_panel="Full Pipeline")
  def run_dataset_pipeline(
- distro: str = typer.Option(
- "debian", help="The Linux distribution to process (e.g., 'debian' 'fedora')"
- ),
+ distro: str = typer.Option("debian", help="The Linux distribution to process (e.g., 'debian' 'fedora')"),
  releases: list[str] = typer.Option(
  ...,
  "--release",
  help="One or more distro releases to process (e.g., 'trixie', 'bookworm', '40'). Can repeat flag or use comma-separated.",
  ),
  cache: str = typer.Option("./cache", help="Cache directory (EV: CACHE_DIR)"),
- force: bool = typer.Option(
- False, "--force", "-f", help="Force re-processing even if cache exists"
- ),
+ force: bool = typer.Option(False, "--force", "-f", help="Force re-processing even if cache exists"),
  ):
  """Run the full pipeline: fetch packages, filter repos, extract versions,
  merge releases, clone repos, load commits, pull GitHub data.
@@ -644,28 +637,18 @@ def run_dataset_pipeline(
  console.print("[green]✓[/] Step 1/6: Fetched packages")

  # Step 2: Filter GitHub repos
- with Status(
- "[bold cyan]Step 2/6:[/] Filtering GitHub repos...", console=console
- ):
- filter_debian_github_repos(
- distro=distro, release=to_process, cache=cache_dir, force=force
- )
+ with Status("[bold cyan]Step 2/6:[/] Filtering GitHub repos...", console=console):
+ filter_debian_github_repos(distro=distro, release=to_process, cache=cache_dir, force=force)
  console.print("[green]✓[/] Step 2/6: Filtered GitHub repos")

  # Step 3: Extract the version string and add upstream version columns
- with Status(
- "[bold cyan]Step 3/6:[/] Extracting upstream versions...", console=console
- ):
- extract_upstream_versions(
- distro=distro, release=to_process, cache=cache_dir, force=force
- )
+ with Status("[bold cyan]Step 3/6:[/] Extracting upstream versions...", console=console):
+ extract_upstream_versions(distro=distro, release=to_process, cache=cache_dir, force=force)
  console.print("[green]✓[/] Step 3/6: Extracted upstream versions")

  # Step 4: Merge releases into a single DataFrame with all required columns
  with Status("[bold cyan]Step 4/6:[/] Merging releases...", console=console):
- merge_releases(
- distro=distro, releases=to_process, cache=cache_dir, force=force
- )
+ merge_releases(distro=distro, releases=to_process, cache=cache_dir, force=force)
  console.print("[green]✓[/] Step 4/6: Merged releases")

  # Step 5: Clone all upstream GitHub repos (has its own UI)
@@ -736,30 +719,22 @@ def fetch_packages(

  @dataset_app.command(rich_help_panel="Step 2: Filter Repos")
  def filter_debian_github_repos(
- distro: str = typer.Argument(
- ..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"
- ),
+ distro: str = typer.Argument(..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"),
  release: list[str] = typer.Argument(
  ...,
  help="One or more distro releases to process (e.g., 'trixie', 'bookworm', '40'). Can repeat flag or use comma-separated.",
  ),
  cache: str = typer.Option("./cache", help="Cache directory"),
- force: bool = typer.Option(
- False, "--force", "-f", help="Force re-processing even if cache exists"
- ),
+ force: bool = typer.Option(False, "--force", "-f", help="Force re-processing even if cache exists"),
  ):
  """Filter distro package DataFrames to only include GitHub repositories."""
  cache_dir = os.getenv("CACHE_DIR") or cache

  if distro.lower() == "debian":
  for rel in release:
- filtered_parquet_path = Path(
- cache_dir, f"{distro}_{rel}_filtered_packages.parquet"
- )
+ filtered_parquet_path = Path(cache_dir, f"{distro}_{rel}_filtered_packages.parquet")
  if filtered_parquet_path.exists() and not force:
- logger.info(
- f"Using cached filtered packages from {filtered_parquet_path}"
- )
+ logger.info(f"Using cached filtered packages from {filtered_parquet_path}")
  continue

  parquet_path = Path(cache_dir, f"{distro}_{rel}_all_packages.parquet")
@@ -774,54 +749,36 @@ def filter_debian_github_repos(
  size_before = df.shape[0]
  filtered_df = deb.filter_github_repos(df)
  size_after = filtered_df.shape[0]
- logger.info(
- f"Dropped {size_before - size_after} packages due to non-GitHub '{rel}'."
- )
- filtered_df = deb.add_local_repo_cache_path_column(
- filtered_df, cache_dir=cache_dir
- )
+ logger.info(f"Dropped {size_before - size_after} packages due to non-GitHub '{rel}'.")
+ filtered_df = deb.add_local_repo_cache_path_column(filtered_df, cache_dir=cache_dir)
  filtered_df.reset_index(drop=True, inplace=True)
  filtered_df.to_parquet(filtered_parquet_path)
- logger.info(
- f"Filtered GitHub repositories saved to {filtered_parquet_path}"
- )
+ logger.info(f"Filtered GitHub repositories saved to {filtered_parquet_path}")
  else:
- logger.error(
- f"Distro '{distro}' is not supported for filtering GitHub repositories."
- )
+ logger.error(f"Distro '{distro}' is not supported for filtering GitHub repositories.")


  @dataset_app.command(rich_help_panel="Step 3: Extract Versions")
  def extract_upstream_versions(
- distro: str = typer.Argument(
- ..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"
- ),
+ distro: str = typer.Argument(..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"),
  release: list[str] = typer.Argument(
  ...,
  help="One or more distro releases to process (e.g., 'trixie', 'bookworm', '40'). Can repeat flag or use comma-separated.",
  ),
  cache: str = typer.Option("./cache", help="Cache directory"),
- force: bool = typer.Option(
- False, "--force", "-f", help="Force re-processing even if cache exists"
- ),
+ force: bool = typer.Option(False, "--force", "-f", help="Force re-processing even if cache exists"),
  ):
  """Extract upstream version strings from Debian package versions and add as a new column."""
  cache_dir = os.getenv("CACHE_DIR") or cache

  if distro.lower() == "debian":
  for rel in release:
- versions_parquet_path = Path(
- cache_dir, f"{distro}_{rel}_packages_with_upstream_versions.parquet"
- )
+ versions_parquet_path = Path(cache_dir, f"{distro}_{rel}_packages_with_upstream_versions.parquet")
  if versions_parquet_path.exists() and not force:
- logger.info(
- f"Using cached upstream versions from {versions_parquet_path}"
- )
+ logger.info(f"Using cached upstream versions from {versions_parquet_path}")
  continue

- filtered_parquet_path = Path(
- cache_dir, f"{distro}_{rel}_filtered_packages.parquet"
- )
+ filtered_parquet_path = Path(cache_dir, f"{distro}_{rel}_filtered_packages.parquet")
  if not filtered_parquet_path.exists():
  logger.error(
  f"Required parquet file {filtered_parquet_path} does not exist. Please run the 'filter-debian-github-repos' command first."
@@ -831,65 +788,47 @@ def extract_upstream_versions(
  logger.info(f"Extracting upstream versions for Debian release '{rel}'")
  df: pd.DataFrame = pd.read_parquet(filtered_parquet_path)
  version_column = f"{rel}_upstream_version"
- df_with_versions = deb.add_upstream_version_column(
- df, f"{rel}_version", new_column_name=version_column
- )
+ df_with_versions = deb.add_upstream_version_column(df, f"{rel}_version", new_column_name=version_column)
  drop_before = df_with_versions.shape[0]
  df_with_versions.dropna(subset=[version_column], inplace=True)
  drop_after = df_with_versions.shape[0]
- logger.info(
- f"Dropped {drop_before - drop_after} rows with missing upstream versions for release '{rel}'."
- )
+ logger.info(f"Dropped {drop_before - drop_after} rows with missing upstream versions for release '{rel}'.")
  df_with_versions.reset_index(drop=True, inplace=True)
  df_with_versions.to_parquet(versions_parquet_path)
- logger.info(
- f"Upstream versions extracted and saved to {versions_parquet_path}"
- )
+ logger.info(f"Upstream versions extracted and saved to {versions_parquet_path}")
  else:
- logger.error(
- f"Distro '{distro}' is not supported for extracting upstream versions."
- )
+ logger.error(f"Distro '{distro}' is not supported for extracting upstream versions.")


  @dataset_app.command(rich_help_panel="Step 4: Merge Releases")
  def merge_releases(
- distro: str = typer.Argument(
- ..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"
- ),
+ distro: str = typer.Argument(..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"),
  releases: list[str] = typer.Argument(
  ...,
  help="One or more distro releases to merge (e.g., 'trixie', 'bookworm', '40'). Can repeat flag or use comma-separated.",
  ),
  cache: str = typer.Option("./cache", help="Cache directory"),
- force: bool = typer.Option(
- False, "--force", "-f", help="Force re-processing even if cache exists"
- ),
+ force: bool = typer.Option(False, "--force", "-f", help="Force re-processing even if cache exists"),
  ):
  """Merge multiple release DataFrames into a single DataFrame with all required columns."""
  cache_dir = os.getenv("CACHE_DIR") or cache

  if distro.lower() == "debian":
- merged_parquet_path = Path(
- cache_dir, f"{distro}_merged_releases_packages.parquet"
- )
+ merged_parquet_path = Path(cache_dir, f"{distro}_merged_releases_packages.parquet")
  if merged_parquet_path.exists() and not force:
  logger.info(f"Using cached merged releases from {merged_parquet_path}")
  return

  dfs = []
  for rel in releases:
- versions_parquet_path = Path(
- cache_dir, f"{distro}_{rel}_packages_with_upstream_versions.parquet"
- )
+ versions_parquet_path = Path(cache_dir, f"{distro}_{rel}_packages_with_upstream_versions.parquet")
  if not versions_parquet_path.exists():
  logger.error(
  f"Required parquet file {versions_parquet_path} does not exist. Please run the 'extract-upstream-versions' command first."
  )
  continue

- logger.info(
- f"Loading packages with upstream versions for Debian release '{rel}'"
- )
+ logger.info(f"Loading packages with upstream versions for Debian release '{rel}'")
  df: pd.DataFrame = pd.read_parquet(versions_parquet_path)
  dfs.append(df)
  deb_merged_df, deb_dropped_after_merge = deb.merge_release_packages(dfs)
@@ -899,12 +838,8 @@ def merge_releases(
  deb_merged_df.reset_index(drop=True, inplace=True)
  deb_merged_df.to_parquet(merged_parquet_path)
  logger.info(f"Merged release packages saved to {merged_parquet_path}")
- deb_dropped_after_merge.to_parquet(
- Path(cache_dir, f"{distro}_dropped_after_merge.parquet")
- )
- logger.info(
- f"Dropped rows after merge saved to {Path(cache_dir, f'{distro}_dropped_after_merge.parquet')}"
- )
+ deb_dropped_after_merge.to_parquet(Path(cache_dir, f"{distro}_dropped_after_merge.parquet"))
+ logger.info(f"Dropped rows after merge saved to {Path(cache_dir, f'{distro}_dropped_after_merge.parquet')}")

  else:
  logger.error(f"Distro '{distro}' is not supported for merging releases.")
@@ -912,16 +847,10 @@ def merge_releases(

  @dataset_app.command(rich_help_panel="Step 5: Clone Repos")
  def clone_upstream_repos(
- distro: str = typer.Argument(
- ..., help="The distro for (e.g., 'debian' 'fedora', etc.)"
- ),
- repos_cache: str = typer.Option(
- "./cache/repos", help="Cache directory for cloned repositories"
- ),
+ distro: str = typer.Argument(..., help="The distro for (e.g., 'debian' 'fedora', etc.)"),
+ repos_cache: str = typer.Option("./cache/repos", help="Cache directory for cloned repositories"),
  cache: str = typer.Option("./cache", help="Cache directory"),
- max_workers: int = typer.Option(
- 4, help="Maximum number of parallel clone processes (env: MAX_WORKERS)"
- ),
+ max_workers: int = typer.Option(4, help="Maximum number of parallel clone processes (env: MAX_WORKERS)"),
  ):
  """Clone all upstream GitHub repositories in the filtered package DataFrame."""
  cache_dir = os.getenv("CACHE_DIR") or cache
@@ -948,9 +877,7 @@ def clone_upstream_repos(
  invalid = 0
  for _, row in df.iterrows():
  repo_url = str(row["upstream_repo_url"])
- target_dir = vcs.construct_repo_local_path(
- repo_url, cache_dir=repos_cache_path, must_exist=False
- )
+ target_dir = vcs.construct_repo_local_path(repo_url, cache_dir=repos_cache_path, must_exist=False)
  if target_dir is None:
  invalid += 1
  continue
@@ -984,26 +911,16 @@ def clone_upstream_repos(
  skipped=skipped,
  )
  else:
- console.print(
- f"[red]Error:[/] Distro '{distro}' is not supported for cloning repositories."
- )
+ console.print(f"[red]Error:[/] Distro '{distro}' is not supported for cloning repositories.")


  @dataset_app.command(rich_help_panel="Step 6: Load Commits")
  def load_commits_into_dataframe(
- distro: str = typer.Argument(
- ..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"
- ),
+ distro: str = typer.Argument(..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"),
  cache: str = typer.Option("./cache", help="Cache directory"),
- repo_cache: str = typer.Option(
- "./cache/repos", help="Cache directory for cloned repositories"
- ),
- max_workers: int = typer.Option(
- 4, help="Maximum number of parallel worker processes (env: MAX_WORKERS)"
- ),
- force: bool = typer.Option(
- False, "--force", "-f", help="Force re-processing even if cache exists"
- ),
+ repo_cache: str = typer.Option("./cache/repos", help="Cache directory for cloned repositories"),
+ max_workers: int = typer.Option(4, help="Maximum number of parallel worker processes (env: MAX_WORKERS)"),
+ force: bool = typer.Option(False, "--force", "-f", help="Force re-processing even if cache exists"),
  ):
  """Load all GitHub commits for the upstream repositories into a single DataFrame."""
  cache_dir = os.getenv("CACHE_DIR") or cache
@@ -1016,9 +933,7 @@ def load_commits_into_dataframe(
  console.print(f"[green]Using cached commits from {commits_parquet_path}[/]")
  return

- all_packages_parquet_path = Path(
- cache_dir, f"{distro}_merged_releases_packages.parquet"
- )
+ all_packages_parquet_path = Path(cache_dir, f"{distro}_merged_releases_packages.parquet")
  if not all_packages_parquet_path.exists():
  console.print(
  f"[red]Error:[/] Required parquet file {all_packages_parquet_path} does not exist. Please run the 'merge-releases' and 'clone-upstream-repos' commands first."
@@ -1040,9 +955,7 @@ def load_commits_into_dataframe(
  skipped = 0
  for _, row in df.iterrows():
  repo_url = str(row["upstream_repo_url"])
- local_repo_path = vcs.construct_repo_local_path(
- repo_url, cache_dir=Path(repo_cache_dir), must_exist=True
- )
+ local_repo_path = vcs.construct_repo_local_path(repo_url, cache_dir=Path(repo_cache_dir), must_exist=True)
  if local_repo_path is None or not local_repo_path.exists():
  skipped += 1
  continue
@@ -1068,9 +981,7 @@ def load_commits_into_dataframe(
  # Collect all the checkpointed DataFrames
  if checkpoint_dir.exists():
  try:
- console.print(
- f"[green]Loading checkpointed commits from {checkpoint_dir}[/]"
- )
+ console.print(f"[green]Loading checkpointed commits from {checkpoint_dir}[/]")
  for ck in checkpoint_dir.iterdir():
  if not ck.name.endswith(".parquet"):
  continue
@@ -1084,9 +995,7 @@ def load_commits_into_dataframe(
  )
  )
  except Exception as e:
- console.print(
- f"[yellow]Warning:[/] Failed to load checkpointed commits: {e}[/]"
- )
+ console.print(f"[yellow]Warning:[/] Failed to load checkpointed commits: {e}[/]")

  # Collect successful DataFrames
  all_commits = [r.data for r in results if r.success and r.data is not None]
@@ -1096,49 +1005,35 @@ def load_commits_into_dataframe(
  combined_commits_df = pd.concat(all_commits, ignore_index=True)
  commits_parquet_path = Path(cache_dir, f"{distro}_all_upstream_commits.parquet")
  combined_commits_df.to_parquet(commits_parquet_path)
- console.print(
- f"[green]Saved {len(combined_commits_df):,} commits to {commits_parquet_path}[/]"
- )
+ console.print(f"[green]Saved {len(combined_commits_df):,} commits to {commits_parquet_path}[/]")
  else:
  console.print("[yellow]No commits were loaded from any repositories.[/]")


  @dataset_app.command(rich_help_panel="Step 7: GitHub Metadata")
  def all_github_metadata(
- distro: str = typer.Option(
- "debian", help="The Linux distribution to process (default: debian)"
- ),
+ distro: str = typer.Option("debian", help="The Linux distribution to process (default: debian)"),
  cache: str = typer.Option("./cache", help="Cache directory"),
- max_workers: int = typer.Option(
- 4, help="Maximum number of parallel GitHub API workers (env: MAX_WORKERS)"
- ),
- force: bool = typer.Option(
- False, "--force", "-f", help="Force re-processing even if cache exists"
- ),
+ max_workers: int = typer.Option(4, help="Maximum number of parallel GitHub API workers (env: MAX_WORKERS)"),
+ force: bool = typer.Option(False, "--force", "-f", help="Force re-processing even if cache exists"),
  ):
  """Fetch GitHub repository metadata for all unique repos in the commits parquet file."""
  cache_dir = os.getenv("CACHE_DIR") or cache
  max_workers = int(os.getenv("MAX_WORKERS", str(max_workers)))
- all_packages_parquet_path = Path(
- cache_dir, f"{distro}_merged_releases_packages.parquet"
- )
+ all_packages_parquet_path = Path(cache_dir, f"{distro}_merged_releases_packages.parquet")
  output_parquet_path = Path(cache_dir, f"{distro}_github_repo_metadata.parquet")
  checkpoint_dir = Path(cache_dir, "github_metadata_checkpoints")
  # Create checkpoint directory
  vcs.ensure_dir(checkpoint_dir)

  if force and checkpoint_dir.exists():
- console.print(
- f"[yellow]Removing existing GitHub metadata checkpoint at {checkpoint_dir}[/]"
- )
+ console.print(f"[yellow]Removing existing GitHub metadata checkpoint at {checkpoint_dir}[/]")
  for ck in checkpoint_dir.iterdir():
  if ck.name.endswith(".parquet"):
  ck.unlink()

  if output_parquet_path.exists() and not force:
- console.print(
- f"[green]Using cached GitHub metadata from {output_parquet_path}[/]"
- )
+ console.print(f"[green]Using cached GitHub metadata from {output_parquet_path}[/]")
  return

  if not all_packages_parquet_path.exists():
@@ -1194,9 +1089,7 @@ def all_github_metadata(
  # Collect all the checkpointed DataFrames
  if checkpoint_dir.exists():
  try:
- console.print(
- f"[green]Loading checkpointed commits from {checkpoint_dir}[/]"
- )
+ console.print(f"[green]Loading checkpointed commits from {checkpoint_dir}[/]")
  for ck in checkpoint_dir.iterdir():
  if not ck.name.endswith(".parquet"):
  continue
@@ -1210,58 +1103,38 @@ def all_github_metadata(
  )
  )
  except Exception as e:
- console.print(
- f"[yellow]Warning:[/] Failed to load checkpointed commits: {e}[/]"
- )
+ console.print(f"[yellow]Warning:[/] Failed to load checkpointed commits: {e}[/]")
  # Collect successful DataFrames
  all_metadata = [r.data for r in results if r.success and r.data is not None]

  if all_metadata:
- console.print(
- f"[green]Loaded metadata from {len(all_metadata)} repositories.[/]"
- )
+ console.print(f"[green]Loaded metadata from {len(all_metadata)} repositories.[/]")
  combined_metadata_df = pd.concat(all_metadata, ignore_index=True)
- metadata_parquet_path = Path(
- cache_dir, f"{distro}_all_upstream_metadata.parquet"
- )
+ metadata_parquet_path = Path(cache_dir, f"{distro}_all_upstream_metadata.parquet")
  combined_metadata_df.to_parquet(metadata_parquet_path)
- console.print(
- f"[green]Saved {len(combined_metadata_df):,} metadata entries to {metadata_parquet_path}[/]"
- )
+ console.print(f"[green]Saved {len(combined_metadata_df):,} metadata entries to {metadata_parquet_path}[/]")
  else:
- console.print(
- "[yellow]No metadata entries were loaded from any repositories.[/]"
- )
+ console.print("[yellow]No metadata entries were loaded from any repositories.[/]")


  @dataset_app.command(rich_help_panel="Step 8: GitHub Metadata")
  def all_github_pull_requests(
- distro: str = typer.Option(
- "debian", help="The Linux distribution to process (default: debian)"
- ),
+ distro: str = typer.Option("debian", help="The Linux distribution to process (default: debian)"),
  cache: str = typer.Option("./cache", help="Cache directory"),
- max_workers: int = typer.Option(
- 4, help="Maximum number of parallel GitHub API workers (env: MAX_WORKERS)"
- ),
- force: bool = typer.Option(
- False, "--force", "-f", help="Force re-processing even if cache exists"
- ),
+ max_workers: int = typer.Option(4, help="Maximum number of parallel GitHub API workers (env: MAX_WORKERS)"),
+ force: bool = typer.Option(False, "--force", "-f", help="Force re-processing even if cache exists"),
  ):
  """Fetch GitHub repository pull requests for all unique repos in the commits parquet file."""
  cache_dir = os.getenv("CACHE_DIR") or cache
  max_workers = int(os.getenv("MAX_WORKERS", str(max_workers)))
- all_packages_parquet_path = Path(
- cache_dir, f"{distro}_merged_releases_packages.parquet"
- )
+ all_packages_parquet_path = Path(cache_dir, f"{distro}_merged_releases_packages.parquet")
  output_parquet_path = Path(cache_dir, f"{distro}_github_repo_pull_requests.parquet")
  checkpoint_dir = Path(cache_dir, "github_pr_checkpoints")
  # Create checkpoint directory
  vcs.ensure_dir(checkpoint_dir)

  if output_parquet_path.exists() and not force:
- console.print(
- f"[green]Using cached GitHub pull requests from {output_parquet_path}[/]"
- )
+ console.print(f"[green]Using cached GitHub pull requests from {output_parquet_path}[/]")
  return

  if not all_packages_parquet_path.exists():
@@ -1271,9 +1144,7 @@ def all_github_pull_requests(
  return

  if force and checkpoint_dir.exists():
- console.print(
- f"[yellow]Removing existing GitHub pull requests checkpoint at {checkpoint_dir}[/]"
- )
+ console.print(f"[yellow]Removing existing GitHub pull requests checkpoint at {checkpoint_dir}[/]")
  for ck in checkpoint_dir.iterdir():
  if ck.name.endswith(".parquet"):
  ck.unlink()
@@ -1307,9 +1178,7 @@ def all_github_pull_requests(
  else:
  console.print("[yellow]Warning:[/] Could not fetch rate limit info")

- console.print(
- f"[cyan]Fetching GitHub pull requests for {len(tasks)} repositories...[/]"
- )
+ console.print(f"[cyan]Fetching GitHub pull requests for {len(tasks)} repositories...[/]")
  executor = ParallelExecutor(
  task_name="GitHub Pull Requests Fetch",
  max_workers=min(max_workers, len(tasks)),
@@ -1323,9 +1192,7 @@ def all_github_pull_requests(
  # Collect all the checkpointed DataFrames
  if checkpoint_dir.exists():
  try:
- console.print(
- f"[green]Loading checkpointed commits from {checkpoint_dir}[/]"
- )
+ console.print(f"[green]Loading checkpointed commits from {checkpoint_dir}[/]")
  for ck in checkpoint_dir.iterdir():
  if not ck.name.endswith(".parquet"):
  continue
@@ -1339,28 +1206,18 @@ def all_github_pull_requests(
  )
  )
  except Exception as e:
- console.print(
- f"[yellow]Warning:[/] Failed to load checkpointed commits: {e}[/]"
- )
+ console.print(f"[yellow]Warning:[/] Failed to load checkpointed commits: {e}[/]")
  # Collect successful DataFrames
  all_metadata = [r.data for r in results if r.success and r.data is not None]

  if all_metadata:
- console.print(
- f"[green]Loaded pull requests from {len(all_metadata)} repositories.[/]"
- )
+ console.print(f"[green]Loaded pull requests from {len(all_metadata)} repositories.[/]")
  combined_metadata_df = pd.concat(all_metadata, ignore_index=True)
- metadata_parquet_path = Path(
- cache_dir, f"{distro}_all_upstream_pull_requests.parquet"
- )
+ metadata_parquet_path = Path(cache_dir, f"{distro}_all_upstream_pull_requests.parquet")
  combined_metadata_df.to_parquet(metadata_parquet_path)
- console.print(
- f"[green]Saved {len(combined_metadata_df):,} pull request entries to {metadata_parquet_path}[/]"
- )
+ console.print(f"[green]Saved {len(combined_metadata_df):,} pull request entries to {metadata_parquet_path}[/]")
  else:
- console.print(
- "[yellow]No pull request entries were loaded from any repositories.[/]"
- )
+ console.print("[yellow]No pull request entries were loaded from any repositories.[/]")


  @app.command()