osslag 1.0.0-py3-none-any.whl → 1.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- osslag/__init__.py +9 -0
- osslag/cli.py +100 -243
- osslag/distro/debian.py +13 -41
- osslag/distro/fedora.py +1 -3
- osslag/metrics/malta.py +412 -125
- osslag/metrics/pvac.py +2 -6
- osslag/utils/github_helper.py +7 -23
- osslag/utils/vcs.py +16 -49
- {osslag-1.0.0.dist-info → osslag-1.0.1.dist-info}/METADATA +14 -8
- osslag-1.0.1.dist-info/RECORD +17 -0
- {osslag-1.0.0.dist-info → osslag-1.0.1.dist-info}/WHEEL +2 -1
- {osslag-1.0.0.dist-info → osslag-1.0.1.dist-info}/entry_points.txt +0 -1
- osslag-1.0.1.dist-info/licenses/LICENSE +21 -0
- osslag-1.0.1.dist-info/top_level.txt +1 -0
- osslag-1.0.0.dist-info/RECORD +0 -15
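The cli.py diff below starts by importing `__version__` from the package root, and `osslag/__init__.py` gains nine lines in this release. The diff does not show that file's contents; a minimal sketch of one common way to expose `__version__` there, assuming it is resolved from the installed distribution's metadata, might look like:

```python
# Hypothetical sketch of osslag/__init__.py (its body is not shown in this diff):
# expose __version__ by reading the installed distribution's metadata so it
# always matches the wheel's METADATA version.
from importlib.metadata import PackageNotFoundError, version

try:
    __version__ = version("osslag")
except PackageNotFoundError:
    # Fallback when running from an uninstalled source checkout.
    __version__ = "0.0.0"
```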
osslag/cli.py CHANGED

@@ -26,15 +26,32 @@ from rich.status import Status
 from rich.table import Table
 from rich.text import Text
 
+from osslag import __version__
 from osslag.distro import debian as deb
 from osslag.utils import github_helper as gh
 from osslag.utils import vcs
 
 load_dotenv()
+
+
+def version_callback(value: bool):
+    if value:
+        print(f"osslag {__version__}")
+        raise typer.Exit()
+
+
 app = typer.Typer()
-dataset_app = typer.Typer(
-    help="Dataset pipeline commands for building package analysis datasets."
-)
+
+
+@app.callback()
+def main_callback(
+    version: bool = typer.Option(None, "--version", "-v", callback=version_callback, is_eager=True, help="Show version"),
+):
+    """OSS Lag - Technical Lag tools for Open Source Software Projects."""
+    pass
+
+
+dataset_app = typer.Typer(help="Dataset pipeline commands for building package analysis datasets.")
 app.add_typer(dataset_app, name="dataset")
 logger = logging.getLogger(__name__)
 console = Console()
@@ -135,9 +152,7 @@ class SuppressConsoleLogging:
         for name in list(logging.Logger.manager.loggerDict.keys()) + ["", "root"]:
             log = logging.getLogger(name) if name else logging.getLogger()
             for handler in log.handlers[:]:
-                if isinstance(handler, logging.StreamHandler) and not isinstance(
-                    handler, logging.FileHandler
-                ):
+                if isinstance(handler, logging.StreamHandler) and not isinstance(handler, logging.FileHandler):
                     original_level = handler.level
                     handler.setLevel(logging.CRITICAL + 1)  # Effectively disable
                     self._disabled_handlers.append((handler, original_level))
@@ -264,9 +279,7 @@ class ParallelExecutor:
         )
         workers_table.add_column("Worker", style="cyan", width=8)
         workers_table.add_column("Status", style="white", width=12)
-        workers_table.add_column(
-            "Current Task", style="yellow", overflow="ellipsis", no_wrap=True, width=60
-        )
+        workers_table.add_column("Current Task", style="yellow", overflow="ellipsis", no_wrap=True, width=60)
         workers_table.add_column("Done", style="green", justify="right", width=6)
         workers_table.add_column("Fail", style="red", justify="right", width=6)
@@ -274,9 +287,7 @@ class ParallelExecutor:
             w = self.workers[wid]
             status = "[green]●[/] Working" if w.current_task else "[dim]○ Idle[/]"
             task_display = (
-                w.current_task[:58] + "…"
-                if w.current_task and len(w.current_task) > 58
-                else (w.current_task or "-")
+                w.current_task[:58] + "…" if w.current_task and len(w.current_task) > 58 else (w.current_task or "-")
             )
             workers_table.add_row(
                 f"#{wid}",
@@ -291,9 +302,7 @@ class ParallelExecutor:
         for task_id, success in self.recent_completed[-self.show_recent_completed :]:
             short_id = task_id[:70] + "…" if len(task_id) > 70 else task_id
             recent_text.append(" ")
-            recent_text.append(
-                "✓ " if success else "✗ ", style="bold green" if success else "bold red"
-            )
+            recent_text.append("✓ " if success else "✗ ", style="bold green" if success else "bold red")
             recent_text.append(f"{short_id}\n")
 
         components = [
@@ -304,9 +313,7 @@ class ParallelExecutor:
             workers_table,
             Text(),
             Panel(
-                recent_text
-                if recent_text.plain
-                else Text(" Waiting for tasks...", style="dim italic"),
+                recent_text if recent_text.plain else Text(" Waiting for tasks...", style="dim italic"),
                 title="[bold]Recent Completions[/]",
                 border_style="dim",
             ),
@@ -371,9 +378,7 @@ class ParallelExecutor:
 
         with (
            SuppressConsoleLogging(),
-            Live(
-                self.create_display(progress), refresh_per_second=4, console=console
-            ) as live,
+            Live(self.create_display(progress), refresh_per_second=4, console=console) as live,
        ):
            with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
                # Map futures to (task, worker_id)
@@ -422,23 +427,17 @@ class ParallelExecutor:
                         self.workers[worker_id].tasks_failed += 1
                         self.recent_completed.append((task_id, result.success))
                         marker = result.failed_marker_path
-                        marker.write_text(
-                            f"Task failed: {result.error}\n"
-                        ) if marker else None
+                        marker.write_text(f"Task failed: {result.error}\n") if marker else None
 
                     except Exception as e:
-                        error_result = TaskResult(
-                            task_id=task_id, success=False, error=str(e)
-                        )
+                        error_result = TaskResult(task_id=task_id, success=False, error=str(e))
                         results.append(error_result)
                         self.failed_tasks.append(error_result)
                         self.workers[worker_id].tasks_failed += 1
                         self.recent_completed.append((task_id, False))
                         result = error_result  # Assign for progress check below
                         marker = result.failed_marker_path
-                        marker.write_text(
-                            f"Task failed: {result.error}\n"
-                        ) if marker else None
+                        marker.write_text(f"Task failed: {result.error}\n") if marker else None
                     # Update progress
                     progress.advance(progress_task_id)
 
@@ -581,9 +580,7 @@ def rate_limit():
 
 @app.command()
 def pull_requests(
-    repo_url: str = typer.Argument(
-        ..., help="The GitHub repository URL to fetch pull requests for"
-    ),
+    repo_url: str = typer.Argument(..., help="The GitHub repository URL to fetch pull requests for"),
     cache: str = typer.Option("./cache", help="Cache directory"),
 ):
     """Fetch GitHub pull requests for a specified repository and save to a parquet file."""
@@ -602,18 +599,14 @@ def pull_requests(
 
 @dataset_app.command(name="run", rich_help_panel="Full Pipeline")
 def run_dataset_pipeline(
-    distro: str = typer.Option(
-        "debian", help="The Linux distribution to process (e.g., 'debian' 'fedora')"
-    ),
+    distro: str = typer.Option("debian", help="The Linux distribution to process (e.g., 'debian' 'fedora')"),
     releases: list[str] = typer.Option(
         ...,
         "--release",
         help="One or more distro releases to process (e.g., 'trixie', 'bookworm', '40'). Can repeat flag or use comma-separated.",
     ),
     cache: str = typer.Option("./cache", help="Cache directory (EV: CACHE_DIR)"),
-    force: bool = typer.Option(
-        False, "--force", "-f", help="Force re-processing even if cache exists"
-    ),
+    force: bool = typer.Option(False, "--force", "-f", help="Force re-processing even if cache exists"),
 ):
     """Run the full pipeline: fetch packages, filter repos, extract versions,
     merge releases, clone repos, load commits, pull GitHub data.
@@ -644,28 +637,18 @@ def run_dataset_pipeline(
     console.print("[green]✓[/] Step 1/6: Fetched packages")
 
     # Step 2: Filter GitHub repos
-    with Status(
-        "[bold cyan]Step 2/6:[/] Filtering GitHub repos...", console=console
-    ):
-        filter_debian_github_repos(
-            distro=distro, release=to_process, cache=cache_dir, force=force
-        )
+    with Status("[bold cyan]Step 2/6:[/] Filtering GitHub repos...", console=console):
+        filter_debian_github_repos(distro=distro, release=to_process, cache=cache_dir, force=force)
     console.print("[green]✓[/] Step 2/6: Filtered GitHub repos")
 
     # Step 3: Extract the version string and add upstream version columns
-    with Status(
-        "[bold cyan]Step 3/6:[/] Extracting upstream versions...", console=console
-    ):
-        extract_upstream_versions(
-            distro=distro, release=to_process, cache=cache_dir, force=force
-        )
+    with Status("[bold cyan]Step 3/6:[/] Extracting upstream versions...", console=console):
+        extract_upstream_versions(distro=distro, release=to_process, cache=cache_dir, force=force)
     console.print("[green]✓[/] Step 3/6: Extracted upstream versions")
 
     # Step 4: Merge releases into a single DataFrame with all required columns
     with Status("[bold cyan]Step 4/6:[/] Merging releases...", console=console):
-        merge_releases(
-            distro=distro, releases=to_process, cache=cache_dir, force=force
-        )
+        merge_releases(distro=distro, releases=to_process, cache=cache_dir, force=force)
     console.print("[green]✓[/] Step 4/6: Merged releases")
 
     # Step 5: Clone all upstream GitHub repos (has its own UI)
@@ -736,30 +719,22 @@ def fetch_packages(
 
 @dataset_app.command(rich_help_panel="Step 2: Filter Repos")
 def filter_debian_github_repos(
-    distro: str = typer.Argument(
-        ..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"
-    ),
+    distro: str = typer.Argument(..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"),
     release: list[str] = typer.Argument(
         ...,
         help="One or more distro releases to process (e.g., 'trixie', 'bookworm', '40'). Can repeat flag or use comma-separated.",
     ),
     cache: str = typer.Option("./cache", help="Cache directory"),
-    force: bool = typer.Option(
-        False, "--force", "-f", help="Force re-processing even if cache exists"
-    ),
+    force: bool = typer.Option(False, "--force", "-f", help="Force re-processing even if cache exists"),
 ):
     """Filter distro package DataFrames to only include GitHub repositories."""
     cache_dir = os.getenv("CACHE_DIR") or cache
 
     if distro.lower() == "debian":
         for rel in release:
-            filtered_parquet_path = Path(
-                cache_dir, f"{distro}_{rel}_filtered_packages.parquet"
-            )
+            filtered_parquet_path = Path(cache_dir, f"{distro}_{rel}_filtered_packages.parquet")
             if filtered_parquet_path.exists() and not force:
-                logger.info(
-                    f"Using cached filtered packages from {filtered_parquet_path}"
-                )
+                logger.info(f"Using cached filtered packages from {filtered_parquet_path}")
                 continue
 
             parquet_path = Path(cache_dir, f"{distro}_{rel}_all_packages.parquet")
@@ -774,54 +749,36 @@ def filter_debian_github_repos(
             size_before = df.shape[0]
             filtered_df = deb.filter_github_repos(df)
             size_after = filtered_df.shape[0]
-            logger.info(
-                f"Dropped {size_before - size_after} packages due to non-GitHub '{rel}'."
-            )
-            filtered_df = deb.add_local_repo_cache_path_column(
-                filtered_df, cache_dir=cache_dir
-            )
+            logger.info(f"Dropped {size_before - size_after} packages due to non-GitHub '{rel}'.")
+            filtered_df = deb.add_local_repo_cache_path_column(filtered_df, cache_dir=cache_dir)
             filtered_df.reset_index(drop=True, inplace=True)
             filtered_df.to_parquet(filtered_parquet_path)
-            logger.info(
-                f"Filtered GitHub repositories saved to {filtered_parquet_path}"
-            )
+            logger.info(f"Filtered GitHub repositories saved to {filtered_parquet_path}")
     else:
-        logger.error(
-            f"Distro '{distro}' is not supported for filtering GitHub repositories."
-        )
+        logger.error(f"Distro '{distro}' is not supported for filtering GitHub repositories.")
 
 
 @dataset_app.command(rich_help_panel="Step 3: Extract Versions")
 def extract_upstream_versions(
-    distro: str = typer.Argument(
-        ..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"
-    ),
+    distro: str = typer.Argument(..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"),
     release: list[str] = typer.Argument(
         ...,
         help="One or more distro releases to process (e.g., 'trixie', 'bookworm', '40'). Can repeat flag or use comma-separated.",
     ),
     cache: str = typer.Option("./cache", help="Cache directory"),
-    force: bool = typer.Option(
-        False, "--force", "-f", help="Force re-processing even if cache exists"
-    ),
+    force: bool = typer.Option(False, "--force", "-f", help="Force re-processing even if cache exists"),
 ):
     """Extract upstream version strings from Debian package versions and add as a new column."""
     cache_dir = os.getenv("CACHE_DIR") or cache
 
     if distro.lower() == "debian":
         for rel in release:
-            versions_parquet_path = Path(
-                cache_dir, f"{distro}_{rel}_packages_with_upstream_versions.parquet"
-            )
+            versions_parquet_path = Path(cache_dir, f"{distro}_{rel}_packages_with_upstream_versions.parquet")
             if versions_parquet_path.exists() and not force:
-                logger.info(
-                    f"Using cached upstream versions from {versions_parquet_path}"
-                )
+                logger.info(f"Using cached upstream versions from {versions_parquet_path}")
                 continue
 
-            filtered_parquet_path = Path(
-                cache_dir, f"{distro}_{rel}_filtered_packages.parquet"
-            )
+            filtered_parquet_path = Path(cache_dir, f"{distro}_{rel}_filtered_packages.parquet")
             if not filtered_parquet_path.exists():
                 logger.error(
                     f"Required parquet file {filtered_parquet_path} does not exist. Please run the 'filter-debian-github-repos' command first."
@@ -831,65 +788,47 @@ def extract_upstream_versions(
             logger.info(f"Extracting upstream versions for Debian release '{rel}'")
             df: pd.DataFrame = pd.read_parquet(filtered_parquet_path)
             version_column = f"{rel}_upstream_version"
-            df_with_versions = deb.add_upstream_version_column(
-                df, f"{rel}_version", new_column_name=version_column
-            )
+            df_with_versions = deb.add_upstream_version_column(df, f"{rel}_version", new_column_name=version_column)
             drop_before = df_with_versions.shape[0]
             df_with_versions.dropna(subset=[version_column], inplace=True)
             drop_after = df_with_versions.shape[0]
-            logger.info(
-                f"Dropped {drop_before - drop_after} rows with missing upstream versions for release '{rel}'."
-            )
+            logger.info(f"Dropped {drop_before - drop_after} rows with missing upstream versions for release '{rel}'.")
             df_with_versions.reset_index(drop=True, inplace=True)
             df_with_versions.to_parquet(versions_parquet_path)
-            logger.info(
-                f"Upstream versions extracted and saved to {versions_parquet_path}"
-            )
+            logger.info(f"Upstream versions extracted and saved to {versions_parquet_path}")
     else:
-        logger.error(
-            f"Distro '{distro}' is not supported for extracting upstream versions."
-        )
+        logger.error(f"Distro '{distro}' is not supported for extracting upstream versions.")
 
 
 @dataset_app.command(rich_help_panel="Step 4: Merge Releases")
 def merge_releases(
-    distro: str = typer.Argument(
-        ..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"
-    ),
+    distro: str = typer.Argument(..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"),
     releases: list[str] = typer.Argument(
         ...,
         help="One or more distro releases to merge (e.g., 'trixie', 'bookworm', '40'). Can repeat flag or use comma-separated.",
     ),
     cache: str = typer.Option("./cache", help="Cache directory"),
-    force: bool = typer.Option(
-        False, "--force", "-f", help="Force re-processing even if cache exists"
-    ),
+    force: bool = typer.Option(False, "--force", "-f", help="Force re-processing even if cache exists"),
 ):
     """Merge multiple release DataFrames into a single DataFrame with all required columns."""
     cache_dir = os.getenv("CACHE_DIR") or cache
 
     if distro.lower() == "debian":
-        merged_parquet_path = Path(
-            cache_dir, f"{distro}_merged_releases_packages.parquet"
-        )
+        merged_parquet_path = Path(cache_dir, f"{distro}_merged_releases_packages.parquet")
         if merged_parquet_path.exists() and not force:
             logger.info(f"Using cached merged releases from {merged_parquet_path}")
             return
 
         dfs = []
         for rel in releases:
-            versions_parquet_path = Path(
-                cache_dir, f"{distro}_{rel}_packages_with_upstream_versions.parquet"
-            )
+            versions_parquet_path = Path(cache_dir, f"{distro}_{rel}_packages_with_upstream_versions.parquet")
             if not versions_parquet_path.exists():
                 logger.error(
                     f"Required parquet file {versions_parquet_path} does not exist. Please run the 'extract-upstream-versions' command first."
                 )
                 continue
 
-            logger.info(
-                f"Loading packages with upstream versions for Debian release '{rel}'"
-            )
+            logger.info(f"Loading packages with upstream versions for Debian release '{rel}'")
             df: pd.DataFrame = pd.read_parquet(versions_parquet_path)
             dfs.append(df)
         deb_merged_df, deb_dropped_after_merge = deb.merge_release_packages(dfs)
@@ -899,12 +838,8 @@ def merge_releases(
         deb_merged_df.reset_index(drop=True, inplace=True)
         deb_merged_df.to_parquet(merged_parquet_path)
         logger.info(f"Merged release packages saved to {merged_parquet_path}")
-        deb_dropped_after_merge.to_parquet(
-            Path(cache_dir, f"{distro}_dropped_after_merge.parquet")
-        )
-        logger.info(
-            f"Dropped rows after merge saved to {Path(cache_dir, f'{distro}_dropped_after_merge.parquet')}"
-        )
+        deb_dropped_after_merge.to_parquet(Path(cache_dir, f"{distro}_dropped_after_merge.parquet"))
+        logger.info(f"Dropped rows after merge saved to {Path(cache_dir, f'{distro}_dropped_after_merge.parquet')}")
 
     else:
         logger.error(f"Distro '{distro}' is not supported for merging releases.")
@@ -912,16 +847,10 @@ def merge_releases(
 
 @dataset_app.command(rich_help_panel="Step 5: Clone Repos")
 def clone_upstream_repos(
-    distro: str = typer.Argument(
-        ..., help="The distro for (e.g., 'debian' 'fedora', etc.)"
-    ),
-    repos_cache: str = typer.Option(
-        "./cache/repos", help="Cache directory for cloned repositories"
-    ),
+    distro: str = typer.Argument(..., help="The distro for (e.g., 'debian' 'fedora', etc.)"),
+    repos_cache: str = typer.Option("./cache/repos", help="Cache directory for cloned repositories"),
     cache: str = typer.Option("./cache", help="Cache directory"),
-    max_workers: int = typer.Option(
-        4, help="Maximum number of parallel clone processes (env: MAX_WORKERS)"
-    ),
+    max_workers: int = typer.Option(4, help="Maximum number of parallel clone processes (env: MAX_WORKERS)"),
 ):
     """Clone all upstream GitHub repositories in the filtered package DataFrame."""
     cache_dir = os.getenv("CACHE_DIR") or cache
@@ -948,9 +877,7 @@ def clone_upstream_repos(
     invalid = 0
     for _, row in df.iterrows():
         repo_url = str(row["upstream_repo_url"])
-        target_dir = vcs.construct_repo_local_path(
-            repo_url, cache_dir=repos_cache_path, must_exist=False
-        )
+        target_dir = vcs.construct_repo_local_path(repo_url, cache_dir=repos_cache_path, must_exist=False)
         if target_dir is None:
             invalid += 1
             continue
@@ -984,26 +911,16 @@ def clone_upstream_repos(
             skipped=skipped,
         )
     else:
-        console.print(
-            f"[red]Error:[/] Distro '{distro}' is not supported for cloning repositories."
-        )
+        console.print(f"[red]Error:[/] Distro '{distro}' is not supported for cloning repositories.")
 
 
 @dataset_app.command(rich_help_panel="Step 6: Load Commits")
 def load_commits_into_dataframe(
-    distro: str = typer.Argument(
-        ..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"
-    ),
+    distro: str = typer.Argument(..., help="The Linux distribution to process (e.g., 'debian' 'fedora')"),
     cache: str = typer.Option("./cache", help="Cache directory"),
-    repo_cache: str = typer.Option(
-        "./cache/repos", help="Cache directory for cloned repositories"
-    ),
-    max_workers: int = typer.Option(
-        4, help="Maximum number of parallel worker processes (env: MAX_WORKERS)"
-    ),
-    force: bool = typer.Option(
-        False, "--force", "-f", help="Force re-processing even if cache exists"
-    ),
+    repo_cache: str = typer.Option("./cache/repos", help="Cache directory for cloned repositories"),
+    max_workers: int = typer.Option(4, help="Maximum number of parallel worker processes (env: MAX_WORKERS)"),
+    force: bool = typer.Option(False, "--force", "-f", help="Force re-processing even if cache exists"),
 ):
     """Load all GitHub commits for the upstream repositories into a single DataFrame."""
     cache_dir = os.getenv("CACHE_DIR") or cache
@@ -1016,9 +933,7 @@ def load_commits_into_dataframe(
         console.print(f"[green]Using cached commits from {commits_parquet_path}[/]")
         return
 
-    all_packages_parquet_path = Path(
-        cache_dir, f"{distro}_merged_releases_packages.parquet"
-    )
+    all_packages_parquet_path = Path(cache_dir, f"{distro}_merged_releases_packages.parquet")
     if not all_packages_parquet_path.exists():
         console.print(
             f"[red]Error:[/] Required parquet file {all_packages_parquet_path} does not exist. Please run the 'merge-releases' and 'clone-upstream-repos' commands first."
@@ -1040,9 +955,7 @@ def load_commits_into_dataframe(
     skipped = 0
     for _, row in df.iterrows():
         repo_url = str(row["upstream_repo_url"])
-        local_repo_path = vcs.construct_repo_local_path(
-            repo_url, cache_dir=Path(repo_cache_dir), must_exist=True
-        )
+        local_repo_path = vcs.construct_repo_local_path(repo_url, cache_dir=Path(repo_cache_dir), must_exist=True)
        if local_repo_path is None or not local_repo_path.exists():
            skipped += 1
            continue
@@ -1068,9 +981,7 @@ def load_commits_into_dataframe(
     # Collect all the checkpointed DataFrames
     if checkpoint_dir.exists():
         try:
-            console.print(
-                f"[green]Loading checkpointed commits from {checkpoint_dir}[/]"
-            )
+            console.print(f"[green]Loading checkpointed commits from {checkpoint_dir}[/]")
             for ck in checkpoint_dir.iterdir():
                 if not ck.name.endswith(".parquet"):
                     continue
@@ -1084,9 +995,7 @@ def load_commits_into_dataframe(
                     )
                 )
         except Exception as e:
-            console.print(
-                f"[yellow]Warning:[/] Failed to load checkpointed commits: {e}[/]"
-            )
+            console.print(f"[yellow]Warning:[/] Failed to load checkpointed commits: {e}[/]")
 
     # Collect successful DataFrames
     all_commits = [r.data for r in results if r.success and r.data is not None]
@@ -1096,49 +1005,35 @@ def load_commits_into_dataframe(
         combined_commits_df = pd.concat(all_commits, ignore_index=True)
         commits_parquet_path = Path(cache_dir, f"{distro}_all_upstream_commits.parquet")
         combined_commits_df.to_parquet(commits_parquet_path)
-        console.print(
-            f"[green]Saved {len(combined_commits_df):,} commits to {commits_parquet_path}[/]"
-        )
+        console.print(f"[green]Saved {len(combined_commits_df):,} commits to {commits_parquet_path}[/]")
     else:
         console.print("[yellow]No commits were loaded from any repositories.[/]")
 
 
 @dataset_app.command(rich_help_panel="Step 7: GitHub Metadata")
 def all_github_metadata(
-    distro: str = typer.Option(
-        "debian", help="The Linux distribution to process (default: debian)"
-    ),
+    distro: str = typer.Option("debian", help="The Linux distribution to process (default: debian)"),
     cache: str = typer.Option("./cache", help="Cache directory"),
-    max_workers: int = typer.Option(
-        4, help="Maximum number of parallel GitHub API workers (env: MAX_WORKERS)"
-    ),
-    force: bool = typer.Option(
-        False, "--force", "-f", help="Force re-processing even if cache exists"
-    ),
+    max_workers: int = typer.Option(4, help="Maximum number of parallel GitHub API workers (env: MAX_WORKERS)"),
+    force: bool = typer.Option(False, "--force", "-f", help="Force re-processing even if cache exists"),
 ):
     """Fetch GitHub repository metadata for all unique repos in the commits parquet file."""
     cache_dir = os.getenv("CACHE_DIR") or cache
     max_workers = int(os.getenv("MAX_WORKERS", str(max_workers)))
-    all_packages_parquet_path = Path(
-        cache_dir, f"{distro}_merged_releases_packages.parquet"
-    )
+    all_packages_parquet_path = Path(cache_dir, f"{distro}_merged_releases_packages.parquet")
     output_parquet_path = Path(cache_dir, f"{distro}_github_repo_metadata.parquet")
     checkpoint_dir = Path(cache_dir, "github_metadata_checkpoints")
     # Create checkpoint directory
     vcs.ensure_dir(checkpoint_dir)
 
     if force and checkpoint_dir.exists():
-        console.print(
-            f"[yellow]Removing existing GitHub metadata checkpoint at {checkpoint_dir}[/]"
-        )
+        console.print(f"[yellow]Removing existing GitHub metadata checkpoint at {checkpoint_dir}[/]")
         for ck in checkpoint_dir.iterdir():
             if ck.name.endswith(".parquet"):
                 ck.unlink()
 
     if output_parquet_path.exists() and not force:
-        console.print(
-            f"[green]Using cached GitHub metadata from {output_parquet_path}[/]"
-        )
+        console.print(f"[green]Using cached GitHub metadata from {output_parquet_path}[/]")
         return
 
     if not all_packages_parquet_path.exists():
@@ -1194,9 +1089,7 @@ def all_github_metadata(
     # Collect all the checkpointed DataFrames
     if checkpoint_dir.exists():
         try:
-            console.print(
-                f"[green]Loading checkpointed commits from {checkpoint_dir}[/]"
-            )
+            console.print(f"[green]Loading checkpointed commits from {checkpoint_dir}[/]")
             for ck in checkpoint_dir.iterdir():
                 if not ck.name.endswith(".parquet"):
                     continue
@@ -1210,58 +1103,38 @@ def all_github_metadata(
                     )
                 )
         except Exception as e:
-            console.print(
-                f"[yellow]Warning:[/] Failed to load checkpointed commits: {e}[/]"
-            )
+            console.print(f"[yellow]Warning:[/] Failed to load checkpointed commits: {e}[/]")
     # Collect successful DataFrames
     all_metadata = [r.data for r in results if r.success and r.data is not None]
 
     if all_metadata:
-        console.print(
-            f"[green]Loaded metadata from {len(all_metadata)} repositories.[/]"
-        )
+        console.print(f"[green]Loaded metadata from {len(all_metadata)} repositories.[/]")
         combined_metadata_df = pd.concat(all_metadata, ignore_index=True)
-        metadata_parquet_path = Path(
-            cache_dir, f"{distro}_all_upstream_metadata.parquet"
-        )
+        metadata_parquet_path = Path(cache_dir, f"{distro}_all_upstream_metadata.parquet")
         combined_metadata_df.to_parquet(metadata_parquet_path)
-        console.print(
-            f"[green]Saved {len(combined_metadata_df):,} metadata entries to {metadata_parquet_path}[/]"
-        )
+        console.print(f"[green]Saved {len(combined_metadata_df):,} metadata entries to {metadata_parquet_path}[/]")
     else:
-        console.print(
-            "[yellow]No metadata entries were loaded from any repositories.[/]"
-        )
+        console.print("[yellow]No metadata entries were loaded from any repositories.[/]")
 
 
 @dataset_app.command(rich_help_panel="Step 8: GitHub Metadata")
 def all_github_pull_requests(
-    distro: str = typer.Option(
-        "debian", help="The Linux distribution to process (default: debian)"
-    ),
+    distro: str = typer.Option("debian", help="The Linux distribution to process (default: debian)"),
     cache: str = typer.Option("./cache", help="Cache directory"),
-    max_workers: int = typer.Option(
-        4, help="Maximum number of parallel GitHub API workers (env: MAX_WORKERS)"
-    ),
-    force: bool = typer.Option(
-        False, "--force", "-f", help="Force re-processing even if cache exists"
-    ),
+    max_workers: int = typer.Option(4, help="Maximum number of parallel GitHub API workers (env: MAX_WORKERS)"),
+    force: bool = typer.Option(False, "--force", "-f", help="Force re-processing even if cache exists"),
 ):
     """Fetch GitHub repository pull requests for all unique repos in the commits parquet file."""
     cache_dir = os.getenv("CACHE_DIR") or cache
     max_workers = int(os.getenv("MAX_WORKERS", str(max_workers)))
-    all_packages_parquet_path = Path(
-        cache_dir, f"{distro}_merged_releases_packages.parquet"
-    )
+    all_packages_parquet_path = Path(cache_dir, f"{distro}_merged_releases_packages.parquet")
     output_parquet_path = Path(cache_dir, f"{distro}_github_repo_pull_requests.parquet")
     checkpoint_dir = Path(cache_dir, "github_pr_checkpoints")
     # Create checkpoint directory
     vcs.ensure_dir(checkpoint_dir)
 
     if output_parquet_path.exists() and not force:
-        console.print(
-            f"[green]Using cached GitHub pull requests from {output_parquet_path}[/]"
-        )
+        console.print(f"[green]Using cached GitHub pull requests from {output_parquet_path}[/]")
         return
 
     if not all_packages_parquet_path.exists():
@@ -1271,9 +1144,7 @@ def all_github_pull_requests(
         return
 
     if force and checkpoint_dir.exists():
-        console.print(
-            f"[yellow]Removing existing GitHub pull requests checkpoint at {checkpoint_dir}[/]"
-        )
+        console.print(f"[yellow]Removing existing GitHub pull requests checkpoint at {checkpoint_dir}[/]")
         for ck in checkpoint_dir.iterdir():
             if ck.name.endswith(".parquet"):
                 ck.unlink()
@@ -1307,9 +1178,7 @@ def all_github_pull_requests(
     else:
         console.print("[yellow]Warning:[/] Could not fetch rate limit info")
 
-    console.print(
-        f"[cyan]Fetching GitHub pull requests for {len(tasks)} repositories...[/]"
-    )
+    console.print(f"[cyan]Fetching GitHub pull requests for {len(tasks)} repositories...[/]")
     executor = ParallelExecutor(
         task_name="GitHub Pull Requests Fetch",
         max_workers=min(max_workers, len(tasks)),
@@ -1323,9 +1192,7 @@ def all_github_pull_requests(
     # Collect all the checkpointed DataFrames
     if checkpoint_dir.exists():
         try:
-            console.print(
-                f"[green]Loading checkpointed commits from {checkpoint_dir}[/]"
-            )
+            console.print(f"[green]Loading checkpointed commits from {checkpoint_dir}[/]")
             for ck in checkpoint_dir.iterdir():
                 if not ck.name.endswith(".parquet"):
                     continue
@@ -1339,28 +1206,18 @@ def all_github_pull_requests(
                     )
                 )
         except Exception as e:
-            console.print(
-                f"[yellow]Warning:[/] Failed to load checkpointed commits: {e}[/]"
-            )
+            console.print(f"[yellow]Warning:[/] Failed to load checkpointed commits: {e}[/]")
     # Collect successful DataFrames
     all_metadata = [r.data for r in results if r.success and r.data is not None]
 
     if all_metadata:
-        console.print(
-            f"[green]Loaded pull requests from {len(all_metadata)} repositories.[/]"
-        )
+        console.print(f"[green]Loaded pull requests from {len(all_metadata)} repositories.[/]")
         combined_metadata_df = pd.concat(all_metadata, ignore_index=True)
-        metadata_parquet_path = Path(
-            cache_dir, f"{distro}_all_upstream_pull_requests.parquet"
-        )
+        metadata_parquet_path = Path(cache_dir, f"{distro}_all_upstream_pull_requests.parquet")
         combined_metadata_df.to_parquet(metadata_parquet_path)
-        console.print(
-            f"[green]Saved {len(combined_metadata_df):,} pull request entries to {metadata_parquet_path}[/]"
-        )
+        console.print(f"[green]Saved {len(combined_metadata_df):,} pull request entries to {metadata_parquet_path}[/]")
     else:
-        console.print(
-            "[yellow]No pull request entries were loaded from any repositories.[/]"
-        )
+        console.print("[yellow]No pull request entries were loaded from any repositories.[/]")
 
 
 @app.command()
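Most of the cli.py churn above is line-length reformatting (multi-line calls collapsed onto single lines); the functional addition is the eager `--version`/`-v` option wired through `version_callback` and `main_callback`. A self-contained sketch of that Typer pattern, using a placeholder tool name and version string rather than osslag's own code, shows how it behaves:

```python
# Standalone sketch of the eager --version pattern added in cli.py above.
# "mytool" and the hard-coded version string are placeholders, not osslag code.
import typer

__version__ = "1.0.1"

app = typer.Typer()


def version_callback(value: bool):
    if value:
        # Print the version and stop before any subcommand runs.
        print(f"mytool {__version__}")
        raise typer.Exit()


@app.callback()
def main_callback(
    version: bool = typer.Option(
        None, "--version", "-v", callback=version_callback, is_eager=True, help="Show version"
    ),
):
    """Example CLI; is_eager=True makes Typer evaluate the option before subcommands."""


@app.command()
def hello(name: str = "world"):
    """Placeholder subcommand so the app has something to run."""
    print(f"Hello, {name}!")


if __name__ == "__main__":
    app()
```

Invoked as `python mytool.py --version`, the eager callback prints the version and exits via `typer.Exit()` before any subcommand is resolved, which is the same behavior the new `osslag --version` flag provides.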