opencode-skills-antigravity 1.0.39 → 1.0.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bundled-skills/.antigravity-install-manifest.json +10 -1
- package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
- package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
- package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
- package/bundled-skills/docs/maintainers/security-findings-triage-2026-03-29-refresh.csv +34 -0
- package/bundled-skills/docs/maintainers/security-findings-triage-2026-03-29-refresh.md +2 -0
- package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
- package/bundled-skills/docs/sources/sources.md +2 -2
- package/bundled-skills/docs/users/bundles.md +1 -1
- package/bundled-skills/docs/users/claude-code-skills.md +1 -1
- package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
- package/bundled-skills/docs/users/getting-started.md +1 -1
- package/bundled-skills/docs/users/kiro-integration.md +1 -1
- package/bundled-skills/docs/users/usage.md +4 -4
- package/bundled-skills/docs/users/visual-guide.md +4 -4
- package/bundled-skills/hugging-face-cli/SKILL.md +192 -195
- package/bundled-skills/hugging-face-community-evals/SKILL.md +213 -0
- package/bundled-skills/hugging-face-community-evals/examples/.env.example +3 -0
- package/bundled-skills/hugging-face-community-evals/examples/USAGE_EXAMPLES.md +101 -0
- package/bundled-skills/hugging-face-community-evals/scripts/inspect_eval_uv.py +104 -0
- package/bundled-skills/hugging-face-community-evals/scripts/inspect_vllm_uv.py +306 -0
- package/bundled-skills/hugging-face-community-evals/scripts/lighteval_vllm_uv.py +297 -0
- package/bundled-skills/hugging-face-dataset-viewer/SKILL.md +120 -120
- package/bundled-skills/hugging-face-gradio/SKILL.md +304 -0
- package/bundled-skills/hugging-face-gradio/examples.md +613 -0
- package/bundled-skills/hugging-face-jobs/SKILL.md +25 -18
- package/bundled-skills/hugging-face-jobs/index.html +216 -0
- package/bundled-skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bundled-skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bundled-skills/hugging-face-jobs/references/token_usage.md +570 -0
- package/bundled-skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bundled-skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bundled-skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bundled-skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bundled-skills/hugging-face-model-trainer/SKILL.md +11 -12
- package/bundled-skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bundled-skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bundled-skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bundled-skills/hugging-face-model-trainer/references/local_training_macos.md +231 -0
- package/bundled-skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bundled-skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bundled-skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bundled-skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bundled-skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bundled-skills/hugging-face-model-trainer/references/unsloth.md +313 -0
- package/bundled-skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bundled-skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bundled-skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bundled-skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bundled-skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bundled-skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bundled-skills/hugging-face-model-trainer/scripts/unsloth_sft_example.py +512 -0
- package/bundled-skills/hugging-face-paper-publisher/SKILL.md +11 -4
- package/bundled-skills/hugging-face-paper-publisher/examples/example_usage.md +326 -0
- package/bundled-skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bundled-skills/hugging-face-paper-publisher/scripts/paper_manager.py +606 -0
- package/bundled-skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bundled-skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bundled-skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bundled-skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bundled-skills/hugging-face-papers/SKILL.md +241 -0
- package/bundled-skills/hugging-face-trackio/.claude-plugin/plugin.json +19 -0
- package/bundled-skills/hugging-face-trackio/SKILL.md +117 -0
- package/bundled-skills/hugging-face-trackio/references/alerts.md +196 -0
- package/bundled-skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bundled-skills/hugging-face-trackio/references/retrieving_metrics.md +251 -0
- package/bundled-skills/hugging-face-vision-trainer/SKILL.md +595 -0
- package/bundled-skills/hugging-face-vision-trainer/references/finetune_sam2_trainer.md +254 -0
- package/bundled-skills/hugging-face-vision-trainer/references/hub_saving.md +618 -0
- package/bundled-skills/hugging-face-vision-trainer/references/image_classification_training_notebook.md +279 -0
- package/bundled-skills/hugging-face-vision-trainer/references/object_detection_training_notebook.md +700 -0
- package/bundled-skills/hugging-face-vision-trainer/references/reliability_principles.md +310 -0
- package/bundled-skills/hugging-face-vision-trainer/references/timm_trainer.md +91 -0
- package/bundled-skills/hugging-face-vision-trainer/scripts/dataset_inspector.py +814 -0
- package/bundled-skills/hugging-face-vision-trainer/scripts/estimate_cost.py +217 -0
- package/bundled-skills/hugging-face-vision-trainer/scripts/image_classification_training.py +383 -0
- package/bundled-skills/hugging-face-vision-trainer/scripts/object_detection_training.py +710 -0
- package/bundled-skills/hugging-face-vision-trainer/scripts/sam_segmentation_training.py +382 -0
- package/bundled-skills/jq/SKILL.md +273 -0
- package/bundled-skills/odoo-edi-connector/SKILL.md +32 -10
- package/bundled-skills/odoo-woocommerce-bridge/SKILL.md +9 -5
- package/bundled-skills/tmux/SKILL.md +370 -0
- package/bundled-skills/transformers-js/SKILL.md +639 -0
- package/bundled-skills/transformers-js/references/CACHE.md +339 -0
- package/bundled-skills/transformers-js/references/CONFIGURATION.md +390 -0
- package/bundled-skills/transformers-js/references/EXAMPLES.md +605 -0
- package/bundled-skills/transformers-js/references/MODEL_ARCHITECTURES.md +167 -0
- package/bundled-skills/transformers-js/references/PIPELINE_OPTIONS.md +545 -0
- package/bundled-skills/transformers-js/references/TEXT_GENERATION.md +315 -0
- package/bundled-skills/viboscope/SKILL.md +64 -0
- package/package.json +1 -1
package/bundled-skills/hugging-face-jobs/scripts/finepdfs-stats.py (new file)

@@ -0,0 +1,546 @@
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "polars>=1.31.0",
#     "huggingface-hub",
#     "datasets",
#     "ascii-graph",
# ]
# ///
"""
Analyze educational quality trends across CommonCrawl dumps using Polars streaming.

Answers: "Is the web getting more educational over time?"

Demonstrates Polars HF Hub integration - process 50M+ docs without downloading 300GB+.

Example usage:
    # Analyze English PDFs (default)
    uv run finepdfs-stats.py

    # Analyze all 70+ languages
    uv run finepdfs-stats.py --all-languages

    # Quick test
    uv run finepdfs-stats.py --limit 10000 --show-plan

    # Save results to HF Hub
    uv run finepdfs-stats.py --output-repo username/finepdfs-temporal-stats

    # Run on HF Jobs
    hf jobs uv run \\
        -s HF_TOKEN \\
        -e HF_XET_HIGH_PERFORMANCE=1 \\
        https://huggingface.co/datasets/uv-scripts/dataset-stats/raw/main/finepdfs-stats.py \\
        -- --output-repo username/stats
"""

import argparse
import logging
import os
import sys
import time
from pathlib import Path

import polars as pl
from ascii_graph import Pyasciigraph
from datasets import Dataset
from huggingface_hub import HfApi, create_repo, list_repo_tree, login

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Common language+script codes for finepdfs-edu
COMMON_LANGUAGES = {
    "eng_Latn": "English (Latin script)",
    "fra_Latn": "French (Latin script)",
    "deu_Latn": "German (Latin script)",
    "spa_Latn": "Spanish (Latin script)",
    "por_Latn": "Portuguese (Latin script)",
    "ita_Latn": "Italian (Latin script)",
    "nld_Latn": "Dutch (Latin script)",
    "pol_Latn": "Polish (Latin script)",
    "rus_Cyrl": "Russian (Cyrillic script)",
    "zho_Hans": "Chinese (Simplified)",
    "zho_Hant": "Chinese (Traditional)",
    "jpn_Jpan": "Japanese",
    "kor_Hang": "Korean",
    "ara_Arab": "Arabic",
    "hin_Deva": "Hindi (Devanagari)",
}


def list_available_languages(dataset_id: str) -> list[str]:
    """List available language subsets in the dataset."""
    try:
        tree = list_repo_tree(dataset_id, path_in_repo="data", repo_type="dataset")
        languages = [
            item.path.replace("data/", "")
            for item in tree
            if item.path.startswith("data/")
            and "/" not in item.path.replace("data/", "")
        ]
        return sorted(languages)
    except Exception as e:
        logger.warning(f"Could not list languages: {e}")
        return list(COMMON_LANGUAGES.keys())


def compute_temporal_stats(df: pl.LazyFrame, output_path: Path) -> pl.DataFrame:
    """Single scan: compute stats grouped by dump for temporal analysis."""
    query = df.group_by("dump").agg(
        pl.len().alias("doc_count"),
        pl.col("token_count").sum().alias("total_tokens"),
        pl.col("fw_edu_scores").list.mean().mean().alias("avg_edu_score"),
        (pl.col("fw_edu_scores").list.mean() >= 3).sum().alias("high_edu_count"),
    )
    query.sink_parquet(output_path, engine="streaming")
    return pl.read_parquet(output_path)


def compute_global_stats(temporal: pl.DataFrame) -> pl.DataFrame:
    """Compute global stats from temporal breakdown."""
    total = temporal["doc_count"].sum()
    return pl.DataFrame(
        {
            "total_docs": [total],
            "total_tokens": [temporal["total_tokens"].sum()],
            "avg_edu_score": [
                (temporal["avg_edu_score"] * temporal["doc_count"]).sum() / total
            ],
            "high_edu_rate": [temporal["high_edu_count"].sum() / total],
            "num_dumps": [len(temporal)],
        }
    )


def format_temporal_stats(temporal: pl.DataFrame) -> pl.DataFrame:
    """Format temporal stats with high_edu_rate, sorted chronologically."""
    return (
        temporal.with_columns(
            (pl.col("high_edu_count") / pl.col("doc_count")).alias("high_edu_rate")
        )
        .select(["dump", "doc_count", "avg_edu_score", "high_edu_rate"])
        .sort(
            "dump"
        )  # Chronological order (CC-MAIN-2017-xx comes before CC-MAIN-2024-xx)
    )


def create_ascii_charts(temporal_stats: pl.DataFrame) -> str:
    """Create ASCII bar charts showing temporal trends."""
    # Extract year from dump name (CC-MAIN-2024-42 -> 2024)
    # Group by year and average the values for cleaner display
    yearly = (
        temporal_stats.with_columns(
            pl.col("dump").str.extract(r"CC-MAIN-(\d{4})", 1).alias("year")
        )
        .group_by("year")
        .agg(
            pl.col("doc_count").sum(),
            pl.col("avg_edu_score").mean(),
            pl.col("high_edu_rate").mean(),
        )
        .sort("year")
    )

    lines = []

    # High edu rate chart (more dramatic differences)
    data_rate = [
        (row["year"], row["high_edu_rate"] * 100)
        for row in yearly.iter_rows(named=True)
    ]
    graph = Pyasciigraph(line_length=60, float_format="{0:.1f}%")
    lines.extend(graph.graph("High Educational Content (edu >= 3)", data_rate))

    lines.append("")

    # Avg edu score chart
    data_score = [
        (row["year"], row["avg_edu_score"]) for row in yearly.iter_rows(named=True)
    ]
    graph2 = Pyasciigraph(line_length=60, float_format="{0:.2f}")
    lines.extend(graph2.graph("Average Educational Score", data_score))

    return "\n".join(lines)


def create_readme(
    args,
    global_stats: pl.DataFrame,
    temporal_stats: pl.DataFrame,
    scan_time: float,
    ascii_charts: str,
) -> str:
    """Create README content for the stats dataset."""
    stats = global_stats.to_dicts()[0]
    total_docs = stats.get("total_docs", 0)
    docs_per_sec = total_docs / scan_time if scan_time > 0 else 0

    # Get first and last year averages for trend (more representative than single dumps)
    yearly = (
        temporal_stats.with_columns(
            pl.col("dump").str.extract(r"CC-MAIN-(\d{4})", 1).alias("year")
        )
        .group_by("year")
        .agg(
            pl.col("doc_count").sum(),
            pl.col("avg_edu_score").mean(),
            pl.col("high_edu_rate").mean(),
        )
        .sort("year")
    )
    first_year = yearly.head(1).to_dicts()[0]
    last_year = yearly.tail(1).to_dicts()[0]

    scope = (
        "all languages"
        if args.all_languages
        else COMMON_LANGUAGES.get(args.lang, args.lang)
    )

    return f"""---
tags:
- uv-script
- statistics
- polars
- finepdfs-edu
- temporal-analysis
license: odc-by
configs:
- config_name: global_stats
  data_files: global_stats/train-*.parquet
- config_name: temporal_stats
  data_files: temporal_stats/train-*.parquet
default_viewer_config: temporal_stats
---

# Is the Web Getting More Educational?

Temporal analysis of educational quality in **{scope}** across {stats.get("num_dumps", 0)} CommonCrawl dumps.

## Trend

```
{ascii_charts}
```

## Key Finding

| Year | Avg Edu Score | High Edu Rate |
|------|---------------|---------------|
| {first_year["year"]} | {first_year["avg_edu_score"]:.2f} | {first_year["high_edu_rate"] * 100:.1f}% |
| {last_year["year"]} | {last_year["avg_edu_score"]:.2f} | {last_year["high_edu_rate"] * 100:.1f}% |

## Performance

- **{total_docs:,} documents** processed in **{scan_time:.0f} seconds**
- **{docs_per_sec:,.0f} docs/sec** using Polars streaming
- Single scan, no full dataset download required

## Summary

| Metric | Value |
|--------|-------|
| Scope | {scope} |
| Total Documents | {total_docs:,} |
| Total Tokens | {stats.get("total_tokens", 0):,} |
| Avg Edu Score | {stats.get("avg_edu_score", 0):.3f} |
| High Edu Rate | {stats.get("high_edu_rate", 0) * 100:.1f}% |
| CommonCrawl Dumps | {stats.get("num_dumps", 0)} |

## Files

- `global_stats` - Overall summary
- `temporal_stats` - Per-dump breakdown (sorted chronologically)

## Reproduce

```bash
uv run https://huggingface.co/datasets/uv-scripts/dataset-stats/raw/main/finepdfs-stats.py \\
    {"--all-languages" if args.all_languages else f"--lang {args.lang}"} --output-repo your-username/stats
```

## Source

- **Dataset**: [{args.source_dataset}](https://huggingface.co/datasets/{args.source_dataset})
- **Script**: [uv-scripts/dataset-stats](https://huggingface.co/datasets/uv-scripts/dataset-stats)
"""


def main():
    parser = argparse.ArgumentParser(
        description="Analyze educational quality trends across CommonCrawl dumps",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    parser.add_argument(
        "--source-dataset",
        type=str,
        default="HuggingFaceFW/finepdfs-edu",
        help="Source dataset (default: HuggingFaceFW/finepdfs-edu)",
    )

    parser.add_argument(
        "--lang",
        type=str,
        default="eng_Latn",
        help="Language+script code (default: eng_Latn)",
    )

    parser.add_argument(
        "--all-languages",
        action="store_true",
        help="Analyze all languages (70+) instead of single language",
    )

    parser.add_argument(
        "--show-plan",
        action="store_true",
        help="Show Polars query plan (demonstrates optimization)",
    )

    parser.add_argument(
        "--list-languages",
        action="store_true",
        help="List available languages and exit",
    )

    parser.add_argument(
        "--limit",
        type=int,
        help="Limit to first N rows (for testing)",
    )

    parser.add_argument(
        "--output-repo",
        type=str,
        help="HuggingFace dataset repository to upload results",
    )

    parser.add_argument(
        "--output-dir",
        type=str,
        default="./stats_output",
        help="Local directory for output files",
    )

    parser.add_argument(
        "--hf-token",
        type=str,
        help="HuggingFace API token (or set HF_TOKEN env var)",
    )

    parser.add_argument(
        "--private",
        action="store_true",
        help="Make the output dataset private",
    )

    args = parser.parse_args()

    # Check for high-performance mode
    if os.environ.get("HF_XET_HIGH_PERFORMANCE"):
        logger.info("High-performance mode enabled (HF_XET_HIGH_PERFORMANCE=1)")

    # List languages mode
    if args.list_languages:
        print(f"Available language+script codes for {args.source_dataset}:\n")
        print("Common languages:")
        for code, name in COMMON_LANGUAGES.items():
            print(f" {code:12} - {name}")
        print("\nFetching full list from HF Hub...")
        all_langs = list_available_languages(args.source_dataset)
        print(f"\nAll available ({len(all_langs)} total):")
        for lang in all_langs[:30]:  # Show first 30
            name = COMMON_LANGUAGES.get(lang, "")
            print(f" {lang:12} {name}")
        if len(all_langs) > 30:
            print(f" ... and {len(all_langs) - 30} more")
        sys.exit(0)

    # Build the parquet path
    if args.all_languages:
        source_path = f"hf://datasets/{args.source_dataset}/data/*/train/*.parquet"
        scope_desc = "all languages"
    else:
        source_path = (
            f"hf://datasets/{args.source_dataset}/data/{args.lang}/train/*.parquet"
        )
        scope_desc = f"{args.lang} ({COMMON_LANGUAGES.get(args.lang, 'unknown')})"

    logger.info(f"Scanning: {source_path}")
    logger.info(f"Scope: {scope_desc}")

    # Create lazy frame - this doesn't load any data yet!
    logger.info("Creating lazy query plan...")
    df = pl.scan_parquet(source_path)

    # Apply limit if specified
    if args.limit:
        logger.info(f"Limiting to first {args.limit:,} rows")
        df = df.head(args.limit)

    # Show query plan if requested
    if args.show_plan:
        # Build a sample query to show the plan
        sample_query = df.select(
            pl.len(),
            pl.col("token_count").sum(),
            pl.col("language").n_unique(),
        )
        print("\nQuery Plan (showing Polars optimization):")
        print("=" * 60)
        print(sample_query.explain())
        print("=" * 60)
        print("\nNote: Polars uses projection pushdown - only reads columns needed!")
        print("The 'text' column is never loaded, making this very fast.\n")

    # Create output directory
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Single scan: compute temporal stats
    logger.info("Computing temporal stats (single scan)...")
    start = time.perf_counter()
    temporal_path = output_dir / "temporal_stats.parquet"
    temporal_raw = compute_temporal_stats(df, temporal_path)
    scan_time = time.perf_counter() - start
    logger.info(f"Scan complete in {scan_time:.2f}s - {len(temporal_raw)} dumps")

    # Compute stats
    global_stats = compute_global_stats(temporal_raw)
    temporal_stats = format_temporal_stats(temporal_raw)

    # Save
    global_stats.write_parquet(output_dir / "global_stats.parquet")
    temporal_stats.write_parquet(output_dir / "temporal_stats.parquet")

    # Print results
    total_docs = global_stats["total_docs"][0]
    docs_per_sec = total_docs / scan_time if scan_time > 0 else 0

    print("\n" + "=" * 70)
    print("IS THE WEB GETTING MORE EDUCATIONAL?")
    print("=" * 70)

    print(f"\nScope: {scope_desc}")
    print(f"Dataset: {args.source_dataset}")

    print("\n" + "-" * 70)
    print("GLOBAL STATS")
    print("-" * 70)
    print(global_stats)

    print("\n" + "-" * 70)
    print(f"TEMPORAL TREND ({len(temporal_stats)} CommonCrawl dumps)")
    print("-" * 70)
    # Show first 5 and last 5
    if len(temporal_stats) > 10:
        print("Earliest dumps:")
        print(temporal_stats.head(5))
        print("\n...")
        print("\nLatest dumps:")
        print(temporal_stats.tail(5))
    else:
        print(temporal_stats)

    # Create ASCII charts
    ascii_charts = create_ascii_charts(temporal_stats)
    print("\n" + "-" * 70)
    print("TREND VISUALIZATION")
    print("-" * 70)
    print(ascii_charts)

    print("\n" + "-" * 70)
    print("PERFORMANCE")
    print("-" * 70)
    print(f"Scan time: {scan_time:.2f}s")
    print(f"Documents: {total_docs:,}")
    print(f"Throughput: {docs_per_sec:,.0f} docs/sec")

    logger.info(f"Results saved to: {output_dir}")

    # Upload to HF Hub if requested
    if args.output_repo:
        hf_token = args.hf_token or os.environ.get("HF_TOKEN")
        if hf_token:
            login(token=hf_token)

        api = HfApi(token=hf_token)

        logger.info(f"Creating/updating dataset repository: {args.output_repo}")
        create_repo(
            args.output_repo,
            repo_type="dataset",
            private=args.private,
            token=hf_token,
            exist_ok=True,
        )

        # Upload each as a dataset config
        configs = [
            ("global_stats", global_stats),
            ("temporal_stats", temporal_stats),
        ]

        for config_name, stats_df in configs:
            logger.info(f"Uploading {config_name}...")
            ds = Dataset.from_polars(stats_df)
            ds.push_to_hub(
                args.output_repo,
                config_name=config_name,
                token=hf_token,
                private=args.private,
            )
            time.sleep(1)  # Avoid 409 conflicts

        # Upload README
        readme_content = create_readme(
            args, global_stats, temporal_stats, scan_time, ascii_charts
        )
        api.upload_file(
            path_or_fileobj=readme_content.encode(),
            path_in_repo="README.md",
            repo_id=args.output_repo,
            repo_type="dataset",
            token=hf_token,
        )

        dataset_url = f"https://huggingface.co/datasets/{args.output_repo}"
        logger.info(f"Dataset uploaded: {dataset_url}")
        print(f"\nResults uploaded to: {dataset_url}")


if __name__ == "__main__":
    if len(sys.argv) == 1:
        print("Is the Web Getting More Educational?")
        print("=" * 40)
        print("\nAnalyze educational quality trends across CommonCrawl dumps")
        print("using Polars streaming - no download needed!\n")
        print("Example commands:\n")
        print("# Quick test:")
        print("uv run finepdfs-stats.py --limit 10000\n")
        print("# Analyze English PDFs:")
        print("uv run finepdfs-stats.py\n")
        print("# Analyze ALL 70+ languages:")
        print("uv run finepdfs-stats.py --all-languages\n")
        print("# Show query plan (see Polars optimization):")
        print("uv run finepdfs-stats.py --show-plan --limit 1000\n")
        print("# Save results to HF Hub:")
        print("uv run finepdfs-stats.py --output-repo username/temporal-stats\n")
        print("# Run on HF Jobs:")
        print("hf jobs uv run \\")
        print(" -s HF_TOKEN \\")
        print(" -e HF_XET_HIGH_PERFORMANCE=1 \\")
        print(
            " https://huggingface.co/datasets/uv-scripts/dataset-stats/raw/main/finepdfs-stats.py \\"
        )
        print(" -- --output-repo username/stats")
        sys.exit(0)

    main()