hf2vespa-0.1.0-py3-none-any.whl

hf2vespa/__init__.py ADDED
@@ -0,0 +1 @@
+ __version__ = "0.1.0"
hf2vespa/__main__.py ADDED
@@ -0,0 +1,4 @@
+ from hf2vespa.cli import app
+
+ if __name__ == "__main__":
+     app()
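Since `__main__.py` simply re-exports the Typer app, the CLI can be invoked as a module as well as through the installed console script. Note that this entry point calls `app()` directly, so the backward-compatible `hf2vespa <dataset>` shorthand handled by `run()` in cli.py below is only available via the console script. A minimal smoke test, assuming the wheel is installed:

    $ python -m hf2vespa feed squad --limit 10 > feed.jsonl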
hf2vespa/cli.py ADDED
@@ -0,0 +1,465 @@
+ # Suppress warnings that occur during cleanup. These are harmless but confusing to users.
+ # 1. HuggingFace HTTP retry warnings: https://github.com/apache/arrow/issues/45214
+ # 2. Multiprocessing resource tracker warnings (leaked semaphores from HF datasets)
+ #
+ # IMPORTANT: Set PYTHONWARNINGS env var BEFORE any imports that might trigger multiprocessing.
+ # The resource_tracker runs as a separate daemon process and inherits the env at spawn time.
+ # warnings.filterwarnings() alone doesn't work because it only affects the current process.
+ import os as _os
+
+ _existing_warnings = _os.environ.get("PYTHONWARNINGS", "")
+ _new_filter = "ignore::UserWarning:multiprocessing.resource_tracker"
+ if _existing_warnings:
+     _os.environ["PYTHONWARNINGS"] = f"{_existing_warnings},{_new_filter}"
+ else:
+     _os.environ["PYTHONWARNINGS"] = _new_filter
+ del _os, _existing_warnings, _new_filter
+
+ import logging as _logging
+ import warnings as _warnings
+
+ _logging.getLogger("huggingface_hub.utils._http").setLevel(_logging.CRITICAL)
+ _warnings.filterwarnings("ignore", message="resource_tracker:", category=UserWarning)
+ del _logging, _warnings
+
+ """CLI for streaming HuggingFace datasets to Vespa JSON format."""
+ import gc
+ import itertools
+ import os
+ import sys
+ import time
+ from pathlib import Path
+ from typing import Annotated
+
+ import orjson
+ import typer
+ from tqdm import tqdm
+
+ from hf2vespa.config import VespaConfig
+ from hf2vespa.pipeline import format_vespa_put, stream_dataset, validate_config
+ from hf2vespa.stats import ErrorMode, ProcessingStats
+ from hf2vespa.utils import handle_broken_pipe
+
+ # Create a Typer app that supports subcommands
+ app = typer.Typer(
+     help="Stream HuggingFace datasets to Vespa JSON format.",
+     no_args_is_help=False,
+     add_completion=False,  # We provide our own install-completion command
+ )
+
+
+ def report_completion(stats: ProcessingStats, elapsed_ns: int) -> None:
+     """Print completion statistics to stderr."""
+     elapsed_sec = elapsed_ns / 1_000_000_000
+     total = stats.total_processed
+     errors = stats.error_count
+     success = stats.success_count
+     throughput = total / elapsed_sec if elapsed_sec > 0 else 0
+
+     print("\n--- Completion Statistics ---", file=sys.stderr)
+     print(f"Total records processed: {total:,}", file=sys.stderr)
+     print(f"Successful: {success:,}", file=sys.stderr)
+     print(f"Errors: {errors:,}", file=sys.stderr)
+     print(f"Throughput: {throughput:.1f} records/sec", file=sys.stderr)
+     print(f"Elapsed time: {elapsed_sec:.2f}s", file=sys.stderr)
+
+
+ def _cleanup_hf_resources() -> None:
+     """Clean up HuggingFace Hub resources to prevent exit hangs.
+
+     This addresses a known PyArrow bug (https://github.com/apache/arrow/issues/45214)
+     that causes hangs when cleaning up streaming dataset iterators.
+     """
+     # Enable offline mode to prevent HTTP retry loops during cleanup
+     os.environ["HF_HUB_OFFLINE"] = "1"
+     # Force garbage collection to trigger finalizers
+     gc.collect()
+
+
+ def feed_impl(
+     dataset: str,
+     split: str = "train",
+     config: str | None = None,
+     include: list[str] | None = None,
+     rename: list[str] | None = None,
+     namespace: str = "doc",
+     doctype: str = "doc",
+     config_file: Path | None = None,
+     limit: int | None = None,
+     id_column: str | None = None,
+     on_error: ErrorMode = ErrorMode.fail,
+     num_workers: int | None = None,
+ ) -> None:
+     """Implementation of the feed command.
+
+     This is the actual implementation that both the callback (for backward
+     compatibility) and the explicit 'feed' command delegate to.
+     """
+     # Validate limit parameter
+     if limit is not None and limit <= 0:
+         typer.echo("Error: --limit must be positive", err=True)
+         raise typer.Exit(1)
+
+     # Auto-detect CPU cores if not specified
+     if num_workers is None:
+         num_workers = os.cpu_count()
+
+     # Validate num_workers
+     if num_workers is not None and num_workers <= 0:
+         typer.echo("Error: --num-workers must be positive", err=True)
+         raise typer.Exit(1)
+
+     # Load config from file or create default
+     if config_file is not None:
+         try:
+             vespa_config = VespaConfig.from_yaml(config_file)
+         except ValueError as e:
+             typer.echo(f"Error loading config file: {e}", err=True)
+             raise typer.Exit(1)
+
+         # CLI flags override config-file values only when they differ from the
+         # defaults. We can't tell if a default was passed explicitly, so config
+         # file values win unless a flag carries a non-default value.
+         if namespace != "doc":
+             vespa_config.namespace = namespace
+         if doctype != "doc":
+             vespa_config.doctype = doctype
+         if id_column is not None:
+             vespa_config.id_column = id_column
+     else:
+         # No config file - use CLI values directly
+         vespa_config = VespaConfig(
+             namespace=namespace,
+             doctype=doctype,
+             id_column=id_column,
+         )
+
+     # Parse rename list into dictionary
+     rename_dict = None
+     if rename:
+         rename_dict = {}
+         for pair in rename:
+             if ":" not in pair:
+                 typer.echo(
+                     f"Error: --rename must be in 'old:new' format, got: {pair}",
+                     err=True,
+                 )
+                 raise typer.Exit(1)
+             old, new = pair.split(":", 1)
+             rename_dict[old] = new
+
+     # Load dataset to validate config against schema
+     from datasets import load_dataset
+
+     try:
+         ds = load_dataset(dataset, config, split=split, streaming=True)
+         dataset_columns = set(ds.column_names)
+     except Exception as e:
+         typer.echo(f"Error loading dataset: {e}", err=True)
+         raise typer.Exit(1)
+
+     # Validate config against dataset schema (fail-fast)
+     try:
+         validate_config(vespa_config, dataset_columns)
+     except ValueError as e:
+         typer.echo(f"Config validation failed: {e}", err=True)
+         raise typer.Exit(1)
+
+     # Stream dataset with transformations
+     records = stream_dataset(
+         dataset_name=dataset,
+         split=split,
+         include=include,
+         rename=rename_dict,
+         config=config,
+         num_proc=num_workers,
+     )
+
+     # Format as Vespa PUT operations
+     vespa_docs = format_vespa_put(
+         records, vespa_config.namespace, vespa_config.doctype, config=vespa_config
+     )
+
+     # Apply limit if specified
+     if limit is not None:
+         vespa_docs = itertools.islice(vespa_docs, limit)
+
+     # Initialize statistics tracking
+     stats = ProcessingStats()
+     start_time = time.perf_counter_ns()
+     show_progress = sys.stderr.isatty()
+
+     # Write to stdout with SIGPIPE handling
+     with handle_broken_pipe():
+         # Wrap with progress bar (no-op if not TTY)
+         iterator = tqdm(
+             vespa_docs,
+             disable=not show_progress,
+             desc="Processing records",
+             unit="rec",
+             file=sys.stderr,
+         )
+
+         try:
+             for doc in iterator:
+                 try:
+                     # Serialize to JSON with orjson for performance
+                     json_bytes = orjson.dumps(doc, option=orjson.OPT_APPEND_NEWLINE)
+                     sys.stdout.buffer.write(json_bytes)
+                     stats.record_success()
+                 except Exception as e:
+                     # Error during processing this record
+                     if on_error == ErrorMode.fail:
+                         raise
+                     else:
+                         # Skip mode: warn and continue
+                         tqdm.write(f"Warning: {e}", file=sys.stderr)
+                         stats.record_error()
+                         continue
+
+                 # Flush periodically for streaming UX
+                 if stats.total_processed % 100 == 0:
+                     sys.stdout.flush()
+         finally:
+             # Always report stats, even on error
+             end_time = time.perf_counter_ns()
+             report_completion(stats, end_time - start_time)
+
+     # Clean up HuggingFace resources IMMEDIATELY after processing,
+     # before the iterator is garbage collected. This prevents
+     # "Bad file descriptor" errors from PyArrow/fsspec cleanup.
+     _cleanup_hf_resources()
+
+
+ @app.command("feed")
+ def feed(
+     dataset: Annotated[str, typer.Argument(help="HuggingFace dataset name")],
+     split: Annotated[str, typer.Option(help="Dataset split")] = "train",
+     config: Annotated[str | None, typer.Option(help="Dataset config name")] = None,
+     include: Annotated[
+         list[str] | None, typer.Option(help="Columns to include (repeatable)")
+     ] = None,
+     rename: Annotated[
+         list[str] | None,
+         typer.Option(help="Rename columns as 'old:new' (repeatable)"),
+     ] = None,
+     namespace: Annotated[
+         str, typer.Option(help="Vespa namespace for document IDs")
+     ] = "doc",
+     doctype: Annotated[
+         str, typer.Option(help="Vespa document type for document IDs")
+     ] = "doc",
+     config_file: Annotated[
+         Path | None,
+         typer.Option("--config-file", help="YAML config file for field mappings"),
+     ] = None,
+     limit: Annotated[
+         int | None, typer.Option(help="Process only first N records")
+     ] = None,
+     id_column: Annotated[
+         str | None,
+         typer.Option("--id-column", help="Dataset column to use as document ID"),
+     ] = None,
+     on_error: Annotated[
+         ErrorMode,
+         typer.Option(
+             "--on-error",
+             help="Error handling: fail (stop on error) or skip (warn and continue)",
+         ),
+     ] = ErrorMode.fail,
+     num_workers: Annotated[
+         int | None,
+         typer.Option(
+             "--num-workers",
+             help="Number of parallel workers. Not supported in streaming mode; reserved for future use with non-streaming datasets.",
+         ),
+     ] = None,
+ ) -> None:
+     """Stream a HuggingFace dataset to Vespa JSON format.
+
+     Examples:
+
+         # Basic usage
+         $ hf2vespa feed glue --split test --config ax
+
+         # Filter columns
+         $ hf2vespa feed glue --split test --config ax --include premise --include hypothesis
+
+         # Custom namespace and doctype
+         $ hf2vespa feed squad --namespace wiki --doctype article
+
+         # Use config file
+         $ hf2vespa feed glue --config ax --config-file mappings.yaml
+
+         # Preview first 10 records
+         $ hf2vespa feed squad --limit 10
+     """
+     feed_impl(
+         dataset=dataset,
+         split=split,
+         config=config,
+         include=include,
+         rename=rename,
+         namespace=namespace,
+         doctype=doctype,
+         config_file=config_file,
+         limit=limit,
+         id_column=id_column,
+         on_error=on_error,
+         num_workers=num_workers,
+     )
+
+
+ @app.command("init")
+ def init(
+     dataset: Annotated[str, typer.Argument(help="HuggingFace dataset name")],
+     output: Annotated[
+         Path, typer.Option("--output", "-o", help="Output YAML file path")
+     ] = Path("vespa-config.yaml"),
+     split: Annotated[
+         str, typer.Option("--split", "-s", help="Dataset split to inspect")
+     ] = "train",
+     config: Annotated[
+         str | None,
+         typer.Option(
+             "--config",
+             "-c",
+             help="Dataset config name (required for multi-config datasets)",
+         ),
+     ] = None,
+ ) -> None:
+     """Generate a YAML config by inspecting a HuggingFace dataset schema.
+
+     This command inspects the dataset schema (without downloading the full dataset)
+     and generates a YAML configuration file with sensible defaults and helpful comments.
+
+     Examples:
+
+         # Generate config for a dataset
+         $ hf2vespa init glue --config ax
+
+         # Specify output file
+         $ hf2vespa init squad --output my-config.yaml
+
+         # Inspect a specific split
+         $ hf2vespa init my-dataset --split validation
+     """
+     from hf2vespa.init import init_command
+
+     init_command(dataset, output, split, config)
+
+
+ @app.command("install-completion")
+ def install_completion(
+     shell: Annotated[
+         str | None,
+         typer.Argument(
+             help="Shell to install completion for (bash, zsh, fish). Auto-detected if omitted."
+         ),
+     ] = None,
+ ) -> None:
+     """Install shell tab-completion for hf2vespa.
+
+     Auto-detects your shell, or accepts an explicit shell name.
+
+     Examples:
+
+         hf2vespa install-completion        # Auto-detect shell
+         hf2vespa install-completion bash   # Explicit bash
+         hf2vespa install-completion zsh    # Explicit zsh
+     """
+     from typer._completion_shared import Shells, install
+
+     # Detect shell if not provided
+     if shell is None:
+         try:
+             import shellingham
+
+             detected_name, _ = shellingham.detect_shell()
+             shell = detected_name.lower()
+             typer.echo(f"Detected shell: {shell}")
+         except Exception:
+             typer.echo(
+                 "Could not auto-detect your shell.\n"
+                 "Please specify: hf2vespa install-completion [bash|zsh|fish]",
+                 err=True,
+             )
+             raise typer.Exit(1)
+
+     # Validate shell
+     shell = shell.lower()
+     supported = {"bash", "zsh", "fish"}
+     if shell not in supported:
+         typer.echo(f"Unsupported shell: {shell}", err=True)
+         typer.echo(f"Supported shells: {', '.join(sorted(supported))}", err=True)
+         raise typer.Exit(1)
+
+     # Install completion
+     try:
+         shell_enum = Shells(shell)
+         _, path = install(shell=shell_enum)
+
+         # Success message
+         typer.echo(f"\nShell completion installed for {shell}!")
+         typer.echo(f"Modified: {path}")
+         typer.echo("\nTo activate, either:")
+         typer.echo("  1. Restart your terminal, OR")
+         typer.echo(f"  2. Run: source {path}")
+
+     except Exception as e:
+         typer.echo(f"Failed to install completion: {e}", err=True)
+         raise typer.Exit(1)
+
+
+ def run() -> None:
+     """Entry point with backward compatibility handling.
+
+     This function provides backward compatibility with the old CLI pattern
+     (`hf2vespa <dataset>`): if the first argument looks like a dataset name
+     rather than a subcommand, 'feed' is inserted before it.
+     """
+     # The known subcommands
+     subcommands = {"feed", "init", "install-completion"}
+
+     if len(sys.argv) > 1:
+         first_arg = sys.argv[1]
+         # If first arg is a known subcommand, proceed normally.
+         # If first arg is a flag (--help, -h, etc.), proceed normally.
+         # Otherwise, assume it's a dataset name and insert 'feed'.
+         if first_arg not in subcommands and not first_arg.startswith("-"):
+             sys.argv.insert(1, "feed")
+
+     exit_code = 0
+     try:
+         app()
+     except SystemExit as e:
+         # Preserve typer's exit code; SystemExit(None) means success (0).
+         exit_code = e.code if isinstance(e.code, int) else (0 if e.code is None else 1)
+
+     # Clean up HuggingFace resources before exit
+     _cleanup_hf_resources()
+
+     # Flush all output streams
+     sys.stdout.flush()
+     sys.stderr.flush()
+
+     # Redirect stderr to /dev/null before exit to suppress resource_tracker warnings.
+     # The warnings.filterwarnings() call at module level doesn't affect the resource_tracker
+     # subprocess that Python's multiprocessing spawns. These warnings are harmless but
+     # confusing to users. We've already flushed our output above.
+     try:
+         devnull = os.open(os.devnull, os.O_WRONLY)
+         os.dup2(devnull, sys.stderr.fileno())
+         os.close(devnull)
+     except OSError:
+         pass  # If we can't redirect, just proceed
+
+     # Exit immediately to avoid HuggingFace/PyArrow cleanup hangs; the hang is
+     # a known upstream bug: https://github.com/huggingface/datasets/issues/7467
+     # os._exit() is required as a safety net until the upstream bug is fixed.
+     os._exit(exit_code)
+
+
+ if __name__ == "__main__":
+     run()
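Taken together, `feed` emits newline-delimited JSON on stdout, one Vespa put operation per record (orjson's OPT_APPEND_NEWLINE supplies the delimiter). Since pipeline.py is not shown here, the document shape below is a sketch that assumes `format_vespa_put` follows Vespa's standard document JSON format, with illustrative field values:

    $ hf2vespa feed glue --split test --config ax --limit 1
    {"put": "id:doc:doc::0", "fields": {"premise": "...", "hypothesis": "...", "label": -1, "idx": 0}}

    # Pipe directly into a running Vespa application (the Vespa CLI reads
    # from stdin when the file argument is "-"):
    $ hf2vespa feed glue --split test --config ax | vespa feed -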
hf2vespa/config.py ADDED
@@ -0,0 +1,131 @@
+ """Configuration schema for Vespa feed generation."""
+
+ from pathlib import Path
+
+ import yaml
+ from pydantic import BaseModel, field_validator
+
+
+ class FieldMapping(BaseModel):
+     """
+     Field mapping configuration from dataset column to Vespa field.
+
+     Attributes:
+         source: Source column name in the dataset
+         target: Target field name in Vespa document
+         type: Type conversion to apply (e.g., "tensor", "string", "int", "float")
+
+     Examples:
+         >>> m = FieldMapping(source="embedding", target="vector", type="tensor")
+         >>> m.source
+         'embedding'
+         >>> m.type
+         'tensor'
+     """
+
+     source: str
+     target: str
+     type: str | None = None
+
+     @field_validator("type")
+     @classmethod
+     def validate_type(cls, v: str | None) -> str | None:
+         """
+         Validate that type is one of the known converter types.
+
+         Args:
+             v: Type string to validate
+
+         Returns:
+             Validated type string or None
+
+         Raises:
+             ValueError: If type is not one of the known types
+         """
+         if v is None:
+             return v
+
+         valid_types = {
+             # Basic types
+             "tensor",
+             "string",
+             "int",
+             "float",
+             # Scalar types (Phase 8)
+             "position",
+             "weightedset",
+             "map",
+             # Hex tensor types (Phase 9)
+             "tensor_int8_hex",
+             "tensor_bfloat16_hex",
+             "tensor_float32_hex",
+             "tensor_float64_hex",
+             # Sparse and mixed tensor types (Phase 10)
+             "sparse_tensor",
+             "mixed_tensor",
+             "mixed_tensor_hex",
+         }
+         if v not in valid_types:
+             raise ValueError(
+                 f"Unknown type converter '{v}'. Must be one of: {', '.join(sorted(valid_types))}"
+             )
+         return v
+
+
+ class VespaConfig(BaseModel):
+     """
+     Configuration for Vespa feed generation.
+
+     Attributes:
+         namespace: Vespa namespace for document IDs (default: "doc")
+         doctype: Vespa document type for document IDs (default: "doc")
+         id_column: Dataset column to use as document ID (default: None, uses sequential numbering)
+         mappings: List of field mappings from dataset to Vespa format
+
+     Examples:
+         >>> cfg = VespaConfig()
+         >>> cfg.namespace
+         'doc'
+         >>> cfg.mappings
+         []
+     """
+
+     namespace: str = "doc"
+     doctype: str = "doc"
+     id_column: str | None = None
+     mappings: list[FieldMapping] = []
+
+     @classmethod
+     def from_yaml(cls, path: str | Path) -> "VespaConfig":
+         """
+         Load configuration from a YAML file.
+
+         Args:
+             path: Path to YAML configuration file
+
+         Returns:
+             VespaConfig instance with validated configuration
+
+         Raises:
+             ValueError: If the file is not found, YAML parsing fails, or validation fails
+
+         Examples:
+             >>> # Assuming config.yaml exists with valid configuration
+             >>> cfg = VespaConfig.from_yaml("config.yaml")  # doctest: +SKIP
+             >>> isinstance(cfg, VespaConfig)  # doctest: +SKIP
+             True
+         """
+         path_obj = Path(path)
+
+         try:
+             with open(path_obj, "r") as f:
+                 data = yaml.safe_load(f)
+         except FileNotFoundError:
+             raise ValueError(f"Configuration file not found: {path}")
+         except yaml.YAMLError as e:
+             raise ValueError(f"Failed to parse YAML from {path}: {e}")
+
+         try:
+             return cls(**data)
+         except Exception as e:
+             raise ValueError(f"Failed to validate configuration from {path}: {e}")
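Since `from_yaml` passes the parsed YAML straight into the pydantic model, a config file mirrors the attribute names one-to-one. A sketch of a file this schema would accept (the column and field names are illustrative; `type` values must come from the validator's allowed set above):

    # vespa-config.yaml
    namespace: wiki
    doctype: article
    id_column: id
    mappings:
      - source: embedding
        target: vector
        type: tensor
      - source: title
        target: title
        type: string

Fed back in via `hf2vespa feed squad --config-file vespa-config.yaml`.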