hf2vespa-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hf2vespa/init.py ADDED
@@ -0,0 +1,351 @@
+ """Schema inspection and YAML config generation for HuggingFace datasets."""
+
+ from pathlib import Path
+ from typing import Any
+
+ import typer
+ from datasets import Sequence, Value, load_dataset_builder, get_dataset_config_names
+ from datasets.features import List, LargeList
+ from ruamel.yaml import YAML
+ from ruamel.yaml.comments import CommentedMap, CommentedSeq
+
+
+ def is_list_feature(feature: Any) -> bool:
+     """
+     Check if feature is a list/sequence type suitable for tensor conversion.
+
+     List columns containing numeric values are good candidates for
+     Vespa tensor conversion (embeddings, vectors, etc.)
+
+     Args:
+         feature: A HuggingFace datasets Feature object
+
+     Returns:
+         True if feature is a list/sequence type
+
+     Examples:
+         >>> from datasets import Sequence, Value
+         >>> is_list_feature(Sequence(feature=Value("float32")))
+         True
+         >>> is_list_feature(Value("string"))
+         False
+     """
+     # Check for datasets library sequence types
+     if isinstance(feature, (Sequence, List, LargeList)):
+         return True
+
+     # Check for Python list with feature type inside
+     if isinstance(feature, list) and len(feature) == 1:
+         return True
+
+     return False
+
+
+ def get_value_dtype(feature: Any) -> str:
+     """
+     Get the dtype string for display in YAML comments.
+
+     Args:
+         feature: A HuggingFace datasets Feature object
+
+     Returns:
+         Human-readable dtype string (e.g., "string", "Sequence[float32]")
+
+     Examples:
+         >>> from datasets import Sequence, Value
+         >>> get_value_dtype(Value("string"))
+         'string'
+         >>> get_value_dtype(Sequence(feature=Value("float32")))
+         'Sequence[float32]'
+     """
+     if isinstance(feature, Value):
+         return feature.dtype
+     elif isinstance(feature, Sequence):
+         inner = feature.feature
+         if isinstance(inner, Value):
+             return f"Sequence[{inner.dtype}]"
+         return "Sequence[complex]"
+     elif isinstance(feature, (List, LargeList)):
+         inner = feature.feature
+         if isinstance(inner, Value):
+             return f"List[{inner.dtype}]"
+         return "List[complex]"
+     elif isinstance(feature, list) and len(feature) == 1:
+         inner = feature[0]
+         if isinstance(inner, Value):
+             return f"list[{inner.dtype}]"
+         return "list[complex]"
+     elif isinstance(feature, dict):
+         return "dict"
+     else:
+         return str(type(feature).__name__)
+
+
+ def suggest_type(col_name: str, feature: Any) -> str | None:
+     """
+     Suggest type conversion based on feature type.
+
+     For list columns with numeric inner types, suggests "tensor" conversion.
+     For other columns, returns None (no conversion needed).
+
+     Args:
+         col_name: Column name (not currently used but available for name-based heuristics)
+         feature: A HuggingFace datasets Feature object
+
+     Returns:
+         "tensor" for numeric list columns, None otherwise
+
+     Examples:
+         >>> from datasets import Sequence, Value
+         >>> suggest_type("embedding", Sequence(feature=Value("float32")))
+         'tensor'
+         >>> suggest_type("name", Value("string"))
+
+     """
+     if is_list_feature(feature):
+         # Get inner type
+         inner = None
+         if isinstance(feature, Sequence):
+             inner = feature.feature
+         elif isinstance(feature, (List, LargeList)):
+             inner = feature.feature
+         elif isinstance(feature, list) and len(feature) == 1:
+             inner = feature[0]
+
+         # Check if inner type is numeric
+         if isinstance(inner, Value):
+             numeric_types = (
+                 "float32",
+                 "float64",
+                 "float",
+                 "double",
+                 "int32",
+                 "int64",
+                 "int",
+                 "int8",
+                 "int16",
+             )
+             if inner.dtype in numeric_types:
+                 return "tensor"
+     return None
+
+
+ def inspect_dataset_schema(
+     dataset_name: str,
+     config: str | None = None,
+ ) -> dict[str, Any]:
+     """
+     Get dataset schema without downloading data files.
+
+     Uses load_dataset_builder() to access metadata without downloading
+     the actual data files.
+
+     Args:
+         dataset_name: HuggingFace dataset name (e.g., "glue", "squad")
+         config: Dataset configuration name (required for multi-config datasets)
+
+     Returns:
+         Dict containing:
+         - columns: {name: feature_type} mapping
+         - list_columns: List of column names that are list/sequence types
+         - available_splits: List of available split names
+
+     Raises:
+         ValueError: If dataset cannot be loaded or config is required but not provided
+
+     Examples:
+         >>> schema = inspect_dataset_schema("glue", config="ax")  # doctest: +SKIP
+         >>> "premise" in schema["columns"]  # doctest: +SKIP
+         True
+     """
+     # Get builder for dataset (no download)
+     builder = load_dataset_builder(dataset_name, config)
+     features = builder.info.features
+
+     columns = {}
+     list_columns = []
+
+     for col_name, col_type in features.items():
+         columns[col_name] = col_type
+         # Detect list/sequence columns for tensor suggestion
+         if is_list_feature(col_type):
+             list_columns.append(col_name)
+
+     # Get available splits
+     available_splits = (
+         list(builder.info.splits.keys()) if builder.info.splits else ["train"]
+     )
+
+     return {
+         "columns": columns,
+         "list_columns": list_columns,
+         "available_splits": available_splits,
+     }
+
+
+ def generate_config_yaml(
+     features: dict[str, Any],
+     output_path: Path,
+     dataset_name: str,
+     config: str | None = None,
+     split: str = "train",
+     namespace: str = "doc",
+     doctype: str = "doc",
+ ) -> None:
+     """
+     Generate a YAML config file with comments for user guidance.
+
+     Uses ruamel.yaml's CommentedMap to include helpful comments
+     explaining each configuration option.
+
+     Args:
+         features: Dict mapping column names to feature types
+         output_path: Path to write the YAML file
+         dataset_name: Name of the dataset (for header comment)
+         config: Dataset config name (for header comment)
+         split: Dataset split name (for header comment)
+         namespace: Default Vespa namespace
+         doctype: Default Vespa document type
+
+     Examples:
+         >>> from pathlib import Path
+         >>> features = {"id": Value("string"), "text": Value("string")}  # doctest: +SKIP
+         >>> generate_config_yaml(features, Path("/tmp/test.yaml"), "test-dataset")  # doctest: +SKIP
+     """
+     yaml = YAML()
+     yaml.default_flow_style = False
+     yaml.indent(mapping=2, sequence=4, offset=2)
+
+     # Create root config with CommentedMap for comment support
+     root = CommentedMap()
+
+     # Header comment
+     root.yaml_set_start_comment(
+         f"Generated config for dataset: {dataset_name}\n"
+         f"Config: {config or 'default'}, Split: {split}\n"
+         f"Edit this file to customize field mappings.\n"
+     )
+
+     # Basic config with comments
+     root["namespace"] = namespace
+     root.yaml_add_eol_comment("Vespa namespace for document IDs", "namespace")
+
+     root["doctype"] = doctype
+     root.yaml_add_eol_comment("Vespa document type", "doctype")
+
+     root["id_column"] = None
+     root.yaml_add_eol_comment(
+         "Column to use as document ID (null = auto-increment)", "id_column"
+     )
+
+     # Mappings section
+     mappings = CommentedSeq()
+     root["mappings"] = mappings
+     root.yaml_set_comment_before_after_key(
+         "mappings", before="\nField mappings: source (dataset) -> target (Vespa)"
+     )
+
+     for col_name, col_type in features.items():
+         mapping = CommentedMap()
+         mapping["source"] = col_name
+         mapping["target"] = col_name  # Default: same name
+
+         # Add type suggestion for list columns
+         suggested = suggest_type(col_name, col_type)
+         dtype_str = get_value_dtype(col_type)
+
+         if suggested:
+             mapping["type"] = suggested
+             mapping.yaml_add_eol_comment(
+                 f"{dtype_str} -> suggested: {suggested}", "type"
+             )
+         else:
+             mapping["type"] = None
+             mapping.yaml_add_eol_comment(dtype_str, "type")
+
+         mappings.append(mapping)
+
+     # Write to file
+     with open(output_path, "w") as f:
+         yaml.dump(root, f)
+
+
+ def init_command(
+     dataset: str,
+     output: Path,
+     split: str = "train",
+     config: str | None = None,
+ ) -> None:
+     """
+     Generate a YAML config by inspecting a HuggingFace dataset schema.
+
+     Main entry point for the init command. Inspects the dataset schema
+     without downloading data and generates a commented YAML config file.
+
+     Args:
+         dataset: HuggingFace dataset name
+         output: Output YAML file path
+         split: Dataset split to inspect (default: "train")
+         config: Dataset config name (required for multi-config datasets)
+
+     Raises:
+         typer.Exit: On error (with helpful message printed to stderr)
+     """
+     # Check for multi-config datasets. The check lives outside the try/except
+     # so that the typer.Exit raised here is not swallowed by it.
+     try:
+         configs = get_dataset_config_names(dataset)
+     except Exception:
+         # Some datasets don't expose configs - that's fine
+         configs = []
+     if len(configs) > 1 and config is None:
+         typer.echo(
+             f"Dataset '{dataset}' has multiple configs: {', '.join(configs)}",
+             err=True,
+         )
+         typer.echo("Please specify one with --config", err=True)
+         raise typer.Exit(1)
+
+
+     # Get dataset builder (no download)
+     try:
+         builder = load_dataset_builder(dataset, config)
+     except Exception as e:
+         typer.echo(f"Error loading dataset: {e}", err=True)
+         raise typer.Exit(1)
+
+     features = builder.info.features
+
+     # Check split exists
+     available_splits = (
+         list(builder.info.splits.keys()) if builder.info.splits else ["train"]
+     )
+     if split not in available_splits:
+         typer.echo(
+             f"Split '{split}' not found. Available: {', '.join(available_splits)}",
+             err=True,
+         )
+         raise typer.Exit(1)
+
+     # Generate YAML config
+     try:
+         generate_config_yaml(
+             features=features,
+             output_path=output,
+             dataset_name=dataset,
+             config=config,
+             split=split,
+         )
+     except Exception as e:
+         typer.echo(f"Error writing file: {e}", err=True)
+         raise typer.Exit(1)
+
+     # Print summary
+     typer.echo(f"Generated config: {output}", err=True)
+     typer.echo(f"  {len(features)} columns mapped", err=True)
+
+     list_cols = [n for n, t in features.items() if is_list_feature(t)]
+     if list_cols:
+         typer.echo(
+             f"  {len(list_cols)} list columns suggested for tensor conversion",
+             err=True,
+         )
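
For orientation, a sketch of driving the generator above directly from Python, together with the rough shape of the YAML it writes. The column names are made up, the import path assumes the module name shown in the file header (hf2vespa/init.py), and the exact comment placement depends on how ruamel.yaml renders the commented map:

    from pathlib import Path

    from datasets import Sequence, Value

    from hf2vespa.init import generate_config_yaml

    features = {"id": Value("string"), "embedding": Sequence(feature=Value("float32"))}
    generate_config_yaml(features, Path("hf2vespa.yaml"), dataset_name="my-dataset")

    # Roughly produces (EOL comments come from yaml_add_eol_comment above):
    #
    # namespace: doc  # Vespa namespace for document IDs
    # doctype: doc  # Vespa document type
    # id_column:  # Column to use as document ID (null = auto-increment)
    #
    # # Field mappings: source (dataset) -> target (Vespa)
    # mappings:
    #   - source: id
    #     target: id
    #     type:  # string
    #   - source: embedding
    #     target: embedding
    #     type: tensor  # Sequence[float32] -> suggested: tensor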
hf2vespa/pipeline.py ADDED
@@ -0,0 +1,198 @@
+ """Streaming pipeline for HuggingFace datasets to Vespa format."""
+
+ from typing import Generator
+
+ from datasets import load_dataset
+
+ from .config import FieldMapping, VespaConfig
+ from .converters import converters
+ from .utils import generate_vespa_id
+
+
+ def stream_dataset(
+     dataset_name: str,
+     split: str,
+     include: list[str] | None = None,
+     rename: dict[str, str] | None = None,
+     config: str | None = None,
+     num_proc: int | None = None,
+ ) -> Generator[dict, None, None]:
+     """
+     Stream a HuggingFace dataset with optional column filtering and renaming.
+
+     Args:
+         dataset_name: HuggingFace dataset identifier (e.g., "glue", "squad")
+         split: Dataset split to stream (e.g., "train", "test", "validation")
+         include: List of column names to include. If None, include all columns.
+         rename: Dictionary mapping old column names to new names. Applied after filtering.
+         config: Dataset configuration name (e.g., "ax" for glue dataset)
+         num_proc: Number of parallel workers for dataset operations. Note: HuggingFace
+             datasets library does not support num_proc with streaming=True. This parameter
+             is accepted for API consistency but is not passed to load_dataset. For parallel
+             processing with streaming datasets, wrap the dataset with PyTorch DataLoader
+             using num_workers > 1 instead.
+
+     Yields:
+         Dictionary records from the dataset
+
+     Examples:
+         >>> records = stream_dataset("glue", "test", include=["premise", "hypothesis"], config="ax")  # doctest: +SKIP
+         >>> first = next(records)  # doctest: +SKIP
+         >>> "premise" in first and "hypothesis" in first  # doctest: +SKIP
+         True
+     """
+     # Load dataset in streaming mode for O(1) memory usage
+     # Note: num_proc is not supported with streaming=True by HuggingFace datasets
+     # library. Parameter is accepted but not used. For parallel streaming, users
+     # should wrap the output with PyTorch DataLoader.
+     dataset = load_dataset(
+         dataset_name,
+         config,
+         split=split,
+         streaming=True,
+     )
+
+     # Apply column filtering if specified
+     if include is not None:
+         dataset = dataset.select_columns(include)
+
+     # Apply column renaming if specified
+     if rename is not None:
+         dataset = dataset.rename_columns(rename)
+
+     # Yield records one at a time
+     yield from dataset
+
+
+ def validate_config(config: VespaConfig, dataset_columns: set[str]) -> None:
+     """
+     Validate that config references only columns that exist in the dataset.
+
+     Args:
+         config: VespaConfig to validate
+         dataset_columns: Set of column names available in the dataset
+
+     Raises:
+         ValueError: If id_column or any mapping source references a non-existent column
+
+     Examples:
+         >>> cfg = VespaConfig(id_column="idx", mappings=[FieldMapping(source="text", target="content")])
+         >>> validate_config(cfg, {"idx", "text", "label"})  # No error
+         >>> validate_config(VespaConfig(id_column="missing"), {"idx", "text"})  # doctest: +SKIP
+         Traceback (most recent call last):
+         ...
+         ValueError: id_column 'missing' not found in dataset. Available columns: idx, text
+     """
+     # Validate id_column exists if set
+     if config.id_column is not None and config.id_column not in dataset_columns:
+         available = ", ".join(sorted(dataset_columns))
+         raise ValueError(
+             f"id_column '{config.id_column}' not found in dataset. Available columns: {available}"
+         )
+
+     # Validate each mapping source exists
+     for mapping in config.mappings:
+         if mapping.source not in dataset_columns:
+             available = ", ".join(sorted(dataset_columns))
+             raise ValueError(
+                 f"Mapping source '{mapping.source}' not found in dataset. Available columns: {available}"
+             )
+
+
+ def apply_mappings(
+     record: dict, mappings: list[FieldMapping], row_num: int = 0
+ ) -> dict:
+     """
+     Apply field mappings and type conversions to a record.
+
+     Args:
+         record: Source record from dataset
+         mappings: List of field mappings to apply
+         row_num: Row number for error context (1-based, default 0 for unspecified)
+
+     Returns:
+         New dictionary with mapped and converted fields
+
+     Raises:
+         ValueError: If field is missing or type conversion fails.
+             Error message includes row number and field name for debugging.
+
+     Examples:
+         >>> mappings = [
+         ...     FieldMapping(source="vec", target="embedding", type="tensor"),
+         ...     FieldMapping(source="name", target="title")
+         ... ]
+         >>> record = {"vec": [1.0, 2.0, 3.0], "name": "test"}
+         >>> result = apply_mappings(record, mappings, row_num=1)
+         >>> result
+         {'embedding': {'values': [1.0, 2.0, 3.0]}, 'title': 'test'}
+
+         >>> # Missing field error includes row and field context
+         >>> apply_mappings({"x": 1}, [FieldMapping(source="missing", target="out")], row_num=42)
+         Traceback (most recent call last):
+         ...
+         ValueError: Row 42: Missing field 'missing'
+     """
+     result = {}
+     for mapping in mappings:
+         try:
+             value = record[mapping.source]
+         except KeyError:
+             raise ValueError(f"Row {row_num}: Missing field '{mapping.source}'")
+
+         # Apply type conversion if specified
+         if mapping.type is not None:
+             try:
+                 value = converters.convert(value, mapping.type)
+             except Exception as e:
+                 raise ValueError(f"Row {row_num}, field '{mapping.source}': {e}")
+
+         result[mapping.target] = value
+
+     return result
+
+
+ def format_vespa_put(
+     records: Generator[dict, None, None],
+     namespace: str,
+     doctype: str,
+     config: VespaConfig | None = None,
+ ) -> Generator[dict, None, None]:
+     """
+     Format dataset records as Vespa PUT operations.
+
+     Args:
+         records: Generator of dataset records
+         namespace: Vespa namespace for document IDs
+         doctype: Vespa document type for document IDs
+         config: Optional VespaConfig for field mappings and custom ID column
+
+     Yields:
+         Vespa PUT operation dictionaries with structure:
+         {"put": "id:namespace:doctype::N", "fields": {...}}
+
+     Examples:
+         >>> records = iter([{"text": "hello"}, {"text": "world"}])
+         >>> vespa = format_vespa_put(records, "test", "doc")
+         >>> doc = next(vespa)
+         >>> doc["put"]
+         'id:test:doc::0'
+         >>> doc["fields"]["text"]
+         'hello'
+     """
+     for idx, record in enumerate(records, start=1):
+         # Determine document ID
+         if config is not None and config.id_column is not None:
+             doc_id = str(record[config.id_column])
+         else:
+             doc_id = str(idx - 1)  # Keep 0-based IDs for compatibility
+
+         vespa_id = generate_vespa_id(namespace, doctype, doc_id)
+
+         # Apply field mappings if configured
+         if config is not None and config.mappings:
+             fields = apply_mappings(record, config.mappings, row_num=idx)
+         else:
+             fields = record
+
+         yield {"put": vespa_id, "fields": fields}
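
Putting the pieces above together, a minimal sketch of a streaming run; the dataset and columns mirror the doctest, while the output file name, namespace, and doctype are illustrative:

    import json

    from hf2vespa.pipeline import format_vespa_put, stream_dataset

    records = stream_dataset("glue", "test", include=["premise", "hypothesis"], config="ax")
    with open("feed.jsonl", "w") as f:
        for op in format_vespa_put(records, namespace="glue", doctype="pair"):
            # One {"put": ..., "fields": ...} operation per line
            f.write(json.dumps(op) + "\n")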
hf2vespa/stats.py ADDED
@@ -0,0 +1,76 @@
+ """Statistics tracking for pipeline processing."""
+
+ from collections import Counter
+ from dataclasses import dataclass, field
+ from enum import Enum
+
+
+ class ErrorMode(str, Enum):
+     """
+     Error handling mode for pipeline processing.
+
+     Values:
+         fail: Stop processing on first error (default)
+         skip: Skip erroring records and continue processing
+
+     Examples:
+         >>> ErrorMode.fail.value
+         'fail'
+         >>> ErrorMode.skip.value
+         'skip'
+     """
+
+     fail = "fail"
+     skip = "skip"
+
+
+ @dataclass
+ class ProcessingStats:
+     """
+     Statistics accumulator for tracking processing success and errors.
+
+     Attributes:
+         counter: Counter tracking success/error counts by type
+
+     Examples:
+         >>> stats = ProcessingStats()
+         >>> stats.record_success()
+         >>> stats.record_success()
+         >>> stats.record_error()
+         >>> stats.total_processed
+         3
+         >>> stats.success_count
+         2
+         >>> stats.error_count
+         1
+     """
+
+     counter: Counter = field(default_factory=Counter)
+
+     def record_success(self) -> None:
+         """Increment the success counter."""
+         self.counter["success"] += 1
+
+     def record_error(self, error_type: str = "error") -> None:
+         """
+         Increment an error counter.
+
+         Args:
+             error_type: Type of error to record (default: "error")
+         """
+         self.counter[error_type] += 1
+
+     @property
+     def total_processed(self) -> int:
+         """Return total number of records processed (success + errors)."""
+         return self.counter.total()
+
+     @property
+     def success_count(self) -> int:
+         """Return number of successfully processed records."""
+         return self.counter["success"]
+
+     @property
+     def error_count(self) -> int:
+         """Return total number of errors (all non-success counts)."""
+         return sum(count for key, count in self.counter.items() if key != "success")
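
The module itself does not show how these counters are driven; a plausible sketch of pairing ProcessingStats and ErrorMode with apply_mappings from pipeline.py (the wrapper function here is illustrative, not the package's actual CLI loop):

    from hf2vespa.pipeline import apply_mappings
    from hf2vespa.stats import ErrorMode, ProcessingStats

    def convert_records(records, mappings, mode=ErrorMode.fail):
        """Apply mappings to each record, skipping or failing on errors per `mode`."""
        stats = ProcessingStats()
        converted = []
        for row_num, record in enumerate(records, start=1):
            try:
                converted.append(apply_mappings(record, mappings, row_num=row_num))
                stats.record_success()
            except ValueError:
                if mode is ErrorMode.fail:
                    raise
                stats.record_error("conversion_error")
        return converted, stats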
hf2vespa/utils.py ADDED
@@ -0,0 +1,57 @@
+ """Utility functions for Vespa feed generation."""
+
+ import os
+ import sys
+ from contextlib import contextmanager
+
+
+ @contextmanager
+ def handle_broken_pipe():
+     """
+     Context manager to handle SIGPIPE/BrokenPipeError gracefully.
+
+     When piped to tools like `head`, the pipe may close before all output is written.
+     This prevents ugly tracebacks by redirecting remaining output to /dev/null.
+
+     Usage:
+         with handle_broken_pipe():
+             # Write to stdout
+             sys.stdout.buffer.write(data)
+     """
+     try:
+         yield
+         sys.stdout.flush()
+     except BrokenPipeError:
+         # Redirect stdout to /dev/null to suppress further errors
+         devnull = os.open(os.devnull, os.O_WRONLY)
+         os.dup2(devnull, sys.stdout.fileno())
+         sys.exit(0)
+
+
+ def generate_vespa_id(namespace: str, doctype: str, key: str | int) -> str:
+     """
+     Generate a Vespa document ID in the format: id:namespace:doctype::key
+
+     Args:
+         namespace: Vespa namespace (must not contain ':')
+         doctype: Vespa document type (must not contain ':')
+         key: Unique identifier (typically an integer)
+
+     Returns:
+         Formatted Vespa document ID
+
+     Raises:
+         ValueError: If namespace or doctype contains invalid characters
+
+     Examples:
+         >>> generate_vespa_id("myns", "doc", 42)
+         'id:myns:doc::42'
+         >>> generate_vespa_id("test", "article", "abc123")
+         'id:test:article::abc123'
+     """
+     if ":" in namespace:
+         raise ValueError(f"Namespace cannot contain ':' character: {namespace}")
+     if ":" in doctype:
+         raise ValueError(f"Doctype cannot contain ':' character: {doctype}")
+
+     return f"id:{namespace}:{doctype}::{key}"