PyPI - hf2vespa - Versions diffs - 0.1.0__py3-none-any.whl - Mend

hf2vespa 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

hf2vespa/__init__.py +1 -0
hf2vespa/__main__.py +4 -0
hf2vespa/cli.py +465 -0
hf2vespa/config.py +131 -0
hf2vespa/converters.py +648 -0
hf2vespa/init.py +351 -0
hf2vespa/pipeline.py +198 -0
hf2vespa/stats.py +76 -0
hf2vespa/utils.py +57 -0
hf2vespa-0.1.0.dist-info/METADATA +820 -0
hf2vespa-0.1.0.dist-info/RECORD +14 -0
hf2vespa-0.1.0.dist-info/WHEEL +5 -0
hf2vespa-0.1.0.dist-info/entry_points.txt +2 -0
hf2vespa-0.1.0.dist-info/top_level.txt +1 -0

hf2vespa/converters.py ADDED Viewed

@@ -0,0 +1,648 @@
+"""Type converter registry for field transformations."""
+import struct
+from functools import lru_cache
+from typing import Any, Callable
class ConverterRegistry:
    """
    Registry for type converters used in field mapping.

    Converters are callables that take one value in dataset format and
    return it in Vespa format (e.g., list to tensor). Every registry owns
    its name -> callable table and its own bounded lookup cache.

    Examples:
        >>> registry = ConverterRegistry()
        >>> registry.register("double", lambda x: float(x) * 2)
        >>> registry.convert(21, "double")
        42.0
        >>> registry.has("double")
        True
    """

    def __init__(self) -> None:
        """Initialize an empty converter registry with a private lookup cache."""
        self._converters: dict[str, Callable[[Any], Any]] = {}
        # The cache is created per instance instead of decorating the method
        # at class level: a class-level lru_cache keys on ``self``, keeps every
        # registry alive for as long as the class exists, and shares its 16
        # slots and its statistics between all registries.
        self._get_converter = lru_cache(maxsize=16)(self._lookup_converter)

    def _lookup_converter(self, type_name: str) -> Callable[[Any], Any]:
        """
        Resolve a converter by name straight from the registration table.

        ``self._get_converter`` is the cached wrapper around this method.

        Args:
            type_name: Name of the converter to look up

        Returns:
            Converter function

        Raises:
            ValueError: If converter is not registered
        """
        try:
            return self._converters[type_name]
        except KeyError:
            available = ", ".join(sorted(self._converters))
            raise ValueError(
                f"Unknown type converter '{type_name}'. Available converters: {available}"
            ) from None

    def register(self, name: str, func: Callable[[Any], Any]) -> None:
        """
        Register a type converter function, replacing any previous one.

        Args:
            name: Name of the converter (e.g., "tensor", "string")
            func: Converter function that takes a value and returns transformed value

        Examples:
            >>> registry = ConverterRegistry()
            >>> registry.register("uppercase", str.upper)
            >>> registry.convert("hello", "uppercase")
            'HELLO'
        """
        self._converters[name] = func
        # Drop cached lookups so a re-registered name cannot keep resolving to
        # the function it was bound to before. This also resets cache_info().
        self._get_converter.cache_clear()

    def convert(self, value: Any, type_name: str) -> Any:
        """
        Apply a registered converter to a value.

        Args:
            value: Value to convert
            type_name: Name of the converter to apply

        Returns:
            Converted value

        Raises:
            ValueError: If converter is not registered

        Examples:
            >>> registry = ConverterRegistry()
            >>> registry.register("str", str)
            >>> registry.convert(42, "str")
            '42'
        """
        converter_fn = self._get_converter(type_name)
        return converter_fn(value)

    def has(self, type_name: str) -> bool:
        """
        Check if a converter is registered.

        Args:
            type_name: Name of the converter to check

        Returns:
            True if converter is registered, False otherwise

        Examples:
            >>> registry = ConverterRegistry()
            >>> registry.has("nonexistent")
            False
            >>> registry.register("exists", str)
            >>> registry.has("exists")
            True
        """
        return type_name in self._converters

    def cache_info(self):
        """
        Return cache statistics for this registry's converter lookups.

        The counters cover lookups made since the most recent ``register``
        call, because registering clears the cache.

        Returns:
            CacheInfo named tuple with hits, misses, maxsize, currsize

        Examples:
            >>> registry = ConverterRegistry()
            >>> registry.register("str", str)
            >>> _ = registry.convert(1, "str")
            >>> info = registry.cache_info()
            >>> info.misses >= 1
            True
        """
        return self._get_converter.cache_info()
def list_to_tensor(value: Any) -> dict[str, list]:
    """
    Wrap a list of numbers in Vespa's indexed tensor notation.

    The input list is placed in the result as-is (no copy is made).

    Args:
        value: Non-empty list whose items are all int or float

    Returns:
        Dictionary with a single "values" key holding the input list

    Raises:
        ValueError: If input is not a list, is empty, or has a non-numeric item

    Examples:
        >>> list_to_tensor([1.0, 2.0, 3.0])
        {'values': [1.0, 2.0, 3.0]}
        >>> list_to_tensor([1, 2, 3])
        {'values': [1, 2, 3]}
    """
    if not isinstance(value, list):
        kind = type(value).__name__
        raise ValueError(f"Expected list for tensor conversion, got {kind}")
    if not value:
        raise ValueError("Cannot convert empty list to tensor")

    # Locate the first offending element, if there is one
    offender = next(
        (
            (position, element)
            for position, element in enumerate(value)
            if not isinstance(element, (int, float))
        ),
        None,
    )
    if offender is not None:
        position, element = offender
        raise ValueError(
            f"Tensor values must be numeric (int or float), got {type(element).__name__} at index {position}"
        )
    return {"values": value}
def dict_to_position(value: Any) -> dict[str, float]:
    """
    Convert a lat/lng dictionary to Vespa position format.

    Args:
        value: Dictionary with "lat" and "lng" keys containing numeric values

    Returns:
        Dictionary with "lat" and "lng" as floats

    Raises:
        ValueError: If input is not a dict, a key is missing, a coordinate is
                   not numeric, or a coordinate is out of range (NaN counts
                   as out of range)

    Examples:
        >>> dict_to_position({"lat": 37.4, "lng": -122.0})
        {'lat': 37.4, 'lng': -122.0}
        >>> dict_to_position({"lat": 90, "lng": 180})
        {'lat': 90.0, 'lng': 180.0}
    """
    if not isinstance(value, dict):
        raise ValueError(
            f"Expected dict for position conversion, got {type(value).__name__}"
        )
    for required in ("lat", "lng"):
        if required not in value:
            raise ValueError(f"Position dict must contain '{required}' key")
    lat = value["lat"]
    lng = value["lng"]
    if not isinstance(lat, (int, float)):
        raise ValueError(f"Latitude must be numeric, got {type(lat).__name__}")
    if not isinstance(lng, (int, float)):
        raise ValueError(f"Longitude must be numeric, got {type(lng).__name__}")
    # "not (low <= x <= high)" is used instead of "x < low or x > high":
    # every comparison with NaN is False, so only this form rejects NaN.
    if not -90 <= lat <= 90:
        raise ValueError(f"Latitude must be between -90 and 90, got {lat}")
    if not -180 <= lng <= 180:
        raise ValueError(f"Longitude must be between -180 and 180, got {lng}")
    return {"lat": float(lat), "lng": float(lng)}
def dict_to_weightedset(value: Any) -> dict[str, int]:
    """
    Convert a key-weight dictionary to Vespa weighted set format.

    Float weights are converted with ``int()``, i.e. truncated toward zero.

    Args:
        value: Dictionary with keys and numeric weights

    Returns:
        Dictionary with stringified keys and integer weights

    Raises:
        ValueError: If input is not a dict, or a weight is not numeric or
                   not finite (NaN, +/-infinity)

    Examples:
        >>> dict_to_weightedset({"tag1": 10, "tag2": 5})
        {'tag1': 10, 'tag2': 5}
        >>> dict_to_weightedset({1: 100, 2: 50})
        {'1': 100, '2': 50}
    """
    if not isinstance(value, dict):
        raise ValueError(
            f"Expected dict for weightedset conversion, got {type(value).__name__}"
        )
    result: dict[str, int] = {}
    for key, weight in value.items():
        if not isinstance(weight, (int, float)):
            raise ValueError(
                f"Weight for key '{key}' must be numeric, got {type(weight).__name__}"
            )
        try:
            result[str(key)] = int(weight)
        except (OverflowError, ValueError) as exc:
            # int() raises OverflowError for infinities and ValueError for
            # NaN; report both as ValueError and name the offending key.
            raise ValueError(
                f"Weight for key '{key}' must be finite, got {weight}"
            ) from exc
    return result
def dict_to_map(value: Any) -> dict[str, Any]:
    """
    Produce a Vespa map from a dictionary by stringifying its keys.

    Values are carried over untouched. Keys that stringify to the same text
    collapse into one entry holding the later value.

    Args:
        value: Dictionary with any keys and values

    Returns:
        New dictionary whose keys are ``str(key)`` of the input keys

    Raises:
        ValueError: If input is not a dict

    Examples:
        >>> dict_to_map({"key": "value"})
        {'key': 'value'}
        >>> dict_to_map({1: "one", 2: "two"})
        {'1': 'one', '2': 'two'}
    """
    if isinstance(value, dict):
        # Pair each stringified key with its value, in insertion order
        return dict(zip(map(str, value), value.values()))
    kind = type(value).__name__
    raise ValueError(f"Expected dict for map conversion, got {kind}")
+def list_to_tensor_int8_hex(value: Any, row_num: int | None = None) -> dict[str, str]:
+    """
+    Convert a list of integers to Vespa hex-encoded int8 tensor format.
+    Each int8 value is encoded as 2 hex characters (big-endian).
+    Args:
+        value: List of integers in range -128 to 127
+        row_num: Optional row number for error context
+    Returns:
+        Dictionary with "values" key containing hex string
+    Raises:
+        ValueError: If input is not a list, is empty, or contains out-of-range values
+    Examples:
+        >>> list_to_tensor_int8_hex([11, 34, 3])
+        {'values': '0b2203'}
+        >>> list_to_tensor_int8_hex([0, 127, -128])
+        {'values': '007f80'}
+    """
+    if not isinstance(value, list):
+        raise ValueError(
+            f"Expected list for tensor conversion, got {type(value).__name__}"
+        )
+    if len(value) == 0:
+        raise ValueError("Cannot convert empty list to tensor")
+    hex_parts = []
+    for i, v in enumerate(value):
+        if not isinstance(v, int) or v < -128 or v > 127:
+            row_context = f" (row {row_num})" if row_num is not None else ""
+            raise ValueError(
+                f"Value {v} at index {i}{row_context} outside int8 range (-128 to 127)"
+            )
+        # Pack as signed byte and convert to hex
+        hex_parts.append(struct.pack(">b", v).hex())
+    return {"values": "".join(hex_parts)}
+def list_to_tensor_bfloat16_hex(
+    value: Any, row_num: int | None = None
+) -> dict[str, str]:
+    """
+    Convert a list of floats to Vespa hex-encoded bfloat16 tensor format.
+    Each bfloat16 value is encoded as 4 hex characters (upper 2 bytes of float32).
+    Precision is truncated, not rounded.
+    Args:
+        value: List of numeric values (int or float)
+        row_num: Optional row number for error context (unused, for API consistency)
+    Returns:
+        Dictionary with "values" key containing hex string
+    Raises:
+        ValueError: If input is not a list or is empty
+    Examples:
+        >>> list_to_tensor_bfloat16_hex([1.0, -1.0, 0.0])
+        {'values': '3f80bf800000'}
+        >>> list_to_tensor_bfloat16_hex([3.14159])
+        {'values': '4049'}
+    """
+    if not isinstance(value, list):
+        raise ValueError(
+            f"Expected list for tensor conversion, got {type(value).__name__}"
+        )
+    if len(value) == 0:
+        raise ValueError("Cannot convert empty list to tensor")
+    hex_parts = []
+    for v in value:
+        # Pack as float32, take upper 2 bytes for bfloat16
+        packed = struct.pack(">f", float(v))
+        hex_parts.append(packed[:2].hex())
+    return {"values": "".join(hex_parts)}
+def list_to_tensor_float32_hex(
+    value: Any, row_num: int | None = None
+) -> dict[str, str]:
+    """
+    Convert a list of floats to Vespa hex-encoded float32 tensor format.
+    Each float32 value is encoded as 8 hex characters (big-endian IEEE 754).
+    Args:
+        value: List of numeric values (int or float)
+        row_num: Optional row number for error context (unused, for API consistency)
+    Returns:
+        Dictionary with "values" key containing hex string
+    Raises:
+        ValueError: If input is not a list or is empty
+    Examples:
+        >>> list_to_tensor_float32_hex([1.0, 2.0, 3.0])
+        {'values': '3f8000004000000040400000'}
+        >>> list_to_tensor_float32_hex([3.14159])
+        {'values': '40490fdb'}
+    """
+    if not isinstance(value, list):
+        raise ValueError(
+            f"Expected list for tensor conversion, got {type(value).__name__}"
+        )
+    if len(value) == 0:
+        raise ValueError("Cannot convert empty list to tensor")
+    hex_parts = []
+    for v in value:
+        # Pack as float32 (big-endian) = 4 bytes = 8 hex chars
+        hex_parts.append(struct.pack(">f", float(v)).hex())
+    return {"values": "".join(hex_parts)}
+def list_to_tensor_float64_hex(
+    value: Any, row_num: int | None = None
+) -> dict[str, str]:
+    """
+    Convert a list of floats to Vespa hex-encoded float64 tensor format.
+    Each float64 value is encoded as 16 hex characters (big-endian IEEE 754 double).
+    Args:
+        value: List of numeric values (int or float)
+        row_num: Optional row number for error context (unused, for API consistency)
+    Returns:
+        Dictionary with "values" key containing hex string
+    Raises:
+        ValueError: If input is not a list or is empty
+    Examples:
+        >>> list_to_tensor_float64_hex([1.0, 2.0])
+        {'values': '3ff00000000000004000000000000000'}
+        >>> list_to_tensor_float64_hex([3.141592653589793])
+        {'values': '400921fb54442d18'}
+    """
+    if not isinstance(value, list):
+        raise ValueError(
+            f"Expected list for tensor conversion, got {type(value).__name__}"
+        )
+    if len(value) == 0:
+        raise ValueError("Cannot convert empty list to tensor")
+    hex_parts = []
+    for v in value:
+        # Pack as float64 (big-endian) = 8 bytes = 16 hex chars
+        hex_parts.append(struct.pack(">d", float(v)).hex())
+    return {"values": "".join(hex_parts)}
+def dict_to_sparse_tensor(value: Any, row_num: int | None = None) -> dict[str, list]:
+    """
+    Convert dictionary to Vespa sparse tensor format (cells notation).
+    For tensors with single mapped dimension: tensor<float>(key{})
+    Output format: {"cells": [{"address": {"key": "..."}, "value": ...}, ...]}
+    Args:
+        value: Dict with string keys and numeric values
+        row_num: Optional row number for error context
+    Returns:
+        Dict with "cells" key containing list of address-value pairs
+    Raises:
+        ValueError: If input is not a dict, is empty, or values are non-numeric
+    Examples:
+        >>> dict_to_sparse_tensor({"word1": 0.5, "word2": 0.3})
+        {'cells': [{'address': {'key': 'word1'}, 'value': 0.5},
+                   {'address': {'key': 'word2'}, 'value': 0.3}]}
+    """
+    if not isinstance(value, dict):
+        raise ValueError(
+            f"Expected dict for sparse tensor conversion, got {type(value).__name__}"
+        )
+    if len(value) == 0:
+        raise ValueError("Cannot convert empty dict to sparse tensor")
+    cells = []
+    for key, val in value.items():
+        if not isinstance(val, (int, float)):
+            row_context = f" (row {row_num})" if row_num is not None else ""
+            raise ValueError(
+                f"Value for key '{key}'{row_context} must be numeric, got {type(val).__name__}"
+            )
+        cells.append({"address": {"key": str(key)}, "value": float(val)})
+    return {"cells": cells}
+def dict_to_mixed_tensor(value: Any, row_num: int | None = None) -> dict[str, dict]:
+    """
+    Convert nested dict to Vespa mixed tensor format (blocks notation).
+    For tensors with one mapped and indexed dimensions: tensor<float>(word{},x[N])
+    Output format: {"blocks": {"key1": [...], "key2": [...], ...}}
+    Args:
+        value: Dict with string keys and list values (all lists must be same length)
+        row_num: Optional row number for error context
+    Returns:
+        Dict with "blocks" key containing dict of key->array mappings
+    Raises:
+        ValueError: If input invalid or dimension lengths inconsistent
+    Examples:
+        >>> dict_to_mixed_tensor({"word1": [1.0, 2.0], "word2": [3.0, 4.0]})
+        {'blocks': {'word1': [1.0, 2.0], 'word2': [3.0, 4.0]}}
+    """
+    if not isinstance(value, dict):
+        raise ValueError(
+            f"Expected dict for mixed tensor conversion, got {type(value).__name__}"
+        )
+    if len(value) == 0:
+        raise ValueError("Cannot convert empty dict to mixed tensor")
+    blocks = {}
+    first_key = None
+    first_shape = None
+    for key, val in value.items():
+        if not isinstance(val, list):
+            raise ValueError(
+                f"Value for key '{key}' must be list, got {type(val).__name__}"
+            )
+        if len(val) == 0:
+            raise ValueError(f"Value for key '{key}' cannot be empty list")
+        # Track first key shape for consistency checking
+        if first_key is None:
+            first_key = key
+            first_shape = len(val)
+        else:
+            current_shape = len(val)
+            if current_shape != first_shape:
+                row_ctx = f" (row {row_num})" if row_num is not None else ""
+                raise ValueError(
+                    f"Mixed tensor dimension mismatch{row_ctx}: key '{first_key}' has shape [{first_shape}], "
+                    f"but key '{key}' has shape [{current_shape}]"
+                )
+        # Validate all items in list are numeric
+        for i, item in enumerate(val):
+            if not isinstance(item, (int, float)):
+                raise ValueError(
+                    f"Value at index {i} for key '{key}' must be numeric, got {type(item).__name__}"
+                )
+        blocks[str(key)] = val
+    return {"blocks": blocks}
+def dict_to_mixed_tensor_hex(
+    value: Any, cell_type: str = "float32", row_num: int | None = None
+) -> dict[str, dict]:
+    """
+    Convert nested dict to Vespa mixed tensor format with hex-encoded blocks.
+    Combines sparse mapped dimension with dense hex-encoded indexed dimensions.
+    Uses Phase 9 hex encoders for the dense subspace.
+    Args:
+        value: Dict with string keys and list values (all lists must be same length)
+        cell_type: Cell type for hex encoding (int8, bfloat16, float32, float64)
+        row_num: Optional row number for error context
+    Returns:
+        Dict with "blocks" key containing dict of key->hex-string mappings
+    Raises:
+        ValueError: If input invalid, dimensions inconsistent, or values out of range
+    Examples:
+        >>> dict_to_mixed_tensor_hex({"w1": [11, 34, 3], "w2": [-124, 5, -1]}, cell_type="int8")
+        {'blocks': {'w1': '0b2203', 'w2': '8405ff'}}
+    """
+    if not isinstance(value, dict):
+        raise ValueError(
+            f"Expected dict for mixed tensor conversion, got {type(value).__name__}"
+        )
+    if len(value) == 0:
+        raise ValueError("Cannot convert empty dict to mixed tensor")
+    # Create encoder lookup
+    hex_encoders = {
+        "int8": list_to_tensor_int8_hex,
+        "bfloat16": list_to_tensor_bfloat16_hex,
+        "float32": list_to_tensor_float32_hex,
+        "float64": list_to_tensor_float64_hex,
+    }
+    if cell_type not in hex_encoders:
+        raise ValueError(
+            f"Unknown cell type '{cell_type}'. Must be one of: {', '.join(sorted(hex_encoders.keys()))}"
+        )
+    encoder = hex_encoders[cell_type]
+    blocks = {}
+    first_key = None
+    first_shape = None
+    for key, val in value.items():
+        if not isinstance(val, list):
+            raise ValueError(
+                f"Value for key '{key}' must be list, got {type(val).__name__}"
+            )
+        if len(val) == 0:
+            raise ValueError(f"Value for key '{key}' cannot be empty list")
+        # Track first key shape for consistency checking
+        if first_key is None:
+            first_key = key
+            first_shape = len(val)
+        else:
+            current_shape = len(val)
+            if current_shape != first_shape:
+                row_ctx = f" (row {row_num})" if row_num is not None else ""
+                raise ValueError(
+                    f"Mixed tensor dimension mismatch{row_ctx}: key '{first_key}' has shape [{first_shape}], "
+                    f"but key '{key}' has shape [{current_shape}]"
+                )
+        # Encode the block using Phase 9 hex encoder
+        try:
+            encoded = encoder(val, row_num=row_num)
+            blocks[str(key)] = encoded["values"]
+        except ValueError as e:
+            raise ValueError(f"Error encoding key '{key}': {e}") from e
+    return {"blocks": blocks}
# Module-level registry instance exposed as ``converters``
converters = ConverterRegistry()

# Built-in converters, registered in the order listed below
for _converter_name, _converter_func in (
    ("tensor", list_to_tensor),
    ("string", str),
    ("int", int),
    ("float", float),
    ("position", dict_to_position),
    ("weightedset", dict_to_weightedset),
    ("map", dict_to_map),
    ("tensor_int8_hex", list_to_tensor_int8_hex),
    ("tensor_bfloat16_hex", list_to_tensor_bfloat16_hex),
    ("tensor_float32_hex", list_to_tensor_float32_hex),
    ("tensor_float64_hex", list_to_tensor_float64_hex),
    ("sparse_tensor", dict_to_sparse_tensor),
    ("mixed_tensor", dict_to_mixed_tensor),
    ("mixed_tensor_hex", dict_to_mixed_tensor_hex),
):
    converters.register(_converter_name, _converter_func)
# Keep the loop variables out of the module namespace
del _converter_name, _converter_func