data_designer_config-0.4.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (50)
  1. data_designer/config/__init__.py +149 -0
  2. data_designer/config/_version.py +34 -0
  3. data_designer/config/analysis/__init__.py +2 -0
  4. data_designer/config/analysis/column_profilers.py +159 -0
  5. data_designer/config/analysis/column_statistics.py +421 -0
  6. data_designer/config/analysis/dataset_profiler.py +84 -0
  7. data_designer/config/analysis/utils/errors.py +10 -0
  8. data_designer/config/analysis/utils/reporting.py +192 -0
  9. data_designer/config/base.py +69 -0
  10. data_designer/config/column_configs.py +476 -0
  11. data_designer/config/column_types.py +141 -0
  12. data_designer/config/config_builder.py +595 -0
  13. data_designer/config/data_designer_config.py +40 -0
  14. data_designer/config/dataset_builders.py +13 -0
  15. data_designer/config/dataset_metadata.py +18 -0
  16. data_designer/config/default_model_settings.py +129 -0
  17. data_designer/config/errors.py +24 -0
  18. data_designer/config/interface.py +55 -0
  19. data_designer/config/models.py +486 -0
  20. data_designer/config/preview_results.py +41 -0
  21. data_designer/config/processors.py +148 -0
  22. data_designer/config/run_config.py +56 -0
  23. data_designer/config/sampler_constraints.py +52 -0
  24. data_designer/config/sampler_params.py +639 -0
  25. data_designer/config/seed.py +116 -0
  26. data_designer/config/seed_source.py +84 -0
  27. data_designer/config/seed_source_types.py +19 -0
  28. data_designer/config/testing/__init__.py +6 -0
  29. data_designer/config/testing/fixtures.py +308 -0
  30. data_designer/config/utils/code_lang.py +93 -0
  31. data_designer/config/utils/constants.py +365 -0
  32. data_designer/config/utils/errors.py +21 -0
  33. data_designer/config/utils/info.py +94 -0
  34. data_designer/config/utils/io_helpers.py +258 -0
  35. data_designer/config/utils/misc.py +78 -0
  36. data_designer/config/utils/numerical_helpers.py +30 -0
  37. data_designer/config/utils/type_helpers.py +106 -0
  38. data_designer/config/utils/visualization.py +482 -0
  39. data_designer/config/validator_params.py +94 -0
  40. data_designer/errors.py +7 -0
  41. data_designer/lazy_heavy_imports.py +56 -0
  42. data_designer/logging.py +180 -0
  43. data_designer/plugin_manager.py +78 -0
  44. data_designer/plugins/__init__.py +8 -0
  45. data_designer/plugins/errors.py +15 -0
  46. data_designer/plugins/plugin.py +141 -0
  47. data_designer/plugins/registry.py +88 -0
  48. data_designer_config-0.4.0.dist-info/METADATA +75 -0
  49. data_designer_config-0.4.0.dist-info/RECORD +50 -0
  50. data_designer_config-0.4.0.dist-info/WHEEL +4 -0
data_designer/config/utils/io_helpers.py
@@ -0,0 +1,258 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ from __future__ import annotations
+
+ import json
+ import logging
+ import os
+ from datetime import date, datetime, timedelta
+ from decimal import Decimal
+ from numbers import Number
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any
+
+ import yaml
+
+ from data_designer.config.errors import InvalidFileFormatError, InvalidFilePathError
+ from data_designer.lazy_heavy_imports import np, pd
+
+ if TYPE_CHECKING:
+     import numpy as np
+     import pandas as pd
+
+ logger = logging.getLogger(__name__)
+
+ VALID_DATASET_FILE_EXTENSIONS = {".parquet", ".csv", ".json", ".jsonl"}
+
+
+ def ensure_config_dir_exists(config_dir: Path) -> None:
+     """Create the configuration directory if it doesn't exist.
+
+     Args:
+         config_dir: Directory path to create.
+     """
+     config_dir.mkdir(parents=True, exist_ok=True)
+
+
+ def load_config_file(file_path: Path) -> dict:
+     """Load a YAML configuration file.
+
+     Args:
+         file_path: Path to the YAML file.
+
+     Returns:
+         Parsed YAML content as a dictionary.
+
+     Raises:
+         InvalidFilePathError: If the file doesn't exist.
+         InvalidFileFormatError: If the YAML is malformed.
+         InvalidConfigError: If the file is empty.
+     """
+     from data_designer.config.errors import InvalidConfigError
+
+     if not file_path.exists():
+         raise InvalidFilePathError(f"Configuration file not found: {file_path}")
+
+     try:
+         with open(file_path) as f:
+             content = yaml.safe_load(f)
+
+         if content is None:
+             raise InvalidConfigError(f"Configuration file is empty: {file_path}")
+
+         return content
+
+     except yaml.YAMLError as e:
+         raise InvalidFileFormatError(f"Invalid YAML format in {file_path}: {e}") from e
+
+
+ def save_config_file(file_path: Path, config: dict) -> None:
+     """Save a configuration to a YAML file.
+
+     Args:
+         file_path: Path where the file will be saved.
+         config: Configuration dictionary to save.
+
+     Raises:
+         IOError: If the file cannot be written.
+     """
+     # Ensure the parent directory exists.
+     file_path.parent.mkdir(parents=True, exist_ok=True)
+
+     with open(file_path, "w") as f:
+         yaml.safe_dump(
+             config,
+             f,
+             default_flow_style=False,
+             sort_keys=False,
+             indent=2,
+             allow_unicode=True,
+         )
+
+
+ def read_parquet_dataset(path: Path) -> pd.DataFrame:
+     """Read a parquet dataset from a path.
+
+     Args:
+         path: The path to the parquet dataset; can be either a file or a directory.
+
+     Returns:
+         The parquet dataset as a pandas DataFrame.
+     """
+     try:
+         return pd.read_parquet(path, dtype_backend="pyarrow")
+     except Exception as e:
+         if path.is_dir() and "Unsupported cast" in str(e):
+             logger.warning("Failed to read parquets as folder, falling back to individual files")
+             return pd.concat(
+                 [pd.read_parquet(file, dtype_backend="pyarrow") for file in sorted(path.glob("*.parquet"))],
+                 ignore_index=True,
+             )
+         else:
+             raise
+
+
+ def validate_dataset_file_path(file_path: str | Path, should_exist: bool = True) -> Path:
+     """Validate that a dataset file path has a valid extension and optionally exists.
+
+     Args:
+         file_path: The path to validate, either as a string or Path object.
+         should_exist: If True, verify that the file exists. Defaults to True.
+
+     Returns:
+         The validated path as a Path object.
+
+     Raises:
+         InvalidFilePathError: If the path is not a file.
+         InvalidFileFormatError: If the path does not have a valid extension.
+     """
+     file_path = Path(file_path)
+     if should_exist and not file_path.is_file():
+         raise InvalidFilePathError(f"🛑 Path {file_path} is not a file.")
+     if not file_path.name.lower().endswith(tuple(VALID_DATASET_FILE_EXTENSIONS)):
+         raise InvalidFileFormatError(
+             "🛑 Dataset files must be in parquet, csv, or jsonl/json (orient='records', lines=True) format."
+         )
+     return file_path
+
+
+ def validate_path_contains_files_of_type(path: str | Path, file_extension: str) -> None:
+     """Validate that a path contains files of a specific type.
+
+     Args:
+         path: The path to validate. Can contain wildcards like `*.parquet`.
+         file_extension: The extension of the files to validate (without the dot, e.g., "parquet").
+
+     Raises:
+         InvalidFilePathError: If the path does not contain files of the specified type.
+     """
+     if not any(Path(path).glob(f"*.{file_extension}")):
+         raise InvalidFilePathError(f"🛑 Path {path!r} does not contain files of type {file_extension!r}.")
+
+
+ def smart_load_dataframe(dataframe: str | Path | pd.DataFrame) -> pd.DataFrame:
+     """Load a dataframe from file if a path is given, otherwise return the dataframe.
+
+     Args:
+         dataframe: A path to a file or a pandas DataFrame object.
+
+     Returns:
+         A pandas DataFrame object.
+     """
+     if isinstance(dataframe, pd.DataFrame):
+         return dataframe
+
+     # Get the file extension, normalized without the leading dot so that URL inputs
+     # ("data.csv" -> "csv") and local paths (Path.suffix gives ".csv") compare equally.
+     if isinstance(dataframe, str) and dataframe.startswith("http"):
+         ext = dataframe.split(".")[-1].lower()
+     else:
+         dataframe = Path(dataframe)
+         ext = dataframe.suffix.lower().lstrip(".")
+         if not dataframe.exists():
+             raise FileNotFoundError(f"File not found: {dataframe}")
+
+     # Load the dataframe based on the file extension.
+     if ext == "csv":
+         return pd.read_csv(dataframe)
+     elif ext in ("json", "jsonl"):
+         return pd.read_json(dataframe, lines=True)
+     elif ext == "parquet":
+         return pd.read_parquet(dataframe)
+     else:
+         raise ValueError(f"Unsupported file format: {dataframe}")
+
+
+ def smart_load_yaml(yaml_in: str | Path | dict) -> dict:
+     """Return the yaml config as a dict given flexible input types.
+
+     Args:
+         yaml_in: The config as a dict, yaml string, or yaml file path.
+
+     Returns:
+         The config as a dict.
+     """
+     if isinstance(yaml_in, dict):
+         yaml_out = yaml_in
+     elif isinstance(yaml_in, Path) or (isinstance(yaml_in, str) and os.path.isfile(yaml_in)):
+         with open(yaml_in) as file:
+             yaml_out = yaml.safe_load(file)
+     elif isinstance(yaml_in, str):
+         if yaml_in.endswith((".yaml", ".yml")) and not os.path.isfile(yaml_in):
+             raise FileNotFoundError(f"File not found: {yaml_in}")
+         else:
+             yaml_out = yaml.safe_load(yaml_in)
+     else:
+         raise ValueError(
+             f"'{yaml_in}' is an invalid yaml config format. Valid options are: dict, yaml string, or yaml file path."
+         )
+
+     if not isinstance(yaml_out, dict):
+         raise ValueError(f"Loaded yaml must be a dict. Got {yaml_out}, which is of type {type(yaml_out)}.")
+
+     return yaml_out
+
+
+ def serialize_data(data: dict | list | str | Number, **kwargs) -> str:
+     """Serialize data to a string, converting non-JSON-serializable objects along the way."""
+     if isinstance(data, (dict, list)):
+         return json.dumps(data, ensure_ascii=False, default=_convert_to_serializable, **kwargs)
+     elif isinstance(data, str):
+         return data
+     elif isinstance(data, Number):
+         return str(data)
+     else:
+         raise ValueError(f"Invalid data type: {type(data)}")
+
+
+ def _convert_to_serializable(obj: Any) -> Any:
+     """Convert non-JSON-serializable objects to JSON-serializable Python-native types.
+
+     Raises:
+         TypeError: If the object type is not supported for serialization.
+     """
+     if isinstance(obj, (set, list)):
+         return list(obj)
+     if isinstance(obj, (pd.Series, np.ndarray)):
+         return obj.tolist()
+
+     if pd.isna(obj):
+         return None
+
+     if isinstance(obj, (datetime, date, pd.Timestamp)):
+         return obj.isoformat()
+     if isinstance(obj, timedelta):
+         return obj.total_seconds()
+     if isinstance(obj, (np.datetime64, np.timedelta64)):
+         return str(obj)
+
+     if isinstance(obj, Decimal):
+         return float(obj)
+     if isinstance(obj, (np.integer, np.floating, np.bool_)):
+         return obj.item()
+
+     if isinstance(obj, bytes):
+         return obj.decode("utf-8", errors="replace")
+
+     # Unsupported type
+     raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")
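A minimal usage sketch of the I/O helpers above, assuming local files config.yaml and data.parquet exist (hypothetical names, for illustration only):

    from pathlib import Path

    from data_designer.config.utils.io_helpers import (
        load_config_file,
        serialize_data,
        smart_load_dataframe,
    )

    config = load_config_file(Path("config.yaml"))  # parsed dict; raises InvalidConfigError if the file is empty
    df = smart_load_dataframe("data.parquet")       # accepts a path, a URL, or an existing DataFrame
    print(serialize_data({"num_rows": len(df)}))    # JSON string; numpy/pandas values coerced via _convert_to_serializable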
data_designer/config/utils/misc.py
@@ -0,0 +1,78 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ from __future__ import annotations
+
+ import json
+ from contextlib import contextmanager
+
+ from jinja2 import TemplateSyntaxError, meta
+ from jinja2.sandbox import ImmutableSandboxedEnvironment
+
+ from data_designer.config.utils.errors import UserJinjaTemplateSyntaxError
+
+ REPR_LIST_LENGTH_USE_JSON = 4
+
+
+ def kebab_to_snake(s: str) -> str:
+     return s.replace("-", "_")
+
+
+ @contextmanager
+ def template_error_handler():
+     try:
+         yield
+     except TemplateSyntaxError as exception:
+         exception_string = (
+             f"Encountered a syntax error in the provided Jinja2 template:\n{str(exception)}\n"
+             "For more information on writing Jinja2 templates, "
+             "refer to https://jinja.palletsprojects.com/en/stable/templates"
+         )
+         raise UserJinjaTemplateSyntaxError(exception_string) from exception
+
+
+ def assert_valid_jinja2_template(template: str) -> None:
+     """Raises an error if the template cannot be parsed."""
+     with template_error_handler():
+         meta.find_undeclared_variables(ImmutableSandboxedEnvironment().parse(template))
+
+
+ def can_run_data_designer_locally() -> bool:
+     """Returns True if Data Designer can be run locally, False otherwise."""
+     try:
+         from ... import engine  # noqa: F401, TID252
+     except ImportError:
+         return False
+     return True
+
+
+ def extract_keywords_from_jinja2_template(template: str) -> set[str]:
+     """Extract all keywords from a valid Jinja2 template."""
+     with template_error_handler():
+         ast = ImmutableSandboxedEnvironment().parse(template)
+         keywords = set(meta.find_undeclared_variables(ast))
+
+     return keywords
+
+
+ def json_indent_list_of_strings(column_names: list[str], *, indent: int | str | None = None) -> list[str] | str | None:
+     """Convert a list of column names to a JSON string if the list is long.
+
+     This function helps keep Data Designer's __repr__ output clean and readable.
+
+     Args:
+         column_names: List of column names.
+         indent: Indentation for the JSON string.
+
+     Returns:
+         A list of column names, a JSON string if the list is long, or None if the list is empty.
+     """
+     return (
+         None
+         if len(column_names) == 0
+         else (
+             column_names if len(column_names) < REPR_LIST_LENGTH_USE_JSON else json.dumps(column_names, indent=indent)
+         )
+     )
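A short sketch of the Jinja2 helpers above, with a made-up template for illustration:

    from data_designer.config.utils.misc import (
        assert_valid_jinja2_template,
        extract_keywords_from_jinja2_template,
    )

    template = "Write a {{ tone }} review of {{ product_name }}."
    assert_valid_jinja2_template(template)  # no-op on valid syntax; raises UserJinjaTemplateSyntaxError otherwise
    print(extract_keywords_from_jinja2_template(template))  # {'tone', 'product_name'}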
data_designer/config/utils/numerical_helpers.py
@@ -0,0 +1,30 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ from __future__ import annotations
+
+ import numbers
+ from numbers import Number
+ from typing import Any
+
+ from data_designer.config.utils.constants import REPORTING_PRECISION
+
+
+ def is_int(val: Any) -> bool:
+     return isinstance(val, numbers.Integral)
+
+
+ def is_float(val: Any) -> bool:
+     return isinstance(val, numbers.Real) and not isinstance(val, numbers.Integral)
+
+
+ def prepare_number_for_reporting(
+     value: Number,
+     target_type: type[Number],
+     precision: int = REPORTING_PRECISION,
+ ) -> Number:
+     """Ensure native Python types and round to `precision` decimal digits."""
+     value = target_type(value)
+     if is_float(value):
+         return round(value, precision)
+     return value
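For orientation, how prepare_number_for_reporting behaves on arbitrary example values (REPORTING_PRECISION comes from constants.py):

    from data_designer.config.utils.numerical_helpers import prepare_number_for_reporting

    prepare_number_for_reporting(3.14159265, float, precision=4)  # 3.1416
    prepare_number_for_reporting(7.0, int)                        # 7 (integral values pass through unrounded)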
data_designer/config/utils/type_helpers.py
@@ -0,0 +1,106 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ from __future__ import annotations
+
+ import inspect
+ from enum import Enum
+ from typing import Any, Literal, get_args, get_origin
+
+ from pydantic import BaseModel
+
+ from data_designer.config import sampler_params
+ from data_designer.config.utils.errors import (
+     InvalidDiscriminatorFieldError,
+     InvalidEnumValueError,
+     InvalidTypeUnionError,
+ )
+
+
+ class StrEnum(str, Enum):
+     pass
+
+
+ def create_str_enum_from_discriminated_type_union(
+     enum_name: str,
+     type_union: type,
+     discriminator_field_name: str,
+ ) -> StrEnum:
+     """Create a string enum from a type union.
+
+     The type union is assumed to be a union of configs (Pydantic models) that have a discriminator field,
+     which must be a Literal string type - e.g., Literal["expression"].
+
+     Args:
+         enum_name: Name of the StrEnum.
+         type_union: Type union of configs (Pydantic models).
+         discriminator_field_name: Name of the discriminator field.
+
+     Returns:
+         StrEnum with values being the discriminator field values of the configs in the type union.
+
+     Example:
+         DataDesignerColumnType = create_str_enum_from_discriminated_type_union(
+             enum_name="DataDesignerColumnType",
+             type_union=ColumnConfigT,
+             discriminator_field_name="column_type",
+         )
+     """
+     discriminator_field_values = []
+     for model in type_union.__args__:
+         if not issubclass(model, BaseModel):
+             raise InvalidTypeUnionError(f"🛑 {model} must be a subclass of pydantic.BaseModel.")
+         if discriminator_field_name not in model.model_fields:
+             raise InvalidDiscriminatorFieldError(f"🛑 '{discriminator_field_name}' is not a field of {model}.")
+         if get_origin(model.model_fields[discriminator_field_name].annotation) is not Literal:
+             raise InvalidDiscriminatorFieldError(f"🛑 '{discriminator_field_name}' must be a Literal type.")
+         discriminator_field_values.extend(get_args(model.model_fields[discriminator_field_name].annotation))
+     return StrEnum(enum_name, {v.replace("-", "_").upper(): v for v in set(discriminator_field_values)})
+
+
+ def get_sampler_params() -> dict[str, type[BaseModel]]:
+     """Returns a dictionary of sampler parameter classes."""
+     params_cls_list = [
+         params_cls
+         for cls_name, params_cls in inspect.getmembers(sampler_params, inspect.isclass)
+         if cls_name.endswith("SamplerParams")
+     ]
+
+     params_cls_dict = {}
+
+     for source in sampler_params.SamplerType:
+         source_name = source.value.replace("_", "")
+         # Iterate in reverse order so the shortest match is first.
+         # This is necessary for params that start with the same name.
+         # For example, "bernoulli" and "bernoulli_mixture".
+         params_cls_dict[source.value] = [
+             params_cls
+             for params_cls in reversed(params_cls_list)
+             # Match param type string with parameter class.
+             # For example, "gaussian" -> "GaussianSamplerParams"
+             if source_name == params_cls.__name__.lower()[: len(source_name)]
+             # Take the first match.
+         ][0]
+
+     return params_cls_dict
+
+
88
+ def resolve_string_enum(enum_instance: Any, enum_type: type[Enum]) -> Enum:
89
+ if not issubclass(enum_type, Enum):
90
+ raise InvalidEnumValueError(f"🛑 `enum_type` must be a subclass of Enum. You provided: {enum_type}")
91
+ invalid_enum_value_error = InvalidEnumValueError(
92
+ f"🛑 '{enum_instance}' is not a valid string enum of type {type(enum_type)}. "
93
+ f"Valid options are: {[option.value for option in enum_type]}"
94
+ )
95
+ if isinstance(enum_instance, enum_type):
96
+ return enum_instance
97
+ elif isinstance(enum_instance, str):
98
+ try:
99
+ return enum_type(enum_instance)
100
+ except ValueError:
101
+ raise invalid_enum_value_error
102
+ else:
103
+ raise invalid_enum_value_error
104
+
105
+
106
+ SAMPLER_PARAMS = get_sampler_params()
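A sketch of create_str_enum_from_discriminated_type_union in action; the two column models below are hypothetical stand-ins for the package's real column configs:

    from typing import Literal, Union

    from pydantic import BaseModel

    from data_designer.config.utils.type_helpers import create_str_enum_from_discriminated_type_union

    class ExpressionColumn(BaseModel):
        column_type: Literal["expression"] = "expression"

    class SamplerColumn(BaseModel):
        column_type: Literal["sampler"] = "sampler"

    ColumnType = create_str_enum_from_discriminated_type_union(
        enum_name="ColumnType",
        type_union=Union[ExpressionColumn, SamplerColumn],
        discriminator_field_name="column_type",
    )
    print(sorted(m.value for m in ColumnType))  # ['expression', 'sampler']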