data_designer_config-0.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. data_designer/config/__init__.py +149 -0
  2. data_designer/config/_version.py +34 -0
  3. data_designer/config/analysis/__init__.py +2 -0
  4. data_designer/config/analysis/column_profilers.py +159 -0
  5. data_designer/config/analysis/column_statistics.py +421 -0
  6. data_designer/config/analysis/dataset_profiler.py +84 -0
  7. data_designer/config/analysis/utils/errors.py +10 -0
  8. data_designer/config/analysis/utils/reporting.py +192 -0
  9. data_designer/config/base.py +69 -0
  10. data_designer/config/column_configs.py +476 -0
  11. data_designer/config/column_types.py +141 -0
  12. data_designer/config/config_builder.py +595 -0
  13. data_designer/config/data_designer_config.py +40 -0
  14. data_designer/config/dataset_builders.py +13 -0
  15. data_designer/config/dataset_metadata.py +18 -0
  16. data_designer/config/default_model_settings.py +129 -0
  17. data_designer/config/errors.py +24 -0
  18. data_designer/config/interface.py +55 -0
  19. data_designer/config/models.py +486 -0
  20. data_designer/config/preview_results.py +41 -0
  21. data_designer/config/processors.py +148 -0
  22. data_designer/config/run_config.py +56 -0
  23. data_designer/config/sampler_constraints.py +52 -0
  24. data_designer/config/sampler_params.py +639 -0
  25. data_designer/config/seed.py +116 -0
  26. data_designer/config/seed_source.py +84 -0
  27. data_designer/config/seed_source_types.py +19 -0
  28. data_designer/config/testing/__init__.py +6 -0
  29. data_designer/config/testing/fixtures.py +308 -0
  30. data_designer/config/utils/code_lang.py +93 -0
  31. data_designer/config/utils/constants.py +365 -0
  32. data_designer/config/utils/errors.py +21 -0
  33. data_designer/config/utils/info.py +94 -0
  34. data_designer/config/utils/io_helpers.py +258 -0
  35. data_designer/config/utils/misc.py +78 -0
  36. data_designer/config/utils/numerical_helpers.py +30 -0
  37. data_designer/config/utils/type_helpers.py +106 -0
  38. data_designer/config/utils/visualization.py +482 -0
  39. data_designer/config/validator_params.py +94 -0
  40. data_designer/errors.py +7 -0
  41. data_designer/lazy_heavy_imports.py +56 -0
  42. data_designer/logging.py +180 -0
  43. data_designer/plugin_manager.py +78 -0
  44. data_designer/plugins/__init__.py +8 -0
  45. data_designer/plugins/errors.py +15 -0
  46. data_designer/plugins/plugin.py +141 -0
  47. data_designer/plugins/registry.py +88 -0
  48. data_designer_config-0.4.0.dist-info/METADATA +75 -0
  49. data_designer_config-0.4.0.dist-info/RECORD +50 -0
  50. data_designer_config-0.4.0.dist-info/WHEEL +4 -0
data_designer/config/preview_results.py
@@ -0,0 +1,41 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING
+
+ from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
+ from data_designer.config.config_builder import DataDesignerConfigBuilder
+ from data_designer.config.dataset_metadata import DatasetMetadata
+ from data_designer.config.utils.visualization import WithRecordSamplerMixin
+ from data_designer.lazy_heavy_imports import pd
+
+ if TYPE_CHECKING:
+     import pandas as pd
+
+
+ class PreviewResults(WithRecordSamplerMixin):
+     def __init__(
+         self,
+         *,
+         config_builder: DataDesignerConfigBuilder,
+         dataset_metadata: DatasetMetadata | None = None,
+         dataset: pd.DataFrame | None = None,
+         analysis: DatasetProfilerResults | None = None,
+         processor_artifacts: dict[str, list[str] | str] | None = None,
+     ):
+         """Creates a new instance with results from a Data Designer preview run.
+
+         Args:
+             config_builder: Data Designer configuration builder.
+             dataset_metadata: Metadata about the generated dataset (e.g., seed column names).
+             dataset: Dataset of the preview run.
+             analysis: Analysis of the preview run.
+             processor_artifacts: Artifacts generated by the processors.
+         """
+         self.dataset: pd.DataFrame | None = dataset
+         self.analysis: DatasetProfilerResults | None = analysis
+         self.processor_artifacts: dict[str, list[str] | str] | None = processor_artifacts
+         self.dataset_metadata: DatasetMetadata | None = dataset_metadata
+         self._config_builder = config_builder
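For context, here is how these preview results might be consumed. This is a minimal sketch, not taken from the package: it assumes `DataDesignerConfigBuilder` can be instantiated without arguments (in practice it is typically populated with column and model configs first) and constructs `PreviewResults` by hand rather than receiving it from a preview run.

```python
import pandas as pd

from data_designer.config.config_builder import DataDesignerConfigBuilder
from data_designer.config.preview_results import PreviewResults

# Assumption: a bare builder is valid here; real usage would configure it first.
builder = DataDesignerConfigBuilder()

results = PreviewResults(
    config_builder=builder,
    dataset=pd.DataFrame({"col1": ["a", "b"], "col2": ["x", "y"]}),
)

# Every result field is keyword-only and optional, defaulting to None.
if results.dataset is not None:
    print(results.dataset.head())
print(results.analysis)             # None: no profiler results attached
print(results.processor_artifacts)  # None: no processors ran
```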
data_designer/config/processors.py
@@ -0,0 +1,148 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ from __future__ import annotations
+
+ import json
+ from abc import ABC
+ from enum import Enum
+ from typing import Any, Literal
+
+ from pydantic import Field, field_validator
+ from typing_extensions import TypeAlias
+
+ from data_designer.config.base import ConfigBase
+ from data_designer.config.dataset_builders import BuildStage
+ from data_designer.config.errors import InvalidConfigError
+
+ SUPPORTED_STAGES = [BuildStage.POST_BATCH]
+
+
+ class ProcessorType(str, Enum):
+     """Enumeration of available processor types.
+
+     Attributes:
+         DROP_COLUMNS: Processor that removes specified columns from the output dataset.
+         SCHEMA_TRANSFORM: Processor that creates a new dataset with a transformed schema using Jinja2 templates.
+     """
+
+     DROP_COLUMNS = "drop_columns"
+     SCHEMA_TRANSFORM = "schema_transform"
+
+
+ class ProcessorConfig(ConfigBase, ABC):
+     """Abstract base class for all processor configuration types.
+
+     Processors are transformations that run before or after columns are generated.
+     They can modify, reshape, or augment the dataset before it's saved.
+
+     Attributes:
+         name: Unique name of the processor, used to identify the processor in results
+             and to name output artifacts on disk.
+         build_stage: The stage at which the processor runs. Currently only `POST_BATCH`
+             is supported, meaning processors run after each batch of columns is generated.
+     """
+
+     name: str = Field(
+         description="The name of the processor, used to identify the processor in the results and to write the artifacts to disk.",
+     )
+     build_stage: BuildStage = Field(
+         default=BuildStage.POST_BATCH,
+         description=f"The stage at which the processor will run. Supported stages: {', '.join(SUPPORTED_STAGES)}",
+     )
+     processor_type: str
+
+     @field_validator("build_stage")
+     def validate_build_stage(cls, v: BuildStage) -> BuildStage:
+         if v not in SUPPORTED_STAGES:
+             raise ValueError(
+                 f"Invalid dataset builder stage: {v}. Only these stages are supported: {', '.join(SUPPORTED_STAGES)}"
+             )
+         return v
+
+
+ def get_processor_config_from_kwargs(processor_type: ProcessorType, **kwargs: Any) -> ProcessorConfig:
+     """Create a processor configuration from a processor type and keyword arguments.
+
+     Args:
+         processor_type: The type of processor to create.
+         **kwargs: Additional keyword arguments passed to the processor constructor.
+
+     Returns:
+         A processor configuration object of the specified type.
+     """
+     if processor_type == ProcessorType.DROP_COLUMNS:
+         return DropColumnsProcessorConfig(**kwargs)
+     elif processor_type == ProcessorType.SCHEMA_TRANSFORM:
+         return SchemaTransformProcessorConfig(**kwargs)
+     raise ValueError(f"Unknown processor type: {processor_type}")
+
+
+ class DropColumnsProcessorConfig(ProcessorConfig):
+     """Configuration for dropping columns from the output dataset.
+
+     This processor removes specified columns from the generated dataset. The dropped
+     columns are saved separately in a `dropped-columns` directory for reference.
+     When this processor is added via the config builder, the corresponding column
+     configs are automatically marked with `drop = True`.
+
+     Alternatively, you can set `drop = True` when configuring a column.
+
+     Attributes:
+         column_names: List of column names to remove from the output dataset.
+         processor_type: Discriminator field, always `ProcessorType.DROP_COLUMNS` for this configuration type.
+     """
+
+     column_names: list[str] = Field(description="List of column names to drop from the output dataset.")
+     processor_type: Literal[ProcessorType.DROP_COLUMNS] = ProcessorType.DROP_COLUMNS
+
+
+ class SchemaTransformProcessorConfig(ProcessorConfig):
+     """Configuration for transforming the dataset schema using Jinja2 templates.
+
+     This processor creates a new dataset with a transformed schema. Each key in the
+     template becomes a column in the output, and values are Jinja2 templates that
+     can reference any column in the batch. The transformed dataset is written to
+     a `processors-outputs/{processor_name}/` directory alongside the main dataset.
+
+     Attributes:
+         template: Dictionary defining the output schema. Keys are new column names,
+             values are Jinja2 templates (strings, lists, or nested structures).
+             Must be JSON-serializable.
+         processor_type: Discriminator field, always `ProcessorType.SCHEMA_TRANSFORM` for this configuration type.
+     """
+
+     template: dict[str, Any] = Field(
+         ...,
+         description="""
+         Dictionary specifying columns and templates to use in the new dataset with transformed schema.
+
+         Each key is a new column name, and each value is an object containing Jinja2 templates - for instance, a string or a list of strings.
+         Values must be JSON-serializable.
+
+         Example:
+
+         ```python
+         template = {
+             "list_of_strings": ["{{ col1 }}", "{{ col2 }}"],
+             "uppercase_string": "{{ col1 | upper }}",
+             "lowercase_string": "{{ col2 | lower }}",
+         }
+         ```
+
+         The above templates will create a new dataset with three columns: "list_of_strings", "uppercase_string", and "lowercase_string".
+         References to columns "col1" and "col2" in the templates will be replaced with the actual values of the columns in the dataset.
+         """,
+     )
+     processor_type: Literal[ProcessorType.SCHEMA_TRANSFORM] = ProcessorType.SCHEMA_TRANSFORM
+
+     @field_validator("template")
+     def validate_template(cls, v: dict[str, Any]) -> dict[str, Any]:
+         try:
+             json.dumps(v)
+         except TypeError as e:
+             raise InvalidConfigError("Template must be JSON serializable") from e
+         return v
+
+
+ ProcessorConfigT: TypeAlias = DropColumnsProcessorConfig | SchemaTransformProcessorConfig
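The docstring's template example translates directly into working configs. Below is a minimal sketch using only names defined in this file; the builder-side behavior (e.g., automatically marking column configs with `drop = True`) is out of scope here.

```python
from data_designer.config.processors import (
    DropColumnsProcessorConfig,
    ProcessorType,
    SchemaTransformProcessorConfig,
    get_processor_config_from_kwargs,
)

# Direct construction: remove two intermediate columns from the output dataset.
drop = DropColumnsProcessorConfig(name="drop-intermediates", column_names=["col1", "col2"])

# Via the factory, reusing the template example from the docstring above.
transform = get_processor_config_from_kwargs(
    ProcessorType.SCHEMA_TRANSFORM,
    name="reshape",
    template={
        "list_of_strings": ["{{ col1 }}", "{{ col2 }}"],
        "uppercase_string": "{{ col1 | upper }}",
        "lowercase_string": "{{ col2 | lower }}",
    },
)
assert transform.processor_type is ProcessorType.SCHEMA_TRANSFORM

# A template that json.dumps cannot handle is rejected at validation time.
try:
    SchemaTransformProcessorConfig(name="bad", template={"x": object()})
except Exception as exc:  # InvalidConfigError, possibly wrapped by pydantic
    print(type(exc).__name__)
```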
data_designer/config/run_config.py
@@ -0,0 +1,56 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ from __future__ import annotations
+
+ from pydantic import Field, model_validator
+ from typing_extensions import Self
+
+ from data_designer.config.base import ConfigBase
+
+
+ class RunConfig(ConfigBase):
+     """Runtime configuration for dataset generation.
+
+     Groups configuration options that control generation behavior but aren't
+     part of the dataset configuration itself.
+
+     Attributes:
+         disable_early_shutdown: If True, disables the executor's early-shutdown behavior entirely.
+             Generation will continue regardless of error rate, and the early-shutdown exception
+             will never be raised. Error counts and summaries are still collected. Default is False.
+         shutdown_error_rate: Error rate threshold (0.0-1.0) that triggers early shutdown when
+             early shutdown is enabled. Default is 0.5.
+         shutdown_error_window: Minimum number of completed tasks before error rate
+             monitoring begins. Must be >= 0. Default is 10.
+         buffer_size: Number of records to process in each batch during dataset generation.
+             A batch is processed end-to-end (column generation, post-batch processors, and writing the batch
+             to artifact storage) before moving on to the next batch. Must be > 0. Default is 1000.
+         non_inference_max_parallel_workers: Maximum number of worker threads used for non-inference
+             cell-by-cell generators. Must be >= 1. Default is 4.
+         max_conversation_restarts: Maximum number of full conversation restarts permitted when
+             generation tasks call `ModelFacade.generate(...)`. Must be >= 0. Default is 5.
+         max_conversation_correction_steps: Maximum number of correction rounds permitted within a
+             single conversation when generation tasks call `ModelFacade.generate(...)`. Must be >= 0.
+             Default is 0.
+         debug_override_save_all_column_traces: If True, overrides per-column `with_trace` settings
+             and includes `__trace` columns for ALL LLM generations, containing the full ordered
+             message history (system/user/assistant) for the final generation attempt.
+             Useful for debugging. Default is False.
+     """
+
+     disable_early_shutdown: bool = False
+     shutdown_error_rate: float = Field(default=0.5, ge=0.0, le=1.0)
+     shutdown_error_window: int = Field(default=10, ge=0)
+     buffer_size: int = Field(default=1000, gt=0)
+     non_inference_max_parallel_workers: int = Field(default=4, ge=1)
+     max_conversation_restarts: int = Field(default=5, ge=0)
+     max_conversation_correction_steps: int = Field(default=0, ge=0)
+     debug_override_save_all_column_traces: bool = False
+
+     @model_validator(mode="after")
+     def normalize_shutdown_settings(self) -> Self:
+         """Normalize shutdown settings for compatibility."""
+         if self.disable_early_shutdown:
+             self.shutdown_error_rate = 1.0
+         return self
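A short sketch of how these knobs compose, including the `normalize_shutdown_settings` behavior; it assumes `ConfigBase` behaves like a standard pydantic model (validating on construction), and the printed values follow directly from the defaults above.

```python
from data_designer.config.run_config import RunConfig

# Defaults: early shutdown triggers at a 50% error rate once 10 tasks have completed.
run = RunConfig()
print(run.shutdown_error_rate, run.shutdown_error_window)  # 0.5 10

# Disabling early shutdown also normalizes the threshold to 1.0 (see the model validator).
no_shutdown = RunConfig(disable_early_shutdown=True)
print(no_shutdown.shutdown_error_rate)  # 1.0

# Bounds are enforced by pydantic: buffer_size must be greater than zero.
try:
    RunConfig(buffer_size=0)
except Exception as exc:
    print(type(exc).__name__)  # ValidationError
```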
data_designer/config/sampler_constraints.py
@@ -0,0 +1,52 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+ from enum import Enum
+
+ from typing_extensions import TypeAlias
+
+ from data_designer.config.base import ConfigBase
+
+
+ class ConstraintType(str, Enum):
+     SCALAR_INEQUALITY = "scalar_inequality"
+     COLUMN_INEQUALITY = "column_inequality"
+
+
+ class InequalityOperator(str, Enum):
+     LT = "lt"
+     LE = "le"
+     GT = "gt"
+     GE = "ge"
+
+
+ class Constraint(ConfigBase, ABC):
+     target_column: str
+
+     @property
+     @abstractmethod
+     def constraint_type(self) -> ConstraintType: ...
+
+
+ class ScalarInequalityConstraint(Constraint):
+     rhs: float
+     operator: InequalityOperator
+
+     @property
+     def constraint_type(self) -> ConstraintType:
+         return ConstraintType.SCALAR_INEQUALITY
+
+
+ class ColumnInequalityConstraint(Constraint):
+     rhs: str
+     operator: InequalityOperator
+
+     @property
+     def constraint_type(self) -> ConstraintType:
+         return ConstraintType.COLUMN_INEQUALITY
+
+
+ ColumnConstraintT: TypeAlias = ScalarInequalityConstraint | ColumnInequalityConstraint
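To close out, a small sketch constructing both constraint kinds. Note that this module only defines the configuration schema; the row-wise interpretation suggested in the comments (comparing a target column against a scalar, or against another column) is an assumption about how the sampler consumes these configs.

```python
from data_designer.config.sampler_constraints import (
    ColumnInequalityConstraint,
    ConstraintType,
    InequalityOperator,
    ScalarInequalityConstraint,
)

# Assumed semantics: sampled "age" values should satisfy age < 120.
scalar = ScalarInequalityConstraint(
    target_column="age",
    rhs=120.0,
    operator=InequalityOperator.LT,
)

# Assumed semantics: each row's "start_date" should satisfy start_date <= end_date.
column = ColumnInequalityConstraint(
    target_column="start_date",
    rhs="end_date",
    operator=InequalityOperator.LE,
)

assert scalar.constraint_type is ConstraintType.SCALAR_INEQUALITY
assert column.constraint_type is ConstraintType.COLUMN_INEQUALITY
```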