data-designer 0.1.5__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. data_designer/_version.py +2 -2
  2. data_designer/cli/README.md +15 -1
  3. data_designer/cli/commands/download.py +56 -0
  4. data_designer/cli/commands/list.py +4 -18
  5. data_designer/cli/controllers/__init__.py +2 -1
  6. data_designer/cli/controllers/download_controller.py +217 -0
  7. data_designer/cli/controllers/model_controller.py +4 -3
  8. data_designer/cli/forms/field.py +65 -19
  9. data_designer/cli/forms/model_builder.py +251 -44
  10. data_designer/cli/main.py +11 -1
  11. data_designer/cli/repositories/persona_repository.py +88 -0
  12. data_designer/cli/services/__init__.py +2 -1
  13. data_designer/cli/services/download_service.py +97 -0
  14. data_designer/cli/ui.py +131 -0
  15. data_designer/cli/utils.py +34 -0
  16. data_designer/config/analysis/__init__.py +2 -0
  17. data_designer/config/analysis/column_profilers.py +75 -7
  18. data_designer/config/analysis/column_statistics.py +192 -48
  19. data_designer/config/analysis/dataset_profiler.py +23 -5
  20. data_designer/config/analysis/utils/reporting.py +3 -3
  21. data_designer/config/base.py +3 -3
  22. data_designer/config/column_configs.py +27 -6
  23. data_designer/config/column_types.py +24 -17
  24. data_designer/config/config_builder.py +34 -26
  25. data_designer/config/data_designer_config.py +7 -7
  26. data_designer/config/datastore.py +6 -6
  27. data_designer/config/default_model_settings.py +27 -34
  28. data_designer/config/exports.py +8 -0
  29. data_designer/config/models.py +155 -29
  30. data_designer/config/preview_results.py +6 -8
  31. data_designer/config/processors.py +63 -2
  32. data_designer/config/sampler_constraints.py +1 -2
  33. data_designer/config/sampler_params.py +31 -31
  34. data_designer/config/seed.py +1 -2
  35. data_designer/config/utils/code_lang.py +4 -5
  36. data_designer/config/utils/constants.py +31 -8
  37. data_designer/config/utils/io_helpers.py +5 -5
  38. data_designer/config/utils/misc.py +1 -4
  39. data_designer/config/utils/numerical_helpers.py +2 -2
  40. data_designer/config/utils/type_helpers.py +3 -3
  41. data_designer/config/utils/validation.py +7 -8
  42. data_designer/config/utils/visualization.py +32 -17
  43. data_designer/config/validator_params.py +4 -8
  44. data_designer/engine/analysis/column_profilers/base.py +0 -7
  45. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +2 -3
  46. data_designer/engine/analysis/column_statistics.py +16 -16
  47. data_designer/engine/analysis/dataset_profiler.py +25 -4
  48. data_designer/engine/analysis/utils/column_statistics_calculations.py +71 -49
  49. data_designer/engine/analysis/utils/judge_score_processing.py +5 -5
  50. data_designer/engine/column_generators/generators/base.py +34 -0
  51. data_designer/engine/column_generators/generators/embedding.py +45 -0
  52. data_designer/engine/column_generators/generators/{llm_generators.py → llm_completion.py} +17 -49
  53. data_designer/engine/column_generators/registry.py +4 -2
  54. data_designer/engine/column_generators/utils/judge_score_factory.py +5 -6
  55. data_designer/engine/configurable_task.py +2 -2
  56. data_designer/engine/dataset_builders/artifact_storage.py +1 -2
  57. data_designer/engine/dataset_builders/column_wise_builder.py +11 -10
  58. data_designer/engine/dataset_builders/utils/concurrency.py +6 -6
  59. data_designer/engine/models/facade.py +66 -9
  60. data_designer/engine/models/litellm_overrides.py +5 -6
  61. data_designer/engine/models/parsers/errors.py +2 -4
  62. data_designer/engine/models/parsers/parser.py +2 -3
  63. data_designer/engine/models/parsers/postprocessors.py +3 -4
  64. data_designer/engine/models/parsers/types.py +4 -4
  65. data_designer/engine/models/registry.py +20 -11
  66. data_designer/engine/models/usage.py +7 -9
  67. data_designer/engine/processing/ginja/ast.py +1 -2
  68. data_designer/engine/processing/utils.py +40 -2
  69. data_designer/engine/registry/base.py +12 -12
  70. data_designer/engine/sampling_gen/constraints.py +1 -2
  71. data_designer/engine/sampling_gen/data_sources/base.py +14 -14
  72. data_designer/engine/sampling_gen/entities/phone_number.py +1 -2
  73. data_designer/engine/sampling_gen/people_gen.py +3 -7
  74. data_designer/engine/validators/base.py +2 -2
  75. data_designer/logging.py +2 -2
  76. data_designer/plugin_manager.py +3 -3
  77. data_designer/plugins/plugin.py +3 -3
  78. data_designer/plugins/registry.py +2 -2
  79. {data_designer-0.1.5.dist-info → data_designer-0.2.0.dist-info}/METADATA +1 -1
  80. {data_designer-0.1.5.dist-info → data_designer-0.2.0.dist-info}/RECORD +83 -77
  81. {data_designer-0.1.5.dist-info → data_designer-0.2.0.dist-info}/WHEEL +0 -0
  82. {data_designer-0.1.5.dist-info → data_designer-0.2.0.dist-info}/entry_points.txt +0 -0
  83. {data_designer-0.1.5.dist-info → data_designer-0.2.0.dist-info}/licenses/LICENSE +0 -0
data_designer/config/exports.py

@@ -3,6 +3,7 @@
 
 from data_designer.config.analysis.column_profilers import JudgeScoreProfilerConfig
 from data_designer.config.column_configs import (
+    EmbeddingColumnConfig,
     ExpressionColumnConfig,
     LLMCodeColumnConfig,
     LLMJudgeColumnConfig,
@@ -19,6 +20,9 @@ from data_designer.config.data_designer_config import DataDesignerConfig
 from data_designer.config.dataset_builders import BuildStage
 from data_designer.config.datastore import DatastoreSettings
 from data_designer.config.models import (
+    ChatCompletionInferenceParams,
+    EmbeddingInferenceParams,
+    GenerationType,
     ImageContext,
     ImageFormat,
     InferenceParameters,
@@ -81,6 +85,7 @@ def get_config_exports() -> list[str]:
         CodeLang.__name__,
         CodeValidatorParams.__name__,
         ColumnInequalityConstraint.__name__,
+        ChatCompletionInferenceParams.__name__,
         DataDesignerColumnType.__name__,
         DataDesignerConfig.__name__,
         DataDesignerConfigBuilder.__name__,
@@ -89,8 +94,11 @@ def get_config_exports() -> list[str]:
         DatastoreSettings.__name__,
         DatetimeSamplerParams.__name__,
         DropColumnsProcessorConfig.__name__,
+        EmbeddingColumnConfig.__name__,
+        EmbeddingInferenceParams.__name__,
         ExpressionColumnConfig.__name__,
         GaussianSamplerParams.__name__,
+        GenerationType.__name__,
         IndexRange.__name__,
         InfoType.__name__,
         ImageContext.__name__,
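
The additions above register the new 0.2.0 names in the public export list. A minimal sanity-check sketch, assuming `get_config_exports` is importable from `data_designer.config.exports` as shown in the hunk headers:

```python
# Minimal sketch: confirm the names added in 0.2.0 appear in the export list.
from data_designer.config.exports import get_config_exports

new_names = {
    "ChatCompletionInferenceParams",
    "EmbeddingColumnConfig",
    "EmbeddingInferenceParams",
    "GenerationType",
}
missing = new_names - set(get_config_exports())
assert not missing, f"missing exports: {missing}"
```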
data_designer/config/models.py

@@ -5,10 +5,10 @@ import logging
 from abc import ABC, abstractmethod
 from enum import Enum
 from pathlib import Path
-from typing import Any, Generic, List, Optional, TypeVar, Union
+from typing import Any, Generic, Literal, TypeVar
 
 import numpy as np
-from pydantic import BaseModel, Field, model_validator
+from pydantic import BaseModel, Field, field_validator, model_validator
 from typing_extensions import Self, TypeAlias
 
 from data_designer.config.base import ConfigBase
@@ -74,7 +74,7 @@ class ImageContext(ModalityContext):
     """
 
     modality: Modality = Modality.IMAGE
-    image_format: Optional[ImageFormat] = None
+    image_format: ImageFormat | None = None
 
     def get_context(self, record: dict) -> dict[str, Any]:
         """Get the context for the image modality.
@@ -122,8 +122,8 @@ class ManualDistributionParams(ConfigBase):
         weights: Optional list of weights for each value. If not provided, all values have equal probability.
     """
 
-    values: List[float] = Field(min_length=1)
-    weights: Optional[List[float]] = None
+    values: list[float] = Field(min_length=1)
+    weights: list[float] | None = None
 
     @model_validator(mode="after")
     def _normalize_weights(self) -> Self:
@@ -149,7 +149,7 @@ class ManualDistribution(Distribution[ManualDistributionParams]):
         params: Distribution parameters (values, weights).
     """
 
-    distribution_type: Optional[DistributionType] = "manual"
+    distribution_type: DistributionType | None = "manual"
     params: ManualDistributionParams
 
     def sample(self) -> float:
@@ -190,7 +190,7 @@ class UniformDistribution(Distribution[UniformDistributionParams]):
         params: Distribution parameters (low, high).
     """
 
-    distribution_type: Optional[DistributionType] = "uniform"
+    distribution_type: DistributionType | None = "uniform"
     params: UniformDistributionParams
 
     def sample(self) -> float:
@@ -202,36 +202,93 @@ class UniformDistribution(Distribution[UniformDistributionParams]):
         return float(np.random.uniform(low=self.params.low, high=self.params.high, size=1)[0])
 
 
-DistributionT: TypeAlias = Union[UniformDistribution, ManualDistribution]
+DistributionT: TypeAlias = UniformDistribution | ManualDistribution
 
 
-class InferenceParameters(ConfigBase):
-    """Configuration for LLM inference parameters.
+class GenerationType(str, Enum):
+    CHAT_COMPLETION = "chat-completion"
+    EMBEDDING = "embedding"
+
+
+class BaseInferenceParams(ConfigBase, ABC):
+    """Base configuration for inference parameters.
 
     Attributes:
-        temperature: Sampling temperature (0.0-2.0). Can be a fixed value or a distribution for dynamic sampling.
-        top_p: Nucleus sampling probability (0.0-1.0). Can be a fixed value or a distribution for dynamic sampling.
-        max_tokens: Maximum number of tokens (includes both input and output tokens).
+        generation_type: Type of generation (chat-completion or embedding). Acts as discriminator.
         max_parallel_requests: Maximum number of parallel requests to the model API.
         timeout: Timeout in seconds for each request.
         extra_body: Additional parameters to pass to the model API.
     """
 
-    temperature: Optional[Union[float, DistributionT]] = None
-    top_p: Optional[Union[float, DistributionT]] = None
-    max_tokens: Optional[int] = Field(default=None, ge=1)
+    generation_type: GenerationType
     max_parallel_requests: int = Field(default=4, ge=1)
-    timeout: Optional[int] = Field(default=None, ge=1)
-    extra_body: Optional[dict[str, Any]] = None
+    timeout: int | None = Field(default=None, ge=1)
+    extra_body: dict[str, Any] | None = None
 
     @property
-    def generate_kwargs(self) -> dict[str, Union[float, int]]:
+    def generate_kwargs(self) -> dict[str, Any]:
         """Get the generate kwargs for the inference parameters.
 
         Returns:
             A dictionary of the generate kwargs.
         """
         result = {}
+        if self.timeout is not None:
+            result["timeout"] = self.timeout
+        if self.extra_body is not None and self.extra_body != {}:
+            result["extra_body"] = self.extra_body
+        return result
+
+    def format_for_display(self) -> str:
+        """Format inference parameters for display.
+
+        Returns:
+            Formatted string of inference parameters
+        """
+        params_dict = self.model_dump(exclude_none=True, mode="json")
+
+        if not params_dict:
+            return "(none)"
+
+        parts = []
+        for key, value in params_dict.items():
+            formatted_value = self._format_value(key, value)
+            parts.append(f"{key}={formatted_value}")
+        return ", ".join(parts)
+
+    def _format_value(self, key: str, value: Any) -> str:
+        """Format a single parameter value. Override in subclasses for custom formatting.
+
+        Args:
+            key: Parameter name
+            value: Parameter value
+
+        Returns:
+            Formatted string representation of the value
+        """
+        if isinstance(value, float):
+            return f"{value:.2f}"
+        return str(value)
+
+
+class ChatCompletionInferenceParams(BaseInferenceParams):
+    """Configuration for LLM inference parameters.
+
+    Attributes:
+        generation_type: Type of generation, always "chat-completion" for this class.
+        temperature: Sampling temperature (0.0-2.0). Can be a fixed value or a distribution for dynamic sampling.
+        top_p: Nucleus sampling probability (0.0-1.0). Can be a fixed value or a distribution for dynamic sampling.
+        max_tokens: Maximum number of tokens (includes both input and output tokens).
+    """
+
+    generation_type: Literal[GenerationType.CHAT_COMPLETION] = GenerationType.CHAT_COMPLETION
+    temperature: float | DistributionT | None = None
+    top_p: float | DistributionT | None = None
+    max_tokens: int | None = Field(default=None, ge=1)
+
+    @property
+    def generate_kwargs(self) -> dict[str, Any]:
+        result = super().generate_kwargs
         if self.temperature is not None:
             result["temperature"] = (
                 self.temperature.sample() if hasattr(self.temperature, "sample") else self.temperature
@@ -240,10 +297,6 @@ class InferenceParameters(ConfigBase):
             result["top_p"] = self.top_p.sample() if hasattr(self.top_p, "sample") else self.top_p
         if self.max_tokens is not None:
             result["max_tokens"] = self.max_tokens
-        if self.timeout is not None:
-            result["timeout"] = self.timeout
-        if self.extra_body is not None and self.extra_body != {}:
-            result["extra_body"] = self.extra_body
         return result
 
     @model_validator(mode="after")
@@ -266,7 +319,7 @@ class InferenceParameters(ConfigBase):
 
     def _run_validation(
         self,
-        value: Union[float, DistributionT, None],
+        value: float | DistributionT | None,
         param_name: str,
         min_value: float,
         max_value: float,
@@ -289,6 +342,61 @@
     def _is_value_in_range(self, value: float, min_value: float, max_value: float) -> bool:
         return min_value <= value <= max_value
 
+    def _format_value(self, key: str, value: Any) -> str:
+        """Format chat completion parameter values, including distributions.
+
+        Args:
+            key: Parameter name
+            value: Parameter value
+
+        Returns:
+            Formatted string representation of the value
+        """
+        if isinstance(value, dict) and "distribution_type" in value:
+            return "dist"
+        return super()._format_value(key, value)
+
+
+# Maintain backwards compatibility with a deprecation warning
+class InferenceParameters(ChatCompletionInferenceParams):
+    """
+    Deprecated: Use ChatCompletionInferenceParams instead.
+    This alias will be removed in a future version.
+    """
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        logger.warning(
+            "InferenceParameters is deprecated and will be removed in a future version. "
+            "Use ChatCompletionInferenceParams instead."
+        )
+        super().__init__(*args, **kwargs)
+
+
+class EmbeddingInferenceParams(BaseInferenceParams):
+    """Configuration for embedding generation parameters.
+
+    Attributes:
+        generation_type: Type of generation, always "embedding" for this class.
+        encoding_format: Format of the embedding encoding ("float" or "base64").
+        dimensions: Number of dimensions for the embedding.
+    """
+
+    generation_type: Literal[GenerationType.EMBEDDING] = GenerationType.EMBEDDING
+    encoding_format: Literal["float", "base64"] = "float"
+    dimensions: int | None = None
+
+    @property
+    def generate_kwargs(self) -> dict[str, float | int]:
+        result = super().generate_kwargs
+        if self.encoding_format is not None:
+            result["encoding_format"] = self.encoding_format
+        if self.dimensions is not None:
+            result["dimensions"] = self.dimensions
+        return result
+
+
+InferenceParamsT: TypeAlias = ChatCompletionInferenceParams | EmbeddingInferenceParams | InferenceParameters
+
 
 class ModelConfig(ConfigBase):
     """Configuration for a model used for generation.
@@ -297,13 +405,31 @@ class ModelConfig(ConfigBase):
         alias: User-defined alias to reference in column configurations.
         model: Model identifier (e.g., from build.nvidia.com or other providers).
         inference_parameters: Inference parameters for the model (temperature, top_p, max_tokens, etc.).
+            The generation_type is determined by the type of inference_parameters.
         provider: Optional model provider name if using custom providers.
     """
 
     alias: str
     model: str
-    inference_parameters: InferenceParameters = Field(default_factory=InferenceParameters)
-    provider: Optional[str] = None
+    inference_parameters: InferenceParamsT = Field(default_factory=ChatCompletionInferenceParams)
+    provider: str | None = None
+
+    @property
+    def generation_type(self) -> GenerationType:
+        """Get the generation type from the inference parameters."""
+        return self.inference_parameters.generation_type
+
+    @field_validator("inference_parameters", mode="before")
+    @classmethod
+    def _convert_inference_parameters(cls, value: Any) -> Any:
+        """Convert raw dict to appropriate inference parameters type based on field presence."""
+        if isinstance(value, dict):
+            # Infer type from presence of embedding-specific fields
+            if "encoding_format" in value or "dimensions" in value:
+                return EmbeddingInferenceParams(**value)
+            else:
+                return ChatCompletionInferenceParams(**value)
+        return value
 
 
 class ModelProvider(ConfigBase):
@@ -320,11 +446,11 @@
     name: str
     endpoint: str
     provider_type: str = "openai"
-    api_key: Optional[str] = None
-    extra_body: Optional[dict[str, Any]] = None
+    api_key: str | None = None
+    extra_body: dict[str, Any] | None = None
 
 
-def load_model_configs(model_configs: Union[list[ModelConfig], str, Path]) -> list[ModelConfig]:
+def load_model_configs(model_configs: list[ModelConfig] | str | Path) -> list[ModelConfig]:
     if isinstance(model_configs, list) and all(isinstance(mc, ModelConfig) for mc in model_configs):
         return model_configs
     json_config = smart_load_yaml(model_configs)
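
`ModelConfig` now accepts any member of `InferenceParamsT` and, per the `_convert_inference_parameters` validator above, coerces a plain dict by key presence: embedding-specific keys select `EmbeddingInferenceParams`, anything else becomes `ChatCompletionInferenceParams`. A minimal sketch, with illustrative model identifiers:

```python
from data_designer.config.models import GenerationType, ModelConfig

# A dict with only chat-style keys coerces to ChatCompletionInferenceParams.
chat_model = ModelConfig(
    alias="writer",
    model="meta/llama-3.1-8b-instruct",  # illustrative model id
    inference_parameters={"temperature": 0.7, "max_tokens": 512},
)
assert chat_model.generation_type is GenerationType.CHAT_COMPLETION

# Presence of "dimensions" (or "encoding_format") selects EmbeddingInferenceParams.
embed_model = ModelConfig(
    alias="embedder",
    model="nvidia/nv-embedqa-e5-v5",  # illustrative model id
    inference_parameters={"dimensions": 1024},
)
assert embed_model.generation_type is GenerationType.EMBEDDING
```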
data_designer/config/preview_results.py

@@ -3,8 +3,6 @@
 
 from __future__ import annotations
 
-from typing import Optional, Union
-
 import pandas as pd
 
 from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
@@ -17,9 +15,9 @@ class PreviewResults(WithRecordSamplerMixin):
         self,
         *,
         config_builder: DataDesignerConfigBuilder,
-        dataset: Optional[pd.DataFrame] = None,
-        analysis: Optional[DatasetProfilerResults] = None,
-        processor_artifacts: Optional[dict[str, Union[list[str], str]]] = None,
+        dataset: pd.DataFrame | None = None,
+        analysis: DatasetProfilerResults | None = None,
+        processor_artifacts: dict[str, list[str] | str] | None = None,
     ):
         """Creates a new instance with results from a Data Designer preview run.
 
@@ -29,7 +27,7 @@ class PreviewResults(WithRecordSamplerMixin):
             analysis: Analysis of the preview run.
             processor_artifacts: Artifacts generated by the processors.
         """
-        self.dataset: Optional[pd.DataFrame] = dataset
-        self.analysis: Optional[DatasetProfilerResults] = analysis
-        self.processor_artifacts: Optional[dict[str, Union[list[str], str]]] = processor_artifacts
+        self.dataset: pd.DataFrame | None = dataset
+        self.analysis: DatasetProfilerResults | None = analysis
+        self.processor_artifacts: dict[str, list[str] | str] | None = processor_artifacts
         self._config_builder = config_builder
data_designer/config/processors.py

@@ -7,6 +7,7 @@ from enum import Enum
 from typing import Any, Literal
 
 from pydantic import Field, field_validator
+from typing_extensions import TypeAlias
 
 from data_designer.config.base import ConfigBase
 from data_designer.config.dataset_builders import BuildStage
@@ -16,11 +17,30 @@ SUPPORTED_STAGES = [BuildStage.POST_BATCH]
 
 
 class ProcessorType(str, Enum):
+    """Enumeration of available processor types.
+
+    Attributes:
+        DROP_COLUMNS: Processor that removes specified columns from the output dataset.
+        SCHEMA_TRANSFORM: Processor that creates a new dataset with a transformed schema using Jinja2 templates.
+    """
+
     DROP_COLUMNS = "drop_columns"
     SCHEMA_TRANSFORM = "schema_transform"
 
 
 class ProcessorConfig(ConfigBase, ABC):
+    """Abstract base class for all processor configuration types.
+
+    Processors are transformations that run before or after columns are generated.
+    They can modify, reshape, or augment the dataset before it's saved.
+
+    Attributes:
+        name: Unique name of the processor, used to identify the processor in results
+            and to name output artifacts on disk.
+        build_stage: The stage at which the processor runs. Currently only `POST_BATCH`
+            is supported, meaning processors run after each batch of columns is generated.
+    """
+
     name: str = Field(
         description="The name of the processor, used to identify the processor in the results and to write the artifacts to disk.",
     )
@@ -28,6 +48,7 @@ class ProcessorConfig(ConfigBase, ABC):
         default=BuildStage.POST_BATCH,
         description=f"The stage at which the processor will run. Supported stages: {', '.join(SUPPORTED_STAGES)}",
     )
+    processor_type: str
 
     @field_validator("build_stage")
     def validate_build_stage(cls, v: BuildStage) -> BuildStage:
@@ -38,7 +59,16 @@ class ProcessorConfig(ConfigBase, ABC):
         return v
 
 
-def get_processor_config_from_kwargs(processor_type: ProcessorType, **kwargs) -> ProcessorConfig:
+def get_processor_config_from_kwargs(processor_type: ProcessorType, **kwargs: Any) -> ProcessorConfig:
+    """Create a processor configuration from a processor type and keyword arguments.
+
+    Args:
+        processor_type: The type of processor to create.
+        **kwargs: Additional keyword arguments passed to the processor constructor.
+
+    Returns:
+        A processor configuration object of the specified type.
+    """
     if processor_type == ProcessorType.DROP_COLUMNS:
         return DropColumnsProcessorConfig(**kwargs)
     elif processor_type == ProcessorType.SCHEMA_TRANSFORM:
@@ -46,11 +76,39 @@ def get_processor_config_from_kwargs(processor_type: ProcessorType, **kwargs) ->
 
 
 class DropColumnsProcessorConfig(ProcessorConfig):
-    column_names: list[str]
+    """Configuration for dropping columns from the output dataset.
+
+    This processor removes specified columns from the generated dataset. The dropped
+    columns are saved separately in a `dropped-columns` directory for reference.
+    When this processor is added via the config builder, the corresponding column
+    configs are automatically marked with `drop = True`.
+
+    Alternatively, you can set `drop = True` when configuring a column.
+
+    Attributes:
+        column_names: List of column names to remove from the output dataset.
+        processor_type: Discriminator field, always `ProcessorType.DROP_COLUMNS` for this configuration type.
+    """
+
+    column_names: list[str] = Field(description="List of column names to drop from the output dataset.")
     processor_type: Literal[ProcessorType.DROP_COLUMNS] = ProcessorType.DROP_COLUMNS
 
 
 class SchemaTransformProcessorConfig(ProcessorConfig):
+    """Configuration for transforming the dataset schema using Jinja2 templates.
+
+    This processor creates a new dataset with a transformed schema. Each key in the
+    template becomes a column in the output, and values are Jinja2 templates that
+    can reference any column in the batch. The transformed dataset is written to
+    a `processors-outputs/{processor_name}/` directory alongside the main dataset.
+
+    Attributes:
+        template: Dictionary defining the output schema. Keys are new column names,
+            values are Jinja2 templates (strings, lists, or nested structures).
+            Must be JSON-serializable.
+        processor_type: Discriminator field, always `ProcessorType.SCHEMA_TRANSFORM` for this configuration type.
+    """
+
     template: dict[str, Any] = Field(
         ...,
         description="""
@@ -83,3 +141,6 @@ class SchemaTransformProcessorConfig(ProcessorConfig):
             if "not JSON serializable" in str(e):
                 raise InvalidConfigError("Template must be JSON serializable")
             return v
+
+
+ProcessorConfigT: TypeAlias = DropColumnsProcessorConfig | SchemaTransformProcessorConfig
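
A minimal sketch of the two processor configurations documented above, assuming both classes and `ProcessorType` are importable from `data_designer.config.processors`; the column names and templates are illustrative:

```python
from data_designer.config.processors import (
    DropColumnsProcessorConfig,
    ProcessorType,
    SchemaTransformProcessorConfig,
)

# Remove scratch columns from the final dataset; per the docstring above, the
# dropped columns are kept in a separate dropped-columns directory.
drop = DropColumnsProcessorConfig(
    name="drop-scratch",
    column_names=["scratchpad", "raw_response"],  # illustrative column names
)

# Reshape each batch into a new schema; values are Jinja2 templates that can
# reference any generated column.
transform = SchemaTransformProcessorConfig(
    name="to-chat-format",
    template={
        "prompt": "{{ question }}",   # illustrative source columns
        "response": "{{ answer }}",
    },
)
assert transform.processor_type is ProcessorType.SCHEMA_TRANSFORM
```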
data_designer/config/sampler_constraints.py

@@ -3,7 +3,6 @@
 
 from abc import ABC, abstractmethod
 from enum import Enum
-from typing import Union
 
 from typing_extensions import TypeAlias
 
@@ -48,4 +47,4 @@ class ColumnInequalityConstraint(Constraint):
         return ConstraintType.COLUMN_INEQUALITY
 
 
-ColumnConstraintT: TypeAlias = Union[ScalarInequalityConstraint, ColumnInequalityConstraint]
+ColumnConstraintT: TypeAlias = ScalarInequalityConstraint | ColumnInequalityConstraint
data_designer/config/sampler_params.py

@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from enum import Enum
-from typing import Literal, Optional, Union
+from typing import Literal
 
 import pandas as pd
 from pydantic import Field, field_validator, model_validator
@@ -54,12 +54,12 @@ class CategorySamplerParams(ConfigBase):
             Larger weights result in higher sampling probability for the corresponding value.
     """
 
-    values: list[Union[str, int, float]] = Field(
+    values: list[str | int | float] = Field(
         ...,
         min_length=1,
         description="List of possible categorical values that can be sampled from.",
     )
-    weights: Optional[list[float]] = Field(
+    weights: list[float] | None = Field(
         default=None,
         description=(
             "List of unnormalized probability weights to assigned to each value, in order. "
@@ -134,7 +134,7 @@ class SubcategorySamplerParams(ConfigBase):
     """
 
     category: str = Field(..., description="Name of parent category to this subcategory.")
-    values: dict[str, list[Union[str, int, float]]] = Field(
+    values: dict[str, list[str | int | float]] = Field(
         ...,
         description="Mapping from each value of parent category to a list of subcategory values.",
     )
@@ -214,7 +214,7 @@ class UUIDSamplerParams(ConfigBase):
         lowercase UUIDs.
     """
 
-    prefix: Optional[str] = Field(default=None, description="String prepended to the front of the UUID.")
+    prefix: str | None = Field(default=None, description="String prepended to the front of the UUID.")
     short_form: bool = Field(
         default=False,
         description="If true, all UUIDs sampled will be truncated at 8 characters.",
@@ -259,7 +259,7 @@ class ScipySamplerParams(ConfigBase):
         ...,
         description="Parameters of the scipy.stats distribution given in `dist_name`.",
     )
-    decimal_places: Optional[int] = Field(
+    decimal_places: int | None = Field(
         default=None, description="Number of decimal places to round the sampled values to."
     )
     sampler_type: Literal[SamplerType.SCIPY] = SamplerType.SCIPY
@@ -356,7 +356,7 @@ class GaussianSamplerParams(ConfigBase):
 
     mean: float = Field(..., description="Mean of the Gaussian distribution")
     stddev: float = Field(..., description="Standard deviation of the Gaussian distribution")
-    decimal_places: Optional[int] = Field(
+    decimal_places: int | None = Field(
         default=None, description="Number of decimal places to round the sampled values to."
     )
     sampler_type: Literal[SamplerType.GAUSSIAN] = SamplerType.GAUSSIAN
@@ -398,7 +398,7 @@ class UniformSamplerParams(ConfigBase):
 
     low: float = Field(..., description="Lower bound of the uniform distribution, inclusive.")
     high: float = Field(..., description="Upper bound of the uniform distribution, inclusive.")
-    decimal_places: Optional[int] = Field(
+    decimal_places: int | None = Field(
         default=None, description="Number of decimal places to round the sampled values to."
    )
     sampler_type: Literal[SamplerType.UNIFORM] = SamplerType.UNIFORM
@@ -421,8 +421,8 @@ class PersonSamplerParams(ConfigBase):
 
     Attributes:
         locale: Locale string determining the language and geographic region for synthetic people.
-            Format: language_COUNTRY (e.g., "en_US", "en_GB", "fr_FR", "de_DE", "es_ES", "ja_JP").
-            Defaults to "en_US".
+            Must be a locale supported by a managed Nemotron Personas dataset. The dataset must
+            be downloaded and available in the managed assets directory.
         sex: If specified, filters to only sample people of the specified sex. Options: "Male" or
             "Female". If None, samples both sexes.
         city: If specified, filters to only sample people from the specified city or cities. Can be
@@ -447,11 +447,11 @@ class PersonSamplerParams(ConfigBase):
             f"{', '.join(LOCALES_WITH_MANAGED_DATASETS)}."
         ),
     )
-    sex: Optional[SexT] = Field(
+    sex: SexT | None = Field(
         default=None,
         description="If specified, then only synthetic people of the specified sex will be sampled.",
     )
-    city: Optional[Union[str, list[str]]] = Field(
+    city: str | list[str] | None = Field(
         default=None,
         description="If specified, then only synthetic people from these cities will be sampled.",
     )
@@ -461,7 +461,7 @@ class PersonSamplerParams(ConfigBase):
         min_length=2,
         max_length=2,
     )
-    select_field_values: Optional[dict[str, list[str]]] = Field(
+    select_field_values: dict[str, list[str]] | None = Field(
         default=None,
         description=(
             "Sample synthetic people with the specified field values. This is meant to be a flexible argument for "
@@ -529,11 +529,11 @@ class PersonFromFakerSamplerParams(ConfigBase):
             "that a synthetic person will be sampled from. E.g, en_US, en_GB, fr_FR, ..."
         ),
     )
-    sex: Optional[SexT] = Field(
+    sex: SexT | None = Field(
         default=None,
         description="If specified, then only synthetic people of the specified sex will be sampled.",
     )
-    city: Optional[Union[str, list[str]]] = Field(
+    city: str | list[str] | None = Field(
         default=None,
         description="If specified, then only synthetic people from these cities will be sampled.",
     )
@@ -585,22 +585,22 @@ class PersonFromFakerSamplerParams(ConfigBase):
         return value
 
 
-SamplerParamsT: TypeAlias = Union[
-    SubcategorySamplerParams,
-    CategorySamplerParams,
-    DatetimeSamplerParams,
-    PersonSamplerParams,
-    PersonFromFakerSamplerParams,
-    TimeDeltaSamplerParams,
-    UUIDSamplerParams,
-    BernoulliSamplerParams,
-    BernoulliMixtureSamplerParams,
-    BinomialSamplerParams,
-    GaussianSamplerParams,
-    PoissonSamplerParams,
-    UniformSamplerParams,
-    ScipySamplerParams,
-]
+SamplerParamsT: TypeAlias = (
+    SubcategorySamplerParams
+    | CategorySamplerParams
+    | DatetimeSamplerParams
+    | PersonSamplerParams
+    | PersonFromFakerSamplerParams
+    | TimeDeltaSamplerParams
+    | UUIDSamplerParams
+    | BernoulliSamplerParams
+    | BernoulliMixtureSamplerParams
+    | BinomialSamplerParams
+    | GaussianSamplerParams
+    | PoissonSamplerParams
+    | UniformSamplerParams
+    | ScipySamplerParams
+)
 
 
 def is_numerical_sampler_type(sampler_type: SamplerType) -> bool:
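
A minimal sketch of a few of the sampler parameter types above, assuming they are importable from `data_designer.config.sampler_params`; the values are illustrative:

```python
from data_designer.config.sampler_params import (
    CategorySamplerParams,
    GaussianSamplerParams,
    PersonSamplerParams,
)

# Unnormalized weights are allowed; "red" is sampled twice as often as the others.
color = CategorySamplerParams(values=["red", "green", "blue"], weights=[2.0, 1.0, 1.0])

# decimal_places rounds sampled values; 0 yields whole numbers.
age = GaussianSamplerParams(mean=35.0, stddev=5.0, decimal_places=0)

# Per the updated docstring, the locale must have a managed Nemotron Personas
# dataset downloaded locally (see the new download command added to the CLI
# in this release).
person = PersonSamplerParams(locale="en_US")
```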
data_designer/config/seed.py

@@ -3,7 +3,6 @@
 
 from abc import ABC
 from enum import Enum
-from typing import Optional, Union
 
 from pydantic import Field, field_validator, model_validator
 from typing_extensions import Self
@@ -112,7 +111,7 @@ class SeedConfig(ConfigBase):
 
     dataset: str
     sampling_strategy: SamplingStrategy = SamplingStrategy.ORDERED
-    selection_strategy: Optional[Union[IndexRange, PartitionBlock]] = None
+    selection_strategy: IndexRange | PartitionBlock | None = None
 
 
 class SeedDatasetReference(ABC, ConfigBase):