data_designer_config-0.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. data_designer/config/__init__.py +149 -0
  2. data_designer/config/_version.py +34 -0
  3. data_designer/config/analysis/__init__.py +2 -0
  4. data_designer/config/analysis/column_profilers.py +159 -0
  5. data_designer/config/analysis/column_statistics.py +421 -0
  6. data_designer/config/analysis/dataset_profiler.py +84 -0
  7. data_designer/config/analysis/utils/errors.py +10 -0
  8. data_designer/config/analysis/utils/reporting.py +192 -0
  9. data_designer/config/base.py +69 -0
  10. data_designer/config/column_configs.py +476 -0
  11. data_designer/config/column_types.py +141 -0
  12. data_designer/config/config_builder.py +595 -0
  13. data_designer/config/data_designer_config.py +40 -0
  14. data_designer/config/dataset_builders.py +13 -0
  15. data_designer/config/dataset_metadata.py +18 -0
  16. data_designer/config/default_model_settings.py +129 -0
  17. data_designer/config/errors.py +24 -0
  18. data_designer/config/interface.py +55 -0
  19. data_designer/config/models.py +486 -0
  20. data_designer/config/preview_results.py +41 -0
  21. data_designer/config/processors.py +148 -0
  22. data_designer/config/run_config.py +56 -0
  23. data_designer/config/sampler_constraints.py +52 -0
  24. data_designer/config/sampler_params.py +639 -0
  25. data_designer/config/seed.py +116 -0
  26. data_designer/config/seed_source.py +84 -0
  27. data_designer/config/seed_source_types.py +19 -0
  28. data_designer/config/testing/__init__.py +6 -0
  29. data_designer/config/testing/fixtures.py +308 -0
  30. data_designer/config/utils/code_lang.py +93 -0
  31. data_designer/config/utils/constants.py +365 -0
  32. data_designer/config/utils/errors.py +21 -0
  33. data_designer/config/utils/info.py +94 -0
  34. data_designer/config/utils/io_helpers.py +258 -0
  35. data_designer/config/utils/misc.py +78 -0
  36. data_designer/config/utils/numerical_helpers.py +30 -0
  37. data_designer/config/utils/type_helpers.py +106 -0
  38. data_designer/config/utils/visualization.py +482 -0
  39. data_designer/config/validator_params.py +94 -0
  40. data_designer/errors.py +7 -0
  41. data_designer/lazy_heavy_imports.py +56 -0
  42. data_designer/logging.py +180 -0
  43. data_designer/plugin_manager.py +78 -0
  44. data_designer/plugins/__init__.py +8 -0
  45. data_designer/plugins/errors.py +15 -0
  46. data_designer/plugins/plugin.py +141 -0
  47. data_designer/plugins/registry.py +88 -0
  48. data_designer_config-0.4.0.dist-info/METADATA +75 -0
  49. data_designer_config-0.4.0.dist-info/RECORD +50 -0
  50. data_designer_config-0.4.0.dist-info/WHEEL +4 -0
data_designer/config/preview_results.py
@@ -0,0 +1,41 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING
+
+ from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
+ from data_designer.config.config_builder import DataDesignerConfigBuilder
+ from data_designer.config.dataset_metadata import DatasetMetadata
+ from data_designer.config.utils.visualization import WithRecordSamplerMixin
+ from data_designer.lazy_heavy_imports import pd
+
+ if TYPE_CHECKING:
+     import pandas as pd
+
+
+ class PreviewResults(WithRecordSamplerMixin):
+     def __init__(
+         self,
+         *,
+         config_builder: DataDesignerConfigBuilder,
+         dataset_metadata: DatasetMetadata | None = None,
+         dataset: pd.DataFrame | None = None,
+         analysis: DatasetProfilerResults | None = None,
+         processor_artifacts: dict[str, list[str] | str] | None = None,
+     ):
+         """Creates a new instance with results from a Data Designer preview run.
+
+         Args:
+             config_builder: Data Designer configuration builder.
+             dataset_metadata: Metadata about the generated dataset (e.g., seed column names).
+             dataset: Dataset of the preview run.
+             analysis: Analysis of the preview run.
+             processor_artifacts: Artifacts generated by the processors.
+         """
+         self.dataset: pd.DataFrame | None = dataset
+         self.analysis: DatasetProfilerResults | None = analysis
+         self.processor_artifacts: dict[str, list[str] | str] | None = processor_artifacts
+         self.dataset_metadata: DatasetMetadata | None = dataset_metadata
+         self._config_builder = config_builder
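For context, here is how these preview results might be consumed. This is a minimal sketch, not taken from the package: it assumes `DataDesignerConfigBuilder` can be instantiated without arguments (in practice it is typically populated with column and model configs first) and constructs `PreviewResults` by hand rather than receiving it from a preview run.

```python
import pandas as pd

from data_designer.config.config_builder import DataDesignerConfigBuilder
from data_designer.config.preview_results import PreviewResults

# Assumption: a bare builder is valid here; real usage would configure it first.
builder = DataDesignerConfigBuilder()

results = PreviewResults(
    config_builder=builder,
    dataset=pd.DataFrame({"col1": ["a", "b"], "col2": ["x", "y"]}),
)

# Every result field is keyword-only and optional, defaulting to None.
if results.dataset is not None:
    print(results.dataset.head())
print(results.analysis)             # None: no profiler results attached
print(results.processor_artifacts)  # None: no processors ran
```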
data_designer/config/processors.py
@@ -0,0 +1,148 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ from __future__ import annotations
+
+ import json
+ from abc import ABC
+ from enum import Enum
+ from typing import Any, Literal
+
+ from pydantic import Field, field_validator
+ from typing_extensions import TypeAlias
+
+ from data_designer.config.base import ConfigBase
+ from data_designer.config.dataset_builders import BuildStage
+ from data_designer.config.errors import InvalidConfigError
+
+ SUPPORTED_STAGES = [BuildStage.POST_BATCH]
+
+
+ class ProcessorType(str, Enum):
+     """Enumeration of available processor types.
+
+     Attributes:
+         DROP_COLUMNS: Processor that removes specified columns from the output dataset.
+         SCHEMA_TRANSFORM: Processor that creates a new dataset with a transformed schema using Jinja2 templates.
+     """
+
+     DROP_COLUMNS = "drop_columns"
+     SCHEMA_TRANSFORM = "schema_transform"
+
+
+ class ProcessorConfig(ConfigBase, ABC):
+     """Abstract base class for all processor configuration types.
+
+     Processors are transformations that run before or after columns are generated.
+     They can modify, reshape, or augment the dataset before it's saved.
+
+     Attributes:
+         name: Unique name of the processor, used to identify the processor in results
+             and to name output artifacts on disk.
+         build_stage: The stage at which the processor runs. Currently only `POST_BATCH`
+             is supported, meaning processors run after each batch of columns is generated.
+     """
+
+     name: str = Field(
+         description="The name of the processor, used to identify the processor in the results and to write the artifacts to disk.",
+     )
+     build_stage: BuildStage = Field(
+         default=BuildStage.POST_BATCH,
+         description=f"The stage at which the processor will run. Supported stages: {', '.join(SUPPORTED_STAGES)}",
+     )
+     processor_type: str
+
+     @field_validator("build_stage")
+     def validate_build_stage(cls, v: BuildStage) -> BuildStage:
+         if v not in SUPPORTED_STAGES:
+             raise ValueError(
+                 f"Invalid dataset builder stage: {v}. Only these stages are supported: {', '.join(SUPPORTED_STAGES)}"
+             )
+         return v
+
+
+ def get_processor_config_from_kwargs(processor_type: ProcessorType, **kwargs: Any) -> ProcessorConfig:
+     """Create a processor configuration from a processor type and keyword arguments.
+
+     Args:
+         processor_type: The type of processor to create.
+         **kwargs: Additional keyword arguments passed to the processor constructor.
+
+     Returns:
+         A processor configuration object of the specified type.
+     """
+     if processor_type == ProcessorType.DROP_COLUMNS:
+         return DropColumnsProcessorConfig(**kwargs)
+     elif processor_type == ProcessorType.SCHEMA_TRANSFORM:
+         return SchemaTransformProcessorConfig(**kwargs)
+     raise ValueError(f"Unknown processor type: {processor_type}")
+
+
+ class DropColumnsProcessorConfig(ProcessorConfig):
+     """Configuration for dropping columns from the output dataset.
+
+     This processor removes specified columns from the generated dataset. The dropped
+     columns are saved separately in a `dropped-columns` directory for reference.
+     When this processor is added via the config builder, the corresponding column
+     configs are automatically marked with `drop = True`.
+
+     Alternatively, you can set `drop = True` when configuring a column.
+
+     Attributes:
+         column_names: List of column names to remove from the output dataset.
+         processor_type: Discriminator field, always `ProcessorType.DROP_COLUMNS` for this configuration type.
+     """
+
+     column_names: list[str] = Field(description="List of column names to drop from the output dataset.")
+     processor_type: Literal[ProcessorType.DROP_COLUMNS] = ProcessorType.DROP_COLUMNS
+
+
+ class SchemaTransformProcessorConfig(ProcessorConfig):
+     """Configuration for transforming the dataset schema using Jinja2 templates.
+
+     This processor creates a new dataset with a transformed schema. Each key in the
+     template becomes a column in the output, and values are Jinja2 templates that
+     can reference any column in the batch. The transformed dataset is written to
+     a `processors-outputs/{processor_name}/` directory alongside the main dataset.
+
+     Attributes:
+         template: Dictionary defining the output schema. Keys are new column names,
+             values are Jinja2 templates (strings, lists, or nested structures).
+             Must be JSON-serializable.
+         processor_type: Discriminator field, always `ProcessorType.SCHEMA_TRANSFORM` for this configuration type.
+     """
+
+     template: dict[str, Any] = Field(
+         ...,
+         description="""
+         Dictionary specifying columns and templates to use in the new dataset with transformed schema.
+
+         Each key is a new column name, and each value is an object containing Jinja2 templates - for instance, a string or a list of strings.
+         Values must be JSON-serializable.
+
+         Example:
+
+         ```python
+         template = {
+             "list_of_strings": ["{{ col1 }}", "{{ col2 }}"],
+             "uppercase_string": "{{ col1 | upper }}",
+             "lowercase_string": "{{ col2 | lower }}",
+         }
+         ```
+
+         The above templates will create a new dataset with three columns: "list_of_strings", "uppercase_string", and "lowercase_string".
+         References to columns "col1" and "col2" in the templates will be replaced with the actual values of the columns in the dataset.
+         """,
+     )
+     processor_type: Literal[ProcessorType.SCHEMA_TRANSFORM] = ProcessorType.SCHEMA_TRANSFORM
+
+     @field_validator("template")
+     def validate_template(cls, v: dict[str, Any]) -> dict[str, Any]:
+         try:
+             json.dumps(v)
+         except TypeError as e:
+             raise InvalidConfigError("Template must be JSON serializable") from e
+         return v
+
+
+ ProcessorConfigT: TypeAlias = DropColumnsProcessorConfig | SchemaTransformProcessorConfig
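The docstring's template example translates directly into working configs. Below is a minimal sketch using only names defined in this file; the builder-side behavior (e.g., automatically marking column configs with `drop = True`) is out of scope here.

```python
from data_designer.config.processors import (
    DropColumnsProcessorConfig,
    ProcessorType,
    SchemaTransformProcessorConfig,
    get_processor_config_from_kwargs,
)

# Direct construction: remove two intermediate columns from the output dataset.
drop = DropColumnsProcessorConfig(name="drop-intermediates", column_names=["col1", "col2"])

# Via the factory, reusing the template example from the docstring above.
transform = get_processor_config_from_kwargs(
    ProcessorType.SCHEMA_TRANSFORM,
    name="reshape",
    template={
        "list_of_strings": ["{{ col1 }}", "{{ col2 }}"],
        "uppercase_string": "{{ col1 | upper }}",
        "lowercase_string": "{{ col2 | lower }}",
    },
)
assert transform.processor_type is ProcessorType.SCHEMA_TRANSFORM

# A template that json.dumps cannot handle is rejected at validation time.
try:
    SchemaTransformProcessorConfig(name="bad", template={"x": object()})
except Exception as exc:  # InvalidConfigError, possibly wrapped by pydantic
    print(type(exc).__name__)
```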
data_designer/config/run_config.py
@@ -0,0 +1,56 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ from __future__ import annotations
+
+ from pydantic import Field, model_validator
+ from typing_extensions import Self
+
+ from data_designer.config.base import ConfigBase
+
+
+ class RunConfig(ConfigBase):
+     """Runtime configuration for dataset generation.
+
+     Groups configuration options that control generation behavior but aren't
+     part of the dataset configuration itself.
+
+     Attributes:
+         disable_early_shutdown: If True, disables the executor's early-shutdown behavior entirely.
+             Generation will continue regardless of error rate, and the early-shutdown exception
+             will never be raised. Error counts and summaries are still collected. Default is False.
+         shutdown_error_rate: Error rate threshold (0.0-1.0) that triggers early shutdown when
+             early shutdown is enabled. Default is 0.5.
+         shutdown_error_window: Minimum number of completed tasks before error rate
+             monitoring begins. Must be >= 0. Default is 10.
+         buffer_size: Number of records to process in each batch during dataset generation.
+             A batch is processed end-to-end (column generation, post-batch processors, and writing the batch
+             to artifact storage) before moving on to the next batch. Must be > 0. Default is 1000.
+         non_inference_max_parallel_workers: Maximum number of worker threads used for non-inference
+             cell-by-cell generators. Must be >= 1. Default is 4.
+         max_conversation_restarts: Maximum number of full conversation restarts permitted when
+             generation tasks call `ModelFacade.generate(...)`. Must be >= 0. Default is 5.
+         max_conversation_correction_steps: Maximum number of correction rounds permitted within a
+             single conversation when generation tasks call `ModelFacade.generate(...)`. Must be >= 0.
+             Default is 0.
+         debug_override_save_all_column_traces: If True, overrides per-column `with_trace` settings
+             and includes `__trace` columns for ALL LLM generations, containing the full ordered
+             message history (system/user/assistant) for the final generation attempt.
+             Useful for debugging. Default is False.
+     """
+
+     disable_early_shutdown: bool = False
+     shutdown_error_rate: float = Field(default=0.5, ge=0.0, le=1.0)
+     shutdown_error_window: int = Field(default=10, ge=0)
+     buffer_size: int = Field(default=1000, gt=0)
+     non_inference_max_parallel_workers: int = Field(default=4, ge=1)
+     max_conversation_restarts: int = Field(default=5, ge=0)
+     max_conversation_correction_steps: int = Field(default=0, ge=0)
+     debug_override_save_all_column_traces: bool = False
+
+     @model_validator(mode="after")
+     def normalize_shutdown_settings(self) -> Self:
+         """Normalize shutdown settings for compatibility."""
+         if self.disable_early_shutdown:
+             self.shutdown_error_rate = 1.0
+         return self
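A short sketch of how these knobs compose, including the `normalize_shutdown_settings` behavior; it assumes `ConfigBase` behaves like a standard pydantic model (validating on construction), and the printed values follow directly from the defaults above.

```python
from data_designer.config.run_config import RunConfig

# Defaults: early shutdown triggers at a 50% error rate once 10 tasks have completed.
run = RunConfig()
print(run.shutdown_error_rate, run.shutdown_error_window)  # 0.5 10

# Disabling early shutdown also normalizes the threshold to 1.0 (see the model validator).
no_shutdown = RunConfig(disable_early_shutdown=True)
print(no_shutdown.shutdown_error_rate)  # 1.0

# Bounds are enforced by pydantic: buffer_size must be greater than zero.
try:
    RunConfig(buffer_size=0)
except Exception as exc:
    print(type(exc).__name__)  # ValidationError
```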
data_designer/config/sampler_constraints.py
@@ -0,0 +1,52 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+ from enum import Enum
+
+ from typing_extensions import TypeAlias
+
+ from data_designer.config.base import ConfigBase
+
+
+ class ConstraintType(str, Enum):
+     SCALAR_INEQUALITY = "scalar_inequality"
+     COLUMN_INEQUALITY = "column_inequality"
+
+
+ class InequalityOperator(str, Enum):
+     LT = "lt"
+     LE = "le"
+     GT = "gt"
+     GE = "ge"
+
+
+ class Constraint(ConfigBase, ABC):
+     target_column: str
+
+     @property
+     @abstractmethod
+     def constraint_type(self) -> ConstraintType: ...
+
+
+ class ScalarInequalityConstraint(Constraint):
+     rhs: float
+     operator: InequalityOperator
+
+     @property
+     def constraint_type(self) -> ConstraintType:
+         return ConstraintType.SCALAR_INEQUALITY
+
+
+ class ColumnInequalityConstraint(Constraint):
+     rhs: str
+     operator: InequalityOperator
+
+     @property
+     def constraint_type(self) -> ConstraintType:
+         return ConstraintType.COLUMN_INEQUALITY
+
+
+ ColumnConstraintT: TypeAlias = ScalarInequalityConstraint | ColumnInequalityConstraint
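To close out, a small sketch constructing both constraint kinds. Note that this module only defines the configuration schema; the row-wise interpretation suggested in the comments (comparing a target column against a scalar, or against another column) is an assumption about how the sampler consumes these configs.

```python
from data_designer.config.sampler_constraints import (
    ColumnInequalityConstraint,
    ConstraintType,
    InequalityOperator,
    ScalarInequalityConstraint,
)

# Assumed semantics: sampled "age" values should satisfy age < 120.
scalar = ScalarInequalityConstraint(
    target_column="age",
    rhs=120.0,
    operator=InequalityOperator.LT,
)

# Assumed semantics: each row's "start_date" should satisfy start_date <= end_date.
column = ColumnInequalityConstraint(
    target_column="start_date",
    rhs="end_date",
    operator=InequalityOperator.LE,
)

assert scalar.constraint_type is ConstraintType.SCALAR_INEQUALITY
assert column.constraint_type is ConstraintType.COLUMN_INEQUALITY
```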