data-designer 0.3.8rc2__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/cli/commands/__init__.py +1 -1
- data_designer/interface/__init__.py +21 -1
- data_designer/{_version.py → interface/_version.py} +2 -2
- data_designer/interface/data_designer.py +1 -7
- {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0.dist-info}/METADATA +10 -42
- data_designer-0.4.0.dist-info/RECORD +39 -0
- data_designer/__init__.py +0 -17
- data_designer/config/__init__.py +0 -2
- data_designer/config/analysis/__init__.py +0 -2
- data_designer/config/analysis/column_profilers.py +0 -159
- data_designer/config/analysis/column_statistics.py +0 -421
- data_designer/config/analysis/dataset_profiler.py +0 -84
- data_designer/config/analysis/utils/errors.py +0 -10
- data_designer/config/analysis/utils/reporting.py +0 -192
- data_designer/config/base.py +0 -69
- data_designer/config/column_configs.py +0 -470
- data_designer/config/column_types.py +0 -141
- data_designer/config/config_builder.py +0 -595
- data_designer/config/data_designer_config.py +0 -40
- data_designer/config/dataset_builders.py +0 -13
- data_designer/config/dataset_metadata.py +0 -18
- data_designer/config/default_model_settings.py +0 -129
- data_designer/config/errors.py +0 -24
- data_designer/config/exports.py +0 -145
- data_designer/config/interface.py +0 -55
- data_designer/config/models.py +0 -455
- data_designer/config/preview_results.py +0 -41
- data_designer/config/processors.py +0 -148
- data_designer/config/run_config.py +0 -51
- data_designer/config/sampler_constraints.py +0 -52
- data_designer/config/sampler_params.py +0 -639
- data_designer/config/seed.py +0 -116
- data_designer/config/seed_source.py +0 -84
- data_designer/config/seed_source_types.py +0 -19
- data_designer/config/utils/code_lang.py +0 -82
- data_designer/config/utils/constants.py +0 -363
- data_designer/config/utils/errors.py +0 -21
- data_designer/config/utils/info.py +0 -94
- data_designer/config/utils/io_helpers.py +0 -258
- data_designer/config/utils/misc.py +0 -78
- data_designer/config/utils/numerical_helpers.py +0 -30
- data_designer/config/utils/type_helpers.py +0 -106
- data_designer/config/utils/visualization.py +0 -482
- data_designer/config/validator_params.py +0 -94
- data_designer/engine/__init__.py +0 -2
- data_designer/engine/analysis/column_profilers/base.py +0 -49
- data_designer/engine/analysis/column_profilers/judge_score_profiler.py +0 -153
- data_designer/engine/analysis/column_profilers/registry.py +0 -22
- data_designer/engine/analysis/column_statistics.py +0 -145
- data_designer/engine/analysis/dataset_profiler.py +0 -149
- data_designer/engine/analysis/errors.py +0 -9
- data_designer/engine/analysis/utils/column_statistics_calculations.py +0 -234
- data_designer/engine/analysis/utils/judge_score_processing.py +0 -132
- data_designer/engine/column_generators/__init__.py +0 -2
- data_designer/engine/column_generators/generators/__init__.py +0 -2
- data_designer/engine/column_generators/generators/base.py +0 -122
- data_designer/engine/column_generators/generators/embedding.py +0 -35
- data_designer/engine/column_generators/generators/expression.py +0 -55
- data_designer/engine/column_generators/generators/llm_completion.py +0 -113
- data_designer/engine/column_generators/generators/samplers.py +0 -69
- data_designer/engine/column_generators/generators/seed_dataset.py +0 -144
- data_designer/engine/column_generators/generators/validation.py +0 -140
- data_designer/engine/column_generators/registry.py +0 -60
- data_designer/engine/column_generators/utils/errors.py +0 -15
- data_designer/engine/column_generators/utils/generator_classification.py +0 -43
- data_designer/engine/column_generators/utils/judge_score_factory.py +0 -58
- data_designer/engine/column_generators/utils/prompt_renderer.py +0 -100
- data_designer/engine/compiler.py +0 -97
- data_designer/engine/configurable_task.py +0 -71
- data_designer/engine/dataset_builders/artifact_storage.py +0 -283
- data_designer/engine/dataset_builders/column_wise_builder.py +0 -335
- data_designer/engine/dataset_builders/errors.py +0 -15
- data_designer/engine/dataset_builders/multi_column_configs.py +0 -46
- data_designer/engine/dataset_builders/utils/__init__.py +0 -2
- data_designer/engine/dataset_builders/utils/concurrency.py +0 -212
- data_designer/engine/dataset_builders/utils/config_compiler.py +0 -62
- data_designer/engine/dataset_builders/utils/dag.py +0 -62
- data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +0 -200
- data_designer/engine/dataset_builders/utils/errors.py +0 -15
- data_designer/engine/errors.py +0 -51
- data_designer/engine/model_provider.py +0 -77
- data_designer/engine/models/__init__.py +0 -2
- data_designer/engine/models/errors.py +0 -300
- data_designer/engine/models/facade.py +0 -287
- data_designer/engine/models/factory.py +0 -42
- data_designer/engine/models/litellm_overrides.py +0 -179
- data_designer/engine/models/parsers/__init__.py +0 -2
- data_designer/engine/models/parsers/errors.py +0 -34
- data_designer/engine/models/parsers/parser.py +0 -235
- data_designer/engine/models/parsers/postprocessors.py +0 -93
- data_designer/engine/models/parsers/tag_parsers.py +0 -62
- data_designer/engine/models/parsers/types.py +0 -84
- data_designer/engine/models/recipes/base.py +0 -81
- data_designer/engine/models/recipes/response_recipes.py +0 -293
- data_designer/engine/models/registry.py +0 -146
- data_designer/engine/models/telemetry.py +0 -359
- data_designer/engine/models/usage.py +0 -73
- data_designer/engine/models/utils.py +0 -38
- data_designer/engine/processing/ginja/__init__.py +0 -2
- data_designer/engine/processing/ginja/ast.py +0 -65
- data_designer/engine/processing/ginja/environment.py +0 -463
- data_designer/engine/processing/ginja/exceptions.py +0 -56
- data_designer/engine/processing/ginja/record.py +0 -32
- data_designer/engine/processing/gsonschema/__init__.py +0 -2
- data_designer/engine/processing/gsonschema/exceptions.py +0 -15
- data_designer/engine/processing/gsonschema/schema_transformers.py +0 -83
- data_designer/engine/processing/gsonschema/types.py +0 -10
- data_designer/engine/processing/gsonschema/validators.py +0 -202
- data_designer/engine/processing/processors/base.py +0 -13
- data_designer/engine/processing/processors/drop_columns.py +0 -42
- data_designer/engine/processing/processors/registry.py +0 -25
- data_designer/engine/processing/processors/schema_transform.py +0 -49
- data_designer/engine/processing/utils.py +0 -169
- data_designer/engine/registry/base.py +0 -99
- data_designer/engine/registry/data_designer_registry.py +0 -39
- data_designer/engine/registry/errors.py +0 -12
- data_designer/engine/resources/managed_dataset_generator.py +0 -39
- data_designer/engine/resources/managed_dataset_repository.py +0 -197
- data_designer/engine/resources/managed_storage.py +0 -65
- data_designer/engine/resources/resource_provider.py +0 -77
- data_designer/engine/resources/seed_reader.py +0 -154
- data_designer/engine/sampling_gen/column.py +0 -91
- data_designer/engine/sampling_gen/constraints.py +0 -100
- data_designer/engine/sampling_gen/data_sources/base.py +0 -217
- data_designer/engine/sampling_gen/data_sources/errors.py +0 -12
- data_designer/engine/sampling_gen/data_sources/sources.py +0 -347
- data_designer/engine/sampling_gen/entities/__init__.py +0 -2
- data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
- data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +0 -86
- data_designer/engine/sampling_gen/entities/email_address_utils.py +0 -171
- data_designer/engine/sampling_gen/entities/errors.py +0 -10
- data_designer/engine/sampling_gen/entities/national_id_utils.py +0 -102
- data_designer/engine/sampling_gen/entities/person.py +0 -144
- data_designer/engine/sampling_gen/entities/phone_number.py +0 -128
- data_designer/engine/sampling_gen/errors.py +0 -26
- data_designer/engine/sampling_gen/generator.py +0 -122
- data_designer/engine/sampling_gen/jinja_utils.py +0 -64
- data_designer/engine/sampling_gen/people_gen.py +0 -199
- data_designer/engine/sampling_gen/person_constants.py +0 -56
- data_designer/engine/sampling_gen/schema.py +0 -147
- data_designer/engine/sampling_gen/schema_builder.py +0 -61
- data_designer/engine/sampling_gen/utils.py +0 -46
- data_designer/engine/secret_resolver.py +0 -82
- data_designer/engine/validation.py +0 -367
- data_designer/engine/validators/__init__.py +0 -19
- data_designer/engine/validators/base.py +0 -38
- data_designer/engine/validators/local_callable.py +0 -39
- data_designer/engine/validators/python.py +0 -254
- data_designer/engine/validators/remote.py +0 -89
- data_designer/engine/validators/sql.py +0 -65
- data_designer/errors.py +0 -7
- data_designer/essentials/__init__.py +0 -33
- data_designer/lazy_heavy_imports.py +0 -54
- data_designer/logging.py +0 -163
- data_designer/plugin_manager.py +0 -78
- data_designer/plugins/__init__.py +0 -8
- data_designer/plugins/errors.py +0 -15
- data_designer/plugins/plugin.py +0 -141
- data_designer/plugins/registry.py +0 -88
- data_designer/plugins/testing/__init__.py +0 -10
- data_designer/plugins/testing/stubs.py +0 -116
- data_designer/plugins/testing/utils.py +0 -20
- data_designer-0.3.8rc2.dist-info/RECORD +0 -196
- data_designer-0.3.8rc2.dist-info/licenses/LICENSE +0 -201
- {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0.dist-info}/WHEEL +0 -0
- {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,595 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
from __future__ import annotations
|
|
5
|
-
|
|
6
|
-
import json
|
|
7
|
-
import logging
|
|
8
|
-
from pathlib import Path
|
|
9
|
-
|
|
10
|
-
from pygments import highlight
|
|
11
|
-
from pygments.formatters import HtmlFormatter
|
|
12
|
-
from pygments.lexers import PythonLexer
|
|
13
|
-
from typing_extensions import Self
|
|
14
|
-
|
|
15
|
-
from data_designer.config.analysis.column_profilers import ColumnProfilerConfigT
|
|
16
|
-
from data_designer.config.base import ExportableConfigBase
|
|
17
|
-
from data_designer.config.column_configs import SeedDatasetColumnConfig
|
|
18
|
-
from data_designer.config.column_types import (
|
|
19
|
-
ColumnConfigT,
|
|
20
|
-
DataDesignerColumnType,
|
|
21
|
-
get_column_config_from_kwargs,
|
|
22
|
-
get_column_display_order,
|
|
23
|
-
)
|
|
24
|
-
from data_designer.config.data_designer_config import DataDesignerConfig
|
|
25
|
-
from data_designer.config.dataset_builders import BuildStage
|
|
26
|
-
from data_designer.config.default_model_settings import get_default_model_configs
|
|
27
|
-
from data_designer.config.errors import BuilderConfigurationError, BuilderSerializationError, InvalidColumnTypeError
|
|
28
|
-
from data_designer.config.models import ModelConfig, load_model_configs
|
|
29
|
-
from data_designer.config.processors import ProcessorConfigT, ProcessorType, get_processor_config_from_kwargs
|
|
30
|
-
from data_designer.config.sampler_constraints import (
|
|
31
|
-
ColumnConstraintT,
|
|
32
|
-
ColumnInequalityConstraint,
|
|
33
|
-
ConstraintType,
|
|
34
|
-
ScalarInequalityConstraint,
|
|
35
|
-
)
|
|
36
|
-
from data_designer.config.seed import (
|
|
37
|
-
IndexRange,
|
|
38
|
-
PartitionBlock,
|
|
39
|
-
SamplingStrategy,
|
|
40
|
-
SeedConfig,
|
|
41
|
-
)
|
|
42
|
-
from data_designer.config.seed_source import DataFrameSeedSource
|
|
43
|
-
from data_designer.config.seed_source_types import SeedSourceT
|
|
44
|
-
from data_designer.config.utils.constants import DEFAULT_REPR_HTML_STYLE, REPR_HTML_TEMPLATE
|
|
45
|
-
from data_designer.config.utils.info import ConfigBuilderInfo
|
|
46
|
-
from data_designer.config.utils.io_helpers import serialize_data, smart_load_yaml
|
|
47
|
-
from data_designer.config.utils.misc import can_run_data_designer_locally, json_indent_list_of_strings, kebab_to_snake
|
|
48
|
-
from data_designer.config.utils.type_helpers import resolve_string_enum
|
|
49
|
-
|
|
50
|
-
logger = logging.getLogger(__name__)
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
class BuilderConfig(ExportableConfigBase):
|
|
54
|
-
"""Configuration container for Data Designer builder.
|
|
55
|
-
|
|
56
|
-
This class holds the main Data Designer configuration along with optional
|
|
57
|
-
datastore settings needed for seed dataset operations.
|
|
58
|
-
|
|
59
|
-
Attributes:
|
|
60
|
-
data_designer: The main Data Designer configuration containing columns,
|
|
61
|
-
constraints, profilers, and other settings.
|
|
62
|
-
"""
|
|
63
|
-
|
|
64
|
-
data_designer: DataDesignerConfig
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
class DataDesignerConfigBuilder:
|
|
68
|
-
"""Config builder for Data Designer configurations.
|
|
69
|
-
|
|
70
|
-
This class provides a high-level interface for building Data Designer configurations.
|
|
71
|
-
"""
|
|
72
|
-
|
|
73
|
-
@classmethod
|
|
74
|
-
def from_config(cls, config: dict | str | Path | BuilderConfig) -> Self:
|
|
75
|
-
"""Create a DataDesignerConfigBuilder from an existing configuration.
|
|
76
|
-
|
|
77
|
-
Args:
|
|
78
|
-
config: Configuration source. Can be:
|
|
79
|
-
- A dictionary containing the configuration
|
|
80
|
-
- A string or Path to a YAML/JSON configuration file
|
|
81
|
-
- A BuilderConfig object
|
|
82
|
-
|
|
83
|
-
Returns:
|
|
84
|
-
A new instance populated with the configuration from the provided source.
|
|
85
|
-
|
|
86
|
-
Raises:
|
|
87
|
-
ValueError: If the config format is invalid.
|
|
88
|
-
ValidationError: If the builder config loaded from the config is invalid.
|
|
89
|
-
"""
|
|
90
|
-
if isinstance(config, BuilderConfig):
|
|
91
|
-
builder_config = config
|
|
92
|
-
else:
|
|
93
|
-
json_config = json.loads(serialize_data(smart_load_yaml(config)))
|
|
94
|
-
builder_config = BuilderConfig.model_validate(json_config)
|
|
95
|
-
|
|
96
|
-
builder = cls(model_configs=builder_config.data_designer.model_configs)
|
|
97
|
-
data_designer_config = builder_config.data_designer
|
|
98
|
-
|
|
99
|
-
for col in data_designer_config.columns:
|
|
100
|
-
builder.add_column(col)
|
|
101
|
-
|
|
102
|
-
for constraint in data_designer_config.constraints or []:
|
|
103
|
-
builder.add_constraint(constraint=constraint)
|
|
104
|
-
|
|
105
|
-
if (seed_config := data_designer_config.seed_config) is not None:
|
|
106
|
-
builder.with_seed_dataset(
|
|
107
|
-
seed_config.source,
|
|
108
|
-
sampling_strategy=seed_config.sampling_strategy,
|
|
109
|
-
selection_strategy=seed_config.selection_strategy,
|
|
110
|
-
)
|
|
111
|
-
|
|
112
|
-
return builder
|
|
113
|
-
|
|
114
|
-
def __init__(self, model_configs: list[ModelConfig] | str | Path | None = None):
|
|
115
|
-
"""Initialize a new DataDesignerConfigBuilder instance.
|
|
116
|
-
|
|
117
|
-
Args:
|
|
118
|
-
model_configs: Model configurations. Can be:
|
|
119
|
-
- None to use default model configurations in local mode
|
|
120
|
-
- A list of ModelConfig objects
|
|
121
|
-
- A string or Path to a model configuration file
|
|
122
|
-
"""
|
|
123
|
-
self._column_configs = {}
|
|
124
|
-
self._model_configs = _load_model_configs(model_configs)
|
|
125
|
-
self._processor_configs: list[ProcessorConfigT] = []
|
|
126
|
-
self._seed_config: SeedConfig | None = None
|
|
127
|
-
self._constraints: list[ColumnConstraintT] = []
|
|
128
|
-
self._profilers: list[ColumnProfilerConfigT] = []
|
|
129
|
-
|
|
130
|
-
@property
|
|
131
|
-
def model_configs(self) -> list[ModelConfig]:
|
|
132
|
-
"""Get the model configurations for this builder.
|
|
133
|
-
|
|
134
|
-
Returns:
|
|
135
|
-
A list of ModelConfig objects used for data generation.
|
|
136
|
-
"""
|
|
137
|
-
return self._model_configs
|
|
138
|
-
|
|
139
|
-
@property
|
|
140
|
-
def allowed_references(self) -> list[str]:
|
|
141
|
-
"""Get all referenceable variables allowed in prompt templates and expressions.
|
|
142
|
-
|
|
143
|
-
This includes all column names and their side effect columns that can be
|
|
144
|
-
referenced in prompt templates and expressions within the configuration.
|
|
145
|
-
|
|
146
|
-
Returns:
|
|
147
|
-
A list of variable names that can be referenced in templates and expressions.
|
|
148
|
-
"""
|
|
149
|
-
side_effect_columns = sum([[c.name] + c.side_effect_columns for c in self._column_configs.values()], [])
|
|
150
|
-
return list(self._column_configs.keys()) + list(set(side_effect_columns))
|
|
151
|
-
|
|
152
|
-
@property
|
|
153
|
-
def info(self) -> ConfigBuilderInfo:
|
|
154
|
-
"""Get the ConfigBuilderInfo object for this builder.
|
|
155
|
-
|
|
156
|
-
Returns:
|
|
157
|
-
An object containing information about the configuration.
|
|
158
|
-
"""
|
|
159
|
-
return ConfigBuilderInfo(model_configs=self._model_configs)
|
|
160
|
-
|
|
161
|
-
def add_model_config(self, model_config: ModelConfig) -> Self:
|
|
162
|
-
"""Add a model configuration to the current Data Designer configuration.
|
|
163
|
-
|
|
164
|
-
Args:
|
|
165
|
-
model_config: The model configuration to add.
|
|
166
|
-
"""
|
|
167
|
-
if model_config.alias in [mc.alias for mc in self._model_configs]:
|
|
168
|
-
raise BuilderConfigurationError(
|
|
169
|
-
f"🛑 Model configuration with alias {model_config.alias} already exists. Please delete the existing model configuration or choose a different alias."
|
|
170
|
-
)
|
|
171
|
-
self._model_configs.append(model_config)
|
|
172
|
-
return self
|
|
173
|
-
|
|
174
|
-
def delete_model_config(self, alias: str) -> Self:
|
|
175
|
-
"""Delete a model configuration from the current Data Designer configuration by alias.
|
|
176
|
-
|
|
177
|
-
Args:
|
|
178
|
-
alias: The alias of the model configuration to delete.
|
|
179
|
-
"""
|
|
180
|
-
self._model_configs = [mc for mc in self._model_configs if mc.alias != alias]
|
|
181
|
-
if len(self._model_configs) == 0:
|
|
182
|
-
logger.warning(
|
|
183
|
-
f"⚠️ No model configurations found after deleting model configuration with alias {alias}. Please add a model configuration before building the configuration."
|
|
184
|
-
)
|
|
185
|
-
return self
|
|
186
|
-
|
|
187
|
-
def add_column(
|
|
188
|
-
self,
|
|
189
|
-
column_config: ColumnConfigT | None = None,
|
|
190
|
-
*,
|
|
191
|
-
name: str | None = None,
|
|
192
|
-
column_type: DataDesignerColumnType | None = None,
|
|
193
|
-
**kwargs,
|
|
194
|
-
) -> Self:
|
|
195
|
-
"""Add a Data Designer column configuration to the current Data Designer configuration.
|
|
196
|
-
|
|
197
|
-
If no column config object is provided, you must provide the `name`, `column_type`, and any
|
|
198
|
-
additional keyword arguments that are required by the column config constructor.
|
|
199
|
-
|
|
200
|
-
Args:
|
|
201
|
-
column_config: Data Designer column config object to add.
|
|
202
|
-
name: Name of the column to add. This is only used if `column_config` is not provided.
|
|
203
|
-
column_type: Column type to add. This is only used if `column_config` is not provided.
|
|
204
|
-
**kwargs: Additional keyword arguments to pass to the column constructor.
|
|
205
|
-
|
|
206
|
-
Returns:
|
|
207
|
-
The current Data Designer config builder instance.
|
|
208
|
-
|
|
209
|
-
Raises:
|
|
210
|
-
BuilderConfigurationError: If the column name collides with an existing seed dataset column.
|
|
211
|
-
"""
|
|
212
|
-
if column_config is None:
|
|
213
|
-
if name is None or column_type is None:
|
|
214
|
-
raise BuilderConfigurationError(
|
|
215
|
-
"🛑 You must provide either a 'column_config' object or 'name' *and* 'column_type' "
|
|
216
|
-
f"with additional keyword arguments. You provided {column_config=}, {name=}, and {column_type=}."
|
|
217
|
-
)
|
|
218
|
-
column_config = get_column_config_from_kwargs(name=name, column_type=column_type, **kwargs)
|
|
219
|
-
|
|
220
|
-
allowed_column_configs = ColumnConfigT.__args__
|
|
221
|
-
if not any(isinstance(column_config, t) for t in allowed_column_configs):
|
|
222
|
-
raise InvalidColumnTypeError(
|
|
223
|
-
f"🛑 Invalid column config object: '{column_config}'. Valid column config options are: "
|
|
224
|
-
f"{', '.join([t.__name__ for t in allowed_column_configs])}"
|
|
225
|
-
)
|
|
226
|
-
|
|
227
|
-
self._column_configs[column_config.name] = column_config
|
|
228
|
-
return self
|
|
229
|
-
|
|
230
|
-
def add_constraint(
|
|
231
|
-
self,
|
|
232
|
-
constraint: ColumnConstraintT | None = None,
|
|
233
|
-
*,
|
|
234
|
-
constraint_type: ConstraintType | None = None,
|
|
235
|
-
**kwargs,
|
|
236
|
-
) -> Self:
|
|
237
|
-
"""Add a constraint to the current Data Designer configuration.
|
|
238
|
-
|
|
239
|
-
Currently, constraints are only supported for numerical samplers.
|
|
240
|
-
|
|
241
|
-
You can either provide a constraint object directly, or provide a constraint type and
|
|
242
|
-
additional keyword arguments to construct the constraint object. Valid constraint types are:
|
|
243
|
-
- "scalar_inequality": Constraint between a column and a scalar value.
|
|
244
|
-
- "column_inequality": Constraint between two columns.
|
|
245
|
-
|
|
246
|
-
Args:
|
|
247
|
-
constraint: Constraint object to add.
|
|
248
|
-
constraint_type: Constraint type to add. Ignored when `constraint` is provided.
|
|
249
|
-
**kwargs: Additional keyword arguments to pass to the constraint constructor.
|
|
250
|
-
|
|
251
|
-
Returns:
|
|
252
|
-
The current Data Designer config builder instance.
|
|
253
|
-
"""
|
|
254
|
-
if constraint is None:
|
|
255
|
-
if constraint_type is None:
|
|
256
|
-
raise BuilderConfigurationError(
|
|
257
|
-
"🛑 You must provide either a 'constraint' object or 'constraint_type' "
|
|
258
|
-
"with additional keyword arguments."
|
|
259
|
-
)
|
|
260
|
-
try:
|
|
261
|
-
constraint_type = ConstraintType(constraint_type)
|
|
262
|
-
except Exception:
|
|
263
|
-
raise BuilderConfigurationError(
|
|
264
|
-
f"🛑 Invalid constraint type: {constraint_type}. Valid options are: "
|
|
265
|
-
f"{', '.join([t.value for t in ConstraintType])}"
|
|
266
|
-
)
|
|
267
|
-
if constraint_type == ConstraintType.SCALAR_INEQUALITY:
|
|
268
|
-
constraint = ScalarInequalityConstraint(**kwargs)
|
|
269
|
-
elif constraint_type == ConstraintType.COLUMN_INEQUALITY:
|
|
270
|
-
constraint = ColumnInequalityConstraint(**kwargs)
|
|
271
|
-
|
|
272
|
-
allowed_constraint_types = ColumnConstraintT.__args__
|
|
273
|
-
if not any(isinstance(constraint, t) for t in allowed_constraint_types):
|
|
274
|
-
raise BuilderConfigurationError(
|
|
275
|
-
"🛑 Invalid constraint object. Valid constraint options are: "
|
|
276
|
-
f"{', '.join([t.__name__ for t in allowed_constraint_types])}"
|
|
277
|
-
)
|
|
278
|
-
|
|
279
|
-
self._constraints.append(constraint)
|
|
280
|
-
return self
|
|
281
|
-
|
|
282
|
-
def add_processor(
|
|
283
|
-
self,
|
|
284
|
-
processor_config: ProcessorConfigT | None = None,
|
|
285
|
-
*,
|
|
286
|
-
processor_type: ProcessorType | None = None,
|
|
287
|
-
**kwargs,
|
|
288
|
-
) -> Self:
|
|
289
|
-
"""Add a processor to the current Data Designer configuration.
|
|
290
|
-
|
|
291
|
-
You can either provide a processor config object directly, or provide a processor type and
|
|
292
|
-
additional keyword arguments to construct the processor config object.
|
|
293
|
-
|
|
294
|
-
Args:
|
|
295
|
-
processor_config: The processor configuration object to add.
|
|
296
|
-
processor_type: The type of processor to add.
|
|
297
|
-
**kwargs: Additional keyword arguments to pass to the processor constructor.
|
|
298
|
-
|
|
299
|
-
Returns:
|
|
300
|
-
The current Data Designer config builder instance.
|
|
301
|
-
"""
|
|
302
|
-
if processor_config is None:
|
|
303
|
-
if processor_type is None:
|
|
304
|
-
raise BuilderConfigurationError(
|
|
305
|
-
"🛑 You must provide either a 'processor_config' object or 'processor_type' "
|
|
306
|
-
"with additional keyword arguments."
|
|
307
|
-
)
|
|
308
|
-
processor_config = get_processor_config_from_kwargs(processor_type=processor_type, **kwargs)
|
|
309
|
-
|
|
310
|
-
# Checks elsewhere fail if DropColumnsProcessor drops a column but it is not marked for drop
|
|
311
|
-
if processor_config.processor_type == ProcessorType.DROP_COLUMNS:
|
|
312
|
-
for column in processor_config.column_names:
|
|
313
|
-
if column in self._column_configs:
|
|
314
|
-
self._column_configs[column].drop = True
|
|
315
|
-
|
|
316
|
-
self._processor_configs.append(processor_config)
|
|
317
|
-
return self
|
|
318
|
-
|
|
319
|
-
def add_profiler(self, profiler_config: ColumnProfilerConfigT) -> Self:
|
|
320
|
-
"""Add a profiler to the current Data Designer configuration.
|
|
321
|
-
|
|
322
|
-
Args:
|
|
323
|
-
profiler_config: The profiler configuration object to add.
|
|
324
|
-
|
|
325
|
-
Returns:
|
|
326
|
-
The current Data Designer config builder instance.
|
|
327
|
-
|
|
328
|
-
Raises:
|
|
329
|
-
BuilderConfigurationError: If the profiler configuration is of an invalid type.
|
|
330
|
-
"""
|
|
331
|
-
if not isinstance(profiler_config, ColumnProfilerConfigT):
|
|
332
|
-
if hasattr(ColumnProfilerConfigT, "__args__"):
|
|
333
|
-
valid_options = ", ".join([t.__name__ for t in ColumnProfilerConfigT.__args__])
|
|
334
|
-
else:
|
|
335
|
-
valid_options = ColumnProfilerConfigT.__name__
|
|
336
|
-
raise BuilderConfigurationError(f"🛑 Invalid profiler object. Valid profiler options are: {valid_options}")
|
|
337
|
-
self._profilers.append(profiler_config)
|
|
338
|
-
return self
|
|
339
|
-
|
|
340
|
-
def get_profilers(self) -> list[ColumnProfilerConfigT]:
|
|
341
|
-
"""Get all profilers.
|
|
342
|
-
|
|
343
|
-
Returns:
|
|
344
|
-
A list of profiler configuration objects.
|
|
345
|
-
"""
|
|
346
|
-
return self._profilers
|
|
347
|
-
|
|
348
|
-
def build(self) -> DataDesignerConfig:
|
|
349
|
-
"""Build a DataDesignerConfig instance based on the current builder configuration.
|
|
350
|
-
|
|
351
|
-
Returns:
|
|
352
|
-
The current Data Designer config object.
|
|
353
|
-
"""
|
|
354
|
-
return DataDesignerConfig(
|
|
355
|
-
model_configs=self._model_configs,
|
|
356
|
-
seed_config=self._seed_config,
|
|
357
|
-
columns=list(self._column_configs.values()),
|
|
358
|
-
constraints=self._constraints or None,
|
|
359
|
-
profilers=self._profilers or None,
|
|
360
|
-
processors=self._processor_configs or None,
|
|
361
|
-
)
|
|
362
|
-
|
|
363
|
-
def delete_constraints(self, target_column: str) -> Self:
|
|
364
|
-
"""Delete all constraints for the given target column.
|
|
365
|
-
|
|
366
|
-
Args:
|
|
367
|
-
target_column: Name of the column to remove constraints for.
|
|
368
|
-
|
|
369
|
-
Returns:
|
|
370
|
-
The current Data Designer config builder instance.
|
|
371
|
-
"""
|
|
372
|
-
self._constraints = [c for c in self._constraints if c.target_column != target_column]
|
|
373
|
-
return self
|
|
374
|
-
|
|
375
|
-
def delete_column(self, column_name: str) -> Self:
|
|
376
|
-
"""Delete the column with the given name.
|
|
377
|
-
|
|
378
|
-
Args:
|
|
379
|
-
column_name: Name of the column to delete.
|
|
380
|
-
|
|
381
|
-
Returns:
|
|
382
|
-
The current Data Designer config builder instance.
|
|
383
|
-
|
|
384
|
-
Raises:
|
|
385
|
-
BuilderConfigurationError: If trying to delete a seed dataset column.
|
|
386
|
-
"""
|
|
387
|
-
if isinstance(self._column_configs.get(column_name), SeedDatasetColumnConfig):
|
|
388
|
-
raise BuilderConfigurationError("Seed columns cannot be deleted. Please update the seed dataset instead.")
|
|
389
|
-
self._column_configs.pop(column_name, None)
|
|
390
|
-
return self
|
|
391
|
-
|
|
392
|
-
def get_column_config(self, name: str) -> ColumnConfigT:
|
|
393
|
-
"""Get a column configuration by name.
|
|
394
|
-
|
|
395
|
-
Args:
|
|
396
|
-
name: Name of the column to retrieve the config for.
|
|
397
|
-
|
|
398
|
-
Returns:
|
|
399
|
-
The column configuration object.
|
|
400
|
-
|
|
401
|
-
Raises:
|
|
402
|
-
KeyError: If no column with the given name exists.
|
|
403
|
-
"""
|
|
404
|
-
return self._column_configs[name]
|
|
405
|
-
|
|
406
|
-
def get_column_configs(self) -> list[ColumnConfigT]:
|
|
407
|
-
"""Get all column configurations.
|
|
408
|
-
|
|
409
|
-
Returns:
|
|
410
|
-
A list of all column configuration objects.
|
|
411
|
-
"""
|
|
412
|
-
return list(self._column_configs.values())
|
|
413
|
-
|
|
414
|
-
def get_constraints(self, target_column: str) -> list[ColumnConstraintT]:
|
|
415
|
-
"""Get all constraints for the given target column.
|
|
416
|
-
|
|
417
|
-
Args:
|
|
418
|
-
target_column: Name of the column to get constraints for.
|
|
419
|
-
|
|
420
|
-
Returns:
|
|
421
|
-
A list of constraint objects targeting the specified column.
|
|
422
|
-
"""
|
|
423
|
-
return [c for c in self._constraints if c.target_column == target_column]
|
|
424
|
-
|
|
425
|
-
def get_columns_of_type(self, column_type: DataDesignerColumnType) -> list[ColumnConfigT]:
|
|
426
|
-
"""Get all column configurations of the specified type.
|
|
427
|
-
|
|
428
|
-
Args:
|
|
429
|
-
column_type: The type of columns to filter by.
|
|
430
|
-
|
|
431
|
-
Returns:
|
|
432
|
-
A list of column configurations matching the specified type.
|
|
433
|
-
"""
|
|
434
|
-
column_type = resolve_string_enum(column_type, DataDesignerColumnType)
|
|
435
|
-
return [c for c in self._column_configs.values() if c.column_type == column_type]
|
|
436
|
-
|
|
437
|
-
def get_columns_excluding_type(self, column_type: DataDesignerColumnType) -> list[ColumnConfigT]:
|
|
438
|
-
"""Get all column configurations excluding the specified type.
|
|
439
|
-
|
|
440
|
-
Args:
|
|
441
|
-
column_type: The type of columns to exclude.
|
|
442
|
-
|
|
443
|
-
Returns:
|
|
444
|
-
A list of column configurations that do not match the specified type.
|
|
445
|
-
"""
|
|
446
|
-
column_type = resolve_string_enum(column_type, DataDesignerColumnType)
|
|
447
|
-
return [c for c in self._column_configs.values() if c.column_type != column_type]
|
|
448
|
-
|
|
449
|
-
def get_processor_configs(self) -> dict[BuildStage, list[ProcessorConfigT]]:
|
|
450
|
-
"""Get processor configuration objects.
|
|
451
|
-
|
|
452
|
-
Returns:
|
|
453
|
-
A dictionary of processor configuration objects by dataset builder stage.
|
|
454
|
-
"""
|
|
455
|
-
return self._processor_configs
|
|
456
|
-
|
|
457
|
-
def get_seed_config(self) -> SeedConfig | None:
|
|
458
|
-
"""Get the seed config for the current Data Designer configuration.
|
|
459
|
-
|
|
460
|
-
Returns:
|
|
461
|
-
The seed config if configured, None otherwise.
|
|
462
|
-
"""
|
|
463
|
-
return self._seed_config
|
|
464
|
-
|
|
465
|
-
def num_columns_of_type(self, column_type: DataDesignerColumnType) -> int:
|
|
466
|
-
"""Get the count of columns of the specified type.
|
|
467
|
-
|
|
468
|
-
Args:
|
|
469
|
-
column_type: The type of columns to count.
|
|
470
|
-
|
|
471
|
-
Returns:
|
|
472
|
-
The number of columns matching the specified type.
|
|
473
|
-
"""
|
|
474
|
-
return len(self.get_columns_of_type(column_type))
|
|
475
|
-
|
|
476
|
-
def with_seed_dataset(
    self,
    seed_source: SeedSourceT,
    *,
    sampling_strategy: SamplingStrategy = SamplingStrategy.ORDERED,
    selection_strategy: IndexRange | PartitionBlock | None = None,
) -> Self:
    """Attach a seed dataset to this builder.

    Only the seed settings are recorded here. Columns are not resolved until
    compilation (including validation) is performed by the engine using a SeedReader.
    Any previously stored seed config is replaced.

    Args:
        seed_source: The pointer to the seed dataset.
        sampling_strategy: How rows are sampled from the seed dataset during
            generation. Defaults to ORDERED sampling.
        selection_strategy: Optional selection strategy (an ``IndexRange`` or
            ``PartitionBlock``) applied to the seed dataset. Defaults to None.

    Returns:
        This builder instance, so calls can be chained.
    """
    seed_settings = {
        "source": seed_source,
        "sampling_strategy": sampling_strategy,
        "selection_strategy": selection_strategy,
    }
    self._seed_config = SeedConfig(**seed_settings)
    return self
|
|
504
|
-
|
|
505
|
-
def write_config(self, path: str | Path, indent: int | None = 2, **kwargs) -> None:
    """Write the current configuration to a YAML or JSON file.

    The output format is chosen from the file extension of ``path``. The extension
    is compared case-insensitively, so ``config.YAML`` is handled like ``config.yaml``.

    Args:
        path: Path to the file to write the configuration to.
        indent: Indentation level for the output file (default: 2).
        **kwargs: Additional keyword arguments passed to the serialization methods used.

    Raises:
        BuilderSerializationError: If the builder was configured with a DataFrame
            seed source, which cannot be written to a config file.
        BuilderConfigurationError: If the file extension is not `.yaml`, `.yml` or `.json`.
    """
    # Refuse early: an in-memory DataFrame seed has no serializable representation.
    if (seed_config := self.get_seed_config()) is not None and isinstance(seed_config.source, DataFrameSeedSource):
        raise BuilderSerializationError(
            "This builder was configured with a DataFrame seed dataset. "
            "DataFrame seeds cannot be serialized to config files. "
            "To serialize this configuration, change your seed dataset to a more persistent, serializable source format. "
            "For example, you could make a local file seed source from the dataframe:\n\n"
            "LocalFileSeedSource.from_dataframe(my_dataframe, '/path/to/data.parquet')"
        )

    cfg = self.get_builder_config()
    suffix = Path(path).suffix
    # Match on the lower-cased extension so `.YAML` / `.Json` are accepted too;
    # the original spelling is kept for the error message below.
    normalized_suffix = suffix.lower()
    if normalized_suffix in {".yaml", ".yml"}:
        cfg.to_yaml(path, indent=indent, **kwargs)
    elif normalized_suffix == ".json":
        cfg.to_json(path, indent=indent, **kwargs)
    else:
        raise BuilderConfigurationError(f"🛑 Unsupported file type: {suffix}. Must be `.yaml`, `.yml` or `.json`.")
|
|
534
|
-
|
|
535
|
-
def get_builder_config(self) -> BuilderConfig:
    """Wrap the built Data Designer configuration in a ``BuilderConfig``.

    Returns:
        A ``BuilderConfig`` whose ``data_designer`` field is the result of ``self.build()``.
    """
    data_designer_config = self.build()
    return BuilderConfig(data_designer=data_designer_config)
|
|
542
|
-
|
|
543
|
-
def __repr__(self) -> str:
    """Build a multi-line, human-readable summary of this builder.

    Returns:
        ``ClassName()`` when no columns are configured. Otherwise a block listing the
        seed dataset (when one is set) followed by the column names grouped by column
        type, in the order given by ``get_column_display_order()``.
    """
    class_name = self.__class__.__name__
    if len(self._column_configs) == 0:
        return f"{class_name}()"

    seed_label = None
    if self._seed_config is not None:
        seed_label = f"{self._seed_config.source.seed_type} seed"
    props_to_repr = {"seed_dataset": seed_label}

    for column_type in get_column_display_order():
        columns = self.get_columns_of_type(column_type)
        if len(columns) == 0:
            continue
        column_label = f"{kebab_to_snake(column_type.value)}_columns"
        column_names = [c.name for c in columns]
        props_to_repr[column_label] = json_indent_list_of_strings(column_names, indent=8)

    pieces = [f"{class_name}(\n"]
    for key, value in props_to_repr.items():
        if value is None:
            continue
        if "[" in value:
            # List-valued entries: re-insert spacing before the final character
            # (the closing bracket of the rendered list).
            value = f"{value[:-1]}" + " " + value[-1]
        pieces.append(f" {key}: {value}\n")
    pieces.append(")")
    return "".join(pieces)
|
|
569
|
-
|
|
570
|
-
def _repr_html_(self) -> str:
    """Return an HTML rendering of this builder's ``__repr__`` output.

    The plain-text representation is highlighted with ``PythonLexer`` and an
    ``HtmlFormatter``, then inserted into ``REPR_HTML_TEMPLATE`` together with
    the formatter's CSS.

    Returns:
        HTML string with syntax highlighting for the builder representation.
    """
    plain_text = self.__repr__()
    html_formatter = HtmlFormatter(style=DEFAULT_REPR_HTML_STYLE, cssclass="code")
    highlighted = highlight(plain_text, PythonLexer(), html_formatter)
    return REPR_HTML_TEMPLATE.format(
        css=html_formatter.get_style_defs(".code"),
        highlighted_html=highlighted,
    )
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
def _load_model_configs(model_configs: list[ModelConfig] | str | Path | None = None) -> list[ModelConfig]:
    """Resolve ``model_configs`` into a list of model configurations.

    Args:
        model_configs: A list of model configs, or a string / Path to a model
            configuration file. ``None`` or an empty value triggers the fallback.

    Returns:
        The result of ``load_model_configs(model_configs)`` when a truthy value is
        given; otherwise the default model configurations, provided
        ``can_run_data_designer_locally()`` is true.

    Raises:
        BuilderConfigurationError: If no model configs were given and defaults
            are not available.
    """
    if model_configs:
        return load_model_configs(model_configs)
    if not can_run_data_designer_locally():
        raise BuilderConfigurationError("🛑 Model configurations are required!")
    return get_default_model_configs()
|
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
from __future__ import annotations
|
|
5
|
-
|
|
6
|
-
from typing import Annotated
|
|
7
|
-
|
|
8
|
-
from pydantic import Field
|
|
9
|
-
|
|
10
|
-
from data_designer.config.analysis.column_profilers import ColumnProfilerConfigT
|
|
11
|
-
from data_designer.config.base import ExportableConfigBase
|
|
12
|
-
from data_designer.config.column_types import ColumnConfigT
|
|
13
|
-
from data_designer.config.models import ModelConfig
|
|
14
|
-
from data_designer.config.processors import ProcessorConfigT
|
|
15
|
-
from data_designer.config.sampler_constraints import ColumnConstraintT
|
|
16
|
-
from data_designer.config.seed import SeedConfig
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
class DataDesignerConfig(ExportableConfigBase):
    """Configuration for NeMo Data Designer.

    This class defines the main configuration structure for NeMo Data Designer,
    which orchestrates the generation of synthetic data.

    Attributes:
        columns: Required list of column configurations defining how each column
            should be generated. Must contain at least one column.
        model_configs: Optional list of model configurations for LLM-based generation.
            Each model config defines the model, provider, and inference parameters.
        seed_config: Optional seed dataset settings to use for generation.
        constraints: Optional list of column constraints.
        profilers: Optional list of column profilers for analyzing generated data characteristics.
        processors: Optional list of processor configurations, discriminated by
            their ``processor_type`` field.
    """

    # Each entry is a discriminated union keyed on `column_type`; min_length=1 rejects an empty list.
    columns: list[Annotated[ColumnConfigT, Field(discriminator="column_type")]] = Field(min_length=1)
    model_configs: list[ModelConfig] | None = None
    seed_config: SeedConfig | None = None
    constraints: list[ColumnConstraintT] | None = None
    profilers: list[ColumnProfilerConfigT] | None = None
    # Each entry is a discriminated union keyed on `processor_type`.
    processors: list[Annotated[ProcessorConfigT, Field(discriminator="processor_type")]] | None = None
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
from __future__ import annotations
|
|
5
|
-
|
|
6
|
-
from enum import Enum
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class BuildStage(str, Enum):
    """String-valued enumeration naming the stages of a dataset build.

    Because the class also derives from ``str``, each member compares equal to
    its string value (for example ``BuildStage.PRE_BATCH == "pre_batch"``).
    """

    PRE_BATCH = "pre_batch"
    POST_BATCH = "post_batch"
    PRE_GENERATION = "pre_generation"
    POST_GENERATION = "post_generation"
|
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
from pydantic import BaseModel
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class DatasetMetadata(BaseModel):
    """Metadata about a generated dataset.

    This object is created by the engine and passed to results objects for use
    in visualization and other client-side utilities. It is designed to be
    serializable so it can be sent over the wire in a client-server architecture.

    Attributes:
        seed_column_names: Names of columns from the seed dataset. Empty list if no seed dataset.
    """

    # NOTE(review): a literal `[]` default is presumably safe here because pydantic
    # copies mutable field defaults per instance — confirm for the pinned pydantic version.
    seed_column_names: list[str] = []
|