data-designer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. data_designer/__init__.py +15 -0
  2. data_designer/_version.py +34 -0
  3. data_designer/cli/README.md +236 -0
  4. data_designer/cli/__init__.py +6 -0
  5. data_designer/cli/commands/__init__.py +2 -0
  6. data_designer/cli/commands/list.py +130 -0
  7. data_designer/cli/commands/models.py +10 -0
  8. data_designer/cli/commands/providers.py +11 -0
  9. data_designer/cli/commands/reset.py +100 -0
  10. data_designer/cli/controllers/__init__.py +7 -0
  11. data_designer/cli/controllers/model_controller.py +246 -0
  12. data_designer/cli/controllers/provider_controller.py +317 -0
  13. data_designer/cli/forms/__init__.py +20 -0
  14. data_designer/cli/forms/builder.py +51 -0
  15. data_designer/cli/forms/field.py +180 -0
  16. data_designer/cli/forms/form.py +59 -0
  17. data_designer/cli/forms/model_builder.py +125 -0
  18. data_designer/cli/forms/provider_builder.py +76 -0
  19. data_designer/cli/main.py +44 -0
  20. data_designer/cli/repositories/__init__.py +8 -0
  21. data_designer/cli/repositories/base.py +39 -0
  22. data_designer/cli/repositories/model_repository.py +42 -0
  23. data_designer/cli/repositories/provider_repository.py +43 -0
  24. data_designer/cli/services/__init__.py +7 -0
  25. data_designer/cli/services/model_service.py +116 -0
  26. data_designer/cli/services/provider_service.py +111 -0
  27. data_designer/cli/ui.py +448 -0
  28. data_designer/cli/utils.py +47 -0
  29. data_designer/config/__init__.py +2 -0
  30. data_designer/config/analysis/column_profilers.py +89 -0
  31. data_designer/config/analysis/column_statistics.py +274 -0
  32. data_designer/config/analysis/dataset_profiler.py +60 -0
  33. data_designer/config/analysis/utils/errors.py +8 -0
  34. data_designer/config/analysis/utils/reporting.py +188 -0
  35. data_designer/config/base.py +68 -0
  36. data_designer/config/column_configs.py +354 -0
  37. data_designer/config/column_types.py +168 -0
  38. data_designer/config/config_builder.py +660 -0
  39. data_designer/config/data_designer_config.py +40 -0
  40. data_designer/config/dataset_builders.py +11 -0
  41. data_designer/config/datastore.py +151 -0
  42. data_designer/config/default_model_settings.py +123 -0
  43. data_designer/config/errors.py +19 -0
  44. data_designer/config/interface.py +54 -0
  45. data_designer/config/models.py +231 -0
  46. data_designer/config/preview_results.py +32 -0
  47. data_designer/config/processors.py +41 -0
  48. data_designer/config/sampler_constraints.py +51 -0
  49. data_designer/config/sampler_params.py +604 -0
  50. data_designer/config/seed.py +145 -0
  51. data_designer/config/utils/code_lang.py +83 -0
  52. data_designer/config/utils/constants.py +313 -0
  53. data_designer/config/utils/errors.py +19 -0
  54. data_designer/config/utils/info.py +88 -0
  55. data_designer/config/utils/io_helpers.py +273 -0
  56. data_designer/config/utils/misc.py +81 -0
  57. data_designer/config/utils/numerical_helpers.py +28 -0
  58. data_designer/config/utils/type_helpers.py +100 -0
  59. data_designer/config/utils/validation.py +336 -0
  60. data_designer/config/utils/visualization.py +427 -0
  61. data_designer/config/validator_params.py +96 -0
  62. data_designer/engine/__init__.py +2 -0
  63. data_designer/engine/analysis/column_profilers/base.py +55 -0
  64. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +160 -0
  65. data_designer/engine/analysis/column_profilers/registry.py +20 -0
  66. data_designer/engine/analysis/column_statistics.py +142 -0
  67. data_designer/engine/analysis/dataset_profiler.py +125 -0
  68. data_designer/engine/analysis/errors.py +7 -0
  69. data_designer/engine/analysis/utils/column_statistics_calculations.py +209 -0
  70. data_designer/engine/analysis/utils/judge_score_processing.py +128 -0
  71. data_designer/engine/column_generators/__init__.py +2 -0
  72. data_designer/engine/column_generators/generators/__init__.py +2 -0
  73. data_designer/engine/column_generators/generators/base.py +61 -0
  74. data_designer/engine/column_generators/generators/expression.py +63 -0
  75. data_designer/engine/column_generators/generators/llm_generators.py +172 -0
  76. data_designer/engine/column_generators/generators/samplers.py +75 -0
  77. data_designer/engine/column_generators/generators/seed_dataset.py +149 -0
  78. data_designer/engine/column_generators/generators/validation.py +147 -0
  79. data_designer/engine/column_generators/registry.py +56 -0
  80. data_designer/engine/column_generators/utils/errors.py +13 -0
  81. data_designer/engine/column_generators/utils/judge_score_factory.py +57 -0
  82. data_designer/engine/column_generators/utils/prompt_renderer.py +98 -0
  83. data_designer/engine/configurable_task.py +82 -0
  84. data_designer/engine/dataset_builders/artifact_storage.py +181 -0
  85. data_designer/engine/dataset_builders/column_wise_builder.py +287 -0
  86. data_designer/engine/dataset_builders/errors.py +13 -0
  87. data_designer/engine/dataset_builders/multi_column_configs.py +44 -0
  88. data_designer/engine/dataset_builders/utils/__init__.py +2 -0
  89. data_designer/engine/dataset_builders/utils/concurrency.py +184 -0
  90. data_designer/engine/dataset_builders/utils/config_compiler.py +60 -0
  91. data_designer/engine/dataset_builders/utils/dag.py +56 -0
  92. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +190 -0
  93. data_designer/engine/dataset_builders/utils/errors.py +13 -0
  94. data_designer/engine/errors.py +49 -0
  95. data_designer/engine/model_provider.py +75 -0
  96. data_designer/engine/models/__init__.py +2 -0
  97. data_designer/engine/models/errors.py +308 -0
  98. data_designer/engine/models/facade.py +225 -0
  99. data_designer/engine/models/litellm_overrides.py +162 -0
  100. data_designer/engine/models/parsers/__init__.py +2 -0
  101. data_designer/engine/models/parsers/errors.py +34 -0
  102. data_designer/engine/models/parsers/parser.py +236 -0
  103. data_designer/engine/models/parsers/postprocessors.py +93 -0
  104. data_designer/engine/models/parsers/tag_parsers.py +60 -0
  105. data_designer/engine/models/parsers/types.py +82 -0
  106. data_designer/engine/models/recipes/base.py +79 -0
  107. data_designer/engine/models/recipes/response_recipes.py +291 -0
  108. data_designer/engine/models/registry.py +118 -0
  109. data_designer/engine/models/usage.py +75 -0
  110. data_designer/engine/models/utils.py +38 -0
  111. data_designer/engine/processing/ginja/__init__.py +2 -0
  112. data_designer/engine/processing/ginja/ast.py +64 -0
  113. data_designer/engine/processing/ginja/environment.py +461 -0
  114. data_designer/engine/processing/ginja/exceptions.py +54 -0
  115. data_designer/engine/processing/ginja/record.py +30 -0
  116. data_designer/engine/processing/gsonschema/__init__.py +2 -0
  117. data_designer/engine/processing/gsonschema/exceptions.py +8 -0
  118. data_designer/engine/processing/gsonschema/schema_transformers.py +81 -0
  119. data_designer/engine/processing/gsonschema/types.py +8 -0
  120. data_designer/engine/processing/gsonschema/validators.py +143 -0
  121. data_designer/engine/processing/processors/base.py +15 -0
  122. data_designer/engine/processing/processors/drop_columns.py +46 -0
  123. data_designer/engine/processing/processors/registry.py +20 -0
  124. data_designer/engine/processing/utils.py +120 -0
  125. data_designer/engine/registry/base.py +97 -0
  126. data_designer/engine/registry/data_designer_registry.py +37 -0
  127. data_designer/engine/registry/errors.py +10 -0
  128. data_designer/engine/resources/managed_dataset_generator.py +35 -0
  129. data_designer/engine/resources/managed_dataset_repository.py +194 -0
  130. data_designer/engine/resources/managed_storage.py +63 -0
  131. data_designer/engine/resources/resource_provider.py +46 -0
  132. data_designer/engine/resources/seed_dataset_data_store.py +66 -0
  133. data_designer/engine/sampling_gen/column.py +89 -0
  134. data_designer/engine/sampling_gen/constraints.py +95 -0
  135. data_designer/engine/sampling_gen/data_sources/base.py +214 -0
  136. data_designer/engine/sampling_gen/data_sources/errors.py +10 -0
  137. data_designer/engine/sampling_gen/data_sources/sources.py +342 -0
  138. data_designer/engine/sampling_gen/entities/__init__.py +2 -0
  139. data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
  140. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +64 -0
  141. data_designer/engine/sampling_gen/entities/email_address_utils.py +169 -0
  142. data_designer/engine/sampling_gen/entities/errors.py +8 -0
  143. data_designer/engine/sampling_gen/entities/national_id_utils.py +100 -0
  144. data_designer/engine/sampling_gen/entities/person.py +142 -0
  145. data_designer/engine/sampling_gen/entities/phone_number.py +122 -0
  146. data_designer/engine/sampling_gen/errors.py +24 -0
  147. data_designer/engine/sampling_gen/generator.py +121 -0
  148. data_designer/engine/sampling_gen/jinja_utils.py +60 -0
  149. data_designer/engine/sampling_gen/people_gen.py +203 -0
  150. data_designer/engine/sampling_gen/person_constants.py +54 -0
  151. data_designer/engine/sampling_gen/schema.py +143 -0
  152. data_designer/engine/sampling_gen/schema_builder.py +59 -0
  153. data_designer/engine/sampling_gen/utils.py +40 -0
  154. data_designer/engine/secret_resolver.py +80 -0
  155. data_designer/engine/validators/__init__.py +17 -0
  156. data_designer/engine/validators/base.py +36 -0
  157. data_designer/engine/validators/local_callable.py +34 -0
  158. data_designer/engine/validators/python.py +245 -0
  159. data_designer/engine/validators/remote.py +83 -0
  160. data_designer/engine/validators/sql.py +60 -0
  161. data_designer/errors.py +5 -0
  162. data_designer/essentials/__init__.py +137 -0
  163. data_designer/interface/__init__.py +2 -0
  164. data_designer/interface/data_designer.py +351 -0
  165. data_designer/interface/errors.py +16 -0
  166. data_designer/interface/results.py +55 -0
  167. data_designer/logging.py +161 -0
  168. data_designer/plugin_manager.py +83 -0
  169. data_designer/plugins/__init__.py +6 -0
  170. data_designer/plugins/errors.py +10 -0
  171. data_designer/plugins/plugin.py +69 -0
  172. data_designer/plugins/registry.py +86 -0
  173. data_designer-0.1.0.dist-info/METADATA +173 -0
  174. data_designer-0.1.0.dist-info/RECORD +177 -0
  175. data_designer-0.1.0.dist-info/WHEEL +4 -0
  176. data_designer-0.1.0.dist-info/entry_points.txt +2 -0
  177. data_designer-0.1.0.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,660 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ import json
7
+ import logging
8
+ from pathlib import Path
9
+ from typing import Optional, Union
10
+
11
+ from pygments import highlight
12
+ from pygments.formatters import HtmlFormatter
13
+ from pygments.lexers import PythonLexer
14
+ from typing_extensions import Self
15
+
16
+ from .analysis.column_profilers import ColumnProfilerConfigT
17
+ from .base import ExportableConfigBase
18
+ from .column_configs import SeedDatasetColumnConfig
19
+ from .column_types import (
20
+ ColumnConfigT,
21
+ DataDesignerColumnType,
22
+ column_type_is_llm_generated,
23
+ get_column_config_from_kwargs,
24
+ get_column_display_order,
25
+ )
26
+ from .data_designer_config import DataDesignerConfig
27
+ from .dataset_builders import BuildStage
28
+ from .datastore import DatastoreSettings, fetch_seed_dataset_column_names
29
+ from .default_model_settings import get_default_model_configs, resolve_seed_default_model_settings
30
+ from .errors import BuilderConfigurationError, InvalidColumnTypeError, InvalidConfigError
31
+ from .models import ModelConfig, load_model_configs
32
+ from .processors import ProcessorConfig, ProcessorType, get_processor_config_from_kwargs
33
+ from .sampler_constraints import (
34
+ ColumnConstraintT,
35
+ ColumnInequalityConstraint,
36
+ ConstraintType,
37
+ ScalarInequalityConstraint,
38
+ )
39
+ from .seed import (
40
+ DatastoreSeedDatasetReference,
41
+ IndexRange,
42
+ LocalSeedDatasetReference,
43
+ PartitionBlock,
44
+ SamplingStrategy,
45
+ SeedConfig,
46
+ SeedDatasetReference,
47
+ )
48
+ from .utils.constants import DEFAULT_REPR_HTML_STYLE, REPR_HTML_TEMPLATE
49
+ from .utils.info import ConfigBuilderInfo
50
+ from .utils.io_helpers import serialize_data, smart_load_yaml
51
+ from .utils.misc import can_run_data_designer_locally, json_indent_list_of_strings, kebab_to_snake
52
+ from .utils.type_helpers import resolve_string_enum
53
+ from .utils.validation import ViolationLevel, rich_print_violations, validate_data_designer_config
54
+
55
+ logger = logging.getLogger(__name__)
56
+
57
+ # Import-time side effect: resolve the default model settings (only when Data Designer can run locally) so they are available as soon as the library is used.
58
+ if can_run_data_designer_locally():
59
+ resolve_seed_default_model_settings()
60
+
61
+
62
class BuilderConfig(ExportableConfigBase):
    """Configuration container for the Data Designer builder.

    Bundles the main Data Designer configuration with the optional datastore
    settings needed for seed dataset operations.

    Attributes:
        data_designer: The main Data Designer configuration containing columns,
            constraints, profilers, and other settings.
        datastore_settings: Optional datastore settings for accessing external
            datasets. May be omitted from a config, in which case it is None.
    """

    data_designer: DataDesignerConfig
    # Explicit default: on a pydantic model (this class is loaded via `model_validate`), an
    # `Optional[...]` annotation without a default is still a *required* field, so configs that
    # simply omit the key would be rejected.
    datastore_settings: Optional[DatastoreSettings] = None
77
+
78
+
79
+ class DataDesignerConfigBuilder:
80
+ """Config builder for Data Designer configurations.
81
+
82
+ This class provides a high-level interface for building Data Designer configurations.
83
+ """
84
+
85
@classmethod
def from_config(cls, config: Union[dict, str, Path, BuilderConfig]) -> Self:
    """Create a DataDesignerConfigBuilder from an existing configuration.

    Columns, constraints, processors, profilers and the seed dataset found in the
    configuration are all re-applied to the new builder through its public methods.

    Args:
        config: Configuration source. Can be:
            - A dictionary containing the configuration
            - A string or Path to a YAML/JSON configuration file
            - A BuilderConfig object

    Returns:
        A new instance populated with the configuration from the provided source.

    Raises:
        ValueError: If the config format is invalid.
        ValidationError: If the builder config loaded from the config is invalid.
        BuilderConfigurationError: If the config has a seed dataset but no datastore
            settings and Data Designer cannot run locally.
    """
    if isinstance(config, BuilderConfig):
        builder_config = config
    else:
        # Serialize and re-parse before validation; presumably this normalizes the loaded
        # data to plain JSON types -- confirm against `serialize_data`.
        json_config = json.loads(serialize_data(smart_load_yaml(config)))
        builder_config = BuilderConfig.model_validate(json_config)

    # Use dedicated names instead of rebinding the `config` parameter.
    designer_config = builder_config.data_designer
    datastore_settings = builder_config.datastore_settings

    builder = cls(model_configs=designer_config.model_configs)

    for column in designer_config.columns:
        builder.add_column(column)

    for constraint in designer_config.constraints or []:
        builder.add_constraint(constraint=constraint)

    # Processors and profilers are part of the built config (see `build`), so they must be
    # restored as well. Processors are added after the columns so that a drop-columns
    # processor can flag the columns it references.
    for processor in designer_config.processors or []:
        builder.add_processor(processor)

    for profiler in designer_config.profilers or []:
        builder.add_profiler(profiler)

    seed_config = designer_config.seed_config
    if seed_config:
        if datastore_settings is None:
            if not can_run_data_designer_locally():
                raise BuilderConfigurationError("🛑 Datastore settings are required.")
            seed_dataset_reference = LocalSeedDatasetReference(dataset=seed_config.dataset)
        else:
            seed_dataset_reference = DatastoreSeedDatasetReference(
                dataset=seed_config.dataset,
                datastore_settings=datastore_settings,
            )
            builder.set_seed_datastore_settings(datastore_settings)
        builder.with_seed_dataset(
            seed_dataset_reference,
            sampling_strategy=seed_config.sampling_strategy,
            selection_strategy=seed_config.selection_strategy,
        )

    return builder
136
+
137
def __init__(self, model_configs: Optional[Union[list[ModelConfig], str, Path]] = None):
    """Initialize a new DataDesignerConfigBuilder instance.

    Args:
        model_configs: Model configurations. Can be:
            - None to use default model configurations in local mode
            - A list of ModelConfig objects
            - A string or Path to a model configuration file

    Raises:
        BuilderConfigurationError: If no model configurations are given (None, an empty
            list or an empty string) and Data Designer cannot run locally.
    """
    # Truthiness rather than len(): a Path has no len(), so a len()-based emptiness check
    # raises TypeError for a Path argument. None, [] and "" are all falsy; a Path is truthy.
    if not can_run_data_designer_locally() and not model_configs:
        raise BuilderConfigurationError("🛑 Model configurations are required!")

    # Keyed by column name; insertion order is the column order used by `build`.
    self._column_configs: dict[str, ColumnConfigT] = {}
    self._model_configs = load_model_configs(model_configs or get_default_model_configs())
    self._processor_configs: list[ProcessorConfig] = []
    self._seed_config: Optional[SeedConfig] = None
    self._constraints: list[ColumnConstraintT] = []
    self._profilers: list[ColumnProfilerConfigT] = []
    self._datastore_settings: Optional[DatastoreSettings] = None
156
+
157
@property
def model_configs(self) -> list[ModelConfig]:
    """Model configurations held by this builder.

    Returns:
        The builder's list of ModelConfig objects (the internal list itself, not a copy).
    """
    configs = self._model_configs
    return configs
165
+
166
@property
def allowed_references(self) -> list[str]:
    """Get all referenceable variables allowed in prompt templates and expressions.

    This includes all column names and their side effect columns that can be
    referenced in prompt templates and expressions within the configuration.

    Returns:
        A list of unique variable names: the registered column names in insertion
        order, followed by any further names contributed by the columns (their
        `name` and `side_effect_columns`) in the order they are encountered.
    """
    # A dict is used as an insertion-ordered set: no duplicates and a deterministic order.
    references: dict[str, None] = dict.fromkeys(self._column_configs)
    for column in self._column_configs.values():
        references[column.name] = None
        references.update(dict.fromkeys(column.side_effect_columns))
    return list(references)
178
+
179
@property
def info(self) -> ConfigBuilderInfo:
    """Informational helper object for this builder.

    Returns:
        A ConfigBuilderInfo constructed from the builder's model configurations.
    """
    builder_info = ConfigBuilderInfo(model_configs=self._model_configs)
    return builder_info
187
+
188
def add_model_config(self, model_config: ModelConfig) -> Self:
    """Register an additional model configuration with this builder.

    Args:
        model_config: The model configuration to add.

    Returns:
        The current Data Designer config builder instance.

    Raises:
        BuilderConfigurationError: If a model configuration with the same alias
            is already registered.
    """
    known_aliases = [existing.alias for existing in self._model_configs]
    alias_taken = model_config.alias in known_aliases
    if alias_taken:
        raise BuilderConfigurationError(
            f"🛑 Model configuration with alias {model_config.alias} already exists. Please delete the existing model configuration or choose a different alias."
        )

    self._model_configs.append(model_config)
    return self
200
+
201
def delete_model_config(self, alias: str) -> Self:
    """Remove every model configuration registered under the given alias.

    A warning is logged when the removal leaves the builder without any model
    configuration. Deleting an unknown alias is a no-op.

    Args:
        alias: The alias of the model configuration to delete.

    Returns:
        The current Data Designer config builder instance.
    """
    remaining = []
    for model_config in self._model_configs:
        if model_config.alias != alias:
            remaining.append(model_config)
    self._model_configs = remaining

    if len(remaining) == 0:
        logger.warning(
            f"⚠️ No model configurations found after deleting model configuration with alias {alias}. Please add a model configuration before building the configuration."
        )
    return self
213
+
214
def add_column(
    self,
    column_config: Optional[ColumnConfigT] = None,
    *,
    name: Optional[str] = None,
    column_type: Optional[DataDesignerColumnType] = None,
    **kwargs,
) -> Self:
    """Add a column configuration to the builder, replacing any column with the same name.

    Either pass a ready-made column config object, or pass `name` and `column_type`
    together with whatever keyword arguments the column config constructor requires.

    Args:
        column_config: Data Designer column config object to add.
        name: Name of the column to add. Only used when `column_config` is not provided.
        column_type: Column type to add. Only used when `column_config` is not provided.
        **kwargs: Additional keyword arguments to pass to the column constructor.

    Returns:
        The current Data Designer config builder instance.

    Raises:
        BuilderConfigurationError: If neither a config object nor both `name` and
            `column_type` are provided.
        InvalidColumnTypeError: If the resulting object is not one of the supported
            column config types.
    """
    if column_config is None:
        missing_required = name is None or column_type is None
        if missing_required:
            raise BuilderConfigurationError(
                "🛑 You must provide either a 'column_config' object or 'name' *and* 'column_type' "
                f"with additional keyword arguments. You provided {column_config=}, {name=}, and {column_type=}."
            )
        column_config = get_column_config_from_kwargs(name=name, column_type=column_type, **kwargs)

    # ColumnConfigT is a union alias; its members are the accepted config classes.
    supported_types = ColumnConfigT.__args__
    is_supported = False
    for supported_type in supported_types:
        if isinstance(column_config, supported_type):
            is_supported = True
            break
    if not is_supported:
        raise InvalidColumnTypeError(
            f"🛑 Invalid column config object: '{column_config}'. Valid column config options are: "
            f"{', '.join([t.__name__ for t in supported_types])}"
        )

    self._column_configs[column_config.name] = column_config
    return self
253
+
254
def add_constraint(
    self,
    constraint: Optional[ColumnConstraintT] = None,
    *,
    constraint_type: Optional[ConstraintType] = None,
    **kwargs,
) -> Self:
    """Add a constraint to the current Data Designer configuration.

    Currently, constraints are only supported for numerical samplers.

    You can either provide a constraint object directly, or provide a constraint type and
    additional keyword arguments to construct the constraint object. Valid constraint types are:
        - "scalar_inequality": Constraint between a column and a scalar value.
        - "column_inequality": Constraint between two columns.

    Args:
        constraint: Constraint object to add.
        constraint_type: Constraint type to add. Ignored when `constraint` is provided.
        **kwargs: Additional keyword arguments to pass to the constraint constructor.

    Returns:
        The current Data Designer config builder instance.

    Raises:
        BuilderConfigurationError: If neither `constraint` nor `constraint_type` is given,
            if `constraint_type` is not a valid ConstraintType, or if the resulting object
            is not one of the supported constraint types.
    """
    if constraint is None:
        if constraint_type is None:
            raise BuilderConfigurationError(
                "🛑 You must provide either a 'constraint' object or 'constraint_type' "
                "with additional keyword arguments."
            )
        try:
            constraint_type = ConstraintType(constraint_type)
        except Exception as err:
            # Chain explicitly so the traceback shows the enum lookup failure as the cause.
            raise BuilderConfigurationError(
                f"🛑 Invalid constraint type: {constraint_type}. Valid options are: "
                f"{', '.join([t.value for t in ConstraintType])}"
            ) from err
        if constraint_type == ConstraintType.SCALAR_INEQUALITY:
            constraint = ScalarInequalityConstraint(**kwargs)
        elif constraint_type == ConstraintType.COLUMN_INEQUALITY:
            constraint = ColumnInequalityConstraint(**kwargs)
        # Any other enum member leaves `constraint` as None and is rejected by the check below.

    allowed_constraint_types = ColumnConstraintT.__args__
    if not any(isinstance(constraint, t) for t in allowed_constraint_types):
        raise BuilderConfigurationError(
            "🛑 Invalid constraint object. Valid constraint options are: "
            f"{', '.join([t.__name__ for t in allowed_constraint_types])}"
        )

    self._constraints.append(constraint)
    return self
305
+
306
def add_processor(
    self,
    processor_config: Optional[ProcessorConfig] = None,
    *,
    processor_type: Optional[ProcessorType] = None,
    **kwargs,
) -> Self:
    """Register a processor with the builder.

    Pass either a processor config object, or a processor type plus the keyword
    arguments needed to construct the corresponding processor config.

    Args:
        processor_config: The processor configuration object to add.
        processor_type: The type of processor to add.
        **kwargs: Additional keyword arguments to pass to the processor constructor.

    Returns:
        The current Data Designer config builder instance.

    Raises:
        BuilderConfigurationError: If neither `processor_config` nor `processor_type`
            is provided.
    """
    if processor_config is None and processor_type is None:
        raise BuilderConfigurationError(
            "🛑 You must provide either a 'processor_config' object or 'processor_type' "
            "with additional keyword arguments."
        )
    if processor_config is None:
        processor_config = get_processor_config_from_kwargs(processor_type=processor_type, **kwargs)

    # Checks elsewhere fail if a drop-columns processor drops a column that is not itself
    # marked for drop, so flag every already-registered column the processor names.
    is_drop_columns = processor_config.processor_type == ProcessorType.DROP_COLUMNS
    if is_drop_columns:
        for column_name in processor_config.column_names:
            if column_name not in self._column_configs:
                continue
            self._column_configs[column_name].drop = True

    self._processor_configs.append(processor_config)
    return self
342
+
343
def add_profiler(self, profiler_config: ColumnProfilerConfigT) -> Self:
    """Register a column profiler with the builder.

    Args:
        profiler_config: The profiler configuration object to add.

    Returns:
        The current Data Designer config builder instance.

    Raises:
        BuilderConfigurationError: If the profiler configuration is of an invalid type.
    """
    if isinstance(profiler_config, ColumnProfilerConfigT):
        self._profilers.append(profiler_config)
        return self

    # ColumnProfilerConfigT may be a union alias (has `__args__`) or a single class.
    if hasattr(ColumnProfilerConfigT, "__args__"):
        type_names = [t.__name__ for t in ColumnProfilerConfigT.__args__]
        valid_options = ", ".join(type_names)
    else:
        valid_options = ColumnProfilerConfigT.__name__
    raise BuilderConfigurationError(f"🛑 Invalid profiler object. Valid profiler options are: {valid_options}")
363
+
364
def get_profilers(self) -> list[ColumnProfilerConfigT]:
    """Return the profilers registered on this builder.

    Returns:
        The builder's list of profiler configuration objects (the internal list, not a copy).
    """
    profilers = self._profilers
    return profilers
371
+
372
def build(self, *, skip_validation: bool = False, raise_exceptions: bool = False) -> DataDesignerConfig:
    """Assemble a DataDesignerConfig from the builder's current state.

    Args:
        skip_validation: Whether to skip validation of the configuration.
        raise_exceptions: Whether to raise an exception if the configuration is invalid.

    Returns:
        A DataDesignerConfig built from the current model configs, seed config, columns,
        constraints, profilers and processors.
    """
    if not skip_validation:
        self.validate(raise_exceptions=raise_exceptions)

    # Empty constraint/profiler/processor lists are passed on as None.
    config_fields = {
        "model_configs": self._model_configs,
        "seed_config": self._seed_config,
        "columns": [*self._column_configs.values()],
        "constraints": self._constraints or None,
        "profilers": self._profilers or None,
        "processors": self._processor_configs or None,
    }
    return DataDesignerConfig(**config_fields)
393
+
394
def delete_constraints(self, target_column: str) -> Self:
    """Drop every constraint whose target is the given column.

    Args:
        target_column: Name of the column to remove constraints for.

    Returns:
        The current Data Designer config builder instance.
    """
    kept = []
    for constraint in self._constraints:
        if constraint.target_column != target_column:
            kept.append(constraint)
    self._constraints = kept
    return self
405
+
406
def delete_column(self, column_name: str) -> Self:
    """Remove the column with the given name, if it exists.

    Deleting a name that is not registered is a no-op.

    Args:
        column_name: Name of the column to delete.

    Returns:
        The current Data Designer config builder instance.

    Raises:
        BuilderConfigurationError: If trying to delete a seed dataset column.
    """
    existing = self._column_configs.get(column_name)
    if isinstance(existing, SeedDatasetColumnConfig):
        raise BuilderConfigurationError("Seed columns cannot be deleted. Please update the seed dataset instead.")

    self._column_configs.pop(column_name, None)
    return self
422
+
423
def get_column_config(self, name: str) -> ColumnConfigT:
    """Look up a single column configuration by its name.

    Args:
        name: Name of the column to retrieve the config for.

    Returns:
        The column configuration object.

    Raises:
        KeyError: If no column with the given name exists.
    """
    column_configs = self._column_configs
    return column_configs[name]
436
+
437
def get_column_configs(self) -> list[ColumnConfigT]:
    """Return every column configuration registered on this builder.

    Returns:
        A new list with all column configuration objects, in insertion order.
    """
    return [*self._column_configs.values()]
444
+
445
def get_constraints(self, target_column: str) -> list[ColumnConstraintT]:
    """Collect the constraints that target the given column.

    Args:
        target_column: Name of the column to get constraints for.

    Returns:
        A list of constraint objects targeting the specified column.
    """
    matching = []
    for constraint in self._constraints:
        if constraint.target_column == target_column:
            matching.append(constraint)
    return matching
455
+
456
def get_llm_gen_columns(self) -> list[ColumnConfigT]:
    """Return the column configurations whose column type is LLM-generated.

    Returns:
        A list of column configurations that use LLM generation.
    """
    llm_columns = []
    for column in self._column_configs.values():
        if column_type_is_llm_generated(column.column_type):
            llm_columns.append(column)
    return llm_columns
463
+
464
def get_columns_of_type(self, column_type: DataDesignerColumnType) -> list[ColumnConfigT]:
    """Return the column configurations whose type equals the given type.

    Args:
        column_type: The type of columns to filter by.

    Returns:
        A list of column configurations matching the specified type.
    """
    # Resolve the argument to a DataDesignerColumnType member before comparing.
    wanted_type = resolve_string_enum(column_type, DataDesignerColumnType)
    matches = []
    for column in self._column_configs.values():
        if column.column_type == wanted_type:
            matches.append(column)
    return matches
475
+
476
def get_columns_excluding_type(self, column_type: DataDesignerColumnType) -> list[ColumnConfigT]:
    """Return the column configurations whose type differs from the given type.

    Args:
        column_type: The type of columns to exclude.

    Returns:
        A list of column configurations that do not match the specified type.
    """
    # Resolve the argument to a DataDesignerColumnType member before comparing.
    excluded_type = resolve_string_enum(column_type, DataDesignerColumnType)
    others = []
    for column in self._column_configs.values():
        if column.column_type != excluded_type:
            others.append(column)
    return others
487
+
488
def get_processor_configs(self) -> list[ProcessorConfig]:
    """Get processor configuration objects.

    Returns:
        The list of processor configuration objects added to this builder, in the
        order they were added (the internal list itself, not a copy).
    """
    # The builder stores processors as a flat list, not as a mapping keyed by build stage,
    # so the return annotation is a list.
    return self._processor_configs
495
+
496
def get_seed_config(self) -> Optional[SeedConfig]:
    """Return the seed config currently set on this builder.

    Returns:
        The seed config if configured, None otherwise.
    """
    seed_config = self._seed_config
    return seed_config
503
+
504
def get_seed_datastore_settings(self) -> Optional[DatastoreSettings]:
    """Return the most recently set datastore settings for this builder.

    Returns:
        The stored settings passed through `DatastoreSettings.model_validate` if any
        are configured, None otherwise.
    """
    stored_settings = self._datastore_settings
    if not stored_settings:
        return None
    return DatastoreSettings.model_validate(stored_settings)
511
+
512
def num_columns_of_type(self, column_type: DataDesignerColumnType) -> int:
    """Count the columns of the given type.

    Args:
        column_type: The type of columns to count.

    Returns:
        The number of columns matching the specified type.
    """
    matching_columns = self.get_columns_of_type(column_type)
    return len(matching_columns)
522
+
523
def set_seed_datastore_settings(self, datastore_settings: Optional[DatastoreSettings]) -> Self:
    """Store the datastore settings to use for the seed dataset.

    Any previously stored settings are overwritten; passing None clears them.

    Args:
        datastore_settings: The datastore settings to use for the seed dataset.

    Returns:
        The current Data Designer config builder instance.
    """
    setattr(self, "_datastore_settings", datastore_settings)
    return self
531
+
532
def validate(self, *, raise_exceptions: bool = False) -> Self:
    """Check the builder's current configuration and report any violations.

    All violations are printed. A success message is logged only when there are
    no violations at all.

    Args:
        raise_exceptions: Whether to raise an exception if the configuration is invalid.

    Returns:
        The current Data Designer config builder instance.

    Raises:
        InvalidConfigError: If the configuration is invalid and raise_exceptions is True.
    """
    current_columns = [*self._column_configs.values()]
    violations = validate_data_designer_config(
        columns=current_columns,
        processor_configs=self._processor_configs,
        allowed_references=self.allowed_references,
    )
    rich_print_violations(violations)

    # Only ERROR-level violations abort, and only when the caller asked for exceptions.
    if raise_exceptions and any(v.level == ViolationLevel.ERROR for v in violations):
        raise InvalidConfigError(
            "🛑 Your configuration contains validation errors. Please address the indicated issues and try again."
        )

    if len(violations) == 0:
        logger.info("✅ Validation passed")
    return self
558
+
559
def with_seed_dataset(
    self,
    dataset_reference: SeedDatasetReference,
    *,
    sampling_strategy: SamplingStrategy = SamplingStrategy.ORDERED,
    selection_strategy: Optional[Union[IndexRange, PartitionBlock]] = None,
) -> Self:
    """Add a seed dataset to the current Data Designer configuration.

    This method sets the seed dataset for the configuration and automatically creates
    SeedDatasetColumnConfig objects for each column found in the dataset. The column
    names are fetched from the dataset source (Hugging Face Hub or NeMo Microservices Datastore).

    Args:
        dataset_reference: Seed dataset reference for fetching from the datastore.
        sampling_strategy: The sampling strategy to use when generating data from the seed dataset.
            Defaults to ORDERED sampling.
        selection_strategy: Optional index range or partition block, passed through
            unchanged to the seed config. Defaults to None.

    Returns:
        The current Data Designer config builder instance.
    """
    self._seed_config = SeedConfig(
        dataset=dataset_reference.dataset,
        sampling_strategy=sampling_strategy,
        selection_strategy=selection_strategy,
    )

    # Not every reference carries datastore settings; fall back to None
    # (which clears any previously stored settings) when the attribute is absent.
    self.set_seed_datastore_settings(getattr(dataset_reference, "datastore_settings", None))

    # Register one seed column per column name; an existing column config
    # with the same name is replaced.
    seed_column_names = fetch_seed_dataset_column_names(dataset_reference)
    for seed_column_name in seed_column_names:
        self._column_configs[seed_column_name] = SeedDatasetColumnConfig(name=seed_column_name)
    return self
591
+
592
def write_config(self, path: Union[str, Path], indent: Optional[int] = 2, **kwargs) -> None:
    """Write the current configuration to a file.

    The output format is chosen from the file extension of ``path``. The
    extension is matched case-insensitively, so ``config.YAML`` and
    ``config.Json`` are accepted as well as their lowercase forms.

    Args:
        path: Path to the file to write the configuration to.
        indent: Indentation level for the output file (default: 2).
        **kwargs: Additional keyword arguments passed to the serialization methods used.

    Raises:
        BuilderConfigurationError: If the file format is unsupported.
    """
    cfg = self.get_builder_config()
    suffix = Path(path).suffix
    # Normalise only for dispatch; the caller's original path is what gets written.
    normalized_suffix = suffix.lower()
    if normalized_suffix in {".yaml", ".yml"}:
        cfg.to_yaml(path, indent=indent, **kwargs)
    elif normalized_suffix == ".json":
        cfg.to_json(path, indent=indent, **kwargs)
    else:
        raise BuilderConfigurationError(f"🛑 Unsupported file type: {suffix}. Must be `.yaml`, `.yml` or `.json`.")
611
+
612
def get_builder_config(self) -> BuilderConfig:
    """Assemble the builder config for the current Data Designer configuration.

    Returns:
        A ``BuilderConfig`` combining the result of ``self.build()`` with the
        datastore settings currently stored on the builder.
    """
    data_designer_config = self.build()
    return BuilderConfig(
        data_designer=data_designer_config,
        datastore_settings=self._datastore_settings,
    )
619
+
620
def __repr__(self) -> str:
    """Build a string representation of the DataDesignerConfigBuilder instance.

    Returns:
        ``ClassName()`` when no columns are configured; otherwise a multi-line
        string listing the seed dataset (if any) and the column names grouped
        by column type, in the order given by ``get_column_display_order()``.
    """
    class_name = self.__class__.__name__
    if not self._column_configs:
        return f"{class_name}()"

    seed_config = self._seed_config
    props_to_repr = {
        "seed_dataset": f"'{seed_config.dataset}'" if seed_config is not None else None,
    }

    for column_type in get_column_display_order():
        typed_columns = self.get_columns_of_type(column_type)
        if not typed_columns:
            continue
        label = f"{kebab_to_snake(column_type.value)}_columns"
        props_to_repr[label] = json_indent_list_of_strings([col.name for col in typed_columns], indent=8)

    # NOTE(review): the padding inside the string literals below is reproduced
    # exactly as it appears in this view; the renderer may have collapsed
    # runs of spaces — confirm against the packaged file.
    pieces = [f"{class_name}(\n"]
    for key, value in props_to_repr.items():
        if value is None:
            continue
        if "[" in value:
            # List-valued entries: pad before the final character (the closing bracket).
            value = f"{value[:-1]}" + " " + value[-1]
        pieces.append(f" {key}: {value}\n")
    pieces.append(")")
    return "".join(pieces)
646
+
647
def _repr_html_(self) -> str:
    """Return an HTML representation of the DataDesignerConfigBuilder instance.

    The builder's ``__repr__`` output is highlighted with the Python lexer and
    embedded, together with the formatter's CSS, into ``REPR_HTML_TEMPLATE``.

    Returns:
        HTML string with syntax highlighting for the builder representation.
    """
    plain_repr = self.__repr__()
    html_formatter = HtmlFormatter(style=DEFAULT_REPR_HTML_STYLE, cssclass="code")
    highlighted_html = highlight(plain_repr, PythonLexer(), html_formatter)
    # The style definitions are scoped to the same ".code" class set on the formatter.
    return REPR_HTML_TEMPLATE.format(
        css=html_formatter.get_style_defs(".code"),
        highlighted_html=highlighted_html,
    )