data-designer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. data_designer/__init__.py +15 -0
  2. data_designer/_version.py +34 -0
  3. data_designer/cli/README.md +236 -0
  4. data_designer/cli/__init__.py +6 -0
  5. data_designer/cli/commands/__init__.py +2 -0
  6. data_designer/cli/commands/list.py +130 -0
  7. data_designer/cli/commands/models.py +10 -0
  8. data_designer/cli/commands/providers.py +11 -0
  9. data_designer/cli/commands/reset.py +100 -0
  10. data_designer/cli/controllers/__init__.py +7 -0
  11. data_designer/cli/controllers/model_controller.py +246 -0
  12. data_designer/cli/controllers/provider_controller.py +317 -0
  13. data_designer/cli/forms/__init__.py +20 -0
  14. data_designer/cli/forms/builder.py +51 -0
  15. data_designer/cli/forms/field.py +180 -0
  16. data_designer/cli/forms/form.py +59 -0
  17. data_designer/cli/forms/model_builder.py +125 -0
  18. data_designer/cli/forms/provider_builder.py +76 -0
  19. data_designer/cli/main.py +44 -0
  20. data_designer/cli/repositories/__init__.py +8 -0
  21. data_designer/cli/repositories/base.py +39 -0
  22. data_designer/cli/repositories/model_repository.py +42 -0
  23. data_designer/cli/repositories/provider_repository.py +43 -0
  24. data_designer/cli/services/__init__.py +7 -0
  25. data_designer/cli/services/model_service.py +116 -0
  26. data_designer/cli/services/provider_service.py +111 -0
  27. data_designer/cli/ui.py +448 -0
  28. data_designer/cli/utils.py +47 -0
  29. data_designer/config/__init__.py +2 -0
  30. data_designer/config/analysis/column_profilers.py +89 -0
  31. data_designer/config/analysis/column_statistics.py +274 -0
  32. data_designer/config/analysis/dataset_profiler.py +60 -0
  33. data_designer/config/analysis/utils/errors.py +8 -0
  34. data_designer/config/analysis/utils/reporting.py +188 -0
  35. data_designer/config/base.py +68 -0
  36. data_designer/config/column_configs.py +354 -0
  37. data_designer/config/column_types.py +168 -0
  38. data_designer/config/config_builder.py +660 -0
  39. data_designer/config/data_designer_config.py +40 -0
  40. data_designer/config/dataset_builders.py +11 -0
  41. data_designer/config/datastore.py +151 -0
  42. data_designer/config/default_model_settings.py +123 -0
  43. data_designer/config/errors.py +19 -0
  44. data_designer/config/interface.py +54 -0
  45. data_designer/config/models.py +231 -0
  46. data_designer/config/preview_results.py +32 -0
  47. data_designer/config/processors.py +41 -0
  48. data_designer/config/sampler_constraints.py +51 -0
  49. data_designer/config/sampler_params.py +604 -0
  50. data_designer/config/seed.py +145 -0
  51. data_designer/config/utils/code_lang.py +83 -0
  52. data_designer/config/utils/constants.py +313 -0
  53. data_designer/config/utils/errors.py +19 -0
  54. data_designer/config/utils/info.py +88 -0
  55. data_designer/config/utils/io_helpers.py +273 -0
  56. data_designer/config/utils/misc.py +81 -0
  57. data_designer/config/utils/numerical_helpers.py +28 -0
  58. data_designer/config/utils/type_helpers.py +100 -0
  59. data_designer/config/utils/validation.py +336 -0
  60. data_designer/config/utils/visualization.py +427 -0
  61. data_designer/config/validator_params.py +96 -0
  62. data_designer/engine/__init__.py +2 -0
  63. data_designer/engine/analysis/column_profilers/base.py +55 -0
  64. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +160 -0
  65. data_designer/engine/analysis/column_profilers/registry.py +20 -0
  66. data_designer/engine/analysis/column_statistics.py +142 -0
  67. data_designer/engine/analysis/dataset_profiler.py +125 -0
  68. data_designer/engine/analysis/errors.py +7 -0
  69. data_designer/engine/analysis/utils/column_statistics_calculations.py +209 -0
  70. data_designer/engine/analysis/utils/judge_score_processing.py +128 -0
  71. data_designer/engine/column_generators/__init__.py +2 -0
  72. data_designer/engine/column_generators/generators/__init__.py +2 -0
  73. data_designer/engine/column_generators/generators/base.py +61 -0
  74. data_designer/engine/column_generators/generators/expression.py +63 -0
  75. data_designer/engine/column_generators/generators/llm_generators.py +172 -0
  76. data_designer/engine/column_generators/generators/samplers.py +75 -0
  77. data_designer/engine/column_generators/generators/seed_dataset.py +149 -0
  78. data_designer/engine/column_generators/generators/validation.py +147 -0
  79. data_designer/engine/column_generators/registry.py +56 -0
  80. data_designer/engine/column_generators/utils/errors.py +13 -0
  81. data_designer/engine/column_generators/utils/judge_score_factory.py +57 -0
  82. data_designer/engine/column_generators/utils/prompt_renderer.py +98 -0
  83. data_designer/engine/configurable_task.py +82 -0
  84. data_designer/engine/dataset_builders/artifact_storage.py +181 -0
  85. data_designer/engine/dataset_builders/column_wise_builder.py +287 -0
  86. data_designer/engine/dataset_builders/errors.py +13 -0
  87. data_designer/engine/dataset_builders/multi_column_configs.py +44 -0
  88. data_designer/engine/dataset_builders/utils/__init__.py +2 -0
  89. data_designer/engine/dataset_builders/utils/concurrency.py +184 -0
  90. data_designer/engine/dataset_builders/utils/config_compiler.py +60 -0
  91. data_designer/engine/dataset_builders/utils/dag.py +56 -0
  92. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +190 -0
  93. data_designer/engine/dataset_builders/utils/errors.py +13 -0
  94. data_designer/engine/errors.py +49 -0
  95. data_designer/engine/model_provider.py +75 -0
  96. data_designer/engine/models/__init__.py +2 -0
  97. data_designer/engine/models/errors.py +308 -0
  98. data_designer/engine/models/facade.py +225 -0
  99. data_designer/engine/models/litellm_overrides.py +162 -0
  100. data_designer/engine/models/parsers/__init__.py +2 -0
  101. data_designer/engine/models/parsers/errors.py +34 -0
  102. data_designer/engine/models/parsers/parser.py +236 -0
  103. data_designer/engine/models/parsers/postprocessors.py +93 -0
  104. data_designer/engine/models/parsers/tag_parsers.py +60 -0
  105. data_designer/engine/models/parsers/types.py +82 -0
  106. data_designer/engine/models/recipes/base.py +79 -0
  107. data_designer/engine/models/recipes/response_recipes.py +291 -0
  108. data_designer/engine/models/registry.py +118 -0
  109. data_designer/engine/models/usage.py +75 -0
  110. data_designer/engine/models/utils.py +38 -0
  111. data_designer/engine/processing/ginja/__init__.py +2 -0
  112. data_designer/engine/processing/ginja/ast.py +64 -0
  113. data_designer/engine/processing/ginja/environment.py +461 -0
  114. data_designer/engine/processing/ginja/exceptions.py +54 -0
  115. data_designer/engine/processing/ginja/record.py +30 -0
  116. data_designer/engine/processing/gsonschema/__init__.py +2 -0
  117. data_designer/engine/processing/gsonschema/exceptions.py +8 -0
  118. data_designer/engine/processing/gsonschema/schema_transformers.py +81 -0
  119. data_designer/engine/processing/gsonschema/types.py +8 -0
  120. data_designer/engine/processing/gsonschema/validators.py +143 -0
  121. data_designer/engine/processing/processors/base.py +15 -0
  122. data_designer/engine/processing/processors/drop_columns.py +46 -0
  123. data_designer/engine/processing/processors/registry.py +20 -0
  124. data_designer/engine/processing/utils.py +120 -0
  125. data_designer/engine/registry/base.py +97 -0
  126. data_designer/engine/registry/data_designer_registry.py +37 -0
  127. data_designer/engine/registry/errors.py +10 -0
  128. data_designer/engine/resources/managed_dataset_generator.py +35 -0
  129. data_designer/engine/resources/managed_dataset_repository.py +194 -0
  130. data_designer/engine/resources/managed_storage.py +63 -0
  131. data_designer/engine/resources/resource_provider.py +46 -0
  132. data_designer/engine/resources/seed_dataset_data_store.py +66 -0
  133. data_designer/engine/sampling_gen/column.py +89 -0
  134. data_designer/engine/sampling_gen/constraints.py +95 -0
  135. data_designer/engine/sampling_gen/data_sources/base.py +214 -0
  136. data_designer/engine/sampling_gen/data_sources/errors.py +10 -0
  137. data_designer/engine/sampling_gen/data_sources/sources.py +342 -0
  138. data_designer/engine/sampling_gen/entities/__init__.py +2 -0
  139. data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
  140. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +64 -0
  141. data_designer/engine/sampling_gen/entities/email_address_utils.py +169 -0
  142. data_designer/engine/sampling_gen/entities/errors.py +8 -0
  143. data_designer/engine/sampling_gen/entities/national_id_utils.py +100 -0
  144. data_designer/engine/sampling_gen/entities/person.py +142 -0
  145. data_designer/engine/sampling_gen/entities/phone_number.py +122 -0
  146. data_designer/engine/sampling_gen/errors.py +24 -0
  147. data_designer/engine/sampling_gen/generator.py +121 -0
  148. data_designer/engine/sampling_gen/jinja_utils.py +60 -0
  149. data_designer/engine/sampling_gen/people_gen.py +203 -0
  150. data_designer/engine/sampling_gen/person_constants.py +54 -0
  151. data_designer/engine/sampling_gen/schema.py +143 -0
  152. data_designer/engine/sampling_gen/schema_builder.py +59 -0
  153. data_designer/engine/sampling_gen/utils.py +40 -0
  154. data_designer/engine/secret_resolver.py +80 -0
  155. data_designer/engine/validators/__init__.py +17 -0
  156. data_designer/engine/validators/base.py +36 -0
  157. data_designer/engine/validators/local_callable.py +34 -0
  158. data_designer/engine/validators/python.py +245 -0
  159. data_designer/engine/validators/remote.py +83 -0
  160. data_designer/engine/validators/sql.py +60 -0
  161. data_designer/errors.py +5 -0
  162. data_designer/essentials/__init__.py +137 -0
  163. data_designer/interface/__init__.py +2 -0
  164. data_designer/interface/data_designer.py +351 -0
  165. data_designer/interface/errors.py +16 -0
  166. data_designer/interface/results.py +55 -0
  167. data_designer/logging.py +161 -0
  168. data_designer/plugin_manager.py +83 -0
  169. data_designer/plugins/__init__.py +6 -0
  170. data_designer/plugins/errors.py +10 -0
  171. data_designer/plugins/plugin.py +69 -0
  172. data_designer/plugins/registry.py +86 -0
  173. data_designer-0.1.0.dist-info/METADATA +173 -0
  174. data_designer-0.1.0.dist-info/RECORD +177 -0
  175. data_designer-0.1.0.dist-info/WHEEL +4 -0
  176. data_designer-0.1.0.dist-info/entry_points.txt +2 -0
  177. data_designer-0.1.0.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,351 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import logging
5
+ from pathlib import Path
6
+
7
+ import pandas as pd
8
+
9
+ from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
10
+ from data_designer.config.config_builder import DataDesignerConfigBuilder
11
+ from data_designer.config.default_model_settings import (
12
+ get_default_model_configs,
13
+ get_default_provider_name,
14
+ get_default_providers,
15
+ resolve_seed_default_model_settings,
16
+ )
17
+ from data_designer.config.interface import DataDesignerInterface
18
+ from data_designer.config.models import (
19
+ ModelConfig,
20
+ ModelProvider,
21
+ )
22
+ from data_designer.config.preview_results import PreviewResults
23
+ from data_designer.config.seed import LocalSeedDatasetReference
24
+ from data_designer.config.utils.constants import (
25
+ DEFAULT_NUM_RECORDS,
26
+ MANAGED_ASSETS_PATH,
27
+ MODEL_CONFIGS_FILE_PATH,
28
+ MODEL_PROVIDERS_FILE_PATH,
29
+ )
30
+ from data_designer.config.utils.info import InterfaceInfo
31
+ from data_designer.config.utils.io_helpers import write_seed_dataset
32
+ from data_designer.config.utils.misc import can_run_data_designer_locally
33
+ from data_designer.engine.analysis.dataset_profiler import (
34
+ DataDesignerDatasetProfiler,
35
+ DatasetProfilerConfig,
36
+ )
37
+ from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage
38
+ from data_designer.engine.dataset_builders.column_wise_builder import ColumnWiseDatasetBuilder
39
+ from data_designer.engine.dataset_builders.utils.config_compiler import compile_dataset_builder_column_configs
40
+ from data_designer.engine.model_provider import resolve_model_provider_registry
41
+ from data_designer.engine.models.registry import create_model_registry
42
+ from data_designer.engine.resources.managed_storage import init_managed_blob_storage
43
+ from data_designer.engine.resources.resource_provider import ResourceProvider
44
+ from data_designer.engine.resources.seed_dataset_data_store import (
45
+ HfHubSeedDatasetDataStore,
46
+ LocalSeedDatasetDataStore,
47
+ )
48
+ from data_designer.engine.secret_resolver import (
49
+ CompositeResolver,
50
+ EnvironmentResolver,
51
+ PlaintextResolver,
52
+ SecretResolver,
53
+ )
54
+ from data_designer.interface.errors import (
55
+ DataDesignerGenerationError,
56
+ DataDesignerProfilingError,
57
+ InvalidBufferValueError,
58
+ )
59
+ from data_designer.interface.results import DatasetCreationResults
60
+ from data_designer.logging import RandomEmoji
61
+
62
# Initial buffer size given to every DataDesigner instance; it can be changed per
# instance through `DataDesigner.set_buffer_size`.
DEFAULT_BUFFER_SIZE = 1000

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


# Resolve default model settings on import to ensure they are available when the library is used.
# The call is skipped entirely when `can_run_data_designer_locally()` returns a falsy value.
if can_run_data_designer_locally():
    resolve_seed_default_model_settings()
70
+
71
+
72
class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
    """Main interface for creating datasets with Data Designer.

    This class provides the primary interface for building synthetic datasets using
    Data Designer configurations. It manages model providers, artifact storage, and
    orchestrates the dataset creation and profiling processes.

    Args:
        artifact_path: Path where generated artifacts will be stored. If None, an
            "artifacts" directory under the current working directory is used.
        model_providers: Optional list of model providers for LLM generation. If None
            (or empty), uses the default providers.
        secret_resolver: Resolver for handling secrets and credentials. If None, a
            CompositeResolver wrapping an EnvironmentResolver and a PlaintextResolver
            is used.
        managed_assets_path: Path to the managed assets directory. This is used to point
            to the location of managed datasets and other assets used during dataset generation.
            If not provided, falls back to `MANAGED_ASSETS_PATH` from
            `data_designer.config.utils.constants`.
            NOTE(review): that constant is presumably where the
            DATA_DESIGNER_MANAGED_ASSETS_PATH environment variable is honored - confirm there.
    """

    def __init__(
        self,
        artifact_path: Path | str | None = None,
        *,
        model_providers: list[ModelProvider] | None = None,
        secret_resolver: SecretResolver | None = None,
        managed_assets_path: Path | str | None = None,
    ):
        self._secret_resolver = secret_resolver or CompositeResolver([EnvironmentResolver(), PlaintextResolver()])
        self._artifact_path = Path(artifact_path) if artifact_path is not None else Path.cwd() / "artifacts"
        self._buffer_size = DEFAULT_BUFFER_SIZE
        self._managed_assets_path = Path(managed_assets_path or MANAGED_ASSETS_PATH)
        self._model_providers = model_providers or self.get_default_model_providers()
        self._model_provider_registry = resolve_model_provider_registry(
            self._model_providers, get_default_provider_name()
        )

    @staticmethod
    def make_seed_reference_from_file(file_path: str | Path) -> LocalSeedDatasetReference:
        """Create a seed dataset reference from an existing file.

        Supported file extensions: .parquet (recommended), .csv, .json, .jsonl

        Args:
            file_path: Path to an existing dataset file.

        Returns:
            A LocalSeedDatasetReference pointing to the specified file.
        """
        return LocalSeedDatasetReference(dataset=str(file_path))

    @classmethod
    def make_seed_reference_from_dataframe(
        cls, dataframe: pd.DataFrame, file_path: str | Path
    ) -> LocalSeedDatasetReference:
        """Create a seed dataset reference from a pandas DataFrame.

        This method writes the DataFrame to disk and returns a reference that can
        be passed to the config builder's `with_seed_dataset` method. If the file
        already exists, it will be overwritten.

        Supported file extensions: .parquet (recommended), .csv, .json, .jsonl

        Args:
            dataframe: Pandas DataFrame to use as seed data.
            file_path: Path where to save dataset.

        Returns:
            A LocalSeedDatasetReference pointing to the written file.
        """
        write_seed_dataset(dataframe, Path(file_path))
        return cls.make_seed_reference_from_file(file_path)

    @property
    def info(self) -> InterfaceInfo:
        """Get information about the Data Designer interface.

        Returns:
            InterfaceInfo object with information about the Data Designer interface.
        """
        return InterfaceInfo(model_providers=self._model_providers)

    def create(
        self,
        config_builder: DataDesignerConfigBuilder,
        *,
        num_records: int = DEFAULT_NUM_RECORDS,
        dataset_name: str = "dataset",
    ) -> DatasetCreationResults:
        """Create dataset and save results to the local artifact storage.

        This method orchestrates the full dataset creation pipeline including building
        the dataset according to the configuration, profiling the generated data, and
        storing artifacts.

        Args:
            config_builder: The DataDesignerConfigBuilder containing the dataset
                configuration (columns, constraints, seed data, etc.).
            num_records: Number of records to generate.
            dataset_name: Name of the dataset. This name will be used as the dataset
                folder name in the artifact path directory.

        Returns:
            DatasetCreationResults object with methods for loading the generated dataset,
            analysis results, and displaying sample records for inspection.

        Raises:
            DataDesignerGenerationError: If an error occurs during dataset generation.
                The original exception is attached as `__cause__`.
            DataDesignerProfilingError: If an error occurs during dataset profiling.
                The original exception is attached as `__cause__`.
        """
        logger.info("🎨 Creating Data Designer dataset")

        resource_provider = self._create_resource_provider(dataset_name, config_builder)

        builder = self._create_dataset_builder(config_builder, resource_provider)

        try:
            builder.build(num_records=num_records, buffer_size=self._buffer_size)
        except Exception as e:
            # Chain explicitly so the underlying failure stays visible in tracebacks.
            raise DataDesignerGenerationError(f"🛑 Error generating dataset: {e}") from e

        try:
            profiler = self._create_dataset_profiler(config_builder, resource_provider)
            analysis = profiler.profile_dataset(
                num_records,
                builder.artifact_storage.load_dataset_with_dropped_columns(),
            )
        except Exception as e:
            raise DataDesignerProfilingError(f"🛑 Error profiling dataset: {e}") from e

        return DatasetCreationResults(
            artifact_storage=builder.artifact_storage,
            analysis=analysis,
            config_builder=config_builder,
        )

    def preview(
        self, config_builder: DataDesignerConfigBuilder, *, num_records: int = DEFAULT_NUM_RECORDS
    ) -> PreviewResults:
        """Generate preview dataset for fast iteration on your Data Designer configuration.

        All preview results are stored in memory. Once you are satisfied with the preview,
        use the `create` method to generate data at a larger scale and save results to disk.

        Args:
            config_builder: The DataDesignerConfigBuilder containing the dataset
                configuration (columns, constraints, seed data, etc.).
            num_records: Number of records to generate.

        Returns:
            PreviewResults object with methods for inspecting the results.

        Raises:
            DataDesignerGenerationError: If an error occurs during preview dataset generation.
                The original exception is attached as `__cause__`.
            DataDesignerProfilingError: If an error occurs during preview dataset profiling.
                The original exception is attached as `__cause__`.
        """
        logger.info(f"{RandomEmoji.previewing()} Preview generation in progress")

        resource_provider = self._create_resource_provider("preview-dataset", config_builder)
        builder = self._create_dataset_builder(config_builder, resource_provider)

        try:
            raw_dataset = builder.build_preview(num_records=num_records)
            processed_dataset = builder.process_preview(raw_dataset)
        except Exception as e:
            raise DataDesignerGenerationError(f"🛑 Error generating preview dataset: {e}") from e

        # Columns present in the raw dataset but missing after processing are
        # re-attached so the profiler receives them alongside the processed columns.
        dropped_columns = raw_dataset.columns.difference(processed_dataset.columns)
        if len(dropped_columns) > 0:
            dataset_for_profiler = pd.concat([processed_dataset, raw_dataset[dropped_columns]], axis=1)
        else:
            dataset_for_profiler = processed_dataset

        try:
            profiler = self._create_dataset_profiler(config_builder, resource_provider)
            analysis = profiler.profile_dataset(num_records, dataset_for_profiler)
        except Exception as e:
            raise DataDesignerProfilingError(f"🛑 Error profiling preview dataset: {e}") from e

        # Only announce success when there is both data and at least one column statistic.
        if (
            len(processed_dataset) > 0
            and isinstance(analysis, DatasetProfilerResults)
            and len(analysis.column_statistics) > 0
        ):
            logger.info(f"{RandomEmoji.success()} Preview complete!")

        return PreviewResults(
            dataset=processed_dataset,
            analysis=analysis,
            config_builder=config_builder,
        )

    def get_default_model_configs(self) -> list[ModelConfig]:
        """Get the default model configurations.

        Returns:
            List of default model configurations.
        """
        logger.info(f"♻️ Using default model configs from {str(MODEL_CONFIGS_FILE_PATH)!r}")
        return get_default_model_configs()

    def get_default_model_providers(self) -> list[ModelProvider]:
        """Get the default model providers.

        Returns:
            List of default model providers.
        """
        logger.info(f"♻️ Using default model providers from {str(MODEL_PROVIDERS_FILE_PATH)!r}")
        return get_default_providers()

    @property
    def secret_resolver(self) -> SecretResolver:
        """Get the secret resolver used by this DataDesigner instance.

        Returns:
            The SecretResolver instance handling credentials and secrets.
        """
        return self._secret_resolver

    def set_buffer_size(self, buffer_size: int) -> None:
        """Set the buffer size for dataset generation.

        The buffer size controls how many records are processed in memory at once
        during dataset generation using the `create` method. The default value is
        set to the constant `DEFAULT_BUFFER_SIZE` defined in the data_designer module.

        Args:
            buffer_size: Number of records to process in each buffer.

        Raises:
            InvalidBufferValueError: If buffer size is less than or equal to 0.
        """
        if buffer_size <= 0:
            raise InvalidBufferValueError("Buffer size must be greater than 0.")
        self._buffer_size = buffer_size

    def _create_dataset_builder(
        self, config_builder: DataDesignerConfigBuilder, resource_provider: ResourceProvider
    ) -> ColumnWiseDatasetBuilder:
        """Build the column-wise dataset builder from the compiled configuration."""
        return ColumnWiseDatasetBuilder(
            column_configs=compile_dataset_builder_column_configs(config_builder.build(raise_exceptions=True)),
            processor_configs=config_builder.get_processor_configs(),
            resource_provider=resource_provider,
        )

    def _create_dataset_profiler(
        self, config_builder: DataDesignerConfigBuilder, resource_provider: ResourceProvider
    ) -> DataDesignerDatasetProfiler:
        """Build a dataset profiler from the builder's column and profiler configs."""
        return DataDesignerDatasetProfiler(
            config=DatasetProfilerConfig(
                column_configs=config_builder.get_column_configs(),
                column_profiler_configs=config_builder.get_profilers(),
            ),
            resource_provider=resource_provider,
        )

    def _create_resource_provider(
        self, dataset_name: str, config_builder: DataDesignerConfigBuilder
    ) -> ResourceProvider:
        """Assemble artifact storage, model registry, blob storage and seed datastore.

        A local seed datastore is used when the config builder reports no seed
        datastore settings; otherwise an HF Hub datastore is created from the
        settings' endpoint and token.
        """
        model_configs = config_builder.model_configs
        ArtifactStorage.mkdir_if_needed(self._artifact_path)
        return ResourceProvider(
            artifact_storage=ArtifactStorage(artifact_path=self._artifact_path, dataset_name=dataset_name),
            model_registry=create_model_registry(
                model_configs=model_configs,
                model_provider_registry=self._model_provider_registry,
                secret_resolver=self._secret_resolver,
            ),
            blob_storage=init_managed_blob_storage(str(self._managed_assets_path)),
            datastore=(
                LocalSeedDatasetDataStore()
                if (settings := config_builder.get_seed_datastore_settings()) is None
                else HfHubSeedDatasetDataStore(
                    endpoint=settings.endpoint,
                    token=settings.token,
                )
            ),
        )
@@ -0,0 +1,16 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from data_designer.errors import DataDesignerError
5
+
6
+
7
class DataDesignerProfilingError(DataDesignerError):
    """Raised for errors related to Data Designer dataset profiling."""
9
+
10
+
11
class DataDesignerGenerationError(DataDesignerError):
    """Raised for errors related to Data Designer dataset generation."""
13
+
14
+
15
class InvalidBufferValueError(DataDesignerError):
    """Raised when an invalid buffer value is supplied."""
@@ -0,0 +1,55 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ import pandas as pd
7
+
8
+ from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
9
+ from data_designer.config.config_builder import DataDesignerConfigBuilder
10
+ from data_designer.config.utils.visualization import WithRecordSamplerMixin
11
+ from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage
12
+
13
+
14
class DatasetCreationResults(WithRecordSamplerMixin):
    """Holds the outcome of one dataset creation run.

    Instances bundle the artifact storage, the profiling analysis and the
    configuration builder of a run. The class is what DataDesigner.create()
    returns and implements ResultsProtocol of the DataDesigner interface.
    """

    def __init__(
        self,
        *,
        artifact_storage: ArtifactStorage,
        analysis: DatasetProfilerResults,
        config_builder: DataDesignerConfigBuilder,
    ):
        """Store the pieces produced by a dataset creation run.

        Args:
            artifact_storage: Storage manager for accessing generated artifacts.
            analysis: Profiling results for the generated dataset.
            config_builder: Configuration builder used to create the dataset.
        """
        self._config_builder = config_builder
        self._analysis = analysis
        # Kept public so callers can reach the stored artifacts directly.
        self.artifact_storage = artifact_storage

    def load_analysis(self) -> DatasetProfilerResults:
        """Return the profiling analysis captured for the generated dataset.

        Returns:
            The DatasetProfilerResults object passed in at construction time.
        """
        profiling_results = self._analysis
        return profiling_results

    def load_dataset(self) -> pd.DataFrame:
        """Return the generated dataset as a pandas DataFrame.

        Returns:
            The DataFrame produced by the artifact storage's `load_dataset`.
        """
        storage = self.artifact_storage
        return storage.load_dataset()
@@ -0,0 +1,161 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from dataclasses import dataclass, field
5
+ import logging
6
+ from pathlib import Path
7
+ import random
8
+ import sys
9
+ from typing import TextIO, Union
10
+
11
+ from pythonjsonlogger import jsonlogger
12
+
13
+
14
@dataclass
class LoggerConfig:
    """Level setting for one named logger."""

    # Name handed to `logging.getLogger` when the configuration is applied.
    name: str
    # Level handed to that logger's `setLevel`, e.g. "INFO" or "DEBUG".
    level: str
18
+
19
+
20
@dataclass
class OutputConfig:
    """Description of one log output: where it goes and how it is formatted."""

    # A text stream, or a Path that selects a file-based handler.
    destination: Union[TextIO, Path]
    # True selects the JSON formatter, False the plain-text formatter.
    structured: bool
24
+
25
+
26
@dataclass
class LoggingConfig:
    """Complete logging setup consumed by `configure_logging`.

    Use `LoggingConfig.default()` or `LoggingConfig.debug()` for the stock
    stderr configurations at INFO and DEBUG level respectively.
    """

    # Per-logger level settings.
    logger_configs: list[LoggerConfig]
    # One handler is created for each entry.
    output_configs: list[OutputConfig]
    # Level applied to the root logger.
    root_level: str = "INFO"
    # Logger names to quieten. The module default is copied so that mutating one
    # config's list never alters the shared default or other instances.
    to_silence: list[str] = field(default_factory=lambda: list(_DEFAULT_NOISY_LOGGERS))

    @classmethod
    def default(cls) -> "LoggingConfig":
        """Return a config logging `data_designer` at INFO to stderr as plain text."""
        return cls(
            logger_configs=[LoggerConfig(name="data_designer", level="INFO")],
            output_configs=[OutputConfig(destination=sys.stderr, structured=False)],
        )

    @classmethod
    def debug(cls) -> "LoggingConfig":
        """Return a config logging `data_designer` at DEBUG to stderr as plain text."""
        return cls(
            logger_configs=[LoggerConfig(name="data_designer", level="DEBUG")],
            output_configs=[OutputConfig(destination=sys.stderr, structured=False)],
        )
46
+
47
+
48
class RandomEmoji:
    """Namespace of static helpers that each pick one emoji from a themed set."""

    @staticmethod
    def cooking() -> str:
        """Pick a cooking / food preparation emoji at random."""
        options = ("👨‍🍳", "👩‍🍳", "🍳", "🥘", "🍲", "🔪", "🥄", "🍴", "⏲️", "🥗")
        return random.choice(options)

    @staticmethod
    def data() -> str:
        """Pick a data / analytics emoji at random."""
        options = ("📊", "📈", "📉", "💾", "💿", "📀", "🗄️", "📁", "📂", "🗃️")
        return random.choice(options)

    @staticmethod
    def generating() -> str:
        """Pick a generating / creating emoji at random."""
        options = ("🏭", "⚙️", "🔨", "🛠️", "🏗️", "🎨", "✍️", "📝", "🔧", "⚒️")
        return random.choice(options)

    @staticmethod
    def loading() -> str:
        """Pick a loading / waiting emoji at random."""
        options = ("⏳", "⌛", "🔄", "♻️", "🔃", "⏰", "⏱️", "⏲️", "📡", "🌀")
        return random.choice(options)

    @staticmethod
    def magic() -> str:
        """Pick a magical / special effect emoji at random."""
        options = ("✨", "⭐", "🌟", "💫", "🪄", "🔮", "🎩", "🌈", "💎", "🦄")
        return random.choice(options)

    @staticmethod
    def previewing() -> str:
        """Pick a previewing / looking ahead emoji at random."""
        options = ("👀", "📺", "🔁", "👁️", "🔭", "🕵️", "🧐", "📸", "🎥", "🖼️")
        return random.choice(options)

    @staticmethod
    def speed() -> str:
        """Pick a speed / fast emoji at random."""
        options = ("⚡", "💨", "🏃", "🏎️", "🚄", "✈️", "💥", "⏩", "🏃‍♂️", "🏃‍♀️")
        return random.choice(options)

    @staticmethod
    def start() -> str:
        """Pick an emoji for starting or launching something at random."""
        options = ("🚀", "▶️", "🎬", "🌅", "🏁", "🎯", "🚦", "🔔", "📣", "🎺")
        return random.choice(options)

    @staticmethod
    def success() -> str:
        """Pick a success / celebration emoji at random."""
        options = ("🎉", "🎊", "👏", "🙌", "🎆", "🍾", "☀️", "🏆", "✅", "🥳")
        return random.choice(options)

    @staticmethod
    def thinking() -> str:
        """Pick a thinking / processing emoji at random."""
        options = ("🤔", "💭", "🧠", "💡", "🔍", "🔎", "🤨", "🧐", "📝", "🧮")
        return random.choice(options)

    @staticmethod
    def working() -> str:
        """Pick a working / in-progress emoji at random."""
        options = ("⚙️", "🔧", "🔨", "⚒️", "🛠️", "💼", "👷", "🏗️", "🪛", "👨‍💻")
        return random.choice(options)
105
+
106
+
107
def configure_logging(config: LoggingConfig) -> None:
    """Apply a logging configuration to the root logger and named loggers.

    Existing root handlers are dropped, one handler per output config is
    attached to the root logger, levels are set for the root and for every
    configured logger, and each name in `config.to_silence` is handed to
    `quiet_noisy_logger`.

    Args:
        config: The logging configuration to apply.
    """
    root = logging.getLogger()

    # Start from a clean slate: drop whatever handlers the root logger had.
    root.handlers.clear()

    # Build every handler first, then attach them.
    pending = []
    for output in config.output_configs:
        pending.append(_create_handler(output))
    for new_handler in pending:
        root.addHandler(new_handler)

    # Root level first, then the per-logger levels.
    root.setLevel(config.root_level)
    for entry in config.logger_configs:
        logging.getLogger(entry.name).setLevel(entry.level)

    # Finally tone down the loggers listed as noisy.
    for noisy_name in config.to_silence:
        quiet_noisy_logger(noisy_name)
127
+
128
+
129
def quiet_noisy_logger(name: str) -> None:
    """Strip the named logger's own handlers and raise its level to WARNING.

    Args:
        name: Name of the logger to quieten.
    """
    target = logging.getLogger(name)
    del target.handlers[:]
    target.setLevel(logging.WARNING)
133
+
134
+
135
+ def _create_handler(output_config: OutputConfig) -> logging.Handler:
136
+ if isinstance(output_config.destination, Path):
137
+ handler = logging.FileHandler(str(output_config.destination))
138
+ else:
139
+ handler = logging.StreamHandler()
140
+
141
+ if output_config.structured:
142
+ formatter = _make_json_formatter()
143
+ else:
144
+ formatter = _make_stream_formatter()
145
+
146
+ handler.setFormatter(formatter)
147
+ return handler
148
+
149
+
150
+ def _make_json_formatter() -> logging.Formatter:
151
+ log_format = "%(asctime)s %(levelname)s %(name)s %(message)s"
152
+ return jsonlogger.JsonFormatter(log_format)
153
+
154
+
155
+ def _make_stream_formatter() -> logging.Formatter:
156
+ log_format = "[%(asctime)s] [%(levelname)s] %(message)s"
157
+ time_format = "%H:%M:%S"
158
+ return logging.Formatter(log_format, time_format)
159
+
160
+
161
# Logger names silenced by default; serves as the default for `LoggingConfig.to_silence`.
_DEFAULT_NOISY_LOGGERS = ["httpx", "matplotlib"]
@@ -0,0 +1,83 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ from enum import Enum
7
+ from typing import TYPE_CHECKING, Type, TypeAlias
8
+
9
+ from .config.utils.misc import can_run_data_designer_locally
10
+
11
+ if TYPE_CHECKING:
12
+ from data_designer.plugins.plugin import Plugin
13
+
14
+
15
+ if can_run_data_designer_locally():
16
+ from data_designer.plugins.plugin import PluginType
17
+ from data_designer.plugins.registry import PluginRegistry
18
+
19
+
20
+ class PluginManager:
21
+ def __init__(self):
22
+ if can_run_data_designer_locally():
23
+ self._plugins_supported = True
24
+ self._plugin_registry = PluginRegistry()
25
+ else:
26
+ self._plugins_supported = False
27
+ self._plugin_registry = None
28
+
29
+ def get_column_generator_plugins(self) -> list[Plugin]:
30
+ """Get all column generator plugins.
31
+
32
+ Returns:
33
+ A list of all column generator plugins.
34
+ """
35
+ return self._plugin_registry.get_plugins(PluginType.COLUMN_GENERATOR) if self._plugins_supported else []
36
+
37
+ def get_column_generator_plugin_if_exists(self, plugin_name: str) -> Plugin | None:
38
+ """Get a column generator plugin by name if it exists.
39
+
40
+ Args:
41
+ plugin_name: The name of the plugin to retrieve.
42
+
43
+ Returns:
44
+ The plugin if found, otherwise None.
45
+ """
46
+ if self._plugins_supported and self._plugin_registry.plugin_exists(plugin_name):
47
+ return self._plugin_registry.get_plugin(plugin_name)
48
+ return None
49
+
50
+ def get_plugin_column_types(self, enum_type: Type[Enum], required_resources: list[str] | None = None) -> list[Enum]:
51
+ """Get a list of plugin column types.
52
+
53
+ Args:
54
+ enum_type: The enum type to use for plugin entries.
55
+ required_resources: If provided, only return plugins with the required resources.
56
+
57
+ Returns:
58
+ A list of plugin column types.
59
+ """
60
+ type_list = []
61
+ if self._plugins_supported:
62
+ for plugin in self._plugin_registry.get_plugins(PluginType.COLUMN_GENERATOR):
63
+ if required_resources:
64
+ task_required_resources = plugin.task_cls.metadata().required_resources or []
65
+ if not all(resource in task_required_resources for resource in required_resources):
66
+ continue
67
+ type_list.append(enum_type(plugin.name))
68
+ return type_list
69
+
70
+ def inject_into_column_config_type_union(self, column_config_type: Type[TypeAlias]) -> Type[TypeAlias]:
71
+ """Inject plugins into the column config type.
72
+
73
+ Args:
74
+ column_config_type: The column config type to inject plugins into.
75
+
76
+ Returns:
77
+ The column config type with plugins injected.
78
+ """
79
+ if self._plugins_supported:
80
+ column_config_type = self._plugin_registry.add_plugin_types_to_union(
81
+ column_config_type, PluginType.COLUMN_GENERATOR
82
+ )
83
+ return column_config_type
@@ -0,0 +1,6 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from data_designer.plugins.plugin import Plugin, PluginType
5
+
6
+ __all__ = ["Plugin", "PluginType"]
@@ -0,0 +1,10 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from data_designer.errors import DataDesignerError
5
+
6
+
7
class PluginRegistrationError(DataDesignerError):
    """Error type for plugin registration failures (raise sites are outside this module)."""
8
+
9
+
10
class PluginNotFoundError(DataDesignerError):
    """Error type for a plugin that could not be found (raise sites are outside this module)."""