data-designer 0.3.8rc2__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. data_designer/cli/commands/__init__.py +1 -1
  2. data_designer/interface/__init__.py +21 -1
  3. data_designer/{_version.py → interface/_version.py} +2 -2
  4. data_designer/interface/data_designer.py +1 -7
  5. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0.dist-info}/METADATA +10 -42
  6. data_designer-0.4.0.dist-info/RECORD +39 -0
  7. data_designer/__init__.py +0 -17
  8. data_designer/config/__init__.py +0 -2
  9. data_designer/config/analysis/__init__.py +0 -2
  10. data_designer/config/analysis/column_profilers.py +0 -159
  11. data_designer/config/analysis/column_statistics.py +0 -421
  12. data_designer/config/analysis/dataset_profiler.py +0 -84
  13. data_designer/config/analysis/utils/errors.py +0 -10
  14. data_designer/config/analysis/utils/reporting.py +0 -192
  15. data_designer/config/base.py +0 -69
  16. data_designer/config/column_configs.py +0 -470
  17. data_designer/config/column_types.py +0 -141
  18. data_designer/config/config_builder.py +0 -595
  19. data_designer/config/data_designer_config.py +0 -40
  20. data_designer/config/dataset_builders.py +0 -13
  21. data_designer/config/dataset_metadata.py +0 -18
  22. data_designer/config/default_model_settings.py +0 -129
  23. data_designer/config/errors.py +0 -24
  24. data_designer/config/exports.py +0 -145
  25. data_designer/config/interface.py +0 -55
  26. data_designer/config/models.py +0 -455
  27. data_designer/config/preview_results.py +0 -41
  28. data_designer/config/processors.py +0 -148
  29. data_designer/config/run_config.py +0 -51
  30. data_designer/config/sampler_constraints.py +0 -52
  31. data_designer/config/sampler_params.py +0 -639
  32. data_designer/config/seed.py +0 -116
  33. data_designer/config/seed_source.py +0 -84
  34. data_designer/config/seed_source_types.py +0 -19
  35. data_designer/config/utils/code_lang.py +0 -82
  36. data_designer/config/utils/constants.py +0 -363
  37. data_designer/config/utils/errors.py +0 -21
  38. data_designer/config/utils/info.py +0 -94
  39. data_designer/config/utils/io_helpers.py +0 -258
  40. data_designer/config/utils/misc.py +0 -78
  41. data_designer/config/utils/numerical_helpers.py +0 -30
  42. data_designer/config/utils/type_helpers.py +0 -106
  43. data_designer/config/utils/visualization.py +0 -482
  44. data_designer/config/validator_params.py +0 -94
  45. data_designer/engine/__init__.py +0 -2
  46. data_designer/engine/analysis/column_profilers/base.py +0 -49
  47. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +0 -153
  48. data_designer/engine/analysis/column_profilers/registry.py +0 -22
  49. data_designer/engine/analysis/column_statistics.py +0 -145
  50. data_designer/engine/analysis/dataset_profiler.py +0 -149
  51. data_designer/engine/analysis/errors.py +0 -9
  52. data_designer/engine/analysis/utils/column_statistics_calculations.py +0 -234
  53. data_designer/engine/analysis/utils/judge_score_processing.py +0 -132
  54. data_designer/engine/column_generators/__init__.py +0 -2
  55. data_designer/engine/column_generators/generators/__init__.py +0 -2
  56. data_designer/engine/column_generators/generators/base.py +0 -122
  57. data_designer/engine/column_generators/generators/embedding.py +0 -35
  58. data_designer/engine/column_generators/generators/expression.py +0 -55
  59. data_designer/engine/column_generators/generators/llm_completion.py +0 -113
  60. data_designer/engine/column_generators/generators/samplers.py +0 -69
  61. data_designer/engine/column_generators/generators/seed_dataset.py +0 -144
  62. data_designer/engine/column_generators/generators/validation.py +0 -140
  63. data_designer/engine/column_generators/registry.py +0 -60
  64. data_designer/engine/column_generators/utils/errors.py +0 -15
  65. data_designer/engine/column_generators/utils/generator_classification.py +0 -43
  66. data_designer/engine/column_generators/utils/judge_score_factory.py +0 -58
  67. data_designer/engine/column_generators/utils/prompt_renderer.py +0 -100
  68. data_designer/engine/compiler.py +0 -97
  69. data_designer/engine/configurable_task.py +0 -71
  70. data_designer/engine/dataset_builders/artifact_storage.py +0 -283
  71. data_designer/engine/dataset_builders/column_wise_builder.py +0 -335
  72. data_designer/engine/dataset_builders/errors.py +0 -15
  73. data_designer/engine/dataset_builders/multi_column_configs.py +0 -46
  74. data_designer/engine/dataset_builders/utils/__init__.py +0 -2
  75. data_designer/engine/dataset_builders/utils/concurrency.py +0 -212
  76. data_designer/engine/dataset_builders/utils/config_compiler.py +0 -62
  77. data_designer/engine/dataset_builders/utils/dag.py +0 -62
  78. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +0 -200
  79. data_designer/engine/dataset_builders/utils/errors.py +0 -15
  80. data_designer/engine/errors.py +0 -51
  81. data_designer/engine/model_provider.py +0 -77
  82. data_designer/engine/models/__init__.py +0 -2
  83. data_designer/engine/models/errors.py +0 -300
  84. data_designer/engine/models/facade.py +0 -287
  85. data_designer/engine/models/factory.py +0 -42
  86. data_designer/engine/models/litellm_overrides.py +0 -179
  87. data_designer/engine/models/parsers/__init__.py +0 -2
  88. data_designer/engine/models/parsers/errors.py +0 -34
  89. data_designer/engine/models/parsers/parser.py +0 -235
  90. data_designer/engine/models/parsers/postprocessors.py +0 -93
  91. data_designer/engine/models/parsers/tag_parsers.py +0 -62
  92. data_designer/engine/models/parsers/types.py +0 -84
  93. data_designer/engine/models/recipes/base.py +0 -81
  94. data_designer/engine/models/recipes/response_recipes.py +0 -293
  95. data_designer/engine/models/registry.py +0 -146
  96. data_designer/engine/models/telemetry.py +0 -359
  97. data_designer/engine/models/usage.py +0 -73
  98. data_designer/engine/models/utils.py +0 -38
  99. data_designer/engine/processing/ginja/__init__.py +0 -2
  100. data_designer/engine/processing/ginja/ast.py +0 -65
  101. data_designer/engine/processing/ginja/environment.py +0 -463
  102. data_designer/engine/processing/ginja/exceptions.py +0 -56
  103. data_designer/engine/processing/ginja/record.py +0 -32
  104. data_designer/engine/processing/gsonschema/__init__.py +0 -2
  105. data_designer/engine/processing/gsonschema/exceptions.py +0 -15
  106. data_designer/engine/processing/gsonschema/schema_transformers.py +0 -83
  107. data_designer/engine/processing/gsonschema/types.py +0 -10
  108. data_designer/engine/processing/gsonschema/validators.py +0 -202
  109. data_designer/engine/processing/processors/base.py +0 -13
  110. data_designer/engine/processing/processors/drop_columns.py +0 -42
  111. data_designer/engine/processing/processors/registry.py +0 -25
  112. data_designer/engine/processing/processors/schema_transform.py +0 -49
  113. data_designer/engine/processing/utils.py +0 -169
  114. data_designer/engine/registry/base.py +0 -99
  115. data_designer/engine/registry/data_designer_registry.py +0 -39
  116. data_designer/engine/registry/errors.py +0 -12
  117. data_designer/engine/resources/managed_dataset_generator.py +0 -39
  118. data_designer/engine/resources/managed_dataset_repository.py +0 -197
  119. data_designer/engine/resources/managed_storage.py +0 -65
  120. data_designer/engine/resources/resource_provider.py +0 -77
  121. data_designer/engine/resources/seed_reader.py +0 -154
  122. data_designer/engine/sampling_gen/column.py +0 -91
  123. data_designer/engine/sampling_gen/constraints.py +0 -100
  124. data_designer/engine/sampling_gen/data_sources/base.py +0 -217
  125. data_designer/engine/sampling_gen/data_sources/errors.py +0 -12
  126. data_designer/engine/sampling_gen/data_sources/sources.py +0 -347
  127. data_designer/engine/sampling_gen/entities/__init__.py +0 -2
  128. data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
  129. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +0 -86
  130. data_designer/engine/sampling_gen/entities/email_address_utils.py +0 -171
  131. data_designer/engine/sampling_gen/entities/errors.py +0 -10
  132. data_designer/engine/sampling_gen/entities/national_id_utils.py +0 -102
  133. data_designer/engine/sampling_gen/entities/person.py +0 -144
  134. data_designer/engine/sampling_gen/entities/phone_number.py +0 -128
  135. data_designer/engine/sampling_gen/errors.py +0 -26
  136. data_designer/engine/sampling_gen/generator.py +0 -122
  137. data_designer/engine/sampling_gen/jinja_utils.py +0 -64
  138. data_designer/engine/sampling_gen/people_gen.py +0 -199
  139. data_designer/engine/sampling_gen/person_constants.py +0 -56
  140. data_designer/engine/sampling_gen/schema.py +0 -147
  141. data_designer/engine/sampling_gen/schema_builder.py +0 -61
  142. data_designer/engine/sampling_gen/utils.py +0 -46
  143. data_designer/engine/secret_resolver.py +0 -82
  144. data_designer/engine/validation.py +0 -367
  145. data_designer/engine/validators/__init__.py +0 -19
  146. data_designer/engine/validators/base.py +0 -38
  147. data_designer/engine/validators/local_callable.py +0 -39
  148. data_designer/engine/validators/python.py +0 -254
  149. data_designer/engine/validators/remote.py +0 -89
  150. data_designer/engine/validators/sql.py +0 -65
  151. data_designer/errors.py +0 -7
  152. data_designer/essentials/__init__.py +0 -33
  153. data_designer/lazy_heavy_imports.py +0 -54
  154. data_designer/logging.py +0 -163
  155. data_designer/plugin_manager.py +0 -78
  156. data_designer/plugins/__init__.py +0 -8
  157. data_designer/plugins/errors.py +0 -15
  158. data_designer/plugins/plugin.py +0 -141
  159. data_designer/plugins/registry.py +0 -88
  160. data_designer/plugins/testing/__init__.py +0 -10
  161. data_designer/plugins/testing/stubs.py +0 -116
  162. data_designer/plugins/testing/utils.py +0 -20
  163. data_designer-0.3.8rc2.dist-info/RECORD +0 -196
  164. data_designer-0.3.8rc2.dist-info/licenses/LICENSE +0 -201
  165. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0.dist-info}/WHEEL +0 -0
  166. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0.dist-info}/entry_points.txt +0 -0
--- a/data_designer/engine/dataset_builders/column_wise_builder.py
+++ /dev/null
@@ -1,335 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- from __future__ import annotations
-
- import functools
- import importlib.metadata
- import json
- import logging
- import time
- import uuid
- from pathlib import Path
- from typing import TYPE_CHECKING, Callable
-
- from data_designer.config.column_types import ColumnConfigT
- from data_designer.config.config_builder import BuilderConfig
- from data_designer.config.data_designer_config import DataDesignerConfig
- from data_designer.config.dataset_builders import BuildStage
- from data_designer.config.processors import (
-     DropColumnsProcessorConfig,
-     ProcessorConfig,
-     ProcessorType,
- )
- from data_designer.engine.column_generators.generators.base import (
-     ColumnGenerator,
-     ColumnGeneratorWithModel,
-     GenerationStrategy,
- )
- from data_designer.engine.column_generators.utils.generator_classification import column_type_is_model_generated
- from data_designer.engine.compiler import compile_data_designer_config
- from data_designer.engine.dataset_builders.artifact_storage import SDG_CONFIG_FILENAME, ArtifactStorage
- from data_designer.engine.dataset_builders.errors import DatasetGenerationError, DatasetProcessingError
- from data_designer.engine.dataset_builders.multi_column_configs import MultiColumnConfig
- from data_designer.engine.dataset_builders.utils.concurrency import ConcurrentThreadExecutor
- from data_designer.engine.dataset_builders.utils.config_compiler import compile_dataset_builder_column_configs
- from data_designer.engine.dataset_builders.utils.dataset_batch_manager import DatasetBatchManager
- from data_designer.engine.models.telemetry import InferenceEvent, NemoSourceEnum, TaskStatusEnum, TelemetryHandler
- from data_designer.engine.processing.processors.base import Processor
- from data_designer.engine.processing.processors.drop_columns import DropColumnsProcessor
- from data_designer.engine.registry.data_designer_registry import DataDesignerRegistry
- from data_designer.engine.resources.resource_provider import ResourceProvider
- from data_designer.lazy_heavy_imports import pd
-
- if TYPE_CHECKING:
-     import pandas as pd
-
-     from data_designer.engine.column_generators.generators.base import ColumnGeneratorWithModelRegistry
-     from data_designer.engine.models.usage import ModelUsageStats
-
- logger = logging.getLogger(__name__)
-
- _CLIENT_VERSION: str = importlib.metadata.version("data_designer")
-
-
- class ColumnWiseDatasetBuilder:
-     def __init__(
-         self,
-         data_designer_config: DataDesignerConfig,
-         resource_provider: ResourceProvider,
-         registry: DataDesignerRegistry | None = None,
-     ):
-         self.batch_manager = DatasetBatchManager(resource_provider.artifact_storage)
-         self._resource_provider = resource_provider
-         self._records_to_drop: set[int] = set()
-         self._registry = registry or DataDesignerRegistry()
-
-         self._data_designer_config = compile_data_designer_config(data_designer_config, resource_provider)
-         self._column_configs = compile_dataset_builder_column_configs(self._data_designer_config)
-         self._processors: dict[BuildStage, list[Processor]] = self._initialize_processors(
-             self._data_designer_config.processors or []
-         )
-         self._validate_column_configs()
-
-     @property
-     def artifact_storage(self) -> ArtifactStorage:
-         return self._resource_provider.artifact_storage
-
-     @functools.cached_property
-     def single_column_configs(self) -> list[ColumnConfigT]:
-         configs = []
-         for config in self._column_configs:
-             if isinstance(config, MultiColumnConfig):
-                 configs.extend(config.columns)
-             else:
-                 configs.append(config)
-         return configs
-
-     @functools.cached_property
-     def llm_generated_column_configs(self) -> list[ColumnConfigT]:
-         return [config for config in self.single_column_configs if column_type_is_model_generated(config.column_type)]
-
-     def build(
-         self,
-         *,
-         num_records: int,
-         on_batch_complete: Callable[[Path], None] | None = None,
-     ) -> Path:
-         self._run_model_health_check_if_needed()
-         self._write_builder_config()
-         generators = self._initialize_generators()
-         start_time = time.perf_counter()
-         group_id = uuid.uuid4().hex
-
-         buffer_size = self._resource_provider.run_config.buffer_size
-         self.batch_manager.start(num_records=num_records, buffer_size=buffer_size)
-         for batch_idx in range(self.batch_manager.num_batches):
-             logger.info(f"⏳ Processing batch {batch_idx + 1} of {self.batch_manager.num_batches}")
-             self._run_batch(generators, batch_mode="batch", group_id=group_id)
-             df_batch = self._run_processors(
-                 stage=BuildStage.POST_BATCH,
-                 dataframe=self.batch_manager.get_current_batch(as_dataframe=True),
-                 current_batch_number=batch_idx,
-             )
-             self._write_processed_batch(df_batch)
-             self.batch_manager.finish_batch(on_batch_complete)
-         self.batch_manager.finish()
-
-         model_usage_stats = self._resource_provider.model_registry.get_model_usage_stats(
-             time.perf_counter() - start_time
-         )
-         logger.info(f"📊 Model usage summary:\n{json.dumps(model_usage_stats, indent=4)}")
-
-         return self.artifact_storage.final_dataset_path
-
-     def build_preview(self, *, num_records: int) -> pd.DataFrame:
-         self._run_model_health_check_if_needed()
-
-         generators = self._initialize_generators()
-         group_id = uuid.uuid4().hex
-         start_time = time.perf_counter()
-         self.batch_manager.start(num_records=num_records, buffer_size=num_records)
-         self._run_batch(generators, batch_mode="preview", save_partial_results=False, group_id=group_id)
-         dataset = self.batch_manager.get_current_batch(as_dataframe=True)
-         self.batch_manager.reset()
-
-         model_usage_stats = self._resource_provider.model_registry.get_model_usage_stats(
-             time.perf_counter() - start_time
-         )
-         logger.info(f"📊 Model usage summary:\n{json.dumps(model_usage_stats, indent=4)}")
-
-         return dataset
-
-     def process_preview(self, dataset: pd.DataFrame) -> pd.DataFrame:
-         return self._run_processors(
-             stage=BuildStage.POST_BATCH,
-             dataframe=dataset.copy(),
-             current_batch_number=None,  # preview mode does not have a batch number
-         )
-
-     def _initialize_generators(self) -> list[ColumnGenerator]:
-         return [
-             self._registry.column_generators.get_for_config_type(type(config))(
-                 config=config, resource_provider=self._resource_provider
-             )
-             for config in self._column_configs
-         ]
-
-     def _write_builder_config(self) -> None:
-         self.artifact_storage.mkdir_if_needed(self.artifact_storage.base_dataset_path)
-         BuilderConfig(data_designer=self._data_designer_config).to_json(
-             self.artifact_storage.base_dataset_path / SDG_CONFIG_FILENAME
-         )
-
-     def _run_batch(
-         self, generators: list[ColumnGenerator], *, batch_mode: str, save_partial_results: bool = True, group_id: str
-     ) -> None:
-         pre_batch_snapshot = self._resource_provider.model_registry.get_model_usage_snapshot()
-         for generator in generators:
-             generator.log_pre_generation()
-             try:
-                 generation_strategy = generator.get_generation_strategy()
-                 if generator.can_generate_from_scratch and self.batch_manager.buffer_is_empty:
-                     self._run_from_scratch_column_generator(generator)
-                 elif generation_strategy == GenerationStrategy.CELL_BY_CELL:
-                     self._run_cell_by_cell_generator(generator)
-                 elif generation_strategy == GenerationStrategy.FULL_COLUMN:
-                     self._run_full_column_generator(generator)
-                 else:
-                     logger.error(f"❌ Unknown generation strategy: {generation_strategy}")
-                     raise DatasetGenerationError(f"🛑 Unknown generation strategy: {generation_strategy}")
-                 if save_partial_results:
-                     self.batch_manager.write()
-             except Exception as e:
-                 column_error_str = (
-                     f"columns {generator.config.column_names}"
-                     if hasattr(generator.config, "column_names")
-                     else f"column {generator.config.name!r}"
-                 )
-                 raise DatasetGenerationError(f"🛑 Failed to process {column_error_str}:\n{e}")
-
-         try:
-             usage_deltas = self._resource_provider.model_registry.get_usage_deltas(pre_batch_snapshot)
-             self._emit_batch_inference_events(batch_mode, usage_deltas, group_id)
-         except Exception:
-             pass
-
-     def _run_from_scratch_column_generator(self, generator: ColumnGenerator) -> None:
-         df = generator.generate_from_scratch(self.batch_manager.num_records_batch)
-         self.batch_manager.add_records(df.to_dict(orient="records"))
-
-     def _run_cell_by_cell_generator(self, generator: ColumnGenerator) -> None:
-         max_workers = self._resource_provider.run_config.non_inference_max_parallel_workers
-         if isinstance(generator, ColumnGeneratorWithModel):
-             max_workers = generator.inference_parameters.max_parallel_requests
-         self._fan_out_with_threads(generator, max_workers=max_workers)
-
-     def _run_full_column_generator(self, generator: ColumnGenerator) -> None:
-         df = generator.generate(self.batch_manager.get_current_batch(as_dataframe=True))
-         self.batch_manager.update_records(df.to_dict(orient="records"))
-
-     def _run_model_health_check_if_needed(self) -> bool:
-         if any(column_type_is_model_generated(config.column_type) for config in self.single_column_configs):
-             self._resource_provider.model_registry.run_health_check(
-                 list(set(config.model_alias for config in self.llm_generated_column_configs))
-             )
-
-     def _fan_out_with_threads(self, generator: ColumnGeneratorWithModelRegistry, max_workers: int) -> None:
-         if generator.get_generation_strategy() != GenerationStrategy.CELL_BY_CELL:
-             raise DatasetGenerationError(
-                 f"Generator {generator.name} is not a {GenerationStrategy.CELL_BY_CELL} "
-                 "generator so concurrency through threads is not supported."
-             )
-
-         logger.info(
-             f"🐙 Processing {generator.config.column_type} column '{generator.config.name}' "
-             f"with {max_workers} concurrent workers"
-         )
-         settings = self._resource_provider.run_config
-         with ConcurrentThreadExecutor(
-             max_workers=max_workers,
-             column_name=generator.config.name,
-             result_callback=self._worker_result_callback,
-             error_callback=self._worker_error_callback,
-             shutdown_error_rate=settings.shutdown_error_rate,
-             shutdown_error_window=settings.shutdown_error_window,
-             disable_early_shutdown=settings.disable_early_shutdown,
-         ) as executor:
-             for i, record in self.batch_manager.iter_current_batch():
-                 executor.submit(lambda record: generator.generate(record), record, context={"index": i})
-
-         if len(self._records_to_drop) > 0:
-             self.batch_manager.drop_records(self._records_to_drop)
-             self._records_to_drop.clear()
-
-     def _write_processed_batch(self, dataframe: pd.DataFrame) -> None:
-         self.batch_manager.update_records(dataframe.to_dict(orient="records"))
-         self.batch_manager.write()
-
-     def _validate_column_configs(self) -> None:
-         if len(self._column_configs) == 0:
-             raise DatasetGenerationError("🛑 No column configs provided.")
-
-         if not self._registry.column_generators.get_for_config_type(
-             type(self._column_configs[0])
-         ).can_generate_from_scratch:
-             raise DatasetGenerationError("🛑 The first column config must be a from-scratch column generator.")
-
-     def _initialize_processors(self, processor_configs: list[ProcessorConfig]) -> dict[BuildStage, list[Processor]]:
-         # Check columns marked for drop
-         columns_to_drop = [config.name for config in self.single_column_configs if config.drop]
-
-         processors: dict[BuildStage, list[Processor]] = {stage: [] for stage in BuildStage}
-         for config in processor_configs:
-             processors[config.build_stage].append(
-                 self._registry.processors.get_for_config_type(type(config))(
-                     config=config,
-                     resource_provider=self._resource_provider,
-                 )
-             )
-
-             # Manually included "drop columns" processor takes precedence (can e.g., pick stages other than post-batch)
-             if config.processor_type == ProcessorType.DROP_COLUMNS:
-                 for column in config.column_names:
-                     if column in columns_to_drop:
-                         columns_to_drop.remove(column)
-
-         # If there are still columns marked for drop, add the "drop columns" processor to drop them
-         if len(columns_to_drop) > 0:
-             processors[BuildStage.POST_BATCH].append(  # as post-batch by default
-                 DropColumnsProcessor(
-                     config=DropColumnsProcessorConfig(
-                         name="default_drop_columns_processor",
-                         column_names=columns_to_drop,
-                         build_stage=BuildStage.POST_BATCH,
-                     ),
-                     resource_provider=self._resource_provider,
-                 )
-             )
-
-         return processors
-
-     def _run_processors(
-         self, stage: BuildStage, dataframe: pd.DataFrame, current_batch_number: int | None = None
-     ) -> pd.DataFrame:
-         for processor in self._processors[stage]:
-             try:
-                 dataframe = processor.process(dataframe, current_batch_number=current_batch_number)
-             except Exception as e:
-                 raise DatasetProcessingError(
-                     f"🛑 Failed to process dataset with processor {processor.name} in stage {stage}: {e}"
-                 ) from e
-         return dataframe
-
-     def _worker_error_callback(self, exc: Exception, *, context: dict | None = None) -> None:
-         """If a worker fails, we can handle the exception here."""
-         logger.warning(
-             f"⚠️ Generation for record at index {context['index']} failed. "
-             f"Will omit this record from the dataset.\n{exc}"
-         )
-         self._records_to_drop.add(context["index"])
-
-     def _worker_result_callback(self, result: dict, *, context: dict | None = None) -> None:
-         self.batch_manager.update_record(context["index"], result)
-
-     def _emit_batch_inference_events(
-         self, batch_mode: str, usage_deltas: dict[str, ModelUsageStats], group_id: str
-     ) -> None:
-         if not usage_deltas:
-             return
-
-         events = [
-             InferenceEvent(
-                 nemo_source=NemoSourceEnum.DATADESIGNER,
-                 task=batch_mode,
-                 task_status=TaskStatusEnum.SUCCESS,
-                 model=model_name,
-                 input_tokens=delta.token_usage.input_tokens,
-                 output_tokens=delta.token_usage.output_tokens,
-             )
-             for model_name, delta in usage_deltas.items()
-         ]
-
-         with TelemetryHandler(source_client_version=_CLIENT_VERSION, session_id=group_id) as telemetry_handler:
-             for event in events:
-                 telemetry_handler.enqueue(event)
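
Note: `_run_batch` accounts for model usage by snapshotting the registry before the batch and diffing afterwards. A minimal, self-contained sketch of that snapshot/delta pattern (stand-in types, not the deleted module's API):

from dataclasses import dataclass

@dataclass
class UsageSnapshot:
    input_tokens: int = 0
    output_tokens: int = 0

class UsageTracker:
    def __init__(self) -> None:
        self._totals = UsageSnapshot()

    def record(self, input_tokens: int, output_tokens: int) -> None:
        self._totals.input_tokens += input_tokens
        self._totals.output_tokens += output_tokens

    def snapshot(self) -> UsageSnapshot:
        # Copy the running totals so later mutation doesn't affect the snapshot.
        return UsageSnapshot(self._totals.input_tokens, self._totals.output_tokens)

    def deltas_since(self, before: UsageSnapshot) -> UsageSnapshot:
        return UsageSnapshot(
            self._totals.input_tokens - before.input_tokens,
            self._totals.output_tokens - before.output_tokens,
        )

tracker = UsageTracker()
before = tracker.snapshot()
tracker.record(input_tokens=120, output_tokens=48)  # work done during the batch
delta = tracker.deltas_since(before)
assert (delta.input_tokens, delta.output_tokens) == (120, 48)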
--- a/data_designer/engine/dataset_builders/errors.py
+++ /dev/null
@@ -1,15 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- from __future__ import annotations
-
- from data_designer.engine.errors import DataDesignerError
-
-
- class ArtifactStorageError(DataDesignerError): ...
-
-
- class DatasetGenerationError(DataDesignerError): ...
-
-
- class DatasetProcessingError(DataDesignerError): ...
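
These are empty marker subclasses: sharing the `DataDesignerError` base lets call sites catch any builder failure in one clause. A hypothetical illustration (the `try` body here is invented):

class DataDesignerError(Exception): ...
class DatasetGenerationError(DataDesignerError): ...

try:
    raise DatasetGenerationError("no column configs provided")
except DataDesignerError as err:
    # One handler covers generation, processing, and storage errors alike.
    print(f"build failed: {err}")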
--- a/data_designer/engine/dataset_builders/multi_column_configs.py
+++ /dev/null
@@ -1,46 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- from __future__ import annotations
-
- from abc import ABC
- from typing import TypeAlias
-
- from pydantic import Field, field_validator
-
- from data_designer.config.base import ConfigBase
- from data_designer.config.column_configs import SamplerColumnConfig, SeedDatasetColumnConfig, SingleColumnConfig
- from data_designer.config.column_types import ColumnConfigT, DataDesignerColumnType
- from data_designer.config.sampler_constraints import ColumnConstraintT
- from data_designer.config.seed import SeedConfig
-
-
- class MultiColumnConfig(ConfigBase, ABC):
-     columns: list[SingleColumnConfig] = Field(..., min_length=1)
-
-     @property
-     def column_names(self) -> list[str]:
-         return [c.name for c in self.columns]
-
-     @property
-     def column_type(self) -> DataDesignerColumnType:
-         return self.columns[0].column_type
-
-     @field_validator("columns", mode="after")
-     def validate_column_types_are_the_same(cls, v: list[SingleColumnConfig]) -> list[SingleColumnConfig]:
-         if len(set([c.column_type for c in v])) != 1:
-             raise ValueError("All column types must be of the same type")
-         return v
-
-
- class SamplerMultiColumnConfig(MultiColumnConfig):
-     columns: list[SamplerColumnConfig]
-     constraints: list[ColumnConstraintT] = []
-     max_rejections_factor: int = 5
-
-
- class SeedDatasetMultiColumnConfig(SeedConfig, MultiColumnConfig):
-     columns: list[SeedDatasetColumnConfig]
-
-
- DatasetBuilderColumnConfigT: TypeAlias = ColumnConfigT | SeedDatasetMultiColumnConfig | SamplerMultiColumnConfig
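
The `validate_column_types_are_the_same` validator is the load-bearing rule here: a multi-column config may only group columns of one type. A self-contained sketch of the same check with stand-in models (pydantic v2 API, simplified types):

from pydantic import BaseModel, field_validator

class Col(BaseModel):
    name: str
    column_type: str

class MultiCol(BaseModel):
    columns: list[Col]

    @field_validator("columns", mode="after")
    @classmethod
    def same_type(cls, v: list[Col]) -> list[Col]:
        # Reject any mix of column types within one group.
        if len({c.column_type for c in v}) != 1:
            raise ValueError("All column types must be of the same type")
        return v

MultiCol(columns=[Col(name="a", column_type="sampler"), Col(name="b", column_type="sampler")])  # ok
# MultiCol(columns=[Col(name="a", column_type="sampler"), Col(name="b", column_type="llm-text")])  # raises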
--- a/data_designer/engine/dataset_builders/utils/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
--- a/data_designer/engine/dataset_builders/utils/concurrency.py
+++ /dev/null
@@ -1,212 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- from __future__ import annotations
-
- import contextvars
- import json
- import logging
- from concurrent.futures import Future, ThreadPoolExecutor
- from threading import Lock, Semaphore
- from typing import Any, Protocol
-
- from pydantic import BaseModel, Field
-
- from data_designer.engine.errors import DataDesignerRuntimeError, ErrorTrap
-
- logger = logging.getLogger(__name__)
-
-
- class ExecutorResults(BaseModel):
-     failure_threshold: float = 0.0  # Error rate threshold
-     completed_count: int = 0  # How many tasks/jobs completed
-     success_count: int = 0  # How many tasks/jobs were successful
-     early_shutdown: bool = False  # Did we shutdown early due to errors?
-     error_trap: ErrorTrap = Field(default_factory=ErrorTrap)
-
-     @property
-     def summary(self) -> dict:
-         summary = self.model_dump(exclude={"error_trap"})
-         summary |= self.error_trap.model_dump()
-         return summary
-
-     def get_error_rate(self, window: int) -> float:
-         # We don't start actually tracking until our minimum window size is met
-         if self.completed_count < window:
-             return 0.0
-         return self.error_trap.error_count / max(1, self.completed_count)
-
-     def is_error_rate_exceeded(self, window: int) -> bool:
-         return self.get_error_rate(window) >= self.failure_threshold
-
-
- class CallbackWithContext(Protocol):
-     """Executor callback functions must accept a context kw argument."""
-
-     def __call__(self, result: Any, *, context: dict | None = None) -> Any: ...
-
-
- class ErrorCallbackWithContext(Protocol):
-     """Error callbacks take the Exception instance and context."""
-
-     def __call__(self, exc: Exception, *, context: dict | None = None) -> Any: ...
-
-
- class ConcurrentThreadExecutor:
-     """
-     Interface for executing multiple concurrent tasks with error rate monitoring.
-
-     This interface should be used exclusively as a context manager. New tasks
-     can be submitted to the executor using the `submit` method, which functions
-     similarly to the submit method of a ThreadPoolExecutor.
-
-     The underlying queue of tasks is bounded by the `max_workers` parameter,
-     meaning that only `max_workers` number of tasks can be queued up for
-     execution. As tasks complete, any errors are tracked and counted. If a
-     certain error rate is exceeded, the executor will shut down early. All
-     queued and running tasks will complete.
-
-     The reason we bound the underlying task queue is to ensure that when a
-     certain error threshold is met there aren't an unbounded number of tasks
-     that still need to complete. Generally speaking, tasks should not be
-     sitting in the queue for long at all since the queue size == `max_workers`.
-     The side effect of this is that the `submit()` method will block; however,
-     this should not matter because upstream Tasks need to wait for all jobs to
-     complete before the Task can be considered complete.
-
-     ContextVars from the main parent thread are automatically propagated
-     to all child threads.
-
-     When a task is completed, the user provided `result_callback`
-     function will be called with the task result as the only argument.
-     """
-
-     def __init__(
-         self,
-         *,
-         max_workers: int,
-         column_name: str,
-         result_callback: CallbackWithContext | None = None,
-         error_callback: ErrorCallbackWithContext | None = None,
-         shutdown_error_rate: float = 0.50,
-         shutdown_error_window: int = 10,
-         disable_early_shutdown: bool = False,
-     ):
-         self._executor = None
-         self._column_name = column_name
-         self._max_workers = max_workers
-         self._lock = Lock()
-         self._semaphore = Semaphore(self._max_workers)
-         self._result_callback = result_callback
-         self._error_callback = error_callback
-         self._shutdown_error_rate = shutdown_error_rate
-         self._shutdown_window_size = shutdown_error_window
-         self._disable_early_shutdown = disable_early_shutdown
-         self._results = ExecutorResults(failure_threshold=shutdown_error_rate)
-
-     @property
-     def results(self) -> ExecutorResults:
-         return self._results
-
-     @property
-     def max_workers(self) -> int:
-         return self._max_workers
-
-     @property
-     def shutdown_error_rate(self) -> float:
-         return self._shutdown_error_rate
-
-     @property
-     def shutdown_window_size(self) -> int:
-         return self._shutdown_window_size
-
-     @property
-     def semaphore(self) -> Semaphore:
-         return self._semaphore
-
-     def __enter__(self) -> ConcurrentThreadExecutor:
-         self._executor = ThreadPoolExecutor(
-             max_workers=self._max_workers,
-             thread_name_prefix="ConcurrentThreadExecutor",
-             initializer=_set_worker_contextvars,
-             initargs=(contextvars.copy_context(),),
-         )
-         return self
-
-     def __exit__(self, exc_type, exc_value, traceback):
-         self._shutdown_executor()
-         if not self._disable_early_shutdown and self._results.early_shutdown is True:
-             self._raise_task_error()
-
-     def _shutdown_executor(self) -> None:
-         if self._executor is not None:
-             self._executor.shutdown()
-
-     def _raise_task_error(self):
-         raise DataDesignerRuntimeError(
-             "\n".join(
-                 [
-                     " |-- Data generation was terminated early due to error rate exceeding threshold.",
-                     f" |-- The summary of encountered errors is: \n{json.dumps(self._results.summary, indent=4)}",
-                 ]
-             )
-         )
-
-     def submit(self, fn, *args, context: dict | None = None, **kwargs) -> None:
-         if self._executor is None:
-             raise RuntimeError("Executor is not initialized, this class should be used as a context manager.")
-
-         if not self._disable_early_shutdown and self._results.early_shutdown:
-             self._shutdown_executor()
-             self._raise_task_error()
-
-         def _handle_future(future: Future) -> None:
-             try:
-                 result = future.result()
-                 if self._result_callback is not None:
-                     self._result_callback(result, context=context)
-                 with self._lock:
-                     self._results.completed_count += 1
-                     self._results.success_count += 1
-             except Exception as err:
-                 with self._lock:
-                     self._results.completed_count += 1
-                     self._results.error_trap.handle_error(err)
-                     if not self._disable_early_shutdown and self._results.is_error_rate_exceeded(
-                         self._shutdown_window_size
-                     ):
-                         # Signal to shutdown early on the next submission (if received).
-                         # We cannot trigger shutdown from within this thread as it can
-                         # cause a deadlock.
-                         if not self._results.early_shutdown:
-                             self._results.early_shutdown = True
-                 if self._error_callback is not None:
-                     self._error_callback(err, context=context)
-             finally:
-                 self._semaphore.release()
-
-         try:
-             self._semaphore.acquire()
-             future = self._executor.submit(fn, *args, **kwargs)
-             future.add_done_callback(_handle_future)
-         except Exception as err:
-             # If we get here, the pool is shutting down (likely due to early termination from errors)
-             # We'll re-raise a custom error that can be handled at the call-site and the summary
-             # can also be inspected.
-             self._semaphore.release()
-             is_shutdown_error = isinstance(err, RuntimeError) and (
-                 "after shutdown" in str(err) or "Pool shutdown" in str(err)
-             )
-             if not is_shutdown_error:
-                 raise err
-             if self._disable_early_shutdown:
-                 raise err
-             self._raise_task_error()
-
-
- def _set_worker_contextvars(context: contextvars.Context):
-     for var, value in context.items():
-         var.set(value)
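
The core trick in `ConcurrentThreadExecutor` is bounding the pool's queue with a semaphore so `submit()` blocks once `max_workers` tasks are in flight. A stripped-down, self-contained sketch of that pattern (error-rate tracking and callbacks omitted):

from concurrent.futures import ThreadPoolExecutor
from threading import Semaphore

class BoundedExecutor:
    def __init__(self, max_workers: int) -> None:
        self._pool = ThreadPoolExecutor(max_workers=max_workers)
        self._slots = Semaphore(max_workers)  # queue depth == worker count

    def submit(self, fn, *args) -> None:
        self._slots.acquire()  # blocks the producer when all slots are busy
        future = self._pool.submit(fn, *args)
        # Free the slot as soon as the task finishes, success or failure.
        future.add_done_callback(lambda _: self._slots.release())

    def shutdown(self) -> None:
        self._pool.shutdown()

executor = BoundedExecutor(max_workers=4)
for i in range(10):
    executor.submit(print, f"task {i}")
executor.shutdown()

The bound keeps the backlog small, so when the error threshold trips there are at most `max_workers` in-flight tasks left to drain before the early-shutdown error is raised.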
--- a/data_designer/engine/dataset_builders/utils/config_compiler.py
+++ /dev/null
@@ -1,62 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- from __future__ import annotations
-
- from data_designer.config.column_types import DataDesignerColumnType
- from data_designer.config.data_designer_config import DataDesignerConfig
- from data_designer.config.processors import ProcessorConfig
- from data_designer.engine.dataset_builders.multi_column_configs import (
-     DatasetBuilderColumnConfigT,
-     SamplerMultiColumnConfig,
-     SeedDatasetMultiColumnConfig,
- )
- from data_designer.engine.dataset_builders.utils.dag import topologically_sort_column_configs
- from data_designer.engine.dataset_builders.utils.errors import ConfigCompilationError
-
-
- def compile_dataset_builder_column_configs(config: DataDesignerConfig) -> list[DatasetBuilderColumnConfigT]:
-     seed_column_configs = []
-     sampler_column_configs = []
-     generated_column_configs = []
-
-     for column_config in topologically_sort_column_configs(config.columns):
-         if column_config.column_type == DataDesignerColumnType.SEED_DATASET:
-             seed_column_configs.append(column_config)
-         elif column_config.column_type == DataDesignerColumnType.SAMPLER:
-             sampler_column_configs.append(column_config)
-         else:
-             generated_column_configs.append(column_config)
-
-     compiled_column_configs = []
-
-     if len(seed_column_configs) > 0:
-         if config.seed_config is None:
-             raise ConfigCompilationError("🛑 Seed column configs require a seed configuration.")
-         compiled_column_configs.append(
-             SeedDatasetMultiColumnConfig(
-                 columns=seed_column_configs,
-                 source=config.seed_config.source,
-                 sampling_strategy=config.seed_config.sampling_strategy,
-                 selection_strategy=config.seed_config.selection_strategy,
-             )
-         )
-
-     if len(sampler_column_configs) > 0:
-         compiled_column_configs.append(
-             SamplerMultiColumnConfig(
-                 columns=sampler_column_configs,
-                 constraints=config.constraints or [],
-             )
-         )
-
-     if len(generated_column_configs) > 0:
-         compiled_column_configs.extend(generated_column_configs)
-
-     return compiled_column_configs
-
-
- def compile_dataset_builder_processor_configs(
-     config: DataDesignerConfig,
- ) -> list[ProcessorConfig]:
-     return config.processors or []
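
Net effect of `compile_dataset_builder_column_configs`: after a topological sort, seed columns collapse into one multi-column config, samplers into another, and everything else stays per-column, in that order. A toy illustration of the bucketing (plain strings instead of config objects; names here are invented):

columns = [("city", "sampler"), ("id", "seed_dataset"), ("summary", "llm-text"), ("age", "sampler")]

seed = [name for name, kind in columns if kind == "seed_dataset"]
sampler = [name for name, kind in columns if kind == "sampler"]
generated = [name for name, kind in columns if kind not in ("seed_dataset", "sampler")]

compiled = []
if seed:
    compiled.append(("seed_multi", seed))
if sampler:
    compiled.append(("sampler_multi", sampler))
compiled.extend(("single", [name]) for name in generated)

print(compiled)
# [('seed_multi', ['id']), ('sampler_multi', ['city', 'age']), ('single', ['summary'])]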