data-designer 0.3.3__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data_designer/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
  commit_id: COMMIT_ID
  __commit_id__: COMMIT_ID
 
- __version__ = version = '0.3.3'
- __version_tuple__ = version_tuple = (0, 3, 3)
+ __version__ = version = '0.3.4'
+ __version_tuple__ = version_tuple = (0, 3, 4)
 
  __commit_id__ = commit_id = None
data_designer/config/base.py CHANGED
@@ -18,6 +18,7 @@ class ConfigBase(BaseModel):
  use_enum_values=True,
  arbitrary_types_allowed=True,
  extra="forbid",
+ json_schema_mode_override="validation",
  )
 
 
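The new `json_schema_mode_override="validation"` entry is a pydantic v2 `ConfigDict` option that forces JSON-schema generation to use the named mode regardless of the `mode` argument the caller passes. A minimal standalone sketch of the effect (generic pydantic example, not Data Designer code):

```python
from pydantic import BaseModel, ConfigDict


class Example(BaseModel):
    # Schema generation always uses the "validation" schema,
    # even when mode="serialization" is requested by the caller.
    model_config = ConfigDict(json_schema_mode_override="validation")

    value: int = 0


# With the override in place, both calls yield the same schema.
assert Example.model_json_schema(mode="serialization") == Example.model_json_schema(mode="validation")
```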
data_designer/config/dataset_metadata.py ADDED
@@ -0,0 +1,18 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ from pydantic import BaseModel
+
+
+ class DatasetMetadata(BaseModel):
+ """Metadata about a generated dataset.
+
+ This object is created by the engine and passed to results objects for use
+ in visualization and other client-side utilities. It is designed to be
+ serializable so it can be sent over the wire in a client-server architecture.
+
+ Attributes:
+ seed_column_names: Names of columns from the seed dataset. Empty list if no seed dataset.
+ """
+
+ seed_column_names: list[str] = []
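`DatasetMetadata` is a plain pydantic model, so it round-trips through JSON, which is what the docstring above means by being serializable for a client-server setup. A small usage sketch using the import path shown in later hunks of this diff:

```python
from data_designer.config.dataset_metadata import DatasetMetadata

metadata = DatasetMetadata(seed_column_names=["customer_id", "region"])

# Serialize for transport and reconstruct on the receiving side.
payload = metadata.model_dump_json()
restored = DatasetMetadata.model_validate_json(payload)
assert restored.seed_column_names == ["customer_id", "region"]
```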
data_designer/config/preview_results.py CHANGED
@@ -7,6 +7,7 @@ import pandas as pd
 
  from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
  from data_designer.config.config_builder import DataDesignerConfigBuilder
+ from data_designer.config.dataset_metadata import DatasetMetadata
  from data_designer.config.utils.visualization import WithRecordSamplerMixin
 
 
@@ -15,6 +16,7 @@ class PreviewResults(WithRecordSamplerMixin):
  self,
  *,
  config_builder: DataDesignerConfigBuilder,
+ dataset_metadata: DatasetMetadata,
  dataset: pd.DataFrame | None = None,
  analysis: DatasetProfilerResults | None = None,
  processor_artifacts: dict[str, list[str] | str] | None = None,
@@ -23,6 +25,7 @@ class PreviewResults(WithRecordSamplerMixin):
 
  Args:
  config_builder: Data Designer configuration builder.
+ dataset_metadata: Metadata about the generated dataset (e.g., seed column names).
  dataset: Dataset of the preview run.
  analysis: Analysis of the preview run.
  processor_artifacts: Artifacts generated by the processors.
@@ -30,4 +33,5 @@ class PreviewResults(WithRecordSamplerMixin):
  self.dataset: pd.DataFrame | None = dataset
  self.analysis: DatasetProfilerResults | None = analysis
  self.processor_artifacts: dict[str, list[str] | str] | None = processor_artifacts
+ self.dataset_metadata = dataset_metadata
  self._config_builder = config_builder
data_designer/config/run_config.py CHANGED
@@ -14,21 +14,33 @@ class RunConfig(ConfigBase):
  part of the dataset configuration itself.
 
  Attributes:
- disable_early_shutdown: If True, disables early shutdown entirely. Generation
- will continue regardless of error rate. Default is False.
- shutdown_error_rate: Error rate threshold (0.0-1.0) that triggers early shutdown.
- When early shutdown is disabled, this value is normalized to 1.0. Default is 0.5.
+ disable_early_shutdown: If True, disables the executor's early-shutdown behavior entirely.
+ Generation will continue regardless of error rate, and the early-shutdown exception
+ will never be raised. Error counts and summaries are still collected. Default is False.
+ shutdown_error_rate: Error rate threshold (0.0-1.0) that triggers early shutdown when
+ early shutdown is enabled. Default is 0.5.
  shutdown_error_window: Minimum number of completed tasks before error rate
  monitoring begins. Must be >= 0. Default is 10.
+ buffer_size: Number of records to process in each batch during dataset generation.
+ A batch is processed end-to-end (column generation, post-batch processors, and writing the batch
+ to artifact storage) before moving on to the next batch. Must be > 0. Default is 1000.
+ max_conversation_restarts: Maximum number of full conversation restarts permitted when
+ generation tasks call `ModelFacade.generate(...)`. Must be >= 0. Default is 5.
+ max_conversation_correction_steps: Maximum number of correction rounds permitted within a
+ single conversation when generation tasks call `ModelFacade.generate(...)`. Must be >= 0.
+ Default is 0.
  """
 
  disable_early_shutdown: bool = False
  shutdown_error_rate: float = Field(default=0.5, ge=0.0, le=1.0)
  shutdown_error_window: int = Field(default=10, ge=0)
+ buffer_size: int = Field(default=1000, gt=0)
+ max_conversation_restarts: int = Field(default=5, ge=0)
+ max_conversation_correction_steps: int = Field(default=0, ge=0)
 
  @model_validator(mode="after")
  def normalize_shutdown_settings(self) -> Self:
- """Set shutdown_error_rate to 1.0 when early shutdown is disabled."""
+ """Normalize shutdown settings for compatibility."""
  if self.disable_early_shutdown:
  self.shutdown_error_rate = 1.0
  return self
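These new fields replace runtime knobs that previously lived elsewhere (a `set_buffer_size` method and hard-coded conversation-retry constants, both removed in later hunks of this diff). A minimal sketch of constructing a `RunConfig` with the new fields; the import path follows the `set_run_config` docstring shown further down:

```python
from data_designer.essentials import RunConfig

run_config = RunConfig(
    buffer_size=500,                      # records processed per batch; must be > 0
    max_conversation_restarts=3,          # full conversation restarts per generation task
    max_conversation_correction_steps=1,  # correction rounds within a single conversation
    disable_early_shutdown=True,          # validator above normalizes shutdown_error_rate to 1.0
)

# Field constraints are enforced by pydantic, e.g. a non-positive buffer_size
# raises a ValidationError instead of the old InvalidBufferValueError.
```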
data_designer/config/seed_source.py CHANGED
@@ -6,6 +6,7 @@ from typing import Literal
 
  import pandas as pd
  from pydantic import BaseModel, ConfigDict, Field, field_validator
+ from pydantic.json_schema import SkipJsonSchema
  from typing_extensions import Self
 
  from data_designer.config.utils.io_helpers import (
@@ -68,7 +69,7 @@ class DataFrameSeedSource(SeedSource):
 
  model_config = ConfigDict(arbitrary_types_allowed=True)
 
- df: pd.DataFrame = Field(
+ df: SkipJsonSchema[pd.DataFrame] = Field(
  ...,
  exclude=True,
  description=(
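Wrapping the annotation in `SkipJsonSchema` tells pydantic to leave the `df` field out of generated JSON schemas entirely, which matters because a raw `pd.DataFrame` cannot be represented in JSON schema. A standalone sketch of the mechanism (generic example, not the actual `DataFrameSeedSource` definition):

```python
import pandas as pd
from pydantic import BaseModel, ConfigDict, Field
from pydantic.json_schema import SkipJsonSchema


class Example(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

    name: str = "seed"
    # Skipped during JSON schema generation; a raw DataFrame has no
    # JSON-schema representation.
    df: SkipJsonSchema[pd.DataFrame] = Field(..., exclude=True)


schema = Example.model_json_schema()
assert "df" not in schema["properties"]  # only "name" is described
```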
data_designer/config/utils/visualization.py CHANGED
@@ -31,6 +31,7 @@ from data_designer.config.utils.errors import DatasetSampleDisplayError
 
  if TYPE_CHECKING:
  from data_designer.config.config_builder import DataDesignerConfigBuilder
+ from data_designer.config.dataset_metadata import DatasetMetadata
 
 
  console = Console()
@@ -57,6 +58,7 @@ class ColorPalette(str, Enum):
 
  class WithRecordSamplerMixin:
  _display_cycle_index: int = 0
+ dataset_metadata: DatasetMetadata
 
  @cached_property
  def _record_sampler_dataset(self) -> pd.DataFrame:
@@ -79,22 +81,22 @@ class WithRecordSamplerMixin:
  self,
  index: int | None = None,
  *,
- hide_seed_columns: bool = False,
  syntax_highlighting_theme: str = "dracula",
  background_color: str | None = None,
  processors_to_display: list[str] | None = None,
+ hide_seed_columns: bool = False,
  ) -> None:
  """Display a sample record from the Data Designer dataset preview.
 
  Args:
  index: Index of the record to display. If None, the next record will be displayed.
  This is useful for running the cell in a notebook multiple times.
- hide_seed_columns: If True, the columns from the seed dataset (if any) will not be displayed.
  syntax_highlighting_theme: Theme to use for syntax highlighting. See the `Syntax`
  documentation from `rich` for information about available themes.
  background_color: Background color to use for the record. See the `Syntax`
  documentation from `rich` for information about available background colors.
  processors_to_display: List of processors to display the artifacts for. If None, all processors will be displayed.
+ hide_seed_columns: If True, seed columns will not be displayed separately.
  """
  i = index or self._display_cycle_index
 
@@ -120,14 +122,16 @@ class WithRecordSamplerMixin:
  else:
  processor_data_to_display[processor] = self.processor_artifacts[processor]
 
+ seed_column_names = None if hide_seed_columns else self.dataset_metadata.seed_column_names
+
  display_sample_record(
  record=record,
  processor_data_to_display=processor_data_to_display,
  config_builder=self._config_builder,
  background_color=background_color,
  syntax_highlighting_theme=syntax_highlighting_theme,
- hide_seed_columns=hide_seed_columns,
  record_index=i,
+ seed_column_names=seed_column_names,
  )
  if index is None:
  self._display_cycle_index = (self._display_cycle_index + 1) % num_records
@@ -160,7 +164,7 @@ def display_sample_record(
  background_color: str | None = None,
  syntax_highlighting_theme: str = "dracula",
  record_index: int | None = None,
- hide_seed_columns: bool = False,
+ seed_column_names: list[str] | None = None,
  ):
  if isinstance(record, (dict, pd.Series)):
  record = pd.DataFrame([record]).iloc[0]
@@ -179,14 +183,14 @@
  render_list = []
  table_kws = dict(show_lines=True, expand=True)
 
- seed_columns = config_builder.get_columns_of_type(DataDesignerColumnType.SEED_DATASET)
- if not hide_seed_columns and len(seed_columns) > 0:
+ # Display seed columns if seed_column_names is provided and not empty
+ if seed_column_names:
  table = Table(title="Seed Columns", **table_kws)
  table.add_column("Name")
  table.add_column("Value")
- for col in seed_columns:
- if not col.drop:
- table.add_row(col.name, convert_to_row_element(record[col.name]))
+ for col_name in seed_column_names:
+ if col_name in record.index:
+ table.add_row(col_name, convert_to_row_element(record[col_name]))
  render_list.append(pad_console_element(table))
 
  non_code_columns = (
data_designer/engine/column_generators/generators/llm_completion.py CHANGED
@@ -28,10 +28,6 @@ from data_designer.engine.processing.utils import deserialize_json_values
  logger = logging.getLogger(__name__)
 
 
- DEFAULT_MAX_CONVERSATION_RESTARTS = 5
- DEFAULT_MAX_CONVERSATION_CORRECTION_STEPS = 0
-
-
  class ColumnGeneratorWithModelChatCompletion(ColumnGeneratorWithModel[TaskConfigT]):
  @functools.cached_property
  def response_recipe(self) -> ResponseRecipe:
@@ -39,11 +35,11 @@ class ColumnGeneratorWithModelChatCompletion(ColumnGeneratorWithModel[TaskConfig
 
  @property
  def max_conversation_correction_steps(self) -> int:
- return DEFAULT_MAX_CONVERSATION_CORRECTION_STEPS
+ return self.resource_provider.run_config.max_conversation_correction_steps
 
  @property
  def max_conversation_restarts(self) -> int:
- return DEFAULT_MAX_CONVERSATION_RESTARTS
+ return self.resource_provider.run_config.max_conversation_restarts
 
  @functools.cached_property
  def prompt_renderer(self) -> RecordBasedPromptRenderer:
@@ -129,7 +125,3 @@ class LLMJudgeCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMJudgeColum
  description="Judge a new dataset cell based on a set of rubrics",
  generation_strategy=GenerationStrategy.CELL_BY_CELL,
  )
-
- @property
- def max_conversation_restarts(self) -> int:
- return 2 * DEFAULT_MAX_CONVERSATION_RESTARTS
data_designer/engine/column_generators/generators/validation.py CHANGED
@@ -132,6 +132,7 @@ class ValidationColumnGenerator(ColumnGenerator[ValidationColumnConfig]):
  error_callback=error_callback,
  shutdown_error_rate=settings.shutdown_error_rate,
  shutdown_error_window=settings.shutdown_error_window,
+ disable_early_shutdown=settings.disable_early_shutdown,
  ) as executor:
  for i, batch in enumerate(batched_records):
  executor.submit(lambda batch: self._validate_batch(validator, batch), batch, context={"index": i})
data_designer/engine/dataset_builders/column_wise_builder.py CHANGED
@@ -94,7 +94,6 @@ class ColumnWiseDatasetBuilder:
  self,
  *,
  num_records: int,
- buffer_size: int,
  on_batch_complete: Callable[[Path], None] | None = None,
  ) -> Path:
  self._write_configs()
@@ -104,6 +103,7 @@ class ColumnWiseDatasetBuilder:
  start_time = time.perf_counter()
  group_id = uuid.uuid4().hex
 
+ buffer_size = self._resource_provider.run_config.buffer_size
  self.batch_manager.start(num_records=num_records, buffer_size=buffer_size)
  for batch_idx in range(self.batch_manager.num_batches):
  logger.info(f"⏳ Processing batch {batch_idx + 1} of {self.batch_manager.num_batches}")
@@ -228,6 +228,7 @@ class ColumnWiseDatasetBuilder:
  error_callback=self._worker_error_callback,
  shutdown_error_rate=settings.shutdown_error_rate,
  shutdown_error_window=settings.shutdown_error_window,
+ disable_early_shutdown=settings.disable_early_shutdown,
  ) as executor:
  for i, record in self.batch_manager.iter_current_batch():
  executor.submit(lambda record: generator.generate(record), record, context={"index": i})
data_designer/engine/dataset_builders/utils/concurrency.py CHANGED
@@ -96,6 +96,7 @@ class ConcurrentThreadExecutor:
  error_callback: ErrorCallbackWithContext | None = None,
  shutdown_error_rate: float = 0.50,
  shutdown_error_window: int = 10,
+ disable_early_shutdown: bool = False,
  ):
  self._executor = None
  self._column_name = column_name
@@ -106,6 +107,7 @@ class ConcurrentThreadExecutor:
  self._error_callback = error_callback
  self._shutdown_error_rate = shutdown_error_rate
  self._shutdown_window_size = shutdown_error_window
+ self._disable_early_shutdown = disable_early_shutdown
  self._results = ExecutorResults(failure_threshold=shutdown_error_rate)
 
  @property
@@ -139,7 +141,7 @@ class ConcurrentThreadExecutor:
 
  def __exit__(self, exc_type, exc_value, traceback):
  self._shutdown_executor()
- if self._results.early_shutdown is True:
+ if not self._disable_early_shutdown and self._results.early_shutdown is True:
  self._raise_task_error()
 
  def _shutdown_executor(self) -> None:
@@ -160,7 +162,7 @@ class ConcurrentThreadExecutor:
  if self._executor is None:
  raise RuntimeError("Executor is not initialized, this class should be used as a context manager.")
 
- if self._results.early_shutdown:
+ if not self._disable_early_shutdown and self._results.early_shutdown:
  self._shutdown_executor()
  self._raise_task_error()
 
@@ -176,7 +178,9 @@ class ConcurrentThreadExecutor:
  with self._lock:
  self._results.completed_count += 1
  self._results.error_trap.handle_error(err)
- if self._results.is_error_rate_exceeded(self._shutdown_window_size):
+ if not self._disable_early_shutdown and self._results.is_error_rate_exceeded(
+ self._shutdown_window_size
+ ):
  # Signal to shutdown early on the next submission (if received).
  # We cannot trigger shutdown from within this thread as it can
  # cause a deadlock.
@@ -196,7 +200,12 @@ class ConcurrentThreadExecutor:
  # We'll re-raise a custom error that can be handled at the call-site and the summary
  # can also be inspected.
  self._semaphore.release()
- if not isinstance(err, RuntimeError) and "after shutdown" not in str(err):
+ is_shutdown_error = isinstance(err, RuntimeError) and (
+ "after shutdown" in str(err) or "Pool shutdown" in str(err)
+ )
+ if not is_shutdown_error:
+ raise err
+ if self._disable_early_shutdown:
  raise err
  self._raise_task_error()
 
data_designer/engine/dataset_builders/utils/dataset_batch_manager.py CHANGED
@@ -69,7 +69,7 @@ class DatasetBatchManager:
  def drop_records(self, index: Container[int]) -> None:
  self._buffer = [record for i, record in enumerate(self._buffer) if i not in index]
 
- def finish_batch(self, on_complete: Callable[[Path], None] | None = None) -> Path:
+ def finish_batch(self, on_complete: Callable[[Path], None] | None = None) -> Path | None:
  """Finish the batch by moving the results from the partial results path to the final parquet folder.
 
  Returns:
@@ -78,29 +78,36 @@ class DatasetBatchManager:
  if self._current_batch_number >= self.num_batches:
  raise DatasetBatchManagementError("🛑 All batches have been processed.")
 
- if not self.write():
- raise DatasetBatchManagementError("🛑 Batch finished without any results to write.")
-
- final_file_path = self.artifact_storage.move_partial_result_to_final_file_path(self._current_batch_number)
-
- self.artifact_storage.write_metadata(
- {
- "target_num_records": sum(self.num_records_list),
- "total_num_batches": self.num_batches,
- "buffer_size": self._buffer_size,
- "schema": {field.name: str(field.type) for field in pq.read_schema(final_file_path)},
- "file_paths": [str(f) for f in sorted(self.artifact_storage.final_dataset_path.glob("*.parquet"))],
- "num_records": self.num_records_list[: self._current_batch_number + 1],
- "num_completed_batches": self._current_batch_number + 1,
- "dataset_name": self.artifact_storage.dataset_name,
- }
- )
+ if self.write() is not None:
+ final_file_path = self.artifact_storage.move_partial_result_to_final_file_path(self._current_batch_number)
+
+ self.artifact_storage.write_metadata(
+ {
+ "target_num_records": sum(self.num_records_list),
+ "total_num_batches": self.num_batches,
+ "buffer_size": self._buffer_size,
+ "schema": {field.name: str(field.type) for field in pq.read_schema(final_file_path)},
+ "file_paths": [str(f) for f in sorted(self.artifact_storage.final_dataset_path.glob("*.parquet"))],
+ "num_records": self.num_records_list[: self._current_batch_number + 1],
+ "num_completed_batches": self._current_batch_number + 1,
+ "dataset_name": self.artifact_storage.dataset_name,
+ }
+ )
+
+ if on_complete:
+ on_complete(final_file_path)
+ else:
+ final_file_path = None
+
+ logger.warning(
+ f"⚠️ Batch {self._current_batch_number + 1} finished without any results to write. "
+ "A partial dataset containing the currently available columns has been written to the partial results "
+ f"directory: {self.artifact_storage.partial_results_path}"
+ )
+
  self._current_batch_number += 1
  self._buffer: list[dict] = []
 
- if on_complete:
- on_complete(final_file_path)
-
  return final_file_path
 
  def finish(self) -> None:
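`finish_batch` now returns `None` (instead of raising `DatasetBatchManagementError`) when a batch produced nothing to write, logging a warning instead. A minimal sketch of how calling code might handle the new optional return; the helper function here is illustrative, not part of the package:

```python
from pathlib import Path

from data_designer.engine.dataset_builders.utils.dataset_batch_manager import DatasetBatchManager


def finish_and_report(batch_manager: DatasetBatchManager) -> Path | None:
    # Illustrative helper: finish the current batch and report the outcome.
    final_path = batch_manager.finish_batch(on_complete=lambda p: print(f"Batch written to {p}"))
    if final_path is None:
        # Nothing was written for this batch; the manager has already logged
        # a warning and left partial results in the partial results directory.
        print("Batch finished with no results to write")
    return final_path
```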
data_designer/engine/resources/resource_provider.py CHANGED
@@ -2,6 +2,7 @@
  # SPDX-License-Identifier: Apache-2.0
 
  from data_designer.config.base import ConfigBase
+ from data_designer.config.dataset_metadata import DatasetMetadata
  from data_designer.config.models import ModelConfig
  from data_designer.config.run_config import RunConfig
  from data_designer.config.seed_source import SeedSource
@@ -27,6 +28,17 @@ class ResourceProvider(ConfigBase):
  run_config: RunConfig = RunConfig()
  seed_reader: SeedReader | None = None
 
+ def get_dataset_metadata(self) -> DatasetMetadata:
+ """Get metadata about the dataset being generated.
+
+ Returns:
+ DatasetMetadata with seed column names and other metadata.
+ """
+ seed_column_names = []
+ if self.seed_reader is not None:
+ seed_column_names = self.seed_reader.get_column_names()
+ return DatasetMetadata(seed_column_names=seed_column_names)
+
 
  def create_resource_provider(
  *,
data_designer/interface/data_designer.py CHANGED
@@ -56,15 +56,12 @@ from data_designer.engine.secret_resolver import (
  from data_designer.interface.errors import (
  DataDesignerGenerationError,
  DataDesignerProfilingError,
- InvalidBufferValueError,
  )
  from data_designer.interface.results import DatasetCreationResults
  from data_designer.logging import RandomEmoji
  from data_designer.plugins.plugin import PluginType
  from data_designer.plugins.registry import PluginRegistry
 
- DEFAULT_BUFFER_SIZE = 1000
-
  DEFAULT_SECRET_RESOLVER = CompositeResolver([EnvironmentResolver(), PlaintextResolver()])
 
  DEFAULT_SEED_READERS = [
@@ -112,7 +109,6 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
  ):
  self._secret_resolver = secret_resolver or DEFAULT_SECRET_RESOLVER
  self._artifact_path = Path(artifact_path) if artifact_path is not None else Path.cwd() / "artifacts"
- self._buffer_size = DEFAULT_BUFFER_SIZE
  self._run_config = RunConfig()
  self._managed_assets_path = Path(managed_assets_path or MANAGED_ASSETS_PATH)
  self._model_providers = self._resolve_model_providers(model_providers)
@@ -169,7 +165,7 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
  builder = self._create_dataset_builder(config_builder, resource_provider)
 
  try:
- builder.build(num_records=num_records, buffer_size=self._buffer_size)
+ builder.build(num_records=num_records)
  except Exception as e:
  raise DataDesignerGenerationError(f"🛑 Error generating dataset: {e}")
 
@@ -182,10 +178,13 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
  except Exception as e:
  raise DataDesignerProfilingError(f"🛑 Error profiling dataset: {e}")
 
+ dataset_metadata = resource_provider.get_dataset_metadata()
+
  return DatasetCreationResults(
  artifact_storage=builder.artifact_storage,
  analysis=analysis,
  config_builder=config_builder,
+ dataset_metadata=dataset_metadata,
  )
 
  def preview(
@@ -249,11 +248,15 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
  ):
  logger.info(f"{RandomEmoji.success()} Preview complete!")
 
+ # Create dataset metadata from the resource provider
+ dataset_metadata = resource_provider.get_dataset_metadata()
+
  return PreviewResults(
  dataset=processed_dataset,
  analysis=analysis,
  processor_artifacts=processor_artifacts,
  config_builder=config_builder,
+ dataset_metadata=dataset_metadata,
  )
 
  def validate(self, config_builder: DataDesignerConfigBuilder) -> None:
@@ -300,34 +303,22 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
  """
  return self._secret_resolver
 
- def set_buffer_size(self, buffer_size: int) -> None:
- """Set the buffer size for dataset generation.
-
- The buffer size controls how many records are processed in memory at once
- during dataset generation using the `create` method. The default value is
- set to the constant `DEFAULT_BUFFER_SIZE` defined in the data_designer module.
-
- Args:
- buffer_size: Number of records to process in each buffer.
-
- Raises:
- InvalidBufferValueError: If buffer size is less than or equal to 0.
- """
- if buffer_size <= 0:
- raise InvalidBufferValueError("Buffer size must be greater than 0.")
- self._buffer_size = buffer_size
-
  def set_run_config(self, run_config: RunConfig) -> None:
  """Set the runtime configuration for dataset generation.
 
  Args:
  run_config: A RunConfig instance containing runtime settings such as
- early shutdown behavior. Import RunConfig from data_designer.essentials.
+ early shutdown behavior and batch sizing via `buffer_size`. Import RunConfig from
+ data_designer.essentials.
 
  Example:
  >>> from data_designer.essentials import DataDesigner, RunConfig
  >>> dd = DataDesigner()
  >>> dd.set_run_config(RunConfig(disable_early_shutdown=True))
+
+ Notes:
+ When `disable_early_shutdown=True`, DataDesigner will never terminate generation early
+ due to error-rate thresholds. Errors are still tracked for reporting.
  """
  self._run_config = run_config
 
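The removal of `set_buffer_size` above means the buffer size is now configured through `RunConfig` rather than on the `DataDesigner` instance. A short migration sketch based on the `set_run_config` docstring in this hunk:

```python
from data_designer.essentials import DataDesigner, RunConfig

dd = DataDesigner()

# 0.3.3: dd.set_buffer_size(500)  # method removed in 0.3.4
# 0.3.4: buffer size travels with the rest of the runtime settings.
dd.set_run_config(RunConfig(buffer_size=500, disable_early_shutdown=True))
```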
data_designer/interface/results.py CHANGED
@@ -9,6 +9,7 @@ import pandas as pd
 
  from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
  from data_designer.config.config_builder import DataDesignerConfigBuilder
+ from data_designer.config.dataset_metadata import DatasetMetadata
  from data_designer.config.utils.visualization import WithRecordSamplerMixin
  from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage
  from data_designer.engine.dataset_builders.errors import ArtifactStorageError
@@ -28,6 +29,7 @@ class DatasetCreationResults(WithRecordSamplerMixin):
  artifact_storage: ArtifactStorage,
  analysis: DatasetProfilerResults,
  config_builder: DataDesignerConfigBuilder,
+ dataset_metadata: DatasetMetadata,
  ):
  """Creates a new instance with results based on a dataset creation run.
 
@@ -35,10 +37,12 @@ class DatasetCreationResults(WithRecordSamplerMixin):
  artifact_storage: Storage manager for accessing generated artifacts.
  analysis: Profiling results for the generated dataset.
  config_builder: Configuration builder used to create the dataset.
+ dataset_metadata: Metadata about the generated dataset (e.g., seed column names).
  """
  self.artifact_storage = artifact_storage
  self._analysis = analysis
  self._config_builder = config_builder
+ self.dataset_metadata = dataset_metadata
 
  def load_analysis(self) -> DatasetProfilerResults:
  """Load the profiling analysis results for the generated dataset.
data_designer-0.3.3.dist-info/METADATA → data_designer-0.3.4.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: data-designer
- Version: 0.3.3
+ Version: 0.3.4
  Summary: General framework for synthetic data generation
  License-Expression: Apache-2.0
  License-File: LICENSE
@@ -193,6 +193,14 @@ The value `openai/gpt-oss-20b` would be collected.
 
  To disable telemetry capture, set `NEMO_TELEMETRY_ENABLED=false`.
 
+ ### Top Models
+
+ This chart represents the breakdown of models used for Data Designer across all synthetic data generation jobs from 12/18/2025 to 1/14/2026.
+
+ ![Top models used for synthetic data generation](docs/images/top-models.png)
+
+ _Last updated on 1/14/2026_
+
  ---
 
  ## License
data_designer-0.3.3.dist-info/RECORD → data_designer-0.3.4.dist-info/RECORD
@@ -1,5 +1,5 @@
  data_designer/__init__.py,sha256=xYZCBRleSswcNCARfHQzFy_Edag3Nmmwfa6A5C1d5B8,466
- data_designer/_version.py,sha256=lemL_4Kl75FgrO6lVuFrrtw6-Dcf9wtXBalKkXuzkO4,704
+ data_designer/_version.py,sha256=3nDaC5e0d_scBB1bUEKPlItbvbY0PmXNNyyOTNFNWNI,704
  data_designer/errors.py,sha256=BjnxDxwtTGscY3lZbi1RQ666j5PseoxNJRobeMXAJiI,184
  data_designer/logging.py,sha256=2ToJzPPb6zF-QR-yQnrLVcBiBGL_QVNsnCHSQC0nPpQ,5346
  data_designer/plugin_manager.py,sha256=C2ZkZiXlcMRiaxfrrho5Shz6DKdExVeBha7ch-d4CnU,2695
@@ -34,24 +34,25 @@ data_designer/cli/services/download_service.py,sha256=9ocQdHZW6VYHqM3nyHn_7dP1z1
  data_designer/cli/services/model_service.py,sha256=rcsozdIRXMElskUHtmWGj6pZbVFf2JzHHv8rTnXv8xw,3890
  data_designer/cli/services/provider_service.py,sha256=ru0-AJVZLr8wk5kOV2z2FgPpF-iK8vWQs0qtrMKp-dg,3921
  data_designer/config/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
- data_designer/config/base.py,sha256=DtsUGIwTny57ASN3ffIfvhjrWt1PGhGuRDoqRmRx8Qo,2401
+ data_designer/config/base.py,sha256=IGj6sy_GnKzC94uu2rdxe12EqR_AmGJ6O3rl2MxOv6g,2449
  data_designer/config/column_configs.py,sha256=Z3V8BKMkgse91MGlEcyJBa-lV25_j81SZ-vZNOoUKXc,18966
  data_designer/config/column_types.py,sha256=qsy04QslsnPEOoStN3sJIoSBeSSTxpNg5VLVmew_YLQ,5753
  data_designer/config/config_builder.py,sha256=vuPibkodbJxbCXdaI1tt1Uyo1SVCnAOfLBAW1AmhajI,24707
  data_designer/config/data_designer_config.py,sha256=qOojviug05vHR2S4800sjd4OmxhSVi6kB8SAFXLlPog,1891
  data_designer/config/dataset_builders.py,sha256=4NSEEqXzgSK8IDXoUSVRSUTcEe-ocKS-iEzyzKkNSJ0,332
+ data_designer/config/dataset_metadata.py,sha256=UTlEgnHWgjwPuc7bP95T7gaKmcr7pIhFMy9vvbUwMV4,647
  data_designer/config/default_model_settings.py,sha256=3iUr10JvSTMDHwlEdSbLE_y90czbGOs_21La3V9fXoM,4462
  data_designer/config/errors.py,sha256=g64yn9l7lTbcXI3DPdC_3utvm994IXGDhoQRiNON6T0,524
  data_designer/config/exports.py,sha256=y23KqhwAf4DIarfvqgiuqyK2Fs1zv9cTDcuQ9SBX54o,4720
  data_designer/config/interface.py,sha256=RRC5JHl6wQ9Icg1IQWOM6t8VkBk6c1BREkka3cEAi8c,1624
  data_designer/config/models.py,sha256=A8D0qj6L9ndYvScvNWGuoOLe0zjYtoAm0JkwaG3qjRg,15335
- data_designer/config/preview_results.py,sha256=fhmxRLbSifdMO_pLrvmGaDJHtNIupX8X3HvXhjHr3hM,1350
+ data_designer/config/preview_results.py,sha256=y-zEnRxZt_72XuAFs_e_peK4bVJ_fDSi9b-jLxsQ-S0,1602
  data_designer/config/processors.py,sha256=CoLUN1AuEBUPigmDT8vCWotvOcFgnC8CFk8VLvNSjxw,5992
- data_designer/config/run_config.py,sha256=M6m3oc3e1CTgStfkkgHyIt78JYrrMKdGSJUR07elmp0,1418
+ data_designer/config/run_config.py,sha256=yakCcWyT78kLXGZnFVK35pdrkPpWT7AVEp_sHyq67fg,2393
  data_designer/config/sampler_constraints.py,sha256=XBPxm81J0u2q3WVsRaP2CoIE_0ssY14xc4Nt5r0oyuM,1161
  data_designer/config/sampler_params.py,sha256=FSMvmdj-9Hiyap1_n-AM_ZDT-sTovox7LK4TO4L2UUc,27843
  data_designer/config/seed.py,sha256=iUFByadE6GTG9HCcp0nEWm7wTDNaAOuYtGm2Ov0tiaE,4632
- data_designer/config/seed_source.py,sha256=zOnLAOyJ_nfFA5sI6DNzDf5IGWiYEjDgHYOzo_5O5Zw,2449
+ data_designer/config/seed_source.py,sha256=GYgK9f0_a0FuOtvXlrLEw0MmAELucTB1Qrc_sGGoziw,2513
  data_designer/config/seed_source_types.py,sha256=4EJ4IsYkfkicYqh0CuEvyI3H541XbtV_ffClJV2-zCQ,679
  data_designer/config/validator_params.py,sha256=9w9M7Z1rcZOpw-BUBhjaVfA8ykNP5iPvdyJOHmdugEg,3911
  data_designer/config/analysis/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
@@ -68,7 +69,7 @@ data_designer/config/utils/io_helpers.py,sha256=_14cfoMUIFqiUPBTu5u-BomarqmP6_VB
  data_designer/config/utils/misc.py,sha256=cRofbhUulLKjV7j_7M5aNNNyIewyyhXLkj00GaZf9uo,2472
  data_designer/config/utils/numerical_helpers.py,sha256=BadOPY1AR2ZVKFAKORpKT_Corc1SZR7W1of6FXYIMY4,802
  data_designer/config/utils/type_helpers.py,sha256=2WSGYWTDktOCa9FGNv2IPXIHQHJc7-fimAB1JqIJVx0,4023
- data_designer/config/utils/visualization.py,sha256=TbYgHAvxXEmJZH-QZYyOkf28E52rW80ZirpJT2PMeK8,18191
+ data_designer/config/utils/visualization.py,sha256=2ZloKN1UimKvIXyKJBSHZ56-JItE5MzMXY31J9x4hbY,18366
  data_designer/engine/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
  data_designer/engine/compiler.py,sha256=tzZF5rk_AAid0yRwwsR86-xvW-5GdJskN1RPfvuFq-k,2853
  data_designer/engine/configurable_task.py,sha256=hdXyfekQ9dHSc_NQG6nZIxKCqWSWKnqA5nje7Uobf6k,2525
@@ -90,23 +91,23 @@ data_designer/engine/column_generators/generators/__init__.py,sha256=XLO09Ei8g0l
  data_designer/engine/column_generators/generators/base.py,sha256=T3ccXIyKXwYN4Kes6HcTBdELK-wOs9cFCoaTH3C3dyQ,3814
  data_designer/engine/column_generators/generators/embedding.py,sha256=pdRMzb95CKmBpOiTQrAEiKngBFvAlt5g8HwI8EwWBIY,1565
  data_designer/engine/column_generators/generators/expression.py,sha256=irBDhTsFyZaWU2EIqy5xOKRBxX-x8W8q5pQ69P7NKTs,2543
- data_designer/engine/column_generators/generators/llm_completion.py,sha256=Dp4F1wxZAMwUglRqQHE5lPXDNyPALKWsMr1IJRIvbJI,5031
+ data_designer/engine/column_generators/generators/llm_completion.py,sha256=fwL6Xbfg8d9SnZgv2-j3g_6S38xzuwnVY4R4UabvdIw,4881
  data_designer/engine/column_generators/generators/samplers.py,sha256=0bvJhVK2LfH9aRY1BxqWCjA7LJxy1B63gGmZuWK8auU,3486
  data_designer/engine/column_generators/generators/seed_dataset.py,sha256=IskfOQkRMRTfu8tiYb426LMchBYKNQs0uSo9E5y0bwg,6905
- data_designer/engine/column_generators/generators/validation.py,sha256=hDqE_xZfJ2XGVE89ifDUvsbtZfzQmiNGRBTY733dXgY,6776
+ data_designer/engine/column_generators/generators/validation.py,sha256=0gw0Wzq4yVqejWewt2uznQ4UJI7_0MMEdvMenH-KwYc,6844
  data_designer/engine/column_generators/utils/errors.py,sha256=Nemo7fxg9BpTOf0kdlxxdtXZMfe_ksrRfzWg2E5sFX0,370
  data_designer/engine/column_generators/utils/generator_classification.py,sha256=1fvMX7lQzY3A1s-V3CtS-W6-zwmby553_Oe9K-tdZKE,1922
  data_designer/engine/column_generators/utils/judge_score_factory.py,sha256=8l0g-L_O6esmAEf1rJh7o2IASZnLqZ_KDlGaLCMYMK0,2105
  data_designer/engine/column_generators/utils/prompt_renderer.py,sha256=xXnzJiR60zTmMDorR_wfTleNMJsLKA5gbL4WOjQ-wYg,4765
  data_designer/engine/dataset_builders/artifact_storage.py,sha256=fYBC569tXVpn7UURcuXfHPhEvvwOHnMxAkA1iQAB-j4,8425
- data_designer/engine/dataset_builders/column_wise_builder.py,sha256=y068Eza-MabPviEStsFt14Kl4p8b9o1Exe4-kje4vEs,15197
+ data_designer/engine/dataset_builders/column_wise_builder.py,sha256=DUAMEmy7xCSFDYy-WU-ZXflKzy5oJ6yqMESQSb7CoUc,15308
  data_designer/engine/dataset_builders/errors.py,sha256=ov9cTRvLtLieIFkUGZdk1n_iabdc904ZATwgT5u9uzY,364
  data_designer/engine/dataset_builders/multi_column_configs.py,sha256=bxRildX3SfzSFKv_rqwwsUeFn_RoyhfFneSj4qBi-Q4,1624
  data_designer/engine/dataset_builders/utils/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
- data_designer/engine/dataset_builders/utils/concurrency.py,sha256=SQlT9Tu3UgATllm_fwle4kkro09NQBVU_nLSZj7lBDQ,7893
+ data_designer/engine/dataset_builders/utils/concurrency.py,sha256=Q0ro9UY-3-FFzfi3MZ29nMTSiDZgg1Um6y_HQFztDhk,8338
  data_designer/engine/dataset_builders/utils/config_compiler.py,sha256=iAbaLiDNBPyjZwSVK1a83KFKln9LACjKdVSpIiZRemw,2405
  data_designer/engine/dataset_builders/utils/dag.py,sha256=L3-sla2s8oqcy3V-WSxXAWZVakkmb3cvmTh5L6phC1M,2474
- data_designer/engine/dataset_builders/utils/dataset_batch_manager.py,sha256=c-6ER96oP-0oZ6CncN8sjXy1S1ZjzTreeYBr1oOX8Yg,7774
+ data_designer/engine/dataset_builders/utils/dataset_batch_manager.py,sha256=DpEW_zrHv7CPiVFw07r_0Q9V6dFaTqOFJ1uWnuicxHo,8142
  data_designer/engine/dataset_builders/utils/errors.py,sha256=6hstnyjYZ8pU69qK3OrUvlyeifqRJC2m399GNVcih3I,375
  data_designer/engine/models/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
  data_designer/engine/models/errors.py,sha256=HjPF3ERKU7veS7s189_0VBBMpePl6pSmOqaYhOO8xv4,12179
@@ -145,7 +146,7 @@ data_designer/engine/registry/errors.py,sha256=PNWsfU2baNmzshcHah1-P7YT9OqxfxYf1
  data_designer/engine/resources/managed_dataset_generator.py,sha256=mJAR-dc0vEfiw3xN7ABIOalxGIbQmfZ7P23U-vUut40,1374
  data_designer/engine/resources/managed_dataset_repository.py,sha256=zxxP9SDdu0PrP28ozJmIwdlbHHuGkcPRmb1Ua5VOsG0,7552
  data_designer/engine/resources/managed_storage.py,sha256=nTfp14o-BbtDP6XUxD3564g0cXB6B1og-1rZI9dx1H0,2084
- data_designer/engine/resources/resource_provider.py,sha256=9IwJ5qQPvoQwXe_wIkp0enNpgKRRu691BQfxsnZic5w,2323
+ data_designer/engine/resources/resource_provider.py,sha256=pMh0_j8JCZ54x4zCPtPIb9s5AGwSBswQsYFddxsgM_U,2813
  data_designer/engine/resources/seed_reader.py,sha256=Gh524gMWh6Lz43bielQG3nmMz5MrZy1BTGs792AhMpc,5591
  data_designer/engine/sampling_gen/column.py,sha256=lqdMrUebIFJ_C_Laye0TuIkEd_fVVPP5b-zoNGez25Q,3951
  data_designer/engine/sampling_gen/constraints.py,sha256=HtLxW_VYXlo_A8IMlZkgTb1fBwyrRHuB3LZnF3AgV-I,3009
@@ -176,9 +177,9 @@ data_designer/engine/validators/remote.py,sha256=Qviad8vF7SBtHRvNqKxZyiLJuGJp0rm
  data_designer/engine/validators/sql.py,sha256=Y2FkM_JmcWzW98qjZruq73vt8RgjkLXuTbyLPGrA2l8,2255
  data_designer/essentials/__init__.py,sha256=wt84P4qu1-OdIfrZmdOtRtiBoGcVSdTEdkIiUaztPWs,1099
  data_designer/interface/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
- data_designer/interface/data_designer.py,sha256=AJ-1ntSeAkDcmuyI0-erq8xrxdu6C5m47h0k8p2IxFQ,17409
+ data_designer/interface/data_designer.py,sha256=OpjWdfjwhp1wzeArdicpCzmaaVCZydSGarQDpvh3egU,17105
  data_designer/interface/errors.py,sha256=CoH-6V95RDNHNONxKs8MWina-cD6NtaerMW_cyH8WRU,570
- data_designer/interface/results.py,sha256=x6bzKaG9ZaDU_UmImNgjULn131jTektMzWX5B3VZXpE,3516
+ data_designer/interface/results.py,sha256=aiRWSK2QVAKTGboqn_Fods7dc5lbG5cmDs6O_INbH-0,3768
  data_designer/plugins/__init__.py,sha256=GmeQ6bzFMNkLthrmLKQ5bcUMPw_W2K4d6oWKvYA8eNw,239
  data_designer/plugins/errors.py,sha256=EzKPo0rEiTe872sIAvF0_cDjFfl_P2MyDtESZL1P3ug,350
  data_designer/plugins/plugin.py,sha256=TmdllvWXOKZZNwJTW_rUMD1M68pAh_IEIILZei7fHqU,5468
@@ -186,8 +187,8 @@ data_designer/plugins/registry.py,sha256=1vNlmDyFMCIY8D_z1RbnWMfKnHZ02g8sAvJg1nE
  data_designer/plugins/testing/__init__.py,sha256=lSuWzt1AVTJg4gV9wh6BUU1Az7IjQ-9FgADdUAtm_qQ,260
  data_designer/plugins/testing/stubs.py,sha256=E8bovwU4zpNaJM4b1i4biCDXyqvthe8vmsrZ9M0M9vM,4277
  data_designer/plugins/testing/utils.py,sha256=OLoQzW8-qbA-91wTUWtwNHrMDyGz1-ma0f6a-3NBUNI,937
- data_designer-0.3.3.dist-info/METADATA,sha256=zJ5wtn9YBRb9qEXBHT3_93rZxo7RT5P6JhSsAdILimQ,7648
- data_designer-0.3.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- data_designer-0.3.3.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
- data_designer-0.3.3.dist-info/licenses/LICENSE,sha256=cSWJDwVqHyQgly8Zmt3pqXJ2eQbZVYwN9qd0NMssxXY,11336
- data_designer-0.3.3.dist-info/RECORD,,
+ data_designer-0.3.4.dist-info/METADATA,sha256=10B6euF52J8L-MFg1_I_3lRQ3rK3Q5x_2V0iZ_vTtBQ,7914
+ data_designer-0.3.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ data_designer-0.3.4.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
+ data_designer-0.3.4.dist-info/licenses/LICENSE,sha256=cSWJDwVqHyQgly8Zmt3pqXJ2eQbZVYwN9qd0NMssxXY,11336
+ data_designer-0.3.4.dist-info/RECORD,,