data-designer 0.3.3__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data_designer/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
  commit_id: COMMIT_ID
  __commit_id__: COMMIT_ID
 
- __version__ = version = '0.3.3'
- __version_tuple__ = version_tuple = (0, 3, 3)
+ __version__ = version = '0.3.4'
+ __version_tuple__ = version_tuple = (0, 3, 4)
 
  __commit_id__ = commit_id = None
data_designer/config/base.py CHANGED
@@ -18,6 +18,7 @@ class ConfigBase(BaseModel):
  use_enum_values=True,
  arbitrary_types_allowed=True,
  extra="forbid",
+ json_schema_mode_override="validation",
  )
 
 
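The new `json_schema_mode_override="validation"` entry is a pydantic v2 `ConfigDict` option that forces JSON-schema generation to use the named mode regardless of the `mode` argument the caller passes. A minimal standalone sketch of the effect (generic pydantic example, not Data Designer code):

```python
from pydantic import BaseModel, ConfigDict


class Example(BaseModel):
    # Schema generation always uses the "validation" schema,
    # even when mode="serialization" is requested by the caller.
    model_config = ConfigDict(json_schema_mode_override="validation")

    value: int = 0


# With the override in place, both calls yield the same schema.
assert Example.model_json_schema(mode="serialization") == Example.model_json_schema(mode="validation")
```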
data_designer/config/dataset_metadata.py ADDED
@@ -0,0 +1,18 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+
+ from pydantic import BaseModel
+
+
+ class DatasetMetadata(BaseModel):
+ """Metadata about a generated dataset.
+
+ This object is created by the engine and passed to results objects for use
+ in visualization and other client-side utilities. It is designed to be
+ serializable so it can be sent over the wire in a client-server architecture.
+
+ Attributes:
+ seed_column_names: Names of columns from the seed dataset. Empty list if no seed dataset.
+ """
+
+ seed_column_names: list[str] = []
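`DatasetMetadata` is a plain pydantic model, so it round-trips through JSON, which is what the docstring above means by being serializable for a client-server setup. A small usage sketch using the import path shown in later hunks of this diff:

```python
from data_designer.config.dataset_metadata import DatasetMetadata

metadata = DatasetMetadata(seed_column_names=["customer_id", "region"])

# Serialize for transport and reconstruct on the receiving side.
payload = metadata.model_dump_json()
restored = DatasetMetadata.model_validate_json(payload)
assert restored.seed_column_names == ["customer_id", "region"]
```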
data_designer/config/preview_results.py CHANGED
@@ -7,6 +7,7 @@ import pandas as pd
 
  from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
  from data_designer.config.config_builder import DataDesignerConfigBuilder
+ from data_designer.config.dataset_metadata import DatasetMetadata
  from data_designer.config.utils.visualization import WithRecordSamplerMixin
 
 
@@ -15,6 +16,7 @@ class PreviewResults(WithRecordSamplerMixin):
  self,
  *,
  config_builder: DataDesignerConfigBuilder,
+ dataset_metadata: DatasetMetadata,
  dataset: pd.DataFrame | None = None,
  analysis: DatasetProfilerResults | None = None,
  processor_artifacts: dict[str, list[str] | str] | None = None,
@@ -23,6 +25,7 @@ class PreviewResults(WithRecordSamplerMixin):
 
  Args:
  config_builder: Data Designer configuration builder.
+ dataset_metadata: Metadata about the generated dataset (e.g., seed column names).
  dataset: Dataset of the preview run.
  analysis: Analysis of the preview run.
  processor_artifacts: Artifacts generated by the processors.
@@ -30,4 +33,5 @@ class PreviewResults(WithRecordSamplerMixin):
  self.dataset: pd.DataFrame | None = dataset
  self.analysis: DatasetProfilerResults | None = analysis
  self.processor_artifacts: dict[str, list[str] | str] | None = processor_artifacts
+ self.dataset_metadata = dataset_metadata
  self._config_builder = config_builder
data_designer/config/run_config.py CHANGED
@@ -14,21 +14,33 @@ class RunConfig(ConfigBase):
  part of the dataset configuration itself.
 
  Attributes:
- disable_early_shutdown: If True, disables early shutdown entirely. Generation
- will continue regardless of error rate. Default is False.
- shutdown_error_rate: Error rate threshold (0.0-1.0) that triggers early shutdown.
- When early shutdown is disabled, this value is normalized to 1.0. Default is 0.5.
+ disable_early_shutdown: If True, disables the executor's early-shutdown behavior entirely.
+ Generation will continue regardless of error rate, and the early-shutdown exception
+ will never be raised. Error counts and summaries are still collected. Default is False.
+ shutdown_error_rate: Error rate threshold (0.0-1.0) that triggers early shutdown when
+ early shutdown is enabled. Default is 0.5.
  shutdown_error_window: Minimum number of completed tasks before error rate
  monitoring begins. Must be >= 0. Default is 10.
+ buffer_size: Number of records to process in each batch during dataset generation.
+ A batch is processed end-to-end (column generation, post-batch processors, and writing the batch
+ to artifact storage) before moving on to the next batch. Must be > 0. Default is 1000.
+ max_conversation_restarts: Maximum number of full conversation restarts permitted when
+ generation tasks call `ModelFacade.generate(...)`. Must be >= 0. Default is 5.
+ max_conversation_correction_steps: Maximum number of correction rounds permitted within a
+ single conversation when generation tasks call `ModelFacade.generate(...)`. Must be >= 0.
+ Default is 0.
  """
 
  disable_early_shutdown: bool = False
  shutdown_error_rate: float = Field(default=0.5, ge=0.0, le=1.0)
  shutdown_error_window: int = Field(default=10, ge=0)
+ buffer_size: int = Field(default=1000, gt=0)
+ max_conversation_restarts: int = Field(default=5, ge=0)
+ max_conversation_correction_steps: int = Field(default=0, ge=0)
 
  @model_validator(mode="after")
  def normalize_shutdown_settings(self) -> Self:
- """Set shutdown_error_rate to 1.0 when early shutdown is disabled."""
+ """Normalize shutdown settings for compatibility."""
  if self.disable_early_shutdown:
  self.shutdown_error_rate = 1.0
  return self
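These new fields replace runtime knobs that previously lived elsewhere (a `set_buffer_size` method and hard-coded conversation-retry constants, both removed in later hunks of this diff). A minimal sketch of constructing a `RunConfig` with the new fields; the import path follows the `set_run_config` docstring shown further down:

```python
from data_designer.essentials import RunConfig

run_config = RunConfig(
    buffer_size=500,                      # records processed per batch; must be > 0
    max_conversation_restarts=3,          # full conversation restarts per generation task
    max_conversation_correction_steps=1,  # correction rounds within a single conversation
    disable_early_shutdown=True,          # validator above normalizes shutdown_error_rate to 1.0
)

# Field constraints are enforced by pydantic, e.g. a non-positive buffer_size
# raises a ValidationError instead of the old InvalidBufferValueError.
```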
data_designer/config/seed_source.py CHANGED
@@ -6,6 +6,7 @@ from typing import Literal
 
  import pandas as pd
  from pydantic import BaseModel, ConfigDict, Field, field_validator
+ from pydantic.json_schema import SkipJsonSchema
  from typing_extensions import Self
 
  from data_designer.config.utils.io_helpers import (
@@ -68,7 +69,7 @@ class DataFrameSeedSource(SeedSource):
 
  model_config = ConfigDict(arbitrary_types_allowed=True)
 
- df: pd.DataFrame = Field(
+ df: SkipJsonSchema[pd.DataFrame] = Field(
  ...,
  exclude=True,
  description=(
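Wrapping the annotation in `SkipJsonSchema` tells pydantic to leave the `df` field out of generated JSON schemas entirely, which matters because a raw `pd.DataFrame` cannot be represented in JSON schema. A standalone sketch of the mechanism (generic example, not the actual `DataFrameSeedSource` definition):

```python
import pandas as pd
from pydantic import BaseModel, ConfigDict, Field
from pydantic.json_schema import SkipJsonSchema


class Example(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

    name: str = "seed"
    # Skipped during JSON schema generation; a raw DataFrame has no
    # JSON-schema representation.
    df: SkipJsonSchema[pd.DataFrame] = Field(..., exclude=True)


schema = Example.model_json_schema()
assert "df" not in schema["properties"]  # only "name" is described
```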
data_designer/config/utils/visualization.py CHANGED
@@ -31,6 +31,7 @@ from data_designer.config.utils.errors import DatasetSampleDisplayError
 
  if TYPE_CHECKING:
  from data_designer.config.config_builder import DataDesignerConfigBuilder
+ from data_designer.config.dataset_metadata import DatasetMetadata
 
 
  console = Console()
@@ -57,6 +58,7 @@ class ColorPalette(str, Enum):
 
  class WithRecordSamplerMixin:
  _display_cycle_index: int = 0
+ dataset_metadata: DatasetMetadata
 
  @cached_property
  def _record_sampler_dataset(self) -> pd.DataFrame:
@@ -79,22 +81,22 @@ class WithRecordSamplerMixin:
  self,
  index: int | None = None,
  *,
- hide_seed_columns: bool = False,
  syntax_highlighting_theme: str = "dracula",
  background_color: str | None = None,
  processors_to_display: list[str] | None = None,
+ hide_seed_columns: bool = False,
  ) -> None:
  """Display a sample record from the Data Designer dataset preview.
 
  Args:
  index: Index of the record to display. If None, the next record will be displayed.
  This is useful for running the cell in a notebook multiple times.
- hide_seed_columns: If True, the columns from the seed dataset (if any) will not be displayed.
  syntax_highlighting_theme: Theme to use for syntax highlighting. See the `Syntax`
  documentation from `rich` for information about available themes.
  background_color: Background color to use for the record. See the `Syntax`
  documentation from `rich` for information about available background colors.
  processors_to_display: List of processors to display the artifacts for. If None, all processors will be displayed.
+ hide_seed_columns: If True, seed columns will not be displayed separately.
  """
  i = index or self._display_cycle_index
 
@@ -120,14 +122,16 @@ class WithRecordSamplerMixin:
  else:
  processor_data_to_display[processor] = self.processor_artifacts[processor]
 
+ seed_column_names = None if hide_seed_columns else self.dataset_metadata.seed_column_names
+
  display_sample_record(
  record=record,
  processor_data_to_display=processor_data_to_display,
  config_builder=self._config_builder,
  background_color=background_color,
  syntax_highlighting_theme=syntax_highlighting_theme,
- hide_seed_columns=hide_seed_columns,
  record_index=i,
+ seed_column_names=seed_column_names,
  )
  if index is None:
  self._display_cycle_index = (self._display_cycle_index + 1) % num_records
@@ -160,7 +164,7 @@ def display_sample_record(
  background_color: str | None = None,
  syntax_highlighting_theme: str = "dracula",
  record_index: int | None = None,
- hide_seed_columns: bool = False,
+ seed_column_names: list[str] | None = None,
  ):
  if isinstance(record, (dict, pd.Series)):
  record = pd.DataFrame([record]).iloc[0]
@@ -179,14 +183,14 @@
  render_list = []
  table_kws = dict(show_lines=True, expand=True)
 
- seed_columns = config_builder.get_columns_of_type(DataDesignerColumnType.SEED_DATASET)
- if not hide_seed_columns and len(seed_columns) > 0:
+ # Display seed columns if seed_column_names is provided and not empty
+ if seed_column_names:
  table = Table(title="Seed Columns", **table_kws)
  table.add_column("Name")
  table.add_column("Value")
- for col in seed_columns:
- if not col.drop:
- table.add_row(col.name, convert_to_row_element(record[col.name]))
+ for col_name in seed_column_names:
+ if col_name in record.index:
+ table.add_row(col_name, convert_to_row_element(record[col_name]))
  render_list.append(pad_console_element(table))
 
  non_code_columns = (
data_designer/engine/column_generators/generators/llm_completion.py CHANGED
@@ -28,10 +28,6 @@ from data_designer.engine.processing.utils import deserialize_json_values
  logger = logging.getLogger(__name__)
 
 
- DEFAULT_MAX_CONVERSATION_RESTARTS = 5
- DEFAULT_MAX_CONVERSATION_CORRECTION_STEPS = 0
-
-
  class ColumnGeneratorWithModelChatCompletion(ColumnGeneratorWithModel[TaskConfigT]):
  @functools.cached_property
  def response_recipe(self) -> ResponseRecipe:
@@ -39,11 +35,11 @@ class ColumnGeneratorWithModelChatCompletion(ColumnGeneratorWithModel[TaskConfig
 
  @property
  def max_conversation_correction_steps(self) -> int:
- return DEFAULT_MAX_CONVERSATION_CORRECTION_STEPS
+ return self.resource_provider.run_config.max_conversation_correction_steps
 
  @property
  def max_conversation_restarts(self) -> int:
- return DEFAULT_MAX_CONVERSATION_RESTARTS
+ return self.resource_provider.run_config.max_conversation_restarts
 
  @functools.cached_property
  def prompt_renderer(self) -> RecordBasedPromptRenderer:
@@ -129,7 +125,3 @@ class LLMJudgeCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMJudgeColum
  description="Judge a new dataset cell based on a set of rubrics",
  generation_strategy=GenerationStrategy.CELL_BY_CELL,
  )
-
- @property
- def max_conversation_restarts(self) -> int:
- return 2 * DEFAULT_MAX_CONVERSATION_RESTARTS
data_designer/engine/column_generators/generators/validation.py CHANGED
@@ -132,6 +132,7 @@ class ValidationColumnGenerator(ColumnGenerator[ValidationColumnConfig]):
  error_callback=error_callback,
  shutdown_error_rate=settings.shutdown_error_rate,
  shutdown_error_window=settings.shutdown_error_window,
+ disable_early_shutdown=settings.disable_early_shutdown,
  ) as executor:
  for i, batch in enumerate(batched_records):
  executor.submit(lambda batch: self._validate_batch(validator, batch), batch, context={"index": i})
data_designer/engine/dataset_builders/column_wise_builder.py CHANGED
@@ -94,7 +94,6 @@ class ColumnWiseDatasetBuilder:
  self,
  *,
  num_records: int,
- buffer_size: int,
  on_batch_complete: Callable[[Path], None] | None = None,
  ) -> Path:
  self._write_configs()
@@ -104,6 +103,7 @@ class ColumnWiseDatasetBuilder:
  start_time = time.perf_counter()
  group_id = uuid.uuid4().hex
 
+ buffer_size = self._resource_provider.run_config.buffer_size
  self.batch_manager.start(num_records=num_records, buffer_size=buffer_size)
  for batch_idx in range(self.batch_manager.num_batches):
  logger.info(f"⏳ Processing batch {batch_idx + 1} of {self.batch_manager.num_batches}")
@@ -228,6 +228,7 @@ class ColumnWiseDatasetBuilder:
  error_callback=self._worker_error_callback,
  shutdown_error_rate=settings.shutdown_error_rate,
  shutdown_error_window=settings.shutdown_error_window,
+ disable_early_shutdown=settings.disable_early_shutdown,
  ) as executor:
  for i, record in self.batch_manager.iter_current_batch():
  executor.submit(lambda record: generator.generate(record), record, context={"index": i})
data_designer/engine/dataset_builders/utils/concurrency.py CHANGED
@@ -96,6 +96,7 @@ class ConcurrentThreadExecutor:
  error_callback: ErrorCallbackWithContext | None = None,
  shutdown_error_rate: float = 0.50,
  shutdown_error_window: int = 10,
+ disable_early_shutdown: bool = False,
  ):
  self._executor = None
  self._column_name = column_name
@@ -106,6 +107,7 @@ class ConcurrentThreadExecutor:
  self._error_callback = error_callback
  self._shutdown_error_rate = shutdown_error_rate
  self._shutdown_window_size = shutdown_error_window
+ self._disable_early_shutdown = disable_early_shutdown
  self._results = ExecutorResults(failure_threshold=shutdown_error_rate)
 
  @property
@@ -139,7 +141,7 @@ class ConcurrentThreadExecutor:
 
  def __exit__(self, exc_type, exc_value, traceback):
  self._shutdown_executor()
- if self._results.early_shutdown is True:
+ if not self._disable_early_shutdown and self._results.early_shutdown is True:
  self._raise_task_error()
 
  def _shutdown_executor(self) -> None:
@@ -160,7 +162,7 @@ class ConcurrentThreadExecutor:
  if self._executor is None:
  raise RuntimeError("Executor is not initialized, this class should be used as a context manager.")
 
- if self._results.early_shutdown:
+ if not self._disable_early_shutdown and self._results.early_shutdown:
  self._shutdown_executor()
  self._raise_task_error()
 
@@ -176,7 +178,9 @@ class ConcurrentThreadExecutor:
  with self._lock:
  self._results.completed_count += 1
  self._results.error_trap.handle_error(err)
- if self._results.is_error_rate_exceeded(self._shutdown_window_size):
+ if not self._disable_early_shutdown and self._results.is_error_rate_exceeded(
+ self._shutdown_window_size
+ ):
  # Signal to shutdown early on the next submission (if received).
  # We cannot trigger shutdown from within this thread as it can
  # cause a deadlock.
@@ -196,7 +200,12 @@ class ConcurrentThreadExecutor:
  # We'll re-raise a custom error that can be handled at the call-site and the summary
  # can also be inspected.
  self._semaphore.release()
- if not isinstance(err, RuntimeError) and "after shutdown" not in str(err):
+ is_shutdown_error = isinstance(err, RuntimeError) and (
+ "after shutdown" in str(err) or "Pool shutdown" in str(err)
+ )
+ if not is_shutdown_error:
+ raise err
+ if self._disable_early_shutdown:
  raise err
  self._raise_task_error()
 
data_designer/engine/dataset_builders/utils/dataset_batch_manager.py CHANGED
@@ -69,7 +69,7 @@ class DatasetBatchManager:
  def drop_records(self, index: Container[int]) -> None:
  self._buffer = [record for i, record in enumerate(self._buffer) if i not in index]
 
- def finish_batch(self, on_complete: Callable[[Path], None] | None = None) -> Path:
+ def finish_batch(self, on_complete: Callable[[Path], None] | None = None) -> Path | None:
  """Finish the batch by moving the results from the partial results path to the final parquet folder.
 
  Returns:
@@ -78,29 +78,36 @@ class DatasetBatchManager:
  if self._current_batch_number >= self.num_batches:
  raise DatasetBatchManagementError("🛑 All batches have been processed.")
 
- if not self.write():
- raise DatasetBatchManagementError("🛑 Batch finished without any results to write.")
-
- final_file_path = self.artifact_storage.move_partial_result_to_final_file_path(self._current_batch_number)
-
- self.artifact_storage.write_metadata(
- {
- "target_num_records": sum(self.num_records_list),
- "total_num_batches": self.num_batches,
- "buffer_size": self._buffer_size,
- "schema": {field.name: str(field.type) for field in pq.read_schema(final_file_path)},
- "file_paths": [str(f) for f in sorted(self.artifact_storage.final_dataset_path.glob("*.parquet"))],
- "num_records": self.num_records_list[: self._current_batch_number + 1],
- "num_completed_batches": self._current_batch_number + 1,
- "dataset_name": self.artifact_storage.dataset_name,
- }
- )
+ if self.write() is not None:
+ final_file_path = self.artifact_storage.move_partial_result_to_final_file_path(self._current_batch_number)
+
+ self.artifact_storage.write_metadata(
+ {
+ "target_num_records": sum(self.num_records_list),
+ "total_num_batches": self.num_batches,
+ "buffer_size": self._buffer_size,
+ "schema": {field.name: str(field.type) for field in pq.read_schema(final_file_path)},
+ "file_paths": [str(f) for f in sorted(self.artifact_storage.final_dataset_path.glob("*.parquet"))],
+ "num_records": self.num_records_list[: self._current_batch_number + 1],
+ "num_completed_batches": self._current_batch_number + 1,
+ "dataset_name": self.artifact_storage.dataset_name,
+ }
+ )
+
+ if on_complete:
+ on_complete(final_file_path)
+ else:
+ final_file_path = None
+
+ logger.warning(
+ f"⚠️ Batch {self._current_batch_number + 1} finished without any results to write. "
+ "A partial dataset containing the currently available columns has been written to the partial results "
+ f"directory: {self.artifact_storage.partial_results_path}"
+ )
+
  self._current_batch_number += 1
  self._buffer: list[dict] = []
 
- if on_complete:
- on_complete(final_file_path)
-
  return final_file_path
 
  def finish(self) -> None:
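`finish_batch` now returns `None` (instead of raising `DatasetBatchManagementError`) when a batch produced nothing to write, logging a warning instead. A minimal sketch of how calling code might handle the new optional return; the helper function here is illustrative, not part of the package:

```python
from pathlib import Path

from data_designer.engine.dataset_builders.utils.dataset_batch_manager import DatasetBatchManager


def finish_and_report(batch_manager: DatasetBatchManager) -> Path | None:
    # Illustrative helper: finish the current batch and report the outcome.
    final_path = batch_manager.finish_batch(on_complete=lambda p: print(f"Batch written to {p}"))
    if final_path is None:
        # Nothing was written for this batch; the manager has already logged
        # a warning and left partial results in the partial results directory.
        print("Batch finished with no results to write")
    return final_path
```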
data_designer/engine/resources/resource_provider.py CHANGED
@@ -2,6 +2,7 @@
  # SPDX-License-Identifier: Apache-2.0
 
  from data_designer.config.base import ConfigBase
+ from data_designer.config.dataset_metadata import DatasetMetadata
  from data_designer.config.models import ModelConfig
  from data_designer.config.run_config import RunConfig
  from data_designer.config.seed_source import SeedSource
@@ -27,6 +28,17 @@ class ResourceProvider(ConfigBase):
  run_config: RunConfig = RunConfig()
  seed_reader: SeedReader | None = None
 
+ def get_dataset_metadata(self) -> DatasetMetadata:
+ """Get metadata about the dataset being generated.
+
+ Returns:
+ DatasetMetadata with seed column names and other metadata.
+ """
+ seed_column_names = []
+ if self.seed_reader is not None:
+ seed_column_names = self.seed_reader.get_column_names()
+ return DatasetMetadata(seed_column_names=seed_column_names)
+
 
  def create_resource_provider(
  *,
data_designer/interface/data_designer.py CHANGED
@@ -56,15 +56,12 @@ from data_designer.engine.secret_resolver import (
  from data_designer.interface.errors import (
  DataDesignerGenerationError,
  DataDesignerProfilingError,
- InvalidBufferValueError,
  )
  from data_designer.interface.results import DatasetCreationResults
  from data_designer.logging import RandomEmoji
  from data_designer.plugins.plugin import PluginType
  from data_designer.plugins.registry import PluginRegistry
 
- DEFAULT_BUFFER_SIZE = 1000
-
  DEFAULT_SECRET_RESOLVER = CompositeResolver([EnvironmentResolver(), PlaintextResolver()])
 
  DEFAULT_SEED_READERS = [
@@ -112,7 +109,6 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
  ):
  self._secret_resolver = secret_resolver or DEFAULT_SECRET_RESOLVER
  self._artifact_path = Path(artifact_path) if artifact_path is not None else Path.cwd() / "artifacts"
- self._buffer_size = DEFAULT_BUFFER_SIZE
  self._run_config = RunConfig()
  self._managed_assets_path = Path(managed_assets_path or MANAGED_ASSETS_PATH)
  self._model_providers = self._resolve_model_providers(model_providers)
@@ -169,7 +165,7 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
  builder = self._create_dataset_builder(config_builder, resource_provider)
 
  try:
- builder.build(num_records=num_records, buffer_size=self._buffer_size)
+ builder.build(num_records=num_records)
  except Exception as e:
  raise DataDesignerGenerationError(f"🛑 Error generating dataset: {e}")
 
@@ -182,10 +178,13 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
  except Exception as e:
  raise DataDesignerProfilingError(f"🛑 Error profiling dataset: {e}")
 
+ dataset_metadata = resource_provider.get_dataset_metadata()
+
  return DatasetCreationResults(
  artifact_storage=builder.artifact_storage,
  analysis=analysis,
  config_builder=config_builder,
+ dataset_metadata=dataset_metadata,
  )
 
  def preview(
@@ -249,11 +248,15 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
  ):
  logger.info(f"{RandomEmoji.success()} Preview complete!")
 
+ # Create dataset metadata from the resource provider
+ dataset_metadata = resource_provider.get_dataset_metadata()
+
  return PreviewResults(
  dataset=processed_dataset,
  analysis=analysis,
  processor_artifacts=processor_artifacts,
  config_builder=config_builder,
+ dataset_metadata=dataset_metadata,
  )
 
  def validate(self, config_builder: DataDesignerConfigBuilder) -> None:
@@ -300,34 +303,22 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
  """
  return self._secret_resolver
 
- def set_buffer_size(self, buffer_size: int) -> None:
- """Set the buffer size for dataset generation.
-
- The buffer size controls how many records are processed in memory at once
- during dataset generation using the `create` method. The default value is
- set to the constant `DEFAULT_BUFFER_SIZE` defined in the data_designer module.
-
- Args:
- buffer_size: Number of records to process in each buffer.
-
- Raises:
- InvalidBufferValueError: If buffer size is less than or equal to 0.
- """
- if buffer_size <= 0:
- raise InvalidBufferValueError("Buffer size must be greater than 0.")
- self._buffer_size = buffer_size
-
  def set_run_config(self, run_config: RunConfig) -> None:
  """Set the runtime configuration for dataset generation.
 
  Args:
  run_config: A RunConfig instance containing runtime settings such as
- early shutdown behavior. Import RunConfig from data_designer.essentials.
+ early shutdown behavior and batch sizing via `buffer_size`. Import RunConfig from
+ data_designer.essentials.
 
  Example:
  >>> from data_designer.essentials import DataDesigner, RunConfig
  >>> dd = DataDesigner()
  >>> dd.set_run_config(RunConfig(disable_early_shutdown=True))
+
+ Notes:
+ When `disable_early_shutdown=True`, DataDesigner will never terminate generation early
+ due to error-rate thresholds. Errors are still tracked for reporting.
  """
  self._run_config = run_config
 
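The removal of `set_buffer_size` above means the buffer size is now configured through `RunConfig` rather than on the `DataDesigner` instance. A short migration sketch based on the `set_run_config` docstring in this hunk:

```python
from data_designer.essentials import DataDesigner, RunConfig

dd = DataDesigner()

# 0.3.3: dd.set_buffer_size(500)  # method removed in 0.3.4
# 0.3.4: buffer size travels with the rest of the runtime settings.
dd.set_run_config(RunConfig(buffer_size=500, disable_early_shutdown=True))
```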
data_designer/interface/results.py CHANGED
@@ -9,6 +9,7 @@ import pandas as pd
 
  from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
  from data_designer.config.config_builder import DataDesignerConfigBuilder
+ from data_designer.config.dataset_metadata import DatasetMetadata
  from data_designer.config.utils.visualization import WithRecordSamplerMixin
  from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage
  from data_designer.engine.dataset_builders.errors import ArtifactStorageError
@@ -28,6 +29,7 @@ class DatasetCreationResults(WithRecordSamplerMixin):
  artifact_storage: ArtifactStorage,
  analysis: DatasetProfilerResults,
  config_builder: DataDesignerConfigBuilder,
+ dataset_metadata: DatasetMetadata,
  ):
  """Creates a new instance with results based on a dataset creation run.
 
@@ -35,10 +37,12 @@ class DatasetCreationResults(WithRecordSamplerMixin):
  artifact_storage: Storage manager for accessing generated artifacts.
  analysis: Profiling results for the generated dataset.
  config_builder: Configuration builder used to create the dataset.
+ dataset_metadata: Metadata about the generated dataset (e.g., seed column names).
  """
  self.artifact_storage = artifact_storage
  self._analysis = analysis
  self._config_builder = config_builder
+ self.dataset_metadata = dataset_metadata
 
  def load_analysis(self) -> DatasetProfilerResults:
  """Load the profiling analysis results for the generated dataset.
data_designer-0.3.3.dist-info/METADATA → data_designer-0.3.4.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: data-designer
- Version: 0.3.3
+ Version: 0.3.4
  Summary: General framework for synthetic data generation
  License-Expression: Apache-2.0
  License-File: LICENSE
@@ -193,6 +193,14 @@ The value `openai/gpt-oss-20b` would be collected.
 
  To disable telemetry capture, set `NEMO_TELEMETRY_ENABLED=false`.
 
+ ### Top Models
+
+ This chart represents the breakdown of models used for Data Designer across all synthetic data generation jobs from 12/18/2025 to 1/14/2026.
+
+ ![Top models used for synthetic data generation](docs/images/top-models.png)
+
+ _Last updated on 1/14/2026_
+
  ---
 
  ## License
data_designer-0.3.3.dist-info/RECORD → data_designer-0.3.4.dist-info/RECORD
@@ -1,5 +1,5 @@
  data_designer/__init__.py,sha256=xYZCBRleSswcNCARfHQzFy_Edag3Nmmwfa6A5C1d5B8,466
- data_designer/_version.py,sha256=lemL_4Kl75FgrO6lVuFrrtw6-Dcf9wtXBalKkXuzkO4,704
+ data_designer/_version.py,sha256=3nDaC5e0d_scBB1bUEKPlItbvbY0PmXNNyyOTNFNWNI,704
  data_designer/errors.py,sha256=BjnxDxwtTGscY3lZbi1RQ666j5PseoxNJRobeMXAJiI,184
  data_designer/logging.py,sha256=2ToJzPPb6zF-QR-yQnrLVcBiBGL_QVNsnCHSQC0nPpQ,5346
  data_designer/plugin_manager.py,sha256=C2ZkZiXlcMRiaxfrrho5Shz6DKdExVeBha7ch-d4CnU,2695
@@ -34,24 +34,25 @@ data_designer/cli/services/download_service.py,sha256=9ocQdHZW6VYHqM3nyHn_7dP1z1
  data_designer/cli/services/model_service.py,sha256=rcsozdIRXMElskUHtmWGj6pZbVFf2JzHHv8rTnXv8xw,3890
  data_designer/cli/services/provider_service.py,sha256=ru0-AJVZLr8wk5kOV2z2FgPpF-iK8vWQs0qtrMKp-dg,3921
  data_designer/config/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
- data_designer/config/base.py,sha256=DtsUGIwTny57ASN3ffIfvhjrWt1PGhGuRDoqRmRx8Qo,2401
+ data_designer/config/base.py,sha256=IGj6sy_GnKzC94uu2rdxe12EqR_AmGJ6O3rl2MxOv6g,2449
  data_designer/config/column_configs.py,sha256=Z3V8BKMkgse91MGlEcyJBa-lV25_j81SZ-vZNOoUKXc,18966
  data_designer/config/column_types.py,sha256=qsy04QslsnPEOoStN3sJIoSBeSSTxpNg5VLVmew_YLQ,5753
  data_designer/config/config_builder.py,sha256=vuPibkodbJxbCXdaI1tt1Uyo1SVCnAOfLBAW1AmhajI,24707
  data_designer/config/data_designer_config.py,sha256=qOojviug05vHR2S4800sjd4OmxhSVi6kB8SAFXLlPog,1891
  data_designer/config/dataset_builders.py,sha256=4NSEEqXzgSK8IDXoUSVRSUTcEe-ocKS-iEzyzKkNSJ0,332
+ data_designer/config/dataset_metadata.py,sha256=UTlEgnHWgjwPuc7bP95T7gaKmcr7pIhFMy9vvbUwMV4,647
  data_designer/config/default_model_settings.py,sha256=3iUr10JvSTMDHwlEdSbLE_y90czbGOs_21La3V9fXoM,4462
  data_designer/config/errors.py,sha256=g64yn9l7lTbcXI3DPdC_3utvm994IXGDhoQRiNON6T0,524
  data_designer/config/exports.py,sha256=y23KqhwAf4DIarfvqgiuqyK2Fs1zv9cTDcuQ9SBX54o,4720
  data_designer/config/interface.py,sha256=RRC5JHl6wQ9Icg1IQWOM6t8VkBk6c1BREkka3cEAi8c,1624
  data_designer/config/models.py,sha256=A8D0qj6L9ndYvScvNWGuoOLe0zjYtoAm0JkwaG3qjRg,15335
- data_designer/config/preview_results.py,sha256=fhmxRLbSifdMO_pLrvmGaDJHtNIupX8X3HvXhjHr3hM,1350
+ data_designer/config/preview_results.py,sha256=y-zEnRxZt_72XuAFs_e_peK4bVJ_fDSi9b-jLxsQ-S0,1602
  data_designer/config/processors.py,sha256=CoLUN1AuEBUPigmDT8vCWotvOcFgnC8CFk8VLvNSjxw,5992
- data_designer/config/run_config.py,sha256=M6m3oc3e1CTgStfkkgHyIt78JYrrMKdGSJUR07elmp0,1418
+ data_designer/config/run_config.py,sha256=yakCcWyT78kLXGZnFVK35pdrkPpWT7AVEp_sHyq67fg,2393
  data_designer/config/sampler_constraints.py,sha256=XBPxm81J0u2q3WVsRaP2CoIE_0ssY14xc4Nt5r0oyuM,1161
  data_designer/config/sampler_params.py,sha256=FSMvmdj-9Hiyap1_n-AM_ZDT-sTovox7LK4TO4L2UUc,27843
  data_designer/config/seed.py,sha256=iUFByadE6GTG9HCcp0nEWm7wTDNaAOuYtGm2Ov0tiaE,4632
- data_designer/config/seed_source.py,sha256=zOnLAOyJ_nfFA5sI6DNzDf5IGWiYEjDgHYOzo_5O5Zw,2449
+ data_designer/config/seed_source.py,sha256=GYgK9f0_a0FuOtvXlrLEw0MmAELucTB1Qrc_sGGoziw,2513
  data_designer/config/seed_source_types.py,sha256=4EJ4IsYkfkicYqh0CuEvyI3H541XbtV_ffClJV2-zCQ,679
  data_designer/config/validator_params.py,sha256=9w9M7Z1rcZOpw-BUBhjaVfA8ykNP5iPvdyJOHmdugEg,3911
  data_designer/config/analysis/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
@@ -68,7 +69,7 @@ data_designer/config/utils/io_helpers.py,sha256=_14cfoMUIFqiUPBTu5u-BomarqmP6_VB
  data_designer/config/utils/misc.py,sha256=cRofbhUulLKjV7j_7M5aNNNyIewyyhXLkj00GaZf9uo,2472
  data_designer/config/utils/numerical_helpers.py,sha256=BadOPY1AR2ZVKFAKORpKT_Corc1SZR7W1of6FXYIMY4,802
  data_designer/config/utils/type_helpers.py,sha256=2WSGYWTDktOCa9FGNv2IPXIHQHJc7-fimAB1JqIJVx0,4023
- data_designer/config/utils/visualization.py,sha256=TbYgHAvxXEmJZH-QZYyOkf28E52rW80ZirpJT2PMeK8,18191
+ data_designer/config/utils/visualization.py,sha256=2ZloKN1UimKvIXyKJBSHZ56-JItE5MzMXY31J9x4hbY,18366
  data_designer/engine/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
  data_designer/engine/compiler.py,sha256=tzZF5rk_AAid0yRwwsR86-xvW-5GdJskN1RPfvuFq-k,2853
  data_designer/engine/configurable_task.py,sha256=hdXyfekQ9dHSc_NQG6nZIxKCqWSWKnqA5nje7Uobf6k,2525
@@ -90,23 +91,23 @@ data_designer/engine/column_generators/generators/__init__.py,sha256=XLO09Ei8g0l
  data_designer/engine/column_generators/generators/base.py,sha256=T3ccXIyKXwYN4Kes6HcTBdELK-wOs9cFCoaTH3C3dyQ,3814
  data_designer/engine/column_generators/generators/embedding.py,sha256=pdRMzb95CKmBpOiTQrAEiKngBFvAlt5g8HwI8EwWBIY,1565
  data_designer/engine/column_generators/generators/expression.py,sha256=irBDhTsFyZaWU2EIqy5xOKRBxX-x8W8q5pQ69P7NKTs,2543
- data_designer/engine/column_generators/generators/llm_completion.py,sha256=Dp4F1wxZAMwUglRqQHE5lPXDNyPALKWsMr1IJRIvbJI,5031
+ data_designer/engine/column_generators/generators/llm_completion.py,sha256=fwL6Xbfg8d9SnZgv2-j3g_6S38xzuwnVY4R4UabvdIw,4881
  data_designer/engine/column_generators/generators/samplers.py,sha256=0bvJhVK2LfH9aRY1BxqWCjA7LJxy1B63gGmZuWK8auU,3486
  data_designer/engine/column_generators/generators/seed_dataset.py,sha256=IskfOQkRMRTfu8tiYb426LMchBYKNQs0uSo9E5y0bwg,6905
- data_designer/engine/column_generators/generators/validation.py,sha256=hDqE_xZfJ2XGVE89ifDUvsbtZfzQmiNGRBTY733dXgY,6776
+ data_designer/engine/column_generators/generators/validation.py,sha256=0gw0Wzq4yVqejWewt2uznQ4UJI7_0MMEdvMenH-KwYc,6844
  data_designer/engine/column_generators/utils/errors.py,sha256=Nemo7fxg9BpTOf0kdlxxdtXZMfe_ksrRfzWg2E5sFX0,370
  data_designer/engine/column_generators/utils/generator_classification.py,sha256=1fvMX7lQzY3A1s-V3CtS-W6-zwmby553_Oe9K-tdZKE,1922
  data_designer/engine/column_generators/utils/judge_score_factory.py,sha256=8l0g-L_O6esmAEf1rJh7o2IASZnLqZ_KDlGaLCMYMK0,2105
  data_designer/engine/column_generators/utils/prompt_renderer.py,sha256=xXnzJiR60zTmMDorR_wfTleNMJsLKA5gbL4WOjQ-wYg,4765
  data_designer/engine/dataset_builders/artifact_storage.py,sha256=fYBC569tXVpn7UURcuXfHPhEvvwOHnMxAkA1iQAB-j4,8425
- data_designer/engine/dataset_builders/column_wise_builder.py,sha256=y068Eza-MabPviEStsFt14Kl4p8b9o1Exe4-kje4vEs,15197
+ data_designer/engine/dataset_builders/column_wise_builder.py,sha256=DUAMEmy7xCSFDYy-WU-ZXflKzy5oJ6yqMESQSb7CoUc,15308
  data_designer/engine/dataset_builders/errors.py,sha256=ov9cTRvLtLieIFkUGZdk1n_iabdc904ZATwgT5u9uzY,364
  data_designer/engine/dataset_builders/multi_column_configs.py,sha256=bxRildX3SfzSFKv_rqwwsUeFn_RoyhfFneSj4qBi-Q4,1624
  data_designer/engine/dataset_builders/utils/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
- data_designer/engine/dataset_builders/utils/concurrency.py,sha256=SQlT9Tu3UgATllm_fwle4kkro09NQBVU_nLSZj7lBDQ,7893
+ data_designer/engine/dataset_builders/utils/concurrency.py,sha256=Q0ro9UY-3-FFzfi3MZ29nMTSiDZgg1Um6y_HQFztDhk,8338
  data_designer/engine/dataset_builders/utils/config_compiler.py,sha256=iAbaLiDNBPyjZwSVK1a83KFKln9LACjKdVSpIiZRemw,2405
  data_designer/engine/dataset_builders/utils/dag.py,sha256=L3-sla2s8oqcy3V-WSxXAWZVakkmb3cvmTh5L6phC1M,2474
- data_designer/engine/dataset_builders/utils/dataset_batch_manager.py,sha256=c-6ER96oP-0oZ6CncN8sjXy1S1ZjzTreeYBr1oOX8Yg,7774
+ data_designer/engine/dataset_builders/utils/dataset_batch_manager.py,sha256=DpEW_zrHv7CPiVFw07r_0Q9V6dFaTqOFJ1uWnuicxHo,8142
  data_designer/engine/dataset_builders/utils/errors.py,sha256=6hstnyjYZ8pU69qK3OrUvlyeifqRJC2m399GNVcih3I,375
  data_designer/engine/models/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
  data_designer/engine/models/errors.py,sha256=HjPF3ERKU7veS7s189_0VBBMpePl6pSmOqaYhOO8xv4,12179
@@ -145,7 +146,7 @@ data_designer/engine/registry/errors.py,sha256=PNWsfU2baNmzshcHah1-P7YT9OqxfxYf1
  data_designer/engine/resources/managed_dataset_generator.py,sha256=mJAR-dc0vEfiw3xN7ABIOalxGIbQmfZ7P23U-vUut40,1374
  data_designer/engine/resources/managed_dataset_repository.py,sha256=zxxP9SDdu0PrP28ozJmIwdlbHHuGkcPRmb1Ua5VOsG0,7552
  data_designer/engine/resources/managed_storage.py,sha256=nTfp14o-BbtDP6XUxD3564g0cXB6B1og-1rZI9dx1H0,2084
- data_designer/engine/resources/resource_provider.py,sha256=9IwJ5qQPvoQwXe_wIkp0enNpgKRRu691BQfxsnZic5w,2323
+ data_designer/engine/resources/resource_provider.py,sha256=pMh0_j8JCZ54x4zCPtPIb9s5AGwSBswQsYFddxsgM_U,2813
  data_designer/engine/resources/seed_reader.py,sha256=Gh524gMWh6Lz43bielQG3nmMz5MrZy1BTGs792AhMpc,5591
  data_designer/engine/sampling_gen/column.py,sha256=lqdMrUebIFJ_C_Laye0TuIkEd_fVVPP5b-zoNGez25Q,3951
  data_designer/engine/sampling_gen/constraints.py,sha256=HtLxW_VYXlo_A8IMlZkgTb1fBwyrRHuB3LZnF3AgV-I,3009
@@ -176,9 +177,9 @@ data_designer/engine/validators/remote.py,sha256=Qviad8vF7SBtHRvNqKxZyiLJuGJp0rm
  data_designer/engine/validators/sql.py,sha256=Y2FkM_JmcWzW98qjZruq73vt8RgjkLXuTbyLPGrA2l8,2255
  data_designer/essentials/__init__.py,sha256=wt84P4qu1-OdIfrZmdOtRtiBoGcVSdTEdkIiUaztPWs,1099
  data_designer/interface/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
- data_designer/interface/data_designer.py,sha256=AJ-1ntSeAkDcmuyI0-erq8xrxdu6C5m47h0k8p2IxFQ,17409
+ data_designer/interface/data_designer.py,sha256=OpjWdfjwhp1wzeArdicpCzmaaVCZydSGarQDpvh3egU,17105
  data_designer/interface/errors.py,sha256=CoH-6V95RDNHNONxKs8MWina-cD6NtaerMW_cyH8WRU,570
- data_designer/interface/results.py,sha256=x6bzKaG9ZaDU_UmImNgjULn131jTektMzWX5B3VZXpE,3516
+ data_designer/interface/results.py,sha256=aiRWSK2QVAKTGboqn_Fods7dc5lbG5cmDs6O_INbH-0,3768
  data_designer/plugins/__init__.py,sha256=GmeQ6bzFMNkLthrmLKQ5bcUMPw_W2K4d6oWKvYA8eNw,239
  data_designer/plugins/errors.py,sha256=EzKPo0rEiTe872sIAvF0_cDjFfl_P2MyDtESZL1P3ug,350
  data_designer/plugins/plugin.py,sha256=TmdllvWXOKZZNwJTW_rUMD1M68pAh_IEIILZei7fHqU,5468
@@ -186,8 +187,8 @@ data_designer/plugins/registry.py,sha256=1vNlmDyFMCIY8D_z1RbnWMfKnHZ02g8sAvJg1nE
  data_designer/plugins/testing/__init__.py,sha256=lSuWzt1AVTJg4gV9wh6BUU1Az7IjQ-9FgADdUAtm_qQ,260
  data_designer/plugins/testing/stubs.py,sha256=E8bovwU4zpNaJM4b1i4biCDXyqvthe8vmsrZ9M0M9vM,4277
  data_designer/plugins/testing/utils.py,sha256=OLoQzW8-qbA-91wTUWtwNHrMDyGz1-ma0f6a-3NBUNI,937
- data_designer-0.3.3.dist-info/METADATA,sha256=zJ5wtn9YBRb9qEXBHT3_93rZxo7RT5P6JhSsAdILimQ,7648
- data_designer-0.3.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- data_designer-0.3.3.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
- data_designer-0.3.3.dist-info/licenses/LICENSE,sha256=cSWJDwVqHyQgly8Zmt3pqXJ2eQbZVYwN9qd0NMssxXY,11336
- data_designer-0.3.3.dist-info/RECORD,,
+ data_designer-0.3.4.dist-info/METADATA,sha256=10B6euF52J8L-MFg1_I_3lRQ3rK3Q5x_2V0iZ_vTtBQ,7914
+ data_designer-0.3.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ data_designer-0.3.4.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
+ data_designer-0.3.4.dist-info/licenses/LICENSE,sha256=cSWJDwVqHyQgly8Zmt3pqXJ2eQbZVYwN9qd0NMssxXY,11336
+ data_designer-0.3.4.dist-info/RECORD,,