data-designer 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data_designer/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.1.4'
-__version_tuple__ = version_tuple = (0, 1, 4)
+__version__ = version = '0.1.5'
+__version_tuple__ = version_tuple = (0, 1, 5)
 
 __commit_id__ = commit_id = None
data_designer/config/exports.py CHANGED

@@ -32,7 +32,11 @@ from data_designer.config.models import (
     UniformDistribution,
     UniformDistributionParams,
 )
-from data_designer.config.processors import DropColumnsProcessorConfig, ProcessorType
+from data_designer.config.processors import (
+    DropColumnsProcessorConfig,
+    ProcessorType,
+    SchemaTransformProcessorConfig,
+)
 from data_designer.config.sampler_constraints import ColumnInequalityConstraint, ScalarInequalityConstraint
 from data_designer.config.sampler_params import (
     BernoulliMixtureSamplerParams,
@@ -69,6 +73,7 @@ from data_designer.config.validator_params import (
 
 def get_config_exports() -> list[str]:
     return [
+        SchemaTransformProcessorConfig.__name__,
         BernoulliMixtureSamplerParams.__name__,
         BernoulliSamplerParams.__name__,
         BinomialSamplerParams.__name__,
data_designer/config/preview_results.py CHANGED

@@ -3,7 +3,7 @@
 
 from __future__ import annotations
 
-from typing import Optional
+from typing import Optional, Union
 
 import pandas as pd
 
@@ -19,6 +19,7 @@ class PreviewResults(WithRecordSamplerMixin):
         config_builder: DataDesignerConfigBuilder,
         dataset: Optional[pd.DataFrame] = None,
         analysis: Optional[DatasetProfilerResults] = None,
+        processor_artifacts: Optional[dict[str, Union[list[str], str]]] = None,
     ):
         """Creates a new instance with results from a Data Designer preview run.
 
@@ -26,7 +27,9 @@ class PreviewResults(WithRecordSamplerMixin):
             config_builder: Data Designer configuration builder.
             dataset: Dataset of the preview run.
             analysis: Analysis of the preview run.
+            processor_artifacts: Artifacts generated by the processors.
         """
-        self.dataset: pd.DataFrame | None = dataset
-        self.analysis: DatasetProfilerResults | None = analysis
+        self.dataset: Optional[pd.DataFrame] = dataset
+        self.analysis: Optional[DatasetProfilerResults] = analysis
+        self.processor_artifacts: Optional[dict[str, Union[list[str], str]]] = processor_artifacts
         self._config_builder = config_builder
data_designer/config/processors.py CHANGED

@@ -1,25 +1,32 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+import json
 from abc import ABC
 from enum import Enum
-from typing import Literal
+from typing import Any, Literal
 
 from pydantic import Field, field_validator
 
 from data_designer.config.base import ConfigBase
 from data_designer.config.dataset_builders import BuildStage
+from data_designer.config.errors import InvalidConfigError
 
 SUPPORTED_STAGES = [BuildStage.POST_BATCH]
 
 
 class ProcessorType(str, Enum):
     DROP_COLUMNS = "drop_columns"
+    SCHEMA_TRANSFORM = "schema_transform"
 
 
 class ProcessorConfig(ConfigBase, ABC):
+    name: str = Field(
+        description="The name of the processor, used to identify the processor in the results and to write the artifacts to disk.",
+    )
     build_stage: BuildStage = Field(
-        ..., description=f"The stage at which the processor will run. Supported stages: {', '.join(SUPPORTED_STAGES)}"
+        default=BuildStage.POST_BATCH,
+        description=f"The stage at which the processor will run. Supported stages: {', '.join(SUPPORTED_STAGES)}",
     )
 
     @field_validator("build_stage")
@@ -34,8 +41,45 @@ class ProcessorConfig(ConfigBase, ABC):
 def get_processor_config_from_kwargs(processor_type: ProcessorType, **kwargs) -> ProcessorConfig:
     if processor_type == ProcessorType.DROP_COLUMNS:
         return DropColumnsProcessorConfig(**kwargs)
+    elif processor_type == ProcessorType.SCHEMA_TRANSFORM:
+        return SchemaTransformProcessorConfig(**kwargs)
 
 
 class DropColumnsProcessorConfig(ProcessorConfig):
     column_names: list[str]
     processor_type: Literal[ProcessorType.DROP_COLUMNS] = ProcessorType.DROP_COLUMNS
+
+
+class SchemaTransformProcessorConfig(ProcessorConfig):
+    template: dict[str, Any] = Field(
+        ...,
+        description="""
+        Dictionary specifying columns and templates to use in the new dataset with transformed schema.
+
+        Each key is a new column name, and each value is an object containing Jinja2 templates - for instance, a string or a list of strings.
+        Values must be JSON-serializable.
+
+        Example:
+
+        ```python
+        template = {
+            "list_of_strings": ["{{ col1 }}", "{{ col2 }}"],
+            "uppercase_string": "{{ col1 | upper }}",
+            "lowercase_string": "{{ col2 | lower }}",
+        }
+        ```
+
+        The above templates will create a new dataset with three columns: "list_of_strings", "uppercase_string", and "lowercase_string".
+        References to columns "col1" and "col2" in the templates will be replaced with the actual values of the columns in the dataset.
+        """,
+    )
+    processor_type: Literal[ProcessorType.SCHEMA_TRANSFORM] = ProcessorType.SCHEMA_TRANSFORM
+
+    @field_validator("template")
+    def validate_template(cls, v: dict[str, Any]) -> dict[str, Any]:
+        try:
+            json.dumps(v)
+        except TypeError as e:
+            if "not JSON serializable" in str(e):
+                raise InvalidConfigError("Template must be JSON serializable")
+        return v
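
For context, here is a minimal sketch of how the new config class might be constructed; the processor name and template keys are illustrative, not part of this release:

```python
# Minimal sketch (processor name and template keys are illustrative).
from data_designer.config.processors import SchemaTransformProcessorConfig

config = SchemaTransformProcessorConfig(
    name="qa_pairs",
    # build_stage now defaults to BuildStage.POST_BATCH, per the new Field default.
    template={
        "qa": ["{{ question }}", "{{ answer }}"],
        "shouty_answer": "{{ answer | upper }}",
    },
)

# validate_template round-trips the template through json.dumps, so a
# non-serializable value (here, a set) is rejected at construction time:
try:
    SchemaTransformProcessorConfig(name="bad", template={"cols": {"a", "b"}})
except Exception as exc:  # InvalidConfigError, possibly wrapped by pydantic
    print(type(exc).__name__, exc)
```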
data_designer/config/utils/validation.py CHANGED

@@ -18,7 +18,10 @@ from rich.panel import Panel
 from data_designer.config.column_types import ColumnConfigT, DataDesignerColumnType, column_type_is_llm_generated
 from data_designer.config.processors import ProcessorConfig, ProcessorType
 from data_designer.config.utils.constants import RICH_CONSOLE_THEME
-from data_designer.config.utils.misc import can_run_data_designer_locally
+from data_designer.config.utils.misc import (
+    can_run_data_designer_locally,
+    get_prompt_template_keywords,
+)
 from data_designer.config.validator_params import ValidatorType
 
 
@@ -63,6 +66,7 @@ def validate_data_designer_config(
     violations.extend(validate_expression_references(columns=columns, allowed_references=allowed_references))
     violations.extend(validate_columns_not_all_dropped(columns=columns))
     violations.extend(validate_drop_columns_processor(columns=columns, processor_configs=processor_configs))
+    violations.extend(validate_schema_transform_processor(columns=columns, processor_configs=processor_configs))
     if not can_run_data_designer_locally():
         violations.extend(validate_local_only_columns(columns=columns))
     return violations
@@ -271,7 +275,7 @@
     columns: list[ColumnConfigT],
     processor_configs: list[ProcessorConfig],
 ) -> list[Violation]:
-    all_column_names = set([c.name for c in columns])
+    all_column_names = {c.name for c in columns}
     for processor_config in processor_configs:
         if processor_config.processor_type == ProcessorType.DROP_COLUMNS:
             invalid_columns = set(processor_config.column_names) - all_column_names
@@ -288,6 +292,33 @@
     return []
 
 
+def validate_schema_transform_processor(
+    columns: list[ColumnConfigT],
+    processor_configs: list[ProcessorConfig],
+) -> list[Violation]:
+    violations = []
+
+    all_column_names = {c.name for c in columns}
+    for processor_config in processor_configs:
+        if processor_config.processor_type == ProcessorType.SCHEMA_TRANSFORM:
+            for col, template in processor_config.template.items():
+                template_keywords = get_prompt_template_keywords(template)
+                invalid_keywords = set(template_keywords) - all_column_names
+                if len(invalid_keywords) > 0:
+                    invalid_keywords = ", ".join([f"'{k}'" for k in invalid_keywords])
+                    message = f"Ancillary dataset processor attempts to reference columns {invalid_keywords} in the template for '{col}', but the columns are not defined in the dataset."
+                    violations.append(
+                        Violation(
+                            column=None,
+                            type=ViolationType.INVALID_REFERENCE,
+                            message=message,
+                            level=ViolationLevel.ERROR,
+                        )
+                    )
+
+    return violations
+
+
 def validate_expression_references(
     columns: list[ColumnConfigT],
     allowed_references: list[str],
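
The new validator relies on `get_prompt_template_keywords` from `data_designer.config.utils.misc`, whose implementation is not part of this diff. A standalone approximation using Jinja2's own parser, under the assumption that the helper simply collects undeclared template variables:

```python
# Hypothetical stand-in for get_prompt_template_keywords; the shipped helper
# is not shown in this diff and may differ.
import json

import jinja2
import jinja2.meta


def extract_template_keywords(template_value) -> set[str]:
    # Template values are strings or JSON-serializable containers of strings,
    # so serialize non-strings and scan the whole value in one pass.
    source = template_value if isinstance(template_value, str) else json.dumps(template_value)
    ast = jinja2.Environment().parse(source)
    return jinja2.meta.find_undeclared_variables(ast)


print(extract_template_keywords(["{{ col1 }}", "{{ col2 | upper }}"]))  # {'col1', 'col2'}
```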
data_designer/config/utils/visualization.py CHANGED

@@ -72,6 +72,9 @@ class WithRecordSamplerMixin:
         else:
             raise DatasetSampleDisplayError("No valid dataset found in results object.")
 
+    def _has_processor_artifacts(self) -> bool:
+        return hasattr(self, "processor_artifacts") and self.processor_artifacts is not None
+
     def display_sample_record(
         self,
         index: Optional[int] = None,
@@ -79,6 +82,7 @@ class WithRecordSamplerMixin:
         hide_seed_columns: bool = False,
         syntax_highlighting_theme: str = "dracula",
         background_color: Optional[str] = None,
+        processors_to_display: Optional[list[str]] = None,
     ) -> None:
         """Display a sample record from the Data Designer dataset preview.
 
@@ -90,6 +94,7 @@ class WithRecordSamplerMixin:
                 documentation from `rich` for information about available themes.
             background_color: Background color to use for the record. See the `Syntax`
                 documentation from `rich` for information about available background colors.
+            processors_to_display: List of processors to display the artifacts for. If None, all processors will be displayed.
         """
         i = index or self._display_cycle_index
 
@@ -99,8 +104,25 @@
         except IndexError:
             raise DatasetSampleDisplayError(f"Index {i} is out of bounds for dataset of length {num_records}.")
 
+        processor_data_to_display = None
+        if self._has_processor_artifacts() and len(self.processor_artifacts) > 0:
+            if processors_to_display is None:
+                processors_to_display = list(self.processor_artifacts.keys())
+
+            if len(processors_to_display) > 0:
+                processor_data_to_display = {}
+                for processor in processors_to_display:
+                    if (
+                        isinstance(self.processor_artifacts[processor], list)
+                        and len(self.processor_artifacts[processor]) == num_records
+                    ):
+                        processor_data_to_display[processor] = self.processor_artifacts[processor][i]
+                    else:
+                        processor_data_to_display[processor] = self.processor_artifacts[processor]
+
         display_sample_record(
             record=record,
+            processor_data_to_display=processor_data_to_display,
             config_builder=self._config_builder,
             background_color=background_color,
             syntax_highlighting_theme=syntax_highlighting_theme,
@@ -134,6 +156,7 @@ def create_rich_histogram_table(
 def display_sample_record(
     record: Union[dict, pd.Series, pd.DataFrame],
     config_builder: DataDesignerConfigBuilder,
+    processor_data_to_display: Optional[dict[str, Union[list[str], str]]] = None,
     background_color: Optional[str] = None,
    syntax_highlighting_theme: str = "dracula",
     record_index: Optional[int] = None,
@@ -230,6 +253,15 @@ def display_sample_record(
         table.add_row(*row)
     render_list.append(pad_console_element(table, (1, 0, 1, 0)))
 
+    if processor_data_to_display and len(processor_data_to_display) > 0:
+        for processor_name, processor_data in processor_data_to_display.items():
+            table = Table(title=f"Processor Outputs: {processor_name}", **table_kws)
+            table.add_column("Name")
+            table.add_column("Value")
+            for col, value in processor_data.items():
+                table.add_row(col, convert_to_row_element(value))
+            render_list.append(pad_console_element(table, (1, 0, 1, 0)))
+
     if record_index is not None:
         index_label = Text(f"[index: {record_index}]", justify="center")
         render_list.append(index_label)
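
Combined with the `PreviewResults` change above, the new parameter can be exercised from a preview run; a sketch, assuming a `preview(...)` call as in the package README and an illustrative processor name:

```python
# Illustrative usage; "qa_pairs" is a hypothetical processor name.
preview = data_designer.preview(config_builder)

# Render every processor's artifacts alongside the sample record...
preview.display_sample_record()

# ...or restrict the "Processor Outputs" tables to selected processors:
preview.display_sample_record(processors_to_display=["qa_pairs"])
```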
data_designer/engine/dataset_builders/artifact_storage.py CHANGED

@@ -25,6 +25,7 @@ class BatchStage(StrEnum):
     PARTIAL_RESULT = "partial_results_path"
     FINAL_RESULT = "final_dataset_path"
     DROPPED_COLUMNS = "dropped_columns_dataset_path"
+    PROCESSORS_OUTPUTS = "processors_outputs_path"
 
 
 class ArtifactStorage(BaseModel):
@@ -33,6 +34,7 @@ class ArtifactStorage(BaseModel):
     final_dataset_folder_name: str = "parquet-files"
     partial_results_folder_name: str = "tmp-partial-parquet-files"
     dropped_columns_folder_name: str = "dropped-columns-parquet-files"
+    processors_outputs_folder_name: str = "processors-files"
 
     @property
     def artifact_path_exists(self) -> bool:
@@ -70,6 +72,10 @@ class ArtifactStorage(BaseModel):
     def partial_results_path(self) -> Path:
         return self.base_dataset_path / self.partial_results_folder_name
 
+    @property
+    def processors_outputs_path(self) -> Path:
+        return self.base_dataset_path / self.processors_outputs_folder_name
+
     @field_validator("artifact_path")
     def validate_artifact_path(cls, v: Union[Path, str]) -> Path:
         v = Path(v)
@@ -84,6 +90,7 @@ class ArtifactStorage(BaseModel):
             self.final_dataset_folder_name,
             self.partial_results_folder_name,
             self.dropped_columns_folder_name,
+            self.processors_outputs_folder_name,
         ]
 
         for name in folder_names:
@@ -169,9 +176,10 @@ class ArtifactStorage(BaseModel):
         batch_number: int,
         dataframe: pd.DataFrame,
         batch_stage: BatchStage,
+        subfolder: str | None = None,
     ) -> Path:
         file_path = self.create_batch_file_path(batch_number, batch_stage=batch_stage)
-        self.write_parquet_file(file_path.name, dataframe, batch_stage)
+        self.write_parquet_file(file_path.name, dataframe, batch_stage, subfolder=subfolder)
         return file_path
 
     def write_parquet_file(
@@ -179,9 +187,11 @@ class ArtifactStorage(BaseModel):
         parquet_file_name: str,
         dataframe: pd.DataFrame,
         batch_stage: BatchStage,
+        subfolder: str | None = None,
     ) -> Path:
-        self.mkdir_if_needed(self._get_stage_path(batch_stage))
-        file_path = self._get_stage_path(batch_stage) / parquet_file_name
+        subfolder = subfolder or ""
+        self.mkdir_if_needed(self._get_stage_path(batch_stage) / subfolder)
+        file_path = self._get_stage_path(batch_stage) / subfolder / parquet_file_name
         dataframe.to_parquet(file_path, index=False)
         return file_path
 
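The `subfolder or ""` idiom works because `pathlib` drops empty path segments, so existing callers that pass no subfolder keep the old file layout (file names below are illustrative):

```python
from pathlib import Path

stage_path = Path("artifacts/processors-files")

# No subfolder: the empty segment vanishes, preserving the pre-0.1.5 layout.
assert stage_path / "" / "batch_0.parquet" == Path("artifacts/processors-files/batch_0.parquet")

# With a subfolder (the processor's configured name), outputs are grouped per processor.
assert stage_path / "qa_pairs" / "batch_0.parquet" == Path("artifacts/processors-files/qa_pairs/batch_0.parquet")
```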
data_designer/engine/dataset_builders/column_wise_builder.py CHANGED

@@ -171,6 +171,8 @@ class ColumnWiseDatasetBuilder:
         max_workers = MAX_CONCURRENCY_PER_NON_LLM_GENERATOR
         if isinstance(generator, WithLLMGeneration):
             max_workers = generator.inference_parameters.max_parallel_requests
+        elif hasattr(generator.config, "max_parallel_requests"):
+            max_workers = generator.config.max_parallel_requests
         self._fan_out_with_threads(generator, max_workers=max_workers)
 
     def _run_full_column_generator(self, generator: ColumnGenerator) -> None:
@@ -244,6 +246,7 @@ class ColumnWiseDatasetBuilder:
             processors[BuildStage.POST_BATCH].append(  # as post-batch by default
                 DropColumnsProcessor(
                     config=DropColumnsProcessorConfig(
+                        name="default_drop_columns_processor",
                        column_names=columns_to_drop,
                         build_stage=BuildStage.POST_BATCH,
                     ),
data_designer/engine/processing/processors/drop_columns.py CHANGED

@@ -17,7 +17,7 @@ class DropColumnsProcessor(Processor[DropColumnsProcessorConfig]):
     @staticmethod
     def metadata() -> ConfigurableTaskMetadata:
         return ConfigurableTaskMetadata(
-            name="drop_columns",
+            name="drop_columns_processor",
             description="Drop columns from the input dataset.",
             required_resources=None,
         )
data_designer/engine/processing/processors/registry.py CHANGED

@@ -5,9 +5,11 @@ from data_designer.config.base import ConfigBase
 from data_designer.config.processors import (
     DropColumnsProcessorConfig,
     ProcessorType,
+    SchemaTransformProcessorConfig,
 )
 from data_designer.engine.processing.processors.base import Processor
 from data_designer.engine.processing.processors.drop_columns import DropColumnsProcessor
+from data_designer.engine.processing.processors.schema_transform import SchemaTransformProcessor
 from data_designer.engine.registry.base import TaskRegistry
 
 
@@ -16,5 +18,6 @@ class ProcessorRegistry(TaskRegistry[str, Processor, ConfigBase]): ...
 
 def create_default_processor_registry() -> ProcessorRegistry:
     registry = ProcessorRegistry()
+    registry.register(ProcessorType.SCHEMA_TRANSFORM, SchemaTransformProcessor, SchemaTransformProcessorConfig, False)
     registry.register(ProcessorType.DROP_COLUMNS, DropColumnsProcessor, DropColumnsProcessorConfig, False)
     return registry
data_designer/engine/processing/processors/schema_transform.py ADDED

@@ -0,0 +1,53 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import logging
+
+import pandas as pd
+
+from data_designer.config.processors import SchemaTransformProcessorConfig
+from data_designer.engine.configurable_task import ConfigurableTaskMetadata
+from data_designer.engine.dataset_builders.artifact_storage import BatchStage
+from data_designer.engine.processing.ginja.environment import WithJinja2UserTemplateRendering
+from data_designer.engine.processing.processors.base import Processor
+from data_designer.engine.processing.utils import deserialize_json_values
+
+logger = logging.getLogger(__name__)
+
+
+class SchemaTransformProcessor(WithJinja2UserTemplateRendering, Processor[SchemaTransformProcessorConfig]):
+    @staticmethod
+    def metadata() -> ConfigurableTaskMetadata:
+        return ConfigurableTaskMetadata(
+            name="schema_transform_processor",
+            description="Generate dataset with transformed schema using a Jinja2 template.",
+            required_resources=None,
+        )
+
+    @property
+    def template_as_str(self) -> str:
+        return json.dumps(self.config.template)
+
+    def process(self, data: pd.DataFrame, *, current_batch_number: int | None = None) -> pd.DataFrame:
+        self.prepare_jinja2_template_renderer(self.template_as_str, data.columns.to_list())
+        formatted_records = [
+            json.loads(self.render_template(deserialize_json_values(record)).replace("\n", "\\n"))
+            for record in data.to_dict(orient="records")
+        ]
+        formatted_data = pd.DataFrame(formatted_records)
+        if current_batch_number is not None:
+            self.artifact_storage.write_batch_to_parquet_file(
+                batch_number=current_batch_number,
+                dataframe=formatted_data,
+                batch_stage=BatchStage.PROCESSORS_OUTPUTS,
+                subfolder=self.config.name,
+            )
+        else:
+            self.artifact_storage.write_parquet_file(
+                parquet_file_name=f"{self.config.name}.parquet",
+                dataframe=formatted_data,
+                batch_stage=BatchStage.PROCESSORS_OUTPUTS,
+            )
+
+        return data
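
Outside the engine, the render-then-parse round trip in `process` can be reproduced with plain `jinja2` and `pandas`. A self-contained sketch, assuming nothing beyond those two packages (the real processor uses Data Designer's internal Jinja2 environment and `deserialize_json_values`, neither shown here):

```python
import json

import jinja2
import pandas as pd

template = {"greeting": "Hello, {{ name | upper }}!", "pair": ["{{ name }}", "{{ city }}"]}
data = pd.DataFrame({"name": ["ada", "grace"], "city": ["London", "Paris"]})

# Serialize the template dict once, render it against each record, then parse
# the rendered JSON back into a dict, mirroring SchemaTransformProcessor.process.
renderer = jinja2.Template(json.dumps(template))
records = [
    json.loads(renderer.render(**record).replace("\n", "\\n"))  # escape raw newlines for json.loads
    for record in data.to_dict(orient="records")
]
print(pd.DataFrame(records))  # two rows with "greeting" and "pair" columns
```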
data_designer/interface/data_designer.py CHANGED

@@ -249,6 +249,17 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
         except Exception as e:
             raise DataDesignerProfilingError(f"🛑 Error profiling preview dataset: {e}")
 
+        if builder.artifact_storage.processors_outputs_path.exists():
+            processor_artifacts = {
+                processor_config.name: pd.read_parquet(
+                    builder.artifact_storage.processors_outputs_path / f"{processor_config.name}.parquet",
+                    dtype_backend="pyarrow",
+                ).to_dict(orient="records")
+                for processor_config in config_builder.get_processor_configs()
+            }
+        else:
+            processor_artifacts = {}
+
         if (
             len(processed_dataset) > 0
             and isinstance(analysis, DatasetProfilerResults)
@@ -259,6 +270,7 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
         return PreviewResults(
             dataset=processed_dataset,
             analysis=analysis,
+            processor_artifacts=processor_artifacts,
             config_builder=config_builder,
         )
 
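The resulting `processor_artifacts` maps each configured processor's name to its Parquet output as plain records; roughly (names and values illustrative):

```python
# Illustrative shape only; produced via DataFrame.to_dict(orient="records").
processor_artifacts = {
    "qa_pairs": [
        {"qa": ["What is 2 + 2?", "4"], "shouty_answer": "4"},  # one dict per preview record
    ],
}
```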
data_designer/interface/results.py CHANGED

@@ -3,12 +3,15 @@
 
 from __future__ import annotations
 
+from pathlib import Path
+
 import pandas as pd
 
 from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
 from data_designer.config.config_builder import DataDesignerConfigBuilder
 from data_designer.config.utils.visualization import WithRecordSamplerMixin
 from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage
+from data_designer.engine.dataset_builders.errors import ArtifactStorageError
 
 
 class DatasetCreationResults(WithRecordSamplerMixin):
@@ -53,3 +56,36 @@ class DatasetCreationResults(WithRecordSamplerMixin):
             A pandas DataFrame containing the full generated dataset.
         """
         return self.artifact_storage.load_dataset()
+
+    def load_processor_dataset(self, processor_name: str) -> pd.DataFrame:
+        """Load the dataset generated by a processor.
+
+        This only works for processors that write their artifacts in Parquet format.
+
+        Args:
+            processor_name: The name of the processor to load the dataset from.
+
+        Returns:
+            A pandas DataFrame containing the dataset generated by the processor.
+        """
+        try:
+            dataset = self.artifact_storage.read_parquet_files(
+                self.artifact_storage.processors_outputs_path / processor_name
+            )
+        except Exception as e:
+            raise ArtifactStorageError(f"Failed to load dataset for processor {processor_name}: {e}")
+
+        return dataset
+
+    def get_path_to_processor_artifacts(self, processor_name: str) -> Path:
+        """Get the path to the artifacts generated by a processor.
+
+        Args:
+            processor_name: The name of the processor to load the artifact from.
+
+        Returns:
+            The path to the artifacts.
+        """
+        if not self.artifact_storage.processors_outputs_path.exists():
+            raise ArtifactStorageError(f"Processor {processor_name} has no artifacts.")
+        return self.artifact_storage.processors_outputs_path / processor_name
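
A usage sketch for the two new accessors on a full (non-preview) run; the processor name is illustrative:

```python
# Assuming `results` is the DatasetCreationResults of a finished run:
df = results.load_processor_dataset("qa_pairs")  # Parquet batches under processors-files/qa_pairs
artifacts_dir = results.get_path_to_processor_artifacts("qa_pairs")
print(artifacts_dir)  # <artifact-path>/.../processors-files/qa_pairs
```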
data_designer-0.1.4.dist-info/METADATA → data_designer-0.1.5.dist-info/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: data-designer
-Version: 0.1.4
+Version: 0.1.5
 Summary: General framework for synthetic data generation
 License-Expression: Apache-2.0
 License-File: LICENSE
@@ -144,12 +144,12 @@ preview.display_sample_record()
 
 ### 📚 Learn more
 
-- **[Quick Start Guide](https://nvidia-nemo.github.io/DataDesigner/quick-start/)** – Detailed walkthrough with more examples
-- **[Tutorial Notebooks](https://nvidia-nemo.github.io/DataDesigner/notebooks/)** – Step-by-step interactive tutorials
-- **[Column Types](https://nvidia-nemo.github.io/DataDesigner/concepts/columns/)** – Explore samplers, LLM columns, validators, and more
-- **[Validators](https://nvidia-nemo.github.io/DataDesigner/concepts/validators/)** – Learn how to validate generated data with Python, SQL, and remote validators
-- **[Model Configuration](https://nvidia-nemo.github.io/DataDesigner/concepts/models/model-configs/)** – Configure custom models and providers
-- **[Person Sampling](https://nvidia-nemo.github.io/DataDesigner/concepts/person_sampling/)** – Learn how to sample realistic person data with demographic attributes
+- **[Quick Start Guide](https://nvidia-nemo.github.io/DataDesigner/latest/quick-start/)** – Detailed walkthrough with more examples
+- **[Tutorial Notebooks](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/)** – Step-by-step interactive tutorials
+- **[Column Types](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/columns/)** – Explore samplers, LLM columns, validators, and more
+- **[Validators](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/validators/)** – Learn how to validate generated data with Python, SQL, and remote validators
+- **[Model Configuration](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/model-configs/)** – Configure custom models and providers
+- **[Person Sampling](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/person_sampling/)** – Learn how to sample realistic person data with demographic attributes
 
 ### 🔧 Configure models via CLI
 
@@ -161,7 +161,7 @@ data-designer config list # View current settings
 
 ### 🤝 Get involved
 
-- **[Contributing Guide](https://nvidia-nemo.github.io/DataDesigner/CONTRIBUTING)** – Help improve Data Designer
+- **[Contributing Guide](https://nvidia-nemo.github.io/DataDesigner/latest/CONTRIBUTING)** – Help improve Data Designer
 - **[GitHub Issues](https://github.com/NVIDIA-NeMo/DataDesigner/issues)** – Report bugs or make a feature request
 
 ---
@@ -178,7 +178,7 @@ If you use NeMo Data Designer in your research, please cite it using the followi
 
 ```bibtex
 @misc{nemo-data-designer,
-  author = {The NeMo Data Designer Team},
+  author = {The NeMo Data Designer Team, NVIDIA},
   title = {NeMo Data Designer: A framework for generating synthetic data from scratch or based on your own seed data},
   howpublished = {\url{https://github.com/NVIDIA-NeMo/DataDesigner}},
   year = {2025},
data_designer-0.1.4.dist-info/RECORD → data_designer-0.1.5.dist-info/RECORD CHANGED

@@ -1,5 +1,5 @@
 data_designer/__init__.py,sha256=iCeqRnb640RrL2QpA630GY5Ng7JiDt83Vq0DwLnNugU,461
-data_designer/_version.py,sha256=rLCrf4heo25FJtBY-2Ap7ZuWW-5FS7sqTjsolIUuI5c,704
+data_designer/_version.py,sha256=rdxBMYpwzYxiWk08QbPLHSAxHoDfeKWwyaJIAM0lSic,704
 data_designer/errors.py,sha256=Z4eN9XwzZvGRdBluSNoSqQYkPPzNQIDf0ET_OqWRZh8,179
 data_designer/logging.py,sha256=OqRGvWNlGA3ebRFts7e5k-5GFwoAPaGXYQS4oEzVG0o,5354
 data_designer/plugin_manager.py,sha256=eXtmmqyyoVHWO1zvlLvKQ-rTrONJxf9jhr4ZMzsXWSE,2610
@@ -39,11 +39,11 @@ data_designer/config/dataset_builders.py,sha256=1pNFy_pkQ5lJ6AVZ43AeTuSbz6yC_l7N
 data_designer/config/datastore.py,sha256=brMylPuBsT7uDKSy7G59M7Zdx91RTYWMOVcdRVe5Wjs,7632
 data_designer/config/default_model_settings.py,sha256=HAGyfYzT1fdWMpMSLeJuZZZQHKku2T9KJTOhpwS_5Ek,4577
 data_designer/config/errors.py,sha256=MNMnqh8G1XzXAMeJ5ju6zkBiIH2aVgyITnzYJbGEwFY,461
-data_designer/config/exports.py,sha256=FoyxvW7fckm_KYRU-sgtAJUM8GWWHaDakX8Zk3DAokE,4342
+data_designer/config/exports.py,sha256=vDokNLxoBlaII_-TBIS4w65t-g-MX8ADV85arpOPBRA,4440
 data_designer/config/interface.py,sha256=ery8a93pnCW1JPbgtiaRsMKSR8Q2o7rDmsZfVYbfkeE,1619
 data_designer/config/models.py,sha256=kB9Ut9Y00V6nG9zKK2c4xIVZewn3vPPIU6deug_Rttc,11362
-data_designer/config/preview_results.py,sha256=6SXdkq3oz15VxB7RGroSWikDP1EVPbbBn8GbMrRn2Wc,1147
-data_designer/config/processors.py,sha256=9JeXbGi79QSJanzjiFIDvCxRqriQDlaOtycDVYVUFI4,1368
+data_designer/config/preview_results.py,sha256=6FHBUJAxYEoLq8raCCkQYPUSJTQLvhXFMKciOBU_mVw,1411
+data_designer/config/processors.py,sha256=Q1fCRoL7YSWAnLwJ6sGERwQXdJNx4By8WVyHhjwtd_8,3172
 data_designer/config/sampler_constraints.py,sha256=Dxbjt5PNNmvm5CMp-Z5CYrfd6oeDeXOUnODR6FgvCDk,1187
 data_designer/config/sampler_params.py,sha256=50OEhC1AF3EPMoMlpJGGZ72kXej5wsqcZiyt7J7Kx08,26614
 data_designer/config/seed.py,sha256=tKzNUvHx-9JV8uPDUbQqx44tG88CAeCss_T8xFEPh5g,5547
@@ -61,8 +61,8 @@ data_designer/config/utils/io_helpers.py,sha256=Jl1ihaQM0K_SL86UfP0N1-y4KVph4z3S
 data_designer/config/utils/misc.py,sha256=HVRvrbpdO5c_oPI-e_3hrS7cBJA1SaG8iHMLtWKVv8A,2526
 data_designer/config/utils/numerical_helpers.py,sha256=tcm5x5qSURoZZHjN9Bm1-Jkct3G67QefXm10QQXDtlM,803
 data_designer/config/utils/type_helpers.py,sha256=RvhDk4rxQKDOMBLqJiMM4IJXdLoNUf3uzW52vB5cqrg,4024
-data_designer/config/utils/validation.py,sha256=wXXzdZXVAWraM6XbH21zMb-X2RvzQvzZSC290G8-iks,13079
-data_designer/config/utils/visualization.py,sha256=dk1TQRTg2Uo08mqEabSxui5wXOk4YgjC-Cd4Autilmc,15784
+data_designer/config/utils/validation.py,sha256=1MoVqrS_DofT0LDIrGpWTPi02chntZT1p2K0FIyUOzs,14463
+data_designer/config/utils/visualization.py,sha256=X0R-EDW-yzIaYtK1ttLsCXEp6a6ubejvm_9xpO2UrMg,17599
 data_designer/engine/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
 data_designer/engine/configurable_task.py,sha256=GnaBG6xVBQ1ELpzumNctwKYZJvKKjh2LMKhws4W2GS4,3124
 data_designer/engine/errors.py,sha256=DUoKhQCSwIBoLSQGv7dstzO3DFGDRqW3MBoWnRPcm1I,1262
@@ -88,8 +88,8 @@ data_designer/engine/column_generators/generators/validation.py,sha256=MbDFXzief
 data_designer/engine/column_generators/utils/errors.py,sha256=ugNwaqnPdrPZI7YnKLbYwFjYUSm0WAzgaVu_u6i5Rc8,365
 data_designer/engine/column_generators/utils/judge_score_factory.py,sha256=JRoaZgRGK24dH0zx7MNGSccK196tQK_l0sbwNkurg7c,2132
 data_designer/engine/column_generators/utils/prompt_renderer.py,sha256=d4tbyPsgmFDikW3nxL5is9RNaajMkoPDCrfkQkxw7rc,4760
-data_designer/engine/dataset_builders/artifact_storage.py,sha256=r70ZoohD-givRxd0mQT7JBZiUp1hhQvu_zzH-g_lQbU,7995
-data_designer/engine/dataset_builders/column_wise_builder.py,sha256=OSpIAChevQpLjMehP9zFwJbIURaZN-sEF0Hhy8QTbGA,13074
+data_designer/engine/dataset_builders/artifact_storage.py,sha256=GCHuKuQ6Y_ePG515rsqc3NzQtN1v4pEV2L1I2H2_tx4,8451
+data_designer/engine/dataset_builders/column_wise_builder.py,sha256=ljf-2fAKdry1UCVubhkhRWhoVlKZfK77ytwgkjuQ5VY,13267
 data_designer/engine/dataset_builders/errors.py,sha256=1kChleChG4rASWIiL4Bel6Ox6aFZjQUrh5ogPt1CDWo,359
 data_designer/engine/dataset_builders/multi_column_configs.py,sha256=t28fhI-WRIBohFnAJ80l5EAETEDB5rJ5RSWInMiRfyE,1619
 data_designer/engine/dataset_builders/utils/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
@@ -125,8 +125,9 @@ data_designer/engine/processing/gsonschema/schema_transformers.py,sha256=__-dfrC
 data_designer/engine/processing/gsonschema/types.py,sha256=-x_K2HrVnZ_Z7fzYl4T2Gd7QHf6B6ADvn7E7iYvw5Kc,313
 data_designer/engine/processing/gsonschema/validators.py,sha256=5Jh864KnA5gWBeLbpz1cE5Kk_GMxI6kPWvunAbLI3vI,4704
 data_designer/engine/processing/processors/base.py,sha256=WJl7_0dtiUppjfY-lrQ3lDiIgYqRDSEYUwSAQNN7nFE,548
-data_designer/engine/processing/processors/drop_columns.py,sha256=-ATddFz8efrM2jwiG6w7vgtj48VVy4ZoDvSbGY0aZfY,2050
-data_designer/engine/processing/processors/registry.py,sha256=2zr91IjEMy7duN43fkborPekXohA_X1J8BSKVc1rJKk,804
+data_designer/engine/processing/processors/drop_columns.py,sha256=MIb_CVrpoM3kyN5-8dHZrdFAAUiCCWgDEyQjAk8nZqE,2060
+data_designer/engine/processing/processors/registry.py,sha256=nhB1O4b0wSUkWQeleV9l1MykwZD-dSvY0ydqmSscEY8,1056
+data_designer/engine/processing/processors/schema_transform.py,sha256=amRIw69F5Mn6ZrJvov3ZCRXk-Vil1_adQ1_rC6VKELg,2233
 data_designer/engine/registry/base.py,sha256=8h5MRPccLGSGcss3qFoQ-i7XGzvn8gdiRR0tYr7mDgk,3544
 data_designer/engine/registry/data_designer_registry.py,sha256=0nO7JEezwc2wnnDRKAX5BZz6RhBI3-kNU3Eb1WAdCFI,1487
 data_designer/engine/registry/errors.py,sha256=nO794QVy4DovKGKWEjycVDN9cdDlH-skbZLTb354M3Y,309
@@ -164,15 +165,15 @@ data_designer/engine/validators/remote.py,sha256=jtDIvWzfHh17m2ac_Fp93p49Th8RlkB
 data_designer/engine/validators/sql.py,sha256=bxbyxPxDT9yuwjhABVEY40iR1pzWRFi65WU4tPgG2bE,2250
 data_designer/essentials/__init__.py,sha256=eHuZFJTmeRf_b6KQZ2vZeqy1afJ7y7RMTm7q4Jrg58s,1012
 data_designer/interface/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
-data_designer/interface/data_designer.py,sha256=fnkKrhOW-uI6LDxpVHHbnySmCSEuEUfDh4mHgIpDm6c,16288
+data_designer/interface/data_designer.py,sha256=O6PehBIdL4_2d9rFW86J9b3jfJ_CJmFId8T2AviM2zM,16844
 data_designer/interface/errors.py,sha256=jagKT3tPUnYq4e3e6AkTnBkcayHyEfxjPMBzx-GEKe4,565
-data_designer/interface/results.py,sha256=qFxa8SuCXeADiRpaCMBwJcExkJBCfUPeGCdcJSTjoTc,2111
+data_designer/interface/results.py,sha256=zYVX589OUyFuB-8XLmjjdKk3hCDNKu189sH-gOOFreQ,3511
 data_designer/plugins/__init__.py,sha256=c_V7q4QhfVoNf_uc9UwmXCsWqwtyWogI7YoN_0PzzE4,234
 data_designer/plugins/errors.py,sha256=yPIHpSddEr-o9ZcNVibb2hI-73O15Kg_Od8SlmQlnRs,297
 data_designer/plugins/plugin.py,sha256=7ErdUyrTdOb5PCBE3msdhTOrvQpldjOQw90-Bu4Bosc,2522
 data_designer/plugins/registry.py,sha256=w0o7I3A5UpIaCiqSJIj3kv_dLlh7m_WHznP_O-X13-s,3018
-data_designer-0.1.4.dist-info/METADATA,sha256=Sf3MnCQkIfyO53T7lSqn20ckC_mFLx8tlOpd8izk-nc,6653
-data_designer-0.1.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-data_designer-0.1.4.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
-data_designer-0.1.4.dist-info/licenses/LICENSE,sha256=cSWJDwVqHyQgly8Zmt3pqXJ2eQbZVYwN9qd0NMssxXY,11336
-data_designer-0.1.4.dist-info/RECORD,,
+data_designer-0.1.5.dist-info/METADATA,sha256=s4j9BlO8RDnExQPVbFCYZhY5FNI539DanL-sLEmwzGk,6710
+data_designer-0.1.5.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+data_designer-0.1.5.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
+data_designer-0.1.5.dist-info/licenses/LICENSE,sha256=cSWJDwVqHyQgly8Zmt3pqXJ2eQbZVYwN9qd0NMssxXY,11336
+data_designer-0.1.5.dist-info/RECORD,,