data-designer 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. data_designer/_version.py +2 -2
  2. data_designer/config/analysis/column_profilers.py +4 -4
  3. data_designer/config/analysis/column_statistics.py +5 -5
  4. data_designer/config/analysis/dataset_profiler.py +6 -6
  5. data_designer/config/analysis/utils/errors.py +1 -1
  6. data_designer/config/analysis/utils/reporting.py +5 -5
  7. data_designer/config/base.py +2 -2
  8. data_designer/config/column_configs.py +8 -8
  9. data_designer/config/column_types.py +9 -5
  10. data_designer/config/config_builder.py +32 -27
  11. data_designer/config/data_designer_config.py +7 -7
  12. data_designer/config/datastore.py +4 -4
  13. data_designer/config/default_model_settings.py +4 -4
  14. data_designer/config/errors.py +1 -1
  15. data_designer/config/exports.py +133 -0
  16. data_designer/config/interface.py +6 -6
  17. data_designer/config/models.py +109 -5
  18. data_designer/config/preview_results.py +9 -6
  19. data_designer/config/processors.py +48 -4
  20. data_designer/config/sampler_constraints.py +1 -1
  21. data_designer/config/sampler_params.py +2 -2
  22. data_designer/config/seed.py +3 -3
  23. data_designer/config/utils/constants.py +1 -1
  24. data_designer/config/utils/errors.py +1 -1
  25. data_designer/config/utils/info.py +8 -4
  26. data_designer/config/utils/io_helpers.py +5 -5
  27. data_designer/config/utils/misc.py +3 -3
  28. data_designer/config/utils/numerical_helpers.py +1 -1
  29. data_designer/config/utils/type_helpers.py +7 -3
  30. data_designer/config/utils/validation.py +37 -6
  31. data_designer/config/utils/visualization.py +42 -10
  32. data_designer/config/validator_params.py +2 -2
  33. data_designer/engine/analysis/column_profilers/base.py +1 -1
  34. data_designer/engine/analysis/dataset_profiler.py +1 -1
  35. data_designer/engine/analysis/utils/judge_score_processing.py +1 -1
  36. data_designer/engine/column_generators/generators/samplers.py +1 -1
  37. data_designer/engine/dataset_builders/artifact_storage.py +16 -6
  38. data_designer/engine/dataset_builders/column_wise_builder.py +4 -1
  39. data_designer/engine/dataset_builders/utils/concurrency.py +1 -1
  40. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +1 -1
  41. data_designer/engine/errors.py +1 -1
  42. data_designer/engine/models/errors.py +1 -1
  43. data_designer/engine/models/facade.py +1 -1
  44. data_designer/engine/models/parsers/parser.py +2 -2
  45. data_designer/engine/models/recipes/response_recipes.py +1 -1
  46. data_designer/engine/processing/ginja/environment.py +1 -1
  47. data_designer/engine/processing/gsonschema/validators.py +1 -1
  48. data_designer/engine/processing/processors/drop_columns.py +1 -1
  49. data_designer/engine/processing/processors/registry.py +3 -0
  50. data_designer/engine/processing/processors/schema_transform.py +53 -0
  51. data_designer/engine/resources/managed_dataset_repository.py +4 -4
  52. data_designer/engine/resources/managed_storage.py +1 -1
  53. data_designer/engine/sampling_gen/constraints.py +1 -1
  54. data_designer/engine/sampling_gen/data_sources/base.py +1 -1
  55. data_designer/engine/sampling_gen/entities/email_address_utils.py +1 -1
  56. data_designer/engine/sampling_gen/entities/national_id_utils.py +1 -1
  57. data_designer/engine/sampling_gen/entities/person.py +1 -1
  58. data_designer/engine/sampling_gen/entities/phone_number.py +1 -1
  59. data_designer/engine/sampling_gen/people_gen.py +3 -3
  60. data_designer/engine/secret_resolver.py +1 -1
  61. data_designer/engine/validators/python.py +2 -2
  62. data_designer/essentials/__init__.py +20 -128
  63. data_designer/interface/data_designer.py +23 -19
  64. data_designer/interface/results.py +36 -0
  65. data_designer/logging.py +2 -2
  66. data_designer/plugin_manager.py +14 -26
  67. data_designer/plugins/registry.py +1 -1
  68. {data_designer-0.1.3.dist-info → data_designer-0.1.5.dist-info}/METADATA +9 -9
  69. {data_designer-0.1.3.dist-info → data_designer-0.1.5.dist-info}/RECORD +72 -70
  70. {data_designer-0.1.3.dist-info → data_designer-0.1.5.dist-info}/WHEEL +0 -0
  71. {data_designer-0.1.3.dist-info → data_designer-0.1.5.dist-info}/entry_points.txt +0 -0
  72. {data_designer-0.1.3.dist-info → data_designer-0.1.5.dist-info}/licenses/LICENSE +0 -0
data_designer/config/utils/visualization.py

@@ -3,11 +3,11 @@
 
 from __future__ import annotations
 
+import json
+import os
 from collections import OrderedDict
 from enum import Enum
 from functools import cached_property
-import json
-import os
 from typing import TYPE_CHECKING, Optional, Union
 
 import numpy as np
@@ -21,16 +21,16 @@ from rich.syntax import Syntax
 from rich.table import Table
 from rich.text import Text
 
-from ..base import ConfigBase
-from ..column_types import DataDesignerColumnType
-from ..models import ModelConfig, ModelProvider
-from ..sampler_params import SamplerType
-from .code_lang import code_lang_to_syntax_lexer
-from .constants import NVIDIA_API_KEY_ENV_VAR_NAME, OPENAI_API_KEY_ENV_VAR_NAME
-from .errors import DatasetSampleDisplayError
+from data_designer.config.base import ConfigBase
+from data_designer.config.column_types import DataDesignerColumnType
+from data_designer.config.models import ModelConfig, ModelProvider
+from data_designer.config.sampler_params import SamplerType
+from data_designer.config.utils.code_lang import code_lang_to_syntax_lexer
+from data_designer.config.utils.constants import NVIDIA_API_KEY_ENV_VAR_NAME, OPENAI_API_KEY_ENV_VAR_NAME
+from data_designer.config.utils.errors import DatasetSampleDisplayError
 
 if TYPE_CHECKING:
-    from ..config_builder import DataDesignerConfigBuilder
+    from data_designer.config.config_builder import DataDesignerConfigBuilder
 
 
 console = Console()
@@ -72,6 +72,9 @@ class WithRecordSamplerMixin:
         else:
             raise DatasetSampleDisplayError("No valid dataset found in results object.")
 
+    def _has_processor_artifacts(self) -> bool:
+        return hasattr(self, "processor_artifacts") and self.processor_artifacts is not None
+
     def display_sample_record(
         self,
         index: Optional[int] = None,
@@ -79,6 +82,7 @@
         hide_seed_columns: bool = False,
         syntax_highlighting_theme: str = "dracula",
         background_color: Optional[str] = None,
+        processors_to_display: Optional[list[str]] = None,
     ) -> None:
         """Display a sample record from the Data Designer dataset preview.
 
@@ -90,6 +94,7 @@
                 documentation from `rich` for information about available themes.
             background_color: Background color to use for the record. See the `Syntax`
                 documentation from `rich` for information about available background colors.
+            processors_to_display: List of processors to display the artifacts for. If None, all processors will be displayed.
         """
         i = index or self._display_cycle_index
 
@@ -99,8 +104,25 @@
         except IndexError:
             raise DatasetSampleDisplayError(f"Index {i} is out of bounds for dataset of length {num_records}.")
 
+        processor_data_to_display = None
+        if self._has_processor_artifacts() and len(self.processor_artifacts) > 0:
+            if processors_to_display is None:
+                processors_to_display = list(self.processor_artifacts.keys())
+
+            if len(processors_to_display) > 0:
+                processor_data_to_display = {}
+                for processor in processors_to_display:
+                    if (
+                        isinstance(self.processor_artifacts[processor], list)
+                        and len(self.processor_artifacts[processor]) == num_records
+                    ):
+                        processor_data_to_display[processor] = self.processor_artifacts[processor][i]
+                    else:
+                        processor_data_to_display[processor] = self.processor_artifacts[processor]
+
         display_sample_record(
             record=record,
+            processor_data_to_display=processor_data_to_display,
             config_builder=self._config_builder,
             background_color=background_color,
             syntax_highlighting_theme=syntax_highlighting_theme,
@@ -134,6 +156,7 @@ def create_rich_histogram_table(
 def display_sample_record(
     record: Union[dict, pd.Series, pd.DataFrame],
     config_builder: DataDesignerConfigBuilder,
+    processor_data_to_display: Optional[dict[str, Union[list[str], str]]] = None,
     background_color: Optional[str] = None,
     syntax_highlighting_theme: str = "dracula",
     record_index: Optional[int] = None,
@@ -230,6 +253,15 @@
             table.add_row(*row)
         render_list.append(pad_console_element(table, (1, 0, 1, 0)))
 
+    if processor_data_to_display and len(processor_data_to_display) > 0:
+        for processor_name, processor_data in processor_data_to_display.items():
+            table = Table(title=f"Processor Outputs: {processor_name}", **table_kws)
+            table.add_column("Name")
+            table.add_column("Value")
+            for col, value in processor_data.items():
+                table.add_row(col, convert_to_row_element(value))
+            render_list.append(pad_console_element(table, (1, 0, 1, 0)))
+
     if record_index is not None:
         index_label = Text(f"[index: {record_index}]", justify="center")
         render_list.append(index_label)
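
Note on the hunks above: `display_sample_record` now accepts `processors_to_display` and renders a "Processor Outputs" table per processor. A minimal usage sketch, assuming a preview/results object that mixes in `WithRecordSamplerMixin` and carries `processor_artifacts` (the variable name `results` and the processor name are illustrative, not from this diff):

    # Display record 0 along with the artifacts of one named processor;
    # processors_to_display=None (the default) displays all processors.
    results.display_sample_record(
        index=0,
        processors_to_display=["schema_transform"],
    )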
data_designer/config/validator_params.py

@@ -7,8 +7,8 @@ from typing import Any, Optional, Union
 from pydantic import Field, field_serializer, model_validator
 from typing_extensions import Self, TypeAlias
 
-from .base import ConfigBase
-from .utils.code_lang import SQL_DIALECTS, CodeLang
+from data_designer.config.base import ConfigBase
+from data_designer.config.utils.code_lang import SQL_DIALECTS, CodeLang
 
 SUPPORTED_CODE_LANGUAGES = {CodeLang.PYTHON, *SQL_DIALECTS}
 

data_designer/engine/analysis/column_profilers/base.py

@@ -3,8 +3,8 @@
 
 from __future__ import annotations
 
-from abc import ABC, abstractmethod
 import logging
+from abc import ABC, abstractmethod
 
 import pandas as pd
 import pyarrow as pa

data_designer/engine/analysis/dataset_profiler.py

@@ -1,9 +1,9 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+import logging
 from collections.abc import Sequence
 from functools import cached_property
-import logging
 
 import pandas as pd
 from pydantic import Field, field_validator

data_designer/engine/analysis/utils/judge_score_processing.py

@@ -1,8 +1,8 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-from collections import defaultdict
 import logging
+from collections import defaultdict
 from typing import Any, Optional, Union
 
 import pandas as pd

data_designer/engine/column_generators/generators/samplers.py

@@ -1,9 +1,9 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-from functools import partial
 import logging
 import random
+from functools import partial
 from typing import Callable
 
 import pandas as pd

data_designer/engine/dataset_builders/artifact_storage.py

@@ -1,12 +1,12 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-from datetime import datetime
-from functools import cached_property
 import json
 import logging
-from pathlib import Path
 import shutil
+from datetime import datetime
+from functools import cached_property
+from pathlib import Path
 from typing import Union
 
 import pandas as pd
@@ -25,6 +25,7 @@ class BatchStage(StrEnum):
     PARTIAL_RESULT = "partial_results_path"
     FINAL_RESULT = "final_dataset_path"
     DROPPED_COLUMNS = "dropped_columns_dataset_path"
+    PROCESSORS_OUTPUTS = "processors_outputs_path"
 
 
 class ArtifactStorage(BaseModel):
@@ -33,6 +34,7 @@ class ArtifactStorage(BaseModel):
     final_dataset_folder_name: str = "parquet-files"
     partial_results_folder_name: str = "tmp-partial-parquet-files"
     dropped_columns_folder_name: str = "dropped-columns-parquet-files"
+    processors_outputs_folder_name: str = "processors-files"
 
     @property
     def artifact_path_exists(self) -> bool:
@@ -70,6 +72,10 @@ class ArtifactStorage(BaseModel):
     def partial_results_path(self) -> Path:
         return self.base_dataset_path / self.partial_results_folder_name
 
+    @property
+    def processors_outputs_path(self) -> Path:
+        return self.base_dataset_path / self.processors_outputs_folder_name
+
     @field_validator("artifact_path")
     def validate_artifact_path(cls, v: Union[Path, str]) -> Path:
         v = Path(v)
@@ -84,6 +90,7 @@ class ArtifactStorage(BaseModel):
             self.final_dataset_folder_name,
             self.partial_results_folder_name,
             self.dropped_columns_folder_name,
+            self.processors_outputs_folder_name,
         ]
 
         for name in folder_names:
@@ -169,9 +176,10 @@ class ArtifactStorage(BaseModel):
         batch_number: int,
         dataframe: pd.DataFrame,
         batch_stage: BatchStage,
+        subfolder: str | None = None,
     ) -> Path:
         file_path = self.create_batch_file_path(batch_number, batch_stage=batch_stage)
-        self.write_parquet_file(file_path.name, dataframe, batch_stage)
+        self.write_parquet_file(file_path.name, dataframe, batch_stage, subfolder=subfolder)
         return file_path
 
     def write_parquet_file(
@@ -179,9 +187,11 @@ class ArtifactStorage(BaseModel):
         parquet_file_name: str,
         dataframe: pd.DataFrame,
         batch_stage: BatchStage,
+        subfolder: str | None = None,
     ) -> Path:
-        self.mkdir_if_needed(self._get_stage_path(batch_stage))
-        file_path = self._get_stage_path(batch_stage) / parquet_file_name
+        subfolder = subfolder or ""
+        self.mkdir_if_needed(self._get_stage_path(batch_stage) / subfolder)
+        file_path = self._get_stage_path(batch_stage) / subfolder / parquet_file_name
         dataframe.to_parquet(file_path, index=False)
         return file_path
 
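
Note on the artifact_storage.py changes above: the new PROCESSORS_OUTPUTS stage plus the `subfolder` argument give each processor its own output directory. A sketch of the on-disk layout under the base dataset path, using the default folder names from this diff (the processor name and batch file name are illustrative):

    <base_dataset_path>/
        parquet-files/                   # BatchStage.FINAL_RESULT
        tmp-partial-parquet-files/       # BatchStage.PARTIAL_RESULT
        dropped-columns-parquet-files/   # BatchStage.DROPPED_COLUMNS
        processors-files/                # BatchStage.PROCESSORS_OUTPUTS (new in 0.1.5)
            schema_transform/            # subfolder = processor config name (batch mode)
                <batch file>.parquet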
data_designer/engine/dataset_builders/column_wise_builder.py

@@ -4,8 +4,8 @@
 import functools
 import json
 import logging
-from pathlib import Path
 import time
+from pathlib import Path
 from typing import Callable
 
 import pandas as pd
@@ -171,6 +171,8 @@ class ColumnWiseDatasetBuilder:
         max_workers = MAX_CONCURRENCY_PER_NON_LLM_GENERATOR
         if isinstance(generator, WithLLMGeneration):
            max_workers = generator.inference_parameters.max_parallel_requests
+        elif hasattr(generator.config, "max_parallel_requests"):
+            max_workers = generator.config.max_parallel_requests
         self._fan_out_with_threads(generator, max_workers=max_workers)
 
     def _run_full_column_generator(self, generator: ColumnGenerator) -> None:
@@ -244,6 +246,7 @@
             processors[BuildStage.POST_BATCH].append(  # as post-batch by default
                 DropColumnsProcessor(
                     config=DropColumnsProcessorConfig(
+                        name="default_drop_columns_processor",
                         column_names=columns_to_drop,
                         build_stage=BuildStage.POST_BATCH,
                     ),
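
Note on the hunk above: the builder-injected drop-columns processor now carries an explicit `name`, matching the pattern elsewhere in this release of processor names doubling as artifact subfolder names. A hedged sketch of building the same config directly, using only the fields visible in this diff (the `BuildStage` import path is not shown here, and the name and column values are illustrative):

    from data_designer.config.processors import DropColumnsProcessorConfig

    config = DropColumnsProcessorConfig(
        name="my_drop_columns",
        column_names=["scratch_column"],
        build_stage=BuildStage.POST_BATCH,
    )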
data_designer/engine/dataset_builders/utils/concurrency.py

@@ -3,10 +3,10 @@
 
 from __future__ import annotations
 
-from concurrent.futures import Future, ThreadPoolExecutor
 import contextvars
 import json
 import logging
+from concurrent.futures import Future, ThreadPoolExecutor
 from threading import Lock, Semaphore
 from typing import Any, Optional, Protocol
 

data_designer/engine/dataset_builders/utils/dataset_batch_manager.py

@@ -2,8 +2,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import logging
-from pathlib import Path
 import shutil
+from pathlib import Path
 from typing import Callable, Container, Iterator
 
 import pandas as pd

data_designer/engine/errors.py

@@ -3,7 +3,7 @@
 
 from pydantic import BaseModel, Field
 
-from ..errors import DataDesignerError
+from data_designer.errors import DataDesignerError
 
 
 class DataDesignerRuntimeError(DataDesignerError): ...

data_designer/engine/models/errors.py

@@ -3,9 +3,9 @@
 
 from __future__ import annotations
 
+import logging
 from collections.abc import Callable
 from functools import wraps
-import logging
 from typing import Any
 
 from litellm.exceptions import (

data_designer/engine/models/facade.py

@@ -3,9 +3,9 @@
 
 from __future__ import annotations
 
+import logging
 from collections.abc import Callable
 from copy import deepcopy
-import logging
 from typing import Any
 
 from litellm.types.router import DeploymentTypedDict, LiteLLM_Params

data_designer/engine/models/parsers/parser.py

@@ -4,12 +4,12 @@
 from functools import reduce
 from typing import Optional
 
+import marko
 from lxml import etree
 from lxml.etree import _Element
-import marko
 
-from data_designer.engine.models.parsers.postprocessors import merge_text_blocks
 import data_designer.engine.models.parsers.tag_parsers as tp
+from data_designer.engine.models.parsers.postprocessors import merge_text_blocks
 from data_designer.engine.models.parsers.types import (
     LLMStructuredResponse,
     PostProcessor,

data_designer/engine/models/recipes/response_recipes.py

@@ -1,8 +1,8 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-from collections.abc import Callable
 import json
+from collections.abc import Callable
 
 from pydantic import BaseModel
 

data_designer/engine/processing/ginja/environment.py

@@ -1,9 +1,9 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+import re
 from collections.abc import Callable
 from functools import partial, wraps
-import re
 from typing import Any
 
 from jinja2 import meta

data_designer/engine/processing/gsonschema/validators.py

@@ -1,8 +1,8 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-from copy import deepcopy
 import logging
+from copy import deepcopy
 from typing import Any, overload
 
 from jsonschema import Draft202012Validator, ValidationError, validators

data_designer/engine/processing/processors/drop_columns.py

@@ -17,7 +17,7 @@ class DropColumnsProcessor(Processor[DropColumnsProcessorConfig]):
     @staticmethod
     def metadata() -> ConfigurableTaskMetadata:
         return ConfigurableTaskMetadata(
-            name="drop_columns",
+            name="drop_columns_processor",
             description="Drop columns from the input dataset.",
             required_resources=None,
         )

data_designer/engine/processing/processors/registry.py

@@ -5,9 +5,11 @@ from data_designer.config.base import ConfigBase
 from data_designer.config.processors import (
     DropColumnsProcessorConfig,
     ProcessorType,
+    SchemaTransformProcessorConfig,
 )
 from data_designer.engine.processing.processors.base import Processor
 from data_designer.engine.processing.processors.drop_columns import DropColumnsProcessor
+from data_designer.engine.processing.processors.schema_transform import SchemaTransformProcessor
 from data_designer.engine.registry.base import TaskRegistry
 
 
@@ -16,5 +18,6 @@ class ProcessorRegistry(TaskRegistry[str, Processor, ConfigBase]): ...
 
 def create_default_processor_registry() -> ProcessorRegistry:
     registry = ProcessorRegistry()
+    registry.register(ProcessorType.SCHEMA_TRANSFORM, SchemaTransformProcessor, SchemaTransformProcessorConfig, False)
     registry.register(ProcessorType.DROP_COLUMNS, DropColumnsProcessor, DropColumnsProcessorConfig, False)
     return registry
data_designer/engine/processing/processors/schema_transform.py (new file)

@@ -0,0 +1,53 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import logging
+
+import pandas as pd
+
+from data_designer.config.processors import SchemaTransformProcessorConfig
+from data_designer.engine.configurable_task import ConfigurableTaskMetadata
+from data_designer.engine.dataset_builders.artifact_storage import BatchStage
+from data_designer.engine.processing.ginja.environment import WithJinja2UserTemplateRendering
+from data_designer.engine.processing.processors.base import Processor
+from data_designer.engine.processing.utils import deserialize_json_values
+
+logger = logging.getLogger(__name__)
+
+
+class SchemaTransformProcessor(WithJinja2UserTemplateRendering, Processor[SchemaTransformProcessorConfig]):
+    @staticmethod
+    def metadata() -> ConfigurableTaskMetadata:
+        return ConfigurableTaskMetadata(
+            name="schema_transform_processor",
+            description="Generate dataset with transformed schema using a Jinja2 template.",
+            required_resources=None,
+        )
+
+    @property
+    def template_as_str(self) -> str:
+        return json.dumps(self.config.template)
+
+    def process(self, data: pd.DataFrame, *, current_batch_number: int | None = None) -> pd.DataFrame:
+        self.prepare_jinja2_template_renderer(self.template_as_str, data.columns.to_list())
+        formatted_records = [
+            json.loads(self.render_template(deserialize_json_values(record)).replace("\n", "\\n"))
+            for record in data.to_dict(orient="records")
+        ]
+        formatted_data = pd.DataFrame(formatted_records)
+        if current_batch_number is not None:
+            self.artifact_storage.write_batch_to_parquet_file(
+                batch_number=current_batch_number,
+                dataframe=formatted_data,
+                batch_stage=BatchStage.PROCESSORS_OUTPUTS,
+                subfolder=self.config.name,
+            )
+        else:
+            self.artifact_storage.write_parquet_file(
+                parquet_file_name=f"{self.config.name}.parquet",
+                dataframe=formatted_data,
+                batch_stage=BatchStage.PROCESSORS_OUTPUTS,
+            )
+
+        return data
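
Note on the new processor above: `config.template` is a JSON-serializable structure whose string values may embed Jinja2 expressions referencing dataset columns; `process` serializes it with `json.dumps`, renders it once per record, and collects the rendered JSON documents into the transformed dataset, while returning the input `data` unchanged. A hedged sketch of a config, assuming only the `name` and `template` fields visible in this diff (the column names and chat-style schema are illustrative):

    from data_designer.config.processors import SchemaTransformProcessorConfig

    config = SchemaTransformProcessorConfig(
        name="chat_format",  # also used as the artifacts subfolder name
        template={
            "messages": [
                {"role": "user", "content": "{{ question }}"},
                {"role": "assistant", "content": "{{ answer }}"},
            ]
        },
    )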
data_designer/engine/resources/managed_dataset_repository.py

@@ -1,14 +1,14 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-from abc import ABC, abstractmethod
-from dataclasses import dataclass
-from functools import cached_property
 import logging
-from pathlib import Path
 import tempfile
 import threading
 import time
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from functools import cached_property
+from pathlib import Path
 from typing import Any
 
 import duckdb

data_designer/engine/resources/managed_storage.py

@@ -1,10 +1,10 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+import logging
 from abc import ABC, abstractmethod
 from collections.abc import Iterator
 from contextlib import contextmanager
-import logging
 from pathlib import Path
 from typing import IO
 

data_designer/engine/sampling_gen/constraints.py

@@ -5,8 +5,8 @@ from abc import ABC, abstractmethod
 from typing import Type
 
 import numpy as np
-from numpy.typing import NDArray
 import pandas as pd
+from numpy.typing import NDArray
 
 from data_designer.config.base import ConfigBase
 from data_designer.config.sampler_constraints import (

data_designer/engine/sampling_gen/data_sources/base.py

@@ -5,8 +5,8 @@ from abc import ABC, abstractmethod
 from typing import Any, Generic, Optional, Type, TypeVar, Union
 
 import numpy as np
-from numpy.typing import NDArray
 import pandas as pd
+from numpy.typing import NDArray
 from scipy import stats
 
 from data_designer.config.sampler_params import SamplerParamsT

data_designer/engine/sampling_gen/entities/email_address_utils.py

@@ -1,9 +1,9 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-from datetime import date
 import random
 import re
+from datetime import date
 
 import anyascii
 

data_designer/engine/sampling_gen/entities/national_id_utils.py

@@ -1,8 +1,8 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-from datetime import date
 import random
+from datetime import date
 
 SSN_RANDOMIZATION_DATE = date(2011, 6, 25)
 

data_designer/engine/sampling_gen/entities/person.py

@@ -1,8 +1,8 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-from datetime import date, timedelta
 import random
+from datetime import date, timedelta
 from typing import Any, Literal, TypeAlias
 
 from data_designer.config.utils.constants import LOCALES_WITH_MANAGED_DATASETS

data_designer/engine/sampling_gen/entities/phone_number.py

@@ -1,8 +1,8 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-from pathlib import Path
 import random
+from pathlib import Path
 from typing import Optional
 
 import pandas as pd

data_designer/engine/sampling_gen/people_gen.py

@@ -3,15 +3,15 @@
 
 from __future__ import annotations
 
+import random
+import uuid
 from abc import ABC, abstractmethod
 from collections.abc import Callable
 from copy import deepcopy
-import random
 from typing import TYPE_CHECKING, Any, Union
-import uuid
 
-from faker import Faker
 import pandas as pd
+from faker import Faker
 
 from data_designer.config.utils.constants import AVAILABLE_LOCALES, DEFAULT_AGE_RANGE
 from data_designer.engine.resources.managed_dataset_generator import ManagedDatasetGenerator

data_designer/engine/secret_resolver.py

@@ -1,10 +1,10 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-from collections.abc import Sequence
 import json
 import logging
 import os
+from collections.abc import Sequence
 from pathlib import Path
 from typing import Protocol
 

data_designer/engine/validators/python.py

@@ -2,12 +2,12 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import ast
-from collections import defaultdict
 import logging
-from pathlib import Path
 import re
 import subprocess
 import tempfile
+from collections import defaultdict
+from pathlib import Path
 from uuid import uuid4
 
 import pandas as pd