data-designer 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/_version.py +2 -2
- data_designer/config/analysis/column_profilers.py +4 -4
- data_designer/config/analysis/column_statistics.py +5 -5
- data_designer/config/analysis/dataset_profiler.py +6 -6
- data_designer/config/analysis/utils/errors.py +1 -1
- data_designer/config/analysis/utils/reporting.py +5 -5
- data_designer/config/base.py +2 -2
- data_designer/config/column_configs.py +8 -8
- data_designer/config/column_types.py +9 -5
- data_designer/config/config_builder.py +32 -27
- data_designer/config/data_designer_config.py +7 -7
- data_designer/config/datastore.py +4 -4
- data_designer/config/default_model_settings.py +4 -4
- data_designer/config/errors.py +1 -1
- data_designer/config/exports.py +133 -0
- data_designer/config/interface.py +6 -6
- data_designer/config/models.py +109 -5
- data_designer/config/preview_results.py +9 -6
- data_designer/config/processors.py +48 -4
- data_designer/config/sampler_constraints.py +1 -1
- data_designer/config/sampler_params.py +2 -2
- data_designer/config/seed.py +3 -3
- data_designer/config/utils/constants.py +1 -1
- data_designer/config/utils/errors.py +1 -1
- data_designer/config/utils/info.py +8 -4
- data_designer/config/utils/io_helpers.py +5 -5
- data_designer/config/utils/misc.py +3 -3
- data_designer/config/utils/numerical_helpers.py +1 -1
- data_designer/config/utils/type_helpers.py +7 -3
- data_designer/config/utils/validation.py +37 -6
- data_designer/config/utils/visualization.py +42 -10
- data_designer/config/validator_params.py +2 -2
- data_designer/engine/analysis/column_profilers/base.py +1 -1
- data_designer/engine/analysis/dataset_profiler.py +1 -1
- data_designer/engine/analysis/utils/judge_score_processing.py +1 -1
- data_designer/engine/column_generators/generators/samplers.py +1 -1
- data_designer/engine/dataset_builders/artifact_storage.py +16 -6
- data_designer/engine/dataset_builders/column_wise_builder.py +4 -1
- data_designer/engine/dataset_builders/utils/concurrency.py +1 -1
- data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +1 -1
- data_designer/engine/errors.py +1 -1
- data_designer/engine/models/errors.py +1 -1
- data_designer/engine/models/facade.py +1 -1
- data_designer/engine/models/parsers/parser.py +2 -2
- data_designer/engine/models/recipes/response_recipes.py +1 -1
- data_designer/engine/processing/ginja/environment.py +1 -1
- data_designer/engine/processing/gsonschema/validators.py +1 -1
- data_designer/engine/processing/processors/drop_columns.py +1 -1
- data_designer/engine/processing/processors/registry.py +3 -0
- data_designer/engine/processing/processors/schema_transform.py +53 -0
- data_designer/engine/resources/managed_dataset_repository.py +4 -4
- data_designer/engine/resources/managed_storage.py +1 -1
- data_designer/engine/sampling_gen/constraints.py +1 -1
- data_designer/engine/sampling_gen/data_sources/base.py +1 -1
- data_designer/engine/sampling_gen/entities/email_address_utils.py +1 -1
- data_designer/engine/sampling_gen/entities/national_id_utils.py +1 -1
- data_designer/engine/sampling_gen/entities/person.py +1 -1
- data_designer/engine/sampling_gen/entities/phone_number.py +1 -1
- data_designer/engine/sampling_gen/people_gen.py +3 -3
- data_designer/engine/secret_resolver.py +1 -1
- data_designer/engine/validators/python.py +2 -2
- data_designer/essentials/__init__.py +20 -128
- data_designer/interface/data_designer.py +23 -19
- data_designer/interface/results.py +36 -0
- data_designer/logging.py +2 -2
- data_designer/plugin_manager.py +14 -26
- data_designer/plugins/registry.py +1 -1
- {data_designer-0.1.3.dist-info → data_designer-0.1.5.dist-info}/METADATA +9 -9
- {data_designer-0.1.3.dist-info → data_designer-0.1.5.dist-info}/RECORD +72 -70
- {data_designer-0.1.3.dist-info → data_designer-0.1.5.dist-info}/WHEEL +0 -0
- {data_designer-0.1.3.dist-info → data_designer-0.1.5.dist-info}/entry_points.txt +0 -0
- {data_designer-0.1.3.dist-info → data_designer-0.1.5.dist-info}/licenses/LICENSE +0 -0
|
@@ -3,11 +3,11 @@
|
|
|
3
3
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
|
+
import json
|
|
7
|
+
import os
|
|
6
8
|
from collections import OrderedDict
|
|
7
9
|
from enum import Enum
|
|
8
10
|
from functools import cached_property
|
|
9
|
-
import json
|
|
10
|
-
import os
|
|
11
11
|
from typing import TYPE_CHECKING, Optional, Union
|
|
12
12
|
|
|
13
13
|
import numpy as np
|
|
@@ -21,16 +21,16 @@ from rich.syntax import Syntax
|
|
|
21
21
|
from rich.table import Table
|
|
22
22
|
from rich.text import Text
|
|
23
23
|
|
|
24
|
-
from
|
|
25
|
-
from
|
|
26
|
-
from
|
|
27
|
-
from
|
|
28
|
-
from .code_lang import code_lang_to_syntax_lexer
|
|
29
|
-
from .constants import NVIDIA_API_KEY_ENV_VAR_NAME, OPENAI_API_KEY_ENV_VAR_NAME
|
|
30
|
-
from .errors import DatasetSampleDisplayError
|
|
24
|
+
from data_designer.config.base import ConfigBase
|
|
25
|
+
from data_designer.config.column_types import DataDesignerColumnType
|
|
26
|
+
from data_designer.config.models import ModelConfig, ModelProvider
|
|
27
|
+
from data_designer.config.sampler_params import SamplerType
|
|
28
|
+
from data_designer.config.utils.code_lang import code_lang_to_syntax_lexer
|
|
29
|
+
from data_designer.config.utils.constants import NVIDIA_API_KEY_ENV_VAR_NAME, OPENAI_API_KEY_ENV_VAR_NAME
|
|
30
|
+
from data_designer.config.utils.errors import DatasetSampleDisplayError
|
|
31
31
|
|
|
32
32
|
if TYPE_CHECKING:
|
|
33
|
-
from
|
|
33
|
+
from data_designer.config.config_builder import DataDesignerConfigBuilder
|
|
34
34
|
|
|
35
35
|
|
|
36
36
|
console = Console()
|
|
@@ -72,6 +72,9 @@ class WithRecordSamplerMixin:
|
|
|
72
72
|
else:
|
|
73
73
|
raise DatasetSampleDisplayError("No valid dataset found in results object.")
|
|
74
74
|
|
|
75
|
+
def _has_processor_artifacts(self) -> bool:
|
|
76
|
+
return hasattr(self, "processor_artifacts") and self.processor_artifacts is not None
|
|
77
|
+
|
|
75
78
|
def display_sample_record(
|
|
76
79
|
self,
|
|
77
80
|
index: Optional[int] = None,
|
|
@@ -79,6 +82,7 @@ class WithRecordSamplerMixin:
|
|
|
79
82
|
hide_seed_columns: bool = False,
|
|
80
83
|
syntax_highlighting_theme: str = "dracula",
|
|
81
84
|
background_color: Optional[str] = None,
|
|
85
|
+
processors_to_display: Optional[list[str]] = None,
|
|
82
86
|
) -> None:
|
|
83
87
|
"""Display a sample record from the Data Designer dataset preview.
|
|
84
88
|
|
|
@@ -90,6 +94,7 @@ class WithRecordSamplerMixin:
|
|
|
90
94
|
documentation from `rich` for information about available themes.
|
|
91
95
|
background_color: Background color to use for the record. See the `Syntax`
|
|
92
96
|
documentation from `rich` for information about available background colors.
|
|
97
|
+
processors_to_display: List of processors to display the artifacts for. If None, all processors will be displayed.
|
|
93
98
|
"""
|
|
94
99
|
i = index or self._display_cycle_index
|
|
95
100
|
|
|
@@ -99,8 +104,25 @@ class WithRecordSamplerMixin:
|
|
|
99
104
|
except IndexError:
|
|
100
105
|
raise DatasetSampleDisplayError(f"Index {i} is out of bounds for dataset of length {num_records}.")
|
|
101
106
|
|
|
107
|
+
processor_data_to_display = None
|
|
108
|
+
if self._has_processor_artifacts() and len(self.processor_artifacts) > 0:
|
|
109
|
+
if processors_to_display is None:
|
|
110
|
+
processors_to_display = list(self.processor_artifacts.keys())
|
|
111
|
+
|
|
112
|
+
if len(processors_to_display) > 0:
|
|
113
|
+
processor_data_to_display = {}
|
|
114
|
+
for processor in processors_to_display:
|
|
115
|
+
if (
|
|
116
|
+
isinstance(self.processor_artifacts[processor], list)
|
|
117
|
+
and len(self.processor_artifacts[processor]) == num_records
|
|
118
|
+
):
|
|
119
|
+
processor_data_to_display[processor] = self.processor_artifacts[processor][i]
|
|
120
|
+
else:
|
|
121
|
+
processor_data_to_display[processor] = self.processor_artifacts[processor]
|
|
122
|
+
|
|
102
123
|
display_sample_record(
|
|
103
124
|
record=record,
|
|
125
|
+
processor_data_to_display=processor_data_to_display,
|
|
104
126
|
config_builder=self._config_builder,
|
|
105
127
|
background_color=background_color,
|
|
106
128
|
syntax_highlighting_theme=syntax_highlighting_theme,
|
|
@@ -134,6 +156,7 @@ def create_rich_histogram_table(
|
|
|
134
156
|
def display_sample_record(
|
|
135
157
|
record: Union[dict, pd.Series, pd.DataFrame],
|
|
136
158
|
config_builder: DataDesignerConfigBuilder,
|
|
159
|
+
processor_data_to_display: Optional[dict[str, Union[list[str], str]]] = None,
|
|
137
160
|
background_color: Optional[str] = None,
|
|
138
161
|
syntax_highlighting_theme: str = "dracula",
|
|
139
162
|
record_index: Optional[int] = None,
|
|
@@ -230,6 +253,15 @@ def display_sample_record(
|
|
|
230
253
|
table.add_row(*row)
|
|
231
254
|
render_list.append(pad_console_element(table, (1, 0, 1, 0)))
|
|
232
255
|
|
|
256
|
+
if processor_data_to_display and len(processor_data_to_display) > 0:
|
|
257
|
+
for processor_name, processor_data in processor_data_to_display.items():
|
|
258
|
+
table = Table(title=f"Processor Outputs: {processor_name}", **table_kws)
|
|
259
|
+
table.add_column("Name")
|
|
260
|
+
table.add_column("Value")
|
|
261
|
+
for col, value in processor_data.items():
|
|
262
|
+
table.add_row(col, convert_to_row_element(value))
|
|
263
|
+
render_list.append(pad_console_element(table, (1, 0, 1, 0)))
|
|
264
|
+
|
|
233
265
|
if record_index is not None:
|
|
234
266
|
index_label = Text(f"[index: {record_index}]", justify="center")
|
|
235
267
|
render_list.append(index_label)
|
|
@@ -7,8 +7,8 @@ from typing import Any, Optional, Union
|
|
|
7
7
|
from pydantic import Field, field_serializer, model_validator
|
|
8
8
|
from typing_extensions import Self, TypeAlias
|
|
9
9
|
|
|
10
|
-
from .base import ConfigBase
|
|
11
|
-
from .utils.code_lang import SQL_DIALECTS, CodeLang
|
|
10
|
+
from data_designer.config.base import ConfigBase
|
|
11
|
+
from data_designer.config.utils.code_lang import SQL_DIALECTS, CodeLang
|
|
12
12
|
|
|
13
13
|
SUPPORTED_CODE_LANGUAGES = {CodeLang.PYTHON, *SQL_DIALECTS}
|
|
14
14
|
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
import logging
|
|
4
5
|
from collections.abc import Sequence
|
|
5
6
|
from functools import cached_property
|
|
6
|
-
import logging
|
|
7
7
|
|
|
8
8
|
import pandas as pd
|
|
9
9
|
from pydantic import Field, field_validator
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from collections import defaultdict
|
|
5
4
|
import logging
|
|
5
|
+
from collections import defaultdict
|
|
6
6
|
from typing import Any, Optional, Union
|
|
7
7
|
|
|
8
8
|
import pandas as pd
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from functools import partial
|
|
5
4
|
import logging
|
|
6
5
|
import random
|
|
6
|
+
from functools import partial
|
|
7
7
|
from typing import Callable
|
|
8
8
|
|
|
9
9
|
import pandas as pd
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from datetime import datetime
|
|
5
|
-
from functools import cached_property
|
|
6
4
|
import json
|
|
7
5
|
import logging
|
|
8
|
-
from pathlib import Path
|
|
9
6
|
import shutil
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from functools import cached_property
|
|
9
|
+
from pathlib import Path
|
|
10
10
|
from typing import Union
|
|
11
11
|
|
|
12
12
|
import pandas as pd
|
|
@@ -25,6 +25,7 @@ class BatchStage(StrEnum):
|
|
|
25
25
|
PARTIAL_RESULT = "partial_results_path"
|
|
26
26
|
FINAL_RESULT = "final_dataset_path"
|
|
27
27
|
DROPPED_COLUMNS = "dropped_columns_dataset_path"
|
|
28
|
+
PROCESSORS_OUTPUTS = "processors_outputs_path"
|
|
28
29
|
|
|
29
30
|
|
|
30
31
|
class ArtifactStorage(BaseModel):
|
|
@@ -33,6 +34,7 @@ class ArtifactStorage(BaseModel):
|
|
|
33
34
|
final_dataset_folder_name: str = "parquet-files"
|
|
34
35
|
partial_results_folder_name: str = "tmp-partial-parquet-files"
|
|
35
36
|
dropped_columns_folder_name: str = "dropped-columns-parquet-files"
|
|
37
|
+
processors_outputs_folder_name: str = "processors-files"
|
|
36
38
|
|
|
37
39
|
@property
|
|
38
40
|
def artifact_path_exists(self) -> bool:
|
|
@@ -70,6 +72,10 @@ class ArtifactStorage(BaseModel):
|
|
|
70
72
|
def partial_results_path(self) -> Path:
|
|
71
73
|
return self.base_dataset_path / self.partial_results_folder_name
|
|
72
74
|
|
|
75
|
+
@property
|
|
76
|
+
def processors_outputs_path(self) -> Path:
|
|
77
|
+
return self.base_dataset_path / self.processors_outputs_folder_name
|
|
78
|
+
|
|
73
79
|
@field_validator("artifact_path")
|
|
74
80
|
def validate_artifact_path(cls, v: Union[Path, str]) -> Path:
|
|
75
81
|
v = Path(v)
|
|
@@ -84,6 +90,7 @@ class ArtifactStorage(BaseModel):
|
|
|
84
90
|
self.final_dataset_folder_name,
|
|
85
91
|
self.partial_results_folder_name,
|
|
86
92
|
self.dropped_columns_folder_name,
|
|
93
|
+
self.processors_outputs_folder_name,
|
|
87
94
|
]
|
|
88
95
|
|
|
89
96
|
for name in folder_names:
|
|
@@ -169,9 +176,10 @@ class ArtifactStorage(BaseModel):
|
|
|
169
176
|
batch_number: int,
|
|
170
177
|
dataframe: pd.DataFrame,
|
|
171
178
|
batch_stage: BatchStage,
|
|
179
|
+
subfolder: str | None = None,
|
|
172
180
|
) -> Path:
|
|
173
181
|
file_path = self.create_batch_file_path(batch_number, batch_stage=batch_stage)
|
|
174
|
-
self.write_parquet_file(file_path.name, dataframe, batch_stage)
|
|
182
|
+
self.write_parquet_file(file_path.name, dataframe, batch_stage, subfolder=subfolder)
|
|
175
183
|
return file_path
|
|
176
184
|
|
|
177
185
|
def write_parquet_file(
|
|
@@ -179,9 +187,11 @@ class ArtifactStorage(BaseModel):
|
|
|
179
187
|
parquet_file_name: str,
|
|
180
188
|
dataframe: pd.DataFrame,
|
|
181
189
|
batch_stage: BatchStage,
|
|
190
|
+
subfolder: str | None = None,
|
|
182
191
|
) -> Path:
|
|
183
|
-
|
|
184
|
-
|
|
192
|
+
subfolder = subfolder or ""
|
|
193
|
+
self.mkdir_if_needed(self._get_stage_path(batch_stage) / subfolder)
|
|
194
|
+
file_path = self._get_stage_path(batch_stage) / subfolder / parquet_file_name
|
|
185
195
|
dataframe.to_parquet(file_path, index=False)
|
|
186
196
|
return file_path
|
|
187
197
|
|
|
@@ -4,8 +4,8 @@
|
|
|
4
4
|
import functools
|
|
5
5
|
import json
|
|
6
6
|
import logging
|
|
7
|
-
from pathlib import Path
|
|
8
7
|
import time
|
|
8
|
+
from pathlib import Path
|
|
9
9
|
from typing import Callable
|
|
10
10
|
|
|
11
11
|
import pandas as pd
|
|
@@ -171,6 +171,8 @@ class ColumnWiseDatasetBuilder:
|
|
|
171
171
|
max_workers = MAX_CONCURRENCY_PER_NON_LLM_GENERATOR
|
|
172
172
|
if isinstance(generator, WithLLMGeneration):
|
|
173
173
|
max_workers = generator.inference_parameters.max_parallel_requests
|
|
174
|
+
elif hasattr(generator.config, "max_parallel_requests"):
|
|
175
|
+
max_workers = generator.config.max_parallel_requests
|
|
174
176
|
self._fan_out_with_threads(generator, max_workers=max_workers)
|
|
175
177
|
|
|
176
178
|
def _run_full_column_generator(self, generator: ColumnGenerator) -> None:
|
|
@@ -244,6 +246,7 @@ class ColumnWiseDatasetBuilder:
|
|
|
244
246
|
processors[BuildStage.POST_BATCH].append( # as post-batch by default
|
|
245
247
|
DropColumnsProcessor(
|
|
246
248
|
config=DropColumnsProcessorConfig(
|
|
249
|
+
name="default_drop_columns_processor",
|
|
247
250
|
column_names=columns_to_drop,
|
|
248
251
|
build_stage=BuildStage.POST_BATCH,
|
|
249
252
|
),
|
|
@@ -3,10 +3,10 @@
|
|
|
3
3
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
|
-
from concurrent.futures import Future, ThreadPoolExecutor
|
|
7
6
|
import contextvars
|
|
8
7
|
import json
|
|
9
8
|
import logging
|
|
9
|
+
from concurrent.futures import Future, ThreadPoolExecutor
|
|
10
10
|
from threading import Lock, Semaphore
|
|
11
11
|
from typing import Any, Optional, Protocol
|
|
12
12
|
|
data_designer/engine/errors.py
CHANGED
|
@@ -4,12 +4,12 @@
|
|
|
4
4
|
from functools import reduce
|
|
5
5
|
from typing import Optional
|
|
6
6
|
|
|
7
|
+
import marko
|
|
7
8
|
from lxml import etree
|
|
8
9
|
from lxml.etree import _Element
|
|
9
|
-
import marko
|
|
10
10
|
|
|
11
|
-
from data_designer.engine.models.parsers.postprocessors import merge_text_blocks
|
|
12
11
|
import data_designer.engine.models.parsers.tag_parsers as tp
|
|
12
|
+
from data_designer.engine.models.parsers.postprocessors import merge_text_blocks
|
|
13
13
|
from data_designer.engine.models.parsers.types import (
|
|
14
14
|
LLMStructuredResponse,
|
|
15
15
|
PostProcessor,
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from collections.abc import Callable
|
|
5
4
|
import json
|
|
5
|
+
from collections.abc import Callable
|
|
6
6
|
|
|
7
7
|
from pydantic import BaseModel
|
|
8
8
|
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
import re
|
|
4
5
|
from collections.abc import Callable
|
|
5
6
|
from functools import partial, wraps
|
|
6
|
-
import re
|
|
7
7
|
from typing import Any
|
|
8
8
|
|
|
9
9
|
from jinja2 import meta
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from copy import deepcopy
|
|
5
4
|
import logging
|
|
5
|
+
from copy import deepcopy
|
|
6
6
|
from typing import Any, overload
|
|
7
7
|
|
|
8
8
|
from jsonschema import Draft202012Validator, ValidationError, validators
|
|
@@ -17,7 +17,7 @@ class DropColumnsProcessor(Processor[DropColumnsProcessorConfig]):
|
|
|
17
17
|
@staticmethod
|
|
18
18
|
def metadata() -> ConfigurableTaskMetadata:
|
|
19
19
|
return ConfigurableTaskMetadata(
|
|
20
|
-
name="
|
|
20
|
+
name="drop_columns_processor",
|
|
21
21
|
description="Drop columns from the input dataset.",
|
|
22
22
|
required_resources=None,
|
|
23
23
|
)
|
|
@@ -5,9 +5,11 @@ from data_designer.config.base import ConfigBase
|
|
|
5
5
|
from data_designer.config.processors import (
|
|
6
6
|
DropColumnsProcessorConfig,
|
|
7
7
|
ProcessorType,
|
|
8
|
+
SchemaTransformProcessorConfig,
|
|
8
9
|
)
|
|
9
10
|
from data_designer.engine.processing.processors.base import Processor
|
|
10
11
|
from data_designer.engine.processing.processors.drop_columns import DropColumnsProcessor
|
|
12
|
+
from data_designer.engine.processing.processors.schema_transform import SchemaTransformProcessor
|
|
11
13
|
from data_designer.engine.registry.base import TaskRegistry
|
|
12
14
|
|
|
13
15
|
|
|
@@ -16,5 +18,6 @@ class ProcessorRegistry(TaskRegistry[str, Processor, ConfigBase]): ...
|
|
|
16
18
|
|
|
17
19
|
def create_default_processor_registry() -> ProcessorRegistry:
|
|
18
20
|
registry = ProcessorRegistry()
|
|
21
|
+
registry.register(ProcessorType.SCHEMA_TRANSFORM, SchemaTransformProcessor, SchemaTransformProcessorConfig, False)
|
|
19
22
|
registry.register(ProcessorType.DROP_COLUMNS, DropColumnsProcessor, DropColumnsProcessorConfig, False)
|
|
20
23
|
return registry
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from data_designer.config.processors import SchemaTransformProcessorConfig
|
|
10
|
+
from data_designer.engine.configurable_task import ConfigurableTaskMetadata
|
|
11
|
+
from data_designer.engine.dataset_builders.artifact_storage import BatchStage
|
|
12
|
+
from data_designer.engine.processing.ginja.environment import WithJinja2UserTemplateRendering
|
|
13
|
+
from data_designer.engine.processing.processors.base import Processor
|
|
14
|
+
from data_designer.engine.processing.utils import deserialize_json_values
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class SchemaTransformProcessor(WithJinja2UserTemplateRendering, Processor[SchemaTransformProcessorConfig]):
|
|
20
|
+
@staticmethod
|
|
21
|
+
def metadata() -> ConfigurableTaskMetadata:
|
|
22
|
+
return ConfigurableTaskMetadata(
|
|
23
|
+
name="schema_transform_processor",
|
|
24
|
+
description="Generate dataset with transformed schema using a Jinja2 template.",
|
|
25
|
+
required_resources=None,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
@property
|
|
29
|
+
def template_as_str(self) -> str:
|
|
30
|
+
return json.dumps(self.config.template)
|
|
31
|
+
|
|
32
|
+
def process(self, data: pd.DataFrame, *, current_batch_number: int | None = None) -> pd.DataFrame:
|
|
33
|
+
self.prepare_jinja2_template_renderer(self.template_as_str, data.columns.to_list())
|
|
34
|
+
formatted_records = [
|
|
35
|
+
json.loads(self.render_template(deserialize_json_values(record)).replace("\n", "\\n"))
|
|
36
|
+
for record in data.to_dict(orient="records")
|
|
37
|
+
]
|
|
38
|
+
formatted_data = pd.DataFrame(formatted_records)
|
|
39
|
+
if current_batch_number is not None:
|
|
40
|
+
self.artifact_storage.write_batch_to_parquet_file(
|
|
41
|
+
batch_number=current_batch_number,
|
|
42
|
+
dataframe=formatted_data,
|
|
43
|
+
batch_stage=BatchStage.PROCESSORS_OUTPUTS,
|
|
44
|
+
subfolder=self.config.name,
|
|
45
|
+
)
|
|
46
|
+
else:
|
|
47
|
+
self.artifact_storage.write_parquet_file(
|
|
48
|
+
parquet_file_name=f"{self.config.name}.parquet",
|
|
49
|
+
dataframe=formatted_data,
|
|
50
|
+
batch_stage=BatchStage.PROCESSORS_OUTPUTS,
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
return data
|
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from abc import ABC, abstractmethod
|
|
5
|
-
from dataclasses import dataclass
|
|
6
|
-
from functools import cached_property
|
|
7
4
|
import logging
|
|
8
|
-
from pathlib import Path
|
|
9
5
|
import tempfile
|
|
10
6
|
import threading
|
|
11
7
|
import time
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from functools import cached_property
|
|
11
|
+
from pathlib import Path
|
|
12
12
|
from typing import Any
|
|
13
13
|
|
|
14
14
|
import duckdb
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
import logging
|
|
4
5
|
from abc import ABC, abstractmethod
|
|
5
6
|
from collections.abc import Iterator
|
|
6
7
|
from contextlib import contextmanager
|
|
7
|
-
import logging
|
|
8
8
|
from pathlib import Path
|
|
9
9
|
from typing import IO
|
|
10
10
|
|
|
@@ -5,8 +5,8 @@ from abc import ABC, abstractmethod
|
|
|
5
5
|
from typing import Type
|
|
6
6
|
|
|
7
7
|
import numpy as np
|
|
8
|
-
from numpy.typing import NDArray
|
|
9
8
|
import pandas as pd
|
|
9
|
+
from numpy.typing import NDArray
|
|
10
10
|
|
|
11
11
|
from data_designer.config.base import ConfigBase
|
|
12
12
|
from data_designer.config.sampler_constraints import (
|
|
@@ -5,8 +5,8 @@ from abc import ABC, abstractmethod
|
|
|
5
5
|
from typing import Any, Generic, Optional, Type, TypeVar, Union
|
|
6
6
|
|
|
7
7
|
import numpy as np
|
|
8
|
-
from numpy.typing import NDArray
|
|
9
8
|
import pandas as pd
|
|
9
|
+
from numpy.typing import NDArray
|
|
10
10
|
from scipy import stats
|
|
11
11
|
|
|
12
12
|
from data_designer.config.sampler_params import SamplerParamsT
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from datetime import date, timedelta
|
|
5
4
|
import random
|
|
5
|
+
from datetime import date, timedelta
|
|
6
6
|
from typing import Any, Literal, TypeAlias
|
|
7
7
|
|
|
8
8
|
from data_designer.config.utils.constants import LOCALES_WITH_MANAGED_DATASETS
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from pathlib import Path
|
|
5
4
|
import random
|
|
5
|
+
from pathlib import Path
|
|
6
6
|
from typing import Optional
|
|
7
7
|
|
|
8
8
|
import pandas as pd
|
|
@@ -3,15 +3,15 @@
|
|
|
3
3
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
|
+
import random
|
|
7
|
+
import uuid
|
|
6
8
|
from abc import ABC, abstractmethod
|
|
7
9
|
from collections.abc import Callable
|
|
8
10
|
from copy import deepcopy
|
|
9
|
-
import random
|
|
10
11
|
from typing import TYPE_CHECKING, Any, Union
|
|
11
|
-
import uuid
|
|
12
12
|
|
|
13
|
-
from faker import Faker
|
|
14
13
|
import pandas as pd
|
|
14
|
+
from faker import Faker
|
|
15
15
|
|
|
16
16
|
from data_designer.config.utils.constants import AVAILABLE_LOCALES, DEFAULT_AGE_RANGE
|
|
17
17
|
from data_designer.engine.resources.managed_dataset_generator import ManagedDatasetGenerator
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from collections.abc import Sequence
|
|
5
4
|
import json
|
|
6
5
|
import logging
|
|
7
6
|
import os
|
|
7
|
+
from collections.abc import Sequence
|
|
8
8
|
from pathlib import Path
|
|
9
9
|
from typing import Protocol
|
|
10
10
|
|
|
@@ -2,12 +2,12 @@
|
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
4
|
import ast
|
|
5
|
-
from collections import defaultdict
|
|
6
5
|
import logging
|
|
7
|
-
from pathlib import Path
|
|
8
6
|
import re
|
|
9
7
|
import subprocess
|
|
10
8
|
import tempfile
|
|
9
|
+
from collections import defaultdict
|
|
10
|
+
from pathlib import Path
|
|
11
11
|
from uuid import uuid4
|
|
12
12
|
|
|
13
13
|
import pandas as pd
|