data-designer 0.3.8rc1__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/cli/commands/__init__.py +1 -1
- data_designer/interface/__init__.py +21 -1
- data_designer/{_version.py → interface/_version.py} +2 -2
- data_designer/interface/data_designer.py +8 -11
- {data_designer-0.3.8rc1.dist-info → data_designer-0.4.0.dist-info}/METADATA +10 -42
- data_designer-0.4.0.dist-info/RECORD +39 -0
- data_designer/__init__.py +0 -17
- data_designer/config/__init__.py +0 -2
- data_designer/config/analysis/__init__.py +0 -2
- data_designer/config/analysis/column_profilers.py +0 -159
- data_designer/config/analysis/column_statistics.py +0 -421
- data_designer/config/analysis/dataset_profiler.py +0 -84
- data_designer/config/analysis/utils/errors.py +0 -10
- data_designer/config/analysis/utils/reporting.py +0 -192
- data_designer/config/base.py +0 -69
- data_designer/config/column_configs.py +0 -470
- data_designer/config/column_types.py +0 -141
- data_designer/config/config_builder.py +0 -595
- data_designer/config/data_designer_config.py +0 -40
- data_designer/config/dataset_builders.py +0 -13
- data_designer/config/dataset_metadata.py +0 -18
- data_designer/config/default_model_settings.py +0 -121
- data_designer/config/errors.py +0 -24
- data_designer/config/exports.py +0 -145
- data_designer/config/interface.py +0 -55
- data_designer/config/models.py +0 -455
- data_designer/config/preview_results.py +0 -41
- data_designer/config/processors.py +0 -148
- data_designer/config/run_config.py +0 -48
- data_designer/config/sampler_constraints.py +0 -52
- data_designer/config/sampler_params.py +0 -639
- data_designer/config/seed.py +0 -116
- data_designer/config/seed_source.py +0 -84
- data_designer/config/seed_source_types.py +0 -19
- data_designer/config/utils/code_lang.py +0 -82
- data_designer/config/utils/constants.py +0 -363
- data_designer/config/utils/errors.py +0 -21
- data_designer/config/utils/info.py +0 -94
- data_designer/config/utils/io_helpers.py +0 -258
- data_designer/config/utils/misc.py +0 -78
- data_designer/config/utils/numerical_helpers.py +0 -30
- data_designer/config/utils/type_helpers.py +0 -106
- data_designer/config/utils/visualization.py +0 -482
- data_designer/config/validator_params.py +0 -94
- data_designer/engine/__init__.py +0 -2
- data_designer/engine/analysis/column_profilers/base.py +0 -49
- data_designer/engine/analysis/column_profilers/judge_score_profiler.py +0 -153
- data_designer/engine/analysis/column_profilers/registry.py +0 -22
- data_designer/engine/analysis/column_statistics.py +0 -145
- data_designer/engine/analysis/dataset_profiler.py +0 -149
- data_designer/engine/analysis/errors.py +0 -9
- data_designer/engine/analysis/utils/column_statistics_calculations.py +0 -234
- data_designer/engine/analysis/utils/judge_score_processing.py +0 -132
- data_designer/engine/column_generators/__init__.py +0 -2
- data_designer/engine/column_generators/generators/__init__.py +0 -2
- data_designer/engine/column_generators/generators/base.py +0 -122
- data_designer/engine/column_generators/generators/embedding.py +0 -35
- data_designer/engine/column_generators/generators/expression.py +0 -55
- data_designer/engine/column_generators/generators/llm_completion.py +0 -113
- data_designer/engine/column_generators/generators/samplers.py +0 -69
- data_designer/engine/column_generators/generators/seed_dataset.py +0 -144
- data_designer/engine/column_generators/generators/validation.py +0 -140
- data_designer/engine/column_generators/registry.py +0 -60
- data_designer/engine/column_generators/utils/errors.py +0 -15
- data_designer/engine/column_generators/utils/generator_classification.py +0 -43
- data_designer/engine/column_generators/utils/judge_score_factory.py +0 -58
- data_designer/engine/column_generators/utils/prompt_renderer.py +0 -100
- data_designer/engine/compiler.py +0 -97
- data_designer/engine/configurable_task.py +0 -71
- data_designer/engine/dataset_builders/artifact_storage.py +0 -283
- data_designer/engine/dataset_builders/column_wise_builder.py +0 -338
- data_designer/engine/dataset_builders/errors.py +0 -15
- data_designer/engine/dataset_builders/multi_column_configs.py +0 -46
- data_designer/engine/dataset_builders/utils/__init__.py +0 -2
- data_designer/engine/dataset_builders/utils/concurrency.py +0 -215
- data_designer/engine/dataset_builders/utils/config_compiler.py +0 -62
- data_designer/engine/dataset_builders/utils/dag.py +0 -62
- data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +0 -200
- data_designer/engine/dataset_builders/utils/errors.py +0 -15
- data_designer/engine/errors.py +0 -51
- data_designer/engine/model_provider.py +0 -77
- data_designer/engine/models/__init__.py +0 -2
- data_designer/engine/models/errors.py +0 -300
- data_designer/engine/models/facade.py +0 -287
- data_designer/engine/models/factory.py +0 -42
- data_designer/engine/models/litellm_overrides.py +0 -179
- data_designer/engine/models/parsers/__init__.py +0 -2
- data_designer/engine/models/parsers/errors.py +0 -34
- data_designer/engine/models/parsers/parser.py +0 -235
- data_designer/engine/models/parsers/postprocessors.py +0 -93
- data_designer/engine/models/parsers/tag_parsers.py +0 -62
- data_designer/engine/models/parsers/types.py +0 -84
- data_designer/engine/models/recipes/base.py +0 -81
- data_designer/engine/models/recipes/response_recipes.py +0 -293
- data_designer/engine/models/registry.py +0 -146
- data_designer/engine/models/telemetry.py +0 -359
- data_designer/engine/models/usage.py +0 -73
- data_designer/engine/models/utils.py +0 -38
- data_designer/engine/processing/ginja/__init__.py +0 -2
- data_designer/engine/processing/ginja/ast.py +0 -65
- data_designer/engine/processing/ginja/environment.py +0 -463
- data_designer/engine/processing/ginja/exceptions.py +0 -56
- data_designer/engine/processing/ginja/record.py +0 -32
- data_designer/engine/processing/gsonschema/__init__.py +0 -2
- data_designer/engine/processing/gsonschema/exceptions.py +0 -15
- data_designer/engine/processing/gsonschema/schema_transformers.py +0 -83
- data_designer/engine/processing/gsonschema/types.py +0 -10
- data_designer/engine/processing/gsonschema/validators.py +0 -202
- data_designer/engine/processing/processors/base.py +0 -13
- data_designer/engine/processing/processors/drop_columns.py +0 -42
- data_designer/engine/processing/processors/registry.py +0 -25
- data_designer/engine/processing/processors/schema_transform.py +0 -49
- data_designer/engine/processing/utils.py +0 -169
- data_designer/engine/registry/base.py +0 -99
- data_designer/engine/registry/data_designer_registry.py +0 -39
- data_designer/engine/registry/errors.py +0 -12
- data_designer/engine/resources/managed_dataset_generator.py +0 -39
- data_designer/engine/resources/managed_dataset_repository.py +0 -197
- data_designer/engine/resources/managed_storage.py +0 -65
- data_designer/engine/resources/resource_provider.py +0 -77
- data_designer/engine/resources/seed_reader.py +0 -154
- data_designer/engine/sampling_gen/column.py +0 -91
- data_designer/engine/sampling_gen/constraints.py +0 -100
- data_designer/engine/sampling_gen/data_sources/base.py +0 -217
- data_designer/engine/sampling_gen/data_sources/errors.py +0 -12
- data_designer/engine/sampling_gen/data_sources/sources.py +0 -347
- data_designer/engine/sampling_gen/entities/__init__.py +0 -2
- data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
- data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +0 -86
- data_designer/engine/sampling_gen/entities/email_address_utils.py +0 -171
- data_designer/engine/sampling_gen/entities/errors.py +0 -10
- data_designer/engine/sampling_gen/entities/national_id_utils.py +0 -102
- data_designer/engine/sampling_gen/entities/person.py +0 -144
- data_designer/engine/sampling_gen/entities/phone_number.py +0 -128
- data_designer/engine/sampling_gen/errors.py +0 -26
- data_designer/engine/sampling_gen/generator.py +0 -122
- data_designer/engine/sampling_gen/jinja_utils.py +0 -64
- data_designer/engine/sampling_gen/people_gen.py +0 -199
- data_designer/engine/sampling_gen/person_constants.py +0 -56
- data_designer/engine/sampling_gen/schema.py +0 -147
- data_designer/engine/sampling_gen/schema_builder.py +0 -61
- data_designer/engine/sampling_gen/utils.py +0 -46
- data_designer/engine/secret_resolver.py +0 -82
- data_designer/engine/validation.py +0 -367
- data_designer/engine/validators/__init__.py +0 -19
- data_designer/engine/validators/base.py +0 -38
- data_designer/engine/validators/local_callable.py +0 -39
- data_designer/engine/validators/python.py +0 -254
- data_designer/engine/validators/remote.py +0 -89
- data_designer/engine/validators/sql.py +0 -65
- data_designer/errors.py +0 -7
- data_designer/essentials/__init__.py +0 -33
- data_designer/lazy_heavy_imports.py +0 -54
- data_designer/logging.py +0 -163
- data_designer/plugin_manager.py +0 -78
- data_designer/plugins/__init__.py +0 -8
- data_designer/plugins/errors.py +0 -15
- data_designer/plugins/plugin.py +0 -141
- data_designer/plugins/registry.py +0 -88
- data_designer/plugins/testing/__init__.py +0 -10
- data_designer/plugins/testing/stubs.py +0 -116
- data_designer/plugins/testing/utils.py +0 -20
- data_designer-0.3.8rc1.dist-info/RECORD +0 -196
- data_designer-0.3.8rc1.dist-info/licenses/LICENSE +0 -201
- {data_designer-0.3.8rc1.dist-info → data_designer-0.4.0.dist-info}/WHEEL +0 -0
- {data_designer-0.3.8rc1.dist-info → data_designer-0.4.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,482 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
from __future__ import annotations
|
|
5
|
-
|
|
6
|
-
import json
|
|
7
|
-
import os
|
|
8
|
-
from collections import OrderedDict
|
|
9
|
-
from enum import Enum
|
|
10
|
-
from functools import cached_property
|
|
11
|
-
from typing import TYPE_CHECKING, Any
|
|
12
|
-
|
|
13
|
-
from rich.console import Console, Group
|
|
14
|
-
from rich.padding import Padding
|
|
15
|
-
from rich.panel import Panel
|
|
16
|
-
from rich.pretty import Pretty
|
|
17
|
-
from rich.rule import Rule
|
|
18
|
-
from rich.syntax import Syntax
|
|
19
|
-
from rich.table import Table
|
|
20
|
-
from rich.text import Text
|
|
21
|
-
|
|
22
|
-
from data_designer.config.base import ConfigBase
|
|
23
|
-
from data_designer.config.column_types import DataDesignerColumnType
|
|
24
|
-
from data_designer.config.models import ModelConfig, ModelProvider
|
|
25
|
-
from data_designer.config.sampler_params import SamplerType
|
|
26
|
-
from data_designer.config.utils.code_lang import code_lang_to_syntax_lexer
|
|
27
|
-
from data_designer.config.utils.constants import NVIDIA_API_KEY_ENV_VAR_NAME, OPENAI_API_KEY_ENV_VAR_NAME
|
|
28
|
-
from data_designer.config.utils.errors import DatasetSampleDisplayError
|
|
29
|
-
from data_designer.lazy_heavy_imports import np, pd
|
|
30
|
-
|
|
31
|
-
if TYPE_CHECKING:
|
|
32
|
-
import numpy as np
|
|
33
|
-
import pandas as pd
|
|
34
|
-
|
|
35
|
-
from data_designer.config.config_builder import DataDesignerConfigBuilder
|
|
36
|
-
from data_designer.config.dataset_metadata import DatasetMetadata
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
console = Console()
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
def get_nvidia_api_key() -> str | None:
    """Look up the NVIDIA API key in the process environment.

    Returns:
        The value of the environment variable named by
        ``NVIDIA_API_KEY_ENV_VAR_NAME``, or ``None`` when it is not set.
    """
    return os.environ.get(NVIDIA_API_KEY_ENV_VAR_NAME)
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
def get_openai_api_key() -> str | None:
    """Look up the OpenAI API key in the process environment.

    Returns:
        The value of the environment variable named by
        ``OPENAI_API_KEY_ENV_VAR_NAME``, or ``None`` when it is not set.
    """
    return os.environ.get(OPENAI_API_KEY_ENV_VAR_NAME)
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
class ColorPalette(str, Enum):
    """Hex color codes used as styles by the display helpers.

    Members are also ``str`` instances, so they compare equal to their hex value.
    """

    # Brand color
    NVIDIA_GREEN = "#76b900"

    # Accent colors
    PURPLE = "#9525c6"
    YELLOW = "#f9c500"
    BLUE = "#0074df"
    RED = "#e52020"
    ORANGE = "#ef9100"
    MAGENTA = "#d2308e"
    TEAL = "#1dbba4"
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
class WithRecordSamplerMixin:
    """Mixin that lets a results object display its records one at a time.

    The host object must provide a pandas DataFrame either through a ``dataset``
    attribute or a ``load_dataset()`` method, plus ``_config_builder`` and
    ``dataset_metadata``. A ``processor_artifacts`` mapping is optional.
    """

    # Position of the next record shown when no explicit index is requested.
    _display_cycle_index: int = 0
    dataset_metadata: DatasetMetadata | None

    @cached_property
    def _record_sampler_dataset(self) -> pd.DataFrame:
        """Return the DataFrame to sample from (computed once per instance).

        Raises:
            DatasetSampleDisplayError: If neither ``dataset`` nor ``load_dataset()``
                yields a pandas DataFrame.
        """
        if hasattr(self, "dataset") and self.dataset is not None and isinstance(self.dataset, pd.DataFrame):
            return self.dataset
        elif (
            hasattr(self, "load_dataset")
            and callable(self.load_dataset)
            and (dataset := self.load_dataset()) is not None
            and isinstance(dataset, pd.DataFrame)
        ):
            return dataset
        else:
            raise DatasetSampleDisplayError("No valid dataset found in results object.")

    def _has_processor_artifacts(self) -> bool:
        """Return True when the host object carries a non-None ``processor_artifacts``."""
        return hasattr(self, "processor_artifacts") and self.processor_artifacts is not None

    def display_sample_record(
        self,
        index: int | None = None,
        *,
        syntax_highlighting_theme: str = "dracula",
        background_color: str | None = None,
        processors_to_display: list[str] | None = None,
        hide_seed_columns: bool = False,
    ) -> None:
        """Display a sample record from the Data Designer dataset preview.

        Args:
            index: Index of the record to display. If None, the next record will be displayed.
                This is useful for running the cell in a notebook multiple times.
            syntax_highlighting_theme: Theme to use for syntax highlighting. See the `Syntax`
                documentation from `rich` for information about available themes.
            background_color: Background color to use for the record. See the `Syntax`
                documentation from `rich` for information about available background colors.
            processors_to_display: List of processors to display the artifacts for. If None, all processors will be displayed.
            hide_seed_columns: If True, seed columns will not be displayed separately.

        Raises:
            DatasetSampleDisplayError: If no dataset is available or the index is out of bounds.
        """
        dataset = self._record_sampler_dataset
        # Measure the dataset before indexing so the error message below can always report it.
        num_records = len(dataset)
        # Compare against None explicitly: an explicit index of 0 is a valid request.
        i = self._display_cycle_index if index is None else index

        try:
            record = dataset.iloc[i]
        except IndexError as err:
            raise DatasetSampleDisplayError(
                f"Index {i} is out of bounds for dataset of length {num_records}."
            ) from err

        processor_data_to_display = None
        if self._has_processor_artifacts() and len(self.processor_artifacts) > 0:
            if processors_to_display is None:
                processors_to_display = list(self.processor_artifacts.keys())

            if len(processors_to_display) > 0:
                processor_data_to_display = {}
                for processor in processors_to_display:
                    artifact = self.processor_artifacts[processor]
                    # A list with one entry per record is indexed per record;
                    # anything else is shown as-is for every record.
                    if isinstance(artifact, list) and len(artifact) == num_records:
                        processor_data_to_display[processor] = artifact[i]
                    else:
                        processor_data_to_display[processor] = artifact

        seed_column_names = (
            None if hide_seed_columns or self.dataset_metadata is None else self.dataset_metadata.seed_column_names
        )

        display_sample_record(
            record=record,
            processor_data_to_display=processor_data_to_display,
            config_builder=self._config_builder,
            background_color=background_color,
            syntax_highlighting_theme=syntax_highlighting_theme,
            record_index=i,
            seed_column_names=seed_column_names,
        )
        if index is None:
            # Advance the cursor (wrapping around) only when cycling implicitly.
            self._display_cycle_index = (self._display_cycle_index + 1) % num_records
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
def create_rich_histogram_table(
    data: dict[str, int | float],
    column_names: tuple[str, str],
    name_style: str = ColorPalette.BLUE.value,
    value_style: str = ColorPalette.TEAL.value,
    title: str | None = None,
    **kwargs,
) -> Table:
    """Build a two-column rich table that renders ``data`` as a text histogram.

    Args:
        data: Mapping of category name to numeric value.
        column_names: Headers for the name column and the value column, in that order.
        name_style: Rich style applied to the name column.
        value_style: Rich style applied to the value column.
        title: Optional table title.
        **kwargs: Extra keyword arguments forwarded to ``rich.table.Table``.

    Returns:
        A table with one row per entry of ``data``; empty ``data`` yields a table with no rows.
    """
    table = Table(title=title, **kwargs)
    table.add_column(column_names[0], justify="right", style=name_style)
    table.add_column(column_names[1], justify="left", style=value_style)

    # default=0 keeps an empty mapping from raising ValueError in max().
    max_count = max(data.values(), default=0)
    for name, value in data.items():
        # Bars are scaled so the largest value spans 20 block characters.
        bar = "" if max_count <= 0 else "█" * int((value / max_count) * 20)
        table.add_row(str(name), f"{bar} {value:.1f}")

    return table
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
def display_sample_record(
    record: dict | pd.Series | pd.DataFrame,
    config_builder: DataDesignerConfigBuilder,
    processor_data_to_display: dict[str, list[str] | str] | None = None,
    background_color: str | None = None,
    syntax_highlighting_theme: str = "dracula",
    record_index: int | None = None,
    seed_column_names: list[str] | None = None,
) -> None:
    """Render a single record to the module console, grouped by column kind.

    Args:
        record: The record to show, as a dict, a pandas Series, or a single-row DataFrame.
        config_builder: Config builder queried for the columns of each type.
        processor_data_to_display: Optional mapping of processor name to the data shown for it.
        background_color: Background color passed to the code syntax panels.
        syntax_highlighting_theme: Theme passed to the code syntax panels.
        record_index: When given, an ``[index: N]`` label is appended to the output.
        seed_column_names: When given and non-empty, these columns are shown in a
            separate "Seed Columns" table.

    Raises:
        DatasetSampleDisplayError: If ``record`` is not a dict, Series, or DataFrame,
            or is a DataFrame that does not hold exactly one row.
    """
    if isinstance(record, (dict, pd.Series)):
        record = pd.DataFrame([record]).iloc[0]
    elif isinstance(record, pd.DataFrame):
        # An empty frame is rejected here too, instead of failing later with a bare IndexError.
        if record.shape[0] != 1:
            raise DatasetSampleDisplayError(
                f"The record must be a single record. You provided a DataFrame with {record.shape[0]} records."
            )
        record = record.iloc[0]
    else:
        raise DatasetSampleDisplayError(
            "The record must be a single record in a dictionary, pandas Series, "
            f"or pandas DataFrame. You provided: {type(record)}."
        )

    render_list = []
    table_kws = dict(show_lines=True, expand=True)

    # Display seed columns if seed_column_names is provided and not empty
    if seed_column_names:
        table = Table(title="Seed Columns", **table_kws)
        table.add_column("Name")
        table.add_column("Value")
        for col_name in seed_column_names:
            if col_name in record.index:
                table.add_row(col_name, convert_to_row_element(record[col_name]))
        render_list.append(pad_console_element(table))

    non_code_columns = (
        config_builder.get_columns_of_type(DataDesignerColumnType.SAMPLER)
        + config_builder.get_columns_of_type(DataDesignerColumnType.EXPRESSION)
        + config_builder.get_columns_of_type(DataDesignerColumnType.LLM_TEXT)
        + config_builder.get_columns_of_type(DataDesignerColumnType.LLM_STRUCTURED)
        + config_builder.get_columns_of_type(DataDesignerColumnType.EMBEDDING)
    )
    if len(non_code_columns) > 0:
        table = Table(title="Generated Columns", **table_kws)
        table.add_column("Name")
        table.add_column("Value")
        for col in non_code_columns:
            if col.drop:
                continue
            value = record[col.name]
            if col.column_type == DataDesignerColumnType.EMBEDDING:
                # Shorten the vectors on a shallow copy: the nested dict is shared with
                # the caller's data, so editing it in place would corrupt the dataset
                # and break a second display of the same record.
                value = dict(value)
                embeddings = value.get("embeddings")
                value["embeddings"] = (
                    [] if embeddings is None else [get_truncated_list_as_string(embd) for embd in embeddings]
                )
            table.add_row(col.name, convert_to_row_element(value))
        render_list.append(pad_console_element(table))

    # NOTE(review): unlike the other sections, code columns are rendered without
    # checking `col.drop` — confirm whether dropped code columns can reach here.
    for col in config_builder.get_columns_of_type(DataDesignerColumnType.LLM_CODE):
        panel = Panel(
            Syntax(
                record[col.name],
                lexer=code_lang_to_syntax_lexer(col.code_lang),
                theme=syntax_highlighting_theme,
                word_wrap=True,
                background_color=background_color,
            ),
            title=col.name,
            expand=True,
        )
        render_list.append(pad_console_element(panel))

    validation_columns = config_builder.get_columns_of_type(DataDesignerColumnType.VALIDATION)
    if len(validation_columns) > 0:
        table = Table(title="Validation", **table_kws)
        table.add_column("Name")
        table.add_column("Value", ratio=1)
        for col in validation_columns:
            if not col.drop:
                # Add is_valid before other fields
                if "is_valid" in record[col.name]:
                    value_to_display = {"is_valid": record[col.name].get("is_valid")} | record[col.name]
                else:  # if columns treated separately
                    value_to_display = {}
                    for col_name, validation_output in record[col.name].items():
                        value_to_display[col_name] = {
                            "is_valid": validation_output.get("is_valid", None)
                        } | validation_output

                table.add_row(col.name, convert_to_row_element(value_to_display))
        render_list.append(pad_console_element(table, (1, 0, 1, 0)))

    llm_judge_columns = config_builder.get_columns_of_type(DataDesignerColumnType.LLM_JUDGE)
    for col in llm_judge_columns:
        if col.drop:
            continue
        # One table per judge column: a column per measure, a single row of results.
        table = Table(title=f"LLM-as-a-Judge: {col.name}", **table_kws)
        row = []
        judge = record[col.name]

        for measure, results in judge.items():
            table.add_column(measure)
            row.append(f"score: {results['score']}\nreasoning: {results['reasoning']}")
        table.add_row(*row)
        render_list.append(pad_console_element(table, (1, 0, 1, 0)))

    if processor_data_to_display:
        for processor_name, processor_data in processor_data_to_display.items():
            table = Table(title=f"Processor Outputs: {processor_name}", **table_kws)
            table.add_column("Name")
            table.add_column("Value")
            for col, value in processor_data.items():
                table.add_row(col, convert_to_row_element(value))
            render_list.append(pad_console_element(table, (1, 0, 1, 0)))

    if record_index is not None:
        index_label = Text(f"[index: {record_index}]", justify="center")
        render_list.append(index_label)

    console.print(Group(*render_list), markup=False)
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
def get_truncated_list_as_string(long_list: list[Any], max_items: int = 2) -> str:
    """Format a list as a string, eliding everything after the first ``max_items`` entries.

    Args:
        long_list: The list to format.
        max_items: Maximum number of leading items to show; must be positive.

    Returns:
        ``str(long_list)`` when the list has at most ``max_items`` entries, otherwise
        the leading items followed by ``...`` inside square brackets.

    Raises:
        ValueError: If ``max_items`` is not greater than 0.
    """
    if max_items <= 0:
        raise ValueError("max_items must be greater than 0")

    # Short lists are returned in their regular string form.
    if len(long_list) <= max_items:
        return str(long_list)

    shown = ", ".join(map(str, long_list[:max_items]))
    return f"[{shown}, ...]"
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
def display_sampler_table(
    sampler_params: dict[SamplerType, ConfigBase],
    title: str | None = None,
) -> None:
    """Print a table describing the parameters of each sampler type.

    Args:
        sampler_params: Mapping of sampler type to its params config; each value's
            ``model_json_schema()`` supplies the rows.
        title: Heading shown above the table. Falls back to
            "NeMo Data Designer Samplers" when not given or empty.
    """
    table = Table(expand=True)
    table.add_column("Type")
    table.add_column("Parameter")
    table.add_column("Data Type")
    table.add_column("Required", justify="center")
    table.add_column("Constraints")

    for sampler_type, params in sampler_params.items():
        schema = params.model_json_schema()
        required_names = schema.get("required", [])
        for position, (param_name, field_info) in enumerate(schema["properties"].items()):
            table.add_row(
                # The sampler type is printed only on the first row of its section.
                "" if position else sampler_type,
                param_name,
                _get_field_type(field_info),
                "✓" if param_name in required_names else "",
                _get_field_constraints(field_info, schema),
            )
        table.add_section()

    heading = title or "NeMo Data Designer Samplers"
    console.print(Group(Rule(heading, end="\n\n"), table))
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
def display_model_configs_table(model_configs: list[ModelConfig]) -> None:
    """Print a table of model configs, with a notice when the list is empty.

    Args:
        model_configs: The model configs to list, one row each.
    """
    table_model_configs = Table(expand=True)
    for header in ("Alias", "Model", "Provider", "Inference Parameters"):
        table_model_configs.add_column(header)

    for model_config in model_configs:
        table_model_configs.add_row(
            model_config.alias,
            model_config.model,
            model_config.provider,
            model_config.inference_parameters.format_for_display(),
        )

    renderables: list = [Rule(title="Model Configs")]
    if not model_configs:
        # The notice sits between the heading rule and the (empty) table.
        renderables.append(
            Text(
                "‼️ No model configs found. Please provide at least one model config to the config builder",
                style="dim",
                justify="center",
            )
        )
    renderables.append(table_model_configs)
    console.print(Group(*renderables))
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
def display_model_providers_table(model_providers: list[ModelProvider]) -> None:
    """Print a table of model providers with their API keys masked.

    When a provider's ``api_key`` equals one of the known environment variable
    names, the variable is looked up and its masked value (or a "not set"
    notice) is shown; any other value is passed straight to ``mask_api_key``.

    Args:
        model_providers: The providers to list, one row each.
    """
    table_model_providers = Table(expand=True)
    for header in ("Name", "Endpoint", "API Key"):
        table_model_providers.add_column(header)

    for model_provider in model_providers:
        configured_key = model_provider.api_key
        if configured_key == OPENAI_API_KEY_ENV_VAR_NAME:
            env_value = get_openai_api_key()
            if env_value is None:
                shown_key = f"* {OPENAI_API_KEY_ENV_VAR_NAME!r} not set in environment variables * "
            else:
                shown_key = mask_api_key(env_value)
        elif configured_key == NVIDIA_API_KEY_ENV_VAR_NAME:
            env_value = get_nvidia_api_key()
            if env_value is None:
                shown_key = f"* {NVIDIA_API_KEY_ENV_VAR_NAME!r} not set in environment variables *"
            else:
                shown_key = mask_api_key(env_value)
        else:
            shown_key = mask_api_key(configured_key)
        table_model_providers.add_row(model_provider.name, model_provider.endpoint, shown_key)

    console.print(Group(Rule(title="Model Providers"), table_model_providers))
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
def mask_api_key(api_key: str | None) -> str:
    """Return a display-safe form of an API key.

    All-uppercase values are treated as environment variable names and shown
    unchanged. Any other value is reduced to ``***`` followed by its last four
    characters, or just ``***`` when it is four characters or shorter.

    Args:
        api_key: The API key (or environment variable name) to mask.

    Returns:
        The masked string, or "(not set)" when ``api_key`` is None or empty.
    """
    if not api_key:
        return "(not set)"

    # Environment variable names stay readable
    if api_key.isupper():
        return api_key

    # Real keys: reveal the tail only when enough characters remain hidden
    if len(api_key) > 4:
        return "***" + api_key[-4:]
    return "***"
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
def convert_to_row_element(elem):
|
|
402
|
-
try:
|
|
403
|
-
elem = Pretty(json.loads(elem))
|
|
404
|
-
except (TypeError, json.JSONDecodeError):
|
|
405
|
-
pass
|
|
406
|
-
if isinstance(elem, (np.integer, np.floating, np.ndarray)):
|
|
407
|
-
elem = str(elem)
|
|
408
|
-
elif isinstance(elem, (list, dict)):
|
|
409
|
-
elem = Pretty(elem)
|
|
410
|
-
return elem
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
def pad_console_element(elem, padding=(1, 0, 1, 0)):
    """Wrap a renderable in a rich ``Padding`` using the given padding tuple."""
    padded = Padding(elem, padding)
    return padded
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
def _get_field_type(field: dict) -> str:
|
|
418
|
-
"""Extract human-readable type information from a JSON Schema field."""
|
|
419
|
-
|
|
420
|
-
# single type
|
|
421
|
-
if "type" in field:
|
|
422
|
-
if field["type"] == "array":
|
|
423
|
-
return " | ".join([f"{f.strip()}[]" for f in _get_field_type(field["items"]).split("|")])
|
|
424
|
-
if field["type"] == "object":
|
|
425
|
-
return "dict"
|
|
426
|
-
return field["type"]
|
|
427
|
-
|
|
428
|
-
# union type
|
|
429
|
-
elif "anyOf" in field:
|
|
430
|
-
types = []
|
|
431
|
-
for f in field["anyOf"]:
|
|
432
|
-
if "$ref" in f:
|
|
433
|
-
types.append("enum")
|
|
434
|
-
elif f.get("type") == "array":
|
|
435
|
-
if "items" in f and "$ref" in f["items"]:
|
|
436
|
-
types.append("enum[]")
|
|
437
|
-
else:
|
|
438
|
-
types.append(f"{f['items']['type']}[]")
|
|
439
|
-
else:
|
|
440
|
-
types.append(f.get("type", ""))
|
|
441
|
-
return " | ".join(t for t in types if t)
|
|
442
|
-
|
|
443
|
-
return ""
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
def _get_field_constraints(field: dict, schema: dict) -> str:
|
|
447
|
-
"""Extract human-readable constraints from a JSON Schema field."""
|
|
448
|
-
constraints = []
|
|
449
|
-
|
|
450
|
-
# numeric constraints
|
|
451
|
-
if "minimum" in field:
|
|
452
|
-
constraints.append(f">= {field['minimum']}")
|
|
453
|
-
if "exclusiveMinimum" in field:
|
|
454
|
-
constraints.append(f"> {field['exclusiveMinimum']}")
|
|
455
|
-
if "maximum" in field:
|
|
456
|
-
constraints.append(f"<= {field['maximum']}")
|
|
457
|
-
if "exclusiveMaximum" in field:
|
|
458
|
-
constraints.append(f"< {field['exclusiveMaximum']}")
|
|
459
|
-
|
|
460
|
-
# string constraints
|
|
461
|
-
if "minLength" in field:
|
|
462
|
-
constraints.append(f"len > {field['minLength']}")
|
|
463
|
-
if "maxLength" in field:
|
|
464
|
-
constraints.append(f"len < {field['maxLength']}")
|
|
465
|
-
|
|
466
|
-
# array constraints
|
|
467
|
-
if "minItems" in field:
|
|
468
|
-
constraints.append(f"len > {field['minItems']}")
|
|
469
|
-
if "maxItems" in field:
|
|
470
|
-
constraints.append(f"len < {field['maxItems']}")
|
|
471
|
-
|
|
472
|
-
# enum constraints
|
|
473
|
-
if "enum" in _get_field_type(field) and "$defs" in schema:
|
|
474
|
-
enum_values = []
|
|
475
|
-
for defs in schema["$defs"].values():
|
|
476
|
-
if "enum" in defs:
|
|
477
|
-
enum_values.extend(defs["enum"])
|
|
478
|
-
if len(enum_values) > 0:
|
|
479
|
-
enum_values = OrderedDict.fromkeys(enum_values)
|
|
480
|
-
constraints.append(f"allowed: {', '.join(enum_values.keys())}")
|
|
481
|
-
|
|
482
|
-
return ", ".join(constraints)
|
|
@@ -1,94 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
from __future__ import annotations
|
|
5
|
-
|
|
6
|
-
from enum import Enum
|
|
7
|
-
from typing import Any
|
|
8
|
-
|
|
9
|
-
from pydantic import Field, field_serializer, model_validator
|
|
10
|
-
from typing_extensions import Self, TypeAlias
|
|
11
|
-
|
|
12
|
-
from data_designer.config.base import ConfigBase
|
|
13
|
-
from data_designer.config.utils.code_lang import SQL_DIALECTS, CodeLang
|
|
14
|
-
|
|
15
|
-
SUPPORTED_CODE_LANGUAGES = {CodeLang.PYTHON, *SQL_DIALECTS}
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class ValidatorType(str, Enum):
    """The kinds of validator, identified by their string value."""

    # Validation of generated code
    CODE = "code"
    # Validation by a locally supplied callable
    LOCAL_CALLABLE = "local_callable"
    # Validation by a remote endpoint
    REMOTE = "remote"
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
class CodeValidatorParams(ConfigBase):
    """Parameters for validating generated code (Python or SQL).

    Attributes:
        code_lang: The language of the code to validate. Supported values include: `python`,
            `sql:sqlite`, `sql:postgres`, `sql:mysql`, `sql:tsql`, `sql:bigquery`, `sql:ansi`.
    """

    code_lang: CodeLang = Field(description="The language of the code to validate")

    @model_validator(mode="after")
    def validate_code_lang(self) -> Self:
        """Reject any language that is not in ``SUPPORTED_CODE_LANGUAGES``."""
        if self.code_lang in SUPPORTED_CODE_LANGUAGES:
            return self
        supported = [lang.value for lang in SUPPORTED_CODE_LANGUAGES]
        raise ValueError(f"Unsupported code language, supported languages are: {supported}")
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
class LocalCallableValidatorParams(ConfigBase):
    """Configuration for local callable validation. Expects a function to be passed that validates the data.

    Attributes:
        validation_function: Function (`Callable[[pd.DataFrame], pd.DataFrame]`) to validate the
            data. Output must contain a column `is_valid` of type `bool`.
        output_schema: The JSON schema for the local callable validator's output. If not provided,
            the output will not be validated.
    """

    validation_function: Any = Field(
        description="Function (Callable[[pd.DataFrame], pd.DataFrame]) to validate the data"
    )
    output_schema: dict[str, Any] | None = Field(
        default=None, description="Expected schema for local callable validator's output"
    )

    @field_serializer("validation_function")
    def serialize_validation_function(self, v: Any) -> Any:
        """Serialize the validation callable as a name string.

        Args:
            v: The configured validation callable.

        Returns:
            ``v.__name__`` when the callable has one; otherwise the name of
            the callable's class.
        """
        # Not every callable has ``__name__`` (e.g. ``functools.partial``
        # objects or instances defining ``__call__``); the validator below
        # accepts those, so serialization must not fail on them.
        name = getattr(v, "__name__", None)
        if name is None:
            name = type(v).__name__
        return name

    @model_validator(mode="after")
    def validate_validation_function(self) -> Self:
        """Ensure ``validation_function`` is callable.

        Returns:
            The validated instance.

        Raises:
            ValueError: If ``validation_function`` is not callable.
        """
        if not callable(self.validation_function):
            raise ValueError("Validation function must be a callable")
        return self
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
class RemoteValidatorParams(ConfigBase):
    """Configuration for remote validation. Sends data to a remote endpoint for validation.

    Attributes:
        endpoint_url: The URL of the remote endpoint.
        output_schema: The JSON schema for the remote validator's output. If not provided,
            the output will not be validated.
        timeout: The timeout for the HTTP request in seconds. Must be greater than 0.
            Defaults to 30.0.
        max_retries: The maximum number of retry attempts. Must be at least 0. Defaults to 3.
        retry_backoff: The backoff factor for the retry delay in seconds. Must be greater
            than 1. Defaults to 2.0.
        max_parallel_requests: The maximum number of parallel requests to make. Must be at
            least 1. Defaults to 4.
    """

    endpoint_url: str = Field(description="URL of the remote endpoint")
    output_schema: dict[str, Any] | None = Field(
        default=None,
        description="Expected schema for remote validator's output",
    )
    timeout: float = Field(
        default=30.0,
        gt=0,
        description="The timeout for the HTTP request",
    )
    max_retries: int = Field(
        default=3,
        ge=0,
        description="The maximum number of retry attempts",
    )
    retry_backoff: float = Field(
        default=2.0,
        gt=1,
        description="The backoff factor for the retry delay",
    )
    max_parallel_requests: int = Field(
        default=4,
        ge=1,
        description="The maximum number of parallel requests to make",
    )
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
# Type alias covering any of the three validator parameter models
# (code, local callable, remote).
ValidatorParamsT: TypeAlias = CodeValidatorParams | LocalCallableValidatorParams | RemoteValidatorParams
|
data_designer/engine/__init__.py
DELETED
|
@@ -1,49 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
from __future__ import annotations
|
|
5
|
-
|
|
6
|
-
import logging
|
|
7
|
-
from abc import ABC, abstractmethod
|
|
8
|
-
from typing import TYPE_CHECKING
|
|
9
|
-
|
|
10
|
-
from pydantic import BaseModel, model_validator
|
|
11
|
-
from typing_extensions import Self
|
|
12
|
-
|
|
13
|
-
from data_designer.config.base import ConfigBase
|
|
14
|
-
from data_designer.config.column_configs import SingleColumnConfig
|
|
15
|
-
from data_designer.config.column_types import DataDesignerColumnType
|
|
16
|
-
from data_designer.engine.configurable_task import ConfigurableTask, TaskConfigT
|
|
17
|
-
from data_designer.lazy_heavy_imports import pd
|
|
18
|
-
|
|
19
|
-
if TYPE_CHECKING:
|
|
20
|
-
import pandas as pd
|
|
21
|
-
|
|
22
|
-
logger = logging.getLogger(__name__)
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
class ColumnConfigWithDataFrame(ConfigBase):
    """Pair a single-column config with a DataFrame that contains that column.

    Attributes:
        column_config: Configuration of the column.
        df: DataFrame that must have a column named ``column_config.name``.
    """

    column_config: SingleColumnConfig
    df: pd.DataFrame

    @model_validator(mode="after")
    def validate_column_exists(self) -> Self:
        """Check that the configured column name is present in ``df``.

        Returns:
            The validated instance.

        Raises:
            ValueError: If ``column_config.name`` is not among ``df.columns``.
        """
        column_name = self.column_config.name
        if column_name in self.df.columns:
            return self
        raise ValueError(f"Column {column_name!r} not found in DataFrame")

    def as_tuple(self) -> tuple[SingleColumnConfig, pd.DataFrame]:
        """Return the pair ``(column_config, df)``."""
        return self.column_config, self.df
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
class ColumnProfiler(ConfigurableTask[TaskConfigT], ABC):
    """Abstract base class for column profilers.

    Subclasses declare which column types they apply to and implement
    ``profile`` to produce a result model for one column.
    """

    @staticmethod
    @abstractmethod
    def get_applicable_column_types() -> list[DataDesignerColumnType]:
        """Returns a list of column types that this profiler can be applied to during dataset profiling."""

    @abstractmethod
    def profile(self, column_config_with_df: ColumnConfigWithDataFrame) -> BaseModel:
        """Profile one column and return the result as a pydantic model.

        Args:
            column_config_with_df: The column's config bundled with the
                DataFrame that contains the column.
        """

    def _initialize(self) -> None:
        """Log an info message announcing this profiler by name."""
        # NOTE(review): presumably invoked by ConfigurableTask during setup;
        # confirm against the base class.
        profiler_name = self.name
        logger.info(f"💫 Initializing column profiler: '{profiler_name}'")
|