data-designer 0.1.0 (data_designer-0.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/__init__.py +15 -0
- data_designer/_version.py +34 -0
- data_designer/cli/README.md +236 -0
- data_designer/cli/__init__.py +6 -0
- data_designer/cli/commands/__init__.py +2 -0
- data_designer/cli/commands/list.py +130 -0
- data_designer/cli/commands/models.py +10 -0
- data_designer/cli/commands/providers.py +11 -0
- data_designer/cli/commands/reset.py +100 -0
- data_designer/cli/controllers/__init__.py +7 -0
- data_designer/cli/controllers/model_controller.py +246 -0
- data_designer/cli/controllers/provider_controller.py +317 -0
- data_designer/cli/forms/__init__.py +20 -0
- data_designer/cli/forms/builder.py +51 -0
- data_designer/cli/forms/field.py +180 -0
- data_designer/cli/forms/form.py +59 -0
- data_designer/cli/forms/model_builder.py +125 -0
- data_designer/cli/forms/provider_builder.py +76 -0
- data_designer/cli/main.py +44 -0
- data_designer/cli/repositories/__init__.py +8 -0
- data_designer/cli/repositories/base.py +39 -0
- data_designer/cli/repositories/model_repository.py +42 -0
- data_designer/cli/repositories/provider_repository.py +43 -0
- data_designer/cli/services/__init__.py +7 -0
- data_designer/cli/services/model_service.py +116 -0
- data_designer/cli/services/provider_service.py +111 -0
- data_designer/cli/ui.py +448 -0
- data_designer/cli/utils.py +47 -0
- data_designer/config/__init__.py +2 -0
- data_designer/config/analysis/column_profilers.py +89 -0
- data_designer/config/analysis/column_statistics.py +274 -0
- data_designer/config/analysis/dataset_profiler.py +60 -0
- data_designer/config/analysis/utils/errors.py +8 -0
- data_designer/config/analysis/utils/reporting.py +188 -0
- data_designer/config/base.py +68 -0
- data_designer/config/column_configs.py +354 -0
- data_designer/config/column_types.py +168 -0
- data_designer/config/config_builder.py +660 -0
- data_designer/config/data_designer_config.py +40 -0
- data_designer/config/dataset_builders.py +11 -0
- data_designer/config/datastore.py +151 -0
- data_designer/config/default_model_settings.py +123 -0
- data_designer/config/errors.py +19 -0
- data_designer/config/interface.py +54 -0
- data_designer/config/models.py +231 -0
- data_designer/config/preview_results.py +32 -0
- data_designer/config/processors.py +41 -0
- data_designer/config/sampler_constraints.py +51 -0
- data_designer/config/sampler_params.py +604 -0
- data_designer/config/seed.py +145 -0
- data_designer/config/utils/code_lang.py +83 -0
- data_designer/config/utils/constants.py +313 -0
- data_designer/config/utils/errors.py +19 -0
- data_designer/config/utils/info.py +88 -0
- data_designer/config/utils/io_helpers.py +273 -0
- data_designer/config/utils/misc.py +81 -0
- data_designer/config/utils/numerical_helpers.py +28 -0
- data_designer/config/utils/type_helpers.py +100 -0
- data_designer/config/utils/validation.py +336 -0
- data_designer/config/utils/visualization.py +427 -0
- data_designer/config/validator_params.py +96 -0
- data_designer/engine/__init__.py +2 -0
- data_designer/engine/analysis/column_profilers/base.py +55 -0
- data_designer/engine/analysis/column_profilers/judge_score_profiler.py +160 -0
- data_designer/engine/analysis/column_profilers/registry.py +20 -0
- data_designer/engine/analysis/column_statistics.py +142 -0
- data_designer/engine/analysis/dataset_profiler.py +125 -0
- data_designer/engine/analysis/errors.py +7 -0
- data_designer/engine/analysis/utils/column_statistics_calculations.py +209 -0
- data_designer/engine/analysis/utils/judge_score_processing.py +128 -0
- data_designer/engine/column_generators/__init__.py +2 -0
- data_designer/engine/column_generators/generators/__init__.py +2 -0
- data_designer/engine/column_generators/generators/base.py +61 -0
- data_designer/engine/column_generators/generators/expression.py +63 -0
- data_designer/engine/column_generators/generators/llm_generators.py +172 -0
- data_designer/engine/column_generators/generators/samplers.py +75 -0
- data_designer/engine/column_generators/generators/seed_dataset.py +149 -0
- data_designer/engine/column_generators/generators/validation.py +147 -0
- data_designer/engine/column_generators/registry.py +56 -0
- data_designer/engine/column_generators/utils/errors.py +13 -0
- data_designer/engine/column_generators/utils/judge_score_factory.py +57 -0
- data_designer/engine/column_generators/utils/prompt_renderer.py +98 -0
- data_designer/engine/configurable_task.py +82 -0
- data_designer/engine/dataset_builders/artifact_storage.py +181 -0
- data_designer/engine/dataset_builders/column_wise_builder.py +287 -0
- data_designer/engine/dataset_builders/errors.py +13 -0
- data_designer/engine/dataset_builders/multi_column_configs.py +44 -0
- data_designer/engine/dataset_builders/utils/__init__.py +2 -0
- data_designer/engine/dataset_builders/utils/concurrency.py +184 -0
- data_designer/engine/dataset_builders/utils/config_compiler.py +60 -0
- data_designer/engine/dataset_builders/utils/dag.py +56 -0
- data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +190 -0
- data_designer/engine/dataset_builders/utils/errors.py +13 -0
- data_designer/engine/errors.py +49 -0
- data_designer/engine/model_provider.py +75 -0
- data_designer/engine/models/__init__.py +2 -0
- data_designer/engine/models/errors.py +308 -0
- data_designer/engine/models/facade.py +225 -0
- data_designer/engine/models/litellm_overrides.py +162 -0
- data_designer/engine/models/parsers/__init__.py +2 -0
- data_designer/engine/models/parsers/errors.py +34 -0
- data_designer/engine/models/parsers/parser.py +236 -0
- data_designer/engine/models/parsers/postprocessors.py +93 -0
- data_designer/engine/models/parsers/tag_parsers.py +60 -0
- data_designer/engine/models/parsers/types.py +82 -0
- data_designer/engine/models/recipes/base.py +79 -0
- data_designer/engine/models/recipes/response_recipes.py +291 -0
- data_designer/engine/models/registry.py +118 -0
- data_designer/engine/models/usage.py +75 -0
- data_designer/engine/models/utils.py +38 -0
- data_designer/engine/processing/ginja/__init__.py +2 -0
- data_designer/engine/processing/ginja/ast.py +64 -0
- data_designer/engine/processing/ginja/environment.py +461 -0
- data_designer/engine/processing/ginja/exceptions.py +54 -0
- data_designer/engine/processing/ginja/record.py +30 -0
- data_designer/engine/processing/gsonschema/__init__.py +2 -0
- data_designer/engine/processing/gsonschema/exceptions.py +8 -0
- data_designer/engine/processing/gsonschema/schema_transformers.py +81 -0
- data_designer/engine/processing/gsonschema/types.py +8 -0
- data_designer/engine/processing/gsonschema/validators.py +143 -0
- data_designer/engine/processing/processors/base.py +15 -0
- data_designer/engine/processing/processors/drop_columns.py +46 -0
- data_designer/engine/processing/processors/registry.py +20 -0
- data_designer/engine/processing/utils.py +120 -0
- data_designer/engine/registry/base.py +97 -0
- data_designer/engine/registry/data_designer_registry.py +37 -0
- data_designer/engine/registry/errors.py +10 -0
- data_designer/engine/resources/managed_dataset_generator.py +35 -0
- data_designer/engine/resources/managed_dataset_repository.py +194 -0
- data_designer/engine/resources/managed_storage.py +63 -0
- data_designer/engine/resources/resource_provider.py +46 -0
- data_designer/engine/resources/seed_dataset_data_store.py +66 -0
- data_designer/engine/sampling_gen/column.py +89 -0
- data_designer/engine/sampling_gen/constraints.py +95 -0
- data_designer/engine/sampling_gen/data_sources/base.py +214 -0
- data_designer/engine/sampling_gen/data_sources/errors.py +10 -0
- data_designer/engine/sampling_gen/data_sources/sources.py +342 -0
- data_designer/engine/sampling_gen/entities/__init__.py +2 -0
- data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
- data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +64 -0
- data_designer/engine/sampling_gen/entities/email_address_utils.py +169 -0
- data_designer/engine/sampling_gen/entities/errors.py +8 -0
- data_designer/engine/sampling_gen/entities/national_id_utils.py +100 -0
- data_designer/engine/sampling_gen/entities/person.py +142 -0
- data_designer/engine/sampling_gen/entities/phone_number.py +122 -0
- data_designer/engine/sampling_gen/errors.py +24 -0
- data_designer/engine/sampling_gen/generator.py +121 -0
- data_designer/engine/sampling_gen/jinja_utils.py +60 -0
- data_designer/engine/sampling_gen/people_gen.py +203 -0
- data_designer/engine/sampling_gen/person_constants.py +54 -0
- data_designer/engine/sampling_gen/schema.py +143 -0
- data_designer/engine/sampling_gen/schema_builder.py +59 -0
- data_designer/engine/sampling_gen/utils.py +40 -0
- data_designer/engine/secret_resolver.py +80 -0
- data_designer/engine/validators/__init__.py +17 -0
- data_designer/engine/validators/base.py +36 -0
- data_designer/engine/validators/local_callable.py +34 -0
- data_designer/engine/validators/python.py +245 -0
- data_designer/engine/validators/remote.py +83 -0
- data_designer/engine/validators/sql.py +60 -0
- data_designer/errors.py +5 -0
- data_designer/essentials/__init__.py +137 -0
- data_designer/interface/__init__.py +2 -0
- data_designer/interface/data_designer.py +351 -0
- data_designer/interface/errors.py +16 -0
- data_designer/interface/results.py +55 -0
- data_designer/logging.py +161 -0
- data_designer/plugin_manager.py +83 -0
- data_designer/plugins/__init__.py +6 -0
- data_designer/plugins/errors.py +10 -0
- data_designer/plugins/plugin.py +69 -0
- data_designer/plugins/registry.py +86 -0
- data_designer-0.1.0.dist-info/METADATA +173 -0
- data_designer-0.1.0.dist-info/RECORD +177 -0
- data_designer-0.1.0.dist-info/WHEEL +4 -0
- data_designer-0.1.0.dist-info/entry_points.txt +2 -0
- data_designer-0.1.0.dist-info/licenses/LICENSE +201 -0
data_designer/config/base.py
@@ -0,0 +1,68 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

from pathlib import Path
from typing import Any, Optional, Union

from pydantic import BaseModel, ConfigDict
import yaml

from .utils.io_helpers import serialize_data


class ConfigBase(BaseModel):
    model_config = ConfigDict(
        protected_namespaces=(),
        use_enum_values=True,
        arbitrary_types_allowed=True,
        extra="forbid",
    )


class ExportableConfigBase(ConfigBase):
    def to_dict(self) -> dict[str, Any]:
        """Convert the configuration to a dictionary.

        Returns:
            A dictionary representation of the configuration using JSON-compatible
            serialization.
        """
        return self.model_dump(mode="json")

    def to_yaml(self, path: Optional[Union[str, Path]] = None, *, indent: Optional[int] = 2, **kwargs) -> Optional[str]:
        """Convert the configuration to a YAML string or file.

        Args:
            path: Optional file path to write the YAML to. If None, returns the
                YAML string instead of writing to file.
            indent: Number of spaces for YAML indentation. Defaults to 2.
            **kwargs: Additional keyword arguments passed to yaml.dump().

        Returns:
            The YAML string if path is None, otherwise None (file is written).
        """
        yaml_str = yaml.dump(self.to_dict(), indent=indent, **kwargs)
        if path is None:
            return yaml_str
        with open(path, "w") as f:
            f.write(yaml_str)

    def to_json(self, path: Optional[Union[str, Path]] = None, *, indent: Optional[int] = 2, **kwargs) -> Optional[str]:
        """Convert the configuration to a JSON string or file.

        Args:
            path: Optional file path to write the JSON to. If None, returns the
                JSON string instead of writing to file.
            indent: Number of spaces for JSON indentation. Defaults to 2.
            **kwargs: Additional keyword arguments passed to json.dumps().

        Returns:
            The JSON string if path is None, otherwise None (file is written).
        """
        json_str = serialize_data(self.to_dict(), indent=indent, **kwargs)
        if path is None:
            return json_str
        with open(path, "w") as f:
            f.write(json_str)
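The serialization helpers above return a string when `path` is None and write to disk otherwise. A minimal usage sketch follows; the `MyColumnSettings` subclass and its fields are hypothetical, shown only to illustrate the `ExportableConfigBase` API:

```python
# Hypothetical subclass used purely to exercise the serialization helpers.
from data_designer.config.base import ExportableConfigBase


class MyColumnSettings(ExportableConfigBase):
    name: str
    temperature: float = 0.7


settings = MyColumnSettings(name="summary")
print(settings.to_dict())           # {'name': 'summary', 'temperature': 0.7}
print(settings.to_yaml())           # YAML string; no path given, so nothing is written
settings.to_json("settings.json")   # writes JSON to disk and returns None
```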
data_designer/config/column_configs.py
@@ -0,0 +1,354 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from abc import ABC
from typing import Literal, Optional, Type, Union

from pydantic import BaseModel, Field, model_validator
from typing_extensions import Self

from .base import ConfigBase
from .errors import InvalidConfigError
from .models import ImageContext
from .sampler_params import SamplerParamsT, SamplerType
from .utils.code_lang import CodeLang
from .utils.constants import REASONING_TRACE_COLUMN_POSTFIX
from .utils.misc import assert_valid_jinja2_template, get_prompt_template_keywords
from .validator_params import ValidatorParamsT, ValidatorType


class SingleColumnConfig(ConfigBase, ABC):
    """Abstract base class for all single-column configuration types.

    This class serves as the foundation for all column configurations in DataDesigner,
    defining shared fields and properties across all column types.

    Attributes:
        name: Unique name of the column to be generated.
        drop: If True, the column will be generated but removed from the final dataset.
            Useful for intermediate columns that are dependencies for other columns.
        column_type: Discriminator field that identifies the specific column type.
            Subclasses must override this field to specify the column type with a `Literal` value.
    """

    name: str
    drop: bool = False
    column_type: str

    @property
    def required_columns(self) -> list[str]:
        """Returns a list of column names that must exist before this column can be generated.

        Returns:
            List of column names that this column depends on. Empty list indicates
            no dependencies. Override in subclasses to specify dependencies.
        """
        return []

    @property
    def side_effect_columns(self) -> list[str]:
        """Returns a list of additional columns that this column will create as a side effect.

        Some column types generate additional metadata or auxiliary columns alongside
        the primary column (e.g., reasoning traces for LLM columns).

        Returns:
            List of column names that this column will create as a side effect. Empty list
            indicates no side effect columns. Override in subclasses to specify side effects.
        """
        return []


class SamplerColumnConfig(SingleColumnConfig):
    """Configuration for columns generated using numerical samplers.

    Sampler columns provide efficient data generation using numerical samplers for
    common data types and distributions. Supported samplers include UUID generation,
    datetime/timedelta sampling, person generation, category / subcategory sampling,
    and various statistical distributions (uniform, gaussian, binomial, poisson, scipy).

    Attributes:
        sampler_type: Type of sampler to use. Available types include:
            "uuid", "category", "subcategory", "uniform", "gaussian", "bernoulli",
            "bernoulli_mixture", "binomial", "poisson", "scipy", "person", "datetime", "timedelta".
        params: Parameters specific to the chosen sampler type. Type varies based on the `sampler_type`
            (e.g., `CategorySamplerParams`, `UniformSamplerParams`, `PersonSamplerParams`).
        conditional_params: Optional dictionary for conditional parameters. The dict keys
            are the conditions that must be met (e.g., "age > 21") for the conditional parameters
            to be used. The values of dict are the parameters to use when the condition is met.
        convert_to: Optional type conversion to apply after sampling. Must be one of "float", "int", or "str".
            Useful for converting numerical samples to strings or other types.
        column_type: Discriminator field, always "sampler" for this configuration type.

    !!! tip "Displaying available samplers and their parameters"
        The config builder has an `info` attribute that can be used to display the
        available samplers and their parameters:
        ```python
        config_builder.info.display("samplers")
        ```
    """

    sampler_type: SamplerType
    params: SamplerParamsT
    conditional_params: dict[str, SamplerParamsT] = {}
    convert_to: Optional[str] = None
    column_type: Literal["sampler"] = "sampler"


class LLMTextColumnConfig(SingleColumnConfig):
    """Configuration for text generation columns using Large Language Models.

    LLM text columns generate free-form text content using language models via LiteLLM.
    Prompts support Jinja2 templating to reference values from other columns, enabling
    context-aware generation. The generated text can optionally include reasoning traces
    when models support extended thinking.

    Attributes:
        prompt: Prompt template for text generation. Supports Jinja2 syntax to
            reference other columns (e.g., "Write a story about {{ character_name }}").
            Must be a valid Jinja2 template.
        model_alias: Alias of the model configuration to use for generation.
            Must match a model alias defined when initializing the DataDesignerConfigBuilder.
        system_prompt: Optional system prompt to set model behavior and constraints.
            Also supports Jinja2 templating. If provided, must be a valid Jinja2 template.
            Do not put any output parsing instructions in the system prompt. Instead,
            use the appropriate column type for the output you want to generate - e.g.,
            `LLMStructuredColumnConfig` for structured output, `LLMCodeColumnConfig` for code.
        multi_modal_context: Optional list of image contexts for multi-modal generation.
            Enables vision-capable models to generate text based on image inputs.
        column_type: Discriminator field, always "llm-text" for this configuration type.
    """

    prompt: str
    model_alias: str
    system_prompt: Optional[str] = None
    multi_modal_context: Optional[list[ImageContext]] = None
    column_type: Literal["llm-text"] = "llm-text"

    @property
    def required_columns(self) -> list[str]:
        """Get columns referenced in the prompt and system_prompt templates.

        Returns:
            List of unique column names referenced in Jinja2 templates.
        """
        required_cols = list(get_prompt_template_keywords(self.prompt))
        if self.system_prompt:
            required_cols.extend(list(get_prompt_template_keywords(self.system_prompt)))
        return list(set(required_cols))

    @property
    def side_effect_columns(self) -> list[str]:
        """Returns the reasoning trace column, which may be generated alongside the main column.

        Reasoning traces are only returned if the served model parses and returns reasoning content.

        Returns:
            List containing the reasoning trace column name.
        """
        return [f"{self.name}{REASONING_TRACE_COLUMN_POSTFIX}"]

    @model_validator(mode="after")
    def assert_prompt_valid_jinja(self) -> Self:
        """Validate that prompt and system_prompt are valid Jinja2 templates.

        Returns:
            The validated instance.

        Raises:
            InvalidConfigError: If prompt or system_prompt contains invalid Jinja2 syntax.
        """
        assert_valid_jinja2_template(self.prompt)
        if self.system_prompt:
            assert_valid_jinja2_template(self.system_prompt)
        return self


class LLMCodeColumnConfig(LLMTextColumnConfig):
    """Configuration for code generation columns using Large Language Models.

    Extends LLMTextColumnConfig to generate code snippets in specific programming languages
    or SQL dialects. The generated code is automatically extracted from markdown code blocks
    for the specified language. Inherits all prompt templating capabilities.

    Attributes:
        code_lang: Programming language or SQL dialect for code generation. Supported
            values include: "python", "javascript", "typescript", "java", "kotlin", "go",
            "rust", "ruby", "scala", "swift", "sql:sqlite", "sql:postgres", "sql:mysql",
            "sql:tsql", "sql:bigquery", "sql:ansi". See CodeLang enum for complete list.
        column_type: Discriminator field, always "llm-code" for this configuration type.
    """

    code_lang: CodeLang
    column_type: Literal["llm-code"] = "llm-code"


class LLMStructuredColumnConfig(LLMTextColumnConfig):
    """Configuration for structured JSON generation columns using Large Language Models.

    Extends LLMTextColumnConfig to generate structured data conforming to a specified schema.
    Uses JSON schema or Pydantic models to define the expected output structure, enabling
    type-safe and validated structured output generation. Inherits prompt templating capabilities.

    Attributes:
        output_format: The schema defining the expected output structure. Can be either:
            - A Pydantic BaseModel class (recommended)
            - A JSON schema dictionary
        column_type: Discriminator field, always "llm-structured" for this configuration type.
    """

    output_format: Union[dict, Type[BaseModel]]
    column_type: Literal["llm-structured"] = "llm-structured"

    @model_validator(mode="after")
    def validate_output_format(self) -> Self:
        """Convert Pydantic model to JSON schema if needed.

        Returns:
            The validated instance with output_format as a JSON schema dict.
        """
        if not isinstance(self.output_format, dict) and issubclass(self.output_format, BaseModel):
            self.output_format = self.output_format.model_json_schema()
        return self


class Score(ConfigBase):
    """Configuration for a "score" in an LLM judge evaluation.

    Defines a single scoring criterion with its possible values and descriptions. Multiple
    Score objects can be combined in an LLMJudgeColumnConfig to create multi-dimensional
    quality assessments.

    Attributes:
        name: A clear, concise name for this scoring dimension (e.g., "Relevance", "Fluency").
        description: An informative and detailed assessment guide explaining how to evaluate
            this dimension. Should provide clear criteria for scoring.
        options: Dictionary mapping score values to their descriptions. Keys can be integers
            (e.g., 1-5 scale) or strings (e.g., "Poor", "Good", "Excellent"). Values are
            descriptions explaining what each score level means.
    """

    name: str = Field(..., description="A clear name for this score.")
    description: str = Field(..., description="An informative and detailed assessment guide for using this score.")
    options: dict[Union[int, str], str] = Field(..., description="Score options in the format of {score: description}.")


class LLMJudgeColumnConfig(LLMTextColumnConfig):
    """Configuration for LLM-as-a-judge quality assessment and scoring columns.

    Extends LLMTextColumnConfig to create judge columns that evaluate and score other
    generated content based on the defined criteria. Useful for quality assessment, preference
    ranking, and multi-dimensional evaluation of generated data.

    Attributes:
        scores: List of Score objects defining the evaluation dimensions. Each score
            represents a different aspect to evaluate (e.g., accuracy, relevance, fluency).
            Must contain at least one score.
        column_type: Discriminator field, always "llm-judge" for this configuration type.
    """

    scores: list[Score] = Field(..., min_length=1)
    column_type: Literal["llm-judge"] = "llm-judge"


class ExpressionColumnConfig(SingleColumnConfig):
    """Configuration for derived columns using Jinja2 expressions.

    Expression columns compute values by evaluating Jinja2 templates that reference other
    columns. Useful for transformations, concatenations, conditional logic, and derived
    features without requiring LLM generation. The expression is evaluated row-by-row.

    Attributes:
        expr: Jinja2 expression to evaluate. Can reference other column values using
            {{ column_name }} syntax. Supports filters, conditionals, and arithmetic.
            Must be a valid, non-empty Jinja2 template.
        dtype: Data type to cast the result to. Must be one of "int", "float", "str", or "bool".
            Defaults to "str". Type conversion is applied after expression evaluation.
        column_type: Discriminator field, always "expression" for this configuration type.
    """

    name: str
    expr: str
    dtype: Literal["int", "float", "str", "bool"] = "str"
    column_type: Literal["expression"] = "expression"

    @property
    def required_columns(self) -> list[str]:
        """Returns the columns referenced in the expression template."""
        return list(get_prompt_template_keywords(self.expr))

    @model_validator(mode="after")
    def assert_expression_valid_jinja(self) -> Self:
        """Validate that the expression is a valid, non-empty Jinja2 template.

        Returns:
            The validated instance.

        Raises:
            InvalidConfigError: If expression is empty or contains invalid Jinja2 syntax.
        """
        if not self.expr.strip():
            raise InvalidConfigError(
                f"🛑 Expression column '{self.name}' has an empty or whitespace-only expression. "
                f"Please provide a valid Jinja2 expression (e.g., '{{ column_name }}' or '{{ col1 }} + {{ col2 }}') "
                "or remove this column if not needed."
            )
        assert_valid_jinja2_template(self.expr)
        return self


class ValidationColumnConfig(SingleColumnConfig):
    """Configuration for validation columns that validate existing columns.

    Validation columns execute validation logic against specified target columns and return
    structured results indicating pass/fail status with validation details. Supports multiple
    validation strategies: code execution (Python/SQL), local callable functions (library only),
    and remote HTTP endpoints.

    Attributes:
        target_columns: List of column names to validate. These columns are passed to the
            validator for validation. All target columns must exist in the dataset
            before validation runs.
        validator_type: The type of validator to use. Options:
            - "code": Execute code (Python or SQL) for validation. The code receives a
              DataFrame with target columns and must return a DataFrame with validation results.
            - "local_callable": Call a local Python function with the data. Only supported
              when running DataDesigner locally.
            - "remote": Send data to a remote HTTP endpoint for validation. Useful for
        validator_params: Parameters specific to the validator type. Type varies by validator:
            - CodeValidatorParams: Specifies code language (python or SQL dialect like
              "sql:postgres", "sql:mysql").
            - LocalCallableValidatorParams: Provides validation function (Callable[[pd.DataFrame],
              pd.DataFrame]) and optional output schema for validation results.
            - RemoteValidatorParams: Configures endpoint URL, HTTP timeout, retry behavior
              (max_retries, retry_backoff), and parallel request limits (max_parallel_requests).
        batch_size: Number of records to process in each validation batch. Defaults to 10.
            Larger batches are more efficient but use more memory. Adjust based on validator
            complexity and available resources.
        column_type: Discriminator field, always "validation" for this configuration type.
    """

    target_columns: list[str]
    validator_type: ValidatorType
    validator_params: ValidatorParamsT
    batch_size: int = Field(default=10, ge=1, description="Number of records to process in each batch")
    column_type: Literal["validation"] = "validation"

    @property
    def required_columns(self) -> list[str]:
        """Returns the columns that need to be validated."""
        return self.target_columns


class SeedDatasetColumnConfig(SingleColumnConfig):
    """Configuration for columns sourced from seed datasets.

    This config marks columns that come from seed data. It is typically created
    automatically when calling `with_seed_dataset()` on the builder, rather than
    being instantiated directly by users.

    Attributes:
        column_type: Discriminator field, always "seed-dataset" for this configuration type.
    """

    column_type: Literal["seed-dataset"] = "seed-dataset"
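To make the relationships among these config classes concrete, here is a small construction sketch using only fields defined in this file. The column names, prompts, and the "nim-llama" model alias are placeholders; a real alias must match one registered with the DataDesignerConfigBuilder, and the exact reasoning-trace column name depends on `REASONING_TRACE_COLUMN_POSTFIX`:

```python
# Illustrative sketch: build a text column, a judge column that scores it, and a
# derived expression column. Names and the model alias are placeholders.
from data_designer.config.column_configs import (
    ExpressionColumnConfig,
    LLMJudgeColumnConfig,
    LLMTextColumnConfig,
    Score,
)

review = LLMTextColumnConfig(
    name="review",
    model_alias="nim-llama",  # placeholder alias
    prompt="Write a short product review for {{ product_name }}.",
)
quality = LLMJudgeColumnConfig(
    name="review_quality",
    model_alias="nim-llama",
    prompt="Evaluate the following review: {{ review }}",
    scores=[
        Score(
            name="Fluency",
            description="How natural and grammatical the review reads.",
            options={1: "Poor", 3: "Acceptable", 5: "Excellent"},
        )
    ],
)
review_upper = ExpressionColumnConfig(name="review_upper", expr="{{ review | upper }}")

print(review.required_columns)       # ['product_name']
print(quality.required_columns)      # ['review']
print(review.side_effect_columns)    # ['review' + REASONING_TRACE_COLUMN_POSTFIX]
```

The `required_columns` properties are what the dataset builder uses to order generation, so the judge and expression columns above would only run after `review` exists.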
data_designer/config/column_types.py
@@ -0,0 +1,168 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from typing import Union

from typing_extensions import TypeAlias

from ..plugin_manager import PluginManager
from .column_configs import (
    ExpressionColumnConfig,
    LLMCodeColumnConfig,
    LLMJudgeColumnConfig,
    LLMStructuredColumnConfig,
    LLMTextColumnConfig,
    SamplerColumnConfig,
    SeedDatasetColumnConfig,
    ValidationColumnConfig,
)
from .errors import InvalidColumnTypeError, InvalidConfigError
from .sampler_params import SamplerType
from .utils.type_helpers import SAMPLER_PARAMS, create_str_enum_from_discriminated_type_union, resolve_string_enum

plugin_manager = PluginManager()

ColumnConfigT: TypeAlias = Union[
    ExpressionColumnConfig,
    LLMCodeColumnConfig,
    LLMJudgeColumnConfig,
    LLMStructuredColumnConfig,
    LLMTextColumnConfig,
    SamplerColumnConfig,
    SeedDatasetColumnConfig,
    ValidationColumnConfig,
]
ColumnConfigT = plugin_manager.inject_into_column_config_type_union(ColumnConfigT)

DataDesignerColumnType = create_str_enum_from_discriminated_type_union(
    enum_name="DataDesignerColumnType",
    type_union=ColumnConfigT,
    discriminator_field_name="column_type",
)

COLUMN_TYPE_EMOJI_MAP = {
    "general": "⚛️",  # possible analysis column type
    DataDesignerColumnType.EXPRESSION: "🧩",
    DataDesignerColumnType.LLM_CODE: "💻",
    DataDesignerColumnType.LLM_JUDGE: "⚖️",
    DataDesignerColumnType.LLM_STRUCTURED: "🗂️",
    DataDesignerColumnType.LLM_TEXT: "📝",
    DataDesignerColumnType.SEED_DATASET: "🌱",
    DataDesignerColumnType.SAMPLER: "🎲",
    DataDesignerColumnType.VALIDATION: "🔍",
}
COLUMN_TYPE_EMOJI_MAP.update(
    {DataDesignerColumnType(p.name): p.emoji for p in plugin_manager.get_column_generator_plugins()}
)


def column_type_used_in_execution_dag(column_type: Union[str, DataDesignerColumnType]) -> bool:
    """Return True if the column type is used in the workflow execution DAG."""
    column_type = resolve_string_enum(column_type, DataDesignerColumnType)
    dag_column_types = {
        DataDesignerColumnType.EXPRESSION,
        DataDesignerColumnType.LLM_CODE,
        DataDesignerColumnType.LLM_JUDGE,
        DataDesignerColumnType.LLM_STRUCTURED,
        DataDesignerColumnType.LLM_TEXT,
        DataDesignerColumnType.VALIDATION,
    }
    dag_column_types.update(plugin_manager.get_plugin_column_types(DataDesignerColumnType))
    return column_type in dag_column_types


def column_type_is_llm_generated(column_type: Union[str, DataDesignerColumnType]) -> bool:
    """Return True if the column type is an LLM-generated column."""
    column_type = resolve_string_enum(column_type, DataDesignerColumnType)
    llm_generated_column_types = {
        DataDesignerColumnType.LLM_TEXT,
        DataDesignerColumnType.LLM_CODE,
        DataDesignerColumnType.LLM_STRUCTURED,
        DataDesignerColumnType.LLM_JUDGE,
    }
    llm_generated_column_types.update(
        plugin_manager.get_plugin_column_types(
            DataDesignerColumnType,
            required_resources=["model_registry"],
        )
    )
    return column_type in llm_generated_column_types


def get_column_config_from_kwargs(name: str, column_type: DataDesignerColumnType, **kwargs) -> ColumnConfigT:
    """Create a Data Designer column config object from kwargs.

    Args:
        name: Name of the column.
        column_type: Type of the column.
        **kwargs: Keyword arguments to pass to the column constructor.

    Returns:
        Data Designer column object of the appropriate type.
    """
    column_type = resolve_string_enum(column_type, DataDesignerColumnType)
    if column_type == DataDesignerColumnType.LLM_TEXT:
        return LLMTextColumnConfig(name=name, **kwargs)
    if column_type == DataDesignerColumnType.LLM_CODE:
        return LLMCodeColumnConfig(name=name, **kwargs)
    if column_type == DataDesignerColumnType.LLM_STRUCTURED:
        return LLMStructuredColumnConfig(name=name, **kwargs)
    if column_type == DataDesignerColumnType.LLM_JUDGE:
        return LLMJudgeColumnConfig(name=name, **kwargs)
    if column_type == DataDesignerColumnType.VALIDATION:
        return ValidationColumnConfig(name=name, **kwargs)
    if column_type == DataDesignerColumnType.EXPRESSION:
        return ExpressionColumnConfig(name=name, **kwargs)
    if column_type == DataDesignerColumnType.SAMPLER:
        return SamplerColumnConfig(name=name, **_resolve_sampler_kwargs(name, kwargs))
    if column_type == DataDesignerColumnType.SEED_DATASET:
        return SeedDatasetColumnConfig(name=name, **kwargs)
    if plugin := plugin_manager.get_column_generator_plugin_if_exists(column_type.value):
        return plugin.config_cls(name=name, **kwargs)
    raise InvalidColumnTypeError(f"🛑 {column_type} is not a valid column type.")  # pragma: no cover


def get_column_display_order() -> list[DataDesignerColumnType]:
    """Return the preferred display order of the column types."""
    display_order = [
        DataDesignerColumnType.SEED_DATASET,
        DataDesignerColumnType.SAMPLER,
        DataDesignerColumnType.LLM_TEXT,
        DataDesignerColumnType.LLM_CODE,
        DataDesignerColumnType.LLM_STRUCTURED,
        DataDesignerColumnType.LLM_JUDGE,
        DataDesignerColumnType.VALIDATION,
        DataDesignerColumnType.EXPRESSION,
    ]
    display_order.extend(plugin_manager.get_plugin_column_types(DataDesignerColumnType))
    return display_order


def _resolve_sampler_kwargs(name: str, kwargs: dict) -> dict:
    if "sampler_type" not in kwargs:
        raise InvalidConfigError(f"🛑 `sampler_type` is required for sampler column '{name}'.")
    sampler_type = resolve_string_enum(kwargs["sampler_type"], SamplerType)

    # Handle params - it could be a dict or already a concrete object
    params_value = kwargs.get("params", {})
    expected_params_class = SAMPLER_PARAMS[sampler_type.value]

    if isinstance(params_value, expected_params_class):
        # params is already a concrete object of the right type
        params = params_value
    elif isinstance(params_value, dict):
        # params is a dictionary, create new instance
        params = expected_params_class(**params_value)
    else:
        # params is neither dict nor expected type
        raise InvalidConfigError(
            f"🛑 Invalid params for sampler column '{name}'. "
            f"Expected a dictionary or an instance of {expected_params_class.__name__}. "
            f"You provided {params_value=}."
        )

    return {
        "sampler_type": sampler_type,
        "params": params,
        **{k: v for k, v in kwargs.items() if k not in ["sampler_type", "params"]},
    }
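The factory and predicate helpers above dispatch on the `column_type` discriminator. A short usage sketch under stated assumptions: the prompt and the "nim-llama" alias are placeholders, and the expected outputs assume no plugins have registered additional column types:

```python
# Illustrative use of the column-type factory and predicates.
from data_designer.config.column_types import (
    DataDesignerColumnType,
    column_type_is_llm_generated,
    column_type_used_in_execution_dag,
    get_column_config_from_kwargs,
)

config = get_column_config_from_kwargs(
    name="summary",
    column_type=DataDesignerColumnType.LLM_TEXT,
    prompt="Summarize: {{ document }}",
    model_alias="nim-llama",  # placeholder alias
)
print(type(config).__name__)  # LLMTextColumnConfig

# LLM text columns are both LLM-generated and part of the execution DAG;
# sampler columns are in neither set defined in this module.
print(column_type_is_llm_generated(DataDesignerColumnType.LLM_TEXT))      # True
print(column_type_used_in_execution_dag(DataDesignerColumnType.SAMPLER))  # False
```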