data-designer-config 0.4.0rc2__tar.gz → 0.5.0rc1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/PKG-INFO +1 -1
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/pyproject.toml +11 -6
- data_designer_config-0.5.0rc1/src/data_designer/config/__init__.py +226 -0
- data_designer_config-0.5.0rc1/src/data_designer/config/base.py +67 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/column_configs.py +149 -57
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/column_types.py +5 -1
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/config_builder.py +103 -3
- data_designer_config-0.5.0rc1/src/data_designer/config/custom_column.py +64 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/data_designer_config.py +5 -1
- data_designer_config-0.4.0rc2/src/data_designer/config/base.py → data_designer_config-0.5.0rc1/src/data_designer/config/exportable_config.py +1 -11
- data_designer_config-0.5.0rc1/src/data_designer/config/mcp.py +109 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/run_config.py +1 -1
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/utils/code_lang.py +13 -2
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/utils/constants.py +10 -1
- data_designer_config-0.5.0rc1/src/data_designer/config/utils/trace_type.py +24 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/utils/visualization.py +6 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/logging.py +15 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/test_columns.py +67 -1
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/test_config_builder.py +141 -0
- data_designer_config-0.5.0rc1/tests/config/test_mcp.py +53 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/utils/test_code_lang.py +1 -1
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/conftest.py +0 -1
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/test_logging.py +51 -0
- data_designer_config-0.4.0rc2/src/data_designer/config/__init__.py +0 -149
- data_designer_config-0.4.0rc2/src/data_designer/config/_version.py +0 -34
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/.gitignore +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/README.md +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/analysis/__init__.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/analysis/column_profilers.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/analysis/column_statistics.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/analysis/dataset_profiler.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/analysis/utils/errors.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/analysis/utils/reporting.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/dataset_builders.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/dataset_metadata.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/default_model_settings.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/errors.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/interface.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/models.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/preview_results.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/processors.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/sampler_constraints.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/sampler_params.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/seed.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/seed_source.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/seed_source_types.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/testing/__init__.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/testing/fixtures.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/utils/errors.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/utils/info.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/utils/io_helpers.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/utils/misc.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/utils/numerical_helpers.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/utils/type_helpers.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/validator_params.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/errors.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/lazy_heavy_imports.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/plugin_manager.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/plugins/__init__.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/plugins/errors.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/plugins/plugin.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/plugins/registry.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/analysis/conftest.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/analysis/test_column_statistics.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/analysis/test_dataset_profiler_results.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/analysis/utils/test_reporting.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/test_data_designer_config.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/test_default_model_settings.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/test_models.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/test_processors.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/test_sampler_constraints.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/test_sampler_params.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/test_seed.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/test_seed_source.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/test_validator_params.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/utils/__init__.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/utils/test_info.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/utils/test_io_helpers.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/utils/test_misc.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/utils/test_type_helpers.py +0 -0
- {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/utils/test_visualization.py +0 -0
|
@@ -31,16 +31,16 @@ dependencies = [
|
|
|
31
31
|
]
|
|
32
32
|
|
|
33
33
|
[build-system]
|
|
34
|
-
requires = ["hatchling", "
|
|
34
|
+
requires = ["hatchling", "uv-dynamic-versioning>=0.7.0"]
|
|
35
35
|
build-backend = "hatchling.build"
|
|
36
36
|
|
|
37
37
|
[tool.hatch.version]
|
|
38
|
-
source = "
|
|
39
|
-
fallback-version = "0.1.0.dev0"
|
|
40
|
-
raw-options = { root = "../.." }
|
|
38
|
+
source = "uv-dynamic-versioning"
|
|
41
39
|
|
|
42
|
-
[tool.
|
|
43
|
-
|
|
40
|
+
[tool.uv-dynamic-versioning]
|
|
41
|
+
vcs = "git"
|
|
42
|
+
style = "pep440"
|
|
43
|
+
bump = true
|
|
44
44
|
|
|
45
45
|
[tool.hatch.build.targets.wheel]
|
|
46
46
|
packages = ["src/data_designer"]
|
|
@@ -48,5 +48,10 @@ packages = ["src/data_designer"]
|
|
|
48
48
|
[tool.ruff]
|
|
49
49
|
extend = "../../pyproject.toml"
|
|
50
50
|
|
|
51
|
+
[tool.pytest.ini_options]
|
|
52
|
+
testpaths = ["tests"]
|
|
53
|
+
asyncio_default_fixture_loop_scope = "session"
|
|
54
|
+
env = ["DISABLE_DATA_DESIGNER_PLUGINS=true"]
|
|
55
|
+
|
|
51
56
|
[tool.uv]
|
|
52
57
|
package = true
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import importlib
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
# These imports are for IDE autocomplete and type checking only.
|
|
11
|
+
# At runtime, __getattr__ lazily loads the actual objects.
|
|
12
|
+
from data_designer.config.analysis.column_profilers import ( # noqa: F401
|
|
13
|
+
JudgeScoreProfilerConfig,
|
|
14
|
+
)
|
|
15
|
+
from data_designer.config.column_configs import ( # noqa: F401
|
|
16
|
+
CustomColumnConfig,
|
|
17
|
+
EmbeddingColumnConfig,
|
|
18
|
+
ExpressionColumnConfig,
|
|
19
|
+
GenerationStrategy,
|
|
20
|
+
LLMCodeColumnConfig,
|
|
21
|
+
LLMJudgeColumnConfig,
|
|
22
|
+
LLMStructuredColumnConfig,
|
|
23
|
+
LLMTextColumnConfig,
|
|
24
|
+
SamplerColumnConfig,
|
|
25
|
+
Score,
|
|
26
|
+
SeedDatasetColumnConfig,
|
|
27
|
+
ValidationColumnConfig,
|
|
28
|
+
)
|
|
29
|
+
from data_designer.config.column_types import DataDesignerColumnType # noqa: F401
|
|
30
|
+
from data_designer.config.config_builder import DataDesignerConfigBuilder # noqa: F401
|
|
31
|
+
from data_designer.config.custom_column import custom_column_generator # noqa: F401
|
|
32
|
+
from data_designer.config.data_designer_config import DataDesignerConfig # noqa: F401
|
|
33
|
+
from data_designer.config.dataset_builders import BuildStage # noqa: F401
|
|
34
|
+
from data_designer.config.mcp import ( # noqa: F401
|
|
35
|
+
LocalStdioMCPProvider,
|
|
36
|
+
MCPProvider,
|
|
37
|
+
ToolConfig,
|
|
38
|
+
)
|
|
39
|
+
from data_designer.config.models import ( # noqa: F401
|
|
40
|
+
ChatCompletionInferenceParams,
|
|
41
|
+
EmbeddingInferenceParams,
|
|
42
|
+
GenerationType,
|
|
43
|
+
ImageContext,
|
|
44
|
+
ImageFormat,
|
|
45
|
+
ManualDistribution,
|
|
46
|
+
ManualDistributionParams,
|
|
47
|
+
Modality,
|
|
48
|
+
ModalityContext,
|
|
49
|
+
ModalityDataType,
|
|
50
|
+
ModelConfig,
|
|
51
|
+
ModelProvider,
|
|
52
|
+
UniformDistribution,
|
|
53
|
+
UniformDistributionParams,
|
|
54
|
+
)
|
|
55
|
+
from data_designer.config.processors import ( # noqa: F401
|
|
56
|
+
DropColumnsProcessorConfig,
|
|
57
|
+
ProcessorType,
|
|
58
|
+
SchemaTransformProcessorConfig,
|
|
59
|
+
)
|
|
60
|
+
from data_designer.config.run_config import RunConfig # noqa: F401
|
|
61
|
+
from data_designer.config.sampler_constraints import ( # noqa: F401
|
|
62
|
+
ColumnInequalityConstraint,
|
|
63
|
+
ScalarInequalityConstraint,
|
|
64
|
+
)
|
|
65
|
+
from data_designer.config.sampler_params import ( # noqa: F401
|
|
66
|
+
BernoulliMixtureSamplerParams,
|
|
67
|
+
BernoulliSamplerParams,
|
|
68
|
+
BinomialSamplerParams,
|
|
69
|
+
CategorySamplerParams,
|
|
70
|
+
DatetimeSamplerParams,
|
|
71
|
+
GaussianSamplerParams,
|
|
72
|
+
PersonFromFakerSamplerParams,
|
|
73
|
+
PersonSamplerParams,
|
|
74
|
+
PoissonSamplerParams,
|
|
75
|
+
SamplerType,
|
|
76
|
+
ScipySamplerParams,
|
|
77
|
+
SubcategorySamplerParams,
|
|
78
|
+
TimeDeltaSamplerParams,
|
|
79
|
+
UniformSamplerParams,
|
|
80
|
+
UUIDSamplerParams,
|
|
81
|
+
)
|
|
82
|
+
from data_designer.config.seed import ( # noqa: F401
|
|
83
|
+
IndexRange,
|
|
84
|
+
PartitionBlock,
|
|
85
|
+
SamplingStrategy,
|
|
86
|
+
SeedConfig,
|
|
87
|
+
)
|
|
88
|
+
from data_designer.config.seed_source import ( # noqa: F401
|
|
89
|
+
DataFrameSeedSource,
|
|
90
|
+
HuggingFaceSeedSource,
|
|
91
|
+
LocalFileSeedSource,
|
|
92
|
+
)
|
|
93
|
+
from data_designer.config.utils.code_lang import CodeLang # noqa: F401
|
|
94
|
+
from data_designer.config.utils.info import InfoType # noqa: F401
|
|
95
|
+
from data_designer.config.utils.trace_type import TraceType # noqa: F401
|
|
96
|
+
from data_designer.config.validator_params import ( # noqa: F401
|
|
97
|
+
CodeValidatorParams,
|
|
98
|
+
LocalCallableValidatorParams,
|
|
99
|
+
RemoteValidatorParams,
|
|
100
|
+
ValidatorType,
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
# Base module path and submodule paths for lazy imports
|
|
104
|
+
_MOD_BASE = "data_designer.config"
|
|
105
|
+
_MOD_COLUMN_CONFIGS = f"{_MOD_BASE}.column_configs"
|
|
106
|
+
_MOD_MCP = f"{_MOD_BASE}.mcp"
|
|
107
|
+
_MOD_MODELS = f"{_MOD_BASE}.models"
|
|
108
|
+
_MOD_PROCESSORS = f"{_MOD_BASE}.processors"
|
|
109
|
+
_MOD_SAMPLER_CONSTRAINTS = f"{_MOD_BASE}.sampler_constraints"
|
|
110
|
+
_MOD_SAMPLER_PARAMS = f"{_MOD_BASE}.sampler_params"
|
|
111
|
+
_MOD_SEED = f"{_MOD_BASE}.seed"
|
|
112
|
+
_MOD_SEED_SOURCE = f"{_MOD_BASE}.seed_source"
|
|
113
|
+
_MOD_VALIDATOR_PARAMS = f"{_MOD_BASE}.validator_params"
|
|
114
|
+
_MOD_UTILS = f"{_MOD_BASE}.utils"
|
|
115
|
+
|
|
116
|
+
# Mapping of export names to (module_path, attribute_name) for lazy loading
|
|
117
|
+
_LAZY_IMPORTS: dict[str, tuple[str, str]] = {
|
|
118
|
+
# analysis.column_profilers
|
|
119
|
+
"JudgeScoreProfilerConfig": (f"{_MOD_BASE}.analysis.column_profilers", "JudgeScoreProfilerConfig"),
|
|
120
|
+
# column_configs
|
|
121
|
+
"CustomColumnConfig": (_MOD_COLUMN_CONFIGS, "CustomColumnConfig"),
|
|
122
|
+
"EmbeddingColumnConfig": (_MOD_COLUMN_CONFIGS, "EmbeddingColumnConfig"),
|
|
123
|
+
"ExpressionColumnConfig": (_MOD_COLUMN_CONFIGS, "ExpressionColumnConfig"),
|
|
124
|
+
"GenerationStrategy": (_MOD_COLUMN_CONFIGS, "GenerationStrategy"),
|
|
125
|
+
"LLMCodeColumnConfig": (_MOD_COLUMN_CONFIGS, "LLMCodeColumnConfig"),
|
|
126
|
+
"LLMJudgeColumnConfig": (_MOD_COLUMN_CONFIGS, "LLMJudgeColumnConfig"),
|
|
127
|
+
"LLMStructuredColumnConfig": (_MOD_COLUMN_CONFIGS, "LLMStructuredColumnConfig"),
|
|
128
|
+
"LLMTextColumnConfig": (_MOD_COLUMN_CONFIGS, "LLMTextColumnConfig"),
|
|
129
|
+
"SamplerColumnConfig": (_MOD_COLUMN_CONFIGS, "SamplerColumnConfig"),
|
|
130
|
+
"Score": (_MOD_COLUMN_CONFIGS, "Score"),
|
|
131
|
+
"SeedDatasetColumnConfig": (_MOD_COLUMN_CONFIGS, "SeedDatasetColumnConfig"),
|
|
132
|
+
"ValidationColumnConfig": (_MOD_COLUMN_CONFIGS, "ValidationColumnConfig"),
|
|
133
|
+
# column_types
|
|
134
|
+
"DataDesignerColumnType": (f"{_MOD_BASE}.column_types", "DataDesignerColumnType"),
|
|
135
|
+
# config_builder
|
|
136
|
+
"DataDesignerConfigBuilder": (f"{_MOD_BASE}.config_builder", "DataDesignerConfigBuilder"),
|
|
137
|
+
# custom_column
|
|
138
|
+
"custom_column_generator": (f"{_MOD_BASE}.custom_column", "custom_column_generator"),
|
|
139
|
+
# data_designer_config
|
|
140
|
+
"DataDesignerConfig": (f"{_MOD_BASE}.data_designer_config", "DataDesignerConfig"),
|
|
141
|
+
# dataset_builders
|
|
142
|
+
"BuildStage": (f"{_MOD_BASE}.dataset_builders", "BuildStage"),
|
|
143
|
+
# mcp
|
|
144
|
+
"LocalStdioMCPProvider": (_MOD_MCP, "LocalStdioMCPProvider"),
|
|
145
|
+
"MCPProvider": (_MOD_MCP, "MCPProvider"),
|
|
146
|
+
"ToolConfig": (_MOD_MCP, "ToolConfig"),
|
|
147
|
+
# models
|
|
148
|
+
"ChatCompletionInferenceParams": (_MOD_MODELS, "ChatCompletionInferenceParams"),
|
|
149
|
+
"EmbeddingInferenceParams": (_MOD_MODELS, "EmbeddingInferenceParams"),
|
|
150
|
+
"GenerationType": (_MOD_MODELS, "GenerationType"),
|
|
151
|
+
"ImageContext": (_MOD_MODELS, "ImageContext"),
|
|
152
|
+
"ImageFormat": (_MOD_MODELS, "ImageFormat"),
|
|
153
|
+
"ManualDistribution": (_MOD_MODELS, "ManualDistribution"),
|
|
154
|
+
"ManualDistributionParams": (_MOD_MODELS, "ManualDistributionParams"),
|
|
155
|
+
"Modality": (_MOD_MODELS, "Modality"),
|
|
156
|
+
"ModalityContext": (_MOD_MODELS, "ModalityContext"),
|
|
157
|
+
"ModalityDataType": (_MOD_MODELS, "ModalityDataType"),
|
|
158
|
+
"ModelConfig": (_MOD_MODELS, "ModelConfig"),
|
|
159
|
+
"ModelProvider": (_MOD_MODELS, "ModelProvider"),
|
|
160
|
+
"UniformDistribution": (_MOD_MODELS, "UniformDistribution"),
|
|
161
|
+
"UniformDistributionParams": (_MOD_MODELS, "UniformDistributionParams"),
|
|
162
|
+
# processors
|
|
163
|
+
"DropColumnsProcessorConfig": (_MOD_PROCESSORS, "DropColumnsProcessorConfig"),
|
|
164
|
+
"ProcessorType": (_MOD_PROCESSORS, "ProcessorType"),
|
|
165
|
+
"SchemaTransformProcessorConfig": (_MOD_PROCESSORS, "SchemaTransformProcessorConfig"),
|
|
166
|
+
# run_config
|
|
167
|
+
"RunConfig": (f"{_MOD_BASE}.run_config", "RunConfig"),
|
|
168
|
+
# sampler_constraints
|
|
169
|
+
"ColumnInequalityConstraint": (_MOD_SAMPLER_CONSTRAINTS, "ColumnInequalityConstraint"),
|
|
170
|
+
"ScalarInequalityConstraint": (_MOD_SAMPLER_CONSTRAINTS, "ScalarInequalityConstraint"),
|
|
171
|
+
# sampler_params
|
|
172
|
+
"BernoulliMixtureSamplerParams": (_MOD_SAMPLER_PARAMS, "BernoulliMixtureSamplerParams"),
|
|
173
|
+
"BernoulliSamplerParams": (_MOD_SAMPLER_PARAMS, "BernoulliSamplerParams"),
|
|
174
|
+
"BinomialSamplerParams": (_MOD_SAMPLER_PARAMS, "BinomialSamplerParams"),
|
|
175
|
+
"CategorySamplerParams": (_MOD_SAMPLER_PARAMS, "CategorySamplerParams"),
|
|
176
|
+
"DatetimeSamplerParams": (_MOD_SAMPLER_PARAMS, "DatetimeSamplerParams"),
|
|
177
|
+
"GaussianSamplerParams": (_MOD_SAMPLER_PARAMS, "GaussianSamplerParams"),
|
|
178
|
+
"PersonFromFakerSamplerParams": (_MOD_SAMPLER_PARAMS, "PersonFromFakerSamplerParams"),
|
|
179
|
+
"PersonSamplerParams": (_MOD_SAMPLER_PARAMS, "PersonSamplerParams"),
|
|
180
|
+
"PoissonSamplerParams": (_MOD_SAMPLER_PARAMS, "PoissonSamplerParams"),
|
|
181
|
+
"SamplerType": (_MOD_SAMPLER_PARAMS, "SamplerType"),
|
|
182
|
+
"ScipySamplerParams": (_MOD_SAMPLER_PARAMS, "ScipySamplerParams"),
|
|
183
|
+
"SubcategorySamplerParams": (_MOD_SAMPLER_PARAMS, "SubcategorySamplerParams"),
|
|
184
|
+
"TimeDeltaSamplerParams": (_MOD_SAMPLER_PARAMS, "TimeDeltaSamplerParams"),
|
|
185
|
+
"UniformSamplerParams": (_MOD_SAMPLER_PARAMS, "UniformSamplerParams"),
|
|
186
|
+
"UUIDSamplerParams": (_MOD_SAMPLER_PARAMS, "UUIDSamplerParams"),
|
|
187
|
+
# seed
|
|
188
|
+
"IndexRange": (_MOD_SEED, "IndexRange"),
|
|
189
|
+
"PartitionBlock": (_MOD_SEED, "PartitionBlock"),
|
|
190
|
+
"SamplingStrategy": (_MOD_SEED, "SamplingStrategy"),
|
|
191
|
+
"SeedConfig": (_MOD_SEED, "SeedConfig"),
|
|
192
|
+
# seed_source
|
|
193
|
+
"DataFrameSeedSource": (_MOD_SEED_SOURCE, "DataFrameSeedSource"),
|
|
194
|
+
"HuggingFaceSeedSource": (_MOD_SEED_SOURCE, "HuggingFaceSeedSource"),
|
|
195
|
+
"LocalFileSeedSource": (_MOD_SEED_SOURCE, "LocalFileSeedSource"),
|
|
196
|
+
# utils
|
|
197
|
+
"CodeLang": (f"{_MOD_UTILS}.code_lang", "CodeLang"),
|
|
198
|
+
"InfoType": (f"{_MOD_UTILS}.info", "InfoType"),
|
|
199
|
+
"TraceType": (f"{_MOD_UTILS}.trace_type", "TraceType"),
|
|
200
|
+
# validator_params
|
|
201
|
+
"CodeValidatorParams": (_MOD_VALIDATOR_PARAMS, "CodeValidatorParams"),
|
|
202
|
+
"LocalCallableValidatorParams": (_MOD_VALIDATOR_PARAMS, "LocalCallableValidatorParams"),
|
|
203
|
+
"RemoteValidatorParams": (_MOD_VALIDATOR_PARAMS, "RemoteValidatorParams"),
|
|
204
|
+
"ValidatorType": (_MOD_VALIDATOR_PARAMS, "ValidatorType"),
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
__all__ = list(_LAZY_IMPORTS.keys())
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def __getattr__(name: str) -> object:
|
|
211
|
+
"""Lazily import config module exports when accessed.
|
|
212
|
+
|
|
213
|
+
This allows fast imports of data_designer.config while deferring loading
|
|
214
|
+
of submodules until they're actually needed.
|
|
215
|
+
"""
|
|
216
|
+
if name in _LAZY_IMPORTS:
|
|
217
|
+
module_path, attr_name = _LAZY_IMPORTS[name]
|
|
218
|
+
module = importlib.import_module(module_path)
|
|
219
|
+
return getattr(module, attr_name)
|
|
220
|
+
|
|
221
|
+
raise AttributeError(f"module 'data_designer.config' has no attribute {name!r}")
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def __dir__() -> list[str]:
|
|
225
|
+
"""Return list of available exports for tab-completion."""
|
|
226
|
+
return __all__
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
# IMPORTANT: This module must NOT import from any data_designer submodules (i.e., data_designer.*).
|
|
5
|
+
# These base abstractions are foundational and should only depend on pydantic and Python builtins.
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from abc import ABC, abstractmethod
|
|
10
|
+
|
|
11
|
+
from pydantic import BaseModel, ConfigDict
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ConfigBase(BaseModel):
|
|
15
|
+
model_config = ConfigDict(
|
|
16
|
+
protected_namespaces=(),
|
|
17
|
+
use_enum_values=True,
|
|
18
|
+
arbitrary_types_allowed=True,
|
|
19
|
+
extra="forbid",
|
|
20
|
+
json_schema_mode_override="validation",
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class SingleColumnConfig(ConfigBase, ABC):
|
|
25
|
+
"""Abstract base class for all single-column configuration types.
|
|
26
|
+
|
|
27
|
+
This class serves as the foundation for all column configurations in DataDesigner,
|
|
28
|
+
defining shared fields and properties across all column types.
|
|
29
|
+
|
|
30
|
+
Attributes:
|
|
31
|
+
name: Unique name of the column to be generated.
|
|
32
|
+
drop: If True, the column will be generated but removed from the final dataset.
|
|
33
|
+
Useful for intermediate columns that are dependencies for other columns.
|
|
34
|
+
column_type: Discriminator field that identifies the specific column type.
|
|
35
|
+
Subclasses must override this field to specify the column type with a `Literal` value.
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
name: str
|
|
39
|
+
drop: bool = False
|
|
40
|
+
column_type: str
|
|
41
|
+
|
|
42
|
+
@staticmethod
|
|
43
|
+
def get_column_emoji() -> str:
|
|
44
|
+
return "🎨"
|
|
45
|
+
|
|
46
|
+
@property
|
|
47
|
+
@abstractmethod
|
|
48
|
+
def required_columns(self) -> list[str]:
|
|
49
|
+
"""Returns a list of column names that must exist before this column can be generated.
|
|
50
|
+
|
|
51
|
+
Returns:
|
|
52
|
+
List of column names that this column depends on. Empty list indicates
|
|
53
|
+
no dependencies. Override in subclasses to specify dependencies.
|
|
54
|
+
"""
|
|
55
|
+
|
|
56
|
+
@property
|
|
57
|
+
@abstractmethod
|
|
58
|
+
def side_effect_columns(self) -> list[str]:
|
|
59
|
+
"""Returns a list of additional columns that this column will create as a side effect.
|
|
60
|
+
|
|
61
|
+
Some column types generate additional metadata or auxiliary columns alongside
|
|
62
|
+
the primary column (e.g., reasoning traces for LLM columns).
|
|
63
|
+
|
|
64
|
+
Returns:
|
|
65
|
+
List of column names that this column will create as a side effect. Empty list
|
|
66
|
+
indicates no side effect columns. Override in subclasses to specify side effects.
|
|
67
|
+
"""
|
|
@@ -3,66 +3,28 @@
|
|
|
3
3
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
|
-
from abc import ABC, abstractmethod
|
|
7
|
-
from typing import Annotated, Literal
|
|
6
|
+
from enum import Enum
|
|
7
|
+
from typing import Annotated, Any, Literal
|
|
8
8
|
|
|
9
|
-
from pydantic import BaseModel, Discriminator, Field, model_validator
|
|
9
|
+
from pydantic import BaseModel, Discriminator, Field, field_serializer, field_validator, model_validator
|
|
10
10
|
from typing_extensions import Self
|
|
11
11
|
|
|
12
|
-
from data_designer.config.base import ConfigBase
|
|
12
|
+
from data_designer.config.base import ConfigBase, SingleColumnConfig
|
|
13
13
|
from data_designer.config.errors import InvalidConfigError
|
|
14
14
|
from data_designer.config.models import ImageContext
|
|
15
15
|
from data_designer.config.sampler_params import SamplerParamsT, SamplerType
|
|
16
16
|
from data_designer.config.utils.code_lang import CodeLang
|
|
17
|
-
from data_designer.config.utils.constants import
|
|
17
|
+
from data_designer.config.utils.constants import REASONING_CONTENT_COLUMN_POSTFIX, TRACE_COLUMN_POSTFIX
|
|
18
18
|
from data_designer.config.utils.misc import assert_valid_jinja2_template, extract_keywords_from_jinja2_template
|
|
19
|
+
from data_designer.config.utils.trace_type import TraceType
|
|
19
20
|
from data_designer.config.validator_params import ValidatorParamsT, ValidatorType
|
|
20
21
|
|
|
21
22
|
|
|
22
|
-
class SingleColumnConfig(ConfigBase, ABC):
|
|
23
|
-
"""
|
|
23
|
+
class GenerationStrategy(str, Enum):
|
|
24
|
+
"""Strategy for custom column generation."""
|
|
24
25
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
Attributes:
|
|
29
|
-
name: Unique name of the column to be generated.
|
|
30
|
-
drop: If True, the column will be generated but removed from the final dataset.
|
|
31
|
-
Useful for intermediate columns that are dependencies for other columns.
|
|
32
|
-
column_type: Discriminator field that identifies the specific column type.
|
|
33
|
-
Subclasses must override this field to specify the column type with a `Literal` value.
|
|
34
|
-
"""
|
|
35
|
-
|
|
36
|
-
name: str
|
|
37
|
-
drop: bool = False
|
|
38
|
-
column_type: str
|
|
39
|
-
|
|
40
|
-
@staticmethod
|
|
41
|
-
def get_column_emoji() -> str:
|
|
42
|
-
return "🎨"
|
|
43
|
-
|
|
44
|
-
@property
|
|
45
|
-
@abstractmethod
|
|
46
|
-
def required_columns(self) -> list[str]:
|
|
47
|
-
"""Returns a list of column names that must exist before this column can be generated.
|
|
48
|
-
|
|
49
|
-
Returns:
|
|
50
|
-
List of column names that this column depends on. Empty list indicates
|
|
51
|
-
no dependencies. Override in subclasses to specify dependencies.
|
|
52
|
-
"""
|
|
53
|
-
|
|
54
|
-
@property
|
|
55
|
-
@abstractmethod
|
|
56
|
-
def side_effect_columns(self) -> list[str]:
|
|
57
|
-
"""Returns a list of additional columns that this column will create as a side effect.
|
|
58
|
-
|
|
59
|
-
Some column types generate additional metadata or auxiliary columns alongside
|
|
60
|
-
the primary column (e.g., reasoning traces for LLM columns).
|
|
61
|
-
|
|
62
|
-
Returns:
|
|
63
|
-
List of column names that this column will create as a side effect. Empty list
|
|
64
|
-
indicates no side effect columns. Override in subclasses to specify side effects.
|
|
65
|
-
"""
|
|
26
|
+
CELL_BY_CELL = "cell_by_cell"
|
|
27
|
+
FULL_COLUMN = "full_column"
|
|
66
28
|
|
|
67
29
|
|
|
68
30
|
class SamplerColumnConfig(SingleColumnConfig):
|
|
@@ -143,8 +105,8 @@ class LLMTextColumnConfig(SingleColumnConfig):
|
|
|
143
105
|
|
|
144
106
|
LLM text columns generate free-form text content using language models via LiteLLM.
|
|
145
107
|
Prompts support Jinja2 templating to reference values from other columns, enabling
|
|
146
|
-
context-aware generation. The generated text can optionally include
|
|
147
|
-
|
|
108
|
+
context-aware generation. The generated text can optionally include message traces
|
|
109
|
+
capturing the full conversation history.
|
|
148
110
|
|
|
149
111
|
Attributes:
|
|
150
112
|
prompt: Prompt template for text generation. Supports Jinja2 syntax to
|
|
@@ -159,6 +121,18 @@ class LLMTextColumnConfig(SingleColumnConfig):
|
|
|
159
121
|
`LLMStructuredColumnConfig` for structured output, `LLMCodeColumnConfig` for code.
|
|
160
122
|
multi_modal_context: Optional list of image contexts for multi-modal generation.
|
|
161
123
|
Enables vision-capable models to generate text based on image inputs.
|
|
124
|
+
tool_alias: Optional alias of the tool configuration to use for MCP tool calls.
|
|
125
|
+
Must match a tool alias defined when initializing the DataDesignerConfigBuilder.
|
|
126
|
+
When provided, the model may call permitted tools during generation.
|
|
127
|
+
with_trace: Specifies what trace information to capture in a `{column_name}__trace`
|
|
128
|
+
column. Options are:
|
|
129
|
+
- `TraceType.NONE` (default): No trace is captured.
|
|
130
|
+
- `TraceType.LAST_MESSAGE`: Only the final assistant message is captured.
|
|
131
|
+
- `TraceType.ALL_MESSAGES`: Full conversation history (system/user/assistant/tool).
|
|
132
|
+
extract_reasoning_content: If True, creates a `{column_name}__reasoning_content` column
|
|
133
|
+
containing only the reasoning_content from the final assistant response. This is
|
|
134
|
+
useful for models that expose chain-of-thought reasoning separately from the main
|
|
135
|
+
response. Defaults to False.
|
|
162
136
|
column_type: Discriminator field, always "llm-text" for this configuration type.
|
|
163
137
|
"""
|
|
164
138
|
|
|
@@ -166,6 +140,9 @@ class LLMTextColumnConfig(SingleColumnConfig):
|
|
|
166
140
|
model_alias: str
|
|
167
141
|
system_prompt: str | None = None
|
|
168
142
|
multi_modal_context: list[ImageContext] | None = None
|
|
143
|
+
tool_alias: str | None = None
|
|
144
|
+
with_trace: TraceType = TraceType.NONE
|
|
145
|
+
extract_reasoning_content: bool = False
|
|
169
146
|
column_type: Literal["llm-text"] = "llm-text"
|
|
170
147
|
|
|
171
148
|
@staticmethod
|
|
@@ -186,14 +163,20 @@ class LLMTextColumnConfig(SingleColumnConfig):
|
|
|
186
163
|
|
|
187
164
|
@property
|
|
188
165
|
def side_effect_columns(self) -> list[str]:
|
|
189
|
-
"""Returns
|
|
166
|
+
"""Returns side-effect columns that may be generated alongside the main column.
|
|
190
167
|
|
|
191
|
-
|
|
168
|
+
Side-effect columns include:
|
|
169
|
+
- `{name}__trace`: Generated when `with_trace` is not `TraceType.NONE` on the column
|
|
170
|
+
config.
|
|
171
|
+
- `{name}__reasoning_content`: Generated when `extract_reasoning_content=True`.
|
|
192
172
|
|
|
193
173
|
Returns:
|
|
194
|
-
List
|
|
174
|
+
List of side-effect column names.
|
|
195
175
|
"""
|
|
196
|
-
return [
|
|
176
|
+
return [
|
|
177
|
+
*([f"{self.name}{TRACE_COLUMN_POSTFIX}"] if self.with_trace != TraceType.NONE else []),
|
|
178
|
+
*([f"{self.name}{REASONING_CONTENT_COLUMN_POSTFIX}"] if self.extract_reasoning_content else []),
|
|
179
|
+
]
|
|
197
180
|
|
|
198
181
|
@model_validator(mode="after")
|
|
199
182
|
def assert_prompt_valid_jinja(self) -> Self:
|
|
@@ -216,7 +199,7 @@ class LLMCodeColumnConfig(LLMTextColumnConfig):
|
|
|
216
199
|
|
|
217
200
|
Extends LLMTextColumnConfig to generate code snippets in specific programming languages
|
|
218
201
|
or SQL dialects. The generated code is automatically extracted from markdown code blocks
|
|
219
|
-
for the specified language. Inherits all prompt templating capabilities.
|
|
202
|
+
for the specified language. Inherits all prompt templating capabilities from LLMTextColumnConfig.
|
|
220
203
|
|
|
221
204
|
Attributes:
|
|
222
205
|
code_lang: Programming language or SQL dialect for code generation. Supported
|
|
@@ -224,6 +207,16 @@ class LLMCodeColumnConfig(LLMTextColumnConfig):
|
|
|
224
207
|
"rust", "ruby", "scala", "swift", "sql:sqlite", "sql:postgres", "sql:mysql",
|
|
225
208
|
"sql:tsql", "sql:bigquery", "sql:ansi". See CodeLang enum for complete list.
|
|
226
209
|
column_type: Discriminator field, always "llm-code" for this configuration type.
|
|
210
|
+
|
|
211
|
+
Inherited Attributes:
|
|
212
|
+
prompt: Prompt template for code generation (supports Jinja2).
|
|
213
|
+
model_alias: Alias of the model configuration to use.
|
|
214
|
+
system_prompt: Optional system prompt (supports Jinja2).
|
|
215
|
+
multi_modal_context: Optional image contexts for multi-modal generation.
|
|
216
|
+
tool_alias: Optional tool configuration alias for MCP tool calls.
|
|
217
|
+
with_trace: `TraceType` mode; any value other than `TraceType.NONE` creates a `{column_name}__trace` column with message history.
|
|
218
|
+
extract_reasoning_content: If True, creates a `{column_name}__reasoning_content`
|
|
219
|
+
column containing the reasoning content from the final assistant response.
|
|
227
220
|
"""
|
|
228
221
|
|
|
229
222
|
code_lang: CodeLang
|
|
@@ -239,13 +232,24 @@ class LLMStructuredColumnConfig(LLMTextColumnConfig):
|
|
|
239
232
|
|
|
240
233
|
Extends LLMTextColumnConfig to generate structured data conforming to a specified schema.
|
|
241
234
|
Uses JSON schema or Pydantic models to define the expected output structure, enabling
|
|
242
|
-
type-safe and validated structured output generation. Inherits prompt templating capabilities
|
|
235
|
+
type-safe and validated structured output generation. Inherits prompt templating capabilities
|
|
236
|
+
from LLMTextColumnConfig.
|
|
243
237
|
|
|
244
238
|
Attributes:
|
|
245
239
|
output_format: The schema defining the expected output structure. Can be either:
|
|
246
240
|
- A Pydantic BaseModel class (recommended)
|
|
247
241
|
- A JSON schema dictionary
|
|
248
242
|
column_type: Discriminator field, always "llm-structured" for this configuration type.
|
|
243
|
+
|
|
244
|
+
Inherited Attributes:
|
|
245
|
+
prompt: Prompt template for structured generation (supports Jinja2).
|
|
246
|
+
model_alias: Alias of the model configuration to use.
|
|
247
|
+
system_prompt: Optional system prompt (supports Jinja2).
|
|
248
|
+
multi_modal_context: Optional image contexts for multi-modal generation.
|
|
249
|
+
tool_alias: Optional tool configuration alias for MCP tool calls.
|
|
250
|
+
with_trace: `TraceType` mode; any value other than `TraceType.NONE` creates a `{column_name}__trace` column with message history.
|
|
251
|
+
extract_reasoning_content: If True, creates a `{column_name}__reasoning_content`
|
|
252
|
+
column containing the reasoning content from the final assistant response.
|
|
249
253
|
"""
|
|
250
254
|
|
|
251
255
|
output_format: dict | type[BaseModel]
|
|
@@ -293,13 +297,24 @@ class LLMJudgeColumnConfig(LLMTextColumnConfig):
|
|
|
293
297
|
|
|
294
298
|
Extends LLMTextColumnConfig to create judge columns that evaluate and score other
|
|
295
299
|
generated content based on the defined criteria. Useful for quality assessment, preference
|
|
296
|
-
ranking, and multi-dimensional evaluation of generated data.
|
|
300
|
+
ranking, and multi-dimensional evaluation of generated data. Inherits prompt templating
|
|
301
|
+
capabilities from LLMTextColumnConfig.
|
|
297
302
|
|
|
298
303
|
Attributes:
|
|
299
304
|
scores: List of Score objects defining the evaluation dimensions. Each score
|
|
300
305
|
represents a different aspect to evaluate (e.g., accuracy, relevance, fluency).
|
|
301
306
|
Must contain at least one score.
|
|
302
307
|
column_type: Discriminator field, always "llm-judge" for this configuration type.
|
|
308
|
+
|
|
309
|
+
Inherited Attributes:
|
|
310
|
+
prompt: Prompt template for the judge evaluation (supports Jinja2).
|
|
311
|
+
model_alias: Alias of the model configuration to use.
|
|
312
|
+
system_prompt: Optional system prompt (supports Jinja2).
|
|
313
|
+
multi_modal_context: Optional image contexts for multi-modal generation.
|
|
314
|
+
tool_alias: Optional tool configuration alias for MCP tool calls.
|
|
315
|
+
with_trace: `TraceType` mode; any value other than `TraceType.NONE` creates a `{column_name}__trace` column with message history.
|
|
316
|
+
extract_reasoning_content: If True, creates a `{column_name}__reasoning_content`
|
|
317
|
+
column containing the reasoning content from the final assistant response.
|
|
303
318
|
"""
|
|
304
319
|
|
|
305
320
|
scores: list[Score] = Field(..., min_length=1)
|
|
@@ -468,3 +483,80 @@ class EmbeddingColumnConfig(SingleColumnConfig):
|
|
|
468
483
|
@property
|
|
469
484
|
def side_effect_columns(self) -> list[str]:
|
|
470
485
|
return []
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
class CustomColumnConfig(SingleColumnConfig):
|
|
489
|
+
"""Configuration for custom user-defined column generators.
|
|
490
|
+
|
|
491
|
+
Custom columns allow users to provide their own generation logic via a callable function
|
|
492
|
+
decorated with `@custom_column_generator`. Two strategies are supported: cell_by_cell
|
|
493
|
+
(default, row-based) and full_column (batch-based with DataFrame access).
|
|
494
|
+
|
|
495
|
+
Attributes:
|
|
496
|
+
generator_function: A callable decorated with @custom_column_generator.
|
|
497
|
+
generation_strategy: "cell_by_cell" (row-based) or "full_column" (batch-based).
|
|
498
|
+
generator_params: Optional typed configuration object (Pydantic BaseModel) passed
|
|
499
|
+
as the second argument to the generator function.
|
|
500
|
+
column_type: Discriminator field, always "custom" for this configuration type.
|
|
501
|
+
"""
|
|
502
|
+
|
|
503
|
+
generator_function: Any = Field(description="Function decorated with @custom_column_generator")
|
|
504
|
+
generation_strategy: GenerationStrategy = Field(
|
|
505
|
+
default=GenerationStrategy.CELL_BY_CELL,
|
|
506
|
+
description="Generation strategy: 'cell_by_cell' for row-based or 'full_column' for batch-based",
|
|
507
|
+
)
|
|
508
|
+
generator_params: BaseModel | None = Field(
|
|
509
|
+
default=None,
|
|
510
|
+
description="Optional typed configuration object passed as second argument to generator function",
|
|
511
|
+
)
|
|
512
|
+
column_type: Literal["custom"] = "custom"
|
|
513
|
+
|
|
514
|
+
@field_validator("generator_function")
|
|
515
|
+
@classmethod
|
|
516
|
+
def _validate_generator_function(cls, v: Any) -> Any:
|
|
517
|
+
if not callable(v):
|
|
518
|
+
raise ValueError("generator_function must be callable")
|
|
519
|
+
if not hasattr(v, "custom_column_metadata"):
|
|
520
|
+
raise ValueError("generator_function must be decorated with @custom_column_generator")
|
|
521
|
+
return v
|
|
522
|
+
|
|
523
|
+
@staticmethod
|
|
524
|
+
def get_column_emoji() -> str:
|
|
525
|
+
return "🔧"
|
|
526
|
+
|
|
527
|
+
@property
|
|
528
|
+
def required_columns(self) -> list[str]:
|
|
529
|
+
"""Returns the columns required for custom generation (from decorator metadata)."""
|
|
530
|
+
metadata = getattr(self.generator_function, "custom_column_metadata", {})
|
|
531
|
+
return metadata.get("required_columns", [])
|
|
532
|
+
|
|
533
|
+
@property
|
|
534
|
+
def side_effect_columns(self) -> list[str]:
|
|
535
|
+
"""Returns additional columns created by this generator (from decorator metadata)."""
|
|
536
|
+
metadata = getattr(self.generator_function, "custom_column_metadata", {})
|
|
537
|
+
return metadata.get("side_effect_columns", [])
|
|
538
|
+
|
|
539
|
+
@property
|
|
540
|
+
def model_aliases(self) -> list[str]:
|
|
541
|
+
"""Returns model aliases for LLM access and health checks (from decorator metadata)."""
|
|
542
|
+
metadata = getattr(self.generator_function, "custom_column_metadata", {})
|
|
543
|
+
return metadata.get("model_aliases", [])
|
|
544
|
+
|
|
545
|
+
@field_serializer("generator_function")
|
|
546
|
+
def serialize_generator_function(self, v: Any) -> str:
|
|
547
|
+
return getattr(v, "__name__", repr(v))
|
|
548
|
+
|
|
549
|
+
@field_serializer("generator_params")
|
|
550
|
+
def serialize_generator_params(self, v: BaseModel | None) -> dict[str, Any] | None:
|
|
551
|
+
if v is None:
|
|
552
|
+
return None
|
|
553
|
+
return v.model_dump()
|
|
554
|
+
|
|
555
|
+
@model_validator(mode="after")
|
|
556
|
+
def validate_generator_function(self) -> Self:
|
|
557
|
+
if not callable(self.generator_function):
|
|
558
|
+
raise InvalidConfigError(
|
|
559
|
+
f"🛑 `generator_function` must be a callable for custom column '{self.name}'. "
|
|
560
|
+
f"Expected a function decorated with @custom_column_generator."
|
|
561
|
+
)
|
|
562
|
+
return self
|
|
@@ -6,6 +6,7 @@ from __future__ import annotations
|
|
|
6
6
|
from typing_extensions import TypeAlias
|
|
7
7
|
|
|
8
8
|
from data_designer.config.column_configs import (
|
|
9
|
+
CustomColumnConfig,
|
|
9
10
|
EmbeddingColumnConfig,
|
|
10
11
|
ExpressionColumnConfig,
|
|
11
12
|
LLMCodeColumnConfig,
|
|
@@ -28,7 +29,8 @@ from data_designer.plugin_manager import PluginManager
|
|
|
28
29
|
plugin_manager = PluginManager()
|
|
29
30
|
|
|
30
31
|
ColumnConfigT: TypeAlias = (
|
|
31
|
-
|
|
32
|
+
CustomColumnConfig
|
|
33
|
+
| ExpressionColumnConfig
|
|
32
34
|
| LLMCodeColumnConfig
|
|
33
35
|
| LLMJudgeColumnConfig
|
|
34
36
|
| LLMStructuredColumnConfig
|
|
@@ -87,6 +89,7 @@ def get_column_display_order() -> list[DataDesignerColumnType]:
|
|
|
87
89
|
DataDesignerColumnType.EMBEDDING,
|
|
88
90
|
DataDesignerColumnType.VALIDATION,
|
|
89
91
|
DataDesignerColumnType.EXPRESSION,
|
|
92
|
+
DataDesignerColumnType.CUSTOM,
|
|
90
93
|
]
|
|
91
94
|
display_order.extend(plugin_manager.get_plugin_column_types(DataDesignerColumnType))
|
|
92
95
|
return display_order
|
|
@@ -129,6 +132,7 @@ def _resolve_sampler_kwargs(name: str, kwargs: dict) -> dict:
|
|
|
129
132
|
|
|
130
133
|
|
|
131
134
|
_COLUMN_TYPE_CONFIG_CLS_MAP = {
|
|
135
|
+
DataDesignerColumnType.CUSTOM: CustomColumnConfig,
|
|
132
136
|
DataDesignerColumnType.LLM_TEXT: LLMTextColumnConfig,
|
|
133
137
|
DataDesignerColumnType.LLM_CODE: LLMCodeColumnConfig,
|
|
134
138
|
DataDesignerColumnType.LLM_STRUCTURED: LLMStructuredColumnConfig,
|