data-designer-config 0.4.0rc2__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/config/__init__.py +219 -142
- data_designer/config/base.py +37 -39
- data_designer/config/column_configs.py +149 -57
- data_designer/config/column_types.py +5 -1
- data_designer/config/config_builder.py +103 -3
- data_designer/config/custom_column.py +64 -0
- data_designer/config/data_designer_config.py +5 -1
- data_designer/config/exportable_config.py +59 -0
- data_designer/config/mcp.py +109 -0
- data_designer/config/run_config.py +1 -1
- data_designer/config/utils/code_lang.py +13 -2
- data_designer/config/utils/constants.py +10 -1
- data_designer/config/utils/trace_type.py +24 -0
- data_designer/config/utils/visualization.py +6 -0
- data_designer/logging.py +15 -0
- {data_designer_config-0.4.0rc2.dist-info → data_designer_config-0.5.0rc1.dist-info}/METADATA +1 -1
- {data_designer_config-0.4.0rc2.dist-info → data_designer_config-0.5.0rc1.dist-info}/RECORD +18 -15
- data_designer/config/_version.py +0 -34
- {data_designer_config-0.4.0rc2.dist-info → data_designer_config-0.5.0rc1.dist-info}/WHEEL +0 -0
data_designer/config/__init__.py
CHANGED
|
@@ -3,147 +3,224 @@
|
|
|
3
3
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
|
-
|
|
7
|
-
from
|
|
8
|
-
EmbeddingColumnConfig,
|
|
9
|
-
ExpressionColumnConfig,
|
|
10
|
-
LLMCodeColumnConfig,
|
|
11
|
-
LLMJudgeColumnConfig,
|
|
12
|
-
LLMStructuredColumnConfig,
|
|
13
|
-
LLMTextColumnConfig,
|
|
14
|
-
SamplerColumnConfig,
|
|
15
|
-
Score,
|
|
16
|
-
SeedDatasetColumnConfig,
|
|
17
|
-
ValidationColumnConfig,
|
|
18
|
-
)
|
|
19
|
-
from data_designer.config.column_types import DataDesignerColumnType
|
|
20
|
-
from data_designer.config.config_builder import DataDesignerConfigBuilder
|
|
21
|
-
from data_designer.config.data_designer_config import DataDesignerConfig
|
|
22
|
-
from data_designer.config.dataset_builders import BuildStage
|
|
23
|
-
from data_designer.config.models import (
|
|
24
|
-
ChatCompletionInferenceParams,
|
|
25
|
-
EmbeddingInferenceParams,
|
|
26
|
-
GenerationType,
|
|
27
|
-
ImageContext,
|
|
28
|
-
ImageFormat,
|
|
29
|
-
ManualDistribution,
|
|
30
|
-
ManualDistributionParams,
|
|
31
|
-
Modality,
|
|
32
|
-
ModalityContext,
|
|
33
|
-
ModalityDataType,
|
|
34
|
-
ModelConfig,
|
|
35
|
-
ModelProvider,
|
|
36
|
-
UniformDistribution,
|
|
37
|
-
UniformDistributionParams,
|
|
38
|
-
)
|
|
39
|
-
from data_designer.config.processors import (
|
|
40
|
-
DropColumnsProcessorConfig,
|
|
41
|
-
ProcessorType,
|
|
42
|
-
SchemaTransformProcessorConfig,
|
|
43
|
-
)
|
|
44
|
-
from data_designer.config.run_config import RunConfig
|
|
45
|
-
from data_designer.config.sampler_constraints import ColumnInequalityConstraint, ScalarInequalityConstraint
|
|
46
|
-
from data_designer.config.sampler_params import (
|
|
47
|
-
BernoulliMixtureSamplerParams,
|
|
48
|
-
BernoulliSamplerParams,
|
|
49
|
-
BinomialSamplerParams,
|
|
50
|
-
CategorySamplerParams,
|
|
51
|
-
DatetimeSamplerParams,
|
|
52
|
-
GaussianSamplerParams,
|
|
53
|
-
PersonFromFakerSamplerParams,
|
|
54
|
-
PersonSamplerParams,
|
|
55
|
-
PoissonSamplerParams,
|
|
56
|
-
SamplerType,
|
|
57
|
-
ScipySamplerParams,
|
|
58
|
-
SubcategorySamplerParams,
|
|
59
|
-
TimeDeltaSamplerParams,
|
|
60
|
-
UniformSamplerParams,
|
|
61
|
-
UUIDSamplerParams,
|
|
62
|
-
)
|
|
63
|
-
from data_designer.config.seed import (
|
|
64
|
-
IndexRange,
|
|
65
|
-
PartitionBlock,
|
|
66
|
-
SamplingStrategy,
|
|
67
|
-
SeedConfig,
|
|
68
|
-
)
|
|
69
|
-
from data_designer.config.seed_source import (
|
|
70
|
-
DataFrameSeedSource,
|
|
71
|
-
HuggingFaceSeedSource,
|
|
72
|
-
LocalFileSeedSource,
|
|
73
|
-
)
|
|
74
|
-
from data_designer.config.utils.code_lang import CodeLang
|
|
75
|
-
from data_designer.config.utils.info import InfoType
|
|
76
|
-
from data_designer.config.validator_params import (
|
|
77
|
-
CodeValidatorParams,
|
|
78
|
-
LocalCallableValidatorParams,
|
|
79
|
-
RemoteValidatorParams,
|
|
80
|
-
ValidatorType,
|
|
81
|
-
)
|
|
6
|
+
import importlib
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
82
8
|
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
# These imports are for IDE autocomplete and type checking only.
|
|
11
|
+
# At runtime, __getattr__ lazily loads the actual objects.
|
|
12
|
+
from data_designer.config.analysis.column_profilers import ( # noqa: F401
|
|
13
|
+
JudgeScoreProfilerConfig,
|
|
14
|
+
)
|
|
15
|
+
from data_designer.config.column_configs import ( # noqa: F401
|
|
16
|
+
CustomColumnConfig,
|
|
17
|
+
EmbeddingColumnConfig,
|
|
18
|
+
ExpressionColumnConfig,
|
|
19
|
+
GenerationStrategy,
|
|
20
|
+
LLMCodeColumnConfig,
|
|
21
|
+
LLMJudgeColumnConfig,
|
|
22
|
+
LLMStructuredColumnConfig,
|
|
23
|
+
LLMTextColumnConfig,
|
|
24
|
+
SamplerColumnConfig,
|
|
25
|
+
Score,
|
|
26
|
+
SeedDatasetColumnConfig,
|
|
27
|
+
ValidationColumnConfig,
|
|
28
|
+
)
|
|
29
|
+
from data_designer.config.column_types import DataDesignerColumnType # noqa: F401
|
|
30
|
+
from data_designer.config.config_builder import DataDesignerConfigBuilder # noqa: F401
|
|
31
|
+
from data_designer.config.custom_column import custom_column_generator # noqa: F401
|
|
32
|
+
from data_designer.config.data_designer_config import DataDesignerConfig # noqa: F401
|
|
33
|
+
from data_designer.config.dataset_builders import BuildStage # noqa: F401
|
|
34
|
+
from data_designer.config.mcp import ( # noqa: F401
|
|
35
|
+
LocalStdioMCPProvider,
|
|
36
|
+
MCPProvider,
|
|
37
|
+
ToolConfig,
|
|
38
|
+
)
|
|
39
|
+
from data_designer.config.models import ( # noqa: F401
|
|
40
|
+
ChatCompletionInferenceParams,
|
|
41
|
+
EmbeddingInferenceParams,
|
|
42
|
+
GenerationType,
|
|
43
|
+
ImageContext,
|
|
44
|
+
ImageFormat,
|
|
45
|
+
ManualDistribution,
|
|
46
|
+
ManualDistributionParams,
|
|
47
|
+
Modality,
|
|
48
|
+
ModalityContext,
|
|
49
|
+
ModalityDataType,
|
|
50
|
+
ModelConfig,
|
|
51
|
+
ModelProvider,
|
|
52
|
+
UniformDistribution,
|
|
53
|
+
UniformDistributionParams,
|
|
54
|
+
)
|
|
55
|
+
from data_designer.config.processors import ( # noqa: F401
|
|
56
|
+
DropColumnsProcessorConfig,
|
|
57
|
+
ProcessorType,
|
|
58
|
+
SchemaTransformProcessorConfig,
|
|
59
|
+
)
|
|
60
|
+
from data_designer.config.run_config import RunConfig # noqa: F401
|
|
61
|
+
from data_designer.config.sampler_constraints import ( # noqa: F401
|
|
62
|
+
ColumnInequalityConstraint,
|
|
63
|
+
ScalarInequalityConstraint,
|
|
64
|
+
)
|
|
65
|
+
from data_designer.config.sampler_params import ( # noqa: F401
|
|
66
|
+
BernoulliMixtureSamplerParams,
|
|
67
|
+
BernoulliSamplerParams,
|
|
68
|
+
BinomialSamplerParams,
|
|
69
|
+
CategorySamplerParams,
|
|
70
|
+
DatetimeSamplerParams,
|
|
71
|
+
GaussianSamplerParams,
|
|
72
|
+
PersonFromFakerSamplerParams,
|
|
73
|
+
PersonSamplerParams,
|
|
74
|
+
PoissonSamplerParams,
|
|
75
|
+
SamplerType,
|
|
76
|
+
ScipySamplerParams,
|
|
77
|
+
SubcategorySamplerParams,
|
|
78
|
+
TimeDeltaSamplerParams,
|
|
79
|
+
UniformSamplerParams,
|
|
80
|
+
UUIDSamplerParams,
|
|
81
|
+
)
|
|
82
|
+
from data_designer.config.seed import ( # noqa: F401
|
|
83
|
+
IndexRange,
|
|
84
|
+
PartitionBlock,
|
|
85
|
+
SamplingStrategy,
|
|
86
|
+
SeedConfig,
|
|
87
|
+
)
|
|
88
|
+
from data_designer.config.seed_source import ( # noqa: F401
|
|
89
|
+
DataFrameSeedSource,
|
|
90
|
+
HuggingFaceSeedSource,
|
|
91
|
+
LocalFileSeedSource,
|
|
92
|
+
)
|
|
93
|
+
from data_designer.config.utils.code_lang import CodeLang # noqa: F401
|
|
94
|
+
from data_designer.config.utils.info import InfoType # noqa: F401
|
|
95
|
+
from data_designer.config.utils.trace_type import TraceType # noqa: F401
|
|
96
|
+
from data_designer.config.validator_params import ( # noqa: F401
|
|
97
|
+
CodeValidatorParams,
|
|
98
|
+
LocalCallableValidatorParams,
|
|
99
|
+
RemoteValidatorParams,
|
|
100
|
+
ValidatorType,
|
|
101
|
+
)
|
|
83
102
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
103
|
+
# Base module path and submodule paths for lazy imports
|
|
104
|
+
_MOD_BASE = "data_designer.config"
|
|
105
|
+
_MOD_COLUMN_CONFIGS = f"{_MOD_BASE}.column_configs"
|
|
106
|
+
_MOD_MCP = f"{_MOD_BASE}.mcp"
|
|
107
|
+
_MOD_MODELS = f"{_MOD_BASE}.models"
|
|
108
|
+
_MOD_PROCESSORS = f"{_MOD_BASE}.processors"
|
|
109
|
+
_MOD_SAMPLER_CONSTRAINTS = f"{_MOD_BASE}.sampler_constraints"
|
|
110
|
+
_MOD_SAMPLER_PARAMS = f"{_MOD_BASE}.sampler_params"
|
|
111
|
+
_MOD_SEED = f"{_MOD_BASE}.seed"
|
|
112
|
+
_MOD_SEED_SOURCE = f"{_MOD_BASE}.seed_source"
|
|
113
|
+
_MOD_VALIDATOR_PARAMS = f"{_MOD_BASE}.validator_params"
|
|
114
|
+
_MOD_UTILS = f"{_MOD_BASE}.utils"
|
|
115
|
+
|
|
116
|
+
# Mapping of export names to (module_path, attribute_name) for lazy loading
|
|
117
|
+
_LAZY_IMPORTS: dict[str, tuple[str, str]] = {
|
|
118
|
+
# analysis.column_profilers
|
|
119
|
+
"JudgeScoreProfilerConfig": (f"{_MOD_BASE}.analysis.column_profilers", "JudgeScoreProfilerConfig"),
|
|
120
|
+
# column_configs
|
|
121
|
+
"CustomColumnConfig": (_MOD_COLUMN_CONFIGS, "CustomColumnConfig"),
|
|
122
|
+
"EmbeddingColumnConfig": (_MOD_COLUMN_CONFIGS, "EmbeddingColumnConfig"),
|
|
123
|
+
"ExpressionColumnConfig": (_MOD_COLUMN_CONFIGS, "ExpressionColumnConfig"),
|
|
124
|
+
"GenerationStrategy": (_MOD_COLUMN_CONFIGS, "GenerationStrategy"),
|
|
125
|
+
"LLMCodeColumnConfig": (_MOD_COLUMN_CONFIGS, "LLMCodeColumnConfig"),
|
|
126
|
+
"LLMJudgeColumnConfig": (_MOD_COLUMN_CONFIGS, "LLMJudgeColumnConfig"),
|
|
127
|
+
"LLMStructuredColumnConfig": (_MOD_COLUMN_CONFIGS, "LLMStructuredColumnConfig"),
|
|
128
|
+
"LLMTextColumnConfig": (_MOD_COLUMN_CONFIGS, "LLMTextColumnConfig"),
|
|
129
|
+
"SamplerColumnConfig": (_MOD_COLUMN_CONFIGS, "SamplerColumnConfig"),
|
|
130
|
+
"Score": (_MOD_COLUMN_CONFIGS, "Score"),
|
|
131
|
+
"SeedDatasetColumnConfig": (_MOD_COLUMN_CONFIGS, "SeedDatasetColumnConfig"),
|
|
132
|
+
"ValidationColumnConfig": (_MOD_COLUMN_CONFIGS, "ValidationColumnConfig"),
|
|
133
|
+
# column_types
|
|
134
|
+
"DataDesignerColumnType": (f"{_MOD_BASE}.column_types", "DataDesignerColumnType"),
|
|
135
|
+
# config_builder
|
|
136
|
+
"DataDesignerConfigBuilder": (f"{_MOD_BASE}.config_builder", "DataDesignerConfigBuilder"),
|
|
137
|
+
# custom_column
|
|
138
|
+
"custom_column_generator": (f"{_MOD_BASE}.custom_column", "custom_column_generator"),
|
|
139
|
+
# data_designer_config
|
|
140
|
+
"DataDesignerConfig": (f"{_MOD_BASE}.data_designer_config", "DataDesignerConfig"),
|
|
141
|
+
# dataset_builders
|
|
142
|
+
"BuildStage": (f"{_MOD_BASE}.dataset_builders", "BuildStage"),
|
|
143
|
+
# mcp
|
|
144
|
+
"LocalStdioMCPProvider": (_MOD_MCP, "LocalStdioMCPProvider"),
|
|
145
|
+
"MCPProvider": (_MOD_MCP, "MCPProvider"),
|
|
146
|
+
"ToolConfig": (_MOD_MCP, "ToolConfig"),
|
|
147
|
+
# models
|
|
148
|
+
"ChatCompletionInferenceParams": (_MOD_MODELS, "ChatCompletionInferenceParams"),
|
|
149
|
+
"EmbeddingInferenceParams": (_MOD_MODELS, "EmbeddingInferenceParams"),
|
|
150
|
+
"GenerationType": (_MOD_MODELS, "GenerationType"),
|
|
151
|
+
"ImageContext": (_MOD_MODELS, "ImageContext"),
|
|
152
|
+
"ImageFormat": (_MOD_MODELS, "ImageFormat"),
|
|
153
|
+
"ManualDistribution": (_MOD_MODELS, "ManualDistribution"),
|
|
154
|
+
"ManualDistributionParams": (_MOD_MODELS, "ManualDistributionParams"),
|
|
155
|
+
"Modality": (_MOD_MODELS, "Modality"),
|
|
156
|
+
"ModalityContext": (_MOD_MODELS, "ModalityContext"),
|
|
157
|
+
"ModalityDataType": (_MOD_MODELS, "ModalityDataType"),
|
|
158
|
+
"ModelConfig": (_MOD_MODELS, "ModelConfig"),
|
|
159
|
+
"ModelProvider": (_MOD_MODELS, "ModelProvider"),
|
|
160
|
+
"UniformDistribution": (_MOD_MODELS, "UniformDistribution"),
|
|
161
|
+
"UniformDistributionParams": (_MOD_MODELS, "UniformDistributionParams"),
|
|
162
|
+
# processors
|
|
163
|
+
"DropColumnsProcessorConfig": (_MOD_PROCESSORS, "DropColumnsProcessorConfig"),
|
|
164
|
+
"ProcessorType": (_MOD_PROCESSORS, "ProcessorType"),
|
|
165
|
+
"SchemaTransformProcessorConfig": (_MOD_PROCESSORS, "SchemaTransformProcessorConfig"),
|
|
166
|
+
# run_config
|
|
167
|
+
"RunConfig": (f"{_MOD_BASE}.run_config", "RunConfig"),
|
|
168
|
+
# sampler_constraints
|
|
169
|
+
"ColumnInequalityConstraint": (_MOD_SAMPLER_CONSTRAINTS, "ColumnInequalityConstraint"),
|
|
170
|
+
"ScalarInequalityConstraint": (_MOD_SAMPLER_CONSTRAINTS, "ScalarInequalityConstraint"),
|
|
171
|
+
# sampler_params
|
|
172
|
+
"BernoulliMixtureSamplerParams": (_MOD_SAMPLER_PARAMS, "BernoulliMixtureSamplerParams"),
|
|
173
|
+
"BernoulliSamplerParams": (_MOD_SAMPLER_PARAMS, "BernoulliSamplerParams"),
|
|
174
|
+
"BinomialSamplerParams": (_MOD_SAMPLER_PARAMS, "BinomialSamplerParams"),
|
|
175
|
+
"CategorySamplerParams": (_MOD_SAMPLER_PARAMS, "CategorySamplerParams"),
|
|
176
|
+
"DatetimeSamplerParams": (_MOD_SAMPLER_PARAMS, "DatetimeSamplerParams"),
|
|
177
|
+
"GaussianSamplerParams": (_MOD_SAMPLER_PARAMS, "GaussianSamplerParams"),
|
|
178
|
+
"PersonFromFakerSamplerParams": (_MOD_SAMPLER_PARAMS, "PersonFromFakerSamplerParams"),
|
|
179
|
+
"PersonSamplerParams": (_MOD_SAMPLER_PARAMS, "PersonSamplerParams"),
|
|
180
|
+
"PoissonSamplerParams": (_MOD_SAMPLER_PARAMS, "PoissonSamplerParams"),
|
|
181
|
+
"SamplerType": (_MOD_SAMPLER_PARAMS, "SamplerType"),
|
|
182
|
+
"ScipySamplerParams": (_MOD_SAMPLER_PARAMS, "ScipySamplerParams"),
|
|
183
|
+
"SubcategorySamplerParams": (_MOD_SAMPLER_PARAMS, "SubcategorySamplerParams"),
|
|
184
|
+
"TimeDeltaSamplerParams": (_MOD_SAMPLER_PARAMS, "TimeDeltaSamplerParams"),
|
|
185
|
+
"UniformSamplerParams": (_MOD_SAMPLER_PARAMS, "UniformSamplerParams"),
|
|
186
|
+
"UUIDSamplerParams": (_MOD_SAMPLER_PARAMS, "UUIDSamplerParams"),
|
|
187
|
+
# seed
|
|
188
|
+
"IndexRange": (_MOD_SEED, "IndexRange"),
|
|
189
|
+
"PartitionBlock": (_MOD_SEED, "PartitionBlock"),
|
|
190
|
+
"SamplingStrategy": (_MOD_SEED, "SamplingStrategy"),
|
|
191
|
+
"SeedConfig": (_MOD_SEED, "SeedConfig"),
|
|
192
|
+
# seed_source
|
|
193
|
+
"DataFrameSeedSource": (_MOD_SEED_SOURCE, "DataFrameSeedSource"),
|
|
194
|
+
"HuggingFaceSeedSource": (_MOD_SEED_SOURCE, "HuggingFaceSeedSource"),
|
|
195
|
+
"LocalFileSeedSource": (_MOD_SEED_SOURCE, "LocalFileSeedSource"),
|
|
196
|
+
# utils
|
|
197
|
+
"CodeLang": (f"{_MOD_UTILS}.code_lang", "CodeLang"),
|
|
198
|
+
"InfoType": (f"{_MOD_UTILS}.info", "InfoType"),
|
|
199
|
+
"TraceType": (f"{_MOD_UTILS}.trace_type", "TraceType"),
|
|
200
|
+
# validator_params
|
|
201
|
+
"CodeValidatorParams": (_MOD_VALIDATOR_PARAMS, "CodeValidatorParams"),
|
|
202
|
+
"LocalCallableValidatorParams": (_MOD_VALIDATOR_PARAMS, "LocalCallableValidatorParams"),
|
|
203
|
+
"RemoteValidatorParams": (_MOD_VALIDATOR_PARAMS, "RemoteValidatorParams"),
|
|
204
|
+
"ValidatorType": (_MOD_VALIDATOR_PARAMS, "ValidatorType"),
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
__all__ = list(_LAZY_IMPORTS.keys())
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def __getattr__(name: str) -> object:
|
|
211
|
+
"""Lazily import config module exports when accessed.
|
|
212
|
+
|
|
213
|
+
This allows fast imports of data_designer.config while deferring loading
|
|
214
|
+
of submodules until they're actually needed.
|
|
215
|
+
"""
|
|
216
|
+
if name in _LAZY_IMPORTS:
|
|
217
|
+
module_path, attr_name = _LAZY_IMPORTS[name]
|
|
218
|
+
module = importlib.import_module(module_path)
|
|
219
|
+
return getattr(module, attr_name)
|
|
220
|
+
|
|
221
|
+
raise AttributeError(f"module 'data_designer.config' has no attribute {name!r}")
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def __dir__() -> list[str]:
|
|
225
|
+
"""Return list of available exports for tab-completion."""
|
|
226
|
+
return __all__
|
data_designer/config/base.py
CHANGED
|
@@ -1,16 +1,15 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
# IMPORTANT: This module must NOT import from any data_designer submodules (i.e., data_designer.*).
|
|
5
|
+
# These base abstractions are foundational and should only depend on pydantic and Python builtins.
|
|
6
|
+
|
|
4
7
|
from __future__ import annotations
|
|
5
8
|
|
|
6
|
-
from
|
|
7
|
-
from typing import Any
|
|
9
|
+
from abc import ABC, abstractmethod
|
|
8
10
|
|
|
9
|
-
import yaml
|
|
10
11
|
from pydantic import BaseModel, ConfigDict
|
|
11
12
|
|
|
12
|
-
from data_designer.config.utils.io_helpers import serialize_data
|
|
13
|
-
|
|
14
13
|
|
|
15
14
|
class ConfigBase(BaseModel):
|
|
16
15
|
model_config = ConfigDict(
|
|
@@ -22,48 +21,47 @@ class ConfigBase(BaseModel):
|
|
|
22
21
|
)
|
|
23
22
|
|
|
24
23
|
|
|
25
|
-
class
|
|
26
|
-
|
|
27
|
-
"""Convert the configuration to a dictionary.
|
|
24
|
+
class SingleColumnConfig(ConfigBase, ABC):
|
|
25
|
+
"""Abstract base class for all single-column configuration types.
|
|
28
26
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
27
|
+
This class serves as the foundation for all column configurations in DataDesigner,
|
|
28
|
+
defining shared fields and properties across all column types.
|
|
29
|
+
|
|
30
|
+
Attributes:
|
|
31
|
+
name: Unique name of the column to be generated.
|
|
32
|
+
drop: If True, the column will be generated but removed from the final dataset.
|
|
33
|
+
Useful for intermediate columns that are dependencies for other columns.
|
|
34
|
+
column_type: Discriminator field that identifies the specific column type.
|
|
35
|
+
Subclasses must override this field to specify the column type with a `Literal` value.
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
name: str
|
|
39
|
+
drop: bool = False
|
|
40
|
+
column_type: str
|
|
34
41
|
|
|
35
|
-
|
|
36
|
-
|
|
42
|
+
@staticmethod
|
|
43
|
+
def get_column_emoji() -> str:
|
|
44
|
+
return "🎨"
|
|
37
45
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
**kwargs: Additional keyword arguments passed to yaml.dump().
|
|
46
|
+
@property
|
|
47
|
+
@abstractmethod
|
|
48
|
+
def required_columns(self) -> list[str]:
|
|
49
|
+
"""Returns a list of column names that must exist before this column can be generated.
|
|
43
50
|
|
|
44
51
|
Returns:
|
|
45
|
-
|
|
52
|
+
List of column names that this column depends on. Empty list indicates
|
|
53
|
+
no dependencies. Override in subclasses to specify dependencies.
|
|
46
54
|
"""
|
|
47
|
-
yaml_str = yaml.dump(self.to_dict(), indent=indent, **kwargs)
|
|
48
|
-
if path is None:
|
|
49
|
-
return yaml_str
|
|
50
|
-
with open(path, "w") as f:
|
|
51
|
-
f.write(yaml_str)
|
|
52
55
|
|
|
53
|
-
|
|
54
|
-
|
|
56
|
+
@property
|
|
57
|
+
@abstractmethod
|
|
58
|
+
def side_effect_columns(self) -> list[str]:
|
|
59
|
+
"""Returns a list of additional columns that this column will create as a side effect.
|
|
55
60
|
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
JSON string instead of writing to file.
|
|
59
|
-
indent: Number of spaces for JSON indentation. Defaults to 2.
|
|
60
|
-
**kwargs: Additional keyword arguments passed to json.dumps().
|
|
61
|
+
Some column types generate additional metadata or auxiliary columns alongside
|
|
62
|
+
the primary column (e.g., reasoning traces for LLM columns).
|
|
61
63
|
|
|
62
64
|
Returns:
|
|
63
|
-
|
|
65
|
+
List of column names that this column will create as a side effect. Empty list
|
|
66
|
+
indicates no side effect columns. Override in subclasses to specify side effects.
|
|
64
67
|
"""
|
|
65
|
-
json_str = serialize_data(self.to_dict(), indent=indent, **kwargs)
|
|
66
|
-
if path is None:
|
|
67
|
-
return json_str
|
|
68
|
-
with open(path, "w") as f:
|
|
69
|
-
f.write(json_str)
|