data-designer-config 0.4.0rc3__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,147 +3,224 @@
3
3
 
4
4
  from __future__ import annotations
5
5
 
6
- from data_designer.config.analysis.column_profilers import JudgeScoreProfilerConfig
7
- from data_designer.config.column_configs import (
8
- EmbeddingColumnConfig,
9
- ExpressionColumnConfig,
10
- LLMCodeColumnConfig,
11
- LLMJudgeColumnConfig,
12
- LLMStructuredColumnConfig,
13
- LLMTextColumnConfig,
14
- SamplerColumnConfig,
15
- Score,
16
- SeedDatasetColumnConfig,
17
- ValidationColumnConfig,
18
- )
19
- from data_designer.config.column_types import DataDesignerColumnType
20
- from data_designer.config.config_builder import DataDesignerConfigBuilder
21
- from data_designer.config.data_designer_config import DataDesignerConfig
22
- from data_designer.config.dataset_builders import BuildStage
23
- from data_designer.config.models import (
24
- ChatCompletionInferenceParams,
25
- EmbeddingInferenceParams,
26
- GenerationType,
27
- ImageContext,
28
- ImageFormat,
29
- ManualDistribution,
30
- ManualDistributionParams,
31
- Modality,
32
- ModalityContext,
33
- ModalityDataType,
34
- ModelConfig,
35
- ModelProvider,
36
- UniformDistribution,
37
- UniformDistributionParams,
38
- )
39
- from data_designer.config.processors import (
40
- DropColumnsProcessorConfig,
41
- ProcessorType,
42
- SchemaTransformProcessorConfig,
43
- )
44
- from data_designer.config.run_config import RunConfig
45
- from data_designer.config.sampler_constraints import ColumnInequalityConstraint, ScalarInequalityConstraint
46
- from data_designer.config.sampler_params import (
47
- BernoulliMixtureSamplerParams,
48
- BernoulliSamplerParams,
49
- BinomialSamplerParams,
50
- CategorySamplerParams,
51
- DatetimeSamplerParams,
52
- GaussianSamplerParams,
53
- PersonFromFakerSamplerParams,
54
- PersonSamplerParams,
55
- PoissonSamplerParams,
56
- SamplerType,
57
- ScipySamplerParams,
58
- SubcategorySamplerParams,
59
- TimeDeltaSamplerParams,
60
- UniformSamplerParams,
61
- UUIDSamplerParams,
62
- )
63
- from data_designer.config.seed import (
64
- IndexRange,
65
- PartitionBlock,
66
- SamplingStrategy,
67
- SeedConfig,
68
- )
69
- from data_designer.config.seed_source import (
70
- DataFrameSeedSource,
71
- HuggingFaceSeedSource,
72
- LocalFileSeedSource,
73
- )
74
- from data_designer.config.utils.code_lang import CodeLang
75
- from data_designer.config.utils.info import InfoType
76
- from data_designer.config.validator_params import (
77
- CodeValidatorParams,
78
- LocalCallableValidatorParams,
79
- RemoteValidatorParams,
80
- ValidatorType,
81
- )
6
+ import importlib
7
+ from typing import TYPE_CHECKING
82
8
 
9
+ if TYPE_CHECKING:
10
+ # These imports are for IDE autocomplete and type checking only.
11
+ # At runtime, __getattr__ lazily loads the actual objects.
12
+ from data_designer.config.analysis.column_profilers import ( # noqa: F401
13
+ JudgeScoreProfilerConfig,
14
+ )
15
+ from data_designer.config.column_configs import ( # noqa: F401
16
+ CustomColumnConfig,
17
+ EmbeddingColumnConfig,
18
+ ExpressionColumnConfig,
19
+ GenerationStrategy,
20
+ LLMCodeColumnConfig,
21
+ LLMJudgeColumnConfig,
22
+ LLMStructuredColumnConfig,
23
+ LLMTextColumnConfig,
24
+ SamplerColumnConfig,
25
+ Score,
26
+ SeedDatasetColumnConfig,
27
+ ValidationColumnConfig,
28
+ )
29
+ from data_designer.config.column_types import DataDesignerColumnType # noqa: F401
30
+ from data_designer.config.config_builder import DataDesignerConfigBuilder # noqa: F401
31
+ from data_designer.config.custom_column import custom_column_generator # noqa: F401
32
+ from data_designer.config.data_designer_config import DataDesignerConfig # noqa: F401
33
+ from data_designer.config.dataset_builders import BuildStage # noqa: F401
34
+ from data_designer.config.mcp import ( # noqa: F401
35
+ LocalStdioMCPProvider,
36
+ MCPProvider,
37
+ ToolConfig,
38
+ )
39
+ from data_designer.config.models import ( # noqa: F401
40
+ ChatCompletionInferenceParams,
41
+ EmbeddingInferenceParams,
42
+ GenerationType,
43
+ ImageContext,
44
+ ImageFormat,
45
+ ManualDistribution,
46
+ ManualDistributionParams,
47
+ Modality,
48
+ ModalityContext,
49
+ ModalityDataType,
50
+ ModelConfig,
51
+ ModelProvider,
52
+ UniformDistribution,
53
+ UniformDistributionParams,
54
+ )
55
+ from data_designer.config.processors import ( # noqa: F401
56
+ DropColumnsProcessorConfig,
57
+ ProcessorType,
58
+ SchemaTransformProcessorConfig,
59
+ )
60
+ from data_designer.config.run_config import RunConfig # noqa: F401
61
+ from data_designer.config.sampler_constraints import ( # noqa: F401
62
+ ColumnInequalityConstraint,
63
+ ScalarInequalityConstraint,
64
+ )
65
+ from data_designer.config.sampler_params import ( # noqa: F401
66
+ BernoulliMixtureSamplerParams,
67
+ BernoulliSamplerParams,
68
+ BinomialSamplerParams,
69
+ CategorySamplerParams,
70
+ DatetimeSamplerParams,
71
+ GaussianSamplerParams,
72
+ PersonFromFakerSamplerParams,
73
+ PersonSamplerParams,
74
+ PoissonSamplerParams,
75
+ SamplerType,
76
+ ScipySamplerParams,
77
+ SubcategorySamplerParams,
78
+ TimeDeltaSamplerParams,
79
+ UniformSamplerParams,
80
+ UUIDSamplerParams,
81
+ )
82
+ from data_designer.config.seed import ( # noqa: F401
83
+ IndexRange,
84
+ PartitionBlock,
85
+ SamplingStrategy,
86
+ SeedConfig,
87
+ )
88
+ from data_designer.config.seed_source import ( # noqa: F401
89
+ DataFrameSeedSource,
90
+ HuggingFaceSeedSource,
91
+ LocalFileSeedSource,
92
+ )
93
+ from data_designer.config.utils.code_lang import CodeLang # noqa: F401
94
+ from data_designer.config.utils.info import InfoType # noqa: F401
95
+ from data_designer.config.utils.trace_type import TraceType # noqa: F401
96
+ from data_designer.config.validator_params import ( # noqa: F401
97
+ CodeValidatorParams,
98
+ LocalCallableValidatorParams,
99
+ RemoteValidatorParams,
100
+ ValidatorType,
101
+ )
83
102
 
84
- def get_config_exports() -> list[str]:
85
- return [
86
- SchemaTransformProcessorConfig.__name__,
87
- BernoulliMixtureSamplerParams.__name__,
88
- BernoulliSamplerParams.__name__,
89
- BinomialSamplerParams.__name__,
90
- CategorySamplerParams.__name__,
91
- CodeLang.__name__,
92
- CodeValidatorParams.__name__,
93
- ColumnInequalityConstraint.__name__,
94
- ChatCompletionInferenceParams.__name__,
95
- DataDesignerColumnType.__name__,
96
- DataDesignerConfig.__name__,
97
- DataDesignerConfigBuilder.__name__,
98
- DataFrameSeedSource.__name__,
99
- BuildStage.__name__,
100
- DatetimeSamplerParams.__name__,
101
- DropColumnsProcessorConfig.__name__,
102
- EmbeddingColumnConfig.__name__,
103
- EmbeddingInferenceParams.__name__,
104
- ExpressionColumnConfig.__name__,
105
- GaussianSamplerParams.__name__,
106
- GenerationType.__name__,
107
- HuggingFaceSeedSource.__name__,
108
- IndexRange.__name__,
109
- InfoType.__name__,
110
- ImageContext.__name__,
111
- ImageFormat.__name__,
112
- JudgeScoreProfilerConfig.__name__,
113
- LLMCodeColumnConfig.__name__,
114
- LLMJudgeColumnConfig.__name__,
115
- LLMStructuredColumnConfig.__name__,
116
- LLMTextColumnConfig.__name__,
117
- LocalCallableValidatorParams.__name__,
118
- LocalFileSeedSource.__name__,
119
- ManualDistribution.__name__,
120
- ManualDistributionParams.__name__,
121
- Modality.__name__,
122
- ModalityContext.__name__,
123
- ModalityDataType.__name__,
124
- ModelConfig.__name__,
125
- ModelProvider.__name__,
126
- PartitionBlock.__name__,
127
- PersonSamplerParams.__name__,
128
- PersonFromFakerSamplerParams.__name__,
129
- PoissonSamplerParams.__name__,
130
- ProcessorType.__name__,
131
- RemoteValidatorParams.__name__,
132
- RunConfig.__name__,
133
- SamplerColumnConfig.__name__,
134
- SamplerType.__name__,
135
- SamplingStrategy.__name__,
136
- ScalarInequalityConstraint.__name__,
137
- ScipySamplerParams.__name__,
138
- Score.__name__,
139
- SeedConfig.__name__,
140
- SeedDatasetColumnConfig.__name__,
141
- SubcategorySamplerParams.__name__,
142
- TimeDeltaSamplerParams.__name__,
143
- UniformDistribution.__name__,
144
- UniformDistributionParams.__name__,
145
- UniformSamplerParams.__name__,
146
- UUIDSamplerParams.__name__,
147
- ValidationColumnConfig.__name__,
148
- ValidatorType.__name__,
149
- ]
103
+ # Base module path and submodule paths for lazy imports
104
+ _MOD_BASE = "data_designer.config"
105
+ _MOD_COLUMN_CONFIGS = f"{_MOD_BASE}.column_configs"
106
+ _MOD_MCP = f"{_MOD_BASE}.mcp"
107
+ _MOD_MODELS = f"{_MOD_BASE}.models"
108
+ _MOD_PROCESSORS = f"{_MOD_BASE}.processors"
109
+ _MOD_SAMPLER_CONSTRAINTS = f"{_MOD_BASE}.sampler_constraints"
110
+ _MOD_SAMPLER_PARAMS = f"{_MOD_BASE}.sampler_params"
111
+ _MOD_SEED = f"{_MOD_BASE}.seed"
112
+ _MOD_SEED_SOURCE = f"{_MOD_BASE}.seed_source"
113
+ _MOD_VALIDATOR_PARAMS = f"{_MOD_BASE}.validator_params"
114
+ _MOD_UTILS = f"{_MOD_BASE}.utils"
115
+
116
+ # Mapping of export names to (module_path, attribute_name) for lazy loading
117
+ _LAZY_IMPORTS: dict[str, tuple[str, str]] = {
118
+ # analysis.column_profilers
119
+ "JudgeScoreProfilerConfig": (f"{_MOD_BASE}.analysis.column_profilers", "JudgeScoreProfilerConfig"),
120
+ # column_configs
121
+ "CustomColumnConfig": (_MOD_COLUMN_CONFIGS, "CustomColumnConfig"),
122
+ "EmbeddingColumnConfig": (_MOD_COLUMN_CONFIGS, "EmbeddingColumnConfig"),
123
+ "ExpressionColumnConfig": (_MOD_COLUMN_CONFIGS, "ExpressionColumnConfig"),
124
+ "GenerationStrategy": (_MOD_COLUMN_CONFIGS, "GenerationStrategy"),
125
+ "LLMCodeColumnConfig": (_MOD_COLUMN_CONFIGS, "LLMCodeColumnConfig"),
126
+ "LLMJudgeColumnConfig": (_MOD_COLUMN_CONFIGS, "LLMJudgeColumnConfig"),
127
+ "LLMStructuredColumnConfig": (_MOD_COLUMN_CONFIGS, "LLMStructuredColumnConfig"),
128
+ "LLMTextColumnConfig": (_MOD_COLUMN_CONFIGS, "LLMTextColumnConfig"),
129
+ "SamplerColumnConfig": (_MOD_COLUMN_CONFIGS, "SamplerColumnConfig"),
130
+ "Score": (_MOD_COLUMN_CONFIGS, "Score"),
131
+ "SeedDatasetColumnConfig": (_MOD_COLUMN_CONFIGS, "SeedDatasetColumnConfig"),
132
+ "ValidationColumnConfig": (_MOD_COLUMN_CONFIGS, "ValidationColumnConfig"),
133
+ # column_types
134
+ "DataDesignerColumnType": (f"{_MOD_BASE}.column_types", "DataDesignerColumnType"),
135
+ # config_builder
136
+ "DataDesignerConfigBuilder": (f"{_MOD_BASE}.config_builder", "DataDesignerConfigBuilder"),
137
+ # custom_column
138
+ "custom_column_generator": (f"{_MOD_BASE}.custom_column", "custom_column_generator"),
139
+ # data_designer_config
140
+ "DataDesignerConfig": (f"{_MOD_BASE}.data_designer_config", "DataDesignerConfig"),
141
+ # dataset_builders
142
+ "BuildStage": (f"{_MOD_BASE}.dataset_builders", "BuildStage"),
143
+ # mcp
144
+ "LocalStdioMCPProvider": (_MOD_MCP, "LocalStdioMCPProvider"),
145
+ "MCPProvider": (_MOD_MCP, "MCPProvider"),
146
+ "ToolConfig": (_MOD_MCP, "ToolConfig"),
147
+ # models
148
+ "ChatCompletionInferenceParams": (_MOD_MODELS, "ChatCompletionInferenceParams"),
149
+ "EmbeddingInferenceParams": (_MOD_MODELS, "EmbeddingInferenceParams"),
150
+ "GenerationType": (_MOD_MODELS, "GenerationType"),
151
+ "ImageContext": (_MOD_MODELS, "ImageContext"),
152
+ "ImageFormat": (_MOD_MODELS, "ImageFormat"),
153
+ "ManualDistribution": (_MOD_MODELS, "ManualDistribution"),
154
+ "ManualDistributionParams": (_MOD_MODELS, "ManualDistributionParams"),
155
+ "Modality": (_MOD_MODELS, "Modality"),
156
+ "ModalityContext": (_MOD_MODELS, "ModalityContext"),
157
+ "ModalityDataType": (_MOD_MODELS, "ModalityDataType"),
158
+ "ModelConfig": (_MOD_MODELS, "ModelConfig"),
159
+ "ModelProvider": (_MOD_MODELS, "ModelProvider"),
160
+ "UniformDistribution": (_MOD_MODELS, "UniformDistribution"),
161
+ "UniformDistributionParams": (_MOD_MODELS, "UniformDistributionParams"),
162
+ # processors
163
+ "DropColumnsProcessorConfig": (_MOD_PROCESSORS, "DropColumnsProcessorConfig"),
164
+ "ProcessorType": (_MOD_PROCESSORS, "ProcessorType"),
165
+ "SchemaTransformProcessorConfig": (_MOD_PROCESSORS, "SchemaTransformProcessorConfig"),
166
+ # run_config
167
+ "RunConfig": (f"{_MOD_BASE}.run_config", "RunConfig"),
168
+ # sampler_constraints
169
+ "ColumnInequalityConstraint": (_MOD_SAMPLER_CONSTRAINTS, "ColumnInequalityConstraint"),
170
+ "ScalarInequalityConstraint": (_MOD_SAMPLER_CONSTRAINTS, "ScalarInequalityConstraint"),
171
+ # sampler_params
172
+ "BernoulliMixtureSamplerParams": (_MOD_SAMPLER_PARAMS, "BernoulliMixtureSamplerParams"),
173
+ "BernoulliSamplerParams": (_MOD_SAMPLER_PARAMS, "BernoulliSamplerParams"),
174
+ "BinomialSamplerParams": (_MOD_SAMPLER_PARAMS, "BinomialSamplerParams"),
175
+ "CategorySamplerParams": (_MOD_SAMPLER_PARAMS, "CategorySamplerParams"),
176
+ "DatetimeSamplerParams": (_MOD_SAMPLER_PARAMS, "DatetimeSamplerParams"),
177
+ "GaussianSamplerParams": (_MOD_SAMPLER_PARAMS, "GaussianSamplerParams"),
178
+ "PersonFromFakerSamplerParams": (_MOD_SAMPLER_PARAMS, "PersonFromFakerSamplerParams"),
179
+ "PersonSamplerParams": (_MOD_SAMPLER_PARAMS, "PersonSamplerParams"),
180
+ "PoissonSamplerParams": (_MOD_SAMPLER_PARAMS, "PoissonSamplerParams"),
181
+ "SamplerType": (_MOD_SAMPLER_PARAMS, "SamplerType"),
182
+ "ScipySamplerParams": (_MOD_SAMPLER_PARAMS, "ScipySamplerParams"),
183
+ "SubcategorySamplerParams": (_MOD_SAMPLER_PARAMS, "SubcategorySamplerParams"),
184
+ "TimeDeltaSamplerParams": (_MOD_SAMPLER_PARAMS, "TimeDeltaSamplerParams"),
185
+ "UniformSamplerParams": (_MOD_SAMPLER_PARAMS, "UniformSamplerParams"),
186
+ "UUIDSamplerParams": (_MOD_SAMPLER_PARAMS, "UUIDSamplerParams"),
187
+ # seed
188
+ "IndexRange": (_MOD_SEED, "IndexRange"),
189
+ "PartitionBlock": (_MOD_SEED, "PartitionBlock"),
190
+ "SamplingStrategy": (_MOD_SEED, "SamplingStrategy"),
191
+ "SeedConfig": (_MOD_SEED, "SeedConfig"),
192
+ # seed_source
193
+ "DataFrameSeedSource": (_MOD_SEED_SOURCE, "DataFrameSeedSource"),
194
+ "HuggingFaceSeedSource": (_MOD_SEED_SOURCE, "HuggingFaceSeedSource"),
195
+ "LocalFileSeedSource": (_MOD_SEED_SOURCE, "LocalFileSeedSource"),
196
+ # utils
197
+ "CodeLang": (f"{_MOD_UTILS}.code_lang", "CodeLang"),
198
+ "InfoType": (f"{_MOD_UTILS}.info", "InfoType"),
199
+ "TraceType": (f"{_MOD_UTILS}.trace_type", "TraceType"),
200
+ # validator_params
201
+ "CodeValidatorParams": (_MOD_VALIDATOR_PARAMS, "CodeValidatorParams"),
202
+ "LocalCallableValidatorParams": (_MOD_VALIDATOR_PARAMS, "LocalCallableValidatorParams"),
203
+ "RemoteValidatorParams": (_MOD_VALIDATOR_PARAMS, "RemoteValidatorParams"),
204
+ "ValidatorType": (_MOD_VALIDATOR_PARAMS, "ValidatorType"),
205
+ }
206
+
207
+ __all__ = list(_LAZY_IMPORTS.keys())
208
+
209
+
210
+ def __getattr__(name: str) -> object:
211
+ """Lazily import config module exports when accessed.
212
+
213
+ This allows fast imports of data_designer.config while deferring loading
214
+ of submodules until they're actually needed.
215
+ """
216
+ if name in _LAZY_IMPORTS:
217
+ module_path, attr_name = _LAZY_IMPORTS[name]
218
+ module = importlib.import_module(module_path)
219
+ return getattr(module, attr_name)
220
+
221
+ raise AttributeError(f"module 'data_designer.config' has no attribute {name!r}")
222
+
223
+
224
+ def __dir__() -> list[str]:
225
+ """Return list of available exports for tab-completion."""
226
+ return __all__
@@ -1,16 +1,15 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ # IMPORTANT: This module must NOT import from any data_designer submodules (i.e., data_designer.*).
5
+ # These base abstractions are foundational and should only depend on pydantic and Python builtins.
6
+
4
7
  from __future__ import annotations
5
8
 
6
- from pathlib import Path
7
- from typing import Any
9
+ from abc import ABC, abstractmethod
8
10
 
9
- import yaml
10
11
  from pydantic import BaseModel, ConfigDict
11
12
 
12
- from data_designer.config.utils.io_helpers import serialize_data
13
-
14
13
 
15
14
  class ConfigBase(BaseModel):
16
15
  model_config = ConfigDict(
@@ -22,48 +21,47 @@ class ConfigBase(BaseModel):
22
21
  )
23
22
 
24
23
 
25
- class ExportableConfigBase(ConfigBase):
26
- def to_dict(self) -> dict[str, Any]:
27
- """Convert the configuration to a dictionary.
24
+ class SingleColumnConfig(ConfigBase, ABC):
25
+ """Abstract base class for all single-column configuration types.
28
26
 
29
- Returns:
30
- A dictionary representation of the configuration using JSON-compatible
31
- serialization.
32
- """
33
- return self.model_dump(mode="json")
27
+ This class serves as the foundation for all column configurations in DataDesigner,
28
+ defining shared fields and properties across all column types.
29
+
30
+ Attributes:
31
+ name: Unique name of the column to be generated.
32
+ drop: If True, the column will be generated but removed from the final dataset.
33
+ Useful for intermediate columns that are dependencies for other columns.
34
+ column_type: Discriminator field that identifies the specific column type.
35
+ Subclasses must override this field to specify the column type with a `Literal` value.
36
+ """
37
+
38
+ name: str
39
+ drop: bool = False
40
+ column_type: str
34
41
 
35
- def to_yaml(self, path: str | Path | None = None, *, indent: int | None = 2, **kwargs) -> str | None:
36
- """Convert the configuration to a YAML string or file.
42
+ @staticmethod
43
+ def get_column_emoji() -> str:
44
+ return "🎨"
37
45
 
38
- Args:
39
- path: Optional file path to write the YAML to. If None, returns the
40
- YAML string instead of writing to file.
41
- indent: Number of spaces for YAML indentation. Defaults to 2.
42
- **kwargs: Additional keyword arguments passed to yaml.dump().
46
+ @property
47
+ @abstractmethod
48
+ def required_columns(self) -> list[str]:
49
+ """Returns a list of column names that must exist before this column can be generated.
43
50
 
44
51
  Returns:
45
- The YAML string if path is None, otherwise None (file is written).
52
+ List of column names that this column depends on. Empty list indicates
53
+ no dependencies. Override in subclasses to specify dependencies.
46
54
  """
47
- yaml_str = yaml.dump(self.to_dict(), indent=indent, **kwargs)
48
- if path is None:
49
- return yaml_str
50
- with open(path, "w") as f:
51
- f.write(yaml_str)
52
55
 
53
- def to_json(self, path: str | Path | None = None, *, indent: int | None = 2, **kwargs) -> str | None:
54
- """Convert the configuration to a JSON string or file.
56
+ @property
57
+ @abstractmethod
58
+ def side_effect_columns(self) -> list[str]:
59
+ """Returns a list of additional columns that this column will create as a side effect.
55
60
 
56
- Args:
57
- path: Optional file path to write the JSON to. If None, returns the
58
- JSON string instead of writing to file.
59
- indent: Number of spaces for JSON indentation. Defaults to 2.
60
- **kwargs: Additional keyword arguments passed to json.dumps().
61
+ Some column types generate additional metadata or auxiliary columns alongside
62
+ the primary column (e.g., reasoning traces for LLM columns).
61
63
 
62
64
  Returns:
63
- The JSON string if path is None, otherwise None (file is written).
65
+ List of column names that this column will create as a side effect. Empty list
66
+ indicates no side effect columns. Override in subclasses to specify side effects.
64
67
  """
65
- json_str = serialize_data(self.to_dict(), indent=indent, **kwargs)
66
- if path is None:
67
- return json_str
68
- with open(path, "w") as f:
69
- f.write(json_str)